Sunday, April 20, 2025

A script that downloads all images from a web page, using popular Python libraries for fetching, parsing, and concurrent downloading.

Python 3.12 Version
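
The script relies on four third-party packages (requests, beautifulsoup4, aiohttp, and aiofiles), which can be installed with pip:

pip install requests beautifulsoup4 aiohttp aiofiles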


import asyncio
import os
from urllib.parse import urljoin, urlparse

import aiofiles
import aiohttp
import requests
from bs4 import BeautifulSoup


async def download_image(session, url, folder):
    """Download a single image asynchronously."""
    # Build a filename from the URL path; fall back to a generic
    # name when the path has no basename (e.g. a bare "/")
    name = os.path.basename(urlparse(url).path) or "image"
    filename = os.path.join(folder, name)

    # Skip if the file already exists
    if os.path.exists(filename):
        print(f"Skipping existing file: {filename}")
        return

    try:
        async with session.get(url) as response:
            if response.status == 200:
                # Ensure the filename has an extension, inferring one
                # from the Content-Type header when it is missing
                if not os.path.splitext(filename)[1]:
                    content_type = response.headers.get('Content-Type', '')
                    if 'jpeg' in content_type or 'jpg' in content_type:
                        filename += '.jpg'
                    elif 'png' in content_type:
                        filename += '.png'
                    elif 'gif' in content_type:
                        filename += '.gif'
                    elif 'webp' in content_type:
                        filename += '.webp'
                    else:
                        filename += '.jpg'  # Default extension

                # Save the image
                async with aiofiles.open(filename, 'wb') as f:
                    await f.write(await response.read())
                print(f"Downloaded: {filename}")
            else:
                print(f"Failed to download {url}, status code: {response.status}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")


async def download_images_from_website(url, folder="downloaded_images"):
    """Download all images from a website."""
    # Create the download folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Get the webpage content
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Failed to access {url}, status code: {response.status_code}")
        return

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all image tags and extract their URLs,
    # converting relative URLs to absolute ones
    img_urls = []
    for img in soup.find_all('img'):
        img_url = img.get('src')
        if img_url:
            img_urls.append(urljoin(url, img_url))

    # De-duplicate while preserving order, so the same image
    # is never downloaded twice concurrently
    img_urls = list(dict.fromkeys(img_urls))

    print(f"Found {len(img_urls)} images")

    # Download all images concurrently
    async with aiohttp.ClientSession(headers=headers) as session:
        tasks = [download_image(session, img_url, folder) for img_url in img_urls]
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    website_url = input("Enter the website URL: ")
    output_folder = input("Enter output folder (default: downloaded_images): ") or "downloaded_images"

    # Run the async entry point
    asyncio.run(download_images_from_website(website_url, output_folder))
    print("Download complete!")


Notes on this script:

  1. Uses requests and BeautifulSoup to fetch and parse the webpage
  2. Finds all <img> tags and extracts their URLs
  3. Uses aiohttp and asyncio for concurrent downloads, which is much faster than downloading sequentially (see the concurrency sketch after this list)
  4. Creates filenames based on the image URL paths
  5. Adds appropriate file extensions based on Content-Type
  6. De-duplicates repeated image URLs and skips already downloaded files
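
A caveat on note 3: asyncio.gather starts every download at once, which can overwhelm a server hosting many images. A minimal sketch of one way to cap concurrency with a semaphore (bounded_download and the limit of 10 are illustrative, not part of the script above):

import asyncio

async def bounded_download(semaphore, session, url, folder):
    """Run download_image with a bounded number of downloads in flight."""
    async with semaphore:
        await download_image(session, url, folder)

# In download_images_from_website, the gather call would become:
#     semaphore = asyncio.Semaphore(10)  # at most 10 downloads at a time
#     tasks = [bounded_download(semaphore, session, u, folder) for u in img_urls]
#     await asyncio.gather(*tasks)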