Sunday, April 20, 2025

A script that downloads all images from a web page, using popular Python libraries for fetching, parsing, and concurrent downloading.

Python 3.12 Version
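
The script relies on four third-party packages (requests, beautifulsoup4, aiohttp, and aiofiles), which can be installed with pip:

pip install requests beautifulsoup4 aiohttp aiofiles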


import asyncio
import os
from urllib.parse import urljoin, urlparse

import aiofiles
import aiohttp
import requests
from bs4 import BeautifulSoup


async def download_image(session, url, folder):
    """Download a single image asynchronously."""
    # Build a filename from the URL path; fall back to a generic
    # name when the path has no basename (e.g. a bare "/")
    name = os.path.basename(urlparse(url).path) or "image"
    filename = os.path.join(folder, name)

    # Skip if the file already exists
    if os.path.exists(filename):
        print(f"Skipping existing file: {filename}")
        return

    try:
        async with session.get(url) as response:
            if response.status == 200:
                # Ensure the filename has an extension, inferring one
                # from the Content-Type header when it is missing
                if not os.path.splitext(filename)[1]:
                    content_type = response.headers.get('Content-Type', '')
                    if 'jpeg' in content_type or 'jpg' in content_type:
                        filename += '.jpg'
                    elif 'png' in content_type:
                        filename += '.png'
                    elif 'gif' in content_type:
                        filename += '.gif'
                    elif 'webp' in content_type:
                        filename += '.webp'
                    else:
                        filename += '.jpg'  # Default extension

                # Save the image
                async with aiofiles.open(filename, 'wb') as f:
                    await f.write(await response.read())
                print(f"Downloaded: {filename}")
            else:
                print(f"Failed to download {url}, status code: {response.status}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")


async def download_images_from_website(url, folder="downloaded_images"):
    """Download all images from a website."""
    # Create the download folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Get the webpage content
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Failed to access {url}, status code: {response.status_code}")
        return

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all image tags and extract their URLs,
    # converting relative URLs to absolute ones
    img_urls = []
    for img in soup.find_all('img'):
        img_url = img.get('src')
        if img_url:
            img_urls.append(urljoin(url, img_url))

    # De-duplicate while preserving order, so the same image
    # is never downloaded twice concurrently
    img_urls = list(dict.fromkeys(img_urls))

    print(f"Found {len(img_urls)} images")

    # Download all images concurrently
    async with aiohttp.ClientSession(headers=headers) as session:
        tasks = [download_image(session, img_url, folder) for img_url in img_urls]
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    website_url = input("Enter the website URL: ")
    output_folder = input("Enter output folder (default: downloaded_images): ") or "downloaded_images"

    # Run the async entry point
    asyncio.run(download_images_from_website(website_url, output_folder))
    print("Download complete!")


Notes on this script:

  1. Uses requests and BeautifulSoup to fetch and parse the webpage
  2. Finds all <img> tags and extracts their URLs
  3. Uses aiohttp and asyncio for concurrent downloads, which is much faster than downloading sequentially (see the concurrency sketch after this list)
  4. Creates filenames based on the image URL paths
  5. Adds appropriate file extensions based on Content-Type
  6. De-duplicates repeated image URLs and skips already downloaded files
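
A caveat on note 3: asyncio.gather starts every download at once, which can overwhelm a server hosting many images. A minimal sketch of one way to cap concurrency with a semaphore (bounded_download and the limit of 10 are illustrative, not part of the script above):

import asyncio

async def bounded_download(semaphore, session, url, folder):
    """Run download_image with a bounded number of downloads in flight."""
    async with semaphore:
        await download_image(session, url, folder)

# In download_images_from_website, the gather call would become:
#     semaphore = asyncio.Semaphore(10)  # at most 10 downloads at a time
#     tasks = [bounded_download(semaphore, session, u, folder) for u in img_urls]
#     await asyncio.gather(*tasks)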