Python 3.12 Version
import asyncio
import hashlib
import os
from urllib.parse import urljoin, urlparse

import aiofiles
import aiohttp
import requests
from bs4 import BeautifulSoup
async def download_image(session, url, folder):
    """Download a single image asynchronously."""
    # Derive a filename from the last segment of the URL path; fall back
    # to a short hash of the URL when the path has no file component
    # (e.g. a URL ending in "/")
    basename = os.path.basename(urlparse(url).path)
    if not basename:
        basename = hashlib.md5(url.encode()).hexdigest()[:12]
    filename = os.path.join(folder, basename)

    # Skip if the file already exists
    if os.path.exists(filename):
        print(f"Skipping existing file: {filename}")
        return

    try:
        async with session.get(url) as response:
            if response.status == 200:
                # Ensure the filename has an extension
                if not os.path.splitext(filename)[1]:
                    content_type = response.headers.get('Content-Type', '')
                    if 'jpeg' in content_type or 'jpg' in content_type:
                        filename += '.jpg'
                    elif 'png' in content_type:
                        filename += '.png'
                    elif 'gif' in content_type:
                        filename += '.gif'
                    elif 'webp' in content_type:
                        filename += '.webp'
                    else:
                        filename += '.jpg'  # Default extension
                # Save the image
                async with aiofiles.open(filename, 'wb') as f:
                    await f.write(await response.read())
                print(f"Downloaded: {filename}")
            else:
                print(f"Failed to download {url}, status code: {response.status}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")
async def download_images_from_website(url, folder="downloaded_images"):
    """Download all images from a website."""
    # Create the download folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Fetch the webpage (requests has no default timeout, so set one)
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Failed to access {url}, status code: {response.status_code}")
        return

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all image tags and extract their URLs
    img_urls = []
    for img in soup.find_all('img'):
        img_url = img.get('src')
        if img_url:
            # Convert relative URLs to absolute URLs
            img_urls.append(urljoin(url, img_url))

    # Deduplicate while preserving order so the same URL is never
    # downloaded twice concurrently
    img_urls = list(dict.fromkeys(img_urls))
    print(f"Found {len(img_urls)} images")

    # Download the images concurrently
    async with aiohttp.ClientSession(headers=headers) as session:
        tasks = [download_image(session, img_url, folder) for img_url in img_urls]
        await asyncio.gather(*tasks)
if __name__ == "__main__":
    website_url = input("Enter the website URL: ")
    output_folder = input("Enter output folder (default: downloaded_images): ") or "downloaded_images"

    # Run the async function
    asyncio.run(download_images_from_website(website_url, output_folder))
    print("Download complete!")
Script notes:
- Uses requests and BeautifulSoup to fetch and parse the webpage
- Finds all <img> tags and extracts their URLs (a lazy-loading sketch follows this list)
- Uses aiohttp and asyncio for concurrent downloads, much faster than downloading sequentially (a concurrency-cap sketch also follows)
- Creates filenames based on the image URL paths
- Adds appropriate file extensions based on the Content-Type header (a mimetypes sketch follows as well)
- Skips already downloaded files
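Many pages lazy-load images, leaving src empty (or pointing at a placeholder) and carrying the real URL in a data-src attribute. The attribute name varies by site, so treat this as a hedged sketch rather than a universal rule; swapped into the extraction loop it would look like this:

# Sketch: also check the common data-src attribute used by lazy loaders.
# "data-src" is a widespread convention, not a standard; some sites use
# data-lazy-src, data-original, or srcset instead.
for img in soup.find_all('img'):
    img_url = img.get('src') or img.get('data-src')
    if img_url:
        img_urls.append(urljoin(url, img_url))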
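Launching one task per image means a page with hundreds of images opens hundreds of simultaneous connections, which can trip rate limits. A minimal sketch for capping concurrency with asyncio.Semaphore (MAX_CONCURRENT and download_limited are illustrative names, not part of the script above):

MAX_CONCURRENT = 10  # illustrative cap; tune to taste

async def download_limited(semaphore, session, url, folder):
    # Only MAX_CONCURRENT coroutines can hold the semaphore at once;
    # the rest wait here until a slot frees up
    async with semaphore:
        await download_image(session, url, folder)

# Inside download_images_from_website, the gather step would then read:
#     semaphore = asyncio.Semaphore(MAX_CONCURRENT)
#     tasks = [download_limited(semaphore, session, u, folder) for u in img_urls]
#     await asyncio.gather(*tasks)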
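The if/elif chain over Content-Type could also be replaced with the standard library's mimetypes module. A hedged sketch, with the caveats noted in the comments:

import mimetypes

def extension_from_content_type(content_type):
    """Map a Content-Type header value to a file extension."""
    # Strip parameters such as "; charset=utf-8" before the lookup
    mime = content_type.split(';')[0].strip()
    # Caveats: depending on the Python version and platform MIME registry,
    # guess_extension may return a less common variant (e.g. '.jpe' for
    # image/jpeg) or None for unknown types, hence the fallback
    return mimetypes.guess_extension(mime) or '.jpg'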