import pickle
import logging
from datetime import datetime
from dateutil.parser import parse as parse_date
from brisque import BRISQUE
import os
import cv2
import numpy as np
from PIL import Image
from io import BytesIO
import os
import requests
from skimage import color
from time import sleep
from random import choice
import concurrent.futures
from requests.exceptions import Timeout
from robots import RobotParser
from headers import HEADERS
# Retry/backoff tuning constants (units are given in each trailing comment).
MAX_RETRIES = 3 # Number of times the crawler should retry a URL
INITIAL_BACKOFF = 2 # Initial backoff delay in seconds
DEFAULT_SLEEP = 10 # Default sleep time in seconds after a 429 error
# Module-level BRISQUE instance, created once when the module is imported.
# NOTE(review): `url=False` presumably means images are supplied in memory
# rather than fetched by URL -- confirm against the brisque package docs.
brisque = BRISQUE(url=False)
# --- SETUP LOGGER ---
# The log file lives next to this source file so its location does not depend
# on the current working directory.
filename = 'image-scraper.log'
filepath = os.path.dirname(os.path.abspath(__file__))
# create file path for log file
log_file = os.path.join(filepath, filename)
# create a FileHandler to log messages to the log file
handler = logging.FileHandler(log_file)
# set the log message format (level, thread name, timestamp, message)
formatter = logging.Formatter(
    '%(levelname)s %(threadName)s (%(asctime)s): %(message)s')
handler.setFormatter(formatter)
# create a logger with the given name and log level; INFO is required because
# the scraper reports successful fetches and skips via logger.info()
logger = logging.getLogger('image-scraper')
logger.setLevel(logging.INFO)
# prevent records from being sent to the parent logger - that includes the
# console logging
logger.propagate = False
# add the FileHandler to the logger
logger.addHandler(handler)
def get_image_quality_metrics(response):
    """
    Calculate various image quality metrics for an image.

    Args:
        response (requests.Response): The response object containing the
            encoded image data in ``response.content``.

    Returns:
        dict: Image quality metrics with the keys ``brightness``,
        ``sharpness``, ``contrast``, ``noise``, ``saturation``,
        ``colorfulness``, ``height`` and ``width``.

    Raises:
        ValueError: If the response body is empty or cannot be decoded as an
            image.
    """
    image_array = np.frombuffer(response.content, np.uint8)
    if image_array.size == 0:
        raise ValueError("response content is empty; no image to analyse")
    # cv2.imdecode returns None (instead of raising) for undecodable data.
    image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError("response content could not be decoded as an image")

    metrics = dict()

    # Calculate brightness as the mean gray level
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    metrics['brightness'] = np.mean(gray)

    # Calculate sharpness using variance of Laplacian
    metrics['sharpness'] = cv2.Laplacian(gray, cv2.CV_64F).var()

    # Calculate contrast using root mean squared contrast
    metrics['contrast'] = np.sqrt(np.mean((gray - np.mean(gray)) ** 2))

    # Global pixel variance over all channels, used as a crude noise proxy
    metrics['noise'] = np.var(image)

    # OpenCV decodes to BGR channel order, while the skimage colour
    # conversions expect RGB; convert once so hue/Lab values are correct.
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Calculate saturation as the average of the HSV saturation channel
    hsv = color.rgb2hsv(rgb)
    saturation = hsv[:, :, 1]
    metrics['saturation'] = np.mean(saturation)

    # Calculate colorfulness as the RMS chroma in CIELAB space
    lab = color.rgb2lab(rgb)
    a, b = lab[:, :, 1], lab[:, :, 2]
    metrics['colorfulness'] = np.sqrt(np.mean(a ** 2 + b ** 2))

    # Get dimensions of the image
    height, width = image.shape[:2]
    metrics['height'] = height
    metrics['width'] = width

    return metrics
def send_request(url: str) -> requests.Response:
    """
    Sends a GET request to the specified URL, checks whether the link is valid,
    and returns a response object.

    The request goes through the configured proxy with a randomly chosen
    header set. Timeouts are retried with an exponentially growing delay and
    429 responses are retried after DEFAULT_SLEEP seconds, up to MAX_RETRIES
    attempts in total.

    Args:
        url (str): The URL to send the GET request to

    Returns:
        requests.Response: The successful response, or None when the URL
        answered 403/404 or all retries were used up.

    Raises:
        requests.exceptions.HTTPError: For HTTP error statuses other than
            403, 404 and 429.
    """
    retry_count = 0
    # Delay before the next attempt after a timeout; doubled on every timeout.
    backoff = INITIAL_BACKOFF
    header = choice(HEADERS)

    # 16yun (Yiniu Cloud) crawler proxy, enhanced edition
    proxy_host = "www.16yun.cn"
    proxy_port = "31111"
    # Proxy authentication credentials
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration instead.
    proxy_user = "16YUN"
    proxy_pass = "16IP"
    # build the proxy URL from the proxy information
    proxy_meta = f"http://{proxy_user}:{proxy_pass}@{proxy_host}:{proxy_port}"
    proxies = {
        "http": proxy_meta,
        "https": proxy_meta,
    }

    while retry_count < MAX_RETRIES:
        try:
            # Send a GET request to the website and return the response object
            req = requests.get(url, headers=header, proxies=proxies, timeout=20)
            # Turn 4xx/5xx statuses into HTTPError so they are handled below
            req.raise_for_status()
            logger.info(f"Successfully fetched {url}")
            return req
        except Timeout:
            # Handle timeout error: log the error, wait, and increase the
            # retry count and backoff delay
            logger.error(f"Timeout error for {url}")
            retry_count += 1
            sleep(backoff)
            backoff *= 2
        except requests.exceptions.HTTPError as e:
            # Handle HTTP error: log the error and check the status code
            logger.error(f"HTTP error for {url}: {e}")
            status_code = e.response.status_code
            if status_code == 429:
                # Handle 429 error: wait for some time and retry
                logger.info(f"Waiting for {DEFAULT_SLEEP} seconds after 429 error")
                sleep(DEFAULT_SLEEP)
                retry_count += 1
            elif status_code == 403 or status_code == 404:
                # Handle 403 or 404 error: stop retrying and return None
                logger.info(f"Skipping {url} due to {status_code} error")
                return None
            else:
                # Handle other errors: log the error and re-raise it
                logger.error(f"Other HTTP error for {url}: {e}")
                raise

    # Return None if the loop ends without returning a response object
    return None
def process_image(response, url):
    """
    Process an image from a response object and calculate its quality metrics and BRISQUE score.

    The image is also saved as a PNG file in the local "images" folder.

    Args:
        response (requests.Response): The response object containing the image data.
        url (str): The URL of the image. Not used by this function itself.

    Returns:
        dict: A dict of image information including quality metrics and BRISQUE score.
    """
    # Open the image data from the response object and convert it to RGBA
    # format; the source image is closed once the converted copy exists
    with Image.open(BytesIO(response.content)) as source:
        image = source.convert('RGBA')

    # Create a folder named "images" to store the downloaded images
    os.makedirs('images', exist_ok=True)

    # Use the current date and time as the image file name. Microseconds are
    # included so images processed within the same second do not overwrite
    # each other.
    date_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')

    # Open a file with the date and time as the file name and write the image data to it
    with open(f'images/{date_time}.png', 'wb') as f:
        image.save(f, 'PNG')

    # Calculate the BRISQUE score of the image and add it to the dict
    # (get_brisque_score is expected to be defined elsewhere in this module)
    image_info = dict()
    image_info['brisque'] = get_brisque_score(response)

    # Calculate the other quality metrics of the image and add them to the dict
    image_info.update(get_image_quality_metrics(response))

    # Return the dict of image information
    return image_info
# Create a list of websites to scrape images from
websites = [
    # Add the image URLs to fetch here.
]

# Create a list to store the image information dict of each website
results = []

# Create a thread pool with 10 threads and submit tasks for each website
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Keep each future paired with its URL so the right URL is used when the
    # response is processed. Futures are kept out of `results` because they
    # cannot be pickled.
    pending = [
        (website, executor.submit(send_request, website))
        for website in websites
    ]

    # Iterate over the submitted tasks in submission order
    for website, future in pending:
        try:
            # Get the response object from the future object
            response = future.result()
        except requests.exceptions.RequestException as exc:
            # A failed request must not discard the results gathered so far
            logger.error(f"Request for {website} failed: {exc}")
            continue

        # Skip the website if the response object is None
        if response is None:
            continue

        # Process the response object and store the image information dict
        results.append(process_image(response, website))

# Serialize and save the results list to a file using pickle module
with open('results.pkl', 'wb') as f:
    pickle.dump(results, f)