使用爬虫批量下载图片链接并去重

  • 设置timeout=(20, 20), verify=False避免超时和校验问题
  • jpeg以jpg格式保存
  • 获取图片编码的md5并存为文件名,以避免重复
import pandas as pd
import requests
import os
import hashlib
from tqdm import tqdm
# --- Configuration -----------------------------------------------------
file_path = 'xiaofang.xlsx'   # Excel workbook that holds the image URLs
save_dir = 'xiaofang'         # directory the downloaded images are written to

# Create the output directory up front; without this the first download
# fails with FileNotFoundError when 'xiaofang' does not already exist.
os.makedirs(save_dir, exist_ok=True)

df = pd.read_excel(file_path)
# The image URLs live in the third column (0-based index 2) of the sheet.
url_lists = df.iloc[:, 2]

# Browser-like User-Agent so image hosts do not reject the requests.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}

def save_image(file_url):
    """Download one image from *file_url* and store it under ``save_dir``.

    The file is named after the MD5 hex digest of the image bytes, so an
    identical image downloaded twice is detected as a duplicate.

    Raises:
        Exception: on a non-200 status code, an unsupported content type
            (anything other than jpg/jpeg/png), or when an image with the
            same MD5 digest has already been saved.
    """
    # verify=False skips TLS certificate validation (per the script's note,
    # to avoid handshake failures on some hosts); the tuple timeout covers
    # the connect and read phases separately.
    r = requests.get(file_url, headers=headers, timeout=(20, 20), verify=False)
    if r.status_code != 200:
        raise Exception("{}的状态码为{}".format(file_url, str(r.status_code)))
    # Content-Type may carry parameters, e.g. "image/jpeg; charset=UTF-8";
    # strip them (and normalize case) before extracting the subtype.
    # Note: the original also set r.encoding from r.apparent_encoding, which
    # is meaningless for binary content and costs a full charset-detection
    # pass over the body — dropped.
    content_type = r.headers.get('Content-Type', '')
    ext = content_type.split(';')[0].strip().split('/')[-1].lower()
    if ext == 'jpeg':
        ext = 'jpg'   # save JPEG images with the conventional .jpg suffix
    if ext not in ('jpg', 'png'):
        raise Exception("{}未包含指定格式的图片".format(file_url))
    # MD5-of-content file name doubles as the de-duplication key.
    file_name = hashlib.md5(r.content).hexdigest() + '.' + ext
    file_path = os.path.join(save_dir, file_name)
    if os.path.exists(file_path):
        raise Exception("{}图片重复".format(file_url))
    # Make sure the target directory exists before writing.
    os.makedirs(save_dir, exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(r.content)


if __name__ == '__main__':
    # Walk every URL with a progress bar. Any failed download (bad status,
    # unsupported format, or duplicate image) is printed and skipped so the
    # rest of the batch keeps running.
    for image_url in tqdm(url_lists):
        try:
            save_image(image_url)
        except Exception as err:
            print(err)

你可能感兴趣的:(爬虫,python)