1. Thread class definition for crawling and downloading Baidu images
# Batch-collect image data from Baidu image search
import re
import requests
import threading
import urllib.request
import uuid
import time
from day17.demo04_python_logging import *  # provides the shared logger used below
# Downloader thread: each instance downloads one image URL to a local file
class Downloader(threading.Thread):
    def __init__(self, url, fileName):
        threading.Thread.__init__(self)
        self.__url = url
        self.__fileName = fileName

    # Override run(): urlretrieve saves the resource to self.__fileName and
    # reports progress through the download() hook below
    def run(self):
        try:
            print("Downloading:", self.__url)
            urllib.request.urlretrieve(self.__url, self.__fileName, self.download)
            print("Download finished!")
        except Exception:
            logger.error("Failed to download file: " + self.__url)

    # Progress hook for urlretrieve: number of blocks transferred so far, block
    # size in bytes, and total size from the Content-Length header (-1 if unknown)
    @staticmethod
    def download(blockNum, blockSize, contentLength):
        if contentLength <= 0:
            return
        p = min(100 * blockNum * blockSize / contentLength, 100)
        print("Download progress: {0:.1f}%".format(p))
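# Illustrative usage of the class above (the URL is a made-up placeholder):
#   Downloader("https://example.com/sample.jpg", "sample.jpg").start()
# urlretrieve calls the download() hook after each block; e.g. blockNum=5,
# blockSize=8192, contentLength=81920 prints 100 * 5 * 8192 / 81920 = 50.0%.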
def decode_url(url):
    """
    Decode an objURL obscured by Baidu's character-substitution scheme.
    :param url: the obscured URL returned by the Baidu image API
    :return: the decoded URL
    """
    table = {'w': "a", 'k': "b", 'v': "c", '1': "d", 'j': "e", 'u': "f", '2': "g", 'i': "h",
             't': "i", '3': "j", 'h': "k", 's': "l", '4': "m", 'g': "n", '5': "o", 'r': "p",
             'q': "q", '6': "r", 'f': "s", 'p': "t", '7': "u", 'e': "v", 'o': "w", '8': "1",
             'd': "2", 'n': "3", '9': "4", 'c': "5", 'm': "6", '0': "7",
             'b': "8", 'l': "9", 'a': "0", '_z2C$q': ":", '_z&e3B': ".", 'AzdH3F': "/"}
    # Replace the multi-character tokens first, then the single-character ones
    url = re.sub(r'(?P<value>_z2C\$q|_z&e3B|AzdH3F)', lambda matched: table.get(matched.group('value')), url)
    return re.sub(r'(?P<value>[0-9a-w])', lambda matched: table.get(matched.group('value')), url)
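# Quick sanity check of the mapping above: 'i'→'h', 'p'→'t', 'r'→'p', '_z2C$q'→':'
# and 'AzdH3F'→'/', so decode_url("ippr_z2C$qAzdH3FAzdH3F") returns "http://",
# which is why obscured objURL values start with "ippr_z2C" (see __main__ below).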
# Crawl loop: page through the Baidu image-search JSON API and hand each image to a Downloader thread
if __name__ == "__main__":
    pn = 0
    # Request headers: send a browser User-Agent so the API does not reject the client
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    # Comparing the request URLs of two result pages shows that only pn and gsm change:
    # pn is the result offset, rn is the number of images per page, and gsm is pn in hexadecimal.
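    # For example, when pn = 60, hex(60)[2:] == '3c', so the request carries pn=60&rn=30&gsm=3c.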
    while True:
        url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&" \
              "ct=201326592&is=&fp=result&queryWord=汽车&cl=2&lm=&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=" \
              "&latest=&copyright=&word=汽车&s=&se=&tab=&width=&height=&face=0" \
              "&istype=2&qc=&nc=1&fr=&expermode=&pn={0}&rn=30&gsm={1}&1545483113251=".format(pn, hex(pn)[2:])
        print(url)
        try:
            response = requests.get(url, headers=headers)
            obj = response.json()
            for temp in obj['data']:
                if temp.get('objURL'):
                    imgURL = temp['objURL']
                    # Obscured objURL values start with 'ippr_z2C'; decode them first
                    if imgURL.startswith('ippr_z2C'):
                        imgURL = decode_url(imgURL)
                    # One downloader thread per image, saved under a random file name
                    downloader = Downloader(imgURL, str(uuid.uuid4()) + '.jpg')
                    downloader.start()
            time.sleep(10)
            pn += 30
        except Exception:
            logger.error("Error requesting or parsing URL: " + url)
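A possible variant (a sketch only, reusing the Downloader class, decode_url helper, and the obj response from the loop above): instead of the fixed time.sleep(10), the loop could keep the threads started for the current page and join them, so pn only advances once every download of that page has finished.

# Sketch: collect this page's threads and wait for them instead of sleeping
page_threads = []
for temp in obj['data']:
    if temp.get('objURL'):
        imgURL = temp['objURL']
        if imgURL.startswith('ippr_z2C'):
            imgURL = decode_url(imgURL)
        d = Downloader(imgURL, str(uuid.uuid4()) + '.jpg')
        d.start()
        page_threads.append(d)
for d in page_threads:
    d.join()  # block until every download of this page is done
pn += 30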