Scraping Baidu Images --- can download 10,000+ images, faster with multithreading, implemented in Python

import re
import requests
import time
import threading

# To run this crawler, enter a command like the following:
# 'python 百度图片cmd.py -k "张伯芝"  -p 2 -d "pic_dir"'
# 'python 百度图片cmd.py -k "<search keyword>"  -p <number of pages (an integer)> -d "<folder to save the images in>"'

# 1. Multithreading makes downloads faster.
# 2. Supports running from the command line, which is more convenient.
# 3. Automatic pagination, so more data can be downloaded.

class myThread_pic(threading.Thread):
	"""Worker thread that downloads every image URL in its list."""
	def __init__(self, urllist, data_folder_name, non_repetitive_url, lock):
		super(myThread_pic, self).__init__()
		self.urllist = urllist
		self.data_folder_name = data_folder_name
		self.non_repetitive_url = non_repetitive_url
		self.lock = lock

	def run(self):
		global num
		for i in self.urllist:
			print('Downloading *******', num)
			# The de-duplication set and the counter are shared by every
			# thread, so guard both with the lock
			with self.lock:
				if i in self.non_repetitive_url:
					continue
				self.non_repetitive_url.add(i)
			try:
				resp1 = requests.get(i, headers=header, timeout=10).content
			except requests.RequestException:
				continue
			with open(self.data_folder_name + '/' + str(time.time()) + '.jpg', mode='wb') as f:
				f.write(resp1)
			with self.lock:
				num += 1

header = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
}



def baidutupian(keyword, page, data_folder_name):
	# Shared across all worker threads so the crawl stays incremental
	# (each image URL is downloaded at most once)
	non_repetitive_url = set()
	# Pull the image URL out of the "hoverURL" field of the JSON response
	regr = r'''hoverURL":"(.*?)"'''
	pat = re.compile(regr)
	# pn is the start offset of a page, rn is the number of images per page
	url = '''https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&fp=result&queryWord=''' + keyword + '''&word=''' + keyword + '''&pn=%d&rn=%d&gsm=168'''
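	# For reference, the response contains fragments that look roughly like
	# the following (illustrative, not copied from a real response); the
	# regex above captures the quoted URL:
	#
	#   ..."hoverURL":"https://img0.baidu.com/it/u=123,456&fm=26.jpg",...
	#
	# pat.findall(resp) then returns a list of those URL strings.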

	# Build the paginated request URLs: rn=30 images per page, so e.g.
	# page=2 yields pn offsets 0 and 30
	urllist = [url % (i, 30) for i in range(0, 30 * page, 30)]
	# print(urllist)

	# Walk the page URLs and extract the image links from each response
	lock = threading.Lock()
	for uri in urllist:
		resp = requests.get(uri, headers=header).text
		data = pat.findall(resp)
		data = set(i for i in data if len(i) > 0)  # drop empty matches and duplicates
		# Start a worker thread to download this page's images
		pic_crawler = myThread_pic(data, data_folder_name, non_repetitive_url, lock)
		pic_crawler.start()
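# An alternative sketch (not part of the original script): the per-page thread
# launch above can also be written with concurrent.futures.ThreadPoolExecutor,
# which caps the number of workers and joins them automatically on exit. The
# names download_all/download_one and max_workers=8 are illustrative
# assumptions, not from the original code.
from concurrent.futures import ThreadPoolExecutor

def download_all(urls, data_folder_name, max_workers=8):
	def download_one(u):
		try:
			content = requests.get(u, headers=header, timeout=10).content
		except requests.RequestException:
			return
		with open(data_folder_name + '/' + str(time.time()) + '.jpg', mode='wb') as f:
			f.write(content)
	# The with-block does not return until every submitted download has finished
	with ThreadPoolExecutor(max_workers=max_workers) as pool:
		pool.map(download_one, set(urls))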

# Example of calling the crawler directly (see the note at the bottom of the file):
# baidutupian('戛纳电影节', 2, 'pic_dir')
if __name__ == '__main__':
	import argparse
	import os
	# Command-line argument parser
	parser = argparse.ArgumentParser()
	parser.add_argument("-k", "--keyword", type=str, required=True, help="search keyword")
	parser.add_argument("-p", "--page", type=int, required=True, help="number of pages to download")
	parser.add_argument("-d", "--data_folder_name", type=str, required=True, help="folder to save the images in")
	args = parser.parse_args()
	# Global download counter shared by the worker threads
	num = 0

	if not os.path.exists(args.data_folder_name):
		os.mkdir(args.data_folder_name)
	baidutupian(args.keyword, args.page, args.data_folder_name)
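
# Note: baidutupian() can also be called directly from Python rather than via
# the CLI. A minimal sketch (assumes a folder named 'pic_dir'; num is the
# module-level counter the worker threads update):
#
#   num = 0
#   os.makedirs('pic_dir', exist_ok=True)
#   baidutupian('戛纳电影节', 2, 'pic_dir')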
