Goal: download every picture returned by a keyword search on Quanjing (全景网).
Preliminary analysis: by watching the requests in the browser's developer tools, the links involved in a high-resolution download turn out to be related as follows:
url1 = "http://www.quanjing.com/downcomp.aspx?pic_id=488-8193&Catalogid=chineseview069"  # needs pic_id and Catalogid
url2 = "http://search.quanjing.com/search?key=%E7%94%B0%E5%9B%AD%E9%A3%8E%E5%85%89&pageSize=200&pageNum=1&imageType=2&sortType=1&callback=searchresult&_=1513818809900"  # key = the search keyword; the response carries pic_id and catalogid
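In other words: url2 is hit first with the search keyword, its JSONP-style response is scraped for every pic_id/catalogid pair, and each pair is then substituted into url1 to request the actual download. Below is a minimal sketch of that flow; the field names follow the spider further down, but the exact shape of the JSONP body is an assumption, so treat it as illustrative rather than a drop-in client:

# -*- coding: utf-8 -*-
# Sketch only: resolve a search keyword into (pic_id, catalogid) pairs and
# print the download URL for each pair. The imglist entries are assumed to be
# flat JSON objects carrying 'pic_id' and 'catalogid', as the spider below implies.
import json
import re
import requests

SEARCH_URL = ("http://search.quanjing.com/search?key=%s&pageSize=200&pageNum=1"
              "&imageType=2&sortType=1&callback=searchresult&_=1513818809900")
DOWNLOAD_URL = "http://www.quanjing.com/downcomp.aspx?pic_id=%s&Catalogid=%s"

def sketch(keyword):
    content = requests.get(SEARCH_URL % keyword).content
    # pick every flat JSON object that mentions "pic_id" out of the JSONP body
    for entry in re.findall(r'\{[^{}]*?"pic_id"[^{}]*\}', content, re.S):
        item = json.loads(entry)
        print DOWNLOAD_URL % (item['pic_id'], item['catalogid'])

# usage example: sketch('田园风光'), the keyword that url2 above searches for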
Without further ado, here is the full code:
#coding=utf-8
import requests
import json
import re
import sys
import random
import os
import time
import multiprocessing
from multiprocessing import Pool
from time import sleep
import copy_reg
import types

def _pickle_method(m):
    if m.im_self is None:
        return getattr, (m.im_class, m.im_func.func_name)
    else:
        return getattr, (m.im_self, m.im_func.func_name)
copy_reg.pickle(types.MethodType, _pickle_method)  # register MethodType with copy_reg so bound methods can be pickled and handed to the worker processes

reload(sys)
sys.setdefaultencoding('utf-8')  # initialise the default encoding to avoid mojibake
class Picture_spider(object):

    def __init__(self, search_keyword):
        self.search_keyword = search_keyword

    def make_saving_path(self):  # create the directory the pictures are saved into
        key = self.search_keyword.encode('gbk')
        directory_path = 'F:\\test_auto\\spider\\pictures\\%s' % key
        os.mkdir(directory_path)
        return directory_path
    def get_url_source_by_keyword(self):  # turn the keyword into an intermediate dict of pic_id -> catalogid
        keyword = self.search_keyword
        dict_middle_result = {}
        url = "http://search.quanjing.com/search?key=%s&pageSize=200&pageNum=1&imageType=2&sortType=1&callback=searchresult&_=1513818809900" % keyword
        response = requests.get(url)
        content = response.content
        result_1 = re.findall(r'\"pagecount\".*?\d+', content, re.S)
        page_number = str(result_1[0].split(":")[-1])  # total number of result pages, used as pageNum
        for page_num in range(1, int(page_number) + 1):  # walk every result page
            urls = "http://search.quanjing.com/search?key=%s&pageSize=200&pageNum=%s&imageType=2&sortType=1&callback=searchresult&_=1513818809900" % (keyword, page_num)
            try:
                response = requests.get(urls)
                content = response.content
                result_2 = re.findall(r'\"imglist\"\:(.*)', content, re.S)[0]
                result_3 = re.findall(r'\{\"pic.*\"\}', result_2, re.S)[0]  # narrow the response down with regular expressions
                result_4 = result_3.split(',{')
                for j in range(0, len(result_4)):
                    if j == 0:
                        dict = json.loads(result_4[j])
                        dict_middle_result[dict['pic_id']] = dict['catalogid']
                    else:
                        strlog = '{' + result_4[j]
                        dict = json.loads(strlog)
                        dict_middle_result[dict['pic_id']] = dict['catalogid']  # collect the pic_id and catalogid of every hit
            except:
                print "Failed to get picture ids from page %s" % page_num
        return dict_middle_result  # the intermediate result dict
    def get_urls_by_middle_result(self):  # the server generates a one-off id (seiid) for every download request, so this extra step resolves each pic_id/catalogid pair into its real download URL
        url_pool = []
        middle_result_dict = self.get_url_source_by_keyword()
        for key in middle_result_dict:
            pic_id = key
            catalogid = middle_result_dict[key]
            url = "http://www.quanjing.com/downcomp.aspx?pic_id=%s&Catalogid=%s" % (pic_id, catalogid)
            response = requests.get(url)
            content = response.content
            final_url_1 = re.findall(r'document\.location\.href.*?</script>', content, re.S)[0]  # grab the redirect script that carries the real download URL
            final_url_2 = re.findall(r'document.location.href =(.*)\;<{1}', final_url_1, re.S)
            final_url_3 = final_url_2[0]  # pull the right-hand side of the assignment out with regular expressions
            final_url = re.findall(r'\'(.*)\'', final_url_3, re.S)[0]  # the final download URL sits inside the single quotes
            url_pool.append(final_url)
        return url_pool  # the final download URLs make up the URL pool
    def get_picture_by_url(self, url):
        fold_path = 'F:\\test_auto\\spider\\pictures'  # fixed save directory; Pool.map only hands the worker one iterable, so the per-keyword directory can't be created here
        key = self.search_keyword.encode('gbk')
        picture_name = key + str(random.randint(10000, 99999)) + '.jpg'
        picture_path = '%s\\%s' % (fold_path, picture_name)
        if os.path.exists(picture_path):
            picture_name = key + str(random.randint(1000, 9999)) + '.jpg'
            picture_path = '%s\\%s' % (fold_path, picture_name)
        response = requests.get(url)
        content = response.content
        with open(picture_path, 'wb') as fi:
            fi.write(content)
        picture_size = os.path.getsize(picture_path)
        if picture_size < 7000:  # anything under about 7 KB is treated as a broken download
            os.remove(picture_path)
            fail_url = url
            dict = {}
            dict[fail_url] = 'fail'
            return dict
        else:
            print "%s downloaded..." % picture_name
            success_url = url
            dict = {}
            dict[success_url] = 'success'
            return dict
def main():
    keyword = raw_input('Enter the keyword for the pictures to search and download: ')
    spider = Picture_spider(keyword)
    #directory_path = spider.make_saving_path()
    url_pool = spider.get_urls_by_middle_result()
    picture_num = len(url_pool)
    print "The search returned %s pictures, starting download..." % picture_num
    while True:  # multiprocess downloads can leave broken files, and the server sometimes errors out under frequent requests, so loop until everything has downloaded
        if len(url_pool) == 0:
            break
        else:
            p = multiprocessing.Pool(processes=4)  # a pool of 4 worker processes
            res = p.map(spider.get_picture_by_url, url_pool)  # run the downloads
            p.close()
            p.join()
            url_pool = []
            for item in res:
                for key in item:
                    if item[key] == 'fail':
                        url_pool.append(key)  # broken files go back into the URL pool and are downloaded again

if __name__ == '__main__':
    main()
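One step worth spelling out: downcomp.aspx does not return the image itself, it returns a small page whose script redirects the browser to the real file, which is why get_urls_by_middle_result digs the URL out of document.location.href. Assuming the response embeds a snippet of the form <script>document.location.href ='...';</script> (which is what the regexes above imply), the extraction boils down to this sketch:

# -*- coding: utf-8 -*-
# Sketch of the redirect-parsing step. The response shape and the host in
# sample_response are assumptions for illustration, not captured server output.
import re

sample_response = "<html><script>document.location.href ='http://pic.example.com/some-picture.jpg';</script></html>"

snippet = re.findall(r'document\.location\.href.*?</script>', sample_response, re.S)[0]
assignment = re.findall(r'document.location.href =(.*)\;<{1}', snippet, re.S)[0]
final_url = re.findall(r'\'(.*)\'', assignment, re.S)[0]
print final_url  # -> http://pic.example.com/some-picture.jpg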
Not the cleanest code; corrections are welcome…