首先设置UA池
def UserAgent():
    """Return the pool of browser User-Agent strings used for HTTP requests.

    A new list object is built on every call, so a caller may mutate the
    result without affecting later calls.

    Returns:
        list[str]: 20 desktop-browser User-Agent header values.
    """
    # Named ``user_agents`` rather than ``list`` so the builtin stays usable.
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    ]
    return user_agents
使用进程池并发调用,将每个英雄的皮肤图片和技能信息保存至对应的文件夹
from fake_useragent import *
import requests
import json
import random
import re
import os
from multiprocessing import Pool
# Build the User-Agent pool and pick one entry at random. The choice is made
# once, when this module-level code runs, and the resulting ``headers`` dict is
# what every requests.get call in this file passes along.
# NOTE(review): random.choice needs UserAgent() to return a sequence; the
# star import from fake_useragent above may rebind the name ``UserAgent`` —
# confirm which definition is actually in effect here.
UA = UserAgent()
headers = {'User-Agent':random.choice(UA)}
def get_hero_list():
    """Fetch the basic information of every hero as parsed JSON.

    Returns:
        The decoded JSON payload of the hero-list endpoint, or ``None`` when
        the request fails or the server answers with a non-200 status.
    """
    url = 'http://pvp.qq.com/web201605/js/herolist.json'
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Transport errors take the same failure path as a bad status code.
        print('爬取失败')
        return None
    if response.status_code != 200:
        print('爬取失败')
        return None
    return json.loads(response.text)
def get_hero_html(info):
    """Download every skin image of one hero, then save its skill details.

    Images are written to ``./images/<cname>/<skin name>.jpg``. A skin whose
    download fails is reported on stdout and skipped; the remaining skins are
    still processed.

    Args:
        info: One hero entry. It must provide the keys ``ename`` (hero code),
            ``cname`` (hero name) and ``skin_name`` ('|'-separated skin names).
    """
    # Split the hero code, name and skin names out of the entry.
    ename = info['ename']
    cname = info['cname']
    skin_name_list = info['skin_name'].split('|')

    # Create the hero's folder before the loop so that every skin, including
    # the first one, is written. Creating it lazily inside the loop and only
    # writing when it already existed dropped the first image of each hero.
    hero_dir = './images/{}'.format(cname)
    os.makedirs(hero_dir, exist_ok=True)

    # The image URL takes the hero code and a 1-based skin number.
    for i, skin_name in enumerate(skin_name_list, start=1):
        url = 'http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg'.format(ename, ename, i)
        try:
            # Bound the wait so one stalled download cannot hang the worker.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            print(cname, skin_name, '图片爬取失败')
            continue
        if response.status_code != 200:
            print(cname, skin_name, '图片爬取失败')
            continue
        # response.content is the raw image bytes.
        with open('{}/{}.jpg'.format(hero_dir, skin_name), 'wb') as f:
            f.write(response.content)

    detail_hero_info(cname, ename)
def detail_hero_info(cname, ename):
    """Fetch one hero's detail page and pass its HTML to ``save_to_info``.

    Args:
        cname: Hero name; forwarded to ``save_to_info``.
        ename: Hero code used to build the detail-page URL.

    Returns:
        None. On a request error or a non-200 status a failure message is
        printed and nothing is saved.
    """
    url = 'http://pvp.qq.com/web201605/herodetail/{}.shtml'.format(ename)
    try:
        # Bound the wait so one stalled page cannot hang the worker.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        print(cname, '技能爬取失败')
        return None
    # The character encoding must be set before reading ``response.text``.
    response.encoding = 'GBK'
    if response.status_code != 200:
        print(cname, '技能爬取失败')
        return None
    # Hand the decoded page text to the function that saves it.
    save_to_info(response.text, cname)
    return None
# Extract skill entries from a hero detail page with a regular expression and
# append each one, serialized as a JSON object, to ./images/<cname>/技能.txt.
#   html:  text of the hero detail page.
#   cname: hero name; selects the output folder and appears in the
#          completion message.
def save_to_info(html,cname):
# Match the target fields with a regex, build a dict per match and write it out.
# NOTE(review): the pattern literal below spans three physical lines inside a
# single-quoted string and has no literal text between its capture groups. It
# looks as if the HTML markup that anchored each group was stripped from it —
# restore the intended pattern from the page source before relying on this.
pattern =re.compile('([\s\S]*?)([\s\S]*?)([\s\S]*?)
\s+([\s\S]*?)
\s+([\s\S]*?)')
items = re.findall(pattern,html)
# Drop the last match when its first group is empty.
# NOTE(review): items[-1] raises IndexError when findall returns no matches.
if not items[-1][0]:
items = items[:-1]
for item in items:
result = {
'技能名称':item[0],
# The [4:] and [3:] slices drop leading characters from the captured text —
# presumably a field label in front of the value; confirm against the page.
'冷却值':item[1][4:],
'消耗':item[2][3:],
'技能介绍':item[3],
'技能详解':item[4],
}
# ensure_ascii=False keeps non-ASCII characters readable in the output file.
targ = json.dumps(result,ensure_ascii=False)
try:
# Append mode: entries accumulate in the file, separated by a blank line.
with open('./images/{}/技能.txt'.format(cname),'a',encoding='utf-8') as f:
f.write(targ+'\n\n')
except Exception:
raise
else:
print(cname + '技能收录完毕')
def main():
    """Fetch the hero list and process every hero in a pool of worker processes.

    Does nothing when the hero list could not be fetched or is empty.
    """
    heroes = get_hero_list()
    # get_hero_list returns None on failure; pool.map cannot iterate None.
    if not heroes:
        return
    # Fan the heroes out over a process pool.
    pool = Pool()
    try:
        pool.map(get_hero_html, heroes)
    finally:
        # Always release the worker processes, even when a worker raised.
        pool.close()
        pool.join()
# Start the crawl only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()
得到的结果: