Page analysis
Inspecting the page source and the Network panel shows that the card data comes back as JSON in response to a POST request.
The request payload is:
cost:                 # mana cost (empty = all costs)
cardClass: hunter     # card class (hero class)
keywords:
standard: 1
t: 1576286199445      # timestamp
cardSet:              # card set (expansion)
p: 1                  # page number (8 cards per page)
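Note that the captured t is a 13-digit millisecond timestamp; the script below sends int(time.time()) (seconds) and evidently still works, so the server does not appear to be strict about it. To verify the analysis you can replay the request by hand. A minimal sketch, assuming the endpoint still accepts these form fields:

import json
import time
import urllib.parse
import urllib.request

# Rebuild the captured POST request; t mimics the millisecond timestamp
payload = urllib.parse.urlencode({
    'cost': '',
    'cardClass': 'hunter',
    'keywords': '',
    'standard': '1',
    't': int(time.time() * 1000),
    'cardSet': '',
    'p': 1,
}).encode('utf-8')
req = urllib.request.Request('https://hs.blizzard.cn/action/cards/query', data=payload)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read().decode('utf-8')))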
Import all required packages
import urllib.request
import urllib.parse
import jsonpath
import json
import os
import time
Request and parse the JSON
cardClasses = ["druid", "hunter", "mage", "paladin", "priest", "rogue", "shaman", "warlock", "warrior", "neutral"]
lushi_urls = 'https://hs.blizzard.cn/action/cards/query'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
for cardClass in cardClasses:
    if not os.path.exists(cardClass):
        os.mkdir(cardClass)
    print("Start crawling {}".format(cardClass))
    for p in range(1, 60):
        print(p)
        data = {
            'cost': '',
            'cardClass': cardClass,
            'keywords': '',
            'standard': '1',
            't': int(time.time()),
            'cardSet': '',
            'p': p
        }
        data = urllib.parse.urlencode(data).encode("utf-8")
        request = urllib.request.Request(url=lushi_urls, headers=headers, data=data)
        response = urllib.request.urlopen(request)
        content = response.read().decode('utf-8')
        jsondict = json.loads(content)
        # Pull all card names and picture URLs out of the JSON response
        card_names = jsonpath.jsonpath(jsondict, '$..cards..name')
        card_pics = jsonpath.jsonpath(jsondict, '$..cards..pic')
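jsonpath.jsonpath collects every match of the path expression into a flat list, which is why a single call pulls the names of all cards on the page; when nothing matches it returns False rather than an empty list. A quick illustration with a made-up response shape (the real response may contain more fields):

import jsonpath

sample = {'cards': [{'name': 'Arcane Shot', 'pic': 'https://example.com/a.png'},
                    {'name': 'Tracking', 'pic': 'https://example.com/b.png'}]}
print(jsonpath.jsonpath(sample, '$..cards..name'))  # ['Arcane Shot', 'Tracking']
print(jsonpath.jsonpath(sample, '$..missing'))      # False, not []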
Create directories and save the files
        for i in range(len(card_names)):
            card_path = cardClass + "/" + card_names[i] + ".png"
            urllib.request.urlretrieve(url=card_pics[i], filename=card_path)
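The card name goes straight into the file path, so a name containing a slash or another character that is illegal in file names makes urlretrieve fail (in the complete script below, the except clause then silently skips that card). A small hypothetical helper, safe_filename, that cleans the name first:

import re

def safe_filename(name):
    # Replace characters that are illegal on common file systems
    return re.sub(r'[\\/:*?"<>|]', '_', name)

card_path = cardClass + "/" + safe_filename(card_names[i]) + ".png"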
Complete code
import urllib.request
import urllib.parse
import jsonpath
import json
import os
import time
print("努力成为爬虫大神")
timestart=time.time()
cardClasses = ["druid", "hunter", "mage", "paladin", "priest", "rogue", "shaman", "warlock", "warrior", "neutral"]
lushi_urls = 'https://hs.blizzard.cn/action/cards/query'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
for cardClass in cardClasses:
    if not os.path.exists(cardClass):
        os.mkdir(cardClass)
    print("Start crawling {}".format(cardClass))
    for p in range(1, 60):
        print(p)
        try:
            data = {
                'cost': '',
                'cardClass': cardClass,
                'keywords': '',
                'standard': '1',
                't': int(time.time()),
                'cardSet': '',
                'p': p
            }
            data = urllib.parse.urlencode(data).encode("utf-8")
            request = urllib.request.Request(url=lushi_urls, headers=headers, data=data)
            response = urllib.request.urlopen(request)
            content = response.read().decode('utf-8')
            jsondict = json.loads(content)
            card_names = jsonpath.jsonpath(jsondict, '$..cards..name')
            card_pics = jsonpath.jsonpath(jsondict, '$..cards..pic')
            if not card_names:  # jsonpath returns False when the page has no cards
                break
            for i in range(len(card_names)):
                card_path = cardClass + "/" + card_names[i] + ".png"
                urllib.request.urlretrieve(url=card_pics[i], filename=card_path)
        except Exception:
            continue
timeend = time.time()
print("Total time: {} seconds".format(timeend - timestart))
Another example: scraping a list page with requests and BeautifulSoup

import os
import time

import requests
from bs4 import BeautifulSoup

# Send the request
def send():
    r = requests.get(url=base_url)
    # Set the encoding to avoid garbled text
    r.encoding = "GBK"
    content = r.text
    parseAndSave(content)

# Parse the page and save the data
def parseAndSave(html):
    soup = BeautifulSoup(html, 'lxml')
    ulList = soup.find_all('ul', attrs={'class': 'kzlist'})
    for ul in ulList:
        li = ul.find_all('li')
        for item in li:
            name = item.find('img').next_sibling
            obtain_method = item.find('a').find('p').text
            rootDir = os.getcwd()
            if not os.path.exists(name):
                os.mkdir(name)
            os.chdir(name)
            src = item.find('a').find('img')['src']
            pic = requests.get(src)
            # Save the picture and the accompanying info text into the item's directory
            with open('pic.jpg', 'wb') as fw:
                fw.write(pic.content)
            with open('info.txt', 'a+') as fw:
                fw.write(name + '\n')
                fw.write(obtain_method)
            os.chdir(rootDir)

def main():
    start_time = time.time()
    send()
    end_time = time.time()
    print('Program time:', (end_time - start_time))

if __name__ == '__main__':
    cardList = []
    main()
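One caveat: base_url is used in send() but never defined in the snippet, so it has to be set at module level before main() runs. A hypothetical placeholder (the original does not show the real address):

base_url = 'https://example.com/list.shtml'  # hypothetical URL; substitute the actual page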