# Source code follows.
import requests
import re
import os
from lxml import etree
def _fetch_tree(url, headers, timeout=10):
    """Download *url* and return its body parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers sent with the request (carries the User-Agent).
        timeout: Seconds to wait for the server before giving up; without
            it a stalled connection would block the crawl indefinitely.

    Returns:
        The value of ``etree.HTML`` applied to the decoded response text.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Force GBK decoding; the original author set this to avoid garbled text.
    response.encoding = 'gbk'
    return etree.HTML(response.text)


def main():
    """Crawl the hero list page and print each hero's story and skills.

    For every entry of the hero list, the hero's detail page is fetched
    once; its story paragraphs are printed as ``name : [texts]`` and each
    skill is printed as a list of skill-name and skill-description texts.
    """
    # Working folder. The original note says it is meant for skin images,
    # inscriptions, item builds, etc.; nothing is written to it yet.
    os.makedirs('./wz', exist_ok=True)

    # Site to crawl.
    url = 'https://pvp.qq.com/web201605/herolist.shtml'
    # Browser-like User-Agent so the request looks like a normal visit.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/85.0.4183.102 Safari/537.36',
    }

    # Fetch and parse the hero list, then locate one <li> per hero.
    tree = _fetch_tree(url, headers)
    list_w = tree.xpath('//ul[@class="herolist clearfix"]/li')

    for li in list_w:
        names = li.xpath('./a/text()')
        hrefs = li.xpath('./a/@href')
        if not names or not hrefs:
            # Entry without a name or link: skip it instead of failing
            # with IndexError on the [0] lookups below.
            continue

        # Hero name and the address of that hero's detail page.
        cover_name = names[0]
        w = 'https://pvp.qq.com/web201605/' + hrefs[0]

        # Parse the detail page once; both queries below share the tree.
        hero_tree = _fetch_tree(w, headers)

        # Hero story: text of the paragraphs inside each pop-up body.
        for li_story in hero_tree.xpath('//div[@class="pop-bd"]'):
            story = li_story.xpath('./p/text()')
            print(cover_name, ':', story)

        # Hero skills: skill name texts followed by description texts.
        for li_skill in hero_tree.xpath('//div[@class="skill-show"]/div'):
            skill = (
                li_skill.xpath('./p[@class="skill-name"]/b/text()')
                + li_skill.xpath('./p[@class="skill-desc"]/text()')
            )
            print(skill)


if __name__ == '__main__':
    main()