爬虫流程:优先假设目标数据是JSON数据(数据分为JSON数据和HTML数据两类)。JSON数据的特点:1. 属于异步数据(即先返回HTML,再由页面触发的JSON请求返回目标数据),因此目标数据不在HTML源码中;2. 抓包时不能刷新网页,只能直接翻页来触发请求。
测试链接:https://live.huya.com/
源代码:
import json
import os

import requests


class Two(object):
    """Download screenshot images listed by the Huya live-list JSON endpoint.

    The crawl is a chain of small steps, each calling the next:
    ``confrim_params`` -> ``request_start_url`` -> ``parse_response`` ->
    ``request_link`` -> ``create_dir`` -> ``save_data``.
    """

    # Directory (relative to the current working directory) that receives the
    # images. Directory creation and file writing both use this one value, so
    # they can never point at different places.
    SAVE_DIR = '虎牙'

    # Number of result pages requested by ``confrim_params`` (pages 1..7).
    PAGE_COUNT = 7

    # Seconds to wait on each HTTP request; without a timeout ``requests``
    # may block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    # Characters removed from a nickname before it is used as a file name:
    # '.' and '/' plus the characters Windows forbids in file names.
    _FILENAME_STRIP = str.maketrans('', '', './\\:*?"<>|')

    def __init__(self):
        """Initialise the image counter, the endpoint URL and the headers."""
        self.no = 1  # 1-based running index printed after each saved image
        self.start_url = 'https://live.huya.com/liveHttpUI/getLiveList?'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
        }

    # The query parameters are not built in __init__(): __init__() runs only
    # once, whereas the parameters change for every page.
    def confrim_params(self):
        """Build the query parameters for each page and request that page."""
        for page in range(1, self.PAGE_COUNT + 1):
            params = {
                'iGid': '1663',  # presumably the category id -- TODO confirm
                'iPageNo': str(page),  # page number is sent as a string
                'iPageSize': '120',
            }
            self.request_start_url(params)

    def request_start_url(self, params):
        """Request one page of the list endpoint and hand the JSON to the parser.

        Args:
            params: Query-string parameters for the request.
        """
        # Response.json() decodes the JSON body into Python objects directly.
        response = requests.get(
            self.start_url,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        ).json()
        self.parse_response(response)

    def parse_response(self, response):
        """Extract a file name and an image link from every entry of ``vList``.

        Args:
            response: Decoded JSON object; must contain a ``'vList'`` list
                whose items have ``'sNick'`` and ``'sScreenshot'`` keys.
        """
        for data in response['vList']:
            # The nickname becomes the file name, so strip characters that
            # would be read as path separators or rejected by the file system.
            name = data['sNick'].translate(self._FILENAME_STRIP)
            link = data['sScreenshot']
            self.request_link(name, link)

    def request_link(self, name, link):
        """Download the image bytes at ``link`` and pass them on for saving.

        Args:
            name: Sanitised file name (without extension).
            link: URL of the image.
        """
        img_data = requests.get(
            link, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        ).content
        self.create_dir(name, img_data)

    def create_dir(self, name, img_data):
        """Ensure the output directory exists, then save the image.

        Args:
            name: Sanitised file name (without extension).
            img_data: Raw image bytes.
        """
        # Must be the same directory save_data() writes into; exist_ok avoids
        # a separate existence check.
        os.makedirs(self.SAVE_DIR, exist_ok=True)
        self.save_data(name, img_data)

    def save_data(self, name, img_data):
        """Write the image to ``SAVE_DIR/<name>.jpg`` and report progress.

        Args:
            name: Sanitised file name (without extension).
            img_data: Raw image bytes.
        """
        with open(os.path.join(self.SAVE_DIR, f'{name}.jpg'), 'wb') as f:
            f.write(img_data)
        print('ok 第{}张--{}'.format(self.no, name))
        self.no += 1

    def main(self):
        """Entry point: run the whole crawl."""
        self.confrim_params()


if __name__ == '__main__':
    t = Two()
    t.main()