While crawling content from Douyu, analysing the page shows that the URL does not change when you switch pages, so the real request URL can be captured with Fiddler.
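Before writing the spider, it helps to confirm what that endpoint actually returns. Below is a minimal sketch using requests (the User-Agent header is only a precaution and an assumption; the field names rl, rs1 and nn are the ones the spider below relies on):

import json
import requests

# quick check of the directory endpoint captured with Fiddler; the trailing number is the page index
url = 'https://www.douyu.com/gapi/rkc/directory/0_0/0'
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
data = json.loads(resp.text)

# each entry in data['data']['rl'] describes one live room:
# 'rs1' is the cover image URL and 'nn' is the anchor's nickname
for room in data['data']['rl'][:3]:
    print(room['nn'], room['rs1'])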
import scrapy
import json
from Douyu.items import DouyuItem
class DouyuSpider(scrapy.Spider):
    name = 'douyu'
    allowed_domains = ['douyu.com']
    offset = 0
    url = 'https://www.douyu.com/gapi/rkc/directory/0_0/'
    # 'https://www.douyu.com/directory/all/gapi/rkc/directory/0_0/' is the real request behind the page jumps
    # direct JSON-format link: https://www.douyu.com/gapi/rkc/directory/0_0/0
    start_urls = ['https://www.douyu.com/gapi/rkc/directory/0_0/0']  # this URL returns JSON

    def parse(self, response):
        json_text = json.loads(response.text)  # parse the JSON string into Python data
        total_data = len(json_text['data']['rl'])
        for i in range(total_data):
            item = DouyuItem()
            item['img'] = json_text['data']['rl'][i]['rs1']   # cover image URL
            item['name'] = json_text['data']['rl'][i]['nn']   # anchor nickname
            yield item
        # keep requesting the next page until 100 pages have been crawled
        if self.offset < 100:
            self.offset += 1
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
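The spider imports DouyuItem from Douyu/items.py; that file is not shown here, so the following is a minimal sketch consistent with the fields used above and in the pipeline below:

# Douyu/items.py
import scrapy

class DouyuItem(scrapy.Item):
    name = scrapy.Field()       # anchor nickname ('nn' in the JSON)
    img = scrapy.Field()        # cover image URL ('rs1' in the JSON)
    imagePath = scrapy.Field()  # local path filled in by the image pipeline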
"""--------------pipline中------------------------------"""
"""实现将图片下载到指定位置""" class ImagesPipeline(ImagesPipeline): IMAGES_STORE = get_project_settings().get("IMAGES_STORE") def get_media_requests(self, item, info): image_url = item["img"] yield scrapy.Request(image_url) def item_completed(self, result, item, info): image_path = [x["path"] for ok, x in result if ok] os.rename(self.IMAGES_STORE + image_path[0], self.IMAGES_STORE + item["name"] + ".jpg") item["imagePath"] = self.IMAGES_STORE + item["name"] return item
You can read the source code in /site-packages/scrapy/pipelines/images.py, or check the official documentation, both of which explain this in detail.
URL: https://doc.scrapy.org/en/latest/topics/media-pipeline.html
IMAGES_STORE is set in settings.py. Mind the path separators: Windows normally uses \ while Linux uses /. Mine on Windows is
IMAGES_STORE = 'F:/linux-scrapy/Douyu/image/'
and on Linux it is /home/chase/python3-crawling/Douyu/image; the exact location is up to you.
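The custom pipeline also has to be registered in settings.py, and Scrapy's ImagesPipeline needs the Pillow library installed to process images. A minimal sketch, assuming the pipeline class above lives in Douyu/pipelines.py under the name DouyuImagesPipeline:

# Douyu/settings.py (only the parts relevant to the image pipeline)
ITEM_PIPELINES = {
    'Douyu.pipelines.DouyuImagesPipeline': 300,
}
# folder where the downloaded images are stored (see the paths above)
IMAGES_STORE = 'F:/linux-scrapy/Douyu/image/'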