Python 爬虫:爬取多页斗鱼颜值图片并下载到本地

项目结构
python爬虫:爬取多页斗鱼颜值图片 并且下载到本地_第1张图片

main.py
运行 Scrapy 爬虫的入口脚本(等价于在命令行执行 scrapy crawl douyu_scrapy)

from scrapy import cmdline

# Launch the "douyu_scrapy" spider, passing the same argv that the shell
# command `scrapy crawl douyu_scrapy` would produce.
cmdline.execute(["scrapy", "crawl", "douyu_scrapy"])

douyu_scrapy.py
爬虫代码

import scrapy
import json
from douyu.items import DouyuItem

class DouyuScrapySpider(scrapy.Spider):
    """Spider that walks Douyu's ``yzRec`` listing pages and emits one item per room.

    Each listing page is a JSON document; the entries live under
    ``data -> rl`` and every entry provides ``nn`` (name) and ``rs1``
    (image address).
    """

    name = 'douyu_scrapy'
    # allowed_domains = ['www.douyu.com']
    start_urls = ['https://www.douyu.com/gapi/rknc/directory/yzRec/1']
    # Counter of listing pages handled so far; the start URL is page 1.
    offset = 1
    # Paging stops as soon as ``offset`` reaches this value, so with the
    # default of 4 the pages 1, 2 and 3 are crawled.
    page_limit = 4

    def parse(self, response):
        """Yield a ``DouyuItem`` per listing entry, then request the next page.

        Args:
            response: Response for one listing page; its body must be JSON
                containing ``data.rl`` and its URL must end in the page number.

        Yields:
            ``DouyuItem`` objects, followed by at most one ``scrapy.Request``
            for the next listing page.
        """
        # Name and image address both sit under data -> rl in the JSON payload.
        data_list = json.loads(response.body)["data"]["rl"]
        for data in data_list:
            yield DouyuItem(nn=data["nn"], img_url=data["rs1"])

        # Multi-page crawl: bump the counter and re-enter parse() for the
        # next page until the limit is reached.
        self.offset += 1
        if self.offset < self.page_limit:
            # Read the current page number from the URL itself rather than
            # from the textual representation of the response object.
            current_page = int(response.url.rsplit("/", 1)[-1])
            url = "https://www.douyu.com/gapi/rknc/directory/yzRec/" + str(current_page + 1)
            yield scrapy.Request(url=url, callback=self.parse, encoding="utf-8", dont_filter=True)

items.py

import scrapy


class DouyuItem(scrapy.Item):
    """Fields collected for a single Douyu listing entry."""

    # Streamer name.
    nn = scrapy.Field()
    # Live-room cover image.
    img_url = scrapy.Field()

pipelines.py
利用 Scrapy 内置的 ImagesPipeline(图片管道,而非中间件)下载图片

from scrapy.pipelines.images import ImagesPipeline
import os,scrapy

class DouyuPipeline(ImagesPipeline):
    """Image pipeline that downloads each item's cover and saves it as ``<nn>.jpg``."""

    # Path separators plus the characters Windows rejects in file names.
    # The streamer name comes from the remote site, so it must not be able
    # to steer the file outside the image store or break file creation.
    _UNSAFE_CHARS = '\\/:*?"<>|'

    @staticmethod
    def _safe_name(name):
        """Return ``name`` with every unsafe file-name character replaced by ``_``."""
        return "".join(
            "_" if ch in DouyuPipeline._UNSAFE_CHARS else ch for ch in name
        )

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image.

        The streamer name travels in ``meta["image_name"]`` so that
        ``file_path`` can use it to name the stored file.
        """
        image_link = item["img_url"]
        image_name = item['nn']
        yield scrapy.Request(image_link, meta={"image_name": image_name})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the image: the sanitized name plus ``.jpg``."""
        image_name = request.meta['image_name']
        return self._safe_name(image_name) + ".jpg"

settings.py
项目配置:注册 DouyuPipeline,并通过 IMAGES_STORE 指定图片保存目录

# --- Project identity and spider discovery ---
BOT_NAME = "douyu"
NEWSPIDER_MODULE = "douyu.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# --- Crawl behaviour ---
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"

# --- Headers attached to every request ---
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.90 Safari/537.36 "
    ),
}

# --- Item pipeline and the directory downloaded images are written to ---
ITEM_PIPELINES = {"douyu.pipelines.DouyuPipeline": 300}
IMAGES_STORE = "download"

python爬虫:爬取多页斗鱼颜值图片 并且下载到本地_第2张图片

你可能感兴趣的:(python爬虫)