Python爬虫:scrapy爬取斗鱼直播图片

通过斗鱼给出的api,获取json文件,解析出图片地址,可以获取直播间的图片
斗鱼api接口:

http://open.douyucdn.cn/api/RoomApi/live/{num}

比如:
http://open.douyucdn.cn/api/RoomApi/live/1

当然也可以用这个获取好多妹子的图片,当然也有小哥哥
http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=

项目文件为spider.py、item.py、pipeline.py

spider.py

# -*- coding: utf-8 -*-

# Fetch Douyu live-room snapshot images through the public API and save them locally.

import sys
# NOTE(review): reload(sys) + setdefaultencoding is a Python 2-only hack to
# force UTF-8 as the implicit str/unicode conversion codec; it must be
# removed if this project ever moves to Python 3.
reload(sys)
sys.setdefaultencoding("utf-8")

import scrapy
import os
from douyu_item import DouyuItem
import json

# Image storage location; could also be configured centrally in settings.py
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

IMAGES_STORE = os.path.join(BASE_DIR, "images")

class DouyuSpider(scrapy.Spider):
    """Crawl the Douyu room API page by page and yield one DouyuItem per room.

    Each item carries the room title (``img_name``) and the snapshot image
    URL (``img_link``); the image pipeline downloads and renames the files.
    """
    name = "douyu"
    allowed_domains = ["douyucdn.cn"]

    # Per-spider settings: where images are stored and which pipeline
    # processes the items (overrides settings.py for this spider only).
    custom_settings = {
        "IMAGES_STORE": IMAGES_STORE,
        "ITEM_PIPELINES": {
            "myspider.douyu_spider.douyu_pipeline.DouyuPipeline": 100,
        },
    }

    # Paged JSON API endpoint; `offset` advances by `page_size` per request.
    # Alternative endpoint: http://open.douyucdn.cn/api/RoomApi/live/{num}
    base_url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    page_size = 20  # must match the `limit` query parameter above
    offset = 0
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        """Parse one API page: yield items for every room, then request the next page."""
        # The API wraps the room list under the "data" key.
        datas = json.loads(response.body).get("data")

        # Stop when there is nothing left. `not datas` covers BOTH the empty
        # page ([]) and a missing "data" key (None) -- the original
        # `datas == []` comparison let None fall through and crashed on the
        # for-loop below with a TypeError.
        if not datas:
            self.logger.info("已经没有图片啦")
            return

        for data in datas:
            item = DouyuItem()
            item["img_name"] = data.get("room_name")
            item["img_link"] = data.get("room_src")
            yield item

        # Continue with the next page.
        self.offset += self.page_size
        yield scrapy.Request(self.base_url + str(self.offset))

item.py

# -*- coding: utf-8 -*-

import scrapy

class DouyuItem(scrapy.Item):
    """Item holding one live room's display name and its snapshot image URL."""
    img_name = scrapy.Field()  # room title; the pipeline uses it as the saved filename
    img_link = scrapy.Field()  # snapshot (room_src) URL to download

pipeline.py

# -*- coding: utf-8 -*-


import scrapy
import os
from scrapy.pipelines.images import ImagesPipeline
from myspider.douyu_spider.douyu_spider import images_path

class DouyuPipeline(ImagesPipeline):
    """Download each item's snapshot image, then rename it to the room title."""

    def get_media_requests(self, item, info):
        """Yield a download request for the item's image link, if any."""
        img_link = item.get("img_link")
        # room_src can legitimately be missing/empty; skip instead of
        # yielding a Request with an invalid URL.
        if img_link:
            yield scrapy.Request(img_link)

    def item_completed(self, results, item, info):
        """Rename the downloaded image from its hash path to the room name.

        `results` is a list of (success, info) tuples, e.g.:
        [(True,
        {'url': 'https://rpic.douyucdn.cn/amrpic-180422/4475021_1048.jpg',
        'path': 'full/aa6df7582a33bbe025ec0e3ebd21ff133aa56b36.jpg',
        'checksum': 'bc89354a577ee6cf22a7d065859bc990'})]
        """
        image_paths = [data["path"] for ok, data in results if ok]
        # Every download failed: nothing to rename. The original indexed
        # image_path[0] unconditionally and raised IndexError here.
        if not image_paths:
            return item

        # Room titles are arbitrary user text and may contain path
        # separators; strip them so the rename target cannot escape the
        # images directory.
        safe_name = item["img_name"].replace("/", "_").replace("\\", "_")

        old_path = os.path.join(images_path, image_paths[0])
        new_path = os.path.join(images_path, safe_name + ".jpg")

        try:
            os.rename(old_path, new_path)
        except OSError:
            # Duplicate room titles or an already-moved file should not
            # abort the whole crawl; keep the hash-named file instead.
            pass
        return item

你可能感兴趣的:(scrapy)