Scraping Qidian novel categories with the Scrapy framework

Spider code

import scrapy

class QidianSpider(scrapy.Spider):
    name = 'qidian'
    allowed_domains = ['qidian.com']
    start_urls = ['https://www.qidian.com/all?orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0']

    def parse(self, response):
        # top-level category groups; [1:] skips the first tab
        li_list = response.xpath("//ul[@type='category']/li")[1:]
        for li in li_list:
            item = {}
            item["first_category_title"] = li.xpath("./a/text()").extract_first()
            item["first_category_url"] = "https:" + li.xpath("./a/@href").extract_first()
            # follow each top-level category page to collect its sub-categories
            yield scrapy.Request(
                url=item["first_category_url"],
                callback=self.parse_first_category,
                meta={"item": item}
            )


    def parse_first_category(self, response):
        dd_list = response.xpath("//div[@class='sub-type']/dl[@class='']/dd")
        first_category_title = response.meta["item"]["first_category_title"]
        for dd in dd_list:
            item = {}
            item["first_category_title"] = first_category_title
            item["second_category_title"] = dd.xpath("./a/text()").extract_first()
            item["second_category_url"] = "https:" + dd.xpath("./a/@href").extract_first()
            yield scrapy.Request(
                url=item["second_category_url"],
                callback=self.parse_second_category,
                meta={"item": item}
            )


    def parse_second_category(self, response):
        first_category_title = response.meta["item"]["first_category_title"]
        second_category_title = response.meta["item"]["second_category_title"]
        li_list = response.xpath("//ul[@class='all-img-list cf']/li")

        for li in li_list:
            item = {}
            item["first_category_title"] = first_category_title
            item["second_category_title"] = second_category_title
            item["book_name"] = li.xpath(".//h4/a/text()").extract_first()
            item["author_name"] = li.xpath(".//p[@class='author']/a[@class='name']/text()").extract_first()
            item["is_end"] = li.xpath(".//span/text()").extract_first()
            item["info"] = li.xpath(".//p[@class='intro']/text()").extract_first().strip()
            item["book_poster_src"] = "http:" + li.xpath(".//div[@class='book-img-box']/a/img/@src").extract_first()
            yield item

        next_url = "https:" + response.xpath("//a[contains(text(), '>')]/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(url=next_url, callback=self.parse_second_category)
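
Before this spider will run, the pipeline has to be wired up in the project settings. Below is a minimal settings.py sketch; the project name p4 is inferred from the pipeline's import further down, and the concrete values are illustrative assumptions, not the original file:

BOT_NAME = "p4"
SPIDER_MODULES = ["p4.spiders"]
NEWSPIDER_MODULE = "p4.spiders"

# a browser-like User-Agent; the pipeline below imports this constant
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"

# assumption: crawling these listing pages may be disallowed by robots.txt
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    "p4.pipelines.P4Pipeline": 300,  # enable the JSON/poster pipeline below
}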

Pipelines code

import json
import os
import requests
from p4.settings import USER_AGENT

class P4Pipeline(object):

    def open_spider(self, spider):
        os.makedirs("./data/img", exist_ok=True)  # make sure the output directories exist
        self.fp = open("./data/book.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        json.dump(item, self.fp, ensure_ascii=False)
        self.fp.write("\n")
        self.save_poster(item["book_poster_src"], item["book_name"])
        print(item["book_name"] + " saved to local file successfully")
        return item

    def close_spider(self, spider):
        self.fp.close()

    def save_poster(self, url, title):
        file_name = "./data/img/" + title + ".jpg"
        with open(file_name, "wb") as f:
            # fetch the cover image, reusing the crawler's User-Agent
            f.write(requests.get(url, headers={"User-Agent": USER_AGENT}).content)
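
Calling requests.get() inside a pipeline downloads each poster synchronously, blocking Scrapy's own downloader. A non-blocking alternative is Scrapy's built-in ImagesPipeline. The sketch below is only an illustration (the class name PosterPipeline is made up); it additionally requires Pillow and an IMAGES_STORE setting such as IMAGES_STORE = "./data/img":

import scrapy
from scrapy.pipelines.images import ImagesPipeline

class PosterPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # schedule the poster download through Scrapy's own scheduler
        yield scrapy.Request(item["book_poster_src"],
                             meta={"book_name": item["book_name"]})

    def file_path(self, request, response=None, info=None, *, item=None):
        # store as <book name>.jpg instead of the default SHA1 file name
        return request.meta["book_name"] + ".jpg"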

Main method

from scrapy import cmdline

cmdline.execute("scrapy crawl qidian".split())
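
With the pipeline above, data/book.json ends up with one JSON object per line. The line below is a made-up illustration of the shape of a record, not real crawl output:

{"first_category_title": "玄幻", "second_category_title": "东方玄幻", "book_name": "示例书名", "author_name": "示例作者", "is_end": "连载", "info": "示例简介", "book_poster_src": "https://.../cover.jpg"}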

[screenshot 1]
[screenshot 2]
The crawl results are as follows:
[screenshot 3: crawl results]
