京东书籍爬取

# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
import json
import urllib
class BookjdSpider(scrapy.Spider):
    """Crawl JD.com book categories, their listing pages and per-book prices.

    Request flow: ``parse`` (category index page) -> ``book_list``
    (paginated category listing) -> ``pare_book_price`` (price endpoint),
    which emits the finished item. Items are plain dicts that are handed
    from callback to callback through ``Request.meta["item"]``.
    """

    name = 'bookjd'
    # The price endpoint lives on p.3.cn, so that domain must be allowed too.
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']

    def parse(self, response):
        """Walk the category index and schedule one request per sub-category.

        Each top-level category is a ``<dt>``; its sub-categories are the
        ``<em>`` children of the ``<dd>`` that immediately follows it. (An
        equivalent approach is to select the ``dt`` and ``dd`` lists
        separately and ``zip`` them.)

        Yields:
            scrapy.Request: a listing-page request carrying a dict with the
            keys ``dafenlei``, ``fenlei`` and ``href`` in ``meta["item"]``.
        """
        for dt in response.xpath("//div[@class='mc']/dl/dt"):
            dafenlei = dt.xpath("./a/text()").extract_first()
            for em in dt.xpath("./following-sibling::dd[1]/em"):
                href = em.xpath("./a/@href").extract_first()
                if href is None:
                    # No link to follow; building a Request from None would
                    # raise and abort the remaining categories.
                    continue
                # Links on the page are protocol-relative, e.g.
                # //list.jd.com/1713-3258-3297.html; urljoin supplies the
                # scheme from the current page.
                item = {
                    "dafenlei": dafenlei,
                    "fenlei": em.xpath("./a/text()").extract_first(),
                    "href": response.urljoin(href),
                }
                self.logger.debug("category: %s", item)
                yield scrapy.Request(
                    item["href"],
                    callback=self.book_list,
                    meta={"item": item},
                )

    def book_list(self, response):
        """Extract every book on a listing page and follow pagination.

        Yields:
            scrapy.Request: one price request per book that has a SKU, plus
            one request for the next listing page when there is one.
            dict: the book item itself (``book_price`` set to ``None``) when
            the entry has no SKU and therefore no price can be requested.
        """
        category = response.meta["item"]
        for li in response.xpath("//div[@id='plist']/ul/li"):
            # Work on a private copy so the category dict stays free of book
            # fields and concurrent price requests never share state.
            book = deepcopy(category)

            title = li.xpath(".//div[@class='p-name']/a/em/text()").extract_first()
            book["book_title"] = title.strip() if title is not None else None

            img = li.xpath(".//div[@class='p-img']/a/img/@src").extract_first()
            if img is None:
                # Fall back to the lazy-load attribute when src is absent.
                img = li.xpath(".//div[@class='p-img']/a/img/@data-lazy-img").extract_first()
            book["book_img"] = response.urljoin(img) if img is not None else None

            book["book_href"] = li.xpath(".//div[@class='p-name']/a/@href").extract_first()
            book["book_publish"] = li.xpath(".//span[@class='p-bi-store']/a/text()").extract_first()
            sku = li.xpath("./div/@data-sku").extract_first()
            book["book_sku"] = sku
            self.logger.debug("book: %s", book)

            if sku is None:
                # Without a SKU the price URL would be meaningless (J_None).
                book["book_price"] = None
                yield book
                continue

            # The browser requests prices for many SKUs at once
            # (skuIds=J_11757834%2CJ_12090377%2C...) with extra query
            # parameters; those extras can be dropped, and here a single
            # SKU is requested per book.
            yield scrapy.Request(
                "https://p.3.cn/prices/mgets?skuIds=J_{}".format(sku),
                callback=self.pare_book_price,
                meta={"item": book},
            )

        next_url = response.xpath("//a[@class='pn-next']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.book_list,
                meta={"item": category},
            )

    def pare_book_price(self, response):
        """Attach the price to the book item and emit it.

        The response body is parsed as JSON and the ``op`` field of its
        first entry is stored as ``book_price``. If the payload cannot be
        parsed or does not have that shape, ``book_price`` is set to ``None``
        and a warning is logged, so the rest of the item is still emitted.

        Yields:
            dict: the completed book item.
        """
        item = response.meta["item"]
        try:
            item["book_price"] = json.loads(response.body.decode())[0]["op"]
        except (ValueError, LookupError, TypeError):
            # ValueError covers both undecodable bytes and invalid JSON;
            # LookupError/TypeError cover an unexpected payload structure.
            self.logger.warning("Unexpected price payload from %s", response.url)
            item["book_price"] = None
        yield item

settings.py

# --- scrapy-redis integration ---
# Use the scrapy-redis duplicate filter for request de-duplication.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy-redis scheduler.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# Keep the scrapy-redis queues in Redis instead of clearing them, so a crawl
# can be paused and later resumed.
SCHEDULER_PERSIST = True

# --- Redis connection ---
# Alternative single-URL form (disabled):
# REDIS_URL = "redis://192.168.170.141:6379"
REDIS_HOST = "192.168.170.141"  # host
REDIS_PORT = 6379  # port
REDIS_PARAMS = dict(password="xxxxx")  # extra Redis connection parameters
REDIS_ENCODING = 'utf-8'

# --- Crawl behaviour ---
# User-Agent header value; set to a desktop Chrome string.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/71.0.3578.98 Safari/537.36"
)
# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False

项目地址:https://github.com/CH-chen/jdbook

转载于:https://www.cnblogs.com/chvv/p/10365015.html

你可能感兴趣的:(京东书籍爬取)