2019-05-25

苏宁列表页及详情页 (Suning product list page and detail page spider)

import scrapy

import json

import re

from copy import deepcopy

class JdSpider(scrapy.Spider):
    """Crawl a Suning book-list page, follow each product link, and build
    the Suning price-API URL from the product/shop ids embedded in each
    link's ``sa-data`` attribute.

    NOTE(review): the spider is named 'jd' but the active start URL targets
    suning.com — consider renaming for clarity.
    """

    name = 'jd'

    allowed_domains = ['jd.com', '3.cn', 'suning.com', "dangdang.com"]

    # Matches the ids inside the sa-data attribute, e.g.
    # "...'prdid':'646340846','shopid':'0070167435'...".
    # Raw string avoids the invalid "\d" escape warning; compiled once
    # instead of being re-built on every detail page.
    SA_DATA_RE = re.compile(r"prdid':'(\d+)','shopid':'(\d+)'")

    def start_requests(self):
        """Seed the crawl with the Suning category list page."""
        url = "https://list.suning.com/1-502322-0.html"
        yield scrapy.Request(
            url=url,
            callback=self.parse,
        )

    def parse(self, response):
        """Extract title/link/sa-data for every book on the list page and
        schedule a request for its detail page."""
        book_list = response.css('p.sell-point a')
        for book in book_list:
            # Fresh dict per link so queued requests never share state
            # (the original reused one dict and relied on deepcopy).
            item = {}
            item['sa-data'] = book.css('::attr("sa-data")').extract_first()
            # The title is split across the title attribute, the plain text
            # and an <em> (search-highlight) node. Default every piece to ""
            # so a missing fragment cannot raise TypeError on concatenation.
            title1 = book.css('::attr("title")').extract_first("")
            title1 += book.css('::text').extract_first("")
            # ".//em" restricts the search to the current <a>; the original
            # "//em" was absolute and matched the first <em> in the whole
            # document for every book.
            title1 += book.xpath('.//em/text()').extract_first("")
            item['title'] = title1
            item['href'] = book.xpath('./@href').extract_first()
            if not item['href']:
                # No link to follow — skip instead of crashing on "https:"+None.
                continue
            yield scrapy.Request(
                url="https:" + item['href'],
                callback=self.parse_detail,
                meta={'item': item},
            )

    def parse_detail(self, response):
        """Assemble the Suning price-API URL for the product on this page.

        The ids come from the ``sa-data`` attribute captured on the list
        page; the product id is zero-padded to 18 digits as the price API
        path requires.
        """
        item = response.meta['item']
        data = item['sa-data'] or ""
        match = self.SA_DATA_RE.search(data)
        if match is None:
            # Guard: the original called .group() on a possible None and
            # would have raised AttributeError here.
            self.logger.warning("sa-data did not match on %s", response.url)
            return
        url = "https://pas.suning.com/nspcsale_0_{}_{}_{}_60_311_3110101_502282_1000095_9095_10638_Z001.html"
        # Left-pad the product id with zeros and keep the last 18 digits.
        id1 = ("000000000000000" + match.group(1))[-18:]
        id2 = match.group(2)
        url = url.format(id1, id1, id2)
        print(url)

你可能感兴趣的:(2019-05-25)