# Suning category-list and product-detail page spider (苏宁列表页及详情页)
import scrapy
import json
import re
from copy import deepcopy
class JdSpider(scrapy.Spider):
    """Crawl a Suning book-category list page and follow each product link.

    Despite the ``jd`` name, the active seed URL targets suning.com; the
    extra entries in ``allowed_domains`` cover the commented-out
    experiments (jd.com price API, dangdang) kept in ``start_requests``.
    """

    name = 'jd'
    allowed_domains = ['jd.com', '3.cn', 'suning.com', "dangdang.com"]

    # Extracts (prdid, shopid) from the anchor's sa-data attribute.
    # FIX: compiled once (was searched twice per page) and written as a raw
    # string — the original non-raw "\d" is a deprecated escape sequence.
    _SA_DATA_RE = re.compile(r"prdid':'(\d+)','shopid':'(\d+)'")

    def start_requests(self):
        """Yield the single seed request for the Suning category list page."""
        # url = "https://p.3.cn/prices/mgets?skuIds=J_11495238747,"
        # url = "https://pas.suning.com/nspcsale_0_000000000646340846_000000000646340846_0070167435_60_311_3110101_502282_1000095_9095_10638_Z001___R9011242_0.2___.html"
        # url = "http://product.dangdang.com/26920352.html"
        url = "https://list.suning.com/1-502322-0.html"
        yield scrapy.Request(
            url=url,
            callback=self.parse,
        )

    def parse(self, response):
        """Extract sa-data / title / href for each product anchor and follow it.

        Yields one detail-page Request per anchor that carries both an href
        and an sa-data attribute; anchors missing either are skipped instead
        of crashing with a TypeError as the original code did.
        """
        for book in response.css('p.sell-point a'):
            # FIX: fresh dict per product (the original reused one dict and
            # relied solely on deepcopy to avoid cross-iteration leakage).
            item = {}
            item['sa-data'] = book.css('::attr("sa-data")').extract_first()
            # The visible title is split across the title attribute, the
            # anchor's own text node, and an <em> child (search highlight).
            # FIX: `or ""` guards — extract_first() returns None on a miss,
            # which made the original `+=` raise TypeError.
            # FIX: './/em' is relative to this <a>; the original '//em'
            # matched the first <em> of the whole document for every row.
            item['title'] = (
                (book.css('::attr("title")').extract_first() or "")
                + (book.css('::text').extract_first() or "")
                + book.xpath('.//em/text()').extract_first("")
            )
            item['href'] = book.xpath('./@href').extract_first()
            if not item['href'] or not item['sa-data']:
                continue  # nothing to follow / parse downstream
            yield scrapy.Request(
                url="https:" + item['href'],
                callback=self.parse_detail,
                meta={'item': deepcopy(item)},
            )

    def parse_detail(self, response):
        """Build the Suning price-API URL from the product's sa-data blob.

        Parses prdid/shopid out of the sa-data captured on the list page and
        formats them into the pas.suning.com price endpoint URL. Currently
        only prints the URL (no item/request is yielded yet).
        """
        item = response.meta['item']
        # FIX: run the regex once and guard against a non-match (the
        # original called re.search twice and crashed on .group() when the
        # sa-data format did not match).
        match = self._SA_DATA_RE.search(item['sa-data'])
        if match is None:
            return
        prod_id = match.group(1)
        shop_id = match.group(2)
        # The price endpoint expects the product id left-padded with zeros
        # to 18 characters (cf. the sample URL in start_requests).
        padded_id = ("0" * 18 + prod_id)[-18:]
        url = ("https://pas.suning.com/nspcsale_0_{}_{}_{}_60_311_3110101"
               "_502282_1000095_9095_10638_Z001.html").format(
                   padded_id, padded_id, shop_id)
        print(url)