# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class JingdongItem(scrapy.Item):
    """Container for one JD.com product listing scraped by the spider.

    Each field holds a list of extracted strings (the raw output of
    ``response.xpath(...).extract()`` / regex ``findall``).
    """
    title = scrapy.Field()     # product page <title> text
    shop = scrapy.Field()      # shop name from the product page
    shoplink = scrapy.Field()  # shop URL from the product page
    price = scrapy.Field()     # price strings from the price JSON endpoint
    comment = scrapy.Field()   # good-rate values from the comment JSON endpoint
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jingdong.items import JingdongItem
from scrapy.http import Request
import urllib.request
import re
class JdSpider(CrawlSpider):
    """Crawl jd.com, following every link, and scrape product detail pages.

    For each page whose URL looks like ``item.jd.com/<id>.html`` the spider
    extracts the title/shop/shop-link from the HTML and fetches price and
    good-rate figures from JD's JSON endpoints.
    """
    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['http://www.jd.com/']

    # Patterns are compiled once at class level instead of per response.
    # Dots are escaped so "item.jd.com" is matched literally (the original
    # unescaped pattern would match e.g. "itemXjd.com"), and the id capture
    # is restricted to digits.
    _id_pat = re.compile(r'item\.jd\.com/(\d+)\.html')
    _price_pat = re.compile(r'"p":"(.*?)"')
    _comment_pat = re.compile(r'"goodRateShow":(.*?),')

    rules = (
        Rule(LinkExtractor(allow=''), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract a JingdongItem from a product page.

        Returns an empty item for non-product pages or pages where any
        field failed to extract; returns None if an unexpected error occurs.
        """
        i = JingdongItem()
        try:
            match = self._id_pat.search(response.url)
            if match is None:
                return i  # not a product detail page
            thisid = match.group(1)  # reuse the search match; no second findall needed

            title = response.xpath('//html/head/title/text()').extract()
            shop = response.xpath('//div[@class="name"]/a/@title').extract()
            shoplink = response.xpath('//div[@class="name"]/a/@href').extract()

            priceurl = ('https://p.3.cn/prices/mgets?callback=jQuery9030294&type=1'
                        '&area=1_72_4137_0&pdtk=&pduid=378203029&pdpin=&pin=null'
                        '&pdbp=0&skuIds=J_' + thisid)
            commenturl = ('https://sclub.jd.com/comment/productPageComments.action'
                          '?callback=fetchJSON_comment98vv191&productId=' + thisid +
                          '&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0'
                          '&rid=0&fold=1')

            # NOTE(review): blocking urllib calls inside a Scrapy callback stall
            # the event loop; consider yielding scrapy Requests with callbacks
            # instead — kept as-is here to preserve behavior.
            pricedata = urllib.request.urlopen(priceurl).read().decode('utf-8', 'ignore')
            commentdata = urllib.request.urlopen(commenturl).read().decode('utf-8', 'ignore')
            price = self._price_pat.findall(pricedata)
            comment = self._comment_pat.findall(commentdata)

            # Populate the item only when every field extracted successfully.
            if title and shop and shoplink and price and comment:
                i['title'] = title
                i['shop'] = shop
                i['shoplink'] = shoplink
                i['price'] = price
                i['comment'] = comment
            return i
        except Exception as e:
            # Log through the spider logger instead of printing to stdout.
            self.logger.error('parse_item failed for %s: %s', response.url, e)
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class JingdongPipeline(object):
    """Persist scraped JD rows into the MySQL table ``jd`` via pymysql."""

    def process_item(self, item, spider):
        """Insert one row per parallel index of the item's list fields.

        Uses a parameterized query — the original concatenated values into
        the SQL string, which breaks on quotes and is an injection risk.
        The connection is always closed, even if an insert fails.
        """
        conn = pymysql.connect(host='localhost', port=3306, user='root',
                               passwd='123456', db='dd')
        try:
            sql = ("insert into jd(title,shop,shoplink,price,comment) "
                   "values(%s,%s,%s,%s,%s)")
            # .get(...) guards against items whose fields were never set
            # (the spider returns empty items for non-product pages).
            rows = zip(item.get('title', []),
                       item.get('shop', []),
                       item.get('shoplink', []),
                       item.get('price', []),
                       item.get('comment', []))
            with conn.cursor() as cur:
                cur.executemany(sql, list(rows))
            conn.commit()
        finally:
            conn.close()
        return item