CrawlSpider(crawl 模板)会自动分析页面中的所有链接,并爬取其中符合规则的链接所对应的数据,详见 Scrapy 官方文档。
其中,价格和好评率需要用浏览器抓包,分析出真实的请求地址。本文用到的基础技术包括:SQL 语句、正则表达式(re)、XPath 表达式、基本的网络知识和 Python 基础。
# -*- coding: utf-8 -*-
import scrapy
import urllib.request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from jingdong.items import JingdongItem
class JdSpider(CrawlSpider):
    """Crawl jd.com and emit one ``JingdongItem`` per product detail page.

    Every link found on the allowed domain is followed.  A page counts as a
    product page when its URL matches ``item.jd.com/<sku>.html``; for those
    pages the title, store name and canonical link are read from the HTML,
    while the price and the good-rate figure are fetched from two separate
    JSONP endpoints.
    """

    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']
    rules = (
        # Follow every link without restriction; parse_item decides which
        # responses are product pages.
        Rule(LinkExtractor(allow=''), callback='parse_item', follow=True),
    )

    # Seconds to wait for the price / good-rate endpoints before giving up.
    _REQUEST_TIMEOUT = 10

    # Dots are escaped so that only the literal host and ".html" match.
    _PRODUCT_URL_RE = re.compile(r'item\.jd\.com/(.*?)\.html')
    # The leading quote keeps keys that merely end in "p" (e.g. "op")
    # from being picked up instead of the "p" field.
    _PRICE_RE = re.compile(r'"p":"(.*?)"')
    _GOOD_RATE_RE = re.compile(r'\{"goodRateShow":(.*?),')

    _PRICE_URL = (
        'http://p.3.cn/prices/mgets?callback=jQuery6325563&type=1'
        '&area=1_72_2799_0&pdtk=&pduid=1509845912914927705768&pdpin='
        '&pin=null&pdbp=0&skuIds=J_{sku}&ext=11000000&source=item-pc'
    )
    _GOOD_RATE_URL = (
        'http://sclub.jd.com/comment/productPageComments.action'
        '?callback=fetchJSON_comment98vv244&productId={sku}'
        '&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
    )

    def _fetch_text(self, url):
        """Return the body of *url* decoded as UTF-8, or ``''`` on failure.

        NOTE(review): ``urlopen`` is a blocking call made from inside a
        Scrapy callback, so the crawl stalls while it runs.  Chaining
        ``scrapy.Request`` objects would avoid that; the timeout here only
        bounds the stall.
        """
        import http.client  # local import keeps this block self-contained

        try:
            with urllib.request.urlopen(
                    url, timeout=self._REQUEST_TIMEOUT) as resp:
                return resp.read().decode('utf-8', 'ignore')
        except (OSError, ValueError, http.client.HTTPException) as exc:
            self.logger.warning('Request to %s failed: %s', url, exc)
            return ''

    @staticmethod
    def _first_group(pattern, text):
        """Return group 1 of the first *pattern* match in *text*, else ``''``."""
        found = pattern.search(text)
        return found.group(1) if found else ''

    def parse_item(self, response):
        """Build a ``JingdongItem`` from a product page response.

        Returns ``None`` for any response whose URL is not a product page,
        so no empty items are emitted.  Text fields are stored as plain
        strings (first XPath match, ``''`` when absent); ``price`` and
        ``goodRate`` are ``''`` when their endpoint could not be read.
        """
        matched = self._PRODUCT_URL_RE.search(response.url)
        if matched is None:
            # Not a product page: nothing to emit.
            return None
        sku = matched.group(1)

        price_body = self._fetch_text(self._PRICE_URL.format(sku=sku))
        rate_body = self._fetch_text(self._GOOD_RATE_URL.format(sku=sku))

        item = JingdongItem()
        item['price'] = self._first_group(self._PRICE_RE, price_body)
        item['goodRate'] = self._first_group(self._GOOD_RATE_RE, rate_body)
        # Product title.
        item['title'] = response.xpath(
            '//title/text()').extract_first(default='')
        # Store name.
        item['store'] = response.xpath(
            '//div[@class="name"]/a/text()').extract_first(default='')
        # Canonical product link.
        item['link'] = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first(default='')

        self.logger.info(
            'price=%s goodRate=%s title=%s store=%s link=%s',
            item['price'], item['goodRate'], item['title'],
            item['store'], item['link'],
        )
        return item
#价格地址
#http://p.3.cn/prices/mgets?callback=jQuery6325563&type=1&area=1_72_2799_0&pdtk=&pduid=1509845912914927705768&pdpin=&pin=null&pdbp=0&skuIds=J_5560552&ext=11000000&source=item-pc
#好评率地址
#http://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv244&productId=5560552&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1
#简介xpath
#/html/body/div[7]/div/div[2]/div[1]/text()
#店铺xpath
#//div[@class="name"]/a/text()
#商品链接
#//link[@rel="canonical"]/@href
尝试运行
# -*- coding: utf-8 -*-
import mysql.connector
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class JingdongPipeline(object):
    """Store each scraped product as one row of the ``jingdong`` table."""

    # NOTE(review): credentials are hard-coded; consider moving them into
    # the project settings.
    _DB_CONFIG = {
        'host': 'localhost',
        'user': 'root',
        'passwd': '123456',
        'db': 'python',
    }
    # Item fields, in the column order used by the INSERT statement.
    _FIELDS = ('title', 'link', 'price', 'goodRate', 'store')
    # Placeholders let the driver escape the values, so quotes inside a
    # title cannot break or alter the statement.
    _INSERT_SQL = "insert into jingdong VALUES (%s, %s, %s, %s, %s)"

    @staticmethod
    def _as_text(value):
        """Flatten an item value to a single string.

        Selector ``extract()`` results are lists; their parts are joined.
        ``None`` becomes the empty string.
        """
        if value is None:
            return ''
        if isinstance(value, (list, tuple)):
            return ''.join(str(part) for part in value)
        return str(value)

    def process_item(self, item, spider):
        """Insert *item* into the database and return it.

        Items that lack any of the expected fields are passed through
        without touching the database.  The connection and cursor are
        always closed, even when the insert fails.
        """
        missing = [field for field in self._FIELDS if field not in item]
        if missing:
            if spider is not None:
                spider.logger.debug(
                    'Skipping item without %s', ', '.join(missing))
            return item

        row = tuple(self._as_text(item[field]) for field in self._FIELDS)
        print(*row)

        db = mysql.connector.connect(**self._DB_CONFIG)  # open connection
        try:
            cur = db.cursor()
            try:
                cur.execute(self._INSERT_SQL, row)
                # Without a commit the row is not visible in the database.
                db.commit()
            finally:
                cur.close()
        finally:
            db.close()
        # Hand the item on to any later pipeline stage.
        return item