Python 3: scraping JD.com products with the Scrapy crawl template and writing them to MySQL

The crawl template automatically extracts and analyzes every link on the pages it visits and scrapes those that match its rules; see the official Scrapy documentation for details.
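The project and the crawl-template spider shown below can be scaffolded with Scrapy's standard commands (the project name jingdong matches the imports used later):

scrapy startproject jingdong
cd jingdong
scrapy genspider -t crawl jd jd.com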
The price and the positive-rating percentage are loaded asynchronously, so their real request URLs have to be discovered with the browser's network-capture (packet-analysis) tools. This article assumes basic knowledge of SQL statements, regular expressions, XPath expressions, networking fundamentals, and Python.
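jd.py imports JingdongItem from jingdong.items. The original post does not show items.py, but a minimal sketch matching the five fields the spider fills in would be:

# items.py -- minimal sketch; field names match what jd.py assigns
import scrapy


class JingdongItem(scrapy.Item):
    title = scrapy.Field()     # product title
    link = scrapy.Field()      # canonical product URL
    price = scrapy.Field()     # price fetched from the price API
    goodRate = scrapy.Field()  # positive-rating percentage
    store = scrapy.Field()     # store name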

jd.py

# -*- coding: utf-8 -*-
import scrapy
import urllib.request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from jingdong.items import JingdongItem


class JdSpider(CrawlSpider):
    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

    rules = (
        Rule(LinkExtractor(allow=''), callback='parse_item', follow=True),  # follow every link, with no restriction
    )

    def parse_item(self, response):
        i = JingdongItem()
        thisurl = response.url  # current URL; used to decide whether this is a product page
        pat = r'item\.jd\.com/(.*?)\.html'
        match = re.search(pat, thisurl)
        if match:  # this is a product page
            product_id = match.group(1)
            # real price API, found by capturing the page's AJAX requests in the browser
            price_link = 'http://p.3.cn/prices/mgets?callback=jQuery6325563&type=1&area=1_72_2799_0&pdtk=&pduid=1509845912914927705768&pdpin=&pin=null&pdbp=0&skuIds=J_' + str(product_id) + '&ext=11000000&source=item-pc'
            pat_price = 'p":"(.*?)"'  # regex that extracts the price from the JSONP response
            price_str = urllib.request.urlopen(price_link).read().decode('utf-8', 'ignore')  # fetch the price JSONP
            i['price'] = re.compile(pat_price).findall(price_str)[0]  # extract the price
            # real comment-summary API, also found by packet capture
            goodRate_link = 'http://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv244&productId=' + str(product_id) + '&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
            pat_goodrate = '{"goodRateShow":(.*?),'  # regex that extracts the positive-rating percentage
            goodRate_str = urllib.request.urlopen(goodRate_link).read().decode('utf-8', 'ignore')  # fetch the rating JSONP
            i['goodRate'] = re.compile(pat_goodrate).findall(goodRate_str)[0]  # positive-rating percentage
            # extract_first() returns a plain string rather than a list,
            # which is what the MySQL pipeline below needs
            i['title'] = response.xpath('//title/text()').extract_first()  # product title
            i['store'] = response.xpath('//div[@class="name"]/a/text()').extract_first()  # store name
            i['link'] = response.xpath('//link[@rel="canonical"]/@href').extract_first()  # product link
            print(i['price'], i['goodRate'], i['title'], i['store'], i['link'])
            return i
        # not a product page: return nothing so no empty item reaches the pipeline



#price API URL
#http://p.3.cn/prices/mgets?callback=jQuery6325563&type=1&area=1_72_2799_0&pdtk=&pduid=1509845912914927705768&pdpin=&pin=null&pdbp=0&skuIds=J_5560552&ext=11000000&source=item-pc
#positive-rating API URL
#http://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv244&productId=5560552&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1
#description xpath
#/html/body/div[7]/div/div[2]/div[1]/text()
#store xpath
#//div[@class="name"]/a/text()
#product link xpath
#//link[@rel="canonical"]/@href
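Why crude string regexes instead of a JSON parser? Both endpoints return JSONP (JSON wrapped in a JavaScript callback), which json.loads cannot parse directly. A quick illustration against a made-up payload shaped like the price response (the sample string is hypothetical, not captured output):

import re

# hypothetical JSONP payload, shaped like what p.3.cn returns for skuIds=J_5560552
sample = 'jQuery6325563([{"id":"J_5560552","p":"2499.00","m":"2999.00"}])'
pat_price = 'p":"(.*?)"'  # same pattern as in jd.py
print(re.findall(pat_price, sample)[0])  # -> 2499.00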

Try running it
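Since the rule places no restriction on which links to follow, the crawl will wander across all of jd.com, so expect a long run. Start it from the project root with:

scrapy crawl jd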

pipelines.py

# -*- coding: utf-8 -*-
import mysql.connector
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class JingdongPipeline(object):
    def process_item(self, item, spider):
        db = mysql.connector.connect(host='localhost',
                                     user='root',
                                     password='123456',
                                     database='python')  # connect to the database
        cur = db.cursor()  # get a database cursor
        title = item['title']
        link = item['link']
        price = item['price']
        goodRate = item['goodRate']
        store = item['store']
        print(title, link, price, goodRate, store)
        # parameterized query: safe against SQL injection and against quotes in titles
        cur.execute("INSERT INTO jingdong VALUES (%s, %s, %s, %s, %s)",
                    (title, link, price, goodRate, store))
        db.commit()  # without the commit the inserted rows never become visible
        cur.close()  # close the cursor
        db.close()  # close the connection
        return item  # hand the item on to any later pipelines
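The INSERT statement assumes a five-column jingdong table already exists in the python database; a schema along these lines would match (column names and sizes are my assumption):

CREATE TABLE jingdong (
    title    VARCHAR(255),
    link     VARCHAR(255),
    price    VARCHAR(32),
    goodRate VARCHAR(16),
    store    VARCHAR(255)
);

Finally, as the generated comment at the top of pipelines.py says, the pipeline must be registered in settings.py or Scrapy will never call it:

# settings.py
ITEM_PIPELINES = {
    'jingdong.pipelines.JingdongPipeline': 300,
}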
