爬虫项目:京东商品数据爬取

spider代码:

# -*- coding: utf-8 -*-
import re
import urllib
import urllib.request

import scrapy
from scrapy.http import Request

from jingdong.items import JingdongItem

class JdSpider(scrapy.Spider):
    """Scrape title, link, price and comment count for JD.com search results.

    Flow: ``parse`` fans out over the search-result pages of a hard-coded
    keyword, ``next`` follows every product listed on a result page, and
    ``next2`` builds one ``JingdongItem`` per product page.
    """

    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

    # Seconds to wait for the auxiliary price/comment endpoints before giving up.
    aux_timeout = 10

    def parse(self, response):
        """Yield one request per search-result page.

        The ``page`` query values sent are the odd numbers 1, 3, ..., 199.
        """
        key = "笔记本"
        search_url = "https://search.jd.com/Search?keyword=" + key + "&enc=utf-8&wq=" + key
        for i in range(1, 101):
            page_url = search_url + "&page=" + str(i * 2 - 1)
            yield Request(url=page_url, callback=self.next)

    def next(self, response):
        """Yield a product-page request for every ``data-sku`` on a result page."""
        sku_ids = response.xpath('//ul[@class="gl-warp clearfix"]/li/@data-sku').extract()
        for sku_id in sku_ids:
            if not sku_id:
                # An empty attribute would produce the useless URL ".../.html".
                continue
            product_url = "https://item.jd.com/" + str(sku_id) + ".html"
            yield Request(url=product_url, callback=self.next2)

    def next2(self, response):
        """Build a ``JingdongItem`` from a product page.

        Returns:
            The populated item, or ``None`` when no SKU id can be extracted
            from the response URL. ``price`` and ``comment`` are set to
            ``None`` when their endpoint cannot be reached or its payload
            does not contain the expected field.
        """
        item = JingdongItem()

        titles = response.xpath('//head/title/text()').extract()
        # Fall back to an empty title instead of raising IndexError when the
        # page has no <title> text.
        raw_title = titles[0] if titles else ''
        item['title'] = (raw_title
                         .replace('【图片 价格 品牌 报价】-京东', '')
                         .replace('【行情 报价 价格 评测】-京东', ''))
        item['link'] = response.url

        sku_match = re.search(r'https://item\.jd\.com/(.*?)\.html', item['link'])
        if sku_match is None:
            self.logger.warning("Cannot extract a SKU id from %s; skipping", item['link'])
            return None
        sku_id = sku_match.group(1)

        # NOTE(review): the two lookups below are synchronous urllib calls made
        # from inside a Scrapy callback, so the callback waits on each of them.
        # Consider issuing them as chained scrapy Requests instead.

        # The price is served by a separate endpoint, not by the product page.
        price_url = "https://p.3.cn/prices/mgets?skuIds=J_" + sku_id
        item['price'] = self._first_group(r'"p":"(.*?)"', self._fetch_text(price_url))

        # The comment count is served by a separate endpoint as well.
        comment_url = ("https://club.jd.com/comment/productCommentSummaries.action?referenceIds="
                       + sku_id)
        item['comment'] = self._first_group(r'"CommentCount":(.*?),"',
                                            self._fetch_text(comment_url))
        return item

    def _fetch_text(self, url):
        """Return the body of ``url`` decoded as UTF-8, or ``None`` on a network error."""
        try:
            # The context manager closes the HTTP response; the timeout keeps
            # a stalled endpoint from hanging the callback indefinitely.
            with urllib.request.urlopen(url, timeout=self.aux_timeout) as resp:
                return resp.read().decode('utf-8', 'ignore')
        except OSError as err:  # URLError, HTTPError and socket timeouts
            self.logger.warning("Request to %s failed: %s", url, err)
            return None

    @staticmethod
    def _first_group(pattern, text):
        """Return group 1 of the first ``pattern`` match in ``text``, else ``None``."""
        if text is None:
            return None
        match = re.search(pattern, text)
        return match.group(1) if match else None

pipeline代码:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql.cursors

class JingdongPipeline(object):
    """Item pipeline that stores scraped products in the MySQL table ``jd.computer``."""

    def __init__(self):
        """Connect to the local MySQL server and make sure the table exists."""
        self.conn = pymysql.connect(host="127.0.0.1",
                                    user="root",
                                    passwd="",
                                    db="jd",
                                    charset='utf8')
        with self.conn.cursor() as cur:
            cur.execute("USE jd")
            # IF NOT EXISTS: a plain CREATE TABLE raises on every run after
            # the first one because the table is already there.
            cur.execute(
                "CREATE TABLE IF NOT EXISTS computer(title VARCHAR(100),link VARCHAR(50),price VARCHAR(50),comment VARCHAR(50))")
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert ``item`` into the ``computer`` table and pass it on.

        Storage is best-effort: a failed insert is logged through
        ``spider.logger`` and rolled back, and the item is still returned so
        later pipeline stages keep running.

        Args:
            item: Mapping with the keys ``title``, ``link``, ``price`` and
                ``comment``.
            spider: The spider that produced the item; only its ``logger``
                is used.

        Returns:
            The same ``item`` object.
        """
        try:
            with self.conn.cursor() as cur:
                cur.execute(
                    "INSERT INTO computer(title,link,price,comment) VALUES (%s,%s,%s,%s)",
                    (item['title'], item['link'], item['price'], item['comment']))
            self.conn.commit()
        except Exception:
            # Pipeline boundary: report the failure instead of hiding it, but
            # do not let one bad row abort the crawl.
            spider.logger.exception("Failed to store item %r", item)
            try:
                self.conn.rollback()
            except Exception:
                spider.logger.exception("Rollback failed after insert error")
        return item

    def close_spider(self, spider):
        """Close the MySQL connection when the spider finishes."""
        self.conn.close()



使用 Navicat 作为 MySQL 的图形化交互工具

最后结果:

爬虫项目:京东商品数据爬取_第1张图片

遇到的一些难题:

1、mysql的安装,参考我的另一篇博文:

当python遇到mysql时,如何顺利安装mysql

2、抓包:我所取的数据里面,有两个字段需要通过抓包获取,一个是price,另一个是comment。抓包时注意请求的地址,其中一般会包含对应的关键字,例如价格请求的链接里就含有price

3、通过 Navicat 插入数据时,中文会显示为‘???’乱码,解决方法我参考的是:点击打开链接

4、经过多次调试之后,发现访问数据量太多了,京东开始问我要验证码了,验证码解码方面还在学习当中,掌握了之后再回头做修改


你可能感兴趣的:(项目)