JD spider

The main thing to watch out for is that the review data is loaded via AJAX; the trick is reading the JSON data those requests return.
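To see what the endpoint returns before writing the spider, you can probe the comment API by hand. A minimal sketch (the productId value below is a placeholder; JD may throttle or change this endpoint at any time):

import requests

url = ("https://sclub.jd.com/comment/productPageComments.action"
       "?productId=100000000000&score=0&sortType=5&page=0&pageSize=10")
headers = {
    "referer": "https://item.jd.com/100000000000.html",
    "accept-language": "zh-CN,zh",
}
resp = requests.get(url, headers=headers)
data = resp.json()
print(data["maxPage"])         # total number of comment pages
print(len(data["comments"]))   # comments on this page (up to pageSize)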

import scrapy
import json
import re
from jd.items import JdItem


class JinDongSpider(scrapy.Spider):
    name = 'jin_dong'
    allowed_domains = ['jd.com']
    start_urls = ['https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA']    # the keyword is "手机" (mobile phone), URL-encoded

    def parse(self, response):
        one_contents = response.xpath("//div[@id='J_goodsList']/ul/li")
        for one_content in one_contents:
            data_sku = one_content.xpath("./@data-sku").get()
            data_url = one_content.xpath(".//div[@class='p-img']/a/@href").get()
            # some hrefs already include the scheme, others are protocol-relative,
            # so add "https:" only when it is missing
            if not data_url.startswith("http"):
                data_url = "https:" + data_url
            data_content = one_content.xpath(".//div[@class='p-img']/a/@title").get()
            # headers mirroring the browser's HTTP/2 pseudo-headers
            # (sent here as ordinary headers)
            header = {
                "authority": "item.jd.com",
                "method": "GET",
                "path": "/{}.html".format(data_sku),
            }

            yield scrapy.Request(data_url, callback=self.parse_content, meta={"sku": data_sku}, headers=header)

    # follow the link into the product detail page
    def parse_content(self, response):
        sku = response.meta.get("sku")
        url = "https://sclub.jd.com/comment/productPageComments.action?productId={}&score=0&sortType=5&page=0&pageSize=10".format(sku)
        header = {
            "authority": "sclub.jd.com",
            "method": "GET",
            "accept-language": "zh-CN,zh",
            "referer": "https://item.jd.com/{}.html".format(sku),
        }

        yield scrapy.Request(url, callback=self.parse_contents, meta={"sku": sku}, headers=header)

    # read the total number of comment pages from the first page's JSON,
    # then request every page and hand each response off to parse_cl
    def parse_contents(self, response):
        sku = response.meta.get("sku")
        jso = json.loads(response.body.decode(response.encoding))
        d = int(jso['maxPage'])    # total number of comment pages
        header = {
            "authority": "sclub.jd.com",
            "method": "GET",
            "accept-language": "zh-CN,zh",
            "referer": "https://item.jd.com/{}.html".format(sku),
        }
        if d >= 1:
            # maxPage is the total page count and the page parameter is
            # 0-indexed, so the valid pages are 0..d-1. Page 0 was already
            # requested in parse_content, so dont_filter=True is needed here,
            # otherwise Scrapy's duplicate filter would drop that request.
            for i in range(d):
                url = "https://sclub.jd.com/comment/productPageComments.action?productId={}&score=0&sortType=5&page={}&pageSize=10".format(sku, i)

                yield scrapy.Request(url, callback=self.parse_cl, meta={"sku": sku}, headers=header, dont_filter=True)

    # parse the JSON comment data on each page
    def parse_cl(self, response):
        sku = response.meta.get('sku')
        try:
            js = json.loads(response.body.decode(response.encoding))
        except ValueError:
            # not valid JSON (e.g. an anti-crawler page); skip this response
            return
        else:
            cc = js.get('comments')
            if cc:
                for c in cc:
                    user_id = c.get('id')    # reviewer id
                    # dict.get() does not raise on a missing key: it returns
                    # None, or a default you supply, e.g. c.get('content', 'hehe')
                    content = c.get('content')    # review text
                    pro_name = c.get('referenceName')    # product name
                    img_list = []
                    for k in c.get('images') or []:
                        ii = k.get('imgUrl')
                        # swap the review thumbnail URL for the full-size image URL,
                        # e.g. .../n0/s128x96_jfs/... -> .../shaidan/s616x415_jfs/...
                        img_url1 = ii.replace("n0", "shaidan")
                        img_url = re.sub(r'(\d+x\d+)', '616x415', img_url1)
                        img_list.append(img_url)
                    video_list = []
                    for q in c.get('video') or []:
                        video_list.append(q.get('remark'))
                    color = c.get('productColor')    # extracted but not stored in the item
                    size = c.get('productSize')      # extracted but not stored in the item
                    origin_phone = c.get('userClientShow')

                    item = JdItem(
                        sku=sku,    # product sku
                        user_id=user_id,
                        content=content,    # review text
                        pro_name=pro_name,
                        img_list=img_list,    # review image URLs
                        video_list=video_list,    # review video URLs
                        origin_phone=origin_phone    # client the review came from
                    )
                    yield item





Saving the data to MongoDB: what goes into settings.py and pipelines.py

# ITEM_PIPELINES already exists (commented out) in settings.py; just uncomment it
ITEM_PIPELINES = {
    'jd.pipelines.JdPipeline': 300,     # any new class created in pipelines.py must be registered here
}
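If you add more pipeline classes later, each one gets its own entry with a priority; lower numbers run first. A sketch (SecondPipeline is hypothetical):

ITEM_PIPELINES = {
    'jd.pipelines.JdPipeline': 300,
    'jd.pipelines.SecondPipeline': 400,   # hypothetical extra pipeline; runs after JdPipeline
}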


# MongoDB host
MONGODB_HOST = '127.0.0.1'
# port, 27017 by default
MONGODB_PORT = 27017
# database name
MONGODB_DBNAME = 'JD'
# collection that stores the scraped data
MONGODB_DOCNAME = 'jin_dong'



# contents of pipelines.py

import pymongo


class JdPipeline(object):

    def __init__(self, host, port, dbname, docname):
        # connect to MongoDB and point at the configured collection
        client = pymongo.MongoClient(host=host, port=port)
        self.post = client[dbname][docname]

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection parameters from settings.py
        # (scrapy.conf was removed in modern Scrapy; crawler.settings is the
        # supported way to read project settings from a pipeline)
        s = crawler.settings
        return cls(
            host=s.get('MONGODB_HOST'),
            port=s.getint('MONGODB_PORT'),
            dbname=s.get('MONGODB_DBNAME'),
            docname=s.get('MONGODB_DOCNAME'),
        )

    def process_item(self, item, spider):
        # insert one document per item into the collection
        self.post.insert_one(dict(item))
        return item
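With the settings and pipeline in place, run the spider:

scrapy crawl jin_dong

then sanity-check the collection from a Python shell (database and collection names as configured above; count_documents needs pymongo 3.7+):

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
coll = client['JD']['jin_dong']
print(coll.count_documents({}))   # number of stored reviews
print(coll.find_one())            # one sample document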

 
