Scraping Amazon product reviews with Scrapy

This post implements scraping of Amazon product reviews using the Scrapy crawler framework.

1. Create a Scrapy project (the project name must match the mySpiderTest package imported later):
scrapy startproject mySpiderTest
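
This generates the standard Scrapy project skeleton (layout of a recent Scrapy release; minor differences are possible between versions):

mySpiderTest/
    scrapy.cfg
    mySpiderTest/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py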

2. Define the item classes in items.py:

import scrapy
from scrapy.item import Field, Item

# Review information
class AmazonReviewItem(Item):

    user_id = Field()
    user_name = Field()
    data_asin = Field()
    name = Field()  # product name
    review_title = Field()
    review_star_rating = Field()    # star rating
    review_date = Field()   # review date
    review_info = Field()


# Product information
class AmazonGoodsItem(scrapy.Item):

    # collection = 'amazon'  # target collection/table name (unused here)

    s_href = scrapy.Field()  # subcategory URL
    data_asin = scrapy.Field()  # product ASIN
    name = scrapy.Field()  # product name
    goods_url = scrapy.Field()  # product URL
    brand = scrapy.Field()  # brand
    price = scrapy.Field()  # price
    freight = scrapy.Field()  # shipping fee

3. Create the spider amazon_review.py in the spiders directory:

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse as url_parse
from mySpiderTest.items import AmazonGoodsItem, AmazonReviewItem
import re
from copy import deepcopy

# Scrape Amazon review information,
# starting from the result list of a keyword search, e.g. k=phone
class AmazonReviewSpider(scrapy.Spider):
    name = 'amazon_review'
    allowed_domains = ['www.amazon.com']
    # start_urls = ['https://www.amazon.com/s?k=phone&ref=nb_sb_noss']

    def __init__(self, category=None, *args, **kwargs):
        super(AmazonReviewSpider, self).__init__(*args, **kwargs)
        self.start_urls = []
        if category is not None:
            keys = category.split(",")
            for key in keys:
                self.start_urls.append('https://www.amazon.com/s?k=' + key + '&ref=nb_sb_noss')
        else:
            # default search keyword: phone
            self.start_urls = ['https://www.amazon.com/s?k=phone&ref=nb_sb_noss']

        self.log("category = %s" % category)

    def parse(self, response):

        item = AmazonGoodsItem()

        div_list = response.xpath("//*[@id=\"search\"]//div[@class=\"s-result-list s-search-results sg-row\"]/div")
        self.log("div_list_len=%s" % str(len(div_list)))
        for each_div in div_list:
            # data_asin = each_div.xpath("@data-asin").extract_first()
            # item['data_asin'] = data_asin
            goods_url = each_div.xpath(".//h2/a/@href").extract_first()
            if goods_url is None:  # skip result blocks without a product link
                continue
            item['goods_url'] = url_parse.unquote(goods_url)
            item['name'] = self.get_goods_name(item['goods_url'])
            item['data_asin'] = self.get_data_asin(item['goods_url'])
            # self.log("************* item[name]: %s" % item)
            # first page of the product's review details
            review_url = 'https://www.amazon.com/' + item['name'] \
                         + '/product-reviews/' + item['data_asin'] \
                         + '/ref=cm_cr_getr_d_paging_btm_next_1?ie=UTF8' \
                         + '&reviewerType=all_reviews&pageNumber=1'
            yield scrapy.Request(
                review_url,
                callback=self.parse_review_detail,
                meta={"item": deepcopy(item)}
            )

        # next page of search results; xpath=//*[@id="search"]/div[1]/div[2]/div/span[7]/div/div/div/ul/li[7]/a
        next_url = response.xpath(
            "//*[@id='search']/div[1]/div[2]/div/span[7]/div/div/div/ul/li[7]/a/@href").extract_first()
        if next_url is not None:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )

    def parse_review_detail(self, response):

        goods_item = response.meta["item"]

        # //*[@id="customer_review-R35WB3S3WWC9DN"]/div[4]/span
        for each in response.xpath("//*[starts-with(@id,\"customer_review-\")]"):
            item = AmazonReviewItem()

            item['data_asin'] = goods_item['data_asin']
            item['name'] = goods_item['name']

            item["user_id"] = each.xpath("@id").extract_first().split("-")[1]
            # the XPaths below must be relative (".//") so each review's fields
            # come from its own node, not the first match on the whole page
            item["user_name"] = each.xpath(".//span[@class='a-profile-name']")\
                .xpath('string(.)').extract_first(default='')
            item['review_title'] = each.xpath(".//a[@data-hook='review-title']")\
                .xpath('string(.)').extract_first(default='')
            item['review_star_rating'] = each.xpath(".//i[@data-hook='review-star-rating']")\
                .xpath('string(.)').extract_first(default='')
            item['review_date'] = each.xpath(".//span[@data-hook='review-date']")\
                .xpath('string(.)').extract_first(default='')
            item['review_info'] = each.xpath(".//span[@data-hook='review-body']") \
                .xpath('string(.)').extract_first(default='')

            yield item

        # check whether there is a next page of reviews
        next_page = response.xpath("//*[@id=\"cm_cr-pagination_bar\"]/ul/li[2]/a/@href").extract_first()
        self.log("-------next_page = %s" % next_page)
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(
                next_page,
                callback=self.parse_review_detail,
                meta={"item": deepcopy(goods_item)}
            )

    @staticmethod
    def get_goods_name(url):
        name = ''
        if url is None:
            return name

        regex1 = re.compile(r"url=\/.*?\/")

        is_contain_url = re.search(regex1, url)
        if is_contain_url:
            '''
            url = /gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_2?ie=UTF8&adId=A0805009AF1ES0KA13RD&url=/VTech-CS6529-4-Answering-Cordless-Handsets/dp/B00WHYS0R2/ref=sr_1_2_sspa?keywords=phone&qid=1561524478&s=gateway&sr=8-2-spons&psc=1&qualifier=1561524478&id=96741371645175&widgetName=sp_atf
            '''
            name = re.findall(regex1, url)[0].split("/")[1]
        else:
            '''
            url =  /Panasonic-KX-TGD532W-Expandable-Cordless-Answering/dp/B071GQB94T/ref=sr_1_3?keywords=phone&qid=1561524478&s=gateway&sr=8-3
            '''
            name = url.split("/")[1]

        return name

    @staticmethod
    def get_data_asin(url):
        asin = ''
        if url is None:
            return asin

        regex1 = re.compile(r"dp\/.*?\/")

        is_contain_dp = re.search(regex1, url)
        '''
        url = /gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_2?ie=UTF8&adId=A0805009AF1ES0KA13RD&url=/VTech-CS6529-4-Answering-Cordless-Handsets/dp/B00WHYS0R2/ref=sr_1_2_sspa?keywords=phone&qid=1561524478&s=gateway&sr=8-2-spons&psc=1&qualifier=1561524478&id=96741371645175&widgetName=sp_atf
        url =  /Panasonic-KX-TGD532W-Expandable-Cordless-Answering/dp/B071GQB94T/ref=sr_1_3?keywords=phone&qid=1561524478&s=gateway&sr=8-3
        '''
        if is_contain_dp:
            asin = re.findall(regex1, url)[0].split("/")[1]
        else:
            asin = url.split("/")[1]

        return asin
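
A quick sanity check of the two helpers against the sample URL from the docstrings above (illustrative; run in a Python shell with the class imported):

>>> url = '/Panasonic-KX-TGD532W-Expandable-Cordless-Answering/dp/B071GQB94T/ref=sr_1_3?keywords=phone&qid=1561524478&s=gateway&sr=8-3'
>>> AmazonReviewSpider.get_goods_name(url)
'Panasonic-KX-TGD532W-Expandable-Cordless-Answering'
>>> AmazonReviewSpider.get_data_asin(url)
'B071GQB94T'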

4. Define pipelines.py:

import codecs
import json

class MyspidertestPipeline(object):

    def __init__(self):
        # append each review as one JSON line to a local file
        self.review_file = codecs.open('amazon_reviews.json', 'a', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.review_file.write(lines)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider (not spider_closed) on pipelines when the spider finishes
        self.review_file.close()

5. Edit settings.py to enable the pipeline:

ITEM_PIPELINES = {
    'mySpiderTest.pipelines.MyspidertestPipeline': 300,
}
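
In practice, Amazon usually rejects requests sent with Scrapy's default user agent, and its robots.txt disallows crawling, so settings along these lines are typically also needed (the exact values here are illustrative assumptions, not part of the original setup):

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 2  # throttle requests to reduce the chance of being blocked
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'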

6. Run:

# category passes the search keyword(s); separate multiple keywords with commas: category=huawei,oppo,vivo
scrapy crawl amazon_review -a category=phone
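
Each scraped review is appended to amazon_reviews.json as one JSON object per line. With the fields defined above, a line has roughly this shape (the values here are hypothetical placeholders, not real output):

{"data_asin": "B071GQB94T", "name": "Panasonic-KX-TGD532W-Expandable-Cordless-Answering", "user_id": "R35WB3S3WWC9DN", "user_name": "...", "review_title": "...", "review_star_rating": "5.0 out of 5 stars", "review_date": "...", "review_info": "..."}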
