这一篇使用scrapy爬虫框架实现亚马逊商品评论的抓取。
1、创建一个爬虫项目:
scrapy startproject mySpiderTest
2、item.py中定义数据item:
import scrapy
from scrapy.item import Field, Item
class ItcastItem(Item):
    """Demo item from the scrapy tutorial; holds the scraped fields."""
    # Every attribute is a plain scrapy Field placeholder.
    name = scrapy.Field()
    title = scrapy.Field()
    info = scrapy.Field()
# Review information
class AmazonReviewItem(Item):
    """One customer review scraped from a product's review page."""
    user_id = scrapy.Field()             # reviewer id (taken from the review element id)
    user_name = scrapy.Field()           # reviewer display name
    data_asin = scrapy.Field()           # product ASIN
    name = scrapy.Field()                # product name
    review_title = scrapy.Field()        # review headline
    review_star_rating = scrapy.Field()  # star rating
    review_date = scrapy.Field()         # review date
    review_info = scrapy.Field()         # review body text
# Product information
class AmazonGoodsItem(Item):
    """One product found on an Amazon search-results page."""
    # collection = 'amazon'  # target table name (currently unused)
    s_href = Field()     # sub-category url
    data_asin = Field()  # product ASIN
    name = Field()       # product name
    goods_url = Field()  # product url
    brand = Field()      # product brand
    price = Field()      # product price
    freight = Field()    # shipping cost
3、在spiders目录下创建爬虫amazon_review.py:
# -*- coding: utf-8 -*-
import scrapy
from urllib import parse as url_parse
from mySpiderTest.items import AmazonGoodsItem, AmazonReviewItem
import re
from copy import deepcopy
# 爬取亚马逊评论信息
# 通过搜索关键字查询出来的列表,如k=phone
class AmazonReviewSpider(scrapy.Spider):
    """Crawl Amazon search results for one or more keywords and scrape
    every listed product's customer reviews, following pagination on
    both the search-result level and the review level.

    Run with:  scrapy crawl amazon_review -a category=phone
    Multiple keywords are comma-separated: -a category=huawei,oppo,vivo
    """
    name = 'amazon_review'
    allowed_domains = ['www.amazon.com']
    # start_urls = ['https://www.amazon.com/s?k=phone&ref=nb_sb_noss']

    def __init__(self, category=None, *args, **kwargs):
        """Build one search start URL per comma-separated keyword.

        :param category: comma-separated search keywords, or None to
                         fall back to the default keyword 'phone'.
        """
        super(AmazonReviewSpider, self).__init__(*args, **kwargs)
        if category is not None:
            self.start_urls = [
                'https://www.amazon.com/s?k=' + key + '&ref=nb_sb_noss'
                for key in category.split(",")
            ]
        else:
            # Default search keyword when none is given on the command line.
            self.start_urls = ['https://www.amazon.com/s?k=phone&ref=nb_sb_noss']
        self.log("category = %s" % category)

    def parse(self, response):
        """Parse one search-result page: request the review listing of each
        product, then follow the next result page."""
        div_list = response.xpath("//*[@id=\"search\"]//div[@class=\"s-result-list s-search-results sg-row\"]/div")
        self.log("div_list_len=%s" % str(len(div_list)))
        for each_div in div_list:
            goods_url = each_div.xpath(".//h2/a/@href").extract_first()
            if goods_url is None:
                # Not a product tile (ad / placeholder row) -- skip it;
                # previously unquote(None) raised a TypeError here.
                continue
            # A fresh item per product so requests never share mutable state.
            item = AmazonGoodsItem()
            item['goods_url'] = url_parse.unquote(goods_url)
            item['name'] = self.get_goods_name(item['goods_url'])
            item['data_asin'] = self.get_data_asin(item['goods_url'])
            # First page of the product's review listing.
            review_url = 'https://www.amazon.com/' + item['name'] \
                + '/product-reviews/' + item['data_asin'] \
                + '/ref=cm_cr_getr_d_paging_btm_next_1?ie=UTF8' \
                + '&reviewerType=all_reviews&pageNumber=1'
            yield scrapy.Request(
                review_url,
                callback=self.parse_review_detail,
                meta={"item": deepcopy(item)}
            )
        # Next search-result page.
        next_url = response.xpath(
            "//*[@id='search']/div[1]/div[2]/div/span[7]/div/div/div/ul/li[7]/a/@href").extract_first()
        if next_url is not None:
            # BUGFIX: was 'https://www.amazon.cn' -- the .cn host is a
            # different site and is outside allowed_domains, so pagination
            # requests were filtered out and never crawled.
            next_url = 'https://www.amazon.com' + next_url
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )

    def parse_review_detail(self, response):
        """Parse one page of customer reviews and follow the pagination bar."""
        goods_item = response.meta["item"]
        # Each review lives in an element with id "customer_review-<id>".
        for each in response.xpath("//*[starts-with(@id,\"customer_review-\")]"):
            item = AmazonReviewItem()
            item['data_asin'] = goods_item['data_asin']
            item['name'] = goods_item['name']
            item["user_id"] = each.xpath("@id").extract_first().split("-")[1]
            # BUGFIX: the inner XPaths must be relative (".//..."); with an
            # absolute "//..." every review row re-selected the FIRST match
            # in the whole page, duplicating one review's data everywhere.
            # extract_first() also avoids IndexError on a missing node.
            item["user_name"] = each.xpath(".//span[@class='a-profile-name']")\
                .xpath('string(.)').extract_first()
            item['review_title'] = each.xpath(".//a[@data-hook='review-title']")\
                .xpath('string(.)').extract_first()
            item['review_star_rating'] = each.xpath(".//i[@data-hook='review-star-rating']")\
                .xpath('string(.)').extract_first()
            item['review_date'] = each.xpath(".//span[@data-hook='review-date']")\
                .xpath('string(.)').extract_first()
            item['review_info'] = each.xpath(".//span[@data-hook='review-body']") \
                .xpath('string(.)').extract_first()
            yield item
        # Follow the "next page" link of the review pagination bar, if any.
        next_page = response.xpath("//*[@id=\"cm_cr-pagination_bar\"]/ul/li[2]/a/@href").extract_first()
        self.log("-------next_page = %s" % next_page)
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(
                next_page,
                callback=self.parse_review_detail,
                meta={"item": deepcopy(goods_item)}
            )

    @staticmethod
    def get_goods_name(url):
        """Extract the product-name path segment from a result URL.

        Handles both sponsored redirect URLs ("...&url=/Name/dp/ASIN/...")
        and plain product URLs ("/Name/dp/ASIN/..."). Returns '' for None.
        """
        if url is None:
            return ''
        regex1 = re.compile(r"url=\/.*?\/")
        match = regex1.search(url)
        if match:
            # e.g. "url=/VTech-CS6529-4-Answering-Cordless-Handsets/"
            # -> the segment after the first '/'
            return match.group(0).split("/")[1]
        # Plain URL: "/Panasonic-KX-TGD532W-.../dp/B071GQB94T/..."
        # -> the first path segment
        return url.split("/")[1]

    @staticmethod
    def get_data_asin(url):
        """Extract the ASIN (Amazon product id) from a result URL.

        Looks for the "dp/<ASIN>/" segment; falls back to the first path
        segment when it is absent. Returns '' for None.
        """
        if url is None:
            return ''
        regex1 = re.compile(r"dp\/.*?\/")
        match = regex1.search(url)
        if match:
            # "dp/B00WHYS0R2/" -> "B00WHYS0R2"
            return match.group(0).split("/")[1]
        return url.split("/")[1]
4、定义pipelines.py:
class MyspidertestPipeline(object):
    """Append every scraped item as one JSON line to amazon_reviews.json."""

    def __init__(self):
        # Builtin open() with an explicit encoding replaces codecs.open(),
        # which was used here without `codecs` ever being imported.
        self.review_file = open('amazon_reviews.json', 'a', encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize `item` to one JSON line (non-ASCII kept readable)
        and return the item unchanged for the next pipeline stage."""
        import json  # local import: the snippet's pipelines.py shows no import section
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.review_file.write(lines)
        return item

    def spider_closed(self, spider):
        # BUGFIX: was self.review_file.closed() -- `closed` is a bool
        # attribute, so calling it raised TypeError and the file was
        # never actually closed/flushed.
        self.review_file.close()
5、修改settings.py :
ITEM_PIPELINES = {
'mySpiderTest.pipelines.MyspidertestPipeline': 300,
}
6、运行:
# category=phone 是传递的关键字参数,多个关键字用逗号隔开:category=huawei,oppo,vivo
scrapy crawl amazon_review -a category=phone