- 【Storage structure】
- class CategoriesItem(Item): stores JD category information
- class ProductsItem(Item): stores JD product information
- class ShopItem(Item): stores JD shop information
- class CommentSummaryItem(Item): stores the comment summary of each JD product
- class CommentItem(Item): stores the basic information of each product comment
- class CommentImageItem(Item): stores the image information attached to each comment
- Note: the fields defined in each class can be adjusted to match the actual crawling requirements or the response content
【items.py】
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy import Item, Field

class CategoriesItem(Item):
    """
    Stores JD category information
    """
    name = Field()            # third-level category name
    url = Field()             # URL of the third-level category
    _id = Field()             # category id [level-1 id, level-2 id, level-3 id]

class ProductsItem(Item):
    """
    Stores JD product information
    """
    name = Field()            # product name
    url = Field()             # product URL [used to extract the main product image]
    _id = Field()             # product sku
    category = Field()        # third-level category of the product
    description = Field()     # product description
    shopId = Field()          # id (name) of the shop selling the product
    commentCount = Field()    # total number of comments = CommentSummaryItem.commentCount
    # goodComment = Field()      # number of positive comments
    # generalComment = Field()   # number of neutral comments
    # poorComment = Field()      # number of negative comments
    # favourableDesc1 = Field()  # promotion description 1
    # favourableDesc2 = Field()  # promotion description 2
    # venderId = Field()         # vendor id
    # reallyPrice = Field()      # current price
    # originalPrice = Field()    # original price

class ShopItem(Item):
    _id = Field()             # shop URL
    shopName = Field()        # shop name
    shopItemScore = Field()   # shop score [product rating]
    shopLgcScore = Field()    # shop score [logistics]
    shopAfterSale = Field()   # shop score [after-sales service]

class CommentItem(Item):
    _id = Field()             # comment id
    productId = Field()       # product id = sku
    guid = Field()            # globally unique identifier of the comment
    firstCategory = Field()   # first-level category of the product
    secondCategory = Field()  # second-level category of the product
    thirdCategory = Field()   # third-level category of the product
    score = Field()           # user rating
    nickname = Field()        # user nickname
    plusAvailable = Field()   # user account level (201: PLUS member, 103: regular user, 0: low-value user)
    content = Field()         # comment text
    creationTime = Field()    # comment timestamp
    replyCount = Field()      # number of replies to the comment
    usefulVoteCount = Field() # number of upvotes the comment received
    imageCount = Field()      # number of images attached to the comment

class CommentImageItem(Item):
    _id = Field()             # id of the uploaded image (one id per image)
    commentGuid = Field()     # guid of the comment the image belongs to
    imgId = Field()           # image id
    imgUrl = Field()          # image URL
    imgTitle = Field()        # image title
    imgStatus = Field()       # image status

class CommentSummaryItem(Item):
    """Product comment summary"""
    _id = Field()             # product sku
    productId = Field()       # product pid
    commentCount = Field()    # cumulative number of comments for the product
    score1Count = Field()     # number of 1-star ratings
    score2Count = Field()     # number of 2-star ratings
    score3Count = Field()     # number of 3-star ratings
    score4Count = Field()     # number of 4-star ratings
    score5Count = Field()     # number of 5-star ratings
- 【Pipeline file notes】
- Database: MongoDB
- Database name: JD
- Collections: Categories, Products, Shop, CommentSummary, Comment, and CommentImage
- Processing: first match the item type against the target collection, then insert; inserting a duplicate raises an exception
【pipelines.py】
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from JDSpider.items import *

class MongoDBPipeline(object):
    def __init__(self):
        client = pymongo.MongoClient("localhost", 27017)
        db = client["JD"]
        self.Categories = db["Categories"]
        self.Products = db["Products"]
        self.Shop = db["Shop"]
        self.Comment = db["Comment"]
        self.CommentImage = db["CommentImage"]
        self.CommentSummary = db["CommentSummary"]

    def process_item(self, item, spider):
        """Dispatch on the item type and insert into the matching collection."""
        if isinstance(item, CategoriesItem):
            try:
                self.Categories.insert_one(dict(item))
            except Exception as e:
                print('insert failed:', e)
        elif isinstance(item, ProductsItem):
            try:
                self.Products.insert_one(dict(item))
            except Exception as e:
                print('insert failed:', e)
        elif isinstance(item, ShopItem):
            try:
                self.Shop.insert_one(dict(item))
            except Exception as e:
                print('insert failed:', e)
        elif isinstance(item, CommentItem):
            try:
                self.Comment.insert_one(dict(item))
            except Exception as e:
                print('insert failed:', e)
        elif isinstance(item, CommentImageItem):
            try:
                self.CommentImage.insert_one(dict(item))
            except Exception as e:
                print('insert failed:', e)
        elif isinstance(item, CommentSummaryItem):
            try:
                self.CommentSummary.insert_one(dict(item))
            except Exception as e:
                print('insert failed:', e)
        return item
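- Note: since every item sets an explicit _id, it is MongoDB's built-in unique index on _id that makes duplicate inserts raise an exception. A minimal standalone sketch of that behavior (the sku value below is made up for illustration):
import pymongo
from pymongo.errors import DuplicateKeyError

client = pymongo.MongoClient("localhost", 27017)
products = client["JD"]["Products"]

doc = {"_id": "12345678", "name": "demo product"}  # made-up sku
products.insert_one(doc)
try:
    products.insert_one(doc)  # second insert with the same _id
except DuplicateKeyError:
    print("duplicate rejected by the unique index on _id")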
- 【Middleware file notes】
- Includes a User-Agent rotation middleware and a retry middleware
- UserAgentMiddleware: varies the request headers so that consecutive requests are not detected and blacklisted by JD's backend
- CookiesMiddleware (built on RetryMiddleware): checks the status of JD's server responses and handles each case accordingly
【middlewares.py】
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
import os
import random
import logging
from scrapy.exceptions import IgnoreRequest
from scrapy.utils.response import response_status_message
from scrapy.downloadermiddlewares.retry import RetryMiddleware

logger = logging.getLogger(__name__)

class UserAgentMiddleware(object):
    """Rotate the User-Agent header"""
    def process_request(self, request, spider):
        # Pick a random User-Agent from a local file (one entry per line)
        with open("E://proxy.txt", "r") as f:
            PROXIES = f.readlines()
        agent = random.choice(PROXIES).strip()
        request.headers["User-Agent"] = agent

class CookiesMiddleware(RetryMiddleware):
    """Maintain cookies / retry on bad responses"""
    def process_request(self, request, spider):
        pass

    def process_response(self, request, response, spider):
        if response.status in [300, 301, 302, 303]:
            try:
                reason = response_status_message(response.status)
                return self._retry(request, reason, spider) or response  # retry
            except Exception as e:
                raise IgnoreRequest
        elif response.status in [403, 414]:
            # Blocked or URL too long: pause so the operator can intervene
            logger.error("%s! Stopping..." % response.status)
            os.system("pause")  # Windows only
            return response
        else:
            return response
- 【settings.py modifications】
- robots.txt: set ROBOTSTXT_OBEY to False, since JD's robots.txt would otherwise forbid the crawl
- Maximum concurrent requests: set according to your machine's actual capacity
- Downloader middleware priority: the smaller the value, the higher the priority
- Pipeline priority: the smaller the value, the higher the priority
- Note: the full settings file is too long to show here; the relevant entries are sketched below
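- Since the file itself is omitted, here is a minimal sketch of the entries the notes above refer to. The middleware and pipeline paths follow this project's class names, while the priority values are illustrative assumptions rather than the author's originals:
# settings.py (sketch; priority values are illustrative)
ROBOTSTXT_OBEY = False      # JD's robots.txt would otherwise block the crawl

CONCURRENT_REQUESTS = 16    # tune to your machine's capacity

DOWNLOADER_MIDDLEWARES = {
    # smaller value = higher priority
    'JDSpider.middlewares.UserAgentMiddleware': 400,
    'JDSpider.middlewares.CookiesMiddleware': 450,
}

ITEM_PIPELINES = {
    # smaller value = higher priority
    'JDSpider.pipelines.MongoDBPipeline': 300,
}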
- 【Category scraping notes】
- Some categories contain many sub-categories, so for such URLs another Request has to be yielded and crawled recursively
texts = selector.xpath('//div[@class="category-item m"]/div[@class="mc"]/div[@class="items"]/dl/dd/a').extract()
for text in texts:
    # Extract all third-level category links + names.
    # Note: the anchor markup in the original pattern was lost in transcription;
    # the regex below is reconstructed so that item[0] is the href and item[1] the name.
    items = re.findall(r'<a href="(.*?)".*?>(.*?)</a>', text)
    for item in items:
        # Decide whether this link needs a further request
        if item[0].split('.')[0][2:] in key_word:
            if item[0].split('.')[0][2:] != 'list':
                # A page of sub-categories: crawl it recursively
                yield Request(url='https:' + item[0], callback=self.parse_category)
            else:
                # Record the category: name / crawlable URL / id
                categoriesItem = CategoriesItem()
                categoriesItem['name'] = item[1]
                categoriesItem['url'] = 'https:' + item[0]
                categoriesItem['_id'] = item[0].split('=')[1].split('&')[0]
                yield categoriesItem
                meta = dict()
                meta["category"] = item[0].split("=")[1]
                yield Request(url='https:' + item[0], callback=self.parse_list, meta=meta)
- 【Product scraping notes】
- Flow: visit each category URL, collect the URL of every product on the listing page, then open the detail page and scrape the product details
- Note: pagination requires analyzing the response address behind the "next page" request and deriving its pattern (a sketch of list_url follows the code below)
【Extracting product links】
selector = Selector(response)
texts = selector.xpath('//*[@id="J_goodsList"]/ul/li/div/div[@class="p-img"]/a').extract()
for text in texts:
    # Pull the href out of the anchor markup
    items = text.split("=")[3].split('"')[1]
    yield Request(url='https:' + items, callback=self.parse_product, meta=meta)
# Pagination [first 50 pages only]
maxPage = int(response.xpath('//div[@id="J_filter"]/div/div/span/i/text()').extract()[0])
if maxPage > 1:
    if maxPage > 50:
        maxPage = 50
    for i in range(2, maxPage + 1):
        # The page parameter advances in odd steps (each visible page spans two internal pages)
        num = 2 * i - 1
        category = meta["category"].split(",")[0] + '%2C' + meta["category"].split(",")[1] + '%2C' + meta["category"].split(",")[2]
        url = list_url % (category, num, 30 * num)
        print('products next page:', url)
        yield Request(url=url, callback=self.parse_list2, meta=meta)
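- Note: list_url is used above but not defined in the excerpt. Given the three %s slots filled with (category, page, offset), a plausible template is sketched below; treat the exact query parameters as an assumption, not the author's original:
# Assumed pagination template: the slots are the %2C-joined category ids,
# the internal (odd) page number, and the item offset.
list_url = 'https://list.jd.com/list.html?cat=%s&page=%s&s=%s&scrolling=y'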
- 【Shop scraping notes】
- The shop information can be collected from the same page as the product details
- However, self-operated (自营) listings must be distinguished from third-party shops, because self-operated pages lack some of the shop fields
# Get the id + information of the shop selling the product
shopItem["shopName"] = response.xpath('//div[@class="m m-aside popbox"]/div/div/h3/a/text()').extract()[0]
shopItem["_id"] = "https:" + response.xpath('//div[@class="m m-aside popbox"]/div/div/h3/a/@href').extract()[0]
productsItem['shopId'] = shopItem["_id"]
# Distinguish self-operated listings: they have no score panel
res = response.xpath('//div[@class="score-parts"]/div/span/em/@title').extract()
if len(res) == 0:
    shopItem["shopItemScore"] = "京东自营"
    shopItem["shopLgcScore"] = "京东自营"
    shopItem["shopAfterSale"] = "京东自营"
else:
    shopItem["shopItemScore"] = res[0]
    shopItem["shopLgcScore"] = res[1]
    shopItem["shopAfterSale"] = res[2]
# shopItem["_id"] = response.xpath('//div[@class="m m-aside popbox"]/div/div/h3/a/@href').extract()[0].split("-")[1].split(".")[0]
yield shopItem
- 【Comment scraping notes】
- Comments are also loaded dynamically and returned as JSON, and the endpoint is updated from time to time; the request format is as follows:
comment_url = 'https://club.jd.com/comment/productPageComments.action?productId=%s&score=0&sortType=5&page=%s&pageSize=10'
def parse_comments(self, response):
    """
    Extract product comments
    :param response: the JSON response containing the comments
    :return:
    """
    try:
        data = json.loads(response.text)
    except Exception as e:
        print('get comment failed:', e)
        return None
    product_id = response.meta['product_id']
    # Comment summary for the product [imported only once]
    commentSummaryItem = CommentSummaryItem()
    commentSummary = data.get('productCommentSummary')
    commentSummaryItem['_id'] = commentSummary.get('skuId')
    commentSummaryItem['productId'] = commentSummary.get('productId')
    commentSummaryItem['commentCount'] = commentSummary.get('commentCount')
    commentSummaryItem['score1Count'] = commentSummary.get('score1Count')
    commentSummaryItem['score2Count'] = commentSummary.get('score2Count')
    commentSummaryItem['score3Count'] = commentSummary.get('score3Count')
    commentSummaryItem['score4Count'] = commentSummary.get('score4Count')
    commentSummaryItem['score5Count'] = commentSummary.get('score5Count')
    # (the pipeline dispatches on the item type)
    yield commentSummaryItem
    # Comments [first page; the remaining pages are handled by parse_comments2]
    for comment_item in data['comments']:
        comment = CommentItem()
        comment['_id'] = str(product_id) + "," + str(comment_item.get("id"))
        comment['productId'] = product_id
        comment["guid"] = comment_item.get('guid')
        comment['firstCategory'] = comment_item.get('firstCategory')
        comment['secondCategory'] = comment_item.get('secondCategory')
        comment['thirdCategory'] = comment_item.get('thirdCategory')
        comment['score'] = comment_item.get('score')
        comment['nickname'] = comment_item.get('nickname')
        comment['plusAvailable'] = comment_item.get('plusAvailable')
        comment['content'] = comment_item.get('content')
        comment['creationTime'] = comment_item.get('creationTime')
        comment['replyCount'] = comment_item.get('replyCount')
        comment['usefulVoteCount'] = comment_item.get('usefulVoteCount')
        comment['imageCount'] = comment_item.get('imageCount')
        yield comment
        # Store the images attached to the current comment
        if 'images' in comment_item:
            for image in comment_item['images']:
                commentImageItem = CommentImageItem()
                commentImageItem['commentGuid'] = comment_item.get('guid')
                commentImageItem['imgId'] = image.get('id')
                commentImageItem['_id'] = str(product_id) + "," + str(comment_item.get('id')) + "," + str(image.get('id'))
                commentImageItem['imgUrl'] = 'http:' + image.get('imgUrl')
                commentImageItem['imgTitle'] = image.get('imgTitle')
                commentImageItem['imgStatus'] = image.get('status')
                yield commentImageItem
    # Comment pagination [try to collect enough ratings]
    max_page = int(data.get('maxPage', '1'))
    # if max_page > 60:
    #     # Cap the number of comment pages
    #     max_page = 60
    for i in range(1, max_page):
        url = comment_url % (product_id, str(i))
        meta = dict()
        meta['product_id'] = product_id
        yield Request(url=url, callback=self.parse_comments2, meta=meta)
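- Note: parse_comments2 is referenced above but not shown. A minimal sketch, assuming it mirrors parse_comments without re-emitting the one-time CommentSummaryItem (the exact original may differ):
def parse_comments2(self, response):
    """Sketch: parse comment pages 1..maxPage-1 (no summary item here)."""
    try:
        data = json.loads(response.text)
    except Exception as e:
        print('get comment failed:', e)
        return None
    product_id = response.meta['product_id']
    for comment_item in data.get('comments', []):
        comment = CommentItem()
        comment['_id'] = str(product_id) + "," + str(comment_item.get("id"))
        comment['productId'] = product_id
        comment['content'] = comment_item.get('content')
        # ...fill the remaining CommentItem / CommentImageItem fields
        #    exactly as in parse_comments above
        yield comment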
If you need the data, feel free to get in touch; the dataset is very large.