先记录代码吧,回头再写文字:
# -*- coding: utf-8 -*-
import scrapy
import json
import pprint
from copy import deepcopy
class JdSpider(scrapy.Spider):
    """Spider for the JD.com book catalogue.

    Flow: category index page -> sub-category listing pages (following the
    "next page" link) -> one price request per book. Each finished book is
    a plain ``dict`` carrying the category fields (``b_cate``, ``s_cate``,
    ``s_href``) plus the ``book_*`` fields.
    """

    name = 'jd'
    # 'p.3.cn' is listed because the price JSON requested in
    # parse_book_list() is served from that domain.
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']

    @staticmethod
    def _clean_text(value):
        """Return ``value`` stripped of surrounding whitespace, or ``None``.

        ``extract_first()`` returns ``None`` when the node is missing, so
        calling ``.strip()`` on its result directly would raise.
        """
        return value.strip() if value is not None else None

    def parse(self, response):
        """Yield one listing-page request per sub-category on the index page.

        Each request carries a fresh dict in ``meta['item']`` holding the
        top-level category title (``b_cate``), the sub-category title
        (``s_cate``) and the absolute sub-category URL (``s_href``).
        Sub-categories without a link are skipped.
        """
        # Each <dt> is a top-level category; its sub-categories live in the
        # <dd> that immediately follows it.
        for dt in response.xpath('//div[@class="mc"]/dl/dt'):
            b_cate = dt.xpath('./a/text()').extract_first()
            for em in dt.xpath('./following-sibling::dd[1]/em'):
                s_href = em.xpath('./a/@href').extract_first()
                if s_href is None:
                    continue
                # A new dict per sub-category: nothing is shared between
                # requests, so no copy is needed here.
                item = {
                    'b_cate': b_cate,
                    # urljoin() completes scheme-relative links
                    # ("//list.jd.com/...") and leaves absolute ones intact.
                    's_href': response.urljoin(s_href),
                    's_cate': em.xpath('./a/text()').extract_first(),
                }
                yield scrapy.Request(
                    item['s_href'],
                    callback=self.parse_book_list,
                    meta={'item': item},
                )

    def parse_book_list(self, response):
        """Extract the books on a listing page and follow pagination.

        For every book a price request is yielded with the partially filled
        item in ``meta['item']``. If a "next page" link exists, a request
        for it is yielded with the untouched category item.
        """
        category = response.meta['item']

        for li in response.xpath('//div[@id= "plist"]/ul/li'):
            sku = li.xpath('./div[1]/@data-sku').extract_first()
            if sku is None:
                # The price URL is built from the sku; without one there is
                # nothing valid to request.
                self.logger.debug('Skipping entry without data-sku on %s', response.url)
                continue

            # Copy so that book fields never leak into the category item that
            # is reused for the remaining books and for the next page.
            book = deepcopy(category)

            # The title text is surrounded by line breaks in the page.
            book['book_name'] = self._clean_text(
                li.xpath('.//div[@class="p-name"]/a/em/text()').extract_first())

            # The image URL is either in @src or in @data-lazy-img.
            img = li.xpath('.//div[@class="p-img"]//img/@src').extract_first()
            if img is None:
                img = li.xpath(
                    './/div[@class="p-img"]//img/@data-lazy-img').extract_first()
            book['book_img'] = response.urljoin(img) if img is not None else None

            # Some books have no author; the join then yields ''.
            authors = (
                name.strip()
                for name in li.xpath(
                    './/span[@class="author_type_1"]/a/text()').extract()
            )
            book['book_author'] = ', '.join(name for name in authors if name)

            book['book_press'] = li.xpath(
                './/span[@class="p-bi-store"]/a/text()').extract_first()
            book['book_date'] = self._clean_text(
                li.xpath('.//span[@class="p-bi-date"]/text()').extract_first())
            book['book_sku'] = sku

            # The price is not in the listing HTML; it is fetched from a
            # separate JSON endpoint keyed by the sku.
            yield scrapy.Request(
                'https://p.3.cn/prices/mgets?skuIds=J_{}'.format(sku),
                callback=self.parse_book_price,
                meta={'item': book},
            )

        # Pagination.
        next_url = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_url is not None:
            # urljoin() resolves the href against the current page, which
            # avoids a doubled slash for root-relative links.
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse_book_list,
                meta={'item': category},
            )

    def parse_book_price(self, response):
        """Attach the price to the book item, print it and yield it.

        The response body is parsed as JSON and the ``op`` field of its
        first entry is stored as ``book_price``. If the payload cannot be
        parsed or has an unexpected shape, ``book_price`` is set to ``None``
        and a warning is logged instead of dropping the book.
        """
        item = response.meta['item']
        try:
            item['book_price'] = json.loads(response.body.decode())[0]['op']
        except (ValueError, LookupError, TypeError):
            self.logger.warning(
                'Could not read price for sku %s from %s',
                item.get('book_sku'), response.url)
            item['book_price'] = None
        pprint.pprint(item)
        # Yield the item so pipelines and feed exports receive it as well.
        yield item
settings.py 里面的相关配置:
# --- Logging ---------------------------------------------------------------
LOG_LEVEL = "WARNING"

# --- Request identity / robots.txt ------------------------------------------
# A desktop Chrome User-Agent string is sent with requests.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/75.0.3770.100 Safari/537.36"
)
# robots.txt rules are NOT obeyed.
ROBOTSTXT_OBEY = False

# --- scrapy_redis -----------------------------------------------------------
# Scheduler and duplicate filter are taken from scrapy_redis, using the
# Redis instance at REDIS_URL.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# NOTE(review): presumably keeps the Redis queue/dupefilter between runs so a
# crawl can be resumed - confirm against the scrapy_redis documentation.
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"
安装 Redis 后运行爬虫,爬取结果如下:
{'b_cate': '小说',
'book_author': ' 阿瑟·克拉克 ',
'book_date': '2019-04',
'book_img': 'https://img13.360buyimg.com/n7/jfs/t1/11337/28/15060/482476/5ca5afe5E0248d72a/6f1489b91ad092d5.jpg',
'book_name': '2001:太空漫游(刘慈欣说:我所有作品都是对《2001:太空漫游》的拙劣模仿!)',
'book_press': '上海文艺出版社',
'book_price': '51.50',
'book_sku': '12512383',
's_cate': '科幻小说',
's_href': 'https://list.jd.com/1713-3258-6569.html'}
{'b_cate': '小说',
'book_author': '',
'book_date': '2014-12',
'book_img': 'https://img10.360buyimg.com/n7/jfs/t2080/170/1100535587/24948/c8227a68/5646a6ebN523d6961.jpg',
'book_name': '四大名著全套 精装珍藏版 原著无障碍阅读 青少年中小学生版 西游记 水浒传 红楼梦 三国演义',
'book_press': '世界图书出版公司',
'book_price': '128.00',
'book_sku': '10034517855',
's_cate': '四大名著',
's_href': 'https://list.jd.com/1713-3258-3300.html'}
{'b_cate': '小说',
'book_author': ' 天下霸唱 ',
'book_date': '2019-05',
'book_img': 'https://img11.360buyimg.com/n7/jfs/t1/35972/33/9712/412296/5cd506afEdfad6130/3cc4d2743b602d00.jpg',
'book_name': '绝对循环(京东专享天下霸唱明信片签名本 手绘插画精装版,随书附赠精美书签 )',
'book_press': '中国文联出版社',
'book_price': '40.90',
'book_sku': '12606906',
's_cate': '惊悚/恐怖',
's_href': 'https://list.jd.com/1713-3258-3305.html'}
{'b_cate': '小说',
'book_author': ' 南派三叔 ',
'book_date': '2019-06',
'book_img': 'https://img10.360buyimg.com/n7/jfs/t1/82523/15/3491/358366/5d19d4f9Ec98668c1/db7c4b9356086922.jpg',
'book_name': '盗墓笔记.5谜海归巢(典藏纪念版)(盗墓诡异秘闻——鬼城魅影:谁在如影随形)',
'book_press': '上海文化出版社',
'book_price': '30.20',
'book_sku': '12646090',
's_cate': '惊悚/恐怖',
's_href': 'https://list.jd.com/1713-3258-3305.html'}
{'b_cate': '小说',
'book_author': ' 南派三叔 ',
'book_date': '2019-06',
'book_img': 'https://img11.360buyimg.com/n7/jfs/t1/50397/34/3897/367912/5d19dd57Ee7ed6843/0477447ee0e03b42.jpg',
'book_name': '盗墓笔记.4蛇沼鬼城(典藏纪念版)(盗墓诡异秘闻——闷油瓶归来:寻找西王母国)',
'book_press': '上海文化出版社',
'book_price': '32.10',
'book_sku': '12544811',
's_cate': '惊悚/恐怖',
's_href': 'https://list.jd.com/1713-3258-3305.html'}