cmd 命令创建项目
scrapy startproject yiyaowang
cd yiyaowang
scrapy genspider yaowang yaowang.com
先进入settings.py文件,将"遵守爬虫协议(robots.txt)"的设置改成False;因为有些网站不改就爬取不了,所以统一都改掉
# Obey robots.txt rules
# Set to False here so the crawl is not restricted by the target site's robots.txt.
ROBOTSTXT_OBEY = False
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class YiyaowangItem(scrapy.Item):
    """Container for one scraped product record.

    Fields:
        title: product (medicine) name.
        price: product price.
        comment: number of comments on the product.
    """

    # Product (medicine) name
    title = scrapy.Field()

    # Product price
    price = scrapy.Field()

    # Number of comments
    comment = scrapy.Field()
yaowang.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import YiyaowangItem
class YaowangSpider(scrapy.Spider):
    """Crawl the paginated category listing and yield one item per product.

    Each listing page is parsed by :meth:`parse`, which emits a
    ``YiyaowangItem`` with ``title``, ``price`` and ``comment`` for every
    ``<li>`` found under ``ul#itemSearchList``.
    """

    name = 'yaowang'
    # allowed_domains = ['yaowang.com']

    # Pagination: only the page number in the URL changes between pages.
    base_url = 'https://www.111.com.cn/categories/953710-j{}.html'
    # Pages 1..50. ``map`` is used instead of a comprehension because a
    # comprehension body cannot see class-level names such as ``base_url``,
    # and it avoids leaking a loop variable into the class namespace.
    start_urls = list(map(base_url.format, range(1, 51)))

    def parse(self, response):
        """Extract every product on a listing page.

        Args:
            response: the downloaded listing page.

        Yields:
            YiyaowangItem: one freshly created item per product. A field is
            ``None`` when the corresponding node is missing from the page,
            so one malformed entry no longer aborts the rest of the page.
        """
        for li in response.xpath('//ul[@id="itemSearchList"]/li'):
            # Title: the first text node of the link is whitespace, so the
            # real name is the second one. Fall back gracefully when the
            # markup has fewer text nodes instead of raising IndexError.
            title_texts = li.xpath('.//p[@class="titleBox"]/a/text()').getall()
            if len(title_texts) > 1:
                title = title_texts[1].strip()
            else:
                title = ''.join(title_texts).strip() or None

            # Price: in the raw page source the <span> is wrapped in an extra
            # tag, so a descendant step (//span) is required; a direct child
            # step (/span) misses some products.
            raw_price = li.xpath('.//p[@class="price"]//span/text()').get()
            price = raw_price.strip() if raw_price is not None else None

            # Comment count (None when the node is absent).
            comment = li.xpath('.//a[@id="pdlink3"]/em/text()').get()

            # Build a new item per product: reusing one instance would make
            # every yielded item share (and overwrite) the same object.
            yield YiyaowangItem(title=title, price=price, comment=comment)
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# 保存MongoDB数据库
import pymongo
class YiyaowangPipeline(object):
    """Item pipeline that saves every scraped item into MongoDB.

    The connection is opened when the spider starts and closed when it
    finishes; each item is stored as one document in the ``yaowang``
    collection of the ``yiyaowang`` database.
    """

    def open_spider(self, spider):
        """Connect to MongoDB and select the target database and collection."""
        # 1. Connect to the database server
        self.client = pymongo.MongoClient(host='10.10.34.163', port=27017)
        # 2. Select the database
        self.db = self.client['yiyaowang']
        # 3. Select the collection
        self.col = self.db['yaowang']

    def process_item(self, item, spider):
        """Insert ``item`` as a document and pass it on unchanged.

        ``Collection.insert()`` was deprecated in PyMongo 3.0 and removed in
        PyMongo 4.0, so ``insert_one()`` is used. The item is copied into a
        plain dict first, so the ``_id`` MongoDB adds never leaks into the
        item handed to later pipelines.

        Returns:
            The same ``item`` that was passed in.
        """
        self.col.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()
最后记得将settings.py里面的ITEM_PIPELINES注释解开,不解开的话是不会执行pipelines.py文件的
# Register the pipeline so Scrapy runs it on scraped items; the integer is
# the order value assigned to this pipeline.
ITEM_PIPELINES = {
'yiyaowang.pipelines.YiyaowangPipeline': 300,
}