之前我们在scrapy入门使用一节中学习了管道的基本使用,接下来我们深入的学习scrapy管道的使用
继续完善wangyi爬虫,在pipelines.py代码中完善
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json
from pymongo import MongoClient
class WangyiPipeline:
    """Write items produced by the 'job' spider to wangyi.json, one JSON object per line."""

    def open_spider(self, spider):
        # Runs exactly once when the spider starts; open the file only for our spider.
        if spider.name == 'job':
            self.file = open('wangyi.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        if spider.name == 'job':
            # Convert the item object to a plain dict ...
            item = dict(item)
            # ... then serialize it to a JSON string (keep non-ASCII text readable).
            str_data = json.dumps(item, ensure_ascii=False) + ',\n'
            self.file.write(str_data)
        # Returning the item lets lower-priority pipelines receive it too.
        return item

    def close_spider(self, spider):
        # Runs exactly once when the spider closes.
        if spider.name == 'job':
            self.file.close()
class WangyiSimplePipeline:
    """Write items produced by the 'job_simple' spider to wangyiSimple.json."""

    def open_spider(self, spider):
        # Runs exactly once when the spider starts; open the file only for our spider.
        if spider.name == 'job_simple':
            self.file = open('wangyiSimple.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        if spider.name == 'job_simple':
            # Convert the item object to a plain dict ...
            item = dict(item)
            # ... then serialize it to a JSON string (keep non-ASCII text readable).
            str_data = json.dumps(item, ensure_ascii=False) + ',\n'
            self.file.write(str_data)
        # Returning the item lets lower-priority pipelines receive it too.
        return item

    def close_spider(self, spider):
        # Runs exactly once when the spider closes.
        if spider.name == 'job_simple':
            self.file.close()
class MongoPipeline(object):
    """Persist every item into the MongoDB collection itcast.wangyi."""

    def open_spider(self, spider):
        # Connect once at spider start-up.
        self.client = MongoClient('127.0.0.1', 27017)
        self.db = self.client['itcast']
        self.col = self.db['wangyi']

    def process_item(self, item, spider):
        # Convert the item object to a plain dict before inserting.
        data = dict(item)
        # Collection.insert() was removed in pymongo 4; insert_one() is the
        # supported API for inserting a single document.
        self.col.insert_one(data)
        return item

    def close_spider(self, spider):
        # Release the connection when the spider closes.
        self.client.close()
import json
from pymongo import MongoClient
class WangyiFilePipeline(object):
    """Append items from the 'itcast' spider to json.txt as pretty-printed JSON."""

    def open_spider(self, spider):
        # Called exactly once when the crawl starts.
        if spider.name == 'itcast':
            self.f = open('json.txt', 'a', encoding='utf-8')

    def close_spider(self, spider):
        # Called exactly once when the crawl finishes.
        if spider.name == 'itcast':
            self.f.close()

    def process_item(self, item, spider):
        if spider.name != 'itcast':
            # Not our spider's item — pass it through untouched.
            return item
        record = json.dumps(dict(item), ensure_ascii=False, indent=2)
        self.f.write(record + ',\n')
        # Without returning the item, lower-priority pipelines never receive it.
        return item
class WangyiMongoPipeline(object):
    """Insert items from the 'itcast' spider into MongoDB (database itcast, collection teachers)."""

    def open_spider(self, spider):
        # Runs exactly once when the spider starts.
        # isinstance() could also be used to distinguish spider classes.
        if spider.name == 'itcast':
            # Keep a reference to the client so it can be closed later
            # (the original dropped it, leaking the connection).
            self.client = MongoClient(host='127.0.0.1', port=27017)
            # Database 'itcast', collection 'teachers'.
            self.collection = self.client.itcast.teachers

    def process_item(self, item, spider):
        if spider.name == 'itcast':
            # dict(item) also accepts scrapy Item objects, which pymongo cannot
            # serialize directly; Collection.insert() was removed in pymongo 4,
            # so use insert_one().
            self.collection.insert_one(dict(item))
        # Without returning the item, lower-priority pipelines never receive it.
        return item

    def close_spider(self, spider):
        # Runs exactly once when the spider closes; release the connection.
        if spider.name == 'itcast':
            self.client.close()
在settings.py设置开启pipeline
......
ITEM_PIPELINES = {
   'wangyi.pipelines.WangyiFilePipeline': 400, # 400表示权重
   'wangyi.pipelines.WangyiMongoPipeline': 500, # 权重值越小,越优先执行!
}
......
别忘了开启mongodb数据库 sudo service mongodb start
并在mongodb数据库中查看 mongo
思考:在settings中能够开启多个管道,为什么需要开启多个?