Scrapy spider pipeline code for storing crawled items in MongoDB

Each item is converted to a dict and written into MongoDB.

You only need to provide the MongoDB URI and database name in settings.py and the pipeline is ready to use.
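
A minimal sketch of the corresponding settings.py entries. The module path weibo.pipelines below is an assumption; point it at wherever the pipeline class actually lives in your project:

# settings.py
MONGO_URI = 'mongodb://localhost:27017'   # MongoDB connection URI
MONGO_DATABASE = 'weibo'                  # database the pipeline writes into

ITEM_PIPELINES = {
    # register the pipeline; the module path is an assumption
    'weibo.pipelines.WeibMongoPipline': 300,
}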

import pymongo

from weibo.items import WeiboItem, WeiboContentItem  # assumption: adjust to your project's items module


class WeibMongoPipline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE'),
        )

    def open_spider(self, spider):
        # connect once when the spider starts
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        # release the connection when the spider finishes
        self.client.close()

    def process_item(self, item, spider):
        if isinstance(item, WeiboItem):
            self._process_use_item(item)
        elif isinstance(item, WeiboContentItem):
            self._process_cont_item(item)
        return item

    def _process_use_item(self, item):
        # upsert keyed on 'id' to avoid duplicates; 'user' is your own collection name
        self.db.user.update_one({'id': item['id']}, {'$set': dict(item)}, upsert=True)
        print('Weibo user stored')

    def _process_cont_item(self, item):
        # upsert keyed on 'cont_id'; 'content' is your own collection name
        self.db.content.update_one({'cont_id': item['cont_id']}, {'$set': dict(item)}, upsert=True)
        print('Weibo post stored')
