Scrapy MongoDB异步插入

MongoDB 异步插入写法

由于 Scrapy 是异步执行的,写入数据库时如果使用传统的同步写入方法,会阻塞事件循环,从而拖慢爬取速度。

在 settings.py 中添加 MONGO_URI、MONGO_DB、MONGO_COL:

# Connection string of the MongoDB server the pipeline connects to.
MONGO_URI = 'mongodb://127.0.0.1:27017/'
# Name of the database that receives the scraped items (placeholder value).
MONGO_DB = '数据库名'
# Name of the collection inside MONGO_DB that items are inserted into
# (placeholder value); the pipeline reads this setting as MONGO_COL.
MONGO_COL = '集合名'

pipelines.py 中:

import pymongo
from twisted.internet import reactor, defer

class MongoPipline(object):
    """Scrapy item pipeline that inserts items into MongoDB from a worker thread.

    The blocking PyMongo insert is handed to ``reactor.callInThread`` and the
    outcome is reported back through a Deferred, so ``process_item`` itself
    does not wait on the database call.
    """

    def __init__(self, mongo_uri, mongo_db, mongo_col):
        """Store the connection parameters; no connection is opened here.

        :param mongo_uri: MongoDB connection URI.
        :param mongo_db: name of the target database.
        :param mongo_col: name of the target collection.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.mongo_col = mongo_col

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler settings.

        Reads ``MONGO_URI`` (defaulting to the local server), ``MONGO_DB``
        and ``MONGO_COL`` from ``crawler.settings``.

        :param crawler: object exposing a ``settings.get(name, default)`` API.
        :return: a new pipeline instance.
        """
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://127.0.0.1:27017/'),
            mongo_db=crawler.settings.get('MONGO_DB'),
            # Was passed as a second ``mongo_db=`` keyword, which is a
            # SyntaxError and left the collection name unset.
            mongo_col=crawler.settings.get('MONGO_COL'),
        )

    def open_spider(self, spider):
        """Open the MongoDB client and select the database when the spider starts.

        :param spider: the spider being opened (unused).
        """
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.mongodb = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB client when the spider finishes.

        :param spider: the spider being closed (unused).
        """
        self.client.close()

    def process_item(self, item, spider):
        """Schedule the insert of ``item`` on a worker thread.

        :param item: the scraped item to store.
        :param spider: the spider that produced the item.
        :return: a Deferred that fires with ``item`` once the insert has
            finished, or errbacks with the exception the insert raised.
        """
        out = defer.Deferred()
        reactor.callInThread(self._insert, item, out, spider)
        # ``out`` already fires with the item, so it can be returned directly.
        return out

    def _insert(self, item, out, spider):
        """Insert ``item`` into the configured collection and resolve ``out``.

        Runs on a worker thread, so the Deferred is only touched through
        ``reactor.callFromThread``.

        :param item: the scraped item to store.
        :param out: Deferred to fire with the item, or to errback on failure.
        :param spider: the spider that produced the item (unused).
        """
        try:
            # ``Collection.insert`` no longer exists in PyMongo 4.
            self.mongodb[self.mongo_col].insert_one(dict(item))
        except Exception as exc:
            # Forward the failure; otherwise ``out`` would never fire and the
            # item would stay pending forever.
            reactor.callFromThread(out.errback, exc)
        else:
            reactor.callFromThread(out.callback, item)

你可能感兴趣的:(遇到的问题)