Python crawler: writing scraped data to MySQL through a connection pool

Below is the parameter documentation for the connection pool (the __init__ of the PooledDB class in DBUtils):

    def __init__(self, creator,
            mincached=0, maxcached=0,
            maxshared=0, maxconnections=0, blocking=False,
            maxusage=None, setsession=None, reset=True,
            failures=None, ping=1,
            *args, **kwargs):
        """Set up the DB-API 2 connection pool.

        creator: either an arbitrary function returning new DB-API 2
            connection objects or a DB-API 2 compliant database module
        mincached: initial number of idle connections in the pool
            (0 means no connections are made at startup)
        maxcached: maximum number of idle connections in the pool
            (0 or None means unlimited pool size)
        maxshared: maximum number of shared connections
            (0 or None means all connections are dedicated)
            When this maximum number is reached, connections are
            shared if they have been requested as shareable.
        maxconnections: maximum number of connections generally allowed
            (0 or None means an arbitrary number of connections)
        blocking: determines behavior when exceeding the maximum
            (if this is set to true, block and wait until the number of
            connections decreases, otherwise an error will be reported)
        maxusage: maximum number of reuses of a single connection
            (0 or None means unlimited reuse)
            When this maximum usage number of the connection is reached,
            the connection is automatically reset (closed and reopened).
        setsession: optional list of SQL commands that may serve to prepare
            the session, e.g. ["set datestyle to ...", "set time zone ..."]
        reset: how connections should be reset when returned to the pool
            (False or None to rollback transactions started with begin(),
            True to always issue a rollback for safety's sake)
        failures: an optional exception class or a tuple of exception classes
            for which the connection failover mechanism shall be applied,
            if the default (OperationalError, InternalError) is not adequate
        ping: determines when the connection should be checked with ping()
            (0 = None = never, 1 = default = whenever fetched from the pool,
            2 = when a cursor is created, 4 = when a query is executed,
            7 = always, and all other bit combinations of these values)
        args, kwargs: the parameters that shall be passed to the creator
            function or the connection constructor of the DB-API 2 module

        """

The pipeline code below writes data in batches of 1,000 rows: the connection pool is created when the spider opens, and when the spider closes, the remaining (possibly partial) batch is written to the database and the connections are released:

# -*- coding: utf-8 -*-
import sys
import MySQLdb

# Python 2 only: force UTF-8 as the default string encoding
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)
from DBUtils.PooledDB import PooledDB

class MySQLStorePipeline(object):

    def open_spider(self, spider):
        # buffer of scraped rows, flushed to MySQL once 1,000 items accumulate
        self.pgs = []
        self.pool = PooledDB(creator=MySQLdb, maxcached=5, maxshared=5,
                             host='localhost', user='root', passwd='root',
                             db='vboxdb', port=3306, charset='utf8')
        self.conn = self.pool.connection()
        self.cursor = self.conn.cursor()
        # start from an empty table
        self.cursor.execute("truncate table epg")
        self.conn.commit()

    # insert a whole batch in a single executemany() round trip
    def bulk_insert_to_mysql(self, bulkdata):
        try:
            print "inserting data in batch--->>>>>", len(bulkdata)
            sql = """INSERT INTO epg (date, channel, time, pg) VALUES (%s, %s, %s, %s)"""
            self.cursor.executemany(sql, bulkdata)
            self.conn.commit()
        except MySQLdb.Error as e:
            print "bulk insert failed, rolling back:", e
            self.conn.rollback()

    def process_item(self, item, spider):
        self.pgs.append((item['date'], item['channel'], item['time'], item['pg']))
        if len(self.pgs) >= 1000:
            self.bulk_insert_to_mysql(self.pgs)
            # clear the buffer
            del self.pgs[:]
        return item

    # when the spider closes, flush the remaining (possibly partial) batch
    def close_spider(self, spider):
        print "closing spider, last commit", len(self.pgs)
        if self.pgs:
            self.bulk_insert_to_mysql(self.pgs)
        self.cursor.close()
        self.conn.close()  # returns the connection to the pool
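
To activate the pipeline, it has to be registered in the Scrapy project's settings.py. The module path below is an assumption; substitute your own project name:

# settings.py -- 'vboxdb' as the project module is hypothetical
ITEM_PIPELINES = {
    'vboxdb.pipelines.MySQLStorePipeline': 300,
}

Note that the code above targets Python 2 and DBUtils 1.x; in DBUtils 2.0 and later the import path changed to from dbutils.pooled_db import PooledDB.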
