The following describes the connection pool's constructor parameters:
def __init__(self, creator,
mincached=0, maxcached=0,
maxshared=0, maxconnections=0, blocking=False,
maxusage=None, setsession=None, reset=True,
failures=None, ping=1,
*args, **kwargs):
"""Set up the DB-API 2 connection pool.
creator: either an arbitrary function returning new DB-API 2
connection objects or a DB-API 2 compliant database module
mincached: initial number of idle connections in the pool
(0 means no connections are made at startup)
maxcached: maximum number of idle connections in the pool
(0 or None means unlimited pool size)
maxshared: maximum number of shared connections
(0 or None means all connections are dedicated)
When this maximum number is reached, connections are
shared if they have been requested as shareable.
maxconnections: maximum number of connections generally allowed
(0 or None means an arbitrary number of connections)
blocking: determines behavior when exceeding the maximum
(if this is set to true, block and wait until the number of
connections decreases, otherwise an error will be reported)
maxusage: maximum number of reuses of a single connection
(0 or None means unlimited reuse)
When this maximum usage number of the connection is reached,
the connection is automatically reset (closed and reopened).
setsession: optional list of SQL commands that may serve to prepare
the session, e.g. ["set datestyle to ...", "set time zone ..."]
reset: how connections should be reset when returned to the pool
(False or None to rollback transactions started with begin(),
True to always issue a rollback for safety's sake)
failures: an optional exception class or a tuple of exception classes
for which the connection failover mechanism shall be applied,
if the default (OperationalError, InternalError) is not adequate
ping: determines when the connection should be checked with ping()
(0 = None = never, 1 = default = whenever fetched from the pool,
2 = when a cursor is created, 4 = when a query is executed,
7 = always, and all other bit combinations of these values)
args, kwargs: the parameters that shall be passed to the creator
function or the connection constructor of the DB-API 2 module
"""
The pipeline code below writes data in batches of 1000 rows: the connection pool is created when the spider opens, and when the spider closes, any remaining partial batch is written to the database and the pool's resources are released:
# -*- coding: utf-8 -*-
import sys
import MySQLdb
# Python 2 only: force the interpreter-wide default string encoding to
# UTF-8 so implicit str <-> unicode conversions of non-ASCII text do not
# raise UnicodeDecodeError.  reload(sys) is required because
# setdefaultencoding() is removed from the sys module during startup.
# NOTE(review): this hack (and builtin reload) does not exist in Python 3.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)
from DBUtils.PooledDB import PooledDB
class MySQLStorePipeline(object):
    """Scrapy pipeline that buffers scraped EPG items and bulk-inserts
    them into MySQL in batches, using a DBUtils connection pool.

    The pool is created in ``open_spider``; any remaining partial batch
    is flushed and the DB resources are released in ``close_spider``.
    """

    # Number of buffered rows that triggers a bulk insert.
    BATCH_SIZE = 1000

    # Kept as a class attribute for backward compatibility, but re-bound
    # to a fresh per-instance list in open_spider(): a class-level mutable
    # list is shared by every instance of the pipeline, so two pipeline
    # objects would otherwise write into the same buffer.
    pgs = []

    def open_spider(self, spider):
        """Create the connection pool, grab one pooled connection and
        empty the target table when the spider starts.
        """
        self.pool = PooledDB(creator=MySQLdb, maxcached=5, maxshared=5,
                             host='localhost', user='root', passwd='root',
                             db='vboxdb', port=3306, charset="utf8")
        self.conn = self.pool.connection()
        self.cursor = self.conn.cursor()
        # Use an instance-level buffer instead of the shared class list.
        self.pgs = []
        # Empty the table so each crawl starts from a clean slate.
        self.cursor.execute("truncate table epg")
        self.conn.commit()

    def bulk_insert_to_mysql(self, bulkdata):
        """Insert the buffered rows in one executemany() round trip,
        rolling back (and reporting the error) on failure.
        """
        try:
            print("inserting data in batch--->>>>> %d" % len(bulkdata))
            sql = """INSERT INTO epg (date, channel, time, pg) VALUES (%s, %s, %s, %s)"""
            self.cursor.executemany(sql, bulkdata)
            self.conn.commit()
        except Exception as exc:
            # Roll back the failed batch and report the error instead of
            # silently swallowing it (the original bare `except:` hid
            # every failure, including lost data).
            self.conn.rollback()
            print("bulk insert failed: %s" % exc)

    def process_item(self, item, spider):
        """Buffer one item; flush the buffer to MySQL once it reaches
        BATCH_SIZE rows.  Returns the item unchanged for the next
        pipeline stage.
        """
        self.pgs.append((item['date'], item['channel'], item['time'], item['pg']))
        # `>=` instead of `==`: the buffer can never silently grow past
        # the threshold even if it was pre-seeded or the constant changes.
        if len(self.pgs) >= self.BATCH_SIZE:
            self.bulk_insert_to_mysql(self.pgs)
            # Clear the buffer in place.
            del self.pgs[:]
        return item

    def close_spider(self, spider):
        """Flush any remaining partial batch and release DB resources."""
        print("closing spider,last commit %d" % len(self.pgs))
        try:
            if self.pgs:  # skip a pointless round trip for an empty batch
                self.bulk_insert_to_mysql(self.pgs)
                del self.pgs[:]
            self.conn.commit()
        finally:
            # Close cursor/connection even if the final flush failed.
            self.cursor.close()
            self.conn.close()