scrapy爬取数据之后,如何存入mysql

pipelines.py文件中新建MySQLPipeline类:

# 导入库
from scrapy.utils.project import get_project_settings
import pymysql

# 写入数据库
class MySQLPipeline(object):
    """Scrapy item pipeline that writes scraped items into a MySQL table.

    Connection parameters are read from settings.py (DB_HOST, DB_PORT,
    DB_USER, DB_PASSWORD, DB_NAME, DB_CHARSET). The connection is opened
    when the spider starts and closed when it finishes.
    """

    def connect_db(self):
        # Read the database connection info from settings.py.
        settings = get_project_settings()

        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']

        # Open the database connection.
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.name,  # database name
            charset=self.charset,
        )

        # Cursor used to execute SQL statements.
        self.cursor = self.conn.cursor()

    # Called once when the spider opens: establish the DB connection.
    def open_spider(self, spider):
        self.connect_db()

    # Called once when the spider closes: release cursor and connection.
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    # Called for every scraped item: insert it into the database.
    def process_item(self, item, spider):
        # Use a parameterized query instead of %-string interpolation:
        # the original 'values ("%s", "%s")' % (...) form breaks on any
        # value containing a double quote and is an SQL-injection vector.
        # PyMySQL escapes the parameters itself when they are passed as
        # the second argument to execute().
        sql = 'insert into book (title, img_url) values (%s, %s)'

        try:
            self.cursor.execute(sql, (item['title'], item['img_url']))
            # Commit explicitly; autocommit is off by default, so without
            # this the data would be rolled back when the connection closes.
            self.conn.commit()
        except Exception:
            # Undo the failed statement so the connection stays usable
            # for subsequent items, then let Scrapy report the error.
            self.conn.rollback()
            raise

        return item

设置settings.py文件,开启ITEM_PIPELINE,并设置数据库相关信息:

# Enable both pipelines; a lower number means higher priority,
# so MySQLPipeline (200) runs before DushuprojectPipeline (300).
ITEM_PIPELINES = {
   'dushuProject.pipelines.DushuprojectPipeline': 300,
   'dushuProject.pipelines.MySQLPipeline': 200,  # run the MySQL writer first
}

# Database connection settings read by MySQLPipeline.connect_db().
DB_HOST = 'localhost'
DB_PORT = 3306
DB_USER = '数据库用户名'
DB_PASSWORD = '数据库密码'
DB_NAME = '数据库名'
DB_CHARSET = 'utf8'

你可能感兴趣的:(Python:爬虫学习)