Saving data with Scrapy

Writing items to a JSON file

import json


class JsonWriterPipeline(object):

    def open_spider(self, spider):
        # Open in text mode; the original 'wb' would crash because
        # json.dumps() returns str, not bytes
        self.file = open('items.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # One JSON object per line (the JSON Lines format)
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item
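
Whichever backend you choose, a pipeline only runs once it is enabled in settings.py. A minimal sketch, assuming the project module is named myproject (the number is the execution order, lower runs first); for quick dumps, Scrapy's built-in feed exports, e.g. scrapy crawl myspider -o items.jl, also work without any custom pipeline:

# settings.py (hypothetical project name 'myproject')
ITEM_PIPELINES = {
    'myproject.pipelines.JsonWriterPipeline': 300,
}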

Writing items to MongoDB

pipelines.py

import pymongo
from scrapy.utils.project import get_project_settings

# scrapy.conf was removed from modern Scrapy; get_project_settings()
# is the supported way to read settings outside a crawler
settings = get_project_settings()


class MongoPipeline(object):
    def __init__(self):
        # Read host, port and database name from the settings
        host = settings['MONGO_HOST']
        port = settings['MONGO_PORT']
        dbname = settings['MONGO_DBNAME']
        # Create the database connection
        client = pymongo.MongoClient(host=host, port=port)
        # Select the target database
        mdb = client[dbname]
        # Select the collection that stores the data
        self.post = mdb[settings['MONGO_DOCNAME']]

    def process_item(self, item, spider):
        data = dict(item)
        # insert() was removed in pymongo 4; use insert_one()
        self.post.insert_one(data)
        return item

settings.py

# MongoDB host
MONGO_HOST = '127.0.0.1'
# Port (27017 is the MongoDB default)
MONGO_PORT = 27017
# Database name
MONGO_DBNAME = 'Hongxiu'
# Collection that stores this data
MONGO_DOCNAME = 'novel'
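
Reading settings at import time works, but the pattern recommended by the Scrapy docs is to receive them through from_crawler and to open and close the connection together with the spider. A sketch along those lines, using the same MONGO_* settings (the class name and the default values are illustrative):

import pymongo


class MongoFromCrawlerPipeline(object):
    def __init__(self, host, port, dbname, docname):
        self.host = host
        self.port = port
        self.dbname = dbname
        self.docname = docname

    @classmethod
    def from_crawler(cls, crawler):
        # Defaults are assumptions; override them in settings.py
        return cls(
            host=crawler.settings.get('MONGO_HOST', '127.0.0.1'),
            port=crawler.settings.get('MONGO_PORT', 27017),
            dbname=crawler.settings.get('MONGO_DBNAME', 'Hongxiu'),
            docname=crawler.settings.get('MONGO_DOCNAME', 'novel'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        self.post = self.client[self.dbname][self.docname]

    def close_spider(self, spider):
        # Release the connection when the crawl finishes
        self.client.close()

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))
        return item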
 

 

Storing the scraped items in a MySQL database

import pymysql


class MysqlPipeline(object):
    def __init__(self, conn):
        self.conn = conn

    @classmethod
    def from_crawler(cls, crawler):
        username = crawler.settings.get('DB_USER', 'root')
        passwd = crawler.settings.get('DB_PWD', '123456')
        dbname = crawler.settings.get('DB_NAME', 'jobbole')
        return cls(
            conn=pymysql.connect(host='localhost', user=username,
                                 password=passwd, database=dbname,
                                 charset='utf8')
        )

    def process_item(self, item, spider):
        # In pymysql 1.0+ "with conn" no longer yields a cursor,
        # so request one explicitly
        with self.conn.cursor() as cursor:
            sql = ("insert into jobbole(`title`, `time`, `sort`, `content`, "
                   "`great_num`, `collect_num`, `comments_num`) "
                   "values (%s, %s, %s, %s, %s, %s, %s)")
            cursor.execute(sql, (item['title'], item['time'], item['sort'],
                                 item['content'], item['great_num'],
                                 item['collect_num'], item['comments_num']))
            if cursor.rowcount > 0:
                print('Saved: {}'.format(item['title']))
            else:
                print('Save failed')
        # Nothing is persisted until the transaction is committed
        self.conn.commit()
        return item
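
from_crawler reads the credentials from the project settings, falling back to the defaults shown above. A matching settings.py fragment (the values are placeholders; the code also assumes a jobbole table with the seven columns used in the INSERT already exists):

# settings.py -- credentials read by MysqlPipeline.from_crawler()
DB_USER = 'root'
DB_PWD = '123456'
DB_NAME = 'jobbole'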
 

Storing the scraped items in an Excel spreadsheet

pipelines.py

from openpyxl import Workbook


class Job51SpiderPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row
        self.ws.append(['Job title', 'Salary', 'Company'])

    def process_item(self, item, spider):
        line = [item['job_title'], item['salary'], item['firm']]
        self.ws.append(line)
        # Rewrites the whole .xlsx file on every item
        self.wb.save('./51job.xlsx')
        return item
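
Saving after every item keeps the file current even if the crawl dies, but it rewrites the entire workbook each time. For larger crawls it is usually enough to write once when the spider closes; a minimal variation (the class name is illustrative):

from openpyxl import Workbook


class Job51BatchPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['Job title', 'Salary', 'Company'])

    def process_item(self, item, spider):
        self.ws.append([item['job_title'], item['salary'], item['firm']])
        return item

    def close_spider(self, spider):
        # Flush everything to disk once, at the end of the crawl
        self.wb.save('./51job.xlsx')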

 

 

 
