将item写入JSON文件
import json
class JsonWriterPipeline(object):
    """Scrapy item pipeline that writes every item to ``items.jl``.

    The output is in JSON Lines format: one JSON object per line.
    """

    def __init__(self):
        # json.dumps() returns str, so the file has to be opened in text
        # mode; writing str to a file opened with 'wb' raises TypeError on
        # Python 3. Like 'wb', mode 'w' truncates any existing file.
        self.file = open('items.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write *item* as a single JSON line and hand it on unchanged.

        Args:
            item: The scraped item; anything accepted by ``dict()``.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines can keep processing it.
        """
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Flush and close the output file once the spider has finished."""
        self.file.close()
将item写入MongoDB中
pipeline.py
from scrapy.conf import settings
import pymongo
class MongoPipeline(object):
    """Scrapy item pipeline that stores every item as a MongoDB document.

    Connection parameters are read from the project settings keys
    ``MONGODB_HOST``, ``MONGO_POST``, ``MONGO_DBNANME`` and
    ``MONGO_DOCNAME``. The misspelled key names are kept on purpose: they
    have to match the names defined in the settings module.
    """

    def __init__(self):
        # Read host, port and database name from the settings.
        host = settings['MONGODB_HOST']
        port = settings['MONGO_POST']
        dbname = settings['MONGO_DBNANME']
        # Open the connection; keep the client so it can be closed later.
        self.client = pymongo.MongoClient(host=host, port=port)
        # Select the configured database. The name used to be hard-coded
        # as 'Hongxiu', which silently ignored the MONGO_DBNANME setting.
        mdb = self.client[dbname]
        # Collection that receives the scraped documents.
        self.post = mdb[settings['MONGO_DOCNAME']]

    def process_item(self, item, spider):
        """Insert *item* into the collection and return it unchanged.

        Args:
            item: The scraped item; anything accepted by ``dict()``.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines can keep processing it.
        """
        data = dict(item)
        # Collection.insert() is deprecated since PyMongo 3 and was removed
        # in PyMongo 4; insert_one() is the supported single-document call.
        self.post.insert_one(data)
        return item

    def close_spider(self, spider):
        """Close the MongoDB client once the spider has finished."""
        self.client.close()
settings.py
# MongoDB connection settings looked up by MongoPipeline.
# NOTE: MONGO_POST and MONGO_DBNANME are misspelled ("PORT", "DBNAME"), but
# the pipeline reads the settings under exactly these names, so they must
# not be renamed here without changing the pipeline as well.
# Host of the MongoDB server
MONGODB_HOST = '127.0.0.1'
# Port number (default is 27017)
MONGO_POST = 27017
# Database name
MONGO_DBNANME = 'Hongxiu'
# Name of the collection ("table") that stores the scraped data
MONGO_DOCNAME = 'novel'
将scrapy爬取的数据(item)存放在MySQL数据库中
import pymysql
class MysqlPipeline(object):
    """Scrapy item pipeline that inserts every item into the ``jobbole`` table.

    Instances are normally built by Scrapy through :meth:`from_crawler`,
    which opens the MySQL connection from the crawler settings.
    """

    # Item fields written to the table, in the order of the SQL placeholders.
    _FIELDS = ('title', 'time', 'sort', 'content',
               'great_num', 'collect_num', 'comments_num')

    def __init__(self, conn):
        """Store the open database connection used for all inserts."""
        self.conn = conn

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings.

        Reads ``DB_HOST`` (default ``'localhost'``), ``DB_USER``, ``DB_PWD``
        and ``DB_NAME``. The built-in fallback credentials are only suitable
        for a local test database and should be overridden in settings.
        """
        host = crawler.settings.get('DB_HOST', 'localhost')
        username = crawler.settings.get('DB_USER', 'root')
        passwd = crawler.settings.get('DB_PWD', '123456')
        formname = crawler.settings.get('DB_NAME', 'jobbole')
        # Keyword arguments: newer PyMySQL releases no longer accept the
        # connection parameters positionally.
        conn = pymysql.connect(
            host=host,
            user=username,
            password=passwd,
            database=formname,
            charset='utf8',
        )
        return cls(conn=conn)

    def process_item(self, item, spider):
        """Insert *item* as one row, commit, and return the item.

        Args:
            item: Mapping with the keys listed in ``_FIELDS``.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines keep receiving it (the
            previous version returned nothing, i.e. ``None``).

        Raises:
            KeyError: If *item* lacks one of the required fields.
            Exception: Any database error, re-raised after a rollback.
        """
        sql = "insert into jobbole(`title`, `time`, `sort`, `content`, `great_num`, `collect_num`, `comments_num`) values (%s, %s, %s, %s, %s, %s, %s)"
        params = tuple(item[field] for field in self._FIELDS)
        try:
            # Use an explicit cursor: what `with connection` yields differs
            # between PyMySQL versions (cursor before 1.0, connection after).
            with self.conn.cursor() as cursor:
                cursor.execute(sql, params)
                saved = cursor.rowcount > 0
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise
        if saved:
            print('数据保存成功, {}'.format(item['title']))
        else:
            print('数据保存失败')
        return item

    def close_spider(self, spider):
        """Close the database connection once the spider has finished."""
        self.conn.close()
将scrapy爬取的数据(item)存放在Excel表格中
pipeline.py
from openpyxl import Workbook
class Job51SpiderPipeline(object):
    """Scrapy item pipeline that collects job items into ``./51job.xlsx``.

    Rows are accumulated in an in-memory workbook and written to disk once,
    when the spider closes.
    """

    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row: job title, salary, company name.
        self.ws.append(['职位名称', '薪资', '公司名称'])

    def process_item(self, item, spider):
        """Append one row for *item* to the worksheet and return the item.

        Args:
            item: Mapping with the keys ``job_title``, ``salary``, ``firm``.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines can keep processing it.
        """
        line = [item['job_title'], item['salary'], item['firm']]
        self.ws.append(line)
        # The workbook is deliberately not saved here: saving rewrites the
        # whole file, so doing it per item makes total I/O grow
        # quadratically with the number of items.
        return item

    def close_spider(self, spider):
        """Write the collected rows to ``./51job.xlsx`` when the spider ends."""
        self.wb.save('./51job.xlsx')