Many people who start with the Scrapy framework don't fully understand what pipelines are for. A simple way to think about it: spiders are responsible for parsing pages, middlewares handle requests and downloads, and everything concerning the data itself can be handed to pipelines.
My experience after years of use is that pipelines can be made highly generic: a well-written pipeline works for essentially every spider, all spiders can share a single pipelines.py, and a new project only needs a copy of a previously written pipelines file with a few lines changed to get very powerful functionality.
Below is the data-processing pipeline of an image-download spider. It mainly prepares items for downloading: it adds a timestamp, drops invalid items that have no src, adds a Referer, and builds the storage path dirpath (only a relative path is needed here; the base directory is specified in settings).
from datetime import datetime

from scrapy.exceptions import DropItem


class TimePipeline(object):
    def process_item(self, item, spider):
        if item['src'] is None:
            raise DropItem('Drop empty item!!')
        else:
            # convert datetime to a string so the item stays JSON-serializable
            item['crawled'] = str(datetime.utcnow())
            item['spider'] = spider.name
            if spider.name == 'meitulu':
                item['Referer'] = 'https://www.meitulu.com/img.html?img=' + item['src']
                item['dirpath'] = 'meitulu_scrapy/' + item['title'] + '/' + item['dirname'] + '/' + item['picname'] + '.jpg'
            elif spider.name == 'meituri':
                item['Referer'] = 'https://www.meituri.com/bigimg.html?img=' + item['src']
                item['dirpath'] = 'meituri_scrapy/' + item['title'] + '/' + item['dirname'] + '/' + item['picname'] + '.jpg'
            elif spider.name == 'meituri_sql':
                item['title'] = item['title'].replace(' ', '')
                item['picname'] = item['picname'].replace(' ', '')
            elif spider.name == 'jmrenti':
                item['picname'] = item['picname'].replace('/', '_')
                item['src'] = 'http://www.jmrenti.org' + item['src']
                item['Referer'] = 'http://www.jmrenti.org/'
                item['dirpath'] = 'jmrenti_scrapy/' + item['title'] + '/' + item['dirname'] + '/' + item['picname'] + '.jpg'
            elif spider.name == 'lituwu':
                item['picname'] = item['picname'].replace(' ', '')
                item['src'] = 'https://www.lituwu.com/' + item['src']
                item['Referer'] = 'https://www.lituwu.com/'
                item['dirpath'] = 'lituwu_scrapy/' + item['title'] + '/' + item['dirname'] + '/' + item['picname'] + '.jpg'
            elif spider.name == 'mntup':
                item['src'] = 'https://www.mntup.com' + item['src']
                item['Referer'] = 'https://www.mntup.com/'
                # item['dirpath'] = 'mntup_scrapy/' + item['title'] + '/' + item['dirname'] + '/' + item['picname'] + '.jpg'
        return item
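For reference, here is a minimal sketch of the item these pipelines assume. The field names are taken from the code above; the class name PicItem is my own, not from the original project.

import scrapy

class PicItem(scrapy.Item):
    src = scrapy.Field()      # image URL
    title = scrapy.Field()    # album title
    dirname = scrapy.Field()  # sub-directory name
    picname = scrapy.Field()  # file name without extension
    crawled = scrapy.Field()  # timestamp added by TimePipeline
    spider = scrapy.Field()   # spider name added by TimePipeline
    Referer = scrapy.Field()  # anti-hotlinking header added by TimePipeline
    dirpath = scrapy.Field()  # relative storage path built by TimePipeline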
The following pipeline appends items to a JSON text file:
import json


class TextPipeline(object):
    def process_item(self, item, spider):
        # one JSON object per record, separated by ',\n\n'
        content = json.dumps(dict(item), ensure_ascii=False) + ',\n\n'
        textpath = spider.name + '.json'
        with open(textpath, 'a', encoding='utf-8') as fp:
            fp.write(content)
        return item
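Because records are separated by a trailing comma and a blank line, the file as a whole is not a single valid JSON document. A minimal sketch for reading it back (the helper name load_items is my own):

import json

def load_items(path):
    # Read back records written by TextPipeline: strip the trailing
    # separator and wrap everything in [] to form a valid JSON array.
    with open(path, encoding='utf-8') as fp:
        raw = fp.read()
    return json.loads('[' + raw.rstrip().rstrip(',') + ']')

items = load_items('meitulu.json')  # spider name taken from the examples above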
MongoDB:
from pymongo import MongoClient


class MongoDBPipeline(object):
    '''MongoDB pipeline'''

    def open_spider(self, spider):
        '''connect to the database'''
        db_url = spider.settings.get('MONGODB_URL', 'mongodb://localhost:27017')
        db_name = spider.settings.get('MONGODB_DB_NAME', 'scrapy_default')
        self.db_client = MongoClient(db_url)
        self.db = self.db_client[db_name]
        # one collection per spider, named after the spider
        self.collection = self.db[spider.name]

    def process_item(self, item, spider):
        '''insert the item'''
        item = dict(item)  # convert the item to a plain dict
        self.collection.insert_one(item)  # insert into the spider's collection
        return item

    def close_spider(self, spider):
        '''close the connection'''
        self.db_client.close()
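To spot-check what was written, a quick query with pymongo; the database name follows the default above and 'meitulu' is one of the spider names from the earlier examples:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
db = client['scrapy_default']
for doc in db['meitulu'].find().limit(3):  # print a few stored items
    print(doc)
client.close()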
MySQL: below is a pipeline I wrote earlier to store epidemic data; it offers less flexibility than MongoDB.
import pymysql


class MysqlPipeline(object):
    '''MySQL pipeline'''

    def open_spider(self, spider):
        '''connect to the database'''
        mysql_db = spider.settings.get('MYSQL_DB')
        self.db = pymysql.connect(host=mysql_db['HOST'],
                                  user=mysql_db['USER'],
                                  password=mysql_db['PASSWORD'],
                                  database=mysql_db['NAME'])
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        '''insert the item'''
        item = dict(item)  # convert the item to a plain dict
        # parameterized query: let the driver handle quoting and escaping
        sql = '''INSERT INTO china2020(
            date, suspectedCount, confirmedCount, curedCount, deadCount,
            seriousCount, suspectedIncr, confirmedIncr, curedIncr,
            deadIncr, seriousIncr
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
        params = (
            item['datadate'],
            item['suspectedCount'],
            item['confirmedCount'],
            item['curedCount'],
            item['deadCount'],
            item['seriousCount'],
            item['suspectedIncr'],
            item['confirmedIncr'],
            item['curedIncr'],
            item['deadIncr'],
            item['seriousIncr'],
        )
        try:
            self.cursor.execute(sql, params)
            self.db.commit()
            print('--------------- MySQL write succeeded ---------------')
        except Exception as e:
            self.db.rollback()
            print('--------------- MySQL write failed:', e, '---------------')
        return item

    def close_spider(self, spider):
        '''close the connection'''
        self.cursor.close()
        self.db.close()
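The MYSQL_DB setting read in open_spider is a dict. A sketch of its expected shape; the keys match what the code reads, but the concrete values here are placeholders:

# settings.py -- values are assumptions, adjust to your environment
MYSQL_DB = {
    'HOST': 'localhost',
    'USER': 'root',
    'PASSWORD': 'your_password',
    'NAME': 'epidemic',  # database containing the china2020 table
}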
The code below is adapted from the official documentation; it stores images into directories according to the path, which is very handy.
Note that the prepared storage path must be passed along via the request's meta and picked up in file_path.
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class DownloadPipeline(ImagesPipeline):
    '''download the images'''

    def get_media_requests(self, item, info):
        # request the image, passing the relative storage path
        # (the dirpath built in TimePipeline) along in meta
        yield scrapy.Request(item['src'], meta={'name': item['dirpath']})

    def item_completed(self, results, item, info):
        if not results[0][0]:
            raise DropItem('download failed')
        return item

    def file_path(self, request, response=None, info=None):
        # receive the image path passed via meta above
        picname = request.meta['name']
        return picname
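Finally, the pipelines have to be enabled and ordered in settings.py; the relative path returned by file_path is joined onto IMAGES_STORE to give the final location on disk. A sketch, where the project module name 'pictures' and the priority numbers are my own choices, not from the original project:

# settings.py -- module name and priorities are assumptions
ITEM_PIPELINES = {
    'pictures.pipelines.TimePipeline': 300,      # clean and enrich items first
    'pictures.pipelines.TextPipeline': 400,      # then append the JSON record
    'pictures.pipelines.MongoDBPipeline': 500,   # and/or store in MongoDB
    'pictures.pipelines.DownloadPipeline': 600,  # download images last
}
IMAGES_STORE = '/data/images'  # base directory; dirpath is relative to this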