Step 1: create the project
scrapy startproject huyaPro
Step 2: generate the spider
cd huyaPro
scrapy genspider huya www.xxx.com
Step 3: configure the relevant settings in settings.py (see the sketch below)
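A minimal sketch of the settings usually changed for this kind of exercise; the exact user-agent string is an illustrative assumption:

# settings.py -- the usual tutorial tweaks
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'  # spoof a browser UA
ROBOTSTXT_OBEY = False   # do not honor robots.txt for this exercise
LOG_LEVEL = 'ERROR'      # show only errors so the pipeline prints stay readable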
Step 4: parse the data
4.1: persistent storage via a terminal command (the export command is shown after the code)
def parse(self, response):
    # each li under the live list is one streamer card
    li_list = response.xpath('//*[@id="js-live-list"]/li')
    all_data = []
    for li in li_list:
        title = li.xpath('./a[2]/text()').extract_first()
        man = li.xpath('./span/span[1]/i/text()').extract_first()
        hot = li.xpath('./span/span[2]/i[2]/text()').extract_first()
        dic = {
            'title': title,
            'man': man,
            'hot': hot,
        }
        all_data.append(dic)
    # returning a list of dicts is what lets the terminal export command work
    return all_data
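To actually persist what parse returns, run the crawl with an output file; Scrapy infers the format from the file extension (csv, json, jsonl, xml):

scrapy crawl huya -o huya.csv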
4.2: pipeline-based persistent storage
from huyaPro.items import HuyaproItem  # the item class defined in step 5

def parse(self, response):
    li_list = response.xpath('//*[@id="js-live-list"]/li')
    for li in li_list:
        title = li.xpath('./a[2]/text()').extract_first()
        author = li.xpath('./span/span[1]/i/text()').extract_first()
        hot = li.xpath('./span/span[2]/i[2]/text()').extract_first()
        # instantiate an item object and fill in its fields
        item = HuyaproItem()
        item['title'] = title
        item['author'] = author
        item['hot'] = hot
        yield item  # submit the item to the pipelines
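For orientation, a minimal sketch of the spider file that parse lives in, following the genspider template; the start URL is a placeholder to be replaced with the real Huya category page:

# huyaPro/spiders/huya.py
import scrapy
from huyaPro.items import HuyaproItem  # needed once the 4.2 parse body is pasted in

class HuyaSpider(scrapy.Spider):
    name = 'huya'                                # the name used by `scrapy crawl huya`
    start_urls = ['https://www.huya.com/g/xxx']  # placeholder: the page to scrape

    def parse(self, response):
        # body as shown in 4.2 above
        ...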
Step 5: define the item fields in items.py
import scrapy

class HuyaproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    author = scrapy.Field()
    hot = scrapy.Field()
Step 6: set up each pipeline in pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
import pymysql
from redis import Redis


class HuyaproPipeline(object):
    fp = None

    def open_spider(self, spider):
        # called once when the spider starts: open the file handle
        print('i am open_spider()')
        self.fp = open('huyazhibo.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # item is the item object submitted by the spider class
        self.fp.write(item['title'] + ':' + item['author'] + ':' + item['hot'] + '\n')
        print(item['title'], 'written successfully!!!')
        return item  # pass the item on to the next pipeline in priority order

    def close_spider(self, spider):
        # called once when the spider closes: release the file handle
        self.fp.close()
        print('i am close_spider()')


class mysqlPipeLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        # note: charset must be 'utf8'; 'utf-8' raises an error in pymysql
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='123', db='Spider', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # parameterized query: let the driver handle quoting and escaping
            self.cursor.execute('insert into huya values (%s, %s, %s)',
                                (item['title'], item['author'], item['hot']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()


class RedisPipeLine(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        # redis-py 3.x cannot push a dict/Item directly, so serialize to JSON first
        self.conn.lpush('huyaList', json.dumps(dict(item)))
        return item
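The MySQL pipeline assumes the Spider database and a three-column huya table already exist; a sketch of the DDL, with the column types being assumptions:

-- run once in the mysql client before crawling (varchar sizes are assumptions)
create database if not exists Spider;
use Spider;
create table if not exists huya(
    title  varchar(100),
    author varchar(100),
    hot    varchar(20)
);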
Step 7: register the pipelines and their priorities in settings.py (lower numbers run first)
ITEM_PIPELINES = {
    'huyaPro.pipelines.HuyaproPipeline': 300,
    'huyaPro.pipelines.mysqlPipeLine': 301,
    'huyaPro.pipelines.RedisPipeLine': 302,
}
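Finally, run the crawl from inside the project directory:

scrapy crawl huya

If everything is wired up, huyazhibo.txt fills with one line per streamer, the rows land in the huya table, and the Redis copy can be checked in redis-cli with: lrange huyaList 0 -1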