scrapy测试爬取虎牙部分数据(两种存储方式)

第1步:

scrapy startproject huyaPro

第2步:
 

cd huyaPro

scrapy genspider huya www.xxx.com

第3步:到settings.py中进行相关的设置

第4步:进行数据解析

       4.1: 基于终端指令进行的持久化存储(parse方法返回解析结果后,在终端执行 scrapy crawl huya -o huya.csv 即可导出)

    def parse(self, response):
        """Parse the live-room list and return it as a list of dicts.

        Each ``<li>`` under the element with id ``js-live-list`` becomes one
        dict with the keys ``title``, ``man`` and ``hot``. The whole list is
        returned (not yielded) so it can be persisted by the terminal-command
        export described in this step.

        :param response: the downloaded page; must offer ``xpath()``.
        :return: list of ``{'title': ..., 'man': ..., 'hot': ...}`` dicts.
        """
        rooms = response.xpath('//*[@id="js-live-list"]/li')
        return [
            {
                'title': room.xpath('./a[2]/text()').extract_first(),
                'man': room.xpath('./span/span[1]/i/text()').extract_first(),
                'hot': room.xpath('./span/span[2]/i[2]/text()').extract_first(),
            }
            for room in rooms
        ]

     4.2:基于管道的持久化存储

    def parse(self, response):
        """Parse the live-room list and hand each room to the pipelines.

        For every ``<li>`` under the element with id ``js-live-list`` a
        ``HuyaproItem`` is filled with ``title``, ``author`` and ``hot`` and
        then yielded, which submits it to the item pipelines.

        :param response: the downloaded page; must offer ``xpath()``.
        :return: generator of ``HuyaproItem`` objects.
        """
        for room in response.xpath('//*[@id="js-live-list"]/li'):
            # Instantiate an item object and copy the parsed values into it.
            room_item = HuyaproItem()
            room_item['title'] = room.xpath('./a[2]/text()').extract_first()
            room_item['author'] = room.xpath('./span/span[1]/i/text()').extract_first()
            room_item['hot'] = room.xpath('./span/span[2]/i[2]/text()').extract_first()

            # Submit the item to the pipelines.
            yield room_item

第5步:到items.py中定义item的字段

class HuyaproItem(scrapy.Item):
    """Item describing one live room; filled by the spider's parse method."""

    # Room title text.
    title = scrapy.Field()
    # Streamer name text.
    author = scrapy.Field()
    # Popularity ("hot") value text.
    hot = scrapy.Field()

第6步:到pipelines.py中设置一下各个管道

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json

import pymysql
from redis import Redis

class HuyaproPipeline(object):
    """Pipeline that stores every item as one line of ``huyazhibo.txt``.

    Each line has the form ``title:author:hot``.
    """

    # Output file handle; set in open_spider and released in close_spider.
    fp = None

    def open_spider(self, spider):
        """Open (and truncate) the output file.

        :param spider: the spider being opened (unused).
        """
        print('i am open_spider()')
        self.fp = open('huyazhibo.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write one item to the output file and pass it on.

        :param item: the item object submitted by the spider; must provide
            the keys ``title``, ``author`` and ``hot``.
        :param spider: the spider that produced the item (unused).
        :return: the same ``item``, so later pipelines receive it too.
        :raises KeyError: if one of the three keys is missing from ``item``.
        """
        # A field is None when the spider's XPath matched nothing; plain
        # string concatenation would raise TypeError, so write it as empty.
        values = (item['title'], item['author'], item['hot'])
        line = ':'.join('' if value is None else str(value) for value in values)
        self.fp.write(line + '\n')
        print(item['title'], '写入成功!!!')
        return item

    def close_spider(self, spider):
        """Close the output file if it was opened.

        :param spider: the spider being closed (unused).
        """
        # fp stays None when open_spider never ran or failed.
        if self.fp is not None:
            self.fp.close()
        print('i am close_spider()')


class mysqlPipeLine(object):
    """Pipeline that inserts every item as a row of the MySQL table ``huya``."""

    # Database connection; created in open_spider.
    conn = None
    # Cursor shared by all inserts; created on the first processed item.
    cursor = None

    def open_spider(self, spider):
        """Connect to the local MySQL server.

        :param spider: the spider being opened (unused).
        """
        # The charset must be spelled 'utf8'; 'utf-8' raises an error.
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='123', db='Spider', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        """Insert one item into the ``huya`` table and pass it on.

        A failed insert is printed and rolled back; the item is still
        returned so the remaining pipelines keep running.

        :param item: the item object submitted by the spider; must provide
            the keys ``title``, ``author`` and ``hot``.
        :param spider: the spider that produced the item (unused).
        :return: the same ``item``.
        """
        # Scraped text is untrusted: let the driver bind the values instead
        # of interpolating them into the SQL string (quotes in a title would
        # otherwise break or alter the statement).
        sql = 'insert into huya values(%s,%s,%s)'
        params = (item['title'], item['author'], item['hot'])
        # Reuse a single cursor rather than leaking a new one per item.
        if self.cursor is None:
            self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection if they were created.

        :param spider: the spider being closed (unused).
        """
        # Either may still be None, e.g. when no item was ever processed.
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()

class RedisPipeLine(object):
    """Pipeline that pushes every item onto the Redis list ``huyaList``."""

    # Redis client; created in open_spider.
    conn = None

    def open_spider(self, spider):
        """Create the client for the local Redis server.

        :param spider: the spider being opened (unused).
        """
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        """Push one item onto ``huyaList`` as a JSON string and pass it on.

        :param item: the item object submitted by the spider; must be
            convertible with ``dict(item)``.
        :param spider: the spider that produced the item (unused).
        :return: the same ``item``.
        """
        # redis-py 3.0+ only accepts bytes/str/int/float values, so the item
        # object itself cannot be pushed; store a JSON document instead.
        # ensure_ascii=False keeps the Chinese text readable in Redis.
        payload = json.dumps(dict(item), ensure_ascii=False)
        self.conn.lpush('huyaList', payload)
        return item

第7步:在settings.py里面设置各个管道以及它们的优先级(数值越小,优先级越高,越先执行)

# Enabled item pipelines mapped to their order value; per the Scrapy
# documentation, items pass through lower-valued pipelines first.
ITEM_PIPELINES = {
    'huyaPro.pipelines.HuyaproPipeline': 300,
    'huyaPro.pipelines.mysqlPipeLine': 301,
    'huyaPro.pipelines.RedisPipeLine': 302,
}

 

你可能感兴趣的:(爬虫,scrapy)