Scrapy is an application framework written for crawling websites and extracting structured data.
Features Scrapy integrates out of the box include high-performance data parsing, persistent storage, and full-site crawling.
Environment setup
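A typical setup, assuming pip is available (the project name huya and spider name first are chosen to match the example that follows):

pip install scrapy
scrapy startproject huya
cd huya
scrapy genspider first www.huya.com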
Persistent storage via terminal command
import scrapy

class FirstSpider(scrapy.Spider):
    name = 'first'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.huya.com/g/1663']

    # terminal-command-based persistent storage: parse returns the parsed data
    def parse(self, response):
        li_list = response.xpath('//*[@id="js-live-list"]/li')
        all_data = []
        for li in li_list:
            title = li.xpath('./a[2]/text()').extract_first()
            author = li.xpath('./span/span[1]/i/text()').extract_first()
            hot = li.xpath('./span/span[2]/i[2]/text()').extract_first()
            dic = {
                'title': title,
                'author': author,
                'hot': hot,
            }
            all_data.append(dic)
        return all_data
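With terminal-command persistence, parse simply returns an iterable of dicts and Scrapy's feed export writes the data out. A typical invocation (the output filename is arbitrary; only formats such as json, csv, xml, etc. are supported):

scrapy crawl first -o huya.csv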
Pipeline-based persistent storage
import scrapy
from huya.items import HuyaItem

class FirstSpider(scrapy.Spider):
    name = 'first'
    start_urls = ['https://www.huya.com/g/1663']

    # pipeline-based persistent storage
    def parse(self, response):
        li_list = response.xpath('//*[@id="js-live-list"]/li')
        for li in li_list:
            title = li.xpath('./a[2]/text()').extract_first()
            author = li.xpath('./span/span[1]/i/text()').extract_first()
            # instantiate an item object
            item = HuyaItem()
            item['title'] = title
            item['author'] = author
            yield item  # submit the item to the pipeline
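The HuyaItem used above lives in the project's items.py; a minimal sketch declaring the fields the spiders assign (the hot field is only needed by the full-site example further down):

# items.py
import scrapy

class HuyaItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    hot = scrapy.Field()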
# pipelines.py
class HuyaPipeline:
    fp = None

    def open_spider(self, spider):
        # called once when the spider starts: open the output file
        self.fp = open('hy.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):  # item is the item object submitted by the spider
        self.fp.write(item['title'] + ':' + item['author'] + '\n')
        print(item['title'])
        return item  # hand the item on to the next pipeline

    def close_spider(self, spider):
        # called once when the spider closes: release the file handle
        self.fp.close()
import pymysql

# insert the items into MySQL
class MysqlPipeline:
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root',
                                    db='spider', charset='utf8')

    def process_item(self, item, spider):  # item is the item object submitted by the spider
        # parameterized query: let pymysql handle quoting/escaping
        sql = 'insert into huya values (%s, %s)'
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql, (item['title'], item['author']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
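The pipeline assumes a database named spider containing a table huya with two string columns; the notes never show the schema, so the following one-off setup script is only an assumption:

# one-off setup script (assumed schema)
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root', db='spider', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('create table if not exists huya (title varchar(255), author varchar(255))')
conn.commit()
conn.close()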
# settings.py
ITEM_PIPELINES = {
    'huya.pipelines.HuyaPipeline': 300,  # the smaller the value, the higher the priority
    'huya.pipelines.MysqlPipeline': 301,
}
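Items flow through the registered pipeline classes in priority order (lower value first), so HuyaPipeline receives each item before MysqlPipeline. That is why every process_item ends with return item: returning the item is what hands it to the next pipeline class.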
Full-site crawling: crawl the data from every page of the listing
import scrapy
from huya.items import HuyaItem  # the same item class as above

class HuyazbSpider(scrapy.Spider):
    name = 'huyaZB'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.huya.com/g/xingxiu']
    # generic URL template for the remaining pages
    url = 'https://www.huya.com/cache.php?m=LiveList&do=getLiveListByPage&gameId=1663&tagAll=0&page=%d'

    def parse(self, response):
        li_list = response.xpath('//*[@id="js-live-list"]/li')
        for li in li_list:
            title = li.xpath('./a[2]/text()').extract_first()
            author = li.xpath('./span/span[1]/i/text()').extract_first()
            hot = li.xpath('./span/span[2]/i[2]/text()').extract_first()
            # instantiate an item object
            item = HuyaItem()
            item['title'] = title
            item['author'] = author
            item['hot'] = hot
            yield item  # submit the item to the pipeline

        # send the requests for the remaining pages manually
        for page in range(2, 5):
            new_url = self.url % page
            yield scrapy.Request(url=new_url, callback=self.parse_other)

    # every parsing method must be defined the way parse is: it takes the same parameters (self, response)
    def parse_other(self, response):
        print(response.text)  # the other pages arrive here and can be parsed the same way
Request passing (meta)
Purpose: deep crawling.
Use case: the data to be scraped is not all stored on the same page.
Passing the item: yield scrapy.Request(url, callback=..., meta={'item': item})
Receiving the item: item = response.meta['item']
Improving the efficiency of Scrapy crawls
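The usual levers are all settings.py options; a sketch with illustrative values (the right numbers depend on the site being crawled):

# settings.py
CONCURRENT_REQUESTS = 100   # raise the number of concurrent requests
LOG_LEVEL = 'ERROR'         # log only errors to cut logging overhead
COOKIES_ENABLED = False     # skip cookie handling when it is not needed
RETRY_ENABLED = False       # do not retry failed requests
DOWNLOAD_TIMEOUT = 10       # give up quickly on slow responses

The spider below demonstrates the request passing described above: it reads movie names from the listing pages and follows each detail URL to fetch the description.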
import scrapy
from movie4567.items import Movie4567Item

class MovieSpider(scrapy.Spider):
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567kan.com/frim/index6.html']
    urls = 'https://www.4567kan.com/frim/index6-%d.html'  # URL template for the remaining pages
    page = 1

    def parse(self, response):
        print(f'crawling page {self.page} ...')
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            item = Movie4567Item()
            name = li.xpath('./div/a/@title').extract_first()
            item['name'] = name
            detail_url = 'https://www.4567kan.com' + li.xpath('./div/a/@href').extract_first()
            # send a manual request for the detail page
            # request passing: meta hands a value over to the callback
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
        if self.page < 5:
            self.page += 1
            new_url = self.urls % self.page
            yield scrapy.Request(new_url, callback=self.parse)

    def parse_detail(self, response):
        # receive the data passed along with the request (a dict)
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item
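As with HuyaItem, the Movie4567Item referenced above would be declared in the project's items.py; a minimal sketch with the two fields the spider fills in:

# items.py
import scrapy

class Movie4567Item(scrapy.Item):
    name = scrapy.Field()
    desc = scrapy.Field()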
Image crawling with ImagesPipeline

# pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class ImgproPipeline(ImagesPipeline):
    # issues the request for the media resource (downloads the data)
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['img_src'])

    # specifies the storage path (file name) of the downloaded data
    def file_path(self, request, response=None, info=None):
        return request.url.split('/')[-1]

    # passes the item on to the next pipeline class to be executed
    def item_completed(self, results, item, info):
        return item
# settings.py
# name (path) of the folder where downloaded images are stored
IMAGES_STORE = './imgLibs'
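The ImgproPipeline above expects items carrying an img_src field. The corresponding item and spider are not part of these notes, so the following is only a sketch; the project name, start URL, and XPath expressions are placeholders:

# items.py (sketch)
import scrapy

class ImgproItem(scrapy.Item):
    img_src = scrapy.Field()

# spider (sketch)
import scrapy
from imgPro.items import ImgproItem  # assumed project name

class ImgSpider(scrapy.Spider):
    name = 'img'
    start_urls = ['https://www.example.com/pics']  # placeholder

    def parse(self, response):
        for div in response.xpath('//div[@class="pic"]'):  # placeholder XPath
            item = ImgproItem()
            item['img_src'] = div.xpath('./img/@src').extract_first()
            yield item

Two further requirements: ImgproPipeline must be registered in ITEM_PIPELINES just like the pipelines above, and Scrapy's ImagesPipeline relies on the Pillow library being installed.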