The CrawlSpider class is a subclass of Spider and is mainly used for crawling an entire site. Create and run a CrawlSpider-based project with:
scrapy startproject testCrawlSpider
cd testCrawlSpider
scrapy genspider -t crawl test www.test.com
scrapy crawl test
Modify the configuration file settings.py:
BOT_NAME = 'testCrawlSpider'
SPIDER_MODULES = ['testCrawlSpider.spiders']
NEWSPIDER_MODULE = 'testCrawlSpider.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
ITEM_PIPELINES = {
    'testCrawlSpider.pipelines.TestcrawlspiderPipeline': 300,
}
test.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TestSpider(CrawlSpider):
    name = 'test'
    allowed_domains = ['www.test.com']
    start_urls = ['http://www.test.com/']

    # Link extractor
    link = LinkExtractor(allow=r'Items/')
    rules = (
        # Rule parser
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item
Link extractor
An object instantiated from the LinkExtractor class is called a link extractor. It extracts links (URLs) from a page according to a specified rule.
A link extractor only extracts links; the rule is given by the allow parameter as a regular expression: allow=r'regular expression'.
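A link extractor can also be tried out on its own, for example inside scrapy shell. The following is a minimal sketch; the URL and the Items/ pattern are just the placeholders from the template above:
# Run `scrapy shell "http://www.test.com/"` first; the shell provides `response`.
from scrapy.linkextractors import LinkExtractor

link = LinkExtractor(allow=r'Items/')       # keep only URLs matching the regex
for lnk in link.extract_links(response):    # returns scrapy.link.Link objects
    print(lnk.url)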
Rule parser
An object instantiated from the Rule class is called a rule parser. It receives the links obtained by the link extractor, sends requests to them, and parses the responses according to the specified rule; the parsing logic is the method named by the callback parameter.
The follow parameter specifies whether the extracted links should be followed.
With follow=True, the link extractor is applied again to the page behind every link it extracted: each extracted link becomes a new starting link, a request is sent for it, and more links are extracted from its response. Duplicate requests produced by repeated links are recognized and filtered out by Scrapy's scheduler. For example, when a page only displays some of the page-number links, follow=True eventually collects all of them.
Example: the latest items on the Sunshine Government Affairs (阳光问政) platform: http://wz.sun0769.com/political/index/politicsNewest
Key point: use a link extractor to obtain the page-number links.
Regular expression for the page-number links: id=1&page=\d+
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TestSpider(CrawlSpider):
    name = 'test'
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']

    rules = (
        # Note that ? must be escaped as \?
        Rule(LinkExtractor(allow=r'/political/index/politicsNewest\?id=1&page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)
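With an empty allow regular expression, the link extractor extracts every link on the page, as in the following example: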
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TestSpider(CrawlSpider):
    name = 'test'
    start_urls = ['https://www.hao123.com/index.html']

    rules = (
        Rule(LinkExtractor(allow=r''), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        print(response)
Example: the action-movie listing on the 4567 movie site: https://www.4567kan.com/index.php/vod/show/class/动作/id/1/page/1.html
Regular expression for the page-number links: /id/1/page/\d+\.html
test.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from testCrawlSpider.items import TestcrawlspiderItem


class TestSpider(CrawlSpider):
    name = 'test'
    # allowed_domains = ['www.test.com']
    start_urls = ['https://www.4567kan.com/index.php/vod/show/class/动作/id/1/page/1.html']

    rules = (
        # Extract the page-number links
        Rule(LinkExtractor(allow=r'/id/1/page/\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for each_li in li_list:
            title_str = each_li.xpath('./div/a/@title').extract_first()
            detail_url = 'https://www.4567kan.com/' + each_li.xpath('./div/a/@href').extract_first()
            item = TestcrawlspiderItem()
            item['title'] = title_str
            # Pass the item to the detail request through meta
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        description_str = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item = response.meta['item']
        item['description'] = description_str
        yield item
pipelines.py
class TestcrawlspiderPipeline:
    def process_item(self, item, spider):
        print(item)
        return item
items.py
import scrapy


class TestcrawlspiderItem(scrapy.Item):
    title = scrapy.Field()
    description = scrapy.Field()
Distributed crawling: build a cluster of machines that crawl jointly, then aggregate and persist the data collected by all of them.
Scrapy alone cannot implement a distributed crawler; it has to be combined with the scrapy-redis component.
The reason Scrapy cannot do it by itself: the scheduler and the pipeline cannot be shared by the machines in the cluster.
What scrapy-redis provides: a scheduler and a pipeline that can be shared, so every machine competes for the request objects handed out by the shared scheduler.
Data crawled by the distributed crawler can only be stored in a Redis database.
Modify the spider: import RedisCrawlSpider and inherit from it instead of CrawlSpider:
from scrapy_redis.spiders import RedisCrawlSpider
class TestSpider(CrawlSpider) => class TestSpider(RedisCrawlSpider)
Replace start_urls with redis_key. Its value is a string naming the shared scheduler queue, which lives in the Redis database:
# Name of the shared scheduler queue
redis_key = 'movie_queue'
settings.py
# Use the dedup filter provided by the scrapy-redis component
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler provided by the scrapy-redis component
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Use the pipeline provided by the scrapy-redis component
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}
# The custom pipeline is no longer used
# ITEM_PIPELINES = {
#     'testCrawlSpider.pipelines.TestcrawlspiderPipeline': 300,
# }
# Redis connection settings
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# Keep the queues in Redis between runs, enabling incremental crawling (recommended)
SCHEDULER_PERSIST = True
In the Redis configuration file (redis.conf), comment out the bind directive and turn off protected mode so that other machines in the cluster can connect:
# bind 127.0.0.1
protected-mode no
Push a start URL into the shared scheduler queue with redis-cli:
lpush <scheduler queue name> <start url>
lpush movie_queue https://www.4567kan.com/index.php/vod/show/class/动作/id/1/page/1.html
The crawled data is stored in the Redis list test:items:
>>> keys *
1) "test:items"
2) "test:dupefilter"
3) "test:requests"
>>> llen test:items
(integer) 300
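To persist the aggregated data somewhere other than Redis, the items can be drained afterwards with a small stand-alone script. This is a minimal sketch, assuming the redis-py package, a local Redis instance, and the default JSON serialization used by RedisPipeline; the output file name movies.json is arbitrary:
import json
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
with open('movies.json', 'w', encoding='utf-8') as f:
    while conn.llen('test:items') > 0:
        raw = conn.lpop('test:items')  # each element is a JSON-encoded item
        f.write(json.dumps(json.loads(raw), ensure_ascii=False) + '\n')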
test.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from testCrawlSpider.items import TestcrawlspiderItem


class TestSpider(RedisCrawlSpider):
    name = 'test'
    # allowed_domains = ['www.test.com']
    # start_urls = ['https://www.4567kan.com/index.php/vod/show/class/动作/id/1/page/1.html']

    # Name of the shared scheduler queue
    redis_key = 'movie_queue'

    rules = (
        # Extract the page-number links
        Rule(LinkExtractor(allow=r'/id/1/page/\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for each_li in li_list:
            title_str = each_li.xpath('./div/a/@title').extract_first()
            detail_url = 'https://www.4567kan.com/' + each_li.xpath('./div/a/@href').extract_first()
            item = TestcrawlspiderItem()
            item['title'] = title_str
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        description_str = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item = response.meta['item']
        item['description'] = description_str
        yield item
An incremental crawler detects updates to a website and crawls only the newly added data on each run.
Core idea: a record table.
Every record crawled is written into the record table; on the next run, each newly crawled record is compared against the table:
if it is not in the table, the data is new and should be crawled;
if it is already there, the data has been crawled before and can be skipped.
The record table needs two properties: automatic deduplication and persistent storage.
A set in the Redis database can serve as the record table, as the sketch below shows.
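A minimal sketch of the record-table behaviour, assuming the redis-py package and a local Redis instance; the URL is a placeholder:
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
# sadd returns 1 when the value was not yet in the set (new data) and 0 when it already exists
print(conn.sadd('movie_urls', 'https://www.example.com/movie/detail/1.html'))  # 1 -> new, crawl it
print(conn.sadd('movie_urls', 'https://www.example.com/movie/detail/1.html'))  # 0 -> already crawled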
Analysis:
On the movie site, each movie's detail-page URL uniquely identifies that movie, so it serves as the crawl record; the record table therefore stores the detail-page URLs of the movies that have already been crawled.
zls.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from zlsPro.items import ZlsproItem


class ZlsSpider(CrawlSpider):
    name = 'zls'
    conn = Redis(host='127.0.0.1', port=6379)
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567kan.com/index.php/vod/show/class/%E5%8A%A8%E4%BD%9C/id/1/page/1.html']

    rules = (
        Rule(LinkExtractor(allow=r'/id/1/page/\d+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # Parse the movie titles
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            title = li.xpath('./div/a/@title').extract_first()
            # Unique identifier of the movie
            detail_url = 'https://www.4567kan.com/' + li.xpath('./div/a/@href').extract_first()
            # Record table
            ex = self.conn.sadd('movie_urls', detail_url)
            # ex is 1 when the insert succeeded and 0 when the URL already existed (duplicate data)
            if ex == 1:
                item = ZlsproItem()
                item['title'] = title
                print('Crawling movie:', title)
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
            else:
                print('No new data.')

    def parse_detail(self, response):
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item = response.meta['item']
        item['desc'] = desc
        yield item
pipelines.py
import json


class ZlsproPipeline:
    def process_item(self, item, spider):
        # Redis can only store strings/bytes, so serialize the item before pushing it
        spider.conn.lpush('movie_data', json.dumps(dict(item), ensure_ascii=False))
        return item
settings.py
BOT_NAME = 'zlsPro'
SPIDER_MODULES = ['zlsPro.spiders']
NEWSPIDER_MODULE = 'zlsPro.spiders'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
ITEM_PIPELINES = {
    'zlsPro.pipelines.ZlsproPipeline': 300,
}
items.py
import scrapy


class ZlsproItem(scrapy.Item):
    title = scrapy.Field()
    desc = scrapy.Field()
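Finally, a multithreaded crawler built on the producer-consumer pattern: producer threads fetch the listing pages of the doutula emoticon site and parse out image URLs, consumer threads download the images, and the two sides communicate through queue.Queue.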
import threading
import requests
from lxml import etree
import os
from urllib import request
from queue import Queue


class Producer(threading.Thread):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        response = requests.get(url=url, headers=self.headers)
        text = response.text
        html = etree.HTML(text)
        img_list = html.xpath('//div[@class="page-content text-center"]/div/a/img')
        for img in img_list:
            img_url = img.xpath('./@data-original')[0]
            img_name = img.xpath('./@alt')[0] + '.jpg'
            self.img_queue.put((img_url, img_name))


class Consumer(threading.Thread):
    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            # Simplified exit condition: stop once both queues are empty
            # (a consumer may exit early if it checks while a producer is still parsing a page).
            if self.page_queue.empty() and self.img_queue.empty():
                break
            img_url, img_name = self.img_queue.get()
            request.urlretrieve(img_url, 'imgs/' + img_name)
            print(img_name + " downloaded.")


def main():
    os.makedirs('imgs', exist_ok=True)  # make sure the download directory exists
    page_queue = Queue(50)   # holds the listing-page URLs
    img_queue = Queue(100)   # holds the parsed image URLs
    # Crawl the first 10 pages
    for page_num in range(1, 11):
        url = "https://www.doutula.com/photo/list/?page=%d" % page_num
        page_queue.put(url)  # enqueue each listing-page URL
    # Three producers
    for page_num in range(3):
        t = Producer(page_queue, img_queue)
        t.start()
    # Three consumers
    for page_num in range(3):
        t = Consumer(page_queue, img_queue)
        t.start()


if __name__ == '__main__':
    main()