Image download pipeline (ImagesPipeline)
Example: pic.netbian.com (彼岸图网)
```python
import scrapy

from imgPileLinePro.items import ImgpilelineproItem


class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://pic.netbian.com/4kmeinv/']
    url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
    page = 2

    def parse(self, response):
        li_list = response.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src').extract_first()
            item = ImgpilelineproItem()
            item['img_src'] = img_src
            yield item

        if self.page <= 2:  # only crawl up to page 2 in this example
            new_url = format(self.url % self.page)
            self.page += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
```
```python
import scrapy


class ImgpilelineproItem(scrapy.Item):
    # define the fields for your item here like:
    img_src = scrapy.Field()
    # pass
```
```python
BOT_NAME = 'imgPileLinePro'

SPIDER_MODULES = ['imgPileLinePro.spiders']
NEWSPIDER_MODULE = 'imgPileLinePro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'imgPileLinePro (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

# directory where the ImagesPipeline stores the downloaded images
IMAGES_STORE = './imgsLib'

ITEM_PIPELINES = {
    'imgPileLinePro.pipelines.ImgPileLine': 300,
}
```
```python
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ImgpilelineproPipeline(object):
    def process_item(self, item, spider):
        return item


class ImgPileLine(ImagesPipeline):
    # receive the item and issue a request for the img_src stored in it
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['img_src'])

    # specify the storage path (folder is set by IMAGES_STORE in settings; the file name is returned by this method)
    def file_path(self, request, response=None, info=None):
        img_name = request.url.split('/')[-1]
        return img_name

    # pass the item on to the next pipeline class to be executed
    def item_completed(self, result, item, info):
        return item
```
Middleware
Downloader middleware
Purpose: intercept, in one place, all requests and responses issued by the project.
Why intercept requests?
- UA spoofing:
  - process_request: request.headers['User-Agent'] = xxx
- Setting a proxy IP:
  - process_exception: request.meta['proxy'] = 'http://ip:port'
Why intercept responses?
- To tamper with the response data / swap out the response object
- Note: middleware must be enabled manually in the settings file (see the settings sketch below)
Unused class methods can be deleted.
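As a minimal sketch of enabling the downloader middleware (assuming the imgPileLinePro project from the example above and the middleware class name generated by Scrapy's template), the settings.py entry might look like this:

```python
# settings.py -- a minimal sketch; project and class names follow the
# imgPileLinePro example above and Scrapy's default template
DOWNLOADER_MIDDLEWARES = {
    # lower numbers run closer to the engine, higher numbers closer to the downloader
    'imgPileLinePro.middlewares.ImgpilelineproDownloaderMiddleware': 543,
}
```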
```python
import random

from scrapy import signals

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]


class ImgpilelineproDownloaderMiddleware(object):
    # intercept normal requests
    def process_request(self, request, spider):
        # UA spoofing
        request.headers['User-Agent'] = random.choice(user_agent_list)
        return None

    # intercept responses
    def process_response(self, request, response, spider):
        return response

    # intercept request objects that raised an exception
    def process_exception(self, request, exception, spider):
        # fix the failed request (e.g. attach a proxy), then let it be re-sent
        request.meta['proxy'] = 'https://ip:port'
        return request  # re-send the request
```
Using Selenium in Scrapy
To use Selenium in Scrapy:
- Instantiate a browser object in the spider's constructor.
- Override a closed(self, spider) method in the spider to quit the browser object.
- In the downloader middleware's process_response, grab the browser object and perform the browser-automation work there.
Case study: crawling NetEase News
Requirement: scrape the news titles and bodies under five sections of NetEase News (news.163.com) -- domestic, international, military, aviation and drones.
Use Baidu AI's natural-language-processing service to derive tags and a category for each article, and persist everything to a four-column MySQL table (title, content, keys, type); a possible table definition is sketched after the analysis below.
Analysis:
1. The news titles under each section are loaded dynamically.
2. The data on a news detail page is not loaded dynamically.
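A possible definition of that table, created once before the crawl. Only the four column names come from the requirement; the column types and lengths are assumptions, and `keys` is backtick-quoted because it is a MySQL reserved word:

```python
# one-off sketch of the `wangyi` table the pipeline below writes into;
# column types/lengths are assumed, connection parameters mirror the pipeline
import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', db='spider', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS wangyi ('
        'title VARCHAR(255), content TEXT, `keys` VARCHAR(255), type VARCHAR(255)'
        ') CHARSET=utf8'
    )
conn.commit()
conn.close()
```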
```python
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver

from ..items import WangyiproItem


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']
    model_urls = []  # holds the urls of the target sections

    def __init__(self):
        # one shared browser instance for the whole spider
        self.bro = webdriver.Chrome(executable_path=r'D:\chromedriver.exe')

    def parse(self, response):
        # parse the links of the target sections (not dynamically loaded)
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        index_list = [3, 4]  # indexes of the section <li> elements to crawl
        for index in index_list:
            li = li_list[index]
            # url of each section
            model_url = li.xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
            # send a request for each section url
            yield scrapy.Request(url=model_url, callback=self.parse_title)

    # parse the news titles of each section (dynamically loaded)
    def parse_title(self, response):
        # without the middleware, this response would not contain the dynamically loaded data
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        for div in div_list:
            new_title = div.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
            item = WangyiproItem()
            item['title'] = new_title
            detail_url = div.xpath('.//div[@class="news_title"]/h3/a/@href').extract_first()
            print(detail_url)
            # request the news detail page
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

    # parse the news body (not dynamically loaded)
    def parse_detail(self, response):
        item = response.meta['item']
        content = response.xpath('//*[@id="endText"]//text()').extract()
        content = ''.join(content)
        item['content'] = content
        yield item

    # called when the whole crawl finishes
    def closed(self, spider):
        print('Spider finished!!!')
        self.bro.quit()
```
```python
import scrapy


class WangyiproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
```
```python
import pymysql
from aip import AipNlp


class WangyiproPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', db='spider', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()

        # call the Baidu AI NLP client
        """ your APPID / AK / SK """
        APP_ID = '16821895'
        API_KEY = 'kLRYwUHKHwgGeowOdfeU9MmZ'
        SECRET_KEY = 'mtLySt8KRkF0hrFLKnlEQ8L8WNK4CTwu'
        client = AipNlp(APP_ID, API_KEY, SECRET_KEY)

        title = item['title']
        content = item['content']
        content = content.replace("\n", "")
        content = content.replace(u'\xa0', u'')

        # article tagging
        keys = None
        result = client.keyword(title, content).get('items')
        for key in result:
            if key.get('score') > 0.60:
                keys = key.get("tag")

        # article classification
        type = None
        types = client.topic(title, content).get('item').get('lv2_tag_list')
        for t in types:
            if t.get('score') > 0.50:
                type = t.get('tag')

        print('title:', title, 'tags:', keys, 'category:', type)

        sql = 'insert into wangyi values ("%s","%s","%s","%s")' % (title, content, keys, type)
        # transaction handling
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```
```python
from time import sleep

from scrapy.http import HtmlResponse


class WangyiproDownloaderMiddleware(object):
    def process_request(self, request, spider):
        return None

    # intercept all responses
    def process_response(self, request, response, spider):
        # replace the response objects of the target sections with ones that contain the dynamic data
        model_urls = spider.model_urls  # the section urls collected by the spider
        bro = spider.bro
        if request.url in model_urls:
            # `response` is the incomplete response object for this section
            bro.get(request.url)
            sleep(2)
            bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(1)
            bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(1)
            page_text = bro.page_source  # page source including the dynamically loaded data
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf8', request=request)
            return new_response
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
```
```python
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3861.0 Safari/537.36 Edg/77.0.230.2'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}
```
CrawlSpider: full-site crawling
CrawlSpider is a subclass of Spider: besides inheriting Spider's features, it adds powerful features of its own, the most notable being the LinkExtractor. Spider is the base class of all crawlers and is designed to crawl the pages listed in start_urls; when you need to keep following the URLs extracted from the crawled pages, CrawlSpider is the better fit.
Create a scrapy project:
scrapy startproject projectName
Create a CrawlSpider-based spider file:
scrapy genspider -t crawl spiderName www.xxx.com
Run the spider:
scrapy crawl spiderName
Example: Dongguan Sunshine Hotline (wz.sun0769.com)
```python
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


# a simple full-site crawl
class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # link extractor -- extracts links according to the given rule (allow: a regex)
    link = LinkExtractor(allow=r'type=4&page=\d+')

    rules = (
        # rule parser -- parses the page behind each extracted link with the given callback
        # follow=True: keep applying the link extractor to the pages behind the extracted links,
        # so every page is reached instead of just the current one
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)  # every page address
```
Depth crawling (following into detail pages):
```python
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import SunProDetail, SunproItem


# depth crawl: list pages plus their detail pages
class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # link extractor -- extracts the pagination links according to the given rule (allow: a regex)
    link = LinkExtractor(allow=r'type=4&page=\d+')
    # a second link extractor, for the detail pages
    link_detail = LinkExtractor(allow=r'question/\d+/\d+\.shtml')

    rules = (
        # rule parser -- parses the page behind each extracted link with the given callback
        # follow=True makes sure every page is reached, not just the current one
        Rule(link, callback='parse_item', follow=True),
        # follow is not needed here; it defaults to False, which is what we want
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            num = tr.xpath('./td[1]/text()').extract_first()
            item = SunproItem()
            item['title'] = title
            item['num'] = num
            yield item

    def parse_detail(self, response):
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]/td/div[2]/text()').extract_first()
        num = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
        num = num.split(':')[-1]
        item = SunProDetail()
        item['content'] = content
        item['num'] = num
        yield item
```
```python
import scrapy


class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    num = scrapy.Field()


class SunProDetail(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()
```
```python
class SunproPipeline(object):
    def process_item(self, item, spider):
        if item.__class__.__name__ == "SunProDetail":
            content = item['content']
            num = item['num']
            # insert into the database here (three columns in total: num, title, content);
            # insert num and content first
        else:
            title = item['title']
            num = item['num']
            # then update the row here: ... where num = num
        return item
```
This example does not involve middleware, and the database writes in the pipeline are only indicated by comments.
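If you did want to persist the two item types, a minimal sketch of the writes described in the comments above might look like this (the table name `sun`, its columns and the MySQL connection parameters are assumptions; it also assumes the detail item for a given num is stored before the matching list item, as the comments suggest):

```python
import pymysql


class SunproPipeline(object):
    def open_spider(self, spider):
        # connection parameters are assumptions -- adjust to your environment
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', db='spider', charset='utf8')

    def process_item(self, item, spider):
        cursor = self.conn.cursor()
        try:
            if item.__class__.__name__ == "SunProDetail":
                # detail item: insert num + content first
                cursor.execute('insert into sun (num, content) values (%s, %s)',
                               (item['num'], item['content']))
            else:
                # list item: fill in the title for the row with the same num
                cursor.execute('update sun set title = %s where num = %s',
                               (item['title'], item['num']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.conn.close()
```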
Distributed crawling
Distributed:
Concept: build a cluster of machines and have the cluster crawl the same set of data cooperatively.
Purpose: speed up data crawling.
How is it implemented?
- scrapy + redis
- that is, scrapy combined with the scrapy-redis component
Native scrapy cannot be distributed:
- the scheduler cannot be shared across the cluster
- the pipeline cannot be shared across the cluster
What does scrapy-redis do?
- it gives native scrapy a scheduler and a pipeline that can be shared
Why is the component called scrapy-redis?
- the data crawled by the cluster must be stored in redis
Coding workflow:
- pip install scrapy-redis
- Create the spider file (CrawlSpider/Spider)
- Modify the spider file:
  - import the class provided by scrapy-redis:
    - from scrapy_redis.spiders import RedisCrawlSpider
  - make the spider class inherit from RedisCrawlSpider
  - delete allowed_domains and start_urls
  - add a new attribute: redis_key = 'xxx'  # the name of the shared scheduler queue
- Configure the settings file:
  - specify the pipeline:
    ITEM_PIPELINES = {
        'scrapy_redis.pipelines.RedisPipeline': 400
    }
  - specify the scheduler:
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    SCHEDULER_PERSIST = True
  - specify the database:
    REDIS_HOST = 'ip address of the redis server'
    REDIS_PORT = 6379
- Edit the redis config file redis.windows.conf:
  - line 56: comment out the bind directive (#bind 127.0.0.1)
  - turn off protected mode: protected-mode no
- Start the redis service:
  - redis-server ./redis.windows.conf
  - redis-cli
- Run the program:
  - cd into the directory of the spider file, then: scrapy runspider xxx.py
- Push a start url into the scheduler queue:
  - the queue name is the value of redis_key
  - in redis-cli: lpush <queue name> www.xxx.com (a redis-py equivalent is sketched after the settings below)
Case study:
Spider file:
```python
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider

from fbsPro.items import FbsproItem


class FbsSpider(RedisCrawlSpider):
    name = 'fbs'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['http://www.xxx.com/']
    redis_key = 'sunQueue'  # the name of the shared scheduler queue

    rules = (
        Rule(LinkExtractor(allow=r'type=4&page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            item = FbsproItem()
            item['title'] = title
            yield item
```
items
```python
import scrapy


class FbsproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    # pass
```
Pipeline:
```python
class FbsproPipeline(object):
    def process_item(self, item, spider):
        return item
```
settings
```python
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'

ROBOTSTXT_OBEY = False

CONCURRENT_REQUESTS = 2

# use the pipeline provided by scrapy-redis so that items go into the shared redis store
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}

# dedup container class: uses a redis set to store request fingerprints, making deduplication persistent and shared
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use scrapy-redis's own scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# whether the scheduler persists: if True, the request queue and the fingerprint set in redis
# are NOT cleared when the crawl ends; if False, they are cleared
SCHEDULER_PERSIST = True

REDIS_HOST = 'xxx.xxx.xx.xx'
REDIS_PORT = 6379
```
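After starting the workers with `scrapy runspider fbs.py`, seed the shared queue with a start URL. A minimal sketch using redis-py (the queue name `sunQueue` comes from the spider above, and the start URL is the sun0769 list page used earlier in this section; the host is an assumption):

```python
# seed the shared scheduler queue -- equivalent to running
# `lpush sunQueue <start url>` in redis-cli
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)  # point this at your redis server
conn.lpush('sunQueue', 'http://wz.sun0769.com/index.php/question/questionType?type=4&page=')
```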
Incremental crawling
- Concept: monitor a site for updates and crawl only the newly added data.
- How to implement it:
  - deduplication!
Example:
- A movie site where the data of interest is not all on one page:
  - keep a record of every movie detail-page url
  - each time the program runs, check the detail-page urls about to be crawled against that record
  - the record of detail-page urls can be stored in a Python set or a redis set
  - all the crawled movie data can be stored in redis
Approach:
- Monitor the data on the page behind each url.
- Data fingerprint: a unique identifier derived from a piece of data.
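When it is the data itself (rather than the url) that must be monitored, a data fingerprint can simply be an MD5 hash of the record, stored in a redis set. A minimal sketch (the key name `movie_fingerprints` and the record layout are assumptions):

```python
# build a fingerprint for a record and use a redis set to decide
# whether that record has been seen before
import hashlib

from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)


def is_new(record: dict) -> bool:
    # the fingerprint is an md5 digest of the record's contents
    fingerprint = hashlib.md5(str(sorted(record.items())).encode('utf-8')).hexdigest()
    # sadd returns 1 if the value was not in the set yet, 0 if it already was
    return conn.sadd('movie_fingerprints', fingerprint) == 1


# usage: only store records whose fingerprint has not been seen
print(is_new({'name': 'some movie', 'desc': 'some description'}))
```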
Sample code:
```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis

from zls_movie_Pro.items import ZlsMovieProItem


class MovieSpider(CrawlSpider):
    conn = Redis(host='127.0.0.1', port=6379)
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/index.php/vod/show/id/5.html']

    rules = (
        Rule(LinkExtractor(allow=r'/index\.php/vod/show/id/5/page/\d+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # movie name and detail-page url
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            name = li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/text()').extract_first()
            item = ZlsMovieProItem()
            item['name'] = name
            detail_url = 'http://www.4567kan.com' + li.xpath(
                './/div[@class="stui-vodlist__detail"]/h4/a/@href').extract_first()
            # ex == 1: the url was added (new data); ex == 0: the url was already in the set (duplicate)
            ex = self.conn.sadd('movie_detail_urls', detail_url)
            if ex == 1:
                print('New data available, crawling......')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
            else:
                print('No new data.')

    def parse_detail(self, response):
        movie_desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item = response.meta['item']
        item['desc'] = movie_desc
        yield item
```
```python
import scrapy


class ZlsMovieProItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    desc = scrapy.Field()
```
```python
class ZlsMovieProPipeline(object):
    def process_item(self, item, spider):
        conn = spider.conn  # reuse the redis connection created on the spider
        # an Item object cannot be pushed to redis directly; serialize it first
        conn.lpush('movie_data', str(dict(item)))
        return item
```
In settings.py, set the usual fixed options and enable the pipeline.
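For completeness, a sketch of the relevant part of settings.py (the project name zls_movie_Pro follows the import in the spider above; the rest mirrors the earlier settings files):

```python
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

# enable the pipeline that pushes crawled movie data into redis
ITEM_PIPELINES = {
    'zls_movie_Pro.pipelines.ZlsMovieProPipeline': 300,
}
```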