scrapy startproject [project name]
# enter the project directory
# create a spider
scrapy genspider [spider name] [target url]
scrapy crawl [spider name]
scrapy crawl [spider name] -o quotes.json # save the crawl results to a file
# supported output formats: json, csv, xml, pickle, marshal
# Create main.py in the project root directory:
# (convenient for debugging in an IDE)
from scrapy.cmdline import execute
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(['scrapy','crawl','quotes'])
An Item defines the structure of the scraped data; it is used much like a dictionary.
import scrapy

class QuoteItem(scrapy.Item):
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()

class ImageItem(scrapy.Item):
    ''' Item definition used by the ImagePipeline '''
    collection = table = 'images'
    id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    thumb = scrapy.Field()
Downloader Middleware is the processing layer that sits between Scrapy's Requests and Responses (between the engine and the Downloader).
Downloader Middleware can modify the User-Agent, handle redirects, set proxies, retry failed requests, set cookies, and so on.
DOWNLOADER_MIDDLEWARES_BASE lists the downloader middlewares that Scrapy enables by default:
{
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
    'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
    'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
}
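Custom downloader middlewares are registered through the DOWNLOADER_MIDDLEWARES setting, which Scrapy merges with the base list above. A minimal sketch (the custom middleware path is hypothetical); assigning None disables a built-in middleware:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # hypothetical custom middleware; lower numbers sit closer to the engine,
    # so their process_request() runs earlier
    'tutorial.middlewares.UserAgentMiddleware': 543,
    # assigning None disables a built-in middleware from the base list
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}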
Core methods
process_request(request, spider)
process_request() is called before the Scrapy engine hands the Request to the Downloader. The effect depends on the return value:
- None: Scrapy keeps processing the Request, calling the process_request() of the remaining downloader middlewares, until the Downloader executes the Request and a Response is obtained
- Response: the process_request() of lower-priority downloader middlewares is no longer called; instead, each middleware's process_response() is called in turn
- Request: lower-priority downloader middlewares stop executing; the returned Request is put back into the scheduler, and once the Scheduler dispatches it, all downloader middlewares run again
- raising IgnoreRequest: the process_exception() of every downloader middleware is called in turn; if none of them handles the exception, the Request's errback() is called
process_response(request, response, spider)
After the Downloader executes the Request, it produces the corresponding Response. Before the engine sends the Response to the Spider for parsing, process_response() is called. The effect depends on the return value:
- Request: the process_response() of lower-priority downloader middlewares is no longer called; the returned Request is put back into the scheduler, and once the Scheduler dispatches it, all downloader middlewares run again
- Response: the process_response() of lower-priority downloader middlewares continues to be called
- raising IgnoreRequest: the Request's errback() is called; if no code handles the exception, it is ignored
process_exception(request, exception, spider)
Called when the Downloader or a process_request() method raises an exception. The effect depends on the return value:
- None: the process_exception() of lower-priority downloader middlewares continues to be called
- Response: the process_exception() of lower-priority downloader middlewares is no longer called; instead, each middleware's process_response() is called
- Request: the process_exception() of lower-priority downloader middlewares is no longer called; the returned Request is put back into the scheduler, and once the Scheduler dispatches it, all downloader middlewares run again
import random
from scrapy import signals

class UserAgentMiddleware(object):
    def __init__(self, user_agent='Scrapy'):
        self.user_agent = user_agent
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36",
            "Mozilla/5.0 (Windows NT 9.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"
        ]

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent)
        # request.headers['User-Agent'] = self.user_agent
        # request.headers['User-Agent'] = random.choice(self.user_agents)  # recommended: pick a random user agent
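The example above only uses process_request(). As a complement, a minimal sketch of the other two hooks (the class name, retry limit, and retry policy are hypothetical, not part of the original example): it re-schedules 5xx responses from process_response() and re-schedules timed-out requests from process_exception():

from twisted.internet.error import TimeoutError

class SimpleRetryMiddleware(object):
    ''' hypothetical middleware illustrating process_response / process_exception '''
    MAX_RETRIES = 3  # assumed limit

    def process_response(self, request, response, spider):
        retries = request.meta.get('retry_times', 0)
        if response.status >= 500 and retries < self.MAX_RETRIES:
            # returning a Request re-schedules it; all downloader middlewares run again
            new_meta = dict(request.meta, retry_times=retries + 1)
            return request.replace(meta=new_meta, dont_filter=True)
        # returning the Response lets lower-priority middlewares keep processing it
        return response

    def process_exception(self, request, exception, spider):
        if isinstance(exception, TimeoutError):
            # returning a Request stops further process_exception() calls and re-schedules it
            return request.replace(dont_filter=True)
        # returning None lets lower-priority middlewares handle the exception
        return None

Like the user-agent middleware, it would be enabled through the DOWNLOADER_MIDDLEWARES setting shown earlier.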
Spider Middleware is a hook framework that plugs into Scrapy's Spider processing mechanism.
After the Downloader generates a Response, the Response is sent to the Spider; before delivery it first passes through the Spider Middleware. Likewise, the Items and Requests the Spider produces pass through the Spider Middleware again.
Spider Middleware serves three purposes:
- processing a Response after the Downloader generates it and before it is delivered to the Spider
- processing a Request after the Spider generates it and before it is sent to the Scheduler
- processing an Item after the Spider generates it and before it is sent to the Item Pipeline
SPIDER_MIDDLEWARES_BASE defines the default spider middlewares:
{
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
}
Core methods
process_spider_input(response, spider)
Called when a Response passes through the Spider Middleware on its way to the Spider:
- returning None: Scrapy keeps processing the Response and calls the remaining spider middlewares
- raising an exception: the process_spider_input() of the remaining spider middlewares is not called; instead the Request's errback() is invoked, and the errback output is fed back into the middleware chain and handled by process_spider_output()
process_spider_output(response, result, spider)
Called with the result the Spider produces for a Response; result is an iterable of Request or Item objects, and the method must also return an iterable of Request or Item objects.
process_spider_exception(response, exception, spider)
Called when process_spider_input() raises an exception:
- returning None: Scrapy keeps processing the exception and calls the process_spider_exception() of the remaining spider middlewares
- returning an iterable: the process_spider_output() of the subsequent spider middlewares is called on it, and no further process_spider_exception() is called
process_start_requests(start_requests, spider)
Called with the Spider's start Requests as its argument; it works much like process_spider_output(), except it only deals with Requests and must return an iterable of Requests.
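There is no spider middleware example in these notes, so here is a minimal sketch touching each hook (the class name, the 'author' field check, and the meta flag are hypothetical):

from scrapy import Request

class SimpleSpiderMiddleware(object):
    ''' hypothetical spider middleware illustrating the four hooks '''

    def process_spider_input(self, response, spider):
        # returning None lets the Response continue on to the Spider
        return None

    def process_spider_output(self, response, result, spider):
        # pass Requests through; drop Items/dicts that lack an 'author' field
        for obj in result:
            if isinstance(obj, Request) or obj.get('author'):
                yield obj

    def process_spider_exception(self, response, exception, spider):
        # returning None lets the remaining middlewares handle the exception
        return None

    def process_start_requests(self, start_requests, spider):
        # tag every start request; must yield Requests
        for request in start_requests:
            request.meta['from_start'] = True
            yield request

It would be enabled with something like SPIDER_MIDDLEWARES = {'tutorial.middlewares.SimpleSpiderMiddleware': 543} in settings.py (the path is an assumption).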
An Item Pipeline is defined as a class that implements process_item(); the method must return a dict or Item containing the data, or raise a DropItem exception.
from_crawler: a class method (marked with @classmethod) and a form of dependency injection; its argument is the crawler, through which every entry of the global settings can be read.
open_spider: called when the Spider is opened; a good place for initialization work such as opening a database connection.
close_spider: called when the Spider is closed; handles cleanup work.
process_item: the main method, where the data is processed.
MongoDB is recommended for data storage.
from scrapy.exceptions import DropItem
from tutorial.items import QuoteItem
import pymysql
import pymongo

class TextPipeline(object):
    ''' text processing '''
    def __init__(self):
        self.limit = 50

    def process_item(self, item: QuoteItem, spider):
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].rstrip() + '...'
            return item
        else:
            raise DropItem('Missing Text')

class MysqlPipeline(object):
    ''' store the text in MySQL '''
    def __init__(self, host, port, user, pwsd, db_name):
        self.host = host
        self.port = port
        self.user = user
        self.pwsd = pwsd
        self.db_name = db_name

    @classmethod
    def from_crawler(cls, crawler):
        host = crawler.settings.get("MYSQL_HOST")
        port = crawler.settings.get("MYSQL_PORT")
        user = crawler.settings.get("MYSQL_USER")
        pwsd = crawler.settings.get("MYSQL_PWSD")
        db_name = crawler.settings.get("MYSQL_DB")
        return cls(host, port, user, pwsd, db_name)

    def open_spider(self, spider):
        self.client = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                      password=self.pwsd, db=self.db_name)

    def process_item(self, item, spider):
        name = item.__class__.__name__
        author = item['author']
        tags = ','.join(item['tags'])
        text = item['text']
        cursor = self.client.cursor()
        # parameterized query: the driver escapes the values, no manual escaping needed
        sql = 'insert into quotes values (%s, %s, %s, %s)'
        cursor.execute(sql, (name, author, tags, text))
        self.client.commit()
        return item

    def close_spider(self, spider):
        self.client.close()

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ''' class method that builds the pipeline from the crawler settings '''
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DB")
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # use the Item class name as the collection name
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
ImagesPipeline is the pipeline Scrapy provides specifically for downloading images and files.
The built-in ImagesPipeline reads the Item's image_urls field by default and assumes that field is a list of URLs.
Not every Item has an image_urls field, so the pipeline can be customized (inherit from the built-in ImagesPipeline and override file_path(), item_completed() and get_media_requests()).
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # name the file after the last path segment of the URL
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        '''
        :param results: the return values of get_media_requests.
        results is a list of tuples; the first element of each tuple is a bool (ok)
        indicating whether the download succeeded, and the second is a dict (x) with
        the download result for this Item, containing the keys url, path and checksum.
        '''
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Download Failed')
        # return the item so the next pipeline can use it
        return item

    def get_media_requests(self, item, info):
        '''
        Issue the download requests; the files are saved automatically under
        IMAGES_STORE = './images'
        '''
        yield Request(item['url'])
Configuration file (settings.py)
# directory where the ImagePipeline stores downloaded files/images
IMAGES_STORE = './images'

# global USER_AGENT override
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"

ITEM_PIPELINES = {
    # the smaller the number, the earlier the pipeline is called
    'tutorial.pipelines.TextPipeline': 300,
    'tutorial.pipelines.MysqlPipeline': 400
}

MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PWSD = '123456'
MYSQL_DB = 'spider_text'

MONGO_URI = 'localhost'
MONGO_DB = 'tutorial'
The spiders folder holds the project's Spiders.
Spider structure:
name: the name unique within the project, used to distinguish different spiders.
allowed_domains: the domains the spider is allowed to crawl; initial or follow-up request URLs outside these domains are filtered out.
start_urls: the list of URLs the Spider crawls on startup; the initial requests are built from it.
custom_settings: a dict of settings specific to this spider; it overrides the global settings.
crawler: set by the from_crawler() method, it is the Crawler object this Spider class belongs to. The Crawler object contains many project components; most commonly it is used to obtain the project settings (Settings).
settings: a Settings object, used to read the project's global settings directly.
start_requests(): used to generate the initial requests; it must return an iterable. By default it builds Requests from the URLs in start_urls; Request issues a GET, FormRequest issues a POST.
parse(): the default callback of the Spider; it is called once the requests built from the links in start_urls have finished downloading, and it parses the responses.
closed(): called when the Spider is closed, to handle cleanup work.
import scrapy
from tutorial.items import QuoteItem
from scrapy.http.response.html import HtmlResponse

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']
    # spider-specific USER_AGENT
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36",
    }

    def parse(self, response: HtmlResponse):
        ''' parse the HTML '''
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            item['tags'] = quote.css('.tags .tag::text').extract()
            yield item
        next_page = response.css('.pager .next a::attr("href")').extract_first()
        if next_page:  # stop when there is no next page
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)
start_requests(): issues the initial requests; this is where the URLs to crawl can be modified.
By default GET requests are issued; to use POST, start_requests must be overridden to use FormRequest or JsonRequest.
headers and cookies can also be set in start_requests to simulate a login.
import json
from urllib.parse import urlencode
from scrapy import Request, Spider
from scrapy.http.response.html import HtmlResponse
from tutorial.items import ImageItem

class ImagesSpider(Spider):
    name = 'images'
    allowed_domains = ['images.so.com']
    start_urls = ['https://image.so.com/']

    def start_requests(self):
        data = {'ch': 'beauty', 't1': 595}
        base_url = "https://image.so.com/zjl?"
        for page in range(0, 50):
            data['sn'] = page * 30
            params = urlencode(data)
            url = base_url + params
            # GET request
            yield Request(url, self.parse)  # a different parse callback could also be used
            # # POST request variants:
            # JsonRequest(url, data=data, callback=self.parse)      # JSON body
            # FormRequest(url, formdata=data, callback=self.parse)  # form data
            # Request(url, method='POST', body=json.dumps(data), headers={'Content-Type': 'application/json'})  # raw JSON body

    def parse(self, response: HtmlResponse):
        result = json.loads(response.text)
        for image in result.get('list'):
            item = ImageItem()
            item['id'] = image.get('id')
            item['url'] = image.get('qhimg_url')
            item['title'] = image.get('title')
            item['thumb'] = image.get('qhimg_thumb')
            yield item
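To illustrate the login case mentioned above, a minimal sketch of a POST login in start_requests (the URL, form field names, and follow-up page are hypothetical):

from scrapy import FormRequest, Request, Spider

class LoginSpider(Spider):
    name = 'login_demo'  # hypothetical spider

    def start_requests(self):
        # POST the credentials first (URL and field names are assumptions)
        yield FormRequest(
            'https://example.com/login',
            formdata={'username': 'user', 'password': 'pass'},
            callback=self.after_login
        )

    def after_login(self, response):
        # with COOKIES_ENABLED (the default), the session cookies are reused
        # automatically, so protected pages can be requested next
        yield Request('https://example.com/profile', callback=self.parse)

    def parse(self, response):
        yield {'title': response.css('title::text').extract_first()}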
Selector is a module that can also be used on its own: build a Selector object and call methods such as xpath() and css() on it to extract data.
from scrapy import Selector

body = '....'  # the HTML text to parse
selector = Selector(text=body)
title = selector.xpath('//title/text()').extract_first()
print(title)
A Spider's response has a selector attribute; information can be extracted with response.selector.xpath(), response.selector.css(), etc. Calling response.xpath() or response.css() is equivalent to calling them on the selector.
Selector methods:
extract(): extracts the matched nodes (returns a list)
extract_first(): extracts the first element of the list
xpath() and css() selectors can be nested, i.e. called on each other's results (see the sketch after the regex example below)
Selectors also support regular expressions:
response.xpath('//a/text()').re(r'Name:\s(.*)')        # returns a list of matches
response.xpath('//a/text()').re_first(r'Name:\s(.*)')  # returns the first match
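A small sketch of the nesting mentioned above, reusing the quotes.toscrape.com markup from the earlier spider (the selectors mirror that example and are otherwise assumptions):

# css() and xpath() can be chained on each other's results
for quote in response.css('.quote'):                     # SelectorList of quote blocks
    text = quote.css('.text::text').extract_first()      # css inside css
    author = quote.xpath('.//small[@class="author"]/text()').extract_first()  # xpath inside css
    print(text, author)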
# Must be enabled, otherwise the custom cookies take no effect
# Cookies are enabled by default; keep them enabled here
COOKIES_ENABLED = True
# set the cookies in a downloader middleware's process_request
def process_request(self, request, spider):
    request.cookies = {
        "sessionid": "......"
    }
    # return None so the modified request keeps flowing through the middleware chain
    return None
# In the spider itself, override start_requests and pass cookies through the cookies argument of scrapy's Request
def start_requests(self):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0"}
    # specify the cookies
cookies = {
'uuid': '66a0f5e7546b4e068497.1542881406.1.0.0',
'_lxsdk_cuid': '1673ae5bfd3c8-0ab24c91d32ccc8-143d7240-144000-1673ae5bfd4c8',
'__mta': '222746148.1542881402495.1542881402495.1542881402495.1', 'ci': '20',
'rvct': '20%2C92%2C282%2C281%2C1',
'_lx_utm': 'utm_source%3DBaidu%26utm_medium%3Dorganic',
'_lxsdk_s': '1674f401e2a-d02-c7d-438%7C%7C35'
}
    # request the detail page, declaring the callback; dont_filter=True prevents the request from being filtered (duplicate/offsite filters); meta passes data to the callback
    # (detailUrl, item and detail_parse come from surrounding spider code omitted here)
yield Request(detailUrl, headers=headers, cookies=cookies, callback=self.detail_parse, meta={'myItem': item}, dont_filter=True)
# Must be disabled, otherwise the cookie set in the headers is overridden
COOKIES_ENABLED = False
# set a global cookie via the default request headers
DEFAULT_REQUEST_HEADERS={
'cookie':'sessionid=1232...',
'Accept':'text/html,...',
'Accept-Language':'zh-CN,zh',
'Connection':'keep-alive'
}
If a Scrapy project contains several spiders, the CrawlerProcess class can run them concurrently.
# main.py
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

def start():
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    # spiders that should not run
    spider_besides = ['other']
    # spider_loader lists every spider registered in the project
    for spider_name in process.spider_loader.list():
        if spider_name in spider_besides:
            continue
        print('running spider: %s' % spider_name)
        process.crawl(spider_name)
    process.start()

if __name__ == '__main__':
    start()