CrawlSpider也继承自Spider,所以具备它的所有特性,在CrawlSpider源码中最先定义的是类Rule。
class Rule(object):
def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity):
self.link_extractor = link_extractor
self.callback = callback
self.cb_kwargs = cb_kwargs or {}
self.process_links = process_links
self.process_request = process_request
if follow is None:
self.follow = False if callback else True
else:
self.follow = follow
link_extractor:
该方法是一个Link Extractor实例,主要定义的就是链接的解析规则。默认的link解析器是LinkExtractor,也就是LxmlLinkExtractor。在之前的scrapy版本中还有其他的解析器,不过现在已经弃用了。
callback:
该值可以是一个方法,也可以是一个字符串(spider实例中一个方法的名称)。它就是一个回调方法,要慎用parse做为回调方法,因为这边的parse已经不像spider类中的那样没有具体操作。
cb_kwargs:
这是一个字典,用于给callback方法传递参数
follow:
是一个布尔对象,表示是当前response否继续采集。如果callback是None,那么它就默认为True,否则为False。
process_links:
该方法在crawlspider中的_requests_to_follow方法中被调用,它接收一个元素为Link的列表作为参数,返回值也是一个元素为Link的列表。可以用该方法对采集的Link对象进行修改,比如修改Link.url。这里的如果你的目标url是相对的链接,那么scrapy会将其扩展成绝对的。
process_request:
用于处理request的,根据需要,可以自己构造该方法。
代码流程图
crawl.py原码
import copy
import six
from scrapy.http import Request, HtmlResponse
from scrapy.utils.spider import iterate_spider_output
from scrapy.spiders import Spider
def identity(x):
return x
class Rule(object):
def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity):
self.link_extractor = link_extractor
self.callback = callback
self.cb_kwargs = cb_kwargs or {}
self.process_links = process_links
self.process_request = process_request
if follow is None:
self.follow = False if callback else True
else:
self.follow = follow
class CrawlSpider(Spider):
rules = ()
def __init__(self, *a, **kw):
super(CrawlSpider, self).__init__(*a, **kw)
self._compile_rules()
def parse(self, response):
return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)
def parse_start_url(self, response):
return []
def process_results(self, response, results):
return results
def _build_request(self, rule, link):
r = Request(url=link.url, callback=self._response_downloaded)
r.meta.update(rule=rule, link_text=link.text)
return r
def _requests_to_follow(self, response):
# 阅读源码可以发现,它的作用就是从response中解析出目标url,并将其包装成request请求。
# 该请求的回调方法是_response_downloaded,这里为request的meta值添加了rule参数,
# 该参数的值是这个url对应rule在rules中的下标。
if not isinstance(response, HtmlResponse):
return
seen = set()
for n, rule in enumerate(self._rules):
links = [lnk for lnk in rule.link_extractor.extract_links(response)
if lnk not in seen]
if links and rule.process_links:
links = rule.process_links(links)
for link in links:
seen.add(link)
r = self._build_request(n, link)
yield rule.process_request(r)
def _response_downloaded(self, response):
rule = self._rules[response.meta['rule']]
return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)
def _parse_response(self, response, callback, cb_kwargs, follow=True):
# 核心函数。该方法将resposne交给参数callback代表的方法去处理,
# 然后处理callback方法的requests_or_item。再根据rule.follow and spider._follow_links
# 来判断是否继续采集,如果继续那么就将response交给_requests_to_follow方法,根据规则提取相关的链接。
# spider._follow_links的值是从settings的CRAWLSPIDER_FOLLOW_LINKS值获取到的。
if callback:
cb_res = callback(response, **cb_kwargs) or ()
cb_res = self.process_results(response, cb_res)
for requests_or_item in iterate_spider_output(cb_res):
yield requests_or_item
if follow and self._follow_links:
for request_or_item in self._requests_to_follow(response):
yield request_or_item
def _compile_rules(self):
def get_method(method):
if callable(method):
return method
elif isinstance(method, six.string_types):
return getattr(self, method, None)
self._rules = [copy.copy(r) for r in self.rules]
for rule in self._rules:
rule.callback = get_method(rule.callback)
rule.process_links = get_method(rule.process_links)
rule.process_request = get_method(rule.process_request)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
spider._follow_links = crawler.settings.getbool(
'CRAWLSPIDER_FOLLOW_LINKS', True)
return spider
def set_crawler(self, crawler):
super(CrawlSpider, self).set_crawler(crawler)
self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)