Once the downloader returns a response, the engine processes it with _handle_downloader_output(self, response, request, spider):
def _handle_downloader_output(self, response, request, spider):
    assert isinstance(response, (Request, Response, Failure)), response
    # downloader middleware can return requests (for example, redirects)
    if isinstance(response, Request):
        # a returned Request is fed back into crawl()
        self.crawl(response, spider)
        return
    # response is a Response or Failure
    d = self.scraper.enqueue_scrape(response, request, spider)  # parse the response
    d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                        exc_info=failure_to_exc_info(f),
                                        extra={'spider': spider}))
    return d
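The assert shows that the download chain can hand back three kinds of objects. The Request branch exists because a downloader middleware may replace a Response with a new Request, redirects being the classic case. A minimal sketch of such a middleware (a hypothetical NaiveRedirectMiddleware, not Scrapy's real RedirectMiddleware):

class NaiveRedirectMiddleware(object):  # hypothetical, for illustration only
    def process_response(self, request, response, spider):
        if response.status in (301, 302) and 'Location' in response.headers:
            location = response.headers['Location'].decode('latin1')
            # returning a Request instead of the Response makes
            # _handle_downloader_output() take the self.crawl() branch above
            return request.replace(url=response.urljoin(location))
        return response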
As the code above shows, the actual parsing is done by the scraper.enqueue_scrape(response, request, spider) method in scraper.py:
def enqueue_scrape(self, response, request, spider):
    slot = self.slot
    dfd = slot.add_response_request(response, request)

    def finish_scraping(_):
        slot.finish_response(response, request)
        self._check_if_closing(spider, slot)
        self._scrape_next(spider, slot)
        return _

    dfd.addBoth(finish_scraping)
    dfd.addErrback(
        lambda f: logger.error('Scraper bug processing %(request)s',
                               {'request': request},
                               exc_info=failure_to_exc_info(f),
                               extra={'spider': spider}))
    self._scrape_next(spider, slot)
    return dfd

def _scrape_next(self, spider, slot):
    while slot.queue:
        # pop the next (response, request, deferred) triple from the slot's queue
        response, request, deferred = slot.next_response_request_deferred()
        self._scrape(response, request, spider).chainDeferred(deferred)
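The slot here is the scraper's Slot object, essentially a deque of (response, request, deferred) triples. A condensed sketch, simplified from scrapy/core/scraper.py (the real class also tracks active_size to apply backpressure on large responses):

from collections import deque
from twisted.internet import defer


class Slot(object):
    """Scrape slot (one per running spider) - simplified"""

    def __init__(self):
        self.queue = deque()   # pending (response, request, deferred) triples
        self.active = set()    # requests currently being scraped

    def add_response_request(self, response, request):
        # this Deferred fires once scraping of the response finishes;
        # enqueue_scrape() chains finish_scraping() onto it
        deferred = defer.Deferred()
        self.queue.append((response, request, deferred))
        return deferred

    def next_response_request_deferred(self):
        response, request, deferred = self.queue.popleft()
        self.active.add(request)
        return response, request, deferred

    def finish_response(self, response, request):
        self.active.remove(request)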
_scrape obtains the spider's output (items and requests) by calling _scrape2, then hands that output to the handle_spider_output method:
def _scrape(self, response, request, spider):
    """Handle the downloaded response or failure through the spider
    callback/errback"""
    assert isinstance(response, (Response, Failure))

    dfd = self._scrape2(response, request, spider)  # returns spider's processed output
    dfd.addErrback(self.handle_spider_error, request, response, spider)
    dfd.addCallback(self.handle_spider_output, request, response, spider)  # chain handle_spider_output as the callback
    return dfd
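Note the ordering: the errback is chained before the callback. In Twisted that means a Failure coming out of _scrape2 is first handled by handle_spider_error, and whatever that errback returns (None) then flows into handle_spider_output, whose "if not result" guard turns it into a no-op. A standalone Twisted demo of this pattern (not Scrapy code):

from twisted.internet import defer


def handle_error(failure):
    # plays the role of handle_spider_error: log and swallow the failure
    print('error handled:', failure.getErrorMessage())


def handle_output(result):
    # plays the role of handle_spider_output: None short-circuits it
    print('spider output:', result)


d = defer.Deferred()
d.addErrback(handle_error)    # added first: failures are handled first
d.addCallback(handle_output)  # the recovered result (None) flows here
d.errback(RuntimeError('spider callback blew up'))
# prints: error handled: spider callback blew up
#         spider output: None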
_scrape2 in turn calls call_spider, which invokes the request's callback (or the spider's default parse method) to parse the response and produce the items:
def _scrape2(self, request_result, request, spider):
    """Handle the different cases of request's result being a Response or a
    Failure"""
    if not isinstance(request_result, Failure):
        # run call_spider through the spider middleware chain
        return self.spidermw.scrape_response(
            self.call_spider, request_result, request, spider)
    else:
        # FIXME: don't ignore errors in spider middleware
        dfd = self.call_spider(request_result, request, spider)
        return dfd.addErrback(
            self._log_download_errors, request_result, request, spider)

def call_spider(self, result, request, spider):
    result.request = request
    dfd = defer_result(result)
    # invoke the request's callback, or the spider's default parse()
    dfd.addCallbacks(request.callback or spider.parse, request.errback)
    return dfd.addCallback(iterate_spider_output)
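iterate_spider_output (from scrapy.utils.spider) is what lets a callback return a single item, a list, a generator, or None; it normalizes all of them into an iterable (via arg_to_iter). Roughly:

from scrapy.utils.spider import iterate_spider_output

print(list(iterate_spider_output(None)))            # []       - no output
print(list(iterate_spider_output({'title': 't'})))  # [{...}]  - a single item gets wrapped
print(list(iterate_spider_output([1, 2])))          # [1, 2]   - iterables pass through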
handle_spider_output then uses parallel() to feed each item (or request) in that output to the _process_spidermw_output method:
def handle_spider_output(self, result, request, response, spider):
    if not result:
        return defer_succeed(None)
    it = iter_errback(result, self.handle_spider_error, request, response, spider)
    dfd = parallel(it, self.concurrent_items,
                   self._process_spidermw_output, request, response, spider)
    return dfd
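parallel() comes from scrapy/utils/defer.py and caps item processing at self.concurrent_items (the CONCURRENT_ITEMS setting) by driving the work generator with a Twisted Cooperator. In Scrapy 1.x it is implemented roughly like this:

from twisted.internet import defer, task


def parallel(iterable, count, callable, *args, **named):
    """Execute a callable over the objects in the given iterable, in
    parallel, using no more than ``count`` concurrent calls."""
    coop = task.Cooperator()
    work = (callable(elem, *args, **named) for elem in iterable)
    return defer.DeferredList([coop.coiterate(work) for _ in range(count)])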
_process_spidermw_output(self, output, request, response, spider) calls itemproc.process_item to push each item through the configured pipelines, which is where it can be persisted locally:
def _process_spidermw_output(self, output, request, response, spider):
    """Process each Request/Item (given in the output parameter) returned
    from the given spider
    """
    if isinstance(output, Request):
        self.crawler.engine.crawl(request=output, spider=spider)
    elif isinstance(output, (BaseItem, dict)):
        # send the item through the item pipelines
        self.slot.itemproc_size += 1
        dfd = self.itemproc.process_item(output, spider)
        dfd.addBoth(self._itemproc_finished, output, response, spider)
        return dfd
    elif output is None:
        pass
    else:
        typename = type(output).__name__
        logger.error('Spider must return Request, BaseItem, dict or None, '
                     'got %(typename)r in %(request)s',
                     {'request': request, 'typename': typename},
                     extra={'spider': spider})
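So a single callback can freely mix the two cases. A hypothetical spider to make the branches concrete (each yielded dict goes to the pipelines, each yielded Request goes back to engine.crawl()):

import scrapy


class BooksSpider(scrapy.Spider):  # hypothetical example spider
    name = 'books'
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        for product in response.css('article.product_pod'):
            # dict -> the (BaseItem, dict) branch -> itemproc.process_item()
            yield {'title': product.css('h3 a::attr(title)').extract_first()}
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page:
            # Request -> the isinstance(output, Request) branch -> engine.crawl()
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)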
Finally, let's look at how the data actually gets persisted locally. ItemPipelineManager chains the process_item method of every enabled pipeline:
def process_item(self, item, spider):
    return self._process_chain('process_item', item, spider)

def _add_middleware(self, pipe):
    # register each enabled item pipeline with the middleware manager
    super(ItemPipelineManager, self)._add_middleware(pipe)
    if hasattr(pipe, 'process_item'):
        self.methods['process_item'].append(pipe.process_item)
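itemproc is an instance of this ItemPipelineManager, so process_item simply runs the item through each registered pipeline's process_item, one deferred after another; the pipelines themselves do the real persisting. A minimal, hypothetical pipeline that writes items to a local JSON-lines file:

import json


class JsonLinesPipeline(object):  # hypothetical pipeline, for illustration
    def open_spider(self, spider):
        self.file = open('items.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # dict(item) works for plain dicts and Item objects alike
        self.file.write(json.dumps(dict(item)) + '\n')
        return item  # return the item so later pipelines in the chain see it

Enabling it is just a matter of registering it in settings.py, e.g. ITEM_PIPELINES = {'myproject.pipelines.JsonLinesPipeline': 300} (the module path is made up); _add_middleware above then picks up its process_item.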