How does pyspider implement asynchronous fetching?
Let's start with the call flow.
# pyspider\run.py
# line: 229
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent,
            timeout, phantomjs_endpoint, splash_endpoint, fetcher_cls,
            async=True, get_object=False, no_input=False):
    """
    Run Fetcher.
    """
    g = ctx.obj
    Fetcher = load_cls(None, None, fetcher_cls)

    if no_input:
        inqueue = None
        outqueue = None
    else:
        inqueue = g.scheduler2fetcher   # queue of tasks coming from the scheduler
        outqueue = g.fetcher2processor  # queue of fetch results going to the processor
    fetcher = Fetcher(inqueue=inqueue, outqueue=outqueue,
                      poolsize=poolsize, proxy=proxy, async=async)
    fetcher.phantomjs_proxy = phantomjs_endpoint or g.phantomjs_proxy
    fetcher.splash_endpoint = splash_endpoint
    if user_agent:
        fetcher.user_agent = user_agent
    if timeout:
        fetcher.default_options = copy.deepcopy(fetcher.default_options)
        fetcher.default_options['timeout'] = timeout

    g.instances.append(fetcher)
    if g.get('testing_mode') or get_object:
        return fetcher

    if xmlrpc:
        utils.run_in_thread(fetcher.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    fetcher.run()  # start the fetcher
The fetching itself is implemented in
pyspider\fetcher\tornado_fetcher.py
# line 636: the Fetcher's run() method
def run(self):
    '''Run loop'''
    logger.info("fetcher starting...")

    def queue_loop():
        if not self.outqueue or not self.inqueue:
            return
        while not self._quit:
            try:
                if self.outqueue.full():
                    break
                if self.http_client.free_size() <= 0:
                    break
                task = self.inqueue.get_nowait()  # pull a task from the scheduler queue
                # FIXME: decode unicode_obj should used after data selete from
                # database, it's used here for performance
                task = utils.decode_unicode_obj(task)
                self.fetch(task)
            except queue.Empty:
                break
            except KeyboardInterrupt:
                break
            except Exception as e:
                logger.exception(e)
                break

    tornado.ioloop.PeriodicCallback(queue_loop, 100, io_loop=self.ioloop).start()  # poll the task queue periodically
    tornado.ioloop.PeriodicCallback(self.clear_robot_txt_cache, 10000, io_loop=self.ioloop).start()
    self._running = True

    try:
        self.ioloop.start()
    except KeyboardInterrupt:
        pass

    logger.info("fetcher exiting...")
The rough flow of run(): queue_loop is registered as a PeriodicCallback on the fetcher's IOLoop and fires every 100 ms; as long as the output queue has room and the HTTP client has free slots, it pulls tasks from the scheduler queue and hands each one to fetch(). fetch() in turn simply delegates to async_fetch():
def fetch(self, task, callback=None):
    if self.async:
        return self.async_fetch(task, callback)
    else:
        return self.async_fetch(task, callback).result()
And async_fetch itself:
@gen.coroutine
def async_fetch(self, task, callback=None):
    '''Do one fetch'''
    url = task.get('url', 'data:,')
    if callback is None:
        callback = self.send_result

    type = 'None'
    start_time = time.time()
    try:
        if url.startswith('data:'):
            type = 'data'
            result = yield gen.maybe_future(self.data_fetch(url, task))
        elif task.get('fetch', {}).get('fetch_type') in ('js', 'phantomjs'):
            type = 'phantomjs'
            result = yield self.phantomjs_fetch(url, task)
        elif task.get('fetch', {}).get('fetch_type') in ('splash', ):
            type = 'splash'
            result = yield self.splash_fetch(url, task)
        else:
            type = 'http'
            result = yield self.http_fetch(url, task)
    except Exception as e:
        logger.exception(e)
        result = self.handle_error(type, url, task, start_time, e)

    callback(type, task, result)
    self.on_result(type, task, result)
    raise gen.Return(result)
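If no explicit callback is given, the result goes through self.send_result. Given the queue wiring set up in run.py, the default callback essentially just pushes the (task, result) pair onto the fetcher2processor queue; a minimal sketch under that assumption (not the verbatim pyspider source):

def send_result(self, type, task, result):
    '''Sketch: hand the fetch result over to the processor via the output queue.'''
    if self.outqueue:
        try:
            self.outqueue.put((task, result))
        except Exception as e:
            logger.exception(e)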
Inside async_fetch:
1) Depending on the task's fetch_type, a specific fetch routine is chosen (data_fetch, phantomjs_fetch, splash_fetch or http_fetch); here we focus on http_fetch (see the sketch below).
2) The result is then passed to the callback and to on_result for further handling.
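The real http_fetch is fairly long because it also handles cookies, robots.txt, redirects, proxies and so on. As a rough illustration of the http branch, a stripped-down sketch built on Tornado's AsyncHTTPClient might look like the following; the function name and the exact result keys are illustrative assumptions, not the verbatim pyspider code:

import time
from tornado import gen, httpclient

@gen.coroutine
def http_fetch_sketch(url, task):
    '''Simplified, hypothetical version of an http-type fetch.'''
    start_time = time.time()
    fetch = task.get('fetch', {})
    request = httpclient.HTTPRequest(
        url,
        method=fetch.get('method', 'GET'),
        headers=fetch.get('headers', {}),
        request_timeout=fetch.get('timeout', 120),
    )
    # The yield hands control back to the IOLoop while the request is in flight,
    # which is what lets many fetches run concurrently.
    response = yield httpclient.AsyncHTTPClient().fetch(request, raise_error=False)
    raise gen.Return({
        'orig_url': url,
        'url': response.effective_url or url,
        'status_code': response.code,
        'headers': dict(response.headers),
        'content': response.body or '',
        'time': time.time() - start_time,
    })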
The async mechanism here differs from the Tornado async crawlers one usually writes; a typical example looks like this:
# coding=utf-8
"""
Tornado async crawler example
"""
import time
from datetime import timedelta

try:
    from HTMLParser import HTMLParser
    from urlparse import urljoin, urldefrag
except ImportError:
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urldefrag

from tornado import httpclient, gen, ioloop, queues

base_url = 'http://www.tornadoweb.org/en/stable/'
concurrency = 10


@gen.coroutine
def get_links_from_url(url):
    """Download the page at `url` and parse it for links.

    Returned links have had the fragment after `#` removed, and have been made
    absolute so, e.g. the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(url)
        print('fetched %s' % url)

        html = response.body if isinstance(response.body, str) \
            else response.body.decode()
        urls = [urljoin(url, remove_fragment(new_url))
                for new_url in get_links(html)]
    except Exception as e:
        print('Exception: %s %s' % (e, url))
        raise gen.Return([])

    raise gen.Return(urls)


def remove_fragment(url):
    pure_url, frag = urldefrag(url)
    return pure_url


def get_links(html):
    class URLSeeker(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    url_seeker.feed(html)
    return url_seeker.urls


@gen.coroutine
def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()

    @gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            if current_url in fetching:
                return

            print('fetching %s' % current_url)
            fetching.add(current_url)
            urls = yield get_links_from_url(current_url)
            fetched.add(current_url)

            for new_url in urls:
                # Only follow links beneath the base URL
                if new_url.startswith(base_url):
                    yield q.put(new_url)

        finally:
            q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield fetch_url()

    q.put(base_url)

    # Start workers, then wait for the work queue to be empty.
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print('Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched)))


if __name__ == '__main__':
    import logging
    logging.basicConfig()
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(main)
In that pattern, every layer above the function that actually fetches must be decorated with @gen.coroutine, and the crawl is started with io_loop.run_sync(main). pyspider, by contrast, drives fetching with tornado.ioloop.PeriodicCallback.
According to the official documentation, the next invocation should not start while the current one is still running:
If the callback runs for longer than callback_time milliseconds, subsequent invocations will be skipped to get back on schedule.
In practice, however, that only holds when the callback is synchronous. If the callback hands work off to another asynchronous routine, the situation is different:
Basically, if a callback takes more than callback_time to execute, subsequent invocations are only skipped if the callback is synchronous. If the callback calls into another asynchronous routine, PeriodicCallback has no way of knowing that and will start the next invocation anyway.
Reference:
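To see this behaviour in isolation, here is a small self-contained demo (not pyspider code; it assumes the gen.coroutine/gen.sleep APIs of the Tornado 4.x/5.x era). The periodic callback only starts an async "fetch" and returns immediately, so even though each fetch takes 500 ms and the period is 100 ms, no invocation is skipped:

import time
from tornado import gen, ioloop

start = time.time()
in_flight = 0

@gen.coroutine
def slow_fetch():
    # Stand-in for an asynchronous HTTP request that takes longer than the period.
    global in_flight
    in_flight += 1
    print('+%.1fs: fetch started, %d in flight' % (time.time() - start, in_flight))
    yield gen.sleep(0.5)
    in_flight -= 1

def tick():
    # Calling the coroutine starts it and immediately returns a Future, so from
    # PeriodicCallback's point of view the callback always finishes "on time"
    # and no invocation is ever skipped.
    slow_fetch()

loop = ioloop.IOLoop.current()
ioloop.PeriodicCallback(tick, 100).start()  # callback_time = 100 ms
loop.call_later(2, loop.stop)               # run the demo for two seconds
loop.start()

The in-flight counter should climb to around five, because each 500 ms fetch overlaps with the fetches started by the next few ticks. This is the same property pyspider's queue_loop relies on to keep many requests running concurrently on a single IOLoop.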