定向抓取时,目标站点的数据不能单次请求获取,需要 3、4 次或者更多次请求,请求之间有依赖关系,需要连续请求才能完成这个下载事务
前面讲过的 js 动态页面下载
......
下载器不可以影响 twisted 框架本身的异步机制
与 scrapy 原有的下载器调用接口一致,符合插件规范
其他插件依赖的状态要保留
1. 新建项目
# 创建项目 scrapy startproject jstest # 创建蜘蛛 scrapy genspider -t basic testSpider 'sina.com.cn'
2. 修改蜘蛛文件 testSpider.py
from scrapy.spider import Spider class TestspiderSpider(Spider): name = "testSpider" allowed_domains = ["sina.com.cn"] start_urls = ( 'http://www.sina.com.cn/', ) def parse(self, response): print response.body print 'download_latency:', response.meta['download_latency']
3. 创建 jstest/handler 目录,新建文件 mydownloader.py
参照 /usr/local/lib/python2.7/dist-packages/scrapy/core/downloader/handlers/http11.py 文件
#!/usr/bin/env python # encoding: utf-8 import re from time import time from cStringIO import StringIO from urlparse import urldefrag from zope.interface import implements from twisted.internet import defer, reactor, protocol from twisted.web.http_headers import Headers as TxHeaders from twisted.web.iweb import IBodyProducer from twisted.internet.error import TimeoutError from twisted.web.http import PotentialDataLoss from scrapy.xlib.tx import Agent, ProxyAgent, ResponseDone, HTTPConnectionPool, TCP4ClientEndpoint from scrapy.http import Headers from scrapy.responsetypes import responsetypes from scrapy.core.downloader.webclient import _parse from scrapy.utils.misc import load_object from scrapy.http import HtmlResponse from twisted.internet import utils class MyLogicDownloader(object): ''' 定制下载逻辑 ''' def __init__(self, agent=None): '''agent: 异步下载代理''' self._agent = agent def download(self, request): ''' 需要异步返回,不可以阻塞,本例子的演示直接调用 phantomjs的一个简单包装脚本 ''' begintime = time() d = self._download(request) d.addCallback(self.parseData, request, begintime) print '证明我是异步的' return d def _download(self, request): '''使用twsited 的函数创建异步进程调用''' d = utils.getProcessOutput('scrapyweb.js', args=(request.url, '24000'), reactor=reactor) def getOutput(result): return result d.addCallback(getOutput) return d def parseData(self, htmldoc, request, begintime): '''解析函数,当请求完成后被调用''' # 这个下载时间在调整下载速度的扩展 AutoThrottle 中被使用 request.meta['download_latency'] = time() - begintime return HtmlResponse(request.url, body=htmldoc + '\n证明我被解析过', request=request) class MyDownloadHandler(object): ''' 下载接口, 被上层所调用 ''' def __init__(self, settings): self._pool = HTTPConnectionPool(reactor, persistent=True) self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN') self._pool._factory.noisy = False self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY']) self._contextFactory = self._contextFactoryClass() def download_request(self, request, spider): 
'''下载的主要被调用接口(异步),返回 deferred (twisted 的延迟回调对象)''' myDownloader = MyLogicDownloader() return myDownloader.download(request) def close(self): return self._pool.closeCachedConnections()
4. 添加配置到 settings.py
# Route all http requests through the custom phantomjs-backed handler.
DOWNLOAD_HANDLERS = { 'http': 'jstest.handler.mydownloader.MyDownloadHandler' }
5. 在系统目录下新建一个 phantomjs 包装脚本 scrapyweb.js,并添加可执行权限
#!/usr/bin/env phantomjs
// Wrapper script for phantomjs: open a url, let the page's JS render,
// then print the resulting page content to stdout.
// Usage: phantomjs scrapyweb.js url timeout
if (phantom.args.length >= 1) {
    var url = phantom.args[0];
    // Default rendering timeout in ms; an optional second argument
    // overrides it, clamped to the range [0, 30000].
    var timeOut = 10000;
    if (phantom.args.length == 2) {
        timeOut = Math.min(30000, Math.max(0, phantom.args[1]));
    }
    var page = require('webpage').create();
    page.customHeaders = {
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'DNT': '1'
    };
    page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36';
    page.open(encodeURI(url), function (status) {
        if (status != 'success') {
            console.log('Err, status=' + status);
            phantom.exit(1);
        }
        console.log(page.content);
        phantom.exit();
    });
    // Safety net: if the open callback never fires, dump whatever has
    // rendered so far and exit once the timeout elapses.
    setTimeout(function () {
        console.log(page.content);
        phantom.exit();
    }, timeOut);
} else {
    console.log('Usage:');
    console.log('\tphantomjs scrapyweb.js url timeout');
    phantom.exit(1);
}
6. 运行
scrapy crawl testSpider
本例演示的是下载 js 动态页面。如果需要更复杂的连续下载,需要参照 scrapy 框架的 http11.py 文件修改
request.meta['download_latency'] 需要赋值,因为调整下载速度的 AutoThrottle 扩展依赖这个值进行判定
原文链接: http://www.hopez.org/blog/9/1396376115