selenium集成到scrapy

# middleware.py

import time

from scrapy.http import HtmlResponse


class JsloadMiddleware(object):
    """Downloader middleware that renders JavaScript-heavy pages with Selenium.

    For requests belonging to the 'weixin' spider, the page is loaded in the
    spider's own webdriver and the fully rendered HTML is returned directly,
    short-circuiting Scrapy's default downloader.
    """

    def process_request(self, request, spider):
        # Dispatch by spider name; matching on request.url would also work.
        # if request.url == '':
        if spider.name == 'weixin':
            spider.browser.get(request.url)
            # Crude fixed wait for the JS to finish rendering; a
            # WebDriverWait on a concrete element would be more reliable.
            time.sleep(1)
            # Returning an HtmlResponse from process_request makes Scrapy
            # skip its own download and hand this body straight to the
            # spider. Fixed the encoding typo 'urf-8' -> 'utf-8' (the
            # original would raise LookupError when the body was decoded),
            # and attach the originating request so response.request works.
            return HtmlResponse(url=request.url,
                                body=spider.browser.page_source,
                                encoding='utf-8',
                                request=request)
# spider.py
from selenium import webdriver
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals

class WeixinSpider(scrapy.Spider):
    """Spider for weixin.sogou.com whose pages are rendered via Selenium.

    One Chrome webdriver is created per spider instance and reused by the
    JS-rendering downloader middleware; it is shut down when the spider
    closes.
    """
    name = 'weixin'
    allowed_domains = ['weixin.sogou.com']
    start_urls = ['http://weixin.sogou.com/']

    def __init__(self, *args, **kwargs):
        # Forward crawler arguments (e.g. `scrapy crawl weixin -a key=val`)
        # to the base class; the original no-arg signature silently broke
        # spider-argument passing.
        super(WeixinSpider, self).__init__(*args, **kwargs)
        self.browser = webdriver.Chrome()

        # dispatcher.connect(handler, signal): run spider_closed on shutdown
        # so the webdriver is always cleaned up.
        # NOTE(review): scrapy.xlib.pydispatch was removed in Scrapy 1.7+;
        # on newer versions connect via crawler.signals in from_crawler.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Quit the webdriver when the spider shuts down."""
        print('spider closed')
        self.browser.quit()

你可能感兴趣的:(selenium集成到scrapy)