有些网站的内容是动态加载的,并且数据经过了加密,这时候需要借助 Selenium 来读取页面内容。
class SeleniumMiddleware(object):
    """Middleware that fetches flagged requests through a Selenium browser.

    A request opts in by setting a truthy ``usedSelenium`` entry in its
    ``meta`` dict. Such requests are loaded in a Firefox instance and
    answered with the browser's page source; every other request is left
    untouched.
    """

    def __init__(self):
        # A single browser instance is reused for every request.
        self.driver = webdriver.Firefox()

    def process_request(self, request, spider):
        """Handle one outgoing request.

        Args:
            request: The request being processed; ``request.meta`` and
                ``request.url`` are read.
            spider: The spider that issued the request (unused here).

        Returns:
            ``None`` when the request did not ask for Selenium, otherwise an
            ``HtmlResponse`` (status 200, UTF-8) built from the browser's
            current page source.
        """
        if not request.meta.get('usedSelenium', False):
            return None

        # Load the target URL in the browser.
        self.driver.get(request.url)

        # Hand the browser's page source back to the spider as the response.
        return HtmlResponse(
            url=request.url,
            status=200,
            body=self.driver.page_source,
            request=request,
            encoding='utf-8',
        )

    def __del__(self):
        """Shut the browser down when the middleware is garbage-collected."""
        # __init__ may have failed before ``driver`` was assigned; in that
        # case there is nothing to clean up.
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        try:
            # quit() ends the whole WebDriver session; close() would only
            # close the current window and can leave the driver process alive.
            driver.quit()
        except Exception:
            # Best-effort cleanup: a finalizer must not raise, and the
            # browser may already be gone by the time this runs.
            pass
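# @classmethod
# def from_crawler(cls, crawler):
#     """Second way to close the browser: hook the spider_closed signal."""
#     s = cls()
#     # Run spider_closed when the spider closes. `signals` is the signals
#     # module, which exposes many signals (spider_opened, spider_closed, ...).
#     crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
#     return s
#
# def spider_closed(self, spider):
#     self.driver.close()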
以上是两种关闭浏览器的方法:一种是 `__del__` 析构方法,另一种是(注释中的)通过 `from_crawler` 连接 `spider_closed` 信号。