Using Selenium in a Scrapy downloader middleware
Create the project folder
Create the project
cd D:\workspace\pythonVip\spider\day20
scrapy startproject blogs
cd D:\workspace\pythonVip\spider\day20\blogs
scrapy genspider cnblog www.cnblogs.com
Write the Selenium code in a downloader middleware and return a response from it; after registering the middleware in the settings file, the spider receives that response and prints it.
middlewares.py
# Selenium middleware
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

class SeleniumMiddleware:
    def process_request(self, request, spider):
        options = webdriver.ChromeOptions()
        # Path to the Chrome binary
        options.binary_location = r"D:\Program Files\Google\Chrome\Application\chrome.exe"
        # Path to chromedriver
        service = Service(r"D:\workspace\pythonVip\spider\day20\blogs\blogs\chromedriver.exe")
        # Create the browser driver (Selenium 4 takes options=, not the removed chrome_options=)
        chrome = webdriver.Chrome(service=service, options=options)
        chrome.get(request.url)
        chrome.maximize_window()  # maximize the window
        page_source = chrome.page_source  # grab the rendered page source
        chrome.quit()  # close the browser so each request does not leak a Chrome process
        # Returning a Response here means no other downloader middleware's
        # process_request runs; the response goes straight back toward the engine.
        return HtmlResponse(url=request.url, body=page_source, request=request, encoding="utf-8")
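
The middleware above launches a fresh Chrome for every request, which is slow and heavy. A minimal sketch of a variant that shares one headless browser across the whole crawl and shuts it down when the spider closes (the class name and the --headless flag are my additions; the paths are the same illustrative ones used above):

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

class SharedSeleniumMiddleware:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")  # render without opening a window
        options.binary_location = r"D:\Program Files\Google\Chrome\Application\chrome.exe"
        service = Service(r"D:\workspace\pythonVip\spider\day20\blogs\blogs\chromedriver.exe")
        self.driver = webdriver.Chrome(service=service, options=options)

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # Close the shared browser when the spider finishes.
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        self.driver.get(request.url)
        return HtmlResponse(url=request.url, body=self.driver.page_source,
                            request=request, encoding="utf-8")

    def spider_closed(self, spider):
        self.driver.quit()

To try it, point DOWNLOADER_MIDDLEWARES at blogs.middlewares.SharedSeleniumMiddleware instead.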
Register the middleware in the settings file
settings.py
DOWNLOADER_MIDDLEWARES = {
    # Priority 99 puts this middleware ahead of Scrapy's built-in
    # downloader middlewares, which start at 100.
    'blogs.middlewares.SeleniumMiddleware': 99,
}
In the spider file, receive the response produced by the middleware and print it
import scrapy

class CnblogSpider(scrapy.Spider):
    name = 'cnblog'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['http://www.cnblogs.com/']

    def parse(self, response):
        # response here is the HtmlResponse built by the Selenium middleware.
        print(response.text)
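
As written, every request is rendered by the browser, even pages that do not need JavaScript. A common refinement, sketched here with a hypothetical use_selenium flag in request.meta (not part of the original code), is to let the spider opt in per request; the middleware returns None for everything else, which hands the request back to Scrapy's normal downloader:

import scrapy

class CnblogSpider(scrapy.Spider):
    name = 'cnblog'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['http://www.cnblogs.com/']

    def start_requests(self):
        for url in self.start_urls:
            # Only requests carrying this flag are rendered by Selenium.
            yield scrapy.Request(url, meta={'use_selenium': True})

    def parse(self, response):
        print(response.text)

The matching guard goes at the top of the middleware's process_request:

    def process_request(self, request, spider):
        if not request.meta.get('use_selenium'):
            return None  # None lets Scrapy download the request normally
        # ... continue with the Selenium code shown above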
Run the spider
cd D:\workspace\pythonVip\spider\day20\blogs\blogs
scrapy crawl cnblog