Scrapy has two kinds of middleware, spider middleware and downloader middleware; this article focuses on the downloader middleware.
The downloader middleware sits between the engine and the Downloader, and is mainly used to intercept requests and to intercept responses.
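A downloader middleware does nothing until it is enabled in settings.py. A minimal sketch of that switch, assuming the Scrapy project is named middle (the project name is never shown in the original code):

# settings.py -- the number is the middleware's priority;
# lower values run closer to the engine
DOWNLOADER_MIDDLEWARES = {
    'middle.middlewares.MiddleDownloaderMiddleware': 543,
}

With that in place, the middleware below is called for every request the engine hands to the Downloader.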
import random

class MiddleDownloaderMiddleware:
    # UA pool: a random User-Agent is drawn from this list for every request
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    # The two pools below hold the proxy IPs, one pool per URL scheme
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    # Intercepts every outgoing request
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        # request.meta['proxy'] = 'https://61.64.144.123:8080'
        print('UA now in use:', request.headers['User-Agent'])
        return None  # None lets the request continue down the middleware chain
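The commented-out request.meta['proxy'] line marks where a proxy would be plugged in. One common way to actually use the two pools is a process_exception hook on the same class, so that a request which failed is retried through a random proxy; the scheme-based pool choice below is an illustrative policy, not something from the original code:

    def process_exception(self, request, exception, spider):
        # Pick the pool that matches the request's scheme (assumed policy)
        if request.url.startswith('https'):
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        else:
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        return request  # returning the request puts it back in the queue to be retried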
The next example pairs the download middleware with selenium to crawl NetEase News (news.163.com). The news lists on the module pages are loaded dynamically, so the spider keeps a single selenium browser alive and the middleware uses it to rebuild the responses for exactly those pages. Code in the spider file:

import scrapy
from selenium import webdriver
from wynews.items import WynewsItem

class NewsSpiderSpider(scrapy.Spider):
    name = 'news_spider'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']
    module_urls = []

    # Instantiate a single browser object shared by the whole crawl
    def __init__(self):
        super().__init__()
        self.driver = webdriver.Chrome(
            executable_path=r'C:\Users\Legion\AppData\Local\Google\Chrome\Application\chromedriver.exe')

    def parse(self, response):
        info_nodes = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        node_idx = [3, 4, 6, 7, 8]  # positions of the target module tabs
        for idx in node_idx:
            node = info_nodes[idx]
            self.module_urls.append(node.xpath('./a/@href').extract_first())
        for url in self.module_urls[:1]:  # only the first module, for demonstration
            yield scrapy.Request(url, callback=self.parse_module)

    # Parses each module page: one item per news entry plus its detail-page link
    def parse_module(self, response):
        info_nodes = response.xpath('//div[@class="ndi_main"]/div')
        for node in info_nodes[:5]:
            item = WynewsItem()
            news_title = node.xpath('.//div[@class="news_title"]//h3/a/text()').extract_first()
            news_url = node.xpath('.//div[@class="news_title"]//h3/a/@href').extract_first()
            item['news_title'] = news_title
            item['news_url'] = news_url
            yield item
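One loose end: nothing above ever shuts the browser down. Scrapy calls a spider's closed() method when the crawl ends, which makes it the natural place for that cleanup (a small addition, not in the original):

    def closed(self, reason):
        # Quit selenium's browser when the spider finishes, so no stray
        # Chrome process is left running
        self.driver.quit()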
Code in middlewares.py:
from scrapy.http import HtmlResponse
import time

class WynewsDownloaderMiddleware:
    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        driver = spider.driver  # grab the browser object defined in the spider file
        # Pick out the URLs whose response objects need to be replaced
        if request.url in spider.module_urls:
            driver.get(request.url)
            time.sleep(2)  # a fixed wait for the dynamic content to render
            page_text = driver.page_source
            # Wrap the browser-rendered page in a new HtmlResponse and return it
            new_response = HtmlResponse(request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        # Responses that need no tampering are returned as-is
        else:
            return response
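For completeness, the item class the spider imports only needs the two fields it fills in; a plausible reconstruction (the original items.py is not shown):

# items.py
import scrapy

class WynewsItem(scrapy.Item):
    news_title = scrapy.Field()
    news_url = scrapy.Field()

As with the first example, WynewsDownloaderMiddleware must be switched on under DOWNLOADER_MIDDLEWARES in settings.py before its process_response is ever invoked.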