根据scrapy运行流程中所在位置不同分为:
但在scrapy默认的情况下 两种中间件都在middlewares.py一个文件中
爬虫中间件使用方法和下载中间件相同,常用下载中间件
接下来我们对腾讯招聘爬虫进行修改完善,通过下载中间件来学习如何使用中间件 编写一个Downloader Middlewares和我们编写一个pipeline一样,定义一个类,然后在setting中开启
Downloader Middlewares默认的方法:
import random
from fake_useragent import UserAgent
from Tencent.settings import USER_AGENTS_LIST # 注意导入路径,请忽视pycharm的错误提示
ua = UserAgent()
class UserAgentMiddleware(object):
def process_request(self, request, spider):
user_agent = random.choice(USER_AGENTS_LIST)
request.headers['User-Agent'] = ua.random
class CheckUA:
def process_response(self,request,response,spider):
print(request.headers['User-Agent'])
DOWNLOADER_MIDDLEWARES = {
'Tencent.middlewares.UserAgentMiddleware': 543,
}
USER_AGENTS_LIST = [
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", \
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", \
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", \
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", \
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", \
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", \
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", \
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" ]
proxy
字段request.meta['proxy']
class ProxyMiddleware(object):
def process_request(self,request,spider):
proxy = random.choice(proxies) # proxies可以在settings.py中,也可以来源于代理ip的webapi
# proxy = 'http://192.168.1.1:8118'
request.meta['proxy'] = proxy
return None # 可以不写return
在使用了代理ip的情况下可以在下载中间件的process_response()方法中处理代理ip的使用情况,如果该代理ip不能使用可以替换其他代理ip
class ProxyMiddleware(object):
def process_response(self, request, response, spider):
if response.status != '200' and response.status != '302':
#此时对代理ip进行操作,比如删除
return request