1. middlewares.py 中代码(注意:settings 中注册的路径是 company_cotacts.middlewares.IPPOOlS,文件名应为 middlewares.py)
class IPPOOlS(HttpProxyMiddleware):
    """Downloader middleware that routes every outgoing request through a
    proxy chosen at random from the IPPOOL list defined in settings.py.

    NOTE(review): assumes `random`, `IPPOOL` and `HttpProxyMiddleware`
    are imported at the top of middlewares.py — confirm in the full file.
    """

    def __init__(self, ip=''):
        # `ip` is kept only for signature compatibility; the actual proxy
        # is (re)selected per request in process_request.
        self.ip = ip

    def process_request(self, request, spider):
        # Pick one random proxy entry ({"ipaddr": "host:port"}) per request.
        thisip = random.choice(IPPOOL)
        print("当前使用IP是:" + thisip["ipaddr"])
        # Scrapy routes the request through whatever meta["proxy"] points at.
        request.meta["proxy"] = "http://" + thisip["ipaddr"]
2.创建uamind.py文件(和middleware同一路径)
# -*- coding: utf-8 -*-
"""Random User-Agent downloader middleware (uamind.py).

Lives in the same package as middlewares.py. For every outgoing request
it picks one User-Agent string at random from the UPPOOL list declared
in settings.py and sets it on the request headers.
"""
import random

# UPPOOL is the user-agent pool declared in settings.py.
from .settings import UPPOOL

# `scrapy.contrib.*` paths were removed in Scrapy 1.0; import the modern
# location first and fall back only for very old Scrapy installs.
try:
    from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
except ImportError:  # Scrapy < 1.0
    from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware


class Uamid(UserAgentMiddleware):
    """Rotate the User-Agent header on every request."""

    def __init__(self, user_agent=''):
        # Initialize user_agent explicitly — omitting it can raise errors
        # when Scrapy instantiates the middleware.
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Choose one random user agent from the pool for this request.
        thisua = random.choice(UPPOOL)
        print("当前使用User-Agent是:"+thisua)
        # setdefault: only applied if the request has no User-Agent yet.
        request.headers.setdefault('User-Agent', thisua)
3.setting中代码
# ========================================
# IP-pool and User-Agent-pool configuration
# ========================================

# Disable local cookies so requests carry no session state.
COOKIES_ENABLED = False

# Proxy IP pool consumed by the IPPOOlS downloader middleware.
IPPOOL = [
    {"ipaddr": "117.191.11.77:8080"},
    {"ipaddr": "211.159.140.133:8080"},
    {"ipaddr": "211.159.140.111:8080"},
    {"ipaddr": "112.175.32.88:8080"},
]

# User-Agent pool consumed by the Uamid downloader middleware.
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
]

# Register the custom downloader middlewares (lower number = runs earlier).
DOWNLOADER_MIDDLEWARES = {
    'company_cotacts.middlewares.IPPOOlS': 2,
    'company_cotacts.uamind.Uamid': 1,
}
我们在使用用户代理时,还有一种简单的方式:
1.在scrapy环境下安装fake_useragent包
2.使用以下fake_useragent中UserAgent模块
from fake_useragent import UserAgent
3.在setting中修改代码如下:
# Default agent that fake_useragent replaces:
# USER_AGENT = 'company_cotacts (+http://www.yourdomain.com)'
# One random user-agent string is drawn once at settings-load time.
ua = UserAgent().random
USER_AGENT = ua