Step 1: Configure settings

# In the downloader middlewares, disable the built-in user-agent middleware by setting it to None
DOWNLOADER_MIDDLEWARES = {
    'jobboleSpider.middlewares.RandomUserAgentMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
# Pick a random browser user-agent for every request
RANDOM_UA_TYPE = 'random'
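RANDOM_UA_TYPE is looked up as an attribute on fake_useragent's UserAgent object by the middleware in Step 2, so instead of 'random' you can pin a browser family (a hedged example; which attributes are available depends on your fake_useragent version):

RANDOM_UA_TYPE = 'chrome'   # every request gets a Chrome User-Agent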
Step 2:
Override the User-Agent logic in a custom middleware

from fake_useragent import UserAgent

class RandomUserAgentMiddleware(object):
    '''
    Swap in a random User-Agent for every request
    '''
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            # Resolve the configured type (e.g. ua.random) as an attribute of the UserAgent instance
            return getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', get_ua())
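As a quick sanity check, fake_useragent can be exercised on its own outside Scrapy (a minimal sketch; assumes the fake-useragent package is installed):

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)  # prints one random browser User-Agent string per call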
Adding a proxy in Scrapy:
In a middleware, set request.meta["proxy"] = "http://222.162.172.38:8060"
To check whether the proxy actually took effect, visit http://httpbin.org/ip after configuring it to see the outgoing IP, then visit www.ip138.com to confirm that the reported IP is the proxy's and not your own.
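Before wiring this into Scrapy, you can verify a proxy directly with requests (a minimal sketch; the proxy address is the example above and is likely dead by now, so substitute a live one):

import requests

proxies = {"http": "http://222.162.172.38:8060"}  # example proxy from above
resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10)
print(resp.json())  # should report the proxy's IP, not your real one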
Step 1:
Create a tool package:
Logic for dynamically fetching a usable proxy from the IP pool stored in the database
import requests

class GetIp(object):

    def judge_ip(self, ip, port, ip_type):
        # Check whether a proxy is alive by fetching a known page through it
        http_url = "http://www.baidu.com"
        proxy_url = '{2}://{0}:{1}'.format(ip, port, ip_type)
        try:
            # The test URL is plain HTTP, so the "http" proxy key is sufficient here
            proxy_dict = {
                "http": proxy_url
            }
            response = requests.get(http_url, proxies=proxy_dict)
        except Exception as e:
            print(e)
            print("invalid ip")
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if 200 <= code < 300:
                print("valid ip")
                return True
            else:
                print("invalid ip")
                self.delete_ip(ip)
                return False
    def get_random_ip(self):
        # Randomly pull one proxy from the database and validate it before returning
        random_sql = """
            SELECT ip, port, ip_type FROM dl_ip
            ORDER BY RAND()
            LIMIT 1
        """
        cursor.execute(random_sql)
        for ip_info in cursor.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            ip_type = ip_info[2]
            judge_re = self.judge_ip(ip, port, ip_type)
            if judge_re:
                return '{2}://{0}:{1}'.format(ip, port, ip_type)
            else:
                # The dead proxy was deleted inside judge_ip; recurse to try another row
                return self.get_random_ip()

if __name__ == '__main__':
    get_ip = GetIp()
    get_ip.get_random_ip()
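The tool above assumes a module-level cursor and a delete_ip helper that this section never shows. A minimal sketch of that scaffolding, assuming a MySQL table dl_ip(ip, port, ip_type) and the pymysql driver (the connection details are placeholders):

import pymysql

# Hypothetical connection details; substitute your own database credentials
conn = pymysql.connect(host='localhost', user='root', password='root',
                       database='spider', charset='utf8')
cursor = conn.cursor()

class GetIp(object):
    # delete_ip belongs on the same GetIp class as judge_ip/get_random_ip above
    def delete_ip(self, ip):
        # Drop a dead proxy from the pool so it is not picked again
        delete_sql = "DELETE FROM dl_ip WHERE ip = %s"
        cursor.execute(delete_sql, (ip,))
        conn.commit()
        return True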
Step 2:
Fetch the proxy in a Scrapy middleware

# Randomly fetch a proxy IP from the proxy pool
from tool.xxxx import GetIp

class RandomProxyMiddleware(object):
    # Dynamically set a proxy on each request
    def process_request(self, request, spider):
        get_ip = GetIp()
        request.meta["proxy"] = get_ip.get_random_ip()
Step 3:
Enable the proxy middleware in settings
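For reference, a sketch of what the Step 3 settings entry might look like (the class path assumes RandomProxyMiddleware lives in jobboleSpider/middlewares.py next to the UA middleware; the priority 544 is arbitrary):

DOWNLOADER_MIDDLEWARES = {
    'jobboleSpider.middlewares.RandomUserAgentMiddleware': 543,
    # Hypothetical path; point this at wherever RandomProxyMiddleware is defined
    'jobboleSpider.middlewares.RandomProxyMiddleware': 544,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}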