1. 在 settings.py 中添加 UA 池和 IP 代理池
# Pool of User-Agent strings; the random User-Agent middleware picks one per
# request.
USER_AGENT_LIST = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
]
# Old-style proxy pool: one dict per proxy.
#   'ip_port'  - proxy URL, always present
#   'user_pwd' - optional "username:password" pair for proxies that need auth
# NOTE(review): the credentials below are hard-coded; presumably sample data --
# confirm before reusing this list outside the tutorial.
PROXY_LIST_OLD = [
    {
        'ip_port': 'http://219.141.153.35:80',
    },
    {
        'ip_port': 'http://118.190.95.43:9001',
    },
    {
        'ip_port': 'http://139.196.76.78:16816',
        'user_pwd': '583349285:2zectsyx',
    },
]
# New-style proxy pool: maps a URL scheme ("http" / "https") to the list of
# proxy URLs usable for requests of that scheme. Proxies that need
# authentication carry the credentials inside the URL itself
# (scheme://username:password@host:port).
#
# The last entry of each list had been garbled into "[email protected]" by an
# e-mail obfuscation filter; it is restored here from the matching
# PROXY_LIST_OLD entry (same host, port and username/password pair).
# NOTE(review): the credentials are hard-coded; presumably sample data --
# confirm before reusing this mapping outside the tutorial.
PROXIES_NEW = {
    "http": [
        "http://61.135.217.7:80",
        "http://219.141.153.38:80",
        "http://583349285:2zectsyx@139.196.76.78:16816",
    ],
    "https": [
        "https://113.226.18.243:80",
        "https://121.31.100.209:8123",
        "https://14.117.177.135:808",
        "https://171.223.230.46:61234",
        "https://117.57.90.121:25435",
        "https://175.11.214.29:808",
        "https://118.190.145.138:9001",
        "https://182.112.89.23:8118",
        "https://221.228.17.172:8181",
        "https://115.46.70.48:8123",
        "https://110.88.30.36:808",
        "https://110.87.104.153:8118",
        "https://1.195.25.204:61234",
        "https://119.186.241.31:61234",
        "https://175.155.152.41:61234",
        "https://27.31.103.233:21973",
        "https://125.105.110.4:3128",
        "https://114.222.24.111:808",
        "https://140.250.180.229:61234",
        "https://120.83.98.216:61234",
        "https://175.155.223.179:61234",
        "https://115.198.37.56:6666",
        "https://115.46.74.192:8123",
        "https://106.56.102.39:8070",
        "https://125.121.121.155:6666",
        "https://219.157.147.113:8118",
        "https://117.66.167.57:8118",
        "https://183.128.242.93:6666",
        "https://115.198.39.24:6666",
        "https://114.223.162.171:8118",
        "https://115.46.89.82:8123",
        "https://58.208.16.70:37436",
        "https://123.188.6.176:1133",
        "https://112.195.51.225:61234",
        "https://112.193.131.17:8118",
        "https://221.234.250.204:8010",
        "https://49.79.67.119:61234",
        "https://220.184.215.223:6666",
        "https://180.121.134.176:808",
        "https://122.246.48.118:8010",
        "https://119.7.59.13:61234",
        "https://27.54.248.42:8000",
        "https://59.32.37.99:8010",
        "https://220.191.100.253:6666",
        "https://112.193.70.85:61234",
        "https://60.167.128.91:48963",
        "https://119.4.70.128:61234",
        "https://182.88.166.148:8123",
        "https://113.117.65.112:61234",
        "https://115.226.129.195:61234",
        "https://106.75.71.122:80",
        "https://125.122.171.167:6666",
        "https://125.118.144.247:6666",
        "https://60.184.173.221:8070",
        "https://60.190.250.120:8080",
        "https://36.6.146.199:47025",
        "https://106.56.102.78:808",
        "https://119.7.225.218:61234",
        "https://583349285:2zectsyx@139.196.76.78:16816",
    ],
}
2. 在 middlewares 中实现下载器中间件
"""
实现随机的User-Agent
思路:
1. 准备User-Agent列表, 在settings.py中
2. 实现一个下载器中间件, 实现process_request方法
3. 在process_request方法中, 从User-Agent列表中, 随机取出一个User-Agent
4. 把这个User-Agent设置给request的headers
"""
from middlewares.settings import USER_AGENT_LIST
import random
class RandomUserAgentDownloaderMiddleware(object):
    """Downloader middleware that puts a random User-Agent on each request.

    Candidate values are read from ``USER_AGENT_LIST``.
    """

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random entry.

        Args:
            request: Request being processed; its ``headers`` mapping is
                modified in place.
            spider: Spider the request belongs to (unused).

        Returns:
            None in every case.
        """
        # random.choice raises IndexError on an empty sequence; with an empty
        # pool the request keeps whatever User-Agent it already has.
        if not USER_AGENT_LIST:
            return None
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)
        return None
"""
随机的代理IP
旧版实现方式
思路:
1. 准备代理IP的列表, 在settings.py文件中
# 注: 如果是真实开发时候,代理IP列表存储数据库中, 比如:Redis
2. 实现随机的代理IP的下载器中间件, 实现process_request方法
3. 随机从列表中取出一个代理IP
4. 把这个代理IP设置给request.meta['proxy']
5. 如果带有用户名和密码的代理, 那么就需要对用户名和密码进行认证.
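新版实现方式
思路:
1. 准备按协议(http/https)分组的代理IP字典, 在settings.py文件中; 需要认证的代理把用户名和密码直接写在代理URL里
2. 实现下载器中间件的process_request方法, 根据请求URL的协议随机取出一个对应的代理IP
3. 把这个代理IP设置给request.meta['proxy']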
"""
from middlewares.settings import PROXY_LIST_OLD
from base64 import b64encode
class RandomProxyDownloaderMiddlewareOld(object):
    """Downloader middleware that assigns a random proxy (old-style pool).

    Each entry of ``PROXY_LIST_OLD`` is a dict with an ``'ip_port'`` proxy URL
    and, optionally, a ``'user_pwd'`` string of the form ``username:password``.
    """

    def process_request(self, request, spider):
        """Pick a random proxy and attach it, with credentials, to the request.

        Args:
            request: Request being processed; ``meta['proxy']`` and the
                ``Proxy-Authorization`` header are modified in place.
            spider: Spider the request belongs to (unused).

        Returns:
            None in every case.
        """
        # random.choice raises IndexError on an empty sequence; with no
        # proxies configured the request is left untouched.
        if not PROXY_LIST_OLD:
            return None

        chosen = random.choice(PROXY_LIST_OLD)
        request.meta['proxy'] = chosen['ip_port']

        credentials = chosen.get('user_pwd')
        if credentials:
            # HTTP Basic scheme: base64 of "username:password".
            token = b64encode(credentials.encode()).decode()
            request.headers['Proxy-Authorization'] = "Basic " + token
        else:
            # The request may already carry credentials that were set for a
            # different proxy; drop them so they are not sent to this one.
            request.headers.pop('Proxy-Authorization', None)
        return None
from middlewares.settings import PROXIES_NEW
class RandomProxyDownloaderMiddlewareNew(object):
    """Downloader middleware that assigns a random proxy (new-style pool).

    ``PROXIES_NEW`` maps a URL scheme to a list of proxy URLs; the proxy is
    drawn from the list that matches the scheme of the request URL.
    """

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']`` to a random proxy for the URL scheme.

        Args:
            request: Request being processed; ``meta['proxy']`` is set in
                place when a proxy is available for its scheme.
            spider: Spider the request belongs to (unused).

        Returns:
            None in every case.
        """
        # Everything before the first '://' is the scheme. Lower-casing lets
        # "HTTP://..." match the lower-case keys of PROXIES_NEW.
        scheme = request.url.split('://', 1)[0].lower()
        candidates = PROXIES_NEW.get(scheme)
        # Unknown scheme or empty list: leave the request without a proxy.
        if candidates:
            request.meta['proxy'] = random.choice(candidates)
        return None