Adding a User-Agent Pool and an IP Proxy Pool to a Scrapy Spider

1. Add the UA pool and the IP proxy pool in settings.py

# 1. Prepare a User-Agent list in settings.py
USER_AGENT_LIST = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
]


# Proxy list for the old-style implementation
PROXY_LIST_OLD = [
    {'ip_port': 'http://219.141.153.35:80'},
    {'ip_port': 'http://118.190.95.43:9001'},
    # Proxy that requires Basic auth; credentials given as "username:password"
    {'ip_port': 'http://139.196.76.78:16816', 'user_pwd': '583349285:2zectsyx'}
]


# New-style proxy pool, keyed by URL scheme; proxies that need
# authentication embed the credentials directly in the URL
PROXIES_NEW = {
  "http": [
    "http://61.135.217.7:80",
    "http://219.141.153.38:80",
    "http://583349285:[email protected]:16816"
  ],
  "https": [
    "https://113.226.18.243:80",
    "https://121.31.100.209:8123",
    "https://14.117.177.135:808",
    "https://171.223.230.46:61234",
    "https://117.57.90.121:25435",
    "https://175.11.214.29:808",
    "https://118.190.145.138:9001",
    "https://182.112.89.23:8118",
    "https://221.228.17.172:8181",
    "https://115.46.70.48:8123",
    "https://110.88.30.36:808",
    "https://110.87.104.153:8118",
    "https://1.195.25.204:61234",
    "https://119.186.241.31:61234",
    "https://175.155.152.41:61234",
    "https://27.31.103.233:21973",
    "https://125.105.110.4:3128",
    "https://114.222.24.111:808",
    "https://140.250.180.229:61234",
    "https://120.83.98.216:61234",
    "https://175.155.223.179:61234",
    "https://115.198.37.56:6666",
    "https://115.46.74.192:8123",
    "https://106.56.102.39:8070",
    "https://125.121.121.155:6666",
    "https://219.157.147.113:8118",
    "https://117.66.167.57:8118",
    "https://183.128.242.93:6666",
    "https://115.198.39.24:6666",
    "https://114.223.162.171:8118",
    "https://115.46.89.82:8123",
    "https://58.208.16.70:37436",
    "https://123.188.6.176:1133",
    "https://112.195.51.225:61234",
    "https://112.193.131.17:8118",
    "https://221.234.250.204:8010",
    "https://49.79.67.119:61234",
    "https://220.184.215.223:6666",
    "https://180.121.134.176:808",
    "https://122.246.48.118:8010",
    "https://119.7.59.13:61234",
    "https://27.54.248.42:8000",
    "https://59.32.37.99:8010",
    "https://220.191.100.253:6666",
    "https://112.193.70.85:61234",
    "https://60.167.128.91:48963",
    "https://119.4.70.128:61234",
    "https://182.88.166.148:8123",
    "https://113.117.65.112:61234",
    "https://115.226.129.195:61234",
    "https://106.75.71.122:80",
    "https://125.122.171.167:6666",
    "https://125.118.144.247:6666",
    "https://60.184.173.221:8070",
    "https://60.190.250.120:8080",
    "https://36.6.146.199:47025",
    "https://106.56.102.78:808",
    "https://119.7.225.218:61234",
    "https://583349285:[email protected]:16816"
  ]
}
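
For these pools to be used, the downloader middlewares implemented in step 2 must also be enabled in settings.py. A minimal sketch, assuming the middleware classes live in myproject/middlewares.py (the package name and the priority values are placeholders, not requirements):

# Enable the custom downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentDownloaderMiddleware': 543,
    'myproject.middlewares.RandomProxyDownloaderMiddlewareNew': 544,
}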

2. Implement the downloader middlewares in middlewares.py

"""
实现随机的User-Agent
思路:
1. 准备User-Agent列表, 在settings.py中
2. 实现一个下载器中间件, 实现process_request方法
3. 在process_request方法中, 从User-Agent中, 随机取出一个User-Agent
4. 把这个User-Agent设置给requests的headers
"""

from myproject.settings import USER_AGENT_LIST  # replace "myproject" with your project package
import random

class RandomUserAgentDownloaderMiddleware(object):
    """Downloader middleware that sets a random User-Agent."""

    def process_request(self, request, spider):
        # 3. Pick a random User-Agent from the list
        user_agent = random.choice(USER_AGENT_LIST)
        # 4. Set it on the request's headers
        request.headers['User-Agent'] = user_agent
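
A quick way to check that the rotation works is to request a page that echoes the User-Agent it receives. A minimal sanity-check sketch, assuming the middleware above is enabled; the spider name and the use of httpbin.org are illustrative choices only:

import scrapy

class UACheckSpider(scrapy.Spider):
    """Hypothetical spider that logs the User-Agent seen by the server."""
    name = 'ua_check'

    def start_requests(self):
        # Issue several identical requests; dont_filter=True bypasses
        # Scrapy's duplicate filter so all of them are actually sent
        for _ in range(5):
            yield scrapy.Request('https://httpbin.org/user-agent',
                                 dont_filter=True, callback=self.parse)

    def parse(self, response):
        # httpbin echoes the received User-Agent back as JSON
        self.logger.info(response.text)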

"""
随机的代理IP
旧版实现方式
思路:
1. 准备代理IP的列表, 在settings.py文件中
   # 注: 如果是真实开发时候,代理IP列表存储数据库中, 比如:Redis
2. 实现随机的代理IP的下载器中间件, 实现process_request方法
3. 随机从列表中取出一个代理IP
4. 把这个代理IP设置给request.meta['proxy'] 
5. 如果带有用户名和密码的代理, 那么就需要对用户名和密码进行认证.  


新版实现方式


"""

from myproject.settings import PROXY_LIST_OLD  # replace "myproject" with your project package
from base64 import b64encode

class RandomProxyDownloaderMiddlewareOld(object):
    """Downloader middleware that sets a random proxy from the old-style list."""

    def process_request(self, request, spider):
        # Pick a random proxy
        proxy = random.choice(PROXY_LIST_OLD)
        # Assign the proxy address (scheme://ip:port) to the request
        request.meta['proxy'] = proxy['ip_port']
        # If the proxy carries a username and password, authenticate explicitly
        user_pwd = proxy.get('user_pwd')
        if user_pwd:
            # 1. Base64-encode the "username:password" string
            user_pwd = b64encode(user_pwd.encode()).decode()
            # 2. Send the credentials via the Proxy-Authorization header
            request.headers['Proxy-Authorization'] = "Basic " + user_pwd
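
As the docstring notes, real projects usually keep the proxy list in a database such as Redis rather than hard-coding it in settings.py. A minimal sketch of that variant, assuming the redis-py package, a running Redis server, and a Redis set named 'proxy_pool' holding entries like 'http://ip:port' (all of these are assumptions, not part of the original code):

import redis

class RandomRedisProxyDownloaderMiddleware(object):
    """Hypothetical middleware that draws a random proxy from a Redis set."""

    def __init__(self):
        # Connection parameters are placeholders; adjust for your deployment
        self.client = redis.Redis(host='localhost', port=6379, db=0)

    def process_request(self, request, spider):
        # SRANDMEMBER returns a random member of the set (bytes), or None if empty
        proxy = self.client.srandmember('proxy_pool')
        if proxy:
            request.meta['proxy'] = proxy.decode()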


# New-style random proxy middleware

from myproject.settings import PROXIES_NEW  # replace "myproject" with your project package

class RandomProxyDownloaderMiddlewareNew(object):
    """Downloader middleware that picks a proxy matching the request's URL scheme."""

    def process_request(self, request, spider):
        # 1. Extract the URL scheme of the request (http or https)
        scheme = request.url.split('://')[0]
        # 2. Look up the proxy list for that scheme
        proxies = PROXIES_NEW.get(scheme)
        # 3. Only set a proxy when a matching list exists
        if proxies:
            # Pick a random proxy from the list
            request.meta['proxy'] = random.choice(proxies)
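
Note that with the new-style pool no manual Proxy-Authorization header is needed: for entries like 'http://583349285:[email protected]:16816', Scrapy's built-in HttpProxyMiddleware extracts the credentials from the proxy URL and sets the header itself.

Free proxies also die frequently, so a common extension is to retire a proxy once a request through it fails. A minimal sketch of that idea via process_exception (the in-place removal policy is an assumption, not part of the original approach):

class RetiringProxyDownloaderMiddleware(RandomProxyDownloaderMiddlewareNew):
    """Hypothetical extension that drops a proxy from the pool after a failure."""

    def process_exception(self, request, exception, spider):
        failed = request.meta.get('proxy')
        scheme = request.url.split('://')[0]
        proxies = PROXIES_NEW.get(scheme)
        if failed and proxies and failed in proxies:
            # Remove the failed proxy so it is not picked again
            proxies.remove(failed)
        # Return None so other middlewares (e.g. retry) keep processing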
