Random request headers and random proxy IPs in Scrapy

Step 1: configure settings
#In DOWNLOADER_MIDDLEWARES, disable Scrapy's built-in user-agent middleware by setting it to None

	DOWNLOADER_MIDDLEWARES = {
	    'jobboleSpider.middlewares.RandomUserAgentMiddleware': 543,
	    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
	}

#Choose a random browser user-agent

RANDOM_UA_TYPE = 'random'
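
The UserAgent() object used by the middleware below comes from the fake-useragent package (an assumption based on the call; install it with pip install fake-useragent). A quick sanity check of what RANDOM_UA_TYPE selects:

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)   # a random browser User-Agent string (the 'random' type)
print(ua.chrome)   # a Chrome User-Agent; RANDOM_UA_TYPE picks which attribute is read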

Step 2:
Override the User-Agent handling in a downloader middleware

from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    '''
    Swap in a random User-Agent for every request
    '''
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):

        def get_ua():
            # Read the configured attribute (random/chrome/firefox/...) from the UserAgent object
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())
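
A minimal way to verify the rotation (just a sketch; the spider name is made up): request httpbin.org/user-agent, which echoes back the User-Agent header it received.

import scrapy

class UACheckSpider(scrapy.Spider):
    name = 'ua_check'
    start_urls = ['http://httpbin.org/user-agent']

    def parse(self, response):
        # httpbin echoes the User-Agent it received, so the log shows what was actually sent
        self.logger.info(response.text)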

Adding a proxy in Scrapy:

In the middleware, set request.meta["proxy"] = "http://222.162.172.38:8060"
To check that the proxy is actually in use, visit http://httpbin.org/ip after configuring it; once you see the returned IP, you can also visit www.ip138.com to confirm which IP the target site sees.
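
The same check can be done outside Scrapy with plain requests (the proxy address is just the example above and will likely be dead by now):

import requests

proxies = {"http": "http://222.162.172.38:8060"}
response = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10)
print(response.text)  # the "origin" field should be the proxy's IP, not your own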

Step 1:
Create a tool package:
the logic that dynamically pulls a proxy from the IP pool stored in the database


import requests

# The cursor used below is assumed to come from a MySQL connection (e.g. pymysql)
# set up elsewhere in the tool module; delete_ip() is assumed to be defined on the
# class as well (both are sketched after this code).


class GetIp(object):

    def judge_ip(self, ip, port, ip_type):
        # Check whether the proxy is usable by requesting a known page through it
        http_url = "http://www.baidu.com"
        proxy_url = '{2}://{0}:{1}'.format(ip, port, ip_type)
        try:
            proxy_dict = {
                "http": proxy_url
            }
            response = requests.get(http_url, proxies=proxy_dict)
        except Exception as e:
            print(e)
            print("invalid ip")
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if 200 <= code < 300:
                print("valid ip")
                return True
            else:
                print("invalid ip")
                self.delete_ip(ip)
                return False

    def get_random_ip(self):
        # Randomly pick one proxy from the database and verify it before returning
        random_sql = """
            SELECT ip, port, ip_type FROM dl_ip
            ORDER BY RAND()
            LIMIT 1
        """
        cursor.execute(random_sql)
        for ip_info in cursor.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            ip_type = ip_info[2]

            judge_re = self.judge_ip(ip, port, ip_type)

            if judge_re:
                return '{2}://{0}:{1}'.format(ip, port, ip_type)
            else:
                # The chosen proxy failed the check; recurse and try another one
                return self.get_random_ip()


if __name__ == '__main__':
    get_ip = GetIp()
    get_ip.get_random_ip()
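
The post omits the database connection behind cursor and the delete_ip() method used above; a minimal sketch of those missing pieces, assuming a pymysql connection and the dl_ip table from the query (names and credentials are placeholders):

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                       db='proxy', charset='utf8')
cursor = conn.cursor()


class GetIp(object):
    # ... judge_ip() and get_random_ip() as shown above ...

    def delete_ip(self, ip):
        # Drop a dead proxy from the pool so it is not picked again
        delete_sql = "DELETE FROM dl_ip WHERE ip = %s"
        cursor.execute(delete_sql, (ip,))
        conn.commit()
        return True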

Step 2:
Use it from a Scrapy downloader middleware
#Fetch a random proxy IP from the proxy pool

from tool.xxxx import GetIp


class RandomProxyMiddleware(object):

    # Set the proxy IP dynamically on every request
    def process_request(self, request, spider):
        get_ip = GetIp()
        request.meta["proxy"] = get_ip.get_random_ip()
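
One small design note (not from the original post): creating GetIp() on every request repeats whatever setup it does; building it once in __init__ is a straightforward alternative:

class RandomProxyMiddleware(object):
    # Variant that reuses a single GetIp instance across requests
    def __init__(self):
        self.get_ip = GetIp()

    def process_request(self, request, spider):
        request.meta["proxy"] = self.get_ip.get_random_ip()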

Step 3:
Enable the proxy middleware in settings, for example:
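
The post doesn't show the exact setting, but assuming the middleware lives in jobboleSpider/middlewares.py like the user-agent one, registering it looks the same (the priority value 544 is an arbitrary choice):

DOWNLOADER_MIDDLEWARES = {
    'jobboleSpider.middlewares.RandomUserAgentMiddleware': 543,
    'jobboleSpider.middlewares.RandomProxyMiddleware': 544,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}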
