scrapy中自定义下载中间件设置动态User-Agent和代理ip

'''
scrapy 自定义下载中间件
动态设置User-Agent
'''

import random

class RandomUserAgent:
    """Downloader middleware that gives each request a random User-Agent.

    The pool of candidate strings is read from the ``USER_AGENTS`` setting
    when the middleware is built through :meth:`from_crawler`.
    """

    def __init__(self, agents):
        """Store the pool of User-Agent strings.

        Args:
            agents: Sequence of User-Agent strings to pick from. May be
                empty, in which case requests are left untouched.
        """
        self.agents = agents

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware from ``crawler.settings``.

        Args:
            crawler: Object exposing ``settings.getlist``; the value of the
                ``USER_AGENTS`` setting becomes the User-Agent pool.

        Returns:
            A new instance of this middleware.
        """
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        """Set a random ``User-Agent`` header if the request has none yet.

        ``setdefault`` is used, so a ``User-Agent`` header that is already
        present on the request is kept as-is.

        Args:
            request: Request whose ``headers`` mapping is updated in place.
            spider: Unused; part of the middleware method signature.

        Returns:
            None.
        """
        # random.choice raises IndexError on an empty sequence, so an
        # unset/empty USER_AGENTS setting must not break every request.
        if self.agents:
            request.headers.setdefault('User-Agent', random.choice(self.agents))


'''
动态设置代理ip
'''
class RandomProxy:
    """Downloader middleware that routes each request through a random proxy.

    The pool of proxies is read from the ``IPLIST`` setting when the
    middleware is built through :meth:`from_crawler`.
    """

    def __init__(self, iplist):
        """Store the pool of proxies.

        Args:
            iplist: Sequence of proxy entries to pick from. May be empty,
                in which case requests are left untouched.
                NOTE(review): entries are presumably full proxy URLs such
                as ``http://host:port`` -- confirm against the settings.
        """
        self.iplist = iplist

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware from ``crawler.settings``.

        Args:
            crawler: Object exposing ``settings.getlist``; the value of the
                ``IPLIST`` setting becomes the proxy pool.

        Returns:
            A new instance of this middleware.
        """
        return cls(crawler.settings.getlist('IPLIST'))

    def process_request(self, request, spider):
        """Store a randomly chosen proxy in ``request.meta['proxy']``.

        Any ``proxy`` value already present in ``request.meta`` is
        overwritten when the pool is non-empty.

        Args:
            request: Request whose ``meta`` mapping is updated in place.
            spider: Unused; part of the middleware method signature.

        Returns:
            None.
        """
        # random.choice raises IndexError on an empty sequence, so an
        # unset/empty IPLIST setting must not break every request.
        if self.iplist:
            request.meta['proxy'] = random.choice(self.iplist)

# 在settings.py中设置USER_AGENTS和IPLIST,并在DOWNLOADER_MIDDLEWARES中激活这两个中间件。

你可能感兴趣的:(python,爬虫,scrapy)