Scrapy: adding a random User-Agent and a proxy in a downloader middleware

The middleware below sets a random User-Agent (via fake_useragent) on every outgoing request; when a response comes back with a non-200 status, it fetches a proxy from a local proxy pool and reschedules the request through that proxy.

import logging

import requests
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    """Downloader middleware: random User-Agent per request, proxy retry on failure."""

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        # RANDOM_UA_TYPE selects which fake_useragent attribute to use,
        # e.g. 'random', 'chrome', 'firefox'; defaults to 'random'.
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())

    def process_response(self, request, response, spider):
        logger = logging.getLogger(__name__)
        if response.status != 200:
            # Fetch a fresh proxy from the local proxy pool and
            # reschedule the same request through it.
            logger.debug('Got status %s, retrying with a proxy', response.status)
            proxy_addr = requests.get('http://127.0.0.1:5555/random').text
            logger.debug(proxy_addr)
            request.meta['proxy'] = 'http://{0}'.format(proxy_addr)
            # Bypass the dupefilter so the rescheduled request is not dropped.
            request.dont_filter = True
            return request
        return response
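
The proxy address comes from a proxy-pool service assumed to be running locally at http://127.0.0.1:5555/random and to return a bare ip:port string in plain text. A quick sanity check of that assumption before launching the spider:

import requests

# Should print something like '123.45.67.89:8080'; the URL and the
# response format are assumptions about the local proxy-pool service.
print(requests.get('http://127.0.0.1:5555/random').text)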
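
For the middleware to take effect it has to be registered in the project's settings.py. A minimal sketch, assuming the class lives in a module named middlewares.py inside a project package called myproject (both names, and the priority 543, are placeholders):

# settings.py (sketch: 'myproject' and the priority 543 are assumptions)
DOWNLOADER_MIDDLEWARES = {
    # Disable Scrapy's built-in user-agent middleware so it does not
    # overwrite the header set by RandomUserAgentMiddleware.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
}

# fake_useragent attribute to use: 'random', 'chrome', 'firefox', ...
RANDOM_UA_TYPE = 'random'

Disabling the stock UserAgentMiddleware matters: otherwise it would reapply the USER_AGENT setting and undo the randomization.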
