Python中常见的添加IP代理简单介绍

文章摘要:

常用代理有:

1、购买的动态IP隧道:比如阿布云动态隧道。这类服务不会返回具体的代理IP,而是由隧道服务器代为转发我们的请求,再把目标网站的响应返回给我们;

2、私密代理IP:调用接口后能够拿到具体的代理IP(有时效限制),我们先用这些IP构造代理池,再通过代理池发起请求;

3、自己通过抓取免费代理IP,构造自己的IP代理池,有兴趣请移步:https://blog.csdn.net/Owen_goodman/article/details/100074822

常见代理使用场景:

1、requests脚本:get/post请求

2、scrapy:get/post 请求,在中间件添加代理

3、自动化脚本:selenium+webdriver+代理

详细介绍:

一、在requests库中代理的使用

  •  阿布云动态隧道
# Self-contained example: send GET/POST requests through the Abuyun dynamic
# tunnel with the `requests` library.
import time

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}

# Abuyun tunnel proxy server.
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"

# Tunnel credentials.
proxyUser = "****"  # your tunnel licence (user name)
proxyPass = "****"  # your tunnel secret key (password)

# Proxy URL with the credentials embedded: http://user:pass@host:port
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}

# Send both http:// and https:// targets through the same tunnel.
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}

# Short pause kept from the original snippet (presumably to throttle requests
# when this code runs in a loop).
time.sleep(0.1)

# requests needs an explicit scheme; a bare 'baidu.com' raises MissingSchema.
url = 'http://baidu.com'

# GET request through the tunnel.
response = requests.get(url=url, proxies=proxies, headers=headers)

# A POST request is proxied in exactly the same way.
data = {}
response = requests.post(url=url, proxies=proxies, headers=headers, data=data)
  • 私密代理IP(个人使用的蜻蜓代理)
class dandd():
    """Demo: fetch private proxies from a proxy API and send requests through one."""

    def spider(self):
        """Download the proxy list, pick one proxy at random and use it.

        The raw API response is saved to ``./ip.txt`` and the number of usable
        proxies is stored in ``self.count``.

        Raises:
            RuntimeError: if the API response contains no proxy entries.
        """
        import random

        import requests

        # Replace with the API address generated in the provider's console.
        targetUrl = "***"
        resp = requests.get(targetUrl)
        print(resp.status_code)
        print(resp.text)

        # Save the fetched list. The original opened the file with "w+" and
        # read it back at once, which truncated it and always gave an empty list.
        with open("./ip.txt", "w") as f:
            f.write(resp.text)

        # One "ip:port" entry per line. Blank lines are dropped (the original
        # took every second line, presumably to skip them).
        ipList = [line.strip() for line in resp.text.splitlines() if line.strip()]
        self.count = len(ipList)
        if not ipList:
            raise RuntimeError("proxy API returned no IP addresses")

        ip = random.choice(ipList)
        proxies = {
            "http": 'http://' + ip,
        }
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        }
        # requests needs an explicit scheme; a bare 'baidu.com' raises MissingSchema.
        url = 'http://baidu.com'
        res = requests.get(url=url, proxies=proxies, headers=headers)
        res = requests.post(url=url, proxies=proxies, headers=headers)

if __name__ == '__main__':
    # Run the private-proxy demo once when executed as a script.
    dandd().spider()

二、在scrapy框架中代理的使用

  • 重写start_requests方法
from requests import Request


def start_requests(self, *args):
    """Yield the spider's initial request, routed through a proxy.

    Meant to be used as a Scrapy spider's ``start_requests`` method; the
    response is passed to ``self.parse``.
    """
    # Scrapy's own Request class is required here: it accepts callback, meta
    # and dont_filter, which requests.Request does not. The local import
    # shadows any module-level `from requests import Request`.
    from scrapy import Request

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }
    # Scrapy rejects URLs without a scheme.
    url = 'http://baidu.com'
    # `ip` is an "ip:port" string that the surrounding code must supply (for
    # example one picked from the private-proxy list). Scrapy expects
    # meta['proxy'] to be a single proxy URL string, not a requests-style dict.
    proxy = 'http://' + ip.strip()
    # A POST request (e.g. scrapy.FormRequest) takes the same meta.
    yield Request(
        url,
        callback=self.parse,
        dont_filter=True,
        headers=headers,
        meta={'proxy': proxy},
    )
  • 在下载中间件(Downloader Middleware)中添加代理
# 这里是阿布云代理
# 代理服务器
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


import base64


def basic_auth_header(user, password):
    """Return the HTTP ``Basic`` credential value for *user* and *password*.

    Basic authentication uses the standard Base64 alphabet. The URL-safe
    variant replaces ``+`` and ``/`` with ``-`` and ``_``, which corrupts any
    credential whose encoding contains those characters.
    """
    raw = ("%s:%s" % (user, password)).encode("utf-8")
    return "Basic " + base64.b64encode(raw).decode("ascii")


# Abuyun dynamic tunnel endpoint.
proxyServer = "http://http-dyn.abuyun.com:9020"
# Tunnel credentials.
proxyUser = "****"
proxyPass = "****"

# Value for the Proxy-Authorization request header.
# (Python 2 equivalent: "Basic " + base64.b64encode(proxyUser + ":" + proxyPass))
proxyAuth = basic_auth_header(proxyUser, proxyPass)

class ProxyMiddleware(object):
    """Scrapy downloader middleware that sends requests through the Abuyun tunnel.

    The original class defined ``process_request`` twice ("option 1": proxy
    every spider, "option 2": proxy only named spiders). Python keeps only the
    last definition, so option 1 was dead code. Both options are now served by
    one method, selected through ``proxied_spiders``.
    """

    # Names of the spiders whose requests are proxied.
    # Set to None to proxy every spider (the former "option 1").
    proxied_spiders = ("name1", "name2", "name3")

    def process_request(self, request, spider):
        """Attach the tunnel proxy and its credentials to *request*.

        Returns None in every case so that Scrapy continues processing the
        request normally.
        """
        allowed = self.proxied_spiders
        if allowed is not None and spider.name not in allowed:
            return None
        request.meta["proxy"] = proxyServer
        request.headers["Proxy-Authorization"] = proxyAuth
        return None
# 这里是蜻蜓代理
# 下面示例是写在中间件里,然后在settings里开启这个中间件
class qingTingMiddleware(object):
    """Scrapy downloader middleware that rotates proxies fetched from a proxy API.

    Write this class in the project's middlewares module and enable it in the
    settings. A batch of "ip:port" entries is downloaded from ``api_url``,
    handed out one per request, and replaced by a fresh batch once exhausted.

    Fixes over the original: the ``es``/``res`` typo, the ``getIp``/``getIP``
    name mismatch, the duplicated ``process_request`` and the fragile re-read
    of ``./ip.txt``. The batch is now kept in memory; the file is still written
    so other tools can read it.
    """

    num = 1  # unused; kept so existing references keep working
    count = 0  # number of proxies in the current batch
    index = 0  # position of the next proxy to hand out
    now = 0  # time.time() of the last API call (0 means "never called")

    api_url = "your api"  # proxy-list endpoint; replace with your own
    min_refresh_interval = 6  # minimum number of seconds between API calls
    ip_file = "./ip.txt"  # where the raw API response is saved
    # Names of the spiders whose requests are proxied; None proxies every spider.
    proxied_spiders = ("name1", "name2", "name3")

    _pool = ()  # "ip:port" strings of the current batch

    def _refresh_pool(self):
        """Download a new proxy batch, waiting first if the API was called too recently.

        Raises:
            RuntimeError: if the response contains no ``ip:port`` entry.
        """
        import re
        import time

        import requests

        # Keep calls to the API at least `min_refresh_interval` seconds apart.
        wait = self.min_refresh_interval - (time.time() - self.now)
        if wait > 0:
            time.sleep(wait)
        self.now = time.time()

        print("重新调用IP")
        res = requests.get(url=self.api_url)
        res.encoding = "utf-8"
        with open(self.ip_file, "w", encoding="utf-8") as f:
            f.write(res.text)

        pool = re.findall(r'(\d+\.\d+\.\d+\.\d+:\d+)', res.text)
        if not pool:
            raise RuntimeError("proxy API returned no usable ip:port entries")
        self._pool = pool
        self.count = len(pool)
        self.index = 0

    def getIP(self):
        """Return the next proxy as an ``http://ip:port`` URL.

        A new batch is fetched on the first call and whenever every proxy of
        the current batch has been handed out.
        """
        if self.index >= len(self._pool):
            self._refresh_pool()
        ip = self._pool[self.index]
        self.index += 1
        return 'http://' + ip

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and hook ``spider_opened`` to Scrapy's signal."""
        from scrapy import signals

        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']`` for spiders selected by ``proxied_spiders``.

        Returns None in every case so that Scrapy continues processing the
        request normally.
        """
        allowed = self.proxied_spiders
        if allowed is not None and spider.name not in allowed:
            return None
        request.meta['proxy'] = self.getIP()
        return None

    def process_response(self, request, response, spider):
        """Pass the downloaded response through unchanged."""
        # Must return a Response object, return a Request object,
        # or raise IgnoreRequest.
        return response

    def process_exception(self, request, exception, spider):
        """Leave exception handling to the other middlewares."""
        # Returning None continues the process_exception() chain.
        # NOTE: a retry added on 2020-05-26 (returning `request` on
        # TimeoutError) was removed again because it kept dead connections
        # retrying for more than ten minutes.
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)

三、自动化工具selenium

  • 直接放在中间件中
  • 写在请求中
    def parse(self, response):
        """Load a page in Chrome through a randomly chosen proxy and parse its HTML.

        ``Options``, ``webdriver`` and ``etree`` must be imported by the
        enclosing module (presumably from selenium and lxml).
        """
        import random

        url = "http://.aspx"  # placeholder; replace with the real page address
        chrome_options = Options()

        # Sample "ip:port" proxies; replace them with live ones. The original
        # had this assignment commented out, so `proxies` was undefined below.
        proxies = random.choice([
            "116.239.105.250:40049",
            "117.26.88.235:23525",
            "60.182.178.192:30221",
            "123.163.184.232:43565",
            "113.120.62.57:43358",
            "1.199.187.37:41380",
            "117.87.139.65:49842",
            "113.128.26.228:31984",
            "125.117.146.134:48840",
            "113.120.63.82:42216",
        ])

        # Route the browser's traffic through the chosen proxy.
        chrome_options.add_argument('--proxy-server=%s' % proxies)

        # chrome_options.add_argument('--headless')  # headless mode
        chrome_options.add_argument('--disable-gpu')  # work around a bug mentioned in Google's docs
        chrome_options.add_argument('--no-sandbox')  # disable Chrome's sandbox
        chrome_options.add_argument("--test-type")
        chrome_options.add_argument(
            'user-agent="MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"')

        # `options=` replaces the deprecated `chrome_options=` keyword.
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)
            page = driver.page_source
        finally:
            # Always close the browser, otherwise every call leaks a Chrome process.
            driver.quit()

        # Parse the rendered HTML into an element tree for later extraction.
        res = etree.HTML(page)

 

你可能感兴趣的:(Python,代理ip)