Proxy filtering & proxy scraping: Max retries exceeded with URL

The requests error "Max retries exceeded with url" is usually caused by opening too many connections without closing them, which exhausts the connection pool. The commonly suggested fix is to disable keep-alive on the session:

sess = requests.session()
sess.keep_alive = False
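
Note that Session has no documented keep_alive attribute, so on current versions of requests the assignment above is likely a no-op. The portable way to disable keep-alive is the Connection header; a one-line sketch:

sess.headers['Connection'] = 'close'  # ask the peer to close the connection after each response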

You can also configure the pool size and retry count yourself:

# Applies globally (affects adapters created afterwards)
requests.adapters.DEFAULT_RETRIES = 2
requests.adapters.DEFAULT_POOLSIZE = 100
# Per-session configuration
s = requests.Session()
adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
s.mount('http://', adapter)
s.mount('https://', adapter)
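
If you want retries with backoff instead of the flat DEFAULT_RETRIES counter, urllib3's Retry can be mounted the same way; a minimal sketch (requests pulls in urllib3 as a dependency, and the parameter values here are illustrative, not tuned):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

s = requests.Session()
retry = Retry(
    total=2,                                # at most two retries per request
    backoff_factor=0.5,                     # sleep 0.5s, 1s, ... between attempts
    status_forcelist=[500, 502, 503, 504],  # also retry on these status codes
)
adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=retry)
s.mount('http://', adapter)
s.mount('https://', adapter)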

 

Filtering proxies and scraping proxy lists:

import gevent.monkey; gevent.monkey.patch_all()

import random
import re

import requests
import requests.adapters
from gevent.pool import Pool
from lxml import etree

USER_AGENT_LIST = [
    'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
    'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
    'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
    'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
    'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
    'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
    'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
    'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)',
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
]

requests.adapters.DEFAULT_RETRIES = 2
requests.adapters.DEFAULT_POOLSIZE = 100
dict_proxy = {"http": list(), "https": list()}  # raw proxies scraped, keyed by protocol
filter_proxy = list()                           # proxies that passed the check, with latency
p = Pool(20)                                    # gevent pool: at most 20 concurrent greenlets
test_proxy_url = 'http://www.baidu.com'
test_proxy_headers = dict()
test_proxy_headers['Referer'] = test_proxy_url
test_proxy_headers['Host'] = 'www.baidu.com'
filter_sess = requests.Session()
filter_sess.keep_alive = False
filter_sess.headers['Connection'] = 'close'     # disable keep-alive (see note above)


# Check one proxy: fetch the test URL through it and record the response time.
# Named check_proxy rather than filter to avoid shadowing the builtin.
def check_proxy(proxy, protocol):
    global test_proxy_headers
    global filter_proxy
    # Rotate the UA; gevent switches only at I/O, so the shared dict is not mutated mid-request.
    test_proxy_headers['User-Agent'] = random.choice(USER_AGENT_LIST)
    proxies = {protocol: proxy}
    try:
        res = filter_sess.get(test_proxy_url, proxies=proxies, headers=test_proxy_headers,
                              timeout=5, stream=True)
        print(proxies, res.elapsed)
        # total_seconds() covers the whole delay; .microseconds alone would drop the seconds part
        filter_proxy.append({"elapsed": res.elapsed.total_seconds(),
                             "protocol": protocol, "proxy": proxy})
        res.close()
    except requests.Timeout as e:
        print('Timeout:', proxy, e)
    except Exception as e:
        print('->>>>>>>>>>>> proxy:', proxy, 'exception:', e)



def getproxys(offset):
    global dict_proxy
    url = 'https://www.kuaidaili.com/free/inha/'
    reg_host = re.compile(r'https?://(.*?)/')
    reg_match = reg_host.match(url)
    heads = {}
    heads['Referer'] = url
    heads['Host'] = reg_match.group(1)
    bContinue = True
    counts = 0
    proxies = dict()
    with requests.session() as s:
        s.keep_alive = False
        while bContinue:
            try:
                heads['User-Agent'] = random.choice(USER_AGENT_LIST)
                # Go direct first; fall back to an already-harvested proxy after a failure.
                if proxies.get('http') is None and proxies.get('https') is None:
                    r = s.get(url + str(offset), headers=heads)
                else:
                    r = s.get(url + str(offset), headers=heads, proxies=proxies)
                # print("response done : ", r.url, r.reason, r.status_code, proxies)
                r.raise_for_status()

                r.encoding = r.apparent_encoding
                html = etree.HTML(r.text)
                trs = html.xpath(".//tbody/tr")
                for t in trs:
                    ip = t.xpath("./td[1]/text()")[0]
                    port = t.xpath("./td[2]/text()")[0]
                    protocol = t.xpath("./td[4]/text()")[0]
                    protocol_str = str(protocol).lower()
                    proxy_str = protocol_str + "://" + ip + ":" + port
                    dict_proxy[protocol_str].append(proxy_str)
                    p.spawn(check_proxy, proxy_str, protocol_str)

                bContinue = False
                print('---------> ' + r.url + " done")

            except requests.exceptions.RequestException as e:
                # gevent.sleep(2)
                counts += 1
                if counts >= 3:  # give up on this page after three failed attempts
                    break
                # Retry through a random proxy harvested so far, if any.
                proxies.clear()
                if len(dict_proxy['http']) > 0:
                    proxies['http'] = random.choice(dict_proxy['http'])
                if len(dict_proxy["https"]) > 0:
                    proxies["https"] = random.choice(dict_proxy['https'])






# Scrape list pages 20-29; join() waits for the page fetches and all
# proxy checks they spawned into the same pool.
for i in range(20, 30):
    p.spawn(getproxys, i)
p.join()

for k in dict_proxy:
    print(k, len(dict_proxy[k]))

# Proxies that passed the check, fastest first
for i in sorted(filter_proxy, key=lambda d: d['elapsed']):
    print(i)
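
The sorted output makes it easy to reuse the fastest working proxy for real requests; a minimal usage sketch (the target URL is just a placeholder):

if filter_proxy:
    best = min(filter_proxy, key=lambda d: d['elapsed'])  # lowest latency wins
    proxies = {best['protocol']: best['proxy']}
    r = requests.get('http://www.baidu.com', proxies=proxies, timeout=5)
    print(r.status_code, best['proxy'])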

 
