1、采集地址:http://www.xicidaili.com/nn/
2、根据实际需求修改验证地址
3、免费代理不稳定,使用时建议增加重试机制,并在失败时切换代理
# coding: utf-8
import urllib2
from bs4 import BeautifulSoup
import urllib
import socket
import threading
# Browser-like User-Agent string placed in the request headers below.
User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
# Header dict passed to urllib2.Request when fetching the listing pages.
header = {'User-Agent': User_Agent}
def getProxyIp():
    """Scrape candidate proxies from the xicidaili listing pages.

    Requests pages 1 through 59 of http://www.xicidaili.com/nn/ with the
    module-level ``header`` dict and parses every table row after the first
    one on each page.

    Returns:
        list: one tab-separated string per row, built from the text of the
        row's cells 1, 2 and 5. The validators in this file read those
        fields as host, port and protocol.

    Scraping is best effort: a page that cannot be fetched or parsed is
    skipped, and a row that does not have the expected cells is skipped
    without discarding the remaining rows of that page.
    """
    proxy = []
    for page in range(1, 60):
        url = 'http://www.xicidaili.com/nn/' + str(page)
        try:
            req = urllib2.Request(url, headers=header)
            response = urllib2.urlopen(req)
            try:
                res = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
            soup = BeautifulSoup(res, 'html5lib')
            rows = soup.findAll('tr')
        except Exception:
            # Skip unreachable/unparseable pages. Catching Exception rather
            # than using a bare except lets Ctrl-C and SystemExit stop the
            # scrape instead of merely skipping one page.
            continue
        # The first <tr> is skipped -- presumably the table header row.
        for row in rows[1:]:
            tds = row.findAll("td")
            try:
                ip_temp = (tds[1].contents[0] + "\t" + tds[2].contents[0]
                           + "\t" + tds[5].contents[0])
            except (IndexError, TypeError):
                # Too few cells, an empty cell, or a cell whose first child
                # is not plain text: ignore this row only.
                continue
            proxy.append(ip_temp)
    return proxy
# Output file for proxies that pass validation. It is opened (and truncated)
# as soon as this module-level line runs; validateIp and validateIp2 write to
# it, and validateIp / validateIpThread close it when they finish.
f = open("ip.txt", "w")
def validateIp(proxy):
    """Sequentially check each proxy and record the ones that respond.

    Args:
        proxy: iterable of tab-separated strings whose first two fields are
            the proxy host and port (the format produced by getProxyIp).

    For every entry the check URL is fetched through the proxy. Entries
    whose request completes are written to the module-level file ``f`` (one
    per line) and printed; entries that fail for any reason are skipped.
    ``f`` is closed before returning, even if the loop is interrupted.

    Side effect: sets the process-wide default socket timeout to 3 seconds.
    """
    url = "http://ip.chinaz.com/getip.aspx"
    socket.setdefaulttimeout(3)
    try:
        for entry in proxy:
            try:
                ip = entry.strip().split("\t")
                proxy_host = "http://" + ip[0] + ":" + ip[1]
                # urllib picks a proxy by the lower-case scheme of the URL
                # being requested, so the key must be "http" to match the
                # check URL. Keying by the scraped protocol text (e.g.
                # "HTTP") never matches, and the request silently bypasses
                # the proxy.
                proxy_temp = {"http": proxy_host}
                response = urllib.urlopen(url, proxies=proxy_temp)
                try:
                    response.read()
                finally:
                    response.close()
            except Exception:
                # Dead, slow or malformed proxy entry: move on to the next.
                continue
            f.write(entry + '\n')
            print(entry)
    finally:
        f.close()
# Lock taken by validateIp2 around its print and file-write calls.
lock = threading.Lock()
def validateIpThread(max_threads=200):
    """Validate every entry of the global ``proxy`` list with worker threads.

    Args:
        max_threads: upper bound on the number of validateIp2 threads alive
            at once. The list is processed in consecutive batches of this
            size, so a large scrape does not start thousands of threads
            simultaneously. Values below 1 are treated as 1.

    Each worker runs ``validateIp2(i)`` for one index of ``proxy``. The
    module-level file ``f`` is closed once all workers have finished, and
    also when starting or joining threads is interrupted.
    """
    batch_size = max(1, max_threads)
    total = len(proxy)
    try:
        for start in range(0, total, batch_size):
            threads = []
            try:
                for i in range(start, min(start + batch_size, total)):
                    thread = threading.Thread(target=validateIp2, args=[i])
                    thread.start()
                    # Track only threads that actually started, so the
                    # join below never sees an unstarted thread.
                    threads.append(thread)
            finally:
                # Wait for the batch even if start() failed part-way, so no
                # worker is still writing when the file gets closed.
                for thread in threads:
                    thread.join()
    finally:
        f.close()
def validateIp2(i):
    """Check the proxy at index ``i`` of the global ``proxy`` list.

    Args:
        i: index into the module-level ``proxy`` list; the entry is a
            tab-separated string whose first two fields are host and port.

    The validation URL is fetched through the proxy. If the response body
    is at least 10000 bytes long, the entry is printed as OK and appended
    to the module-level file ``f``; otherwise, or on any error, the entry
    is printed together with the error. Output and the file write happen
    while holding the module-level ``lock``.

    Side effect: sets the process-wide default socket timeout to 5 seconds.
    """
    socket.setdefaulttimeout(5)
    # Validation address
    url = "http://www.biquge.com.tw/17_17281/7647045.html"
    error = None
    try:
        ip = proxy[i].strip().split("\t")
        proxy_host = "http://" + ip[0] + ":" + ip[1]
        proxy_temp = {"http": proxy_host}
        proxy_support = urllib2.ProxyHandler(proxy_temp)
        opener = urllib2.build_opener(proxy_support)
        opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64)")]
        # Use this thread's own opener directly. install_opener() replaces
        # a process-global opener, so concurrent workers would send their
        # request through whichever proxy was installed last.
        response = opener.open(url)
        try:
            res = response.read()
        finally:
            response.close()
        # Short bodies are rejected -- presumably an error or block page
        # rather than the real page; confirm the threshold if the
        # validation URL changes.
        if (len(res) < 10000):
            raise RuntimeError('return error')
    except Exception as e:
        error = e
    # Report outside the try block and with a context manager: the lock is
    # always released, and a failure while printing or writing can no
    # longer re-enter an except handler that tries to take the same
    # non-reentrant lock.
    with lock:
        if error is None:
            print(proxy[i], 'is OK')
            f.write('%s\n' % str(proxy[i]))
        else:
            print(proxy[i], error)
if __name__ == '__main__':
    # Bind the scraped list to the module-level name `proxy`, which
    # validateIpThread and validateIp2 read as a global.
    proxy = getProxyIp()
    # Sequential alternative to the threaded validation:
    # validateIp(proxy)
    validateIpThread()
下载链接:https://download.csdn.net/download/u012795120/10508293