网上免费代理有很多,免费的,爬取一下拿来用还是挺不错的。免费的意味着不提供任何服务,能用不能用人家才不管那么多,所以需要筛选一下。这两天研究了一下,整理代码如下:
西刺代理:
#西刺:http://www.xicidaili.com/
#
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import threading
#URLs used to verify that a scraped proxy actually works: httpbin echoes the
#request back, so a 200 reply means the proxy relayed it successfully.
test_http = 'http://httpbin.org/get'
test_https = 'https://httpbin.org/get'
#Browser-like request headers so the proxy-list site is less likely to reject
#the scraper as an obvious bot.
header ={
'Accept':'*/*',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
'Accept-Language':'zh-CN',
'Accept-Encoding':'gzip, deflate',
'Connection': 'Keep-Alive',
'Cache-Control': 'no-cache',
'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.01)'
}
def pandas_to_xlsx(filename, info, sheet_name='西刺-https'):
    """Save the collected proxy records to an .xlsx file.

    Parameters
    ----------
    filename : str
        Path of the Excel file to write.
    info : list[dict]
        One dict per proxy; the dict keys become the spreadsheet columns.
    sheet_name : str, optional
        Worksheet name. Defaults to the previously hard-coded value, so
        existing callers are unaffected.
    """
    pd.DataFrame(info).to_excel(filename, sheet_name=sheet_name)
def getAllHttpsProxy(url):
    """Scrape xicidaili proxy-list pages, appending each proxy record to the
    module-level ``allProxies`` list, then dump everything to an .xlsx file.

    Parameters
    ----------
    url : str
        Page-list base URL; the page number is appended (url + '1', ...).

    Returns
    -------
    list[dict]
        The (global) list of scraped proxies.
    """
    try:
        for page in range(1, 2):
            time.sleep(2)  # be polite: the site blacklists high-frequency clients
            cururl = url + str(page)
            print('正在获取代理信息,网页', cururl)
            webcontent = requests.get(cururl, headers=header)
            if webcontent.status_code != 200:
                print('获取错误网页,错误码:', webcontent.status_code)
                continue
            soup = BeautifulSoup(webcontent.text, 'lxml')
            ip_list = soup.select('#ip_list')
            if not ip_list:
                print('获取错误网页,网页内容:', webcontent.text)
                continue
            for row in ip_list[0].select('tr'):
                # Skip the header row (the one containing the column title '国家').
                if row.text.find('国家') >= 0:
                    continue
                td = row.select('td')
                info = {
                    'ip': td[1].text,
                    'port': td[2].text,
                    '服务器地址': td[3].text,
                    '是否匿名': td[4].text,
                    'type': td[5].text,
                    '存活时间': td[8].text,
                    '验证时间': td[9].text,
                }
                allProxies.append(info)
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
    # BUG FIX: the original called pandas_to_xlsx(allProxies) with a single
    # argument, but pandas_to_xlsx(filename, info) requires two -> TypeError.
    # Supply a filename, matching the kuaidaili script's convention.
    pandas_to_xlsx('所有代理.xlsx', allProxies)
    return allProxies
def TestOneProxy(ip, port, n):
    """Check whether one proxy can relay a plain HTTP request to httpbin.

    Parameters
    ----------
    ip, port : str
        Proxy address parts (both already strings, as scraped).
    n : str
        Worker-thread name, used only to tag log output.

    Returns
    -------
    bool
        True iff httpbin answered 200 through the proxy within 3 seconds.
    """
    proxy = ip + ':' + port
    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }
    try:
        response = requests.get('http://httpbin.org/get', proxies=proxies, timeout=3)
    # BUG FIX: the original `except BaseException` also swallowed
    # KeyboardInterrupt/SystemExit, making Ctrl-C useless while testing.
    # Catch only request-related failures.
    except requests.exceptions.RequestException as e:
        print(n, '--Error', e.args)
        return False
    if response.status_code == 200:
        print(n, '--验证代理通过 ip', ip, ' 端口:', port)
        return True
    print(n, '--验证代理失败 ip', ip, ' 端口:', port)
    return False
# Index of the next proxy to claim; shared by all worker threads and
# protected by the module-level ``lock``.
num = 0
def threadFun(n):
    """Worker loop: repeatedly claim the next untested proxy and test it.

    Exits when every entry in ``allProxies`` has been claimed. Working
    proxies are appended to ``canUseProxies``.

    Parameters
    ----------
    n : str
        Thread name, used only to tag log output.
    """
    global num
    while True:
        # Claim a task under the lock. Using ``with`` guarantees the lock is
        # released on every exit path — the original used manual
        # acquire()/release() and the author lost a day to a missing release
        # before ``break``.
        with lock:
            if num >= len(allProxies):
                break
            curTestProxy = allProxies[num]
            num = num + 1
        # Do the slow network test outside the lock so workers overlap.
        if TestOneProxy(curTestProxy['ip'], curTestProxy['port'], n):
            # list.append is atomic under the GIL, so no extra lock needed.
            canUseProxies.append(curTestProxy)
    print(n, '--运行结束')
if __name__ == '__main__':
    # Shared state: every scraped proxy, and the subset that passed testing.
    allProxies = []
    canUseProxies = []
    # Step 1: scrape the listings single-threaded — the site blacklists
    # high-frequency access, so no parallel fetching here.
    url = 'https://www.xicidaili.com/wn/'
    getAllHttpsProxy(url)
    # Step 2: validate the proxies with a pool of 15 worker threads.
    lock = threading.Lock()
    workers = [
        threading.Thread(target=threadFun, args=("thread-%s" % i,))
        for i in range(15)
    ]
    for worker in workers:
        worker.start()
    # Wait until every worker has drained the task list.
    for worker in workers:
        worker.join()
    # Step 3: persist whatever survived testing.
    if len(canUseProxies) > 0:
        pandas_to_xlsx('所有可用代理.xlsx', canUseProxies)
    print('end')
快代理:
#快代理:https://www.kuaidaili.com/free/
#
import requests
from bs4 import BeautifulSoup
import pandas as pd
import threading
import time
from time import sleep
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
#URLs used to verify that a scraped proxy actually works: httpbin echoes the
#request back, so a 200 reply means the proxy relayed it successfully.
test_http = 'http://httpbin.org/get'
test_https = 'https://httpbin.org/get'
#Browser-like request headers so the proxy-list site is less likely to reject
#the scraper as an obvious bot.
header ={
'Accept':'*/*',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
'Accept-Language':'zh-CN',
'Accept-Encoding':'gzip, deflate',
'Connection': 'Keep-Alive',
'Cache-Control': 'no-cache',
'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.01)'
}
def pandas_to_xlsx(filename, info, sheet_name='快代理'):
    """Save the collected proxy records to an .xlsx file.

    Parameters
    ----------
    filename : str
        Path of the Excel file to write.
    info : list[dict]
        One dict per proxy; the dict keys become the spreadsheet columns.
    sheet_name : str, optional
        Worksheet name. Defaults to the previously hard-coded value, so
        existing callers are unaffected.
    """
    pd.DataFrame(info).to_excel(filename, sheet_name=sheet_name)
def TestOneProxy(ip, port, n):
    """Check whether one proxy can relay a plain HTTP request to httpbin.

    Parameters
    ----------
    ip, port : str
        Proxy address parts (both already strings, as scraped).
    n : str
        Worker-thread name, used only to tag log output.

    Returns
    -------
    bool
        True iff httpbin answered 200 through the proxy within 3 seconds.
    """
    proxy = ip + ':' + port
    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }
    try:
        response = requests.get('http://httpbin.org/get', proxies=proxies, timeout=3)
    # BUG FIX: the original `except BaseException` also swallowed
    # KeyboardInterrupt/SystemExit, making Ctrl-C useless while testing.
    # Catch only request-related failures.
    except requests.exceptions.RequestException as e:
        print(n, '--Error', e.args)
        return False
    if response.status_code == 200:
        print(n, '--验证代理通过 ip', ip, ' 端口:', port)
        return True
    print(n, '--验证代理失败 ip', ip, ' 端口:', port)
    return False
def getHttpsProxy(url):
    """Scrape kuaidaili free-proxy pages, appending each proxy record to the
    module-level ``allProxies`` list, then dump everything to an .xlsx file.

    Parameters
    ----------
    url : str
        Base URL; the page number plus '/' is appended per iteration.

    Returns
    -------
    list[dict]
        The (global) list of scraped proxies.
    """
    for page in range(1, 300):
        sleep(1)  # kuaidaili answers 503 to rapid-fire requests
        curUrl = url + str(page) + '/'
        try:
            print('正在获取代理信息,网页', curUrl)
            webcontent = requests.get(curUrl, verify=False)
            if webcontent.status_code != 200:
                print('获取错误网页,错误码:', webcontent.status_code)
                continue
            soup = BeautifulSoup(webcontent.text, 'lxml')
            # FIX (idiom): the original named this variable ``list``,
            # shadowing the builtin; renamed to ``tables``.
            tables = soup.select('#list')
            if not tables:
                print('获取错误网页,网页内容:', webcontent.text)
                continue
            body = tables[0].select('tbody')[0]
            for row in body.select('tr'):
                td = row.select('td')
                info = {
                    'ip': td[0].text,
                    'port': td[1].text,
                    '匿名度': td[2].text,
                    '类型': td[3].text,
                    '位置': td[4].text,
                    '响应速度': td[5].text,
                    '最后验证时间': td[6].text,
                }
                allProxies.append(info)
        except requests.exceptions.ConnectionError as e:
            print('--Error', e.args)
    pandas_to_xlsx('所有代理.xlsx', allProxies)
    return allProxies
# Index of the next proxy to claim; shared by all worker threads and
# protected by the module-level ``lock``.
num = 0
def threadFun(n):
    """Worker loop: repeatedly claim the next untested proxy and test it.

    Exits when every entry in ``allProxies`` has been claimed. Working
    proxies are appended to ``canUseProxies``.

    Parameters
    ----------
    n : str
        Thread name, used only to tag log output.
    """
    global num
    while True:
        # Claim a task under the lock. Using ``with`` guarantees the lock is
        # released on every exit path — the original used manual
        # acquire()/release() and the author lost a day to a missing release
        # before ``break``.
        with lock:
            if num >= len(allProxies):
                break
            curTestProxy = allProxies[num]
            num = num + 1
        # Do the slow network test outside the lock so workers overlap.
        if TestOneProxy(curTestProxy['ip'], curTestProxy['port'], n):
            # list.append is atomic under the GIL, so no extra lock needed.
            canUseProxies.append(curTestProxy)
    print(n, '--运行结束')
if __name__ == '__main__':
    # Shared state: every scraped proxy, and the subset that passed testing.
    allProxies = []
    canUseProxies = []
    # Step 1: scrape the listings single-threaded — the site returns 503 on
    # high-frequency access, so no parallel fetching here.
    url = 'http://www.kuaidaili.com/free/inha/'
    getHttpsProxy(url)
    # Step 2: validate the proxies with a pool of 15 worker threads.
    lock = threading.Lock()
    workers = [
        threading.Thread(target=threadFun, args=("thread-%s" % i,))
        for i in range(15)
    ]
    for worker in workers:
        worker.start()
    # Wait until every worker has drained the task list.
    for worker in workers:
        worker.join()
    # Step 3: persist whatever survived testing.
    if len(canUseProxies) > 0:
        pandas_to_xlsx('所有可用代理.xlsx', canUseProxies)
    print('end')
注意:
获取代理信息的时候最好不要使用多线程,高频率的访问会被服务器标记。西刺这个网站有黑名单机制,封禁时长大概是 12 小时;快代理遇到高频率的访问会返回 503 错误。并且这些信息不可以用代理方法获得,应该是服务器做了限制,禁止代理服务器访问之类的。