python验证代理IP是否可用
python爬虫-爬取代理IP并通过多线程快速验证(这个验证没跑通)
scrapy爬虫代理——利用crawlera神器,无需再寻找代理IP
Python验证IP是否可用
第一个用了BeautifulSoup,第二个用了PyQuery
代理网站的页面样式:
自己代码:
#coding=UTF-8
import urllib.request
import chardet
from bs4 import BeautifulSoup
from pyquery import PyQuery as pyq
of = open('proxy.txt', 'w')
class ProxyIp:
    """Scrape two free-proxy listing sites and append usable HTTP/HTTPS
    proxies to the module-level output file ``of``.

    Both methods print each discovered proxy and write it to ``of``;
    neither returns a value.
    """

    def xiciProxy(self):
        """Scrape www.xicidaili.com pages 1-9 (parsed with BeautifulSoup).

        Each page contains a ``<table id="ip_list">`` whose rows hold the
        IP, port and protocol columns. Only HTTP/HTTPS entries are kept,
        written as ``protocol=ip:port`` lines.
        """
        for page in range(1, 10):
            url = 'http://www.xicidaili.com/nn/%s' % page
            print(url)
            # Browser-like headers; Accept-Encoding is deliberately omitted
            # so the server does not send a compressed body.
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJTE3YTI3ZjgyYzE4NGVhMjhmZTVjMjRiOTVhMmE2YWFhBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMWxCYURmRS9DRHpmZ3hNeTFUVFVCcURCeFhSeXQyWG5qbTFsblFIM0Y2R2M9BjsARg%3D%3D--088e944b5bfb2e7d5c2547822a205693aeb68b0c; Hm_lvt_0cf76c77469e965d2957f0553e6ecf59=1491037294,1491448130; Hm_lpvt_0cf76c77469e965d2957f0553e6ecf59=1491449617',
                'Host': 'www.xicidaili.com',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'
            }
            request = urllib.request.Request(url, None, headers)
            try:
                response = urllib.request.urlopen(request)
                content = response.read().decode('UTF-8')
            except urllib.request.URLError as e:
                # Only HTTPError carries .code; plain URLError does not,
                # so fetch it defensively instead of raising AttributeError.
                print(e.reason, getattr(e, 'code', ''))
                continue  # content is undefined on failure; skip this page
            # Explicit parser avoids BeautifulSoup's "no parser specified" warning
            # and keeps results consistent across environments.
            soup = BeautifulSoup(content, 'html.parser')
            table = soup.find('table', {"id": "ip_list"})
            if table is None:
                continue  # page layout changed or an error page was served
            for tr in table.findAll('tr')[1:]:  # first row is the header
                tds = tr.findAll('td')
                if len(tds) < 6:
                    continue  # malformed / ad row
                ip = tds[1].text.strip()
                port = tds[2].text.strip()
                protocol = tds[5].text.strip()
                if protocol in ('HTTP', 'HTTPS'):
                    of.write('%s=%s:%s\n' % (protocol.lower(), ip, port))
                    print('%s://%s:%s' % (protocol.lower(), ip, port))

    def youDailiProxy(self):
        """Scrape www.youdaili.net proxy pages (parsed with PyQuery).

        Each ``<p>`` under ``.content`` looks like ``ip:port@PROTOCOL#comment``;
        HTTP/HTTPS entries are written as ``protocol=ip:port`` lines.
        """
        for page in range(1, 2):
            if page == 1:
                url = 'http://www.youdaili.net/Daili/guonei/36718.html'
            else:
                url = 'http://www.youdaili.net/Daili/guonei/36718_%s.html' % page
            print(url)
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Cookie': 'Hm_lvt_f8bdd88d72441a9ad0f8c82db3113a84=1491557308; Hm_lpvt_f8bdd88d72441a9ad0f8c82db3113a84=1491557384',
                'Host': 'www.youdaili.net',
                'If-Modified-Since': 'Sun, 02 Apr 2017 05:58:18 GMT',
                'If-None-Match': 'W/"58e092fa-9770"',
                'Referer': 'http://www.youdaili.net/Daili/guonei/36718_2.html',
                # Header values must be str: http.client rejects the int 1
                # that the original code passed here.
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'
            }
            request = urllib.request.Request(url, None, headers)
            try:
                response = urllib.request.urlopen(request)
                content = response.read().decode()
            except urllib.request.URLError as e:
                print(e.reason)
                continue  # content is undefined on failure; skip this page
            jq = pyq(content)
            for tmp in jq(".content").find("p").items():
                # Split off the trailing "#comment", then separate ip:port
                # from the protocol tag.
                parts = tmp.text().split("#")[0].split("@")
                if len(parts) < 2:
                    continue  # line does not match the expected format
                ipAndPort, protocol = parts[0], parts[1]
                print(protocol, ipAndPort)
                if protocol in ('HTTP', 'HTTPS'):
                    of.write('%s=%s\n' % (protocol.lower(), ipAndPort))
                    print('%s://%s' % (protocol.lower(), ipAndPort))
if __name__ == '__main__':
    # Run the scrapers only when executed as a script, not on import.
    proxy = ProxyIp()
    # proxy.xiciProxy()
    proxy.youDailiProxy()
    of.close()  # flush the collected proxies to disk
。。。