#coding=utf-8
# Author: 须尽欢
# Collect proxy IP addresses.
import urllib
import urllib2
import re
import socket
from bs4 import BeautifulSoup
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
def GetProxyIp():
    url_proxy = 'http://www.youdaili.net/Daili/http/'
    request = urllib2.Request(url_proxy, headers=headers)
    response = urllib2.urlopen(request)
    html = response.read().decode('utf-8')
    # The original regex lost its HTML markup during extraction; this pattern is
    # a reconstruction that captures the numeric id of the newest article link,
    # e.g. <a href="http://www.youdaili.net/Daili/http/12345.html">.
    # (A BeautifulSoup-based alternative is sketched in GetFirstArticleUrl below.)
    pattern = r'<a href="http://www.youdaili.net/Daili/http/(\d+)\.html"'
    ipport = re.findall(pattern, html)
    proxy_url = ipport[0]
    proxy_urls = 'http://www.youdaili.net/Daili/http/' + proxy_url + '.html'
    print proxy_urls
    request_ip = urllib2.Request(proxy_urls, headers=headers)
    response_ip = urllib2.urlopen(request_ip)
    proxy_html = response_ip.read().decode('utf-8')
    soup = BeautifulSoup(proxy_html, 'html.parser')
    proxy = []
    # Each <p> on the article page holds one entry like "1.2.3.4:8080@HTTP...";
    # keep only the ip:port part before the '@'.
    for x in soup.select('p'):
        try:
            ip_port = x.string
            ips = re.split(u'@', ip_port)[0]
            proxy.append(ips)
        except TypeError:
            # x.string is None for tags with nested markup; skip those.
            continue
        except UnicodeEncodeError:
            break
    return proxy
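# A minimal alternative sketch of the link extraction in GetProxyIp, using
# BeautifulSoup to scan <a> tags (the direction the commented-out draft in the
# original took) instead of a regex. GetFirstArticleUrl is a hypothetical helper
# name, and the URL-prefix test is an assumption about the page structure.
def GetFirstArticleUrl(html):
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.select('a'):
        href = a.get('href', '')
        # Keep only links that point at a proxy-list article page.
        if href.startswith('http://www.youdaili.net/Daili/http/') and href.endswith('.html'):
            return href
    return None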
# Verify whether the collected proxy IPs are usable.
def UsefulIp(proxy):
    url = "http://ip.chinaz.com/getip.aspx"
    f = open("ip_proxy.txt", "a")
    socket.setdefaulttimeout(3)
    for ip in proxy:
        try:
            proxy_host = "http://" + ip
            proxy_temp = {"http": proxy_host}
            # A successful fetch through the proxy means the IP works;
            # the response body itself is discarded.
            urllib.urlopen(url, proxies=proxy_temp).read()
            f.write(ip + '\n')
            print ip
        except Exception:
            # Dead or slow proxies are simply skipped.
            continue
    f.close()
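# A hedged usage sketch: route a follow-up request through one of the verified
# proxies via urllib2.ProxyHandler. FetchViaProxy is a hypothetical helper and
# http://example.com/ a placeholder target; neither comes from the original script.
def FetchViaProxy(ip, url='http://example.com/'):
    proxy_handler = urllib2.ProxyHandler({'http': 'http://' + ip})
    opener = urllib2.build_opener(proxy_handler)
    request = urllib2.Request(url, headers=headers)
    return opener.open(request, timeout=3).read()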
if __name__ == '__main__':
    proxy = GetProxyIp()
    UsefulIp(proxy)