Batch-collecting proxy IP addresses

The script below scrapes the newest HTTP proxy list from youdaili.net, extracts each ip:port entry, and then keeps only the proxies that answer a test request within a few seconds.

#coding=utf-8
# Author: 须尽欢
# Collect proxy IP addresses
import urllib
import urllib2
import re
import socket
from bs4 import BeautifulSoup

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
def GetProxyIp():
    url_proxy = 'http://www.youdaili.net/Daili/http/'
    request = urllib2.Request(url_proxy, headers=headers)
    response = urllib2.urlopen(request)
    html = response.read().decode('utf-8')
    # Find the newest proxy-list article on the index page. The pattern below
    # assumes links of the form href="http://www.youdaili.net/Daili/http/<id>.html";
    # adjust it if the site markup differs.
    pattern = r'href="http://www\.youdaili\.net/Daili/http/(\d+)\.html"'
    article_ids = re.findall(pattern, html)
    proxy_urls = 'http://www.youdaili.net/Daili/http/' + article_ids[0] + '.html'
    print proxy_urls
    request_ip = urllib2.Request(proxy_urls, headers=headers)
    response_ip = urllib2.urlopen(request_ip)
    proxy_html = response_ip.read().decode('utf-8')
    soup = BeautifulSoup(proxy_html, 'html.parser')
    proxy = []
    # Each <p> on the article page is expected to hold one entry such as
    # "1.2.3.4:8080@HTTP#location"; keep only the ip:port part before the '@'.
    for x in soup.select('p'):
        try:
            ip_port = x.string
            ips = re.split(u'@', ip_port)[0]
            proxy.append(ips)
        except TypeError:
            # x.string is None when the <p> contains nested tags
            continue
        except UnicodeEncodeError:
            break
    return proxy
# Verify that the collected proxies actually work
def UsefulIp(proxy):
    url = "http://ip.chinaz.com/getip.aspx"
    f = open("ip_proxy.txt","a")
    socket.setdefaulttimeout(3)  # abandon any probe that takes longer than 3 seconds
    for ip in proxy:
        try:
            proxy_temp = {"http": "http://" + ip}
            # The probe only has to succeed; the response body is not inspected.
            urllib.urlopen(url, proxies=proxy_temp).read()
            f.write(ip + '\n')
            print ip
        except Exception:
            continue
    f.close()
if __name__ == '__main__':
    proxy = GetProxyIp()
    UsefulIp(proxy)
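
Once ip_proxy.txt has been written, any line in it can be routed through urllib2 via a ProxyHandler. The sketch below is a minimal example of that pattern; it reuses the checker URL from UsefulIp, and taking the first line of the file is only a placeholder choice.

#coding=utf-8
# Minimal sketch: send one request through the first verified proxy
import urllib2

with open('ip_proxy.txt') as f:
    ip = f.readline().strip()  # e.g. '1.2.3.4:8080'

opener = urllib2.build_opener(urllib2.ProxyHandler({'http': 'http://' + ip}))
print opener.open('http://ip.chinaz.com/getip.aspx', timeout=5).read()

If the proxy has died since it was checked, opener.open will raise (for example urllib2.URLError), so a real caller would loop over the file and fall back to the next entry.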
