java扫描免费代理服务器


免费代理服务器的收集与使用(java版)



一、前言:

>概念:

代理(英语:Proxy),也称网络代理,是一种特殊的网络服务,允许一个网络终端(一般为客户端)

通过这个服务与另一个网络终端(一般为服务器)进行非直接的连接。一些网关、路由器等网络设备

具备网络代理功能。一般认为代理服务有利于保障网络终端的隐私或安全,防止攻击。

>功能:
1 突破自身IP访问限制。
2 网络用户可以通过代理访问外国网站。
3 访问一些单位或团体内部资源。
4 突破中国电信的IP封锁。
5 提高访问速度
6 隐藏真实IP。有防火墙的功能。

二、介绍:

>>在学习中,由于需要频繁抓取一些网站的数据,而且要保证数据的实时性、有效性,就需要多次访问服务器。
这样的话就会增加服务器的负荷。所以网站管理员会采取技术手段,对在一定时间内频繁访问的IP地址进行封锁处理。
此时就需要大量的代理服务器交替使用IP地址,去帮助你探索数据。

>>文中所用到的代理服务器并不是从网络中扫描得到,而是在http://www.xicidaili.com/nn/这个网站中提取而来,
因为如果要是自己扫描的话,得需要很多高性能的服务器和别的技术手段.之前在无忧代理那个网站试过,但是提取
出来的IP地址是对的,端口号是错误的,我估计是在请求的文档加载完成以后采用异步js或者Ajax更新了端口,所以
我放弃了,此处只为学习,故提取现成的,事半功倍。

>>由于只是为了验证与学习,文章中代码比较乱,下文中的程序属于半成品,但是为了更好理解,我在此说明
我的编程思路。如果我实在没说明白,就当玩玩而已^_^

文中有两个主方法,其实就是两个小程序,一个是StartIPSet,这个主要是把提取出来的数据放置在数据库中,
以便用到的时候直接在数据库中进行查取,里面有个方法update(),用来更新数据库中的数据,不是自动更新。第二
个主方法是TestProxy,主要功能是测试和验证有用的代理IP。还有一些辅助的类,比如DownloadHtml(用于从西刺代
理这个网站下载网页文档),还有GetCookie(这个类可有可无,主要是在12306网站中要想获取数据的话就必须用到
cookies,在此贴出来,只为学习)。       

三、代码:

StartIPSet:
package pitd;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

import dao.DBUtil;

/**
 * Loads proxy entries from the page returned by {@code DownloadHtml.getHtml()} into the
 * {@code proxyip} table.
 *
 * <p>{@link #put()} appends the freshly downloaded entries; {@link #update()} replaces the
 * table content with them. Neither method runs automatically; {@link #main} calls
 * {@link #update()} once.
 */
public class StartIPSet {
	/** Index of the first {@code td} cell that is stored. */
	private static final int FIRST_CELL = 1;
	/** Distance, in {@code td} cells, between the first stored cell of two consecutive rows. */
	private static final int CELLS_PER_ROW = 10;
	/**
	 * Number of consecutive cells stored per row. TestProxy labels the resulting columns as
	 * address, port, location, anonymity and protocol.
	 */
	private static final int COLUMNS_STORED = 5;

	public static void main(String[] args) {
		StartIPSet s=new StartIPSet();
		s.update();
	}

	/**
	 * Downloads the proxy list and appends every parsed row to {@code proxyip} in a single
	 * transaction. Errors are printed, not propagated.
	 */
	public void put() {
		insertRows(fetchRows());
	}

	/**
	 * Replaces the content of {@code proxyip} with a freshly downloaded proxy list.
	 *
	 * <p>The page is downloaded and parsed BEFORE the table is truncated, so a failed
	 * download or an empty page leaves the existing rows untouched. If the truncate itself
	 * fails, nothing is inserted, which avoids appending duplicates to the old data.
	 */
	public void update(){
		String[][] rows = fetchRows();
		if (rows.length == 0) {
			System.err.println("No proxy rows downloaded; table proxyip left unchanged.");
			return;
		}
		String sql="truncate table proxyip";
		Connection con=null;
		Statement st=null;
		boolean truncated = false;
		try {
			con=DBUtil.getConnection();
			st=con.createStatement();
			st.execute(sql);
			truncated = true;
		} catch (ClassNotFoundException | SQLException e) {
			e.printStackTrace();
		}finally{
			DBUtil.closeStatement(st);
			DBUtil.closeConnection(con);
		}
		if (truncated) {
			insertRows(rows);
		}
	}

	/**
	 * Downloads the page and extracts the stored cells of every table row.
	 *
	 * @return one array of {@link #COLUMNS_STORED} cell texts per row; empty (never
	 *         {@code null}) when the page could not be downloaded or parsed
	 */
	private String[][] fetchRows() {
		try {
			String html = DownloadHtml.getHtml();
			if (html == null) {
				System.err.println("Proxy list page could not be downloaded.");
				return new String[0][];
			}
			HtmlCleaner cleaner = new HtmlCleaner();
			TagNode tagNode = cleaner.clean(html);
			Object[] action = tagNode.getElementsByName("td", true);
			System.out.println(action.length);

			// Same bounds as the original loop: a row is only read while its first cell
			// index is below length - 6.
			int rowCount = 0;
			for (int i = FIRST_CELL; i < action.length - 6; i += CELLS_PER_ROW) {
				rowCount++;
			}
			String[][] rows = new String[rowCount][COLUMNS_STORED];
			int row = 0;
			for (int i = FIRST_CELL; i < action.length - 6; i += CELLS_PER_ROW) {
				for (int j = 0; j < COLUMNS_STORED; j++) {
					TagNode cell = (TagNode) action[i + j];
					rows[row][j] = cell.getText().toString();
				}
				row++;
			}
			return rows;
		} catch (Exception e) {
			// Parsing is best-effort: report and behave as if nothing was downloaded.
			e.printStackTrace();
			return new String[0][];
		}
	}

	/**
	 * Inserts the given rows as one batch inside a single transaction; the transaction is
	 * rolled back when any statement fails.
	 */
	private void insertRows(String[][] rows) {
		if (rows.length == 0) {
			return;
		}
		String sql = "insert into proxyip values(?,?,?,?,?,sysdate)";
		Connection con = null;
		PreparedStatement ps = null;
		try {
			con = DBUtil.getConnection();
			con.setAutoCommit(false);
			ps = con.prepareStatement(sql);
			for (int r = 0; r < rows.length; r++) {
				for (int j = 0; j < COLUMNS_STORED; j++) {
					ps.setString(j + 1, rows[r][j]);
				}
				ps.addBatch();
			}
			ps.executeBatch();
			con.commit();
			con.setAutoCommit(true);
		} catch (Exception e) {
			e.printStackTrace();
			rollbackQuietly(con);
		} finally {
			DBUtil.closePreparedStatement(ps);
			DBUtil.closeConnection(con);
		}
	}

	/** Rolls back the open transaction, reporting (not throwing) a rollback failure. */
	private void rollbackQuietly(Connection con) {
		if (con == null) {
			return;
		}
		try {
			con.rollback();
		} catch (SQLException rollbackError) {
			rollbackError.printStackTrace();
		}
	}
}

GetCookie:
package pitd;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;





/**
 * Collects the {@code Set-Cookie} values a site hands out over two consecutive GET requests.
 */
public class GetCookie {

	private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36";
	private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
	private static final String SET_COOKIE = "Set-Cookie";

	/**
	 * Requests {@code url} twice without following redirects: first without a cookie, then
	 * sending the cookie returned by the first response.
	 *
	 * @param url address to request
	 * @return the {@code Set-Cookie} header values of both responses joined with {@code ";"};
	 *         a response that set no cookie (or a failed request) contributes nothing, so the
	 *         result is the empty string when no cookie was received at all
	 */
	public static String getCookie(String url){
		String first = requestSetCookie(url, null);
		String second = requestSetCookie(url, first);

		StringBuilder result = new StringBuilder();
		if (first != null) {
			result.append(first);
		}
		if (second != null) {
			// Only separate when there is something to separate from; a missing header
			// must not end up as the literal text "null" in a Cookie value.
			if (result.length() > 0) {
				result.append(";");
			}
			result.append(second);
		}
		System.out.println(result.toString());
		return result.toString();
	}

	/**
	 * Performs one GET request and returns the value of its {@code Set-Cookie} response header.
	 *
	 * @param url    address to request
	 * @param cookie value for the {@code Cookie} request header, or {@code null} to send none
	 * @return the header value, or {@code null} when the header is absent or the request failed
	 */
	private static String requestSetCookie(String url, String cookie) {
		HttpURLConnection conn = null;
		try {
			conn = (HttpURLConnection) new URL(url).openConnection();
			conn.setRequestMethod("GET");
			conn.setRequestProperty("User-Agent", USER_AGENT);
			conn.setRequestProperty("Accept", ACCEPT);
			conn.setInstanceFollowRedirects(false);
			if (cookie != null) {
				conn.setRequestProperty("Cookie", cookie);
			}
			System.out.println(conn.getResponseCode());
			return conn.getHeaderField(SET_COOKIE);
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (conn != null) {
				conn.disconnect();
			}
		}
		return null;
	}
}

DownloadHtml:

package pitd;

import java.io.BufferedReader;
import java.io.FileWriter;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;


/**
 * Downloads the proxy list page from {@code http://www.xicidaili.com/nn/}.
 */
public class DownloadHtml {
	/** Where a copy of the downloaded page is dumped for inspection. */
	private static final String DUMP_FILE = "C:\\Users\\Administrator\\Desktop\\crawler.txt";

	/**
	 * Fetches the page with browser-like request headers and the cookie obtained from
	 * {@code GetCookie.getCookie}, prints it, and writes a copy to {@link #DUMP_FILE}.
	 *
	 * <p>Line terminators of the response are dropped while reading. Failing to write the
	 * dump file is reported but does not discard the downloaded page.
	 *
	 * @return the page content, or {@code null} when the response code is not 200 or the
	 *         download failed
	 */
	public static String getHtml() {
		StringBuffer contentBuffer = new StringBuffer();
		String htmlstr=null;
		HttpURLConnection conn=null;
		try {
			String urlPath="http://www.xicidaili.com/nn/";
			URL url = new URL(urlPath);

			conn = (HttpURLConnection)url.openConnection();
			conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36");
			conn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
			conn.setInstanceFollowRedirects(true);
			conn.setRequestProperty("Connection", "keep-alive");
			conn.setRequestProperty("Upgrade-Insecure-Requests", "1");
			String cookie = GetCookie.getCookie(urlPath);
			if (cookie != null && !cookie.isEmpty()) {
				conn.setRequestProperty("Cookie", cookie);
			}
			conn.connect();
			int returnCode = conn.getResponseCode();
			if (returnCode == 200) {
				// try-with-resources closes the reader (and the stream beneath it) even
				// when reading fails half-way.
				try (InputStream input = conn.getInputStream();
						BufferedReader buffStr = new BufferedReader(
								new InputStreamReader(input, "UTF-8"))) {
					String str;
					while ((str = buffStr.readLine()) != null) {
						contentBuffer.append(str);
					}
				}
				htmlstr = contentBuffer.toString();
				System.out.println(htmlstr);
				saveCopy(htmlstr);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}  finally {
			if (conn != null) {
				conn.disconnect();
			}
		}
		return htmlstr;
	}

	/** Writes the page to {@link #DUMP_FILE}; a write failure is printed and otherwise ignored. */
	private static void saveCopy(String html) {
		// The writer is opened only once there is content, and always closed.
		try (FileWriter fw = new FileWriter(DUMP_FILE)) {
			fw.write(html);
		} catch (java.io.IOException e) {
			e.printStackTrace();
		}
	}
}

TestProxy:

package proxyip;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.URL;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import pitd.GetCookie;
import dao.DBUtil;

public class TestProxy {
	static Proxy proxy = null;
	Connection con = null;
	Statement st = null;
	ResultSet rs = null;

	public static void main(String[] args) throws IOException {
		try {
			@SuppressWarnings("unchecked")
			// 使用反射加载类。
			Class clazz = (Class) Class
					.forName("proxyip.TestProxy");
			TestProxy tp = clazz.newInstance();
			tp.checkProxy();
		} catch (ClassNotFoundException e) {
			e.printStackTrace();
		} catch (InstantiationException e) {
			e.printStackTrace();
		} catch (IllegalAccessException e) {
			e.printStackTrace();
		}
	}

	public void checkProxy() {
		int count = 100;
		try {
			String sql = "select * from proxyip";
			con = DBUtil.getConnection();
			st = con.createStatement();
			rs = st.executeQuery(sql);
			do {
				rs.next();count--;
				System.out.print("数据库中取出的数据为:");
				System.out.println(rs.getString(1) + "\t"
						+ new Integer(rs.getString(2)));
				proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(
						rs.getString(1), new Integer(rs.getString(2))));
				// 确定代理是否设置成功
				if (statuOk()) {
					System.out.println("<<<<----代理成功————>>>>\n代理信息:");
					System.out.println("Address:" + rs.getString(1) + "\nPort:"
							+ rs.getString(2) + "\nLocaltion:" + rs.getString("LOCALTION")
							+ "\nAnony:" + rs.getString(4) + "\nProtocal:"
							+ rs.getString(5));
					break;
				} else if (count <= 0) {
					System.out.println("代理失败,ip资源不足!");
					break;
				}
			} while (true);
		} catch (ClassNotFoundException | SQLException e) {
			e.printStackTrace();
		} finally {
			DBUtil.closeResultSet(rs);
			DBUtil.closeStatement(st);
			DBUtil.closeConnection(con);
		}
	}

	public boolean statuOk() {
		int flag = 0;
		String localIP = getV4IP();
		System.out.println("start...");
		try {
			String ipInfo = getHtml("http://ip.chinaz.com/getip.aspx");// http://city.ip138.com/ip2city.asp
			Pattern p = Pattern
					.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}");
			Matcher m = p.matcher(ipInfo);
			if (m.find()) {
				String proxyIP = m.group(0);
				System.out.println("正在检测的代理ip:" + proxyIP);
				if (!localIP.equals(proxyIP)) {
					System.out.println("本机ip:" + localIP);
					flag = 1;
				}
			}

		} catch (Exception e) {
			System.out.println(e.getMessage());
			flag = 0;
		}
		if (flag == 1) {
			return true;
		} else {
			return false;
		}
	}

	private static String getHtml(String address) throws Exception {
		StringBuffer html = new StringBuffer();
		String result = null;
		/*
		 * System.getProperties().setProperty("proxySet", "true"); //
		 * 如果不设置,只要代理IP和代理端口正确,此项不设置也可以 String ip = "218.56.132.158";
		 * 
		 * System.getProperties().setProperty("http.proxyHost",
		 * "202.124.205.26");
		 * System.getProperties().setProperty("http.proxyPort", "3128");
		 */
		URL url = new URL(address);
		HttpURLConnection con = (HttpURLConnection) url.openConnection(proxy);
		con.setConnectTimeout(5000);
		con.setDoInput(true);
		con.setRequestMethod("GET");

		// conn.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 7.0; NT 5.1; GTB5; .NET CLR 2.0.50727; CIBA)");
		BufferedInputStream in = new BufferedInputStream(con.getInputStream());

		String inputLine;
		byte[] buf = new byte[4096];
		int bytesRead = 0;
		while (bytesRead >= 0) {
			inputLine = new String(buf, 0, bytesRead, "ISO-8859-1");
			html.append(inputLine);
			bytesRead = in.read(buf);
			inputLine = null;
		}
		buf = null;

		in.close();
		con = null;
		url = null;

		result = new String(html.toString().trim().getBytes("ISO-8859-1"),
				"gb2312").toLowerCase();

		return result;
	}

	public String getV4IP() {
		String ip = "";
		String chinaz = "http://ip.chinaz.com/getip.aspx";

		StringBuilder inputLine = new StringBuilder();
		String read = "";
		URL url = null;
		HttpURLConnection urlConnection = null;
		BufferedReader in = null;
		try {
			url = new URL(chinaz);
			urlConnection = (HttpURLConnection) url.openConnection();
			urlConnection
					.setRequestProperty(
							"User-Agent",
							"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36");
			// conn.setRequestProperty("Accept-Encoding",
			// "gzip, deflate, sdch"); //kongzhi bainma

			urlConnection
					.setRequestProperty("Accept",
							"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
			urlConnection.setInstanceFollowRedirects(true);
			urlConnection.setRequestProperty("Connection", "keep-alive");
			urlConnection.setRequestProperty("Upgrade-Insecure-Requests", "1");
			urlConnection.connect();
			if (urlConnection.getResponseCode() == 200) {
				in = new BufferedReader(new InputStreamReader(
						urlConnection.getInputStream(), "UTF-8"));
				while ((read = in.readLine()) != null) {
					inputLine.append(read);
				}
			}
			// System.out.println(inputLine.toString());
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				}
			}
		}
		// "\\
(.*?)\\<\\/dd>" Pattern p = Pattern .compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"); Matcher m = p.matcher(inputLine.toString()); if (m.find()) { String ipstr = m.group(0); ip = ipstr; } return ip; } }



四、截图:

访问并下载网页,用HtmlCleaner进行解析后将数据放置数据库中,共1000个节点,大概100条记录。
java扫描免费代理服务器_第1张图片

提取的数据和原网页的数据:
java扫描免费代理服务器_第2张图片

采用循环检测代理IP,当一个不能使用的时候,自动提取下一条再进行测试,代理成功以后跳出循环,或者IP资源不足的时候跳出循环。
java扫描免费代理服务器_第3张图片




联系邮箱:[email protected]

                                                                                                                                                                                                                                                                                                                                                                2017_09_22



你可能感兴趣的:(java)