Configuring a Dynamic Forwarding Proxy IP in Java for Web Scraping

This article covers how to use a dynamic forwarding proxy IP (also called a tunnel proxy IP); for reference see http://www.xiaozhudaili.com/buy/tunnel.html

First of all, the result is quite good: you only need to configure the proxy IP once, and after that every request automatically goes out through a different IP, with no extra work in code. Here is what I got:

[Screenshot: console output from the test run]

The code is below; just replace the username and password with your own (JDK 1.8):

package com.xiaozhudaili.test;


import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.Authenticator;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.PasswordAuthentication;
import java.net.Proxy;
import java.net.URL;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession;

/**
 * Dynamic forwarding test
 * @author http://www.xiaozhudaili.com/
 *
 */
public class TestHttps {

	/**
	 * Trust all certificates: needed when requesting HTTPS URLs. If you only request HTTP URLs,
	 * you can simply delete trustAllHttpsCertificates() and the miTM class. Note that this turns
	 * off certificate validation entirely, so use it only where that risk is acceptable.
	 * @throws Exception
	 */
    private static void trustAllHttpsCertificates() throws Exception {
        javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1];
        javax.net.ssl.TrustManager tm = new miTM();
        trustAllCerts[0] = tm;
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }
    static class miTM implements javax.net.ssl.TrustManager, javax.net.ssl.X509TrustManager {
        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            return null;
        }
        public boolean isServerTrusted(java.security.cert.X509Certificate[] certs) {
            return true;
        }
        public boolean isClientTrusted(java.security.cert.X509Certificate[] certs) {
            return true;
        }
        public void checkServerTrusted(java.security.cert.X509Certificate[] certs, String authType) throws java.security.cert.CertificateException {
            return;
        }
        public void checkClientTrusted(java.security.cert.X509Certificate[] certs, String authType) throws java.security.cert.CertificateException {
            return;
        }
    }
    
	public static void main(String[] args) {

		// These two lines are required when the scraper requests HTTPS URLs
		System.setProperty("jdk.http.auth.proxying.disabledSchemes", "");
		System.setProperty("jdk.http.auth.tunneling.disabledSchemes", "");
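		// Background: since JDK 8u111, Basic authentication is disabled by default for HTTPS
		// proxy tunneling (the jdk.http.auth.tunneling.disabledSchemes networking property);
		// clearing these two properties re-enables it so the proxy credentials can be sent.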
	    
	    // The URL to request
	    final String targetUrl = "https://pv.sohu.com/cityjson?ie=utf-8";
	    
		// Proxy IP (tunnel host and port)
		final String httpsIpport = "tunnel.xiaozhudaili:15678";
		// Proxy username and password
		final String username = "420E7E42B08CBBE82C110D22C9888888";
		final String password = "888888";
		
		// Number of requests to send
		int requestTime = 5;
		
		for(int i = 0; i < requestTime; i++) {
			final int reqNo = i;
			new Thread(new Runnable() {
				@Override
				public void run() {
					try {

						// Trust all certificates; needed when requesting HTTPS URLs.
						// This must be called before the connection is obtained.
			            trustAllHttpsCertificates();
			            HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {
			                public boolean verify(String urlHostName, SSLSession session) {
			                    return true;
			                }
			            });

				    	URL link = new URL(targetUrl);
			    		
						String[] hostPort = httpsIpport.split(":");
						Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(hostPort[0], Integer.parseInt(hostPort[1])));
						HttpURLConnection connection = (HttpURLConnection)link.openConnection(proxy);
						
						// Java's built-in authentication mechanism; needed when requesting HTTPS URLs through the proxy
			    		Authenticator.setDefault(new Authenticator() {
			    			public PasswordAuthentication getPasswordAuthentication() {
			    				return new PasswordAuthentication(username, password.toCharArray());
			    			}
			    		});
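			    		// Note: Authenticator.setDefault is JVM-global, so it applies to every
			    		// connection; setting it once before the loop would work just as well.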
			    		
						connection.setRequestMethod("GET");
				    	connection.setDoInput(true);
				    	connection.setDoOutput(true);
				    	connection.setUseCaches(false);
				    	
				    	// Connect timeout: 60 seconds
				    	connection.setConnectTimeout(60000);
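				    	// A read timeout can also be set if desired, e.g. connection.setReadTimeout(60000);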
			    		
			    		connection.connect();
			    		
				        String line = null;
				        StringBuilder html = new StringBuilder();
				        BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8"));
				        while((line = reader.readLine()) != null){
				        	html.append(line);
				        }
				        try {
							if (reader != null) {
								reader.close();
							}
						} catch (Exception e) {
						}
				        
				        connection.disconnect();
				        
				        // Print the result (request number, target URL, status code, response body)
						System.out.println(reqNo + " [OK]" + "→→→→→" + targetUrl + "  " + connection.getResponseCode() + "   " + html.toString());
					} catch (Exception e) {
						e.printStackTrace();
						System.err.println(reqNo + " [ERR]" + "→→→→→" + e.getMessage());
					}					
				}
			}).start();
		}
	}
}
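
The listing above builds a java.net.Proxy object for each connection. As another way to illustrate the "configure once" idea from the beginning of the article, the same tunnel can also be set for the whole JVM through the standard proxy system properties, so every HttpURLConnection picks it up without any per-request code. Below is a minimal sketch under that assumption; the host and port reuse the values from the listing above, while the username and password are placeholders you must replace with your own:

import java.net.Authenticator;
import java.net.PasswordAuthentication;

public class GlobalProxyConfig {

    public static void main(String[] args) {
        // Allow Basic authentication for proxying and HTTPS tunneling (needed since JDK 8u111).
        System.setProperty("jdk.http.auth.proxying.disabledSchemes", "");
        System.setProperty("jdk.http.auth.tunneling.disabledSchemes", "");

        // Route all HTTP and HTTPS traffic of this JVM through the tunnel proxy.
        System.setProperty("http.proxyHost", "tunnel.xiaozhudaili");
        System.setProperty("http.proxyPort", "15678");
        System.setProperty("https.proxyHost", "tunnel.xiaozhudaili");
        System.setProperty("https.proxyPort", "15678");

        // Supply the proxy credentials once for the whole JVM (placeholder values).
        Authenticator.setDefault(new Authenticator() {
            @Override
            protected PasswordAuthentication getPasswordAuthentication() {
                return new PasswordAuthentication("your-username", "your-password".toCharArray());
            }
        });

        // From this point on, any URL.openConnection() in this JVM goes through the tunnel,
        // and the tunnel rotates the exit IP on each request.
    }
}

Because system properties affect every connection in the process, this variant suits programs where all traffic should go through the tunnel; the per-connection Proxy object in the listing above is the better choice when only some requests need it.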

