>概念:
代理(英语:Proxy),也称网络代理,是一种特殊的网络服务,允许一个网络终端(一般为客户端)
通过这个服务与另一个网络终端(一般为服务器)进行非直接的连接。一些网关、路由器等网络设备
具备网络代理功能。一般认为代理服务有利于保障网络终端的隐私或安全,防止攻击。
代理的主要用途:1 突破自身IP访问限制。2 网络用户可以通过代理访问外国网站。3 访问一些单位或团体内部资源。4 突破中国电信的IP封锁。5 提高访问速度。6 隐藏真实IP,具有一定的防火墙功能。
>>在学习中,由于需要频繁抓取一些网站的数据,而且要保证数据的实时性和有效性,就需要多次访问服务器。
>>文中所用到的代理服务器并不是从网络中扫描得到,而是在http://www.xicidaili.com/nn/这个网站中提取而来,
>>由于只是为了验证与学习,文章中代码比较乱,下文中的程序属于半成品,但是为了更好理解,我在此说明:
文中有两个主方法,其实就是两个小程序:一个是StartIPSet,主要是把提取出来的数据放置在数据库中;另一个是TestProxy,从数据库中取出代理并逐个检测其是否可用。
package pitd;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import dao.DBUtil;
/**
 * Reloads the {@code proxyip} table from the HTML page returned by
 * {@link DownloadHtml#getHtml()}.
 *
 * <p>The page is parsed into a flat array of {@code <td>} cells. Every row is
 * assumed to span ten cells, of which cells 1..5 are stored (presumably
 * address, port, location, anonymity and protocol - confirm against the page
 * layout and the table definition).
 */
public class StartIPSet {

    /** Index of the first stored cell inside the first row. */
    private static final int FIRST_DATA_CELL = 1;

    /** Number of consecutive cells copied into one table row. */
    private static final int COLUMNS_STORED = 5;

    /** Distance, in cells, between the first stored cell of two consecutive rows. */
    private static final int CELLS_PER_ROW = 10;

    /** A row is only read when more than this many cells follow its start index. */
    private static final int TRAILING_CELLS_REQUIRED = 6;

    public static void main(String[] args) {
        StartIPSet s = new StartIPSet();
        s.update();
    }

    /**
     * Downloads the proxy page and inserts every row found into {@code proxyip}.
     * Nothing is written when the page could not be downloaded or has no rows.
     */
    public void put() {
        Object[] cells = fetchCells();
        if (!hasRow(cells)) {
            System.err.println("No proxy rows downloaded; nothing inserted.");
            return;
        }
        insert(cells);
    }

    /**
     * Replaces the contents of {@code proxyip} with a fresh download.
     *
     * <p>The page is downloaded <em>before</em> the table is truncated, so a
     * failed download leaves the existing rows untouched. The insert is skipped
     * when the truncate itself fails.
     */
    public void update() {
        Object[] cells = fetchCells();
        if (!hasRow(cells)) {
            System.err.println("No proxy rows downloaded; keeping existing proxyip contents.");
            return;
        }
        if (!truncate()) {
            System.err.println("Could not truncate proxyip; skipping insert.");
            return;
        }
        insert(cells);
    }

    /** Returns true when the cell array holds at least one row the insert loop would read. */
    private static boolean hasRow(Object[] cells) {
        return FIRST_DATA_CELL < cells.length - TRAILING_CELLS_REQUIRED;
    }

    /**
     * Downloads the page and returns all of its {@code <td>} nodes, or an empty
     * array when the download or the parse fails.
     */
    private Object[] fetchCells() {
        try {
            String html = DownloadHtml.getHtml();
            if (html == null) {
                // getHtml() reports failure by returning null.
                return new Object[0];
            }
            HtmlCleaner cleaner = new HtmlCleaner();
            TagNode root = cleaner.clean(html);
            Object[] cells = root.getElementsByName("td", true);
            System.out.println(cells.length);
            return cells;
        } catch (Exception e) {
            // Broad on purpose: the checked exceptions of the parser depend on
            // the library version in use.
            e.printStackTrace();
            return new Object[0];
        }
    }

    /** Empties the {@code proxyip} table; returns false when the statement failed. */
    private boolean truncate() {
        Connection con = null;
        Statement st = null;
        try {
            con = DBUtil.getConnection();
            st = con.createStatement();
            st.execute("truncate table proxyip");
            return true;
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
            return false;
        } finally {
            DBUtil.closeStatement(st);
            DBUtil.closeConnection(con);
        }
    }

    /**
     * Inserts all rows contained in {@code cells} as one batch inside a single
     * transaction; the transaction is rolled back when anything fails.
     */
    private void insert(Object[] cells) {
        String sql = "insert into proxyip values(?,?,?,?,?,sysdate)";
        Connection con = null;
        PreparedStatement ps = null;
        try {
            con = DBUtil.getConnection();
            con.setAutoCommit(false);
            ps = con.prepareStatement(sql);
            for (int first = FIRST_DATA_CELL; first < cells.length - TRAILING_CELLS_REQUIRED;
                    first += CELLS_PER_ROW) {
                for (int col = 0; col < COLUMNS_STORED; col++) {
                    TagNode cell = (TagNode) cells[first + col];
                    // Trim so that layout whitespace inside the cell is not stored.
                    ps.setString(col + 1, cell.getText().toString().trim());
                }
                ps.addBatch();
            }
            ps.executeBatch();
            con.commit();
        } catch (Exception e) {
            // Broad on purpose: a runtime failure half-way through the batch
            // must also undo the partial work.
            e.printStackTrace();
            rollbackQuietly(con);
        } finally {
            restoreAutoCommit(con);
            DBUtil.closePreparedStatement(ps);
            DBUtil.closeConnection(con);
        }
    }

    /** Rolls back the open transaction, reporting but not propagating a failure. */
    private static void rollbackQuietly(Connection con) {
        if (con == null) {
            return;
        }
        try {
            con.rollback();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Switches the connection back to auto-commit before it is handed to DBUtil for closing. */
    private static void restoreAutoCommit(Connection con) {
        if (con == null) {
            return;
        }
        try {
            con.setAutoCommit(true);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
package pitd;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Collects the {@code Set-Cookie} values a server hands out over two
 * consecutive GET requests, so they can be replayed as a {@code Cookie}
 * request header.
 */
public class GetCookie {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36";
    private static final String ACCEPT =
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
    private static final String SET_COOKIE = "Set-Cookie";

    /**
     * Requests {@code url} twice without following redirects: first without a
     * cookie, then with the cookie obtained from the first response.
     *
     * @param url address to request
     * @return the cookies of both responses joined with {@code ';'}; absent
     *         cookies are left out, so the result is an empty string when no
     *         cookie was received or both requests failed
     */
    public static String getCookie(String url) {
        String first = null;
        try {
            first = requestSetCookie(url, null);
        } catch (IOException e) {
            e.printStackTrace();
        }

        // The second request is attempted even when the first one failed.
        String second = null;
        try {
            second = requestSetCookie(url, first);
        } catch (IOException e) {
            e.printStackTrace();
        }

        String result = join(first, second);
        System.out.println(result);
        return result;
    }

    /**
     * Performs one GET request and returns the {@code Set-Cookie} header of the
     * response, or null when the response carries none.
     *
     * @param url    address to request
     * @param cookie value sent as {@code Cookie} header; null sends none
     * @throws IOException when the URL is malformed or the request fails
     */
    private static String requestSetCookie(String url, String cookie) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.setRequestProperty("User-Agent", USER_AGENT);
            conn.setRequestProperty("Accept", ACCEPT);
            conn.setInstanceFollowRedirects(false);
            if (cookie != null) {
                conn.setRequestProperty("Cookie", cookie);
            }
            System.out.println(conn.getResponseCode());
            return conn.getHeaderField(SET_COOKIE);
        } finally {
            conn.disconnect();
        }
    }

    /** Joins the non-null arguments with {@code ';'} so no literal "null" reaches the header. */
    private static String join(String first, String second) {
        StringBuilder joined = new StringBuilder();
        if (first != null) {
            joined.append(first);
        }
        if (second != null) {
            if (joined.length() > 0) {
                joined.append(';');
            }
            joined.append(second);
        }
        return joined.toString();
    }
}
package pitd;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Downloads the proxy list page and keeps a debug copy of it on disk.
 */
public class DownloadHtml {

    private static final String PAGE_URL = "http://www.xicidaili.com/nn/";

    /** Location of the debug copy; writing it is best effort. */
    private static final String DUMP_PATH = "C:\\Users\\Administrator\\Desktop\\crawler.txt";

    /**
     * Fetches {@link #PAGE_URL} with browser-like headers and the cookies
     * obtained from {@link GetCookie#getCookie(String)}.
     *
     * <p>The body is read line by line as UTF-8 and the lines are concatenated
     * without separators. The page is also printed to standard output and
     * written to {@link #DUMP_PATH}; a failure to write that file no longer
     * affects the result.
     *
     * @return the page content, or null when the response code is not 200 or
     *         the request failed
     */
    public static String getHtml() {
        String htmlstr = null;
        HttpURLConnection conn = null;
        try {
            URL url = new URL(PAGE_URL);
            conn = (HttpURLConnection) url.openConnection();
            conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36");
            conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            conn.setInstanceFollowRedirects(true);
            conn.setRequestProperty("Connection", "keep-alive");
            conn.setRequestProperty("Upgrade-Insecure-Requests", "1");
            conn.setRequestProperty("Cookie", GetCookie.getCookie(PAGE_URL));
            conn.connect();
            if (conn.getResponseCode() == 200) {
                htmlstr = readBody(conn.getInputStream());
                System.out.println(htmlstr);
                dump(htmlstr);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
        return htmlstr;
    }

    /** Reads the stream as UTF-8 text, dropping line terminators, and closes it. */
    private static String readBody(InputStream input) throws java.io.IOException {
        StringBuilder content = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        }
        return content.toString();
    }

    /** Writes the debug copy; an unwritable path is reported but not propagated. */
    private static void dump(String html) {
        try (FileWriter fw = new FileWriter(DUMP_PATH)) {
            fw.write(html);
        } catch (java.io.IOException e) {
            e.printStackTrace();
        }
    }
}
package proxyip;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.URL;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import pitd.GetCookie;
import dao.DBUtil;
/**
 * Walks through the rows of the {@code proxyip} table and stops at the first
 * proxy through which an IP echo service reports an address different from
 * the one seen on a direct connection.
 */
public class TestProxy {

    /** Matches a dotted-quad IPv4 address (octet ranges are not validated). */
    private static final Pattern IPV4 = Pattern
            .compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}");

    /** Service whose response contains the caller's public address. */
    private static final String IP_ECHO_URL = "http://ip.chinaz.com/getip.aspx";

    /** Upper bound on the number of table rows that are tried. */
    private static final int MAX_ATTEMPTS = 100;

    /** Connect and read timeout for every HTTP request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Proxy currently under test; read by {@link #getHtml(String)}. */
    static Proxy proxy = null;
    Connection con = null;
    Statement st = null;
    ResultSet rs = null;

    public static void main(String[] args) throws IOException {
        // Direct instantiation: the reflective variant assigned the Object
        // returned by a raw Class.newInstance() to TestProxy, which does not compile.
        new TestProxy().checkProxy();
    }

    /**
     * Tries up to {@link #MAX_ATTEMPTS} rows of {@code proxyip} and prints the
     * details of the first working proxy. Prints a failure message when the
     * attempts are used up or the table has no more rows.
     */
    public void checkProxy() {
        int remaining = MAX_ATTEMPTS;
        boolean found = false;
        try {
            String sql = "select * from proxyip";
            con = DBUtil.getConnection();
            st = con.createStatement();
            rs = st.executeQuery(sql);
            // rs.next() is checked so that a short table ends the loop instead
            // of raising an SQLException on the next column read.
            while (remaining > 0 && rs.next()) {
                remaining--;
                String host = rs.getString(1);
                String portText = rs.getString(2);
                int port = parsePort(portText);
                if (host == null || port < 0) {
                    // Unusable row: skip it rather than abort the whole run.
                    continue;
                }
                System.out.print("数据库中取出的数据为:");
                System.out.println(host + "\t" + port);
                proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(
                        host.trim(), port));
                // Check whether requests really go out through the proxy.
                if (statuOk()) {
                    System.out.println("<<<<----代理成功————>>>>\n代理信息:");
                    System.out.println("Address:" + host + "\nPort:"
                            + portText + "\nLocaltion:" + rs.getString("LOCALTION")
                            + "\nAnony:" + rs.getString(4) + "\nProtocal:"
                            + rs.getString(5));
                    found = true;
                    break;
                }
            }
            if (!found) {
                System.out.println("代理失败,ip资源不足!");
            }
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        } finally {
            DBUtil.closeResultSet(rs);
            DBUtil.closeStatement(st);
            DBUtil.closeConnection(con);
        }
    }

    /** Parses a port number; returns -1 for null, non-numeric or out-of-range text. */
    private static int parsePort(String text) {
        if (text == null) {
            return -1;
        }
        try {
            int port = Integer.parseInt(text.trim());
            return (port >= 0 && port <= 65535) ? port : -1;
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Reports whether the current {@link #proxy} works: the echo service must
     * answer through the proxy with an address that differs from the one
     * returned by {@link #getV4IP()}. When the local address could not be
     * determined (empty string), any echoed address is accepted.
     *
     * @return true when the proxy answered with a different address
     */
    public boolean statuOk() {
        String localIP = getV4IP();
        System.out.println("start...");
        try {
            String ipInfo = getHtml(IP_ECHO_URL);
            Matcher m = IPV4.matcher(ipInfo);
            if (m.find()) {
                String proxyIP = m.group(0);
                System.out.println("正在检测的代理ip:" + proxyIP);
                if (!localIP.equals(proxyIP)) {
                    System.out.println("本机ip:" + localIP);
                    return true;
                }
            }
        } catch (Exception e) {
            // A dead or slow proxy is an expected outcome; report and move on.
            System.out.println(e.getMessage());
        }
        return false;
    }

    /**
     * Downloads {@code address} through the current {@link #proxy}.
     *
     * <p>The bytes are carried through ISO-8859-1 (a lossless byte-to-char
     * mapping) and then decoded as gb2312; the result is trimmed and lower-cased.
     *
     * @throws Exception when the request fails or times out
     */
    private static String getHtml(String address) throws Exception {
        URL url = new URL(address);
        HttpURLConnection http = (HttpURLConnection) url.openConnection(proxy);
        http.setConnectTimeout(TIMEOUT_MILLIS);
        // Without a read timeout a proxy that accepts the connection but never
        // answers would block the check forever.
        http.setReadTimeout(TIMEOUT_MILLIS);
        http.setDoInput(true);
        http.setRequestMethod("GET");
        StringBuilder raw = new StringBuilder();
        try (BufferedInputStream in = new BufferedInputStream(http.getInputStream())) {
            byte[] buf = new byte[4096];
            int bytesRead;
            while ((bytesRead = in.read(buf)) >= 0) {
                raw.append(new String(buf, 0, bytesRead, "ISO-8859-1"));
            }
        } finally {
            http.disconnect();
        }
        return new String(raw.toString().trim().getBytes("ISO-8859-1"),
                "gb2312").toLowerCase();
    }

    /**
     * Asks the echo service over a direct connection for this machine's
     * public address.
     *
     * @return the first IPv4-looking token of the response, or an empty string
     *         when the request failed or the response contained none
     */
    public String getV4IP() {
        StringBuilder body = new StringBuilder();
        HttpURLConnection urlConnection = null;
        try {
            urlConnection = (HttpURLConnection) new URL(IP_ECHO_URL).openConnection();
            urlConnection.setConnectTimeout(TIMEOUT_MILLIS);
            urlConnection.setReadTimeout(TIMEOUT_MILLIS);
            urlConnection.setRequestProperty(
                    "User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36");
            urlConnection.setRequestProperty("Accept",
                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            urlConnection.setInstanceFollowRedirects(true);
            urlConnection.setRequestProperty("Connection", "keep-alive");
            urlConnection.setRequestProperty("Upgrade-Insecure-Requests", "1");
            urlConnection.connect();
            if (urlConnection.getResponseCode() == 200) {
                try (BufferedReader in = new BufferedReader(new InputStreamReader(
                        urlConnection.getInputStream(), "UTF-8"))) {
                    String line;
                    while ((line = in.readLine()) != null) {
                        body.append(line);
                    }
                }
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException and is covered here.
            e.printStackTrace();
        } finally {
            if (urlConnection != null) {
                urlConnection.disconnect();
            }
        }
        Matcher m = IPV4.matcher(body);
        return m.find() ? m.group(0) : "";
    }
}