httpclient常用基本抓取类

以下是我常用的抓取类。直接调用其中的方法,即可实现本机 IP 抓取、goagent 代理抓取和代理 IP 抓取,也可以下载文件、把页面内容保存到本地等。

package crawlMethodManager;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.DeflateDecompressingEntity;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.CharArrayBuffer;

@SuppressWarnings("deprecation")
public class CrawlMethodManager {

	// Host of the company proxy currently in use; "" means none has been
	// fetched yet (the proxy helpers test ip.equals("") to decide whether to
	// request one). Shared, unsynchronized static state.
	static String ip = "";
	// Port belonging to the proxy host stored in ip.
	static int port = 0;
	// Endpoint the proxy helpers query with doGet for a proxy address; the
	// response is split on ':' into host and port.
	static String ipUrl = "http://localhost:8080/ipFilter/getIp/getIp";
	

	// Shared client backed by a ThreadSafeClientConnManager; used by the
	// company-proxy POST helpers, which also set its proxy and timeouts.
	static HttpClient httpPostClient = new DefaultHttpClient(
			new ThreadSafeClientConnManager());

	/**
	 * Fetches a page with HTTP GET, trying up to three channels in a fixed
	 * order: the local goagent proxy, the company proxy, then a direct
	 * connection from this machine. Each enabled channel is retried until it
	 * yields a non-empty body or its attempt budget is exhausted; a later
	 * channel is only tried while the body is still empty.
	 * 
	 * @param url
	 *            String link to fetch
	 * @param encode
	 *            String charset name used when fetching/decoding
	 * @param goagentFlag
	 *            boolean whether goagent is enabled
	 * @param goagentNum
	 *            int number of goagent attempts
	 * @param companyFlag
	 *            boolean whether the company proxy is enabled
	 * @param companyNum
	 *            int number of company-proxy attempts
	 * @param localFlag
	 *            boolean whether the direct (local machine) fetch is enabled
	 * @param localNum
	 *            int number of direct attempts
	 * @return the page body, or "" when every enabled channel failed
	 */

	public String crawlPageContentByGet(String url, String encode,
			boolean goagentFlag, int goagentNum, boolean companyFlag,
			int companyNum, boolean localFlag, int localNum)
			throws ClientProtocolException, IOException {
		String content = "";
		if (goagentFlag) {
			for (int attempt = 0; attempt < goagentNum
					&& isEmptyBody(content); attempt++) {
				try {
					System.out.println("goagent正在请求");
					content = doGetByGoagent(url, encode);
				} catch (Exception e) {
					// Best effort: report the failure and move on to the
					// next attempt/channel instead of aborting the crawl.
					System.err.println("goagent request failed: " + e);
				}
			}
		}
		if (companyFlag) {
			for (int attempt = 0; attempt < companyNum
					&& isEmptyBody(content); attempt++) {
				try {
					System.out.println("公司代理ip正在请求");
					content = getByCompanyProxy(url, encode);
				} catch (Exception e) {
					System.err.println("company proxy request failed: " + e);
				}
			}
		}
		if (localFlag) {
			for (int attempt = 0; attempt < localNum
					&& isEmptyBody(content); attempt++) {
				try {
					System.out.println("本机正在请求");
					content = doGet(url, encode);
				} catch (Exception e) {
					System.err.println("direct request failed: " + e);
				}
			}
		}
		// A helper may hand back null; callers of this method expect "".
		return isEmptyBody(content) ? "" : content;
	}

	/**
	 * Tells whether a fetched body counts as "nothing received". Null is
	 * treated like "" so that a null body does not raise a
	 * NullPointerException in the retry conditions.
	 */
	private static boolean isEmptyBody(String body) {
		return body == null || body.equals("");
	}

	/**
	 * Fetches a page with HTTP GET, preferring goagent and falling back to
	 * the company proxy.
	 * <p>
	 * goagent is tried first; if it returns null or "", the company proxy is
	 * asked once. If either of those calls throws, the company proxy is tried
	 * again, and once more if that retry throws as well. Only the failure of
	 * the final retry is printed.
	 * 
	 * @param url
	 *            link to fetch
	 * @param encode
	 *            charset name used to decode the response
	 * @return the fetched body; may be "" (or whatever goagent returned) when
	 *         every attempt failed
	 * @author joe
	 * @date 2014-12-11
	 */
	public String crawlPageContentByGet(String url, String encode)
			throws ClientProtocolException, IOException {
		String page = "";
		try {
			page = doGetByGoagent(url, encode);
			if (page == null || "".equals(page)) {
				System.out.println("启用公司代理");
				page = getByCompanyProxy(url, encode);
			}
			return page;
		} catch (Exception goagentFailure) {
			// fall through to the company-proxy retries below
		}

		try {
			System.out.println("goagent连接失败,启用公司代理");
			return getByCompanyProxy(url, encode);
		} catch (Exception firstRetryFailure) {
			// one more attempt follows
		}

		try {
			return getByCompanyProxy(url, encode);
		} catch (Exception lastRetryFailure) {
			lastRetryFailure.printStackTrace();
		}
		return page;
	}

	/**
	 * Fetches {@code url} with HTTP GET through the company proxy.
	 * <p>
	 * Up to 11 requests are made. The proxy stored in the shared
	 * {@code ip}/{@code port} fields is reused until it is rejected
	 * (protocol error), its use counter exceeds 2, or none has been fetched
	 * yet; a fresh address is then requested from {@code ipUrl}. A response is
	 * accepted only when {@code resultTest} approves it. When the site answers
	 * with its "JumpSelf" verification page, the session-verify parameter
	 * found in that page is appended to the URL and the request is repeated.
	 * 
	 * @param url
	 *            page to fetch
	 * @param encode
	 *            charset name used to decode the response body
	 * @return the page body, or "" when no attempt produced an acceptable page
	 */
	private String getByCompanyProxy(String url, String encode) {
		final int maxAttempts = 11;
		String urlString = url;
		boolean newProxy = false;
		int oldProxyUsecount = 0;
		for (int i = 0; i < maxAttempts; i++) {
			if (newProxy || oldProxyUsecount > 2 || ip.equals("")) {
				oldProxyUsecount = 0;
				newProxy = false;
				// Without any proxy at all there is nothing to send through;
				// with a previous one, keep using it if rotation failed.
				if (!refreshCompanyProxy() && ip.equals("")) {
					return "";
				}
			}
			HttpHost proxyHost = new HttpHost(ip, port, null);
			System.out.println("正在使用代理" + ip + ":" + port);
			HttpGet httpRequst = new HttpGet(urlString);
			httpRequst.addHeader("Accept-Encoding", "gzip,deflate,sdch");
			httpRequst.getParams().setParameter(
					CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
			DefaultHttpClient httpClient = new DefaultHttpClient();
			httpClient.getParams().setParameter(
					CoreConnectionPNames.CONNECTION_TIMEOUT, 9000);// connect timeout, ms
			httpClient.getParams().setParameter(
					CoreConnectionPNames.SO_TIMEOUT, 9000);// socket read timeout, ms
			httpClient.getParams().setParameter(
					ConnRouteParams.DEFAULT_PROXY, proxyHost);
			try {
				HttpResponse httpResponse = httpClient.execute(httpRequst);
				int status = httpResponse.getStatusLine().getStatusCode();
				if (status == 200) {
					String result = "";
					HttpEntity httpEntity = httpResponse.getEntity();
					if (httpEntity != null) {
						if (httpEntity.getContentEncoding() != null) {
							String coding = httpEntity.getContentEncoding()
									.getValue();
							if ("gzip".equalsIgnoreCase(coding)) {
								httpEntity = new GzipDecompressingEntity(
										httpEntity);
							} else if ("deflate".equalsIgnoreCase(coding)) {
								httpEntity = new DeflateDecompressingEntity(
										httpEntity);
							}
						}
						String body = enCodetoString(httpEntity, encode);
						if (body != null) {
							result = body;
						}
					}
					if (resultTest(result)) {
						System.out.println(ip + "公司代理成功抓取" + url);
						return result;
					} else if (result.contains("function JumpSelf")) {
						// Verification page: retry with the session-verify
						// parameter it carries, or from the plain URL when
						// the parameter cannot be located.
						int indexs = result.indexOf("&WebShieldSessionVerify");
						int indexe = result.indexOf("\";}</script>");
						if (indexs >= 0 && indexe > indexs) {
							urlString = urlString
									+ result.substring(indexs, indexe);
						} else {
							urlString = url;
						}
					} else {
						System.out.println("网页含有错误特殊字符" + urlString);
						oldProxyUsecount++;
						System.out.println(result);
					}
				} else {
					System.out.println(status + " " + urlString + " 状态不为200");
				}
				// Every completed, unsuccessful request counts as one use of
				// the current proxy (error pages count twice, as before).
				oldProxyUsecount++;
			} catch (ClientProtocolException e) {
				newProxy = true;
				System.out.println(ip + "代理ip拒绝了");
			} catch (IOException e) {
				oldProxyUsecount++;
				System.out.println(ip + "代理读取超时");
			} finally {
				// The client is created per attempt; shut its connection
				// manager down so sockets are not leaked.
				httpClient.getConnectionManager().shutdown();
			}
		}
		return "";
	}

	/**
	 * Requests a fresh "host:port" address from {@code ipUrl} and stores it in
	 * the shared {@code ip}/{@code port} fields. A new address is requested on
	 * every call, and the number of attempts is bounded so an unavailable
	 * proxy service cannot stall the crawler.
	 * 
	 * @return true when a well-formed address was stored, false otherwise
	 */
	private boolean refreshCompanyProxy() {
		final int maxFetchAttempts = 5;
		for (int attempt = 0; attempt < maxFetchAttempts; attempt++) {
			System.out.println("ip为空,正在提取");
			String proxy;
			try {
				proxy = doGet(ipUrl, "gbk");
			} catch (IOException e) {
				System.out.println("代理ip提取失败 " + e);
				continue;
			}
			if (proxy == null || !proxy.contains(":")) {
				continue;
			}
			String[] proxys = proxy.replaceAll("\"|//|/|\r\n| ", "")
					.split(":");
			if (proxys.length < 2 || proxys[0].trim().equals("")) {
				continue;
			}
			try {
				int parsedPort = Integer.parseInt(proxys[1].trim());
				ip = proxys[0].trim();
				port = parsedPort;
				return true;
			} catch (NumberFormatException e) {
				System.out.println("代理ip格式错误 " + proxy);
			}
		}
		return false;
	}

	/**
	 * Fetches {@code url} with HTTP GET directly from this machine (no proxy).
	 * <p>
	 * Connect and read timeouts are 8000 ms. Only a 200 response is read;
	 * gzip/deflate bodies are decompressed according to the entity's content
	 * encoding. I/O failures are reported on stdout and turned into an empty
	 * result rather than propagated.
	 * 
	 * @param url
	 *            page to fetch
	 * @param encode
	 *            charset name used to decode the response body
	 * @return the decoded body, or "" on a non-200 status, a missing body or
	 *         an I/O failure (never null)
	 */
	private String doGet(String url, String encode)
			throws ClientProtocolException, IOException {
		String result = "";
		HttpGet httpRequst = new HttpGet(url);
		DefaultHttpClient httpClient = new DefaultHttpClient();
		try {
			httpClient.getParams().setParameter(
					CoreConnectionPNames.CONNECTION_TIMEOUT, 8000);// connect timeout, ms
			httpClient.getParams().setParameter(
					CoreConnectionPNames.SO_TIMEOUT, 8000);// socket read timeout, ms
			HttpResponse httpResponse = httpClient.execute(httpRequst);
			if (httpResponse.getStatusLine().getStatusCode() == 200) {
				HttpEntity httpEntity = httpResponse.getEntity();
				if (httpEntity != null) {
					if (httpEntity.getContentEncoding() != null) {
						String coding = httpEntity.getContentEncoding()
								.getValue();
						if ("gzip".equalsIgnoreCase(coding)) {
							httpEntity = new GzipDecompressingEntity(httpEntity);
						} else if ("deflate".equalsIgnoreCase(coding)) {
							httpEntity = new DeflateDecompressingEntity(
									httpEntity);
						}
					}
					String body = enCodetoString(httpEntity, encode);
					if (body != null) {
						result = body;
					}
				}
			}
		} catch (IOException e) {
			// ClientProtocolException is an IOException; both were reported
			// with this same message.
			System.out.println("doget代理读取超时");
		} finally {
			// The client is private to this call; shutting down its
			// connection manager closes any connection still open
			// (including the one of a non-200 response).
			httpClient.getConnectionManager().shutdown();
		}
		return result;
	}

	/**
	 * Fetches {@code url} with HTTP GET through the goagent proxy listening on
	 * 127.0.0.1:8087.
	 * <p>
	 * The connect timeout is 8000 ms and the read timeout 6000 ms. Only a 200
	 * response is read; gzip/deflate bodies are decompressed according to the
	 * entity's content encoding.
	 * 
	 * @param url
	 *            page to fetch
	 * @param encode
	 *            charset name used to decode the response body
	 * @return the decoded body, or "" on a non-200 status or a missing body
	 * @throws ClientProtocolException
	 *             on an HTTP protocol error
	 * @throws IOException
	 *             when the connection fails or times out
	 */
	private String doGetByGoagent(String url, String encode)
			throws ClientProtocolException, IOException {
		String result = "";
		HttpGet httpRequst = new HttpGet(url);
		httpRequst.addHeader("Accept-Encoding", "gzip,deflate,sdch");
		httpRequst.getParams().setParameter(
				CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
		DefaultHttpClient httpClient = new DefaultHttpClient();
		try {
			HttpHost proxyHost = new HttpHost("127.0.0.1", 8087, null);
			httpClient.getParams().setParameter(
					CoreConnectionPNames.CONNECTION_TIMEOUT, 8000);// connect timeout, ms
			httpClient.getParams().setParameter(
					CoreConnectionPNames.SO_TIMEOUT, 6000);// socket read timeout, ms
			httpClient.getParams().setParameter(
					ConnRouteParams.DEFAULT_PROXY, proxyHost);
			HttpResponse httpResponse = httpClient.execute(httpRequst);
			if (httpResponse.getStatusLine().getStatusCode() == 200) {
				HttpEntity httpEntity = httpResponse.getEntity();
				if (httpEntity != null) {
					if (httpEntity.getContentEncoding() != null) {
						String coding = httpEntity.getContentEncoding()
								.getValue();
						if ("gzip".equalsIgnoreCase(coding)) {
							httpEntity = new GzipDecompressingEntity(httpEntity);
						} else if ("deflate".equalsIgnoreCase(coding)) {
							httpEntity = new DeflateDecompressingEntity(
									httpEntity);
						}
					}
					String body = enCodetoString(httpEntity, encode);
					if (body != null) {
						result = body;
					}
				}
			}
		} finally {
			// The client is private to this call; shutting down its
			// connection manager releases the connection on every path.
			httpClient.getConnectionManager().shutdown();
		}
		return result;
	}

	/**
	 * Sends a form POST through goagent, retrying on failure.
	 * <p>
	 * The request is sent once and repeated once if the body comes back null
	 * or "". If either call throws, it is retried, and retried once more if
	 * that retry throws as well. Only the failure of the final retry is
	 * printed.
	 * 
	 * @param url
	 *            address to post to
	 * @param pram
	 *            request body
	 * @param encode
	 *            charset name used to decode the response
	 * @return the response body; "" (or whatever the first call returned)
	 *         when every attempt failed
	 */
	public String crawlPageContentByPost(String url, String pram, String encode)
			throws ClientProtocolException, IOException {
		String page = "";
		try {
			page = doPostByGoagent(url, pram, encode);
			if (page == null || "".equals(page)) {
				page = doPostByGoagent(url, pram, encode);
			}
			return page;
		} catch (Exception firstFailure) {
			// fall through to the retries below
		}

		try {
			return doPostByGoagent(url, pram, encode);
		} catch (Exception secondFailure) {
			// one more attempt follows
		}

		try {
			return doPostByGoagent(url, pram, encode);
		} catch (Exception lastFailure) {
			lastFailure.printStackTrace();
		}
		return page;
	}

	/**
	 * Sends {@code parm} as an application/x-www-form-urlencoded POST to
	 * {@code url} through the goagent proxy listening on 127.0.0.1:8087.
	 * <p>
	 * Connect and read timeouts are 8000 ms. Only a 200 response is read;
	 * gzip/deflate bodies are decompressed according to the entity's content
	 * encoding.
	 * 
	 * @param url
	 *            address to post to
	 * @param parm
	 *            request body
	 * @param encode
	 *            charset name used to encode the request body and decode the
	 *            response
	 * @return the decoded response body, or "" on a non-200 status or a
	 *         missing body
	 * @throws ClientProtocolException
	 *             on an HTTP protocol error
	 * @throws IOException
	 *             when the connection fails or times out
	 */
	private String doPostByGoagent(String url, String parm, String encode)
			throws ClientProtocolException, IOException {
		String result = "";
		HttpPost httpRequst = new HttpPost(url);
		HttpHost proxy = new HttpHost("127.0.0.1", 8087, null);
		// Encode the body with the requested charset; the single-argument
		// constructor falls back to the library default charset.
		StringEntity entity = new StringEntity(parm, encode);
		entity.setContentType("application/x-www-form-urlencoded");
		// NOTE(review): this sends the charset name as the Content-Encoding
		// request header; kept because target servers have been receiving
		// it - confirm before removing.
		entity.setContentEncoding(encode);
		httpRequst.setEntity(entity);
		DefaultHttpClient httpClient = new DefaultHttpClient();
		try {
			httpClient.getParams().setParameter(
					CoreConnectionPNames.CONNECTION_TIMEOUT, 8000);// connect timeout, ms
			httpClient.getParams().setParameter(
					CoreConnectionPNames.SO_TIMEOUT, 8000);// socket read timeout, ms
			httpClient.getParams().setParameter(
					ConnRouteParams.DEFAULT_PROXY, proxy);
			HttpResponse httpResponse = httpClient.execute(httpRequst);
			if (httpResponse.getStatusLine().getStatusCode() == 200) {
				HttpEntity httpEntity = httpResponse.getEntity();
				if (httpEntity != null) {
					if (httpEntity.getContentEncoding() != null) {
						String coding = httpEntity.getContentEncoding()
								.getValue();
						if ("gzip".equalsIgnoreCase(coding)) {
							httpEntity = new GzipDecompressingEntity(httpEntity);
						} else if ("deflate".equalsIgnoreCase(coding)) {
							httpEntity = new DeflateDecompressingEntity(
									httpEntity);
						}
					}
					String body = enCodetoString(httpEntity, encode);
					if (body != null) {
						result = body;
					}
				}
			}
		} finally {
			// The client is private to this call; shutting down its
			// connection manager releases the connection on every path,
			// including unread non-200 responses.
			httpClient.getConnectionManager().shutdown();
		}
		return result;
	}

	/**
	 * Sends {@code parm} as an application/x-www-form-urlencoded POST to
	 * {@code url} directly from this machine (no proxy).
	 * <p>
	 * Connect and read timeouts are 8000 ms. Only a 200 response is read;
	 * gzip/deflate bodies are decompressed according to the entity's content
	 * encoding.
	 * 
	 * @param url
	 *            address to post to
	 * @param parm
	 *            request body
	 * @param encode
	 *            charset name used to encode the request body and decode the
	 *            response
	 * @return the decoded response body, or "" on a non-200 status or a
	 *         missing body
	 * @throws ClientProtocolException
	 *             on an HTTP protocol error
	 * @throws IOException
	 *             when the connection fails or times out
	 */
	public String doPost(String url, String parm, String encode)
			throws ClientProtocolException, IOException {
		String result = "";
		HttpPost httpRequst = new HttpPost(url);
		// Encode the body with the requested charset; the single-argument
		// constructor falls back to the library default charset.
		StringEntity entity = new StringEntity(parm, encode);
		entity.setContentType("application/x-www-form-urlencoded");
		// NOTE(review): this sends the charset name as the Content-Encoding
		// request header; kept because target servers have been receiving
		// it - confirm before removing.
		entity.setContentEncoding(encode);
		httpRequst.setEntity(entity);
		DefaultHttpClient httpClient = new DefaultHttpClient();
		try {
			httpClient.getParams().setParameter(
					CoreConnectionPNames.CONNECTION_TIMEOUT, 8000);// connect timeout, ms
			httpClient.getParams().setParameter(
					CoreConnectionPNames.SO_TIMEOUT, 8000);// socket read timeout, ms
			HttpResponse httpResponse = httpClient.execute(httpRequst);
			if (httpResponse.getStatusLine().getStatusCode() == 200) {
				HttpEntity httpEntity = httpResponse.getEntity();
				if (httpEntity != null) {
					if (httpEntity.getContentEncoding() != null) {
						String coding = httpEntity.getContentEncoding()
								.getValue();
						if ("gzip".equalsIgnoreCase(coding)) {
							httpEntity = new GzipDecompressingEntity(httpEntity);
						} else if ("deflate".equalsIgnoreCase(coding)) {
							httpEntity = new DeflateDecompressingEntity(
									httpEntity);
						}
					}
					String body = enCodetoString(httpEntity, encode);
					if (body != null) {
						result = body;
					}
				}
			}
		} finally {
			// The client is private to this call; shutting down its
			// connection manager releases the connection on every path,
			// including unread non-200 responses.
			httpClient.getConnectionManager().shutdown();
		}
		return result;
	}

	/**
	 * Sends a form POST through the company proxy using the shared
	 * {@code httpPostClient}, making up to 6 passes (i = 0..5).
	 * <p>
	 * A pass first calls {@code postByCompanyProxyBoolean} when a new proxy is
	 * wanted, the use counter exceeds 2, or no proxy ip is known; the POST
	 * itself is only sent while that check last returned true. A 200 body is
	 * returned when {@code resultTest} accepts it; a 302 whose location
	 * contains "tabid=26" is followed with a GET via {@code getByHttpClient}.
	 * <p>
	 * NOTE(review): the POST always targets {@code url}; {@code urlString} is
	 * updated from the verification page / redirect but is only used for the
	 * follow-up GET. Confirm whether that is intended.
	 * 
	 * @param url
	 *            address to post to
	 * @param parm
	 *            request body
	 * @param encode
	 *            charset name set on the request and used to decode responses
	 * @return the accepted page body, or "" when no pass produced one
	 */
	@SuppressWarnings("unused")
	private String postByCompanyProxy(String url, String parm, String encode)
			throws ClientProtocolException, IOException {
		int count = 5;
		String result = "";
		String urlString = url;
		// Result of the last proxy verification; stays false until one runs.
		boolean okProxy = false;
		// Set when the proxy rejected the request (protocol error).
		boolean newProxy = false;
		// Incremented on read failures and on passes skipped for lack of a
		// verified proxy.
		int oldProxyUsecount = 0;
		for (int i = 0; i <= count; i++) {

			try {
				if (newProxy || oldProxyUsecount > 2 || ip.equals("")) {
					okProxy = postByCompanyProxyBoolean(url, parm, encode);
				}
				if (okProxy) {
					System.out.println("正在使用代理" + ip + ":" + port);
					HttpPost httpRequst = new HttpPost(url);// create the HttpPost request
					StringEntity entity = new StringEntity(parm);
					entity.setContentType("application/x-www-form-urlencoded");
					httpRequst.setEntity(entity);
					httpRequst.getParams().setParameter(
							CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
					HttpResponse httpResponse = httpPostClient
							.execute(httpRequst);// sent on the shared client
					if (httpResponse.getStatusLine().getStatusCode() == 200) {
						HttpEntity httpEntity = httpResponse.getEntity();
						// Unwrap compressed bodies before decoding.
						if (httpEntity.getContentEncoding() != null) {
							if ("gzip".equalsIgnoreCase(httpEntity
									.getContentEncoding().getValue())) {
								httpEntity = new GzipDecompressingEntity(
										httpEntity);
							} else if ("deflate".equalsIgnoreCase(httpEntity
									.getContentEncoding().getValue())) {
								httpEntity = new DeflateDecompressingEntity(
										httpEntity);
							}
						}
						result = enCodetoString(httpEntity, encode);// read the response text
						// System.out.println(result);
						if (resultTest(result)) {
							return result;
						} else if (result.contains("function JumpSelf")
								&& result.contains("WebShieldSessionVerify")) {
							// Verification page: pick the session-verify
							// parameter out of the script and append it.
							int indexs = result
									.indexOf("&WebShieldSessionVerify");
							int indexe = result.indexOf("\";}</script>");
							String verify = result.substring(indexs, indexe);
							urlString = urlString + verify;
							newProxy = false;
						} else if (result.contains("function JumpSelf")
								&& !result.contains("WebShieldSessionVerify")) {
							// Jump page without the parameter: start over
							// from the original URL.
							urlString = url;
							newProxy = false;
						}
					} else if (httpResponse.getStatusLine().getStatusCode() == 302) {
						System.out.println("重定向了");
						Header header = httpResponse.getFirstHeader("location");
						if (header != null) {
							urlString = header.getValue();
							System.out.println(urlString);
							if (urlString.contains("tabid=26")) {
								// The location is prefixed with the site
								// host and fetched with GET.
								urlString = "http://www.landchina.com"
										+ urlString;
								result = getByHttpClient(urlString, encode,
										httpPostClient);
								if (resultTest(result)) {
									System.out.println(i + "公司代理成功抓取" + url);
									return result;
								}
								newProxy = false;
							}
							newProxy = false;
						}
					} else {
						httpRequst.abort();
					}
				} else {
					oldProxyUsecount++;
				}
			} catch (ClientProtocolException e) {
				newProxy = true;
				System.out.println(ip + "代理ip拒绝了");
			} catch (IOException e) {
				oldProxyUsecount++;
				System.out.println(ip + "代理读取超时");
			}
		}
		return "";
	}

	/**
	 * Fetches {@code url} with HTTP GET on the supplied client, making up to
	 * three attempts.
	 * <p>
	 * A 200 body is returned as soon as {@code resultTest} accepts it. When
	 * the body is the site's "JumpSelf" verification page, the next attempt
	 * uses the URL with the page's session-verify parameter appended, or the
	 * original URL when the page carries no such parameter. Protocol and I/O
	 * errors are reported on stdout and consume an attempt.
	 * 
	 * @param url
	 *            page to fetch
	 * @param encode
	 *            charset name used to decode the response body
	 * @param httpClient
	 *            client the requests are executed on
	 * @return the accepted page body, or "" when no attempt produced one
	 */
	private String getByHttpClient(String url, String encode,
			HttpClient httpClient) {
		String target = url;
		for (int attempt = 0; attempt < 3; attempt++) {
			try {
				HttpGet get = new HttpGet(target);
				get.setHeader("Content-Type",
						"application/x-www-form-urlencoded");
				HttpResponse response = httpClient.execute(get);
				if (response.getStatusLine().getStatusCode() != 200) {
					get.abort();
					continue;
				}

				HttpEntity body = response.getEntity();
				Header coding = body.getContentEncoding();
				if (coding != null) {
					String codingName = coding.getValue();
					if ("gzip".equalsIgnoreCase(codingName)) {
						body = new GzipDecompressingEntity(body);
					} else if ("deflate".equalsIgnoreCase(codingName)) {
						body = new DeflateDecompressingEntity(body);
					}
				}

				String page = enCodetoString(body, encode);
				if (resultTest(page)) {
					System.out.println(ip + "公司代理成功抓取" + url);
					return page;
				}
				if (page.contains("function JumpSelf")) {
					if (page.contains("WebShieldSessionVerify")) {
						int from = page.indexOf("&WebShieldSessionVerify");
						int to = page.indexOf("\";}</script>");
						target = target + page.substring(from, to);
					} else {
						target = url;
					}
				}
			} catch (ClientProtocolException e) {
				System.out.println(ip + "代理ip拒绝了");
			} catch (IOException e) {
				System.out.println(ip + "代理读取超时");
			}
		}
		return "";
	}

	/**
	 * Checks that the current company proxy can get past the site's security
	 * verification. (Original note: the first visit from a new ip must pass
	 * the security check and only receives the home page, so one verification
	 * request is made before the real POST.)
	 * <p>
	 * Makes up to 11 passes (i = 0..10). Each pass optionally picks a proxy
	 * from {@code ipUrl}, configures the shared {@code httpPostClient} with
	 * timeouts and that proxy, and POSTs {@code parm} to {@code url}. A 200
	 * body accepted by {@code resultTest} ends the check successfully;
	 * verification pages and 302 redirects whose URL contains "tabid=26" are
	 * followed with a GET via {@code getByHttpClient}.
	 * <p>
	 * NOTE(review): {@code proxy} is a local that is only fetched while it is
	 * empty or lacks ':', so later "new proxy" passes re-parse the same
	 * address; and {@code proxyHost} stays null (no proxy set on the client)
	 * when {@code ip} is already known on entry. Confirm whether either is
	 * intended.
	 * 
	 * @param url
	 *            address to post to
	 * @param parm
	 *            request body
	 * @param encode
	 *            charset name set on the request and used to decode responses
	 * @return true when an acceptable page was obtained, false otherwise
	 */
	private Boolean postByCompanyProxyBoolean(String url, String parm,
			String encode) throws ClientProtocolException, IOException {
		int count = 10;
		String result = "";
		String urlString = url;
		String proxy = "";
		HttpHost proxyHost = null;
		// Set when the proxy rejected the request (protocol error).
		boolean newProxy = false;
		// Incremented on read failures; above 2 triggers proxy selection.
		int oldProxyUsecount = 0;
		for (int i = 0; i <= count; i++) {

			try {
				if (newProxy || oldProxyUsecount > 2 || ip.equals("")) {
					oldProxyUsecount = 0;
					String[] proxys = null;
					try {
						// Poll the proxy service until the answer looks like
						// "host:port" (loops for as long as it does not).
						while (proxy.equals("") || !proxy.contains(":")) {
							System.out.println("ip为空,正在提取");
							proxy = doGet(ipUrl, "gbk");
						}
						// Strip quotes, slashes, CRLF and spaces, then split
						// into host and port.
						proxys = proxy.replaceAll("\"|//|/|\r\n| | ", "")
								.split(":");
					} catch (Exception e) {
						// Same procedure repeated once after a failure.
						while (proxy.equals("") || !proxy.contains(":")) {
							System.out.println("ip为空,正在提取");
							proxy = doGet(ipUrl, "gbk");
						}
						proxys = proxy.replaceAll("\"|//|/|\r\n| ", "").split(
								":");
					}
					ip = proxys[0];
					port = Integer.parseInt(proxys[1]);
					proxyHost = new HttpHost(ip, port, null);
				}
				System.out.println("正在使用代理" + ip + ":" + port);
				HttpPost httpRequst = new HttpPost(url);// create the HttpPost request
				StringEntity entity = new StringEntity(parm);
				entity.setContentType("application/x-www-form-urlencoded");
				httpRequst.setEntity(entity);
				httpRequst.getParams().setParameter(
						CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
				httpPostClient.getParams().setParameter(
						CoreConnectionPNames.CONNECTION_TIMEOUT, 10000);// connect timeout, ms
				httpPostClient.getParams().setParameter(
						CoreConnectionPNames.SO_TIMEOUT, 8000);// socket read timeout, ms
				httpPostClient.getParams().setParameter(
						ConnRouteParams.DEFAULT_PROXY, proxyHost);
				HttpResponse httpResponse = httpPostClient.execute(httpRequst);// sent on the shared client
				if (httpResponse.getStatusLine().getStatusCode() == 200) {
					HttpEntity httpEntity = httpResponse.getEntity();
					// Unwrap compressed bodies before decoding.
					if (httpEntity.getContentEncoding() != null) {
						if ("gzip".equalsIgnoreCase(httpEntity
								.getContentEncoding().getValue())) {
							httpEntity = new GzipDecompressingEntity(httpEntity);
						} else if ("deflate".equalsIgnoreCase(httpEntity
								.getContentEncoding().getValue())) {
							httpEntity = new DeflateDecompressingEntity(
									httpEntity);
						}
					}
					result = enCodetoString(httpEntity, encode);// read the response text
					// System.out.println(result);
					if (resultTest(result)) {
						return true;
					} else if (result.contains("function JumpSelf")
							&& result.contains("WebShieldSessionVerify")) {
						// Verification page: append its session-verify
						// parameter and, for "tabid=26" URLs, fetch the
						// result with GET.
						int indexs = result.indexOf("&WebShieldSessionVerify");
						int indexe = result.indexOf("\";}</script>");
						String verify = result.substring(indexs, indexe);
						urlString = urlString + verify;
						if (urlString.contains("tabid=26")
								&& !urlString.contains("landchina")) {
							// URL without the site host: prefix it first.
							urlString = "http://www.landchina.com" + urlString;
							result = getByHttpClient(urlString, encode,
									httpPostClient);
							if (resultTest(result)) {
								System.out.println(ip + "公司代理成功抓取" + url);
								return true;
							}
							newProxy = false;
						} else if (urlString.contains("tabid=26")
								&& urlString.contains("landchina")) {
							result = getByHttpClient(urlString, encode,
									httpPostClient);
							if (resultTest(result)) {
								System.out.println(ip + "公司代理成功抓取" + url);
								return true;
							}
							newProxy = false;
						}
						newProxy = false;
					} else if (result.contains("function JumpSelf")
							&& !result.contains("WebShieldSessionVerify")) {
						// Jump page without the parameter: start over from
						// the original URL.
						urlString = url;
						newProxy = false;
					}
				} else if (httpResponse.getStatusLine().getStatusCode() == 302) {
					System.out.println("重定向了");
					Header header = httpResponse.getFirstHeader("location");
					if (header != null) {
						// Follow the redirect target with GET, using the same
						// "tabid=26" / host-prefix rules as above.
						urlString = header.getValue();
						System.out.println(urlString);
						if (urlString.contains("tabid=26")
								&& !urlString.contains("landchina")) {
							urlString = "http://www.landchina.com" + urlString;
							result = getByHttpClient(urlString, encode,
									httpPostClient);
							if (resultTest(result)) {
								System.out.println(ip + "公司代理成功抓取" + url);
								return true;
							}
							newProxy = false;
						} else if (urlString.contains("tabid=26")
								&& urlString.contains("landchina")) {
							result = getByHttpClient(urlString, encode,
									httpPostClient);
							if (resultTest(result)) {
								System.out.println(ip + "公司代理成功抓取" + url);
								return true;
							}
							newProxy = false;
						}
						newProxy = false;
					}
				} else {
					httpRequst.abort();
				}
			} catch (ClientProtocolException e) {
				newProxy = true;
				System.out.println(ip + "代理ip拒绝了");
			} catch (IOException e) {
				oldProxyUsecount++;
				System.out.println(ip + "代理读取超时");
			}
		}
		return false;
	}

	/**
	 * Decides whether a fetched body looks like a usable page.
	 * <p>
	 * The body is rejected when it is "" or exactly "100", or when it contains
	 * any of the known error, firewall or jump-page markers listed below.
	 * 
	 * @param result
	 *            response body to inspect (must not be null)
	 * @return true when none of the rejection rules match
	 */
	private Boolean resultTest(String result) {
		if (result.equals("") || result.equals("100")) {
			return false;
		}
		final String[] rejectMarkers = { "<title>blank", "Error Page Messages",
				"<title>404", "您的访问出错了", "302 Found", "出错页面", "没有找到这篇文章!",
				"特定于实例的错误", "错误 404", "Error report", "function JumpSelf",
				"refused", "网站防火墙", "无法解析服务器", "STATUS OK", "refresh",
				"DownloadError", "Not Found", "Runtime Error",
				"Service Unavailable" };
		for (String marker : rejectMarkers) {
			if (result.contains(marker)) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Reads an entity's content into a string, decoding it with the named
	 * charset.
	 * 
	 * @param entity
	 *            entity whose content is read
	 * @param defaultCharset
	 *            charset name; null lets {@code enCodetoStringDo} choose its
	 *            fallback
	 * @return the decoded content, as returned by {@code enCodetoStringDo}
	 */
	public static String enCodetoString(final HttpEntity entity,
			final String defaultCharset) throws IOException, ParseException {
		Charset decodeWith = null;
		if (defaultCharset != null) {
			decodeWith = Charset.forName(defaultCharset);
		}
		return enCodetoStringDo(entity, decodeWith);
	}

	/**
	 * Reads an entity's content stream fully and decodes it to a string.
	 * <p>
	 * The charset declared by the entity itself is not consulted: the content
	 * is decoded with {@code defaultCharset}, or with
	 * {@code HTTP.DEF_CONTENT_CHARSET} when that is null. The content stream
	 * is always closed.
	 * 
	 * @param entity
	 *            entity to read; must not be null
	 * @param defaultCharset
	 *            charset to decode with, may be null
	 * @return the decoded content, or null when the entity has no content
	 *         stream
	 * @throws IllegalArgumentException
	 *             when the entity is null or declares a length above
	 *             Integer.MAX_VALUE
	 * @throws IOException
	 *             when reading the stream fails
	 */
	public static String enCodetoStringDo(final HttpEntity entity,
			Charset defaultCharset) throws IOException, ParseException {
		if (entity == null) {
			throw new IllegalArgumentException("HTTP entity may not be null");
		}
		final InputStream content = entity.getContent();
		if (content == null) {
			return null;
		}
		try {
			final long declaredLength = entity.getContentLength();
			if (declaredLength > Integer.MAX_VALUE) {
				throw new IllegalArgumentException(
						"HTTP entity too large to be buffered in memory");
			}
			// Pre-size the buffer from the declared length; fall back to
			// 4096 when the narrowed length is negative.
			int capacity = (int) declaredLength;
			if (capacity < 0) {
				capacity = 4096;
			}
			final Charset decodeWith = defaultCharset != null ? defaultCharset
					: HTTP.DEF_CONTENT_CHARSET;
			final Reader decoder = new InputStreamReader(content, decodeWith);
			final CharArrayBuffer collected = new CharArrayBuffer(capacity);
			final char[] chunk = new char[1024];
			for (int read = decoder.read(chunk); read != -1; read = decoder
					.read(chunk)) {
				collected.append(chunk, 0, read);
			}
			return collected.toString();
		} finally {
			content.close();
		}
	}
	
	/**
	 * Saves content to {@code <topName>:\<tagName>/<fileName>.<type>},
	 * creating the folder first when it does not exist, and delegating the
	 * actual writing to {@code write}.
	 * 
	 * @param topName
	 *            drive name (used as the part before ":\")
	 * @param fileName
	 *            file name without extension
	 * @param tagName
	 *            folder name
	 * @param type
	 *            file extension
	 * @param content
	 *            content to save
	 * @author joe
	 * @date 2015-3-6
	 */
	public static void writeToFile(String topName, String fileName,
			String tagName, String type, String content) {
		File folder = new File(topName + ":\\" + tagName);
		try {
			if (!(folder.exists() || folder.isDirectory())) {
				if (folder.mkdirs()) {
					System.out.println(" ok:创建文件夹成功! ");
				} else {
					System.out.println(" err:创建文件夹失败! ");
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		write(folder + "/" + fileName + "." + type, content);
	}

	/**
	 * Appends content to a text file, creating the file when it is missing.
	 * <p>
	 * The existing content is read line by line (each line re-terminated with
	 * "\n"), echoed to stdout, and written back followed by {@code content}.
	 * Both the reader and the writer are closed on every path.
	 * 
	 * @param path
	 *            file to write
	 * @param content
	 *            text to append
	 * @return true when the file was written, false when any step failed (the
	 *         failure is printed)
	 */
	public static boolean write(String path, String content) {
		BufferedReader input = null;
		BufferedWriter output = null;
		try {
			File f = new File(path);
			if (!f.exists()) {
				System.out.println("文件不存在,正在创建...");
				if (f.createNewFile()) {
					System.out.println("文件创建成功!");
				} else {
					System.out.println("文件创建失败!");
				}
			}
			// Collect the current content in a builder instead of repeated
			// string concatenation.
			StringBuilder merged = new StringBuilder();
			input = new BufferedReader(new FileReader(f));
			String line;
			while ((line = input.readLine()) != null) {
				merged.append(line).append("\n");
			}
			System.out.println("原文件内容:" + merged);
			// Release the read handle before the same file is reopened for
			// writing.
			input.close();
			input = null;
			merged.append(content);
			output = new BufferedWriter(new FileWriter(f));
			output.write(merged.toString());
			output.flush();
			return true;
		} catch (Exception e) {
			e.printStackTrace();
			return false;
		} finally {
			if (input != null) {
				try {
					input.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (output != null) {
				try {
					output.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Downloads a file from a URL to
	 * {@code <topName>:/<tagName>/<fileName>.<type>}, creating the folder when
	 * it does not exist. The running byte count is printed after every chunk.
	 * I/O failures are printed, not propagated; both streams are closed on
	 * every path.
	 * 
	 * @param fileUrl
	 *            link of the file to download
	 * @param topName
	 *            drive name (used as the part before ":/")
	 * @param fileName
	 *            file name without extension
	 * @param tagName
	 *            folder name
	 * @param type
	 *            file extension
	 * @author joe
	 * @date 2015-3-6
	 */
	public void downLoadFile(String fileUrl, String topName, String fileName,
			String tagName, String type) {
		int bytesum = 0;
		int byteread = 0;
		InputStream inStream = null;
		FileOutputStream fs = null;
		try {
			URL url = new URL(fileUrl);
			URLConnection conn = url.openConnection();
			inStream = conn.getInputStream();
			File fileD = new File(topName + ":/" + tagName);
			// Create the folder when it does not exist yet.
			if (!fileD.exists() && !fileD.isDirectory()) {
				System.out.println("正在新建目录");
				fileD.mkdirs();
			} else {
				System.out.println("目录存在");
			}
			// FileOutputStream creates the target file itself when needed.
			fs = new FileOutputStream(topName + ":/" + tagName + "/"
					+ fileName + "." + type);
			byte[] buffer = new byte[1204];
			while ((byteread = inStream.read(buffer)) != -1) {
				bytesum += byteread;
				System.out.println(bytesum);
				fs.write(buffer, 0, byteread);
			}
			System.out.println("downloaded ok");
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (fs != null) {
				try {
					fs.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (inStream != null) {
				try {
					inStream.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Manual entry point; it only creates a crawler instance and issues no
	 * requests.
	 */
	public static void main(String[] args) throws ClientProtocolException,
			IOException {
		CrawlMethodManager crawler = new CrawlMethodManager();
	}

}



你可能感兴趣的:(java,httpclient,html)