以下是我常用的抓取类,直接调用其中的方法即可实现本机 IP 抓取、goagent 代理抓取、代理 IP 抓取,以及文件下载、将页面内容保存到本地等功能。
package crawlMethodManager; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLConnection; import java.nio.charset.Charset; import java.nio.charset.UnsupportedCharsetException; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.ParseException; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.HttpClient; import org.apache.http.client.entity.DeflateDecompressingEntity; import org.apache.http.client.entity.GzipDecompressingEntity; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.conn.params.ConnRouteParams; import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.params.CoreProtocolPNames; import org.apache.http.protocol.HTTP; import org.apache.http.util.CharArrayBuffer; @SuppressWarnings("deprecation") public class CrawlMethodManager { static String ip = ""; static int port = 0; static String ipUrl = "http://localhost:8080/ipFilter/getIp/getIp"; static HttpClient httpPostClient = new DefaultHttpClient( new ThreadSafeClientConnManager()); /** * httpClient的get方法 * * @param url * String 要抓取的链接 * @param encode * String 抓取时使用的编码 * @param goagentFlag * boolean 是否启用goagent * @param goagentNum * int goagent尝试的次数 * @param companyFlag * boolean 是否启用代理 * @param companyNum * int 代理尝试的次数 * @param localFlag * boolean 是否启用本机 * @param localNum * int 本机尝试的次数 */ public String 
crawlPageContentByGet(String url, String encode, boolean goagentFlag, int goagentNum, boolean companyFlag, int companyNum, boolean localFlag, int localNum) throws ClientProtocolException, IOException { String content = ""; if (goagentFlag && content.equals("")) { int goagentCount = 0; while (content.equals("") && goagentCount < goagentNum) { try { System.out.println("goagent正在请求"); content = doGetByGoagent(url, encode); } catch (Exception e) { // System.out.println("goagent请求失败"); } goagentCount++; } } if (companyFlag && content.equals("")) { int companyCount = 0; while (content.equals("") && companyCount < companyNum) { try { System.out.println("公司代理ip正在请求"); content = getByCompanyProxy(url, encode); } catch (Exception e) { // System.out.println("公司代理ip请求失败"); } companyCount++; } } if (localFlag && content.equals("")) { int localCount = 0; while (content.equals("") && localCount < localNum) { try { System.out.println("本机正在请求"); content = doGet(url, encode); } catch (Exception e) { // System.out.println("本机请求失败"); } localCount++; } } return content; } /** * * @Description: get web content * @param @param url * @param @param encode * @param @return * @param @throws ClientProtocolException * @param @throws IOException * @return String * @throws * @author joe * @date 2014-12-11 */ public String crawlPageContentByGet(String url, String encode) throws ClientProtocolException, IOException { String content = ""; try { content = doGetByGoagent(url, encode); if (content == null || content.equals("")) { System.out.println("启用公司代理"); content = getByCompanyProxy(url, encode); // if (content == null || content.equals("")) { // System.out.println("启用本机"); // content = doGet(url, encode); // } } } catch (Exception e) { try { System.out.println("goagent连接失败,启用公司代理"); content = getByCompanyProxy(url, encode); // if (content == null || content.equals("")) { // System.out.println("公司代理连接失败,启用本机"); // content = doGet(url, encode); // } } catch (Exception e2) { try { content = 
getByCompanyProxy(url, encode); // e2.printStackTrace(); // System.out.println("公司代理连接失败,5秒后启用本机"); // Thread.sleep(5000); // content = doGet(url, encode); } catch (Exception e3) { e3.printStackTrace(); } } } return content; } private String getByCompanyProxy(String url, String encode) { int count = 10; String result = ""; String urlString = url; String proxy = ""; HttpHost proxyHost = null; boolean newProxy = false; int oldProxyUsecount = 0; for (int i = 0; i <= count; i++) { if (!ip.equals("")) { proxyHost = new HttpHost(ip, port, null); } try { if (newProxy || oldProxyUsecount > 2 || ip.equals("")) { oldProxyUsecount = 0; String[] proxys = null; try { while (proxy.equals("") || !proxy.contains(":")) { System.out.println("ip为空,正在提取"); proxy = doGet(ipUrl, "gbk"); } proxys = proxy.replaceAll("\"|//|/|\r\n| ", "").split( ":"); } catch (Exception e) { while (proxy.equals("") || !proxy.contains(":")) { System.out.println("ip为空,正在提取"); proxy = doGet(ipUrl, "gbk"); } proxys = proxy.replaceAll("\"|//|/|\r\n| ", "").split( ":"); // proxy = doGet( // , // "gbk"); // proxys = proxy.split(":"); } ip = proxys[0]; port = Integer.parseInt(proxys[1]); proxyHost = new HttpHost(ip, port, null); } System.out.println("正在使用代理" + ip + ":" + port + ":" + port); HttpGet httpRequst = new HttpGet(urlString); httpRequst.addHeader("Accept-Encoding", "gzip,deflate,sdch"); httpRequst.getParams().setParameter( CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode); DefaultHttpClient httpClient = new DefaultHttpClient(); httpClient.getParams().setParameter( CoreConnectionPNames.CONNECTION_TIMEOUT, 9000);// 连接时间20s httpClient.getParams().setParameter( CoreConnectionPNames.SO_TIMEOUT, 9000);// 数据传输时间60s httpClient.getParams().setParameter( ConnRouteParams.DEFAULT_PROXY, proxyHost); HttpResponse httpResponse = httpClient.execute(httpRequst);// 其中HttpGet是HttpUriRequst的子类 if (httpResponse.getStatusLine().getStatusCode() == 200) { HttpEntity httpEntity = httpResponse.getEntity(); if 
(httpEntity.getContentEncoding() != null) { if ("gzip".equalsIgnoreCase(httpEntity .getContentEncoding().getValue())) { httpEntity = new GzipDecompressingEntity(httpEntity); } else if ("deflate".equalsIgnoreCase(httpEntity .getContentEncoding().getValue())) { httpEntity = new DeflateDecompressingEntity( httpEntity); } } result = enCodetoString(httpEntity, encode);// 取出应答字符串 if (resultTest(result)) { System.out.println(ip + "公司代理成功抓取" + url); return result; } else if (result.contains("function JumpSelf") && result.contains("WebShieldSessionVerify")) { int indexs = result.indexOf("&WebShieldSessionVerify"); int indexe = result.indexOf("\";}</script>"); String verify = result.substring(indexs, indexe); urlString = urlString + verify; newProxy = false; } else if (result.contains("function JumpSelf") && !result.contains("WebShieldSessionVerify")) { urlString = url; newProxy = false; } else { System.out.println("网页含有错误特殊字符" + urlString); oldProxyUsecount++; System.out.println(result); } } else System.out.println(httpResponse.getStatusLine() .getStatusCode() + " " + urlString + " 状态不为200"); oldProxyUsecount++; httpRequst.abort(); } catch (ClientProtocolException e) { newProxy = true; System.out.println(ip + "代理ip拒绝了"); } catch (IOException e) { oldProxyUsecount++; System.out.println(ip + "代理读取超时"); } } return ""; } private String doGet(String url, String encode) throws ClientProtocolException, IOException { String result = ""; try { HttpGet httpRequst = new HttpGet(url); // httpRequst.addHeader("Content-Type", "text/html;charset=" + // encode); // httpRequst.getParams().setParameter( // CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode); DefaultHttpClient httpClient = new DefaultHttpClient(); // httpClient.getParams().setParameter( // CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode); httpClient.getParams().setParameter( CoreConnectionPNames.CONNECTION_TIMEOUT, 8000);// 连接时间20s httpClient.getParams().setParameter( CoreConnectionPNames.SO_TIMEOUT, 8000);// 数据传输时间60s 
HttpResponse httpResponse = httpClient.execute(httpRequst);// 其中HttpGet是HttpUriRequst的子类 if (httpResponse.getStatusLine().getStatusCode() == 200) { HttpEntity httpEntity = httpResponse.getEntity(); if (httpEntity.getContentEncoding() != null) { if ("gzip".equalsIgnoreCase(httpEntity.getContentEncoding() .getValue())) { httpEntity = new GzipDecompressingEntity(httpEntity); } else if ("deflate".equalsIgnoreCase(httpEntity .getContentEncoding().getValue())) { httpEntity = new DeflateDecompressingEntity(httpEntity); } } result = enCodetoString(httpEntity, encode);// 取出应答字符串 } else httpRequst.abort(); } catch (ClientProtocolException e) { System.out.println("doget代理读取超时"); } catch (IOException e) { System.out.println("doget代理读取超时"); } return result; } private String doGetByGoagent(String url, String encode) throws ClientProtocolException, IOException { String result = ""; HttpGet httpRequst = new HttpGet(url); httpRequst.addHeader("Accept-Encoding", "gzip,deflate,sdch"); httpRequst.getParams().setParameter( CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode); DefaultHttpClient httpClient = new DefaultHttpClient(); HttpHost proxyHost = new HttpHost("127.0.0.1", 8087, null); httpClient.getParams().setParameter( CoreConnectionPNames.CONNECTION_TIMEOUT, 8000);// 连接时间20s httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 6000);// 数据传输时间60s httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, proxyHost); HttpResponse httpResponse = httpClient.execute(httpRequst);// 其中HttpGet是HttpUriRequst的子类 if (httpResponse.getStatusLine().getStatusCode() == 200) { HttpEntity httpEntity = httpResponse.getEntity(); if (httpEntity.getContentEncoding() != null) { if ("gzip".equalsIgnoreCase(httpEntity.getContentEncoding() .getValue())) { httpEntity = new GzipDecompressingEntity(httpEntity); } else if ("deflate".equalsIgnoreCase(httpEntity .getContentEncoding().getValue())) { httpEntity = new DeflateDecompressingEntity(httpEntity); } } result = 
enCodetoString(httpEntity, encode);// 取出应答字符串 } else httpRequst.abort(); return result; } public String crawlPageContentByPost(String url, String pram, String encode) throws ClientProtocolException, IOException { String content = ""; try { content = doPostByGoagent(url, pram, encode); if (content == null || content.equals("")) { content = doPostByGoagent(url, pram, encode); // System.out.println("启用公司代理"); // content = postByCompanyProxy(url, pram, encode); // if (content == null || content.equals("")) { // System.out.println("5秒后启用本机"); // Thread.sleep(5000); // content = doPost(url, pram, encode); // } } } catch (Exception e) { try { content = doPostByGoagent(url, pram, encode); // System.out.println("goagent连接失败,启用公司代理"); // content = postByCompanyProxy(url, pram, encode); // if (content == null || content.equals("")) { // System.out.println("公司代理连接失败,启用本机"); // content = doPost(url, pram, encode); // } } catch (Exception e2) { try { content = doPostByGoagent(url, pram, encode); // e2.printStackTrace(); // content = postByCompanyProxy(url, pram, encode); // System.out.println("公司代理连接失败,启用本机"); // content = doPost(url, pram, encode); } catch (Exception e3) { e3.printStackTrace(); } } } return content; } private String doPostByGoagent(String url, String parm, String encode) throws ClientProtocolException, IOException { String result = ""; HttpPost httpRequst = new HttpPost(url);// 创建HttpPost对象 HttpHost proxy = new HttpHost("127.0.0.1", 8087, null); StringEntity entity = new StringEntity(parm); entity.setContentType("application/x-www-form-urlencoded"); entity.setContentEncoding(encode); httpRequst.setEntity(entity); DefaultHttpClient httpClient = new DefaultHttpClient(); httpClient.getParams().setParameter( CoreConnectionPNames.CONNECTION_TIMEOUT, 8000);// 连接时间20s httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 8000);// 数据传输时间60s httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, proxy); HttpResponse httpResponse = 
httpClient.execute(httpRequst); // System.out.println(httpResponse.getStatusLine().getStatusCode()); if (httpResponse.getStatusLine().getStatusCode() == 200) { HttpEntity httpEntity = httpResponse.getEntity(); if (httpEntity.getContentEncoding() != null) { if ("gzip".equalsIgnoreCase(httpEntity.getContentEncoding() .getValue())) { httpEntity = new GzipDecompressingEntity(httpEntity); } else if ("deflate".equalsIgnoreCase(httpEntity .getContentEncoding().getValue())) { httpEntity = new DeflateDecompressingEntity(httpEntity); } } result = enCodetoString(httpEntity, encode);// 取出应答字符串 } return result; } public String doPost(String url, String parm, String encode) throws ClientProtocolException, IOException { String result = ""; HttpPost httpRequst = new HttpPost(url);// 创建HttpPost对象 StringEntity entity = new StringEntity(parm); entity.setContentType("application/x-www-form-urlencoded"); entity.setContentEncoding(encode); httpRequst.setEntity(entity); DefaultHttpClient httpClient = new DefaultHttpClient(); httpClient.getParams().setParameter( CoreConnectionPNames.CONNECTION_TIMEOUT, 8000);// 连接时间20s httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 8000);// 数据传输时间60s HttpResponse httpResponse = httpClient.execute(httpRequst); // System.out.println(httpResponse.getStatusLine().getStatusCode()); if (httpResponse.getStatusLine().getStatusCode() == 200) { HttpEntity httpEntity = httpResponse.getEntity(); if (httpEntity.getContentEncoding() != null) { if ("gzip".equalsIgnoreCase(httpEntity.getContentEncoding() .getValue())) { httpEntity = new GzipDecompressingEntity(httpEntity); } else if ("deflate".equalsIgnoreCase(httpEntity .getContentEncoding().getValue())) { httpEntity = new DeflateDecompressingEntity(httpEntity); } } result = enCodetoString(httpEntity, encode);// 取出应答字符串 return result; } return result; } @SuppressWarnings("unused") private String postByCompanyProxy(String url, String parm, String encode) throws ClientProtocolException, IOException { 
int count = 5; String result = ""; String urlString = url; boolean okProxy = false; boolean newProxy = false; int oldProxyUsecount = 0; for (int i = 0; i <= count; i++) { try { if (newProxy || oldProxyUsecount > 2 || ip.equals("")) { okProxy = postByCompanyProxyBoolean(url, parm, encode); } if (okProxy) { System.out.println("正在使用代理" + ip + ":" + port); HttpPost httpRequst = new HttpPost(url);// 创建HttpPost对象 StringEntity entity = new StringEntity(parm); entity.setContentType("application/x-www-form-urlencoded"); httpRequst.setEntity(entity); httpRequst.getParams().setParameter( CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode); HttpResponse httpResponse = httpPostClient .execute(httpRequst);// 其中HttpGet是HttpUriRequst的子类 if (httpResponse.getStatusLine().getStatusCode() == 200) { HttpEntity httpEntity = httpResponse.getEntity(); if (httpEntity.getContentEncoding() != null) { if ("gzip".equalsIgnoreCase(httpEntity .getContentEncoding().getValue())) { httpEntity = new GzipDecompressingEntity( httpEntity); } else if ("deflate".equalsIgnoreCase(httpEntity .getContentEncoding().getValue())) { httpEntity = new DeflateDecompressingEntity( httpEntity); } } result = enCodetoString(httpEntity, encode);// 取出应答字符串 // System.out.println(result); if (resultTest(result)) { return result; } else if (result.contains("function JumpSelf") && result.contains("WebShieldSessionVerify")) { int indexs = result .indexOf("&WebShieldSessionVerify"); int indexe = result.indexOf("\";}</script>"); String verify = result.substring(indexs, indexe); urlString = urlString + verify; newProxy = false; } else if (result.contains("function JumpSelf") && !result.contains("WebShieldSessionVerify")) { urlString = url; newProxy = false; } } else if (httpResponse.getStatusLine().getStatusCode() == 302) { System.out.println("重定向了"); Header header = httpResponse.getFirstHeader("location"); if (header != null) { urlString = header.getValue(); System.out.println(urlString); if (urlString.contains("tabid=26")) { 
urlString = "http://www.landchina.com" + urlString; result = getByHttpClient(urlString, encode, httpPostClient); if (resultTest(result)) { System.out.println(i + "公司代理成功抓取" + url); return result; } newProxy = false; } newProxy = false; } } else { httpRequst.abort(); } } else { oldProxyUsecount++; } } catch (ClientProtocolException e) { newProxy = true; System.out.println(ip + "代理ip拒绝了"); } catch (IOException e) { oldProxyUsecount++; System.out.println(ip + "代理读取超时"); } } return ""; } private String getByHttpClient(String url, String encode, HttpClient httpClient) { int count = 2; String result = ""; String urlString = url; for (int i = 0; i <= count; i++) { try { HttpGet httpRequst = new HttpGet(urlString); httpRequst.setHeader("Content-Type", "application/x-www-form-urlencoded"); HttpResponse httpResponse = httpClient.execute(httpRequst);// 其中HttpGet是HttpUriRequst的子类 if (httpResponse.getStatusLine().getStatusCode() == 200) { HttpEntity httpEntity = httpResponse.getEntity(); if (httpEntity.getContentEncoding() != null) { if ("gzip".equalsIgnoreCase(httpEntity .getContentEncoding().getValue())) { httpEntity = new GzipDecompressingEntity(httpEntity); } else if ("deflate".equalsIgnoreCase(httpEntity .getContentEncoding().getValue())) { httpEntity = new DeflateDecompressingEntity( httpEntity); } } result = enCodetoString(httpEntity, encode);// 取出应答字符串 if (resultTest(result)) { System.out.println(ip + "公司代理成功抓取" + url); return result; } else if (result.contains("function JumpSelf") && result.contains("WebShieldSessionVerify")) { int indexs = result.indexOf("&WebShieldSessionVerify"); int indexe = result.indexOf("\";}</script>"); String verify = result.substring(indexs, indexe); urlString = urlString + verify; } else if (result.contains("function JumpSelf") && !result.contains("WebShieldSessionVerify")) { urlString = url; } } else httpRequst.abort(); } catch (ClientProtocolException e) { System.out.println(ip + "代理ip拒绝了"); } catch (IOException e) { System.out.println(ip 
+ "代理读取超时"); } } return ""; } /** * 新ip第一次访问时要先通过安全验证,这时只能得到首页的内容,所以在post前线验证一次 <功能详细描述> [参数说明] * * @return void [返回类型说明] * @exception throws [违例类型] [违例说明] * @see [类、类#方法、类#成员] */ private Boolean postByCompanyProxyBoolean(String url, String parm, String encode) throws ClientProtocolException, IOException { int count = 10; String result = ""; String urlString = url; String proxy = ""; HttpHost proxyHost = null; boolean newProxy = false; int oldProxyUsecount = 0; for (int i = 0; i <= count; i++) { try { if (newProxy || oldProxyUsecount > 2 || ip.equals("")) { oldProxyUsecount = 0; String[] proxys = null; try { while (proxy.equals("") || !proxy.contains(":")) { System.out.println("ip为空,正在提取"); proxy = doGet(ipUrl, "gbk"); } proxys = proxy.replaceAll("\"|//|/|\r\n| | ", "") .split(":"); } catch (Exception e) { while (proxy.equals("") || !proxy.contains(":")) { System.out.println("ip为空,正在提取"); proxy = doGet(ipUrl, "gbk"); } proxys = proxy.replaceAll("\"|//|/|\r\n| ", "").split( ":"); } ip = proxys[0]; port = Integer.parseInt(proxys[1]); proxyHost = new HttpHost(ip, port, null); } System.out.println("正在使用代理" + ip + ":" + port); HttpPost httpRequst = new HttpPost(url);// 创建HttpPost对象 StringEntity entity = new StringEntity(parm); entity.setContentType("application/x-www-form-urlencoded"); httpRequst.setEntity(entity); httpRequst.getParams().setParameter( CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode); httpPostClient.getParams().setParameter( CoreConnectionPNames.CONNECTION_TIMEOUT, 10000);// 连接时间20s httpPostClient.getParams().setParameter( CoreConnectionPNames.SO_TIMEOUT, 8000);// 数据传输时间60s httpPostClient.getParams().setParameter( ConnRouteParams.DEFAULT_PROXY, proxyHost); HttpResponse httpResponse = httpPostClient.execute(httpRequst);// 其中HttpGet是HttpUriRequst的子类 if (httpResponse.getStatusLine().getStatusCode() == 200) { HttpEntity httpEntity = httpResponse.getEntity(); if (httpEntity.getContentEncoding() != null) { if ("gzip".equalsIgnoreCase(httpEntity 
.getContentEncoding().getValue())) { httpEntity = new GzipDecompressingEntity(httpEntity); } else if ("deflate".equalsIgnoreCase(httpEntity .getContentEncoding().getValue())) { httpEntity = new DeflateDecompressingEntity( httpEntity); } } result = enCodetoString(httpEntity, encode);// 取出应答字符串 // System.out.println(result); if (resultTest(result)) { return true; } else if (result.contains("function JumpSelf") && result.contains("WebShieldSessionVerify")) { int indexs = result.indexOf("&WebShieldSessionVerify"); int indexe = result.indexOf("\";}</script>"); String verify = result.substring(indexs, indexe); urlString = urlString + verify; if (urlString.contains("tabid=26") && !urlString.contains("landchina")) { urlString = "http://www.landchina.com" + urlString; result = getByHttpClient(urlString, encode, httpPostClient); if (resultTest(result)) { System.out.println(ip + "公司代理成功抓取" + url); return true; } newProxy = false; } else if (urlString.contains("tabid=26") && urlString.contains("landchina")) { result = getByHttpClient(urlString, encode, httpPostClient); if (resultTest(result)) { System.out.println(ip + "公司代理成功抓取" + url); return true; } newProxy = false; } newProxy = false; } else if (result.contains("function JumpSelf") && !result.contains("WebShieldSessionVerify")) { urlString = url; newProxy = false; } } else if (httpResponse.getStatusLine().getStatusCode() == 302) { System.out.println("重定向了"); Header header = httpResponse.getFirstHeader("location"); if (header != null) { urlString = header.getValue(); System.out.println(urlString); if (urlString.contains("tabid=26") && !urlString.contains("landchina")) { urlString = "http://www.landchina.com" + urlString; result = getByHttpClient(urlString, encode, httpPostClient); if (resultTest(result)) { System.out.println(ip + "公司代理成功抓取" + url); return true; } newProxy = false; } else if (urlString.contains("tabid=26") && urlString.contains("landchina")) { result = getByHttpClient(urlString, encode, httpPostClient); if 
(resultTest(result)) { System.out.println(ip + "公司代理成功抓取" + url); return true; } newProxy = false; } newProxy = false; } } else { httpRequst.abort(); } } catch (ClientProtocolException e) { newProxy = true; System.out.println(ip + "代理ip拒绝了"); } catch (IOException e) { oldProxyUsecount++; System.out.println(ip + "代理读取超时"); } } return false; } private Boolean resultTest(String result) { if (!result.equals("") && !result.equals("100") && !result.contains("<title>blank") && !result.contains("Error Page Messages") && !result.contains("<title>404") && !result.contains("您的访问出错了") && !result.contains("302 Found") && !result.contains("出错页面") && !result.contains("没有找到这篇文章!") && !result.contains("特定于实例的错误") && !result.contains("错误 404") && !result.contains("Error report") && !result.contains("function JumpSelf") && !result.contains("refused") && !result.contains("网站防火墙") && !result.contains("无法解析服务器") && !result.contains("STATUS OK") && !result.contains("refresh") && !result.contains("DownloadError") && !result.contains("Not Found") && !result.contains("Runtime Error") && !result.contains("Service Unavailable")) { return true; } return false; } public static String enCodetoString(final HttpEntity entity, final String defaultCharset) throws IOException, ParseException { return enCodetoStringDo(entity, defaultCharset != null ? 
Charset.forName(defaultCharset) : null); } public static String enCodetoStringDo(final HttpEntity entity, Charset defaultCharset) throws IOException, ParseException { if (entity == null) { throw new IllegalArgumentException("HTTP entity may not be null"); } InputStream instream = entity.getContent(); if (instream == null) { return null; } try { if (entity.getContentLength() > Integer.MAX_VALUE) { throw new IllegalArgumentException( "HTTP entity too large to be buffered in memory"); } int i = (int) entity.getContentLength(); if (i < 0) { i = 4096; } Charset charset = null; try { // ContentType contentType = ContentType.get(entity); // if (contentType != null) { // charset = contentType.getCharset(); // } } catch (final UnsupportedCharsetException ex) { throw new UnsupportedEncodingException(ex.getMessage()); } if (charset == null) { charset = defaultCharset; } if (charset == null) { charset = HTTP.DEF_CONTENT_CHARSET; } Reader reader = new InputStreamReader(instream, charset); CharArrayBuffer buffer = new CharArrayBuffer(i); char[] tmp = new char[1024]; int l; while ((l = reader.read(tmp)) != -1) { buffer.append(tmp, 0, l); } return buffer.toString(); } finally { instream.close(); } } /** * * @Description: TODO * @param @param 硬盘名 * @param @param 文件名 * @param @param 文件夹名 * @param @param 保存后缀名 * @param @param 保存的内容 * @return void * @throws * @author joe * @date 2015-3-6 */ public static void writeToFile(String topName, String fileName, String tagName, String type, String content) { File dirFile = null; try { dirFile = new File(topName + ":\\" + tagName); if (!(dirFile.exists()) && !(dirFile.isDirectory())) { boolean creadok = dirFile.mkdirs(); if (creadok) { System.out.println(" ok:创建文件夹成功! "); } else { System.out.println(" err:创建文件夹失败! "); } } } catch (Exception e) { e.printStackTrace(); } String fullPath = dirFile + "/" + fileName + "." 
+ type; write(fullPath, content); } /** * 写文件 * * @param path * @param content */ public static boolean write(String path, String content) { String s = new String(); String s1 = new String(); BufferedWriter output = null; try { File f = new File(path); if (f.exists()) { } else { System.out.println("文件不存在,正在创建..."); if (f.createNewFile()) { System.out.println("文件创建成功!"); } else { System.out.println("文件创建失败!"); } } BufferedReader input = new BufferedReader(new FileReader(f)); while ((s = input.readLine()) != null) { s1 += s + "\n"; } System.out.println("原文件内容:" + s1); input.close(); s1 += content; output = new BufferedWriter(new FileWriter(f)); output.write(s1); output.flush(); return true; } catch (Exception e) { e.printStackTrace(); return false; } finally { if (output != null) { try { output.close(); } catch (IOException e) { e.printStackTrace(); } } } } /** * * @Description: TODO * @param @param fileUrl文件链接 * @param @param topName硬盘名 * @param @param fileName文件名 * @param @param tagName文件夹名 * @param @param type 后缀名 * @return void * @throws * @author joe * @date 2015-3-6 */ public void downLoadFile(String fileUrl, String topName, String fileName, String tagName, String type) { // 下载网络文件 int bytesum = 0; int byteread = 0; try { URL url = new URL(fileUrl); URLConnection conn = url.openConnection(); InputStream inStream = conn.getInputStream(); File fileD = new File(topName + ":/" + tagName); // 如果文件夹不存在则创建 if (!fileD.exists() && !fileD.isDirectory()) { System.out.println("正在新建目录"); fileD.mkdirs(); ; } else { System.out.println("目录存在"); } File file = new File(topName + ":/" + tagName + "/" + fileName + "." + type); if (!file.exists()) { try { file.createNewFile(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } FileOutputStream fs = new FileOutputStream(topName + ":/" + tagName + "/" + fileName + "." 
+ type); byte[] buffer = new byte[1204]; while ((byteread = inStream.read(buffer)) != -1) { bytesum += byteread; System.out.println(bytesum); fs.write(buffer, 0, byteread); } System.out.println("downloaded ok"); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void main(String[] args) throws ClientProtocolException, IOException { CrawlMethodManager manager = new CrawlMethodManager(); } }