一、org.apache.http.impl.client.HttpClients
优点:性能快
缺点:不支持需要JS渲染的页面,无法获取JS渲染之后的页面源码。
附上代码示例:
public static String getByUrl(final String url,final String charset){ /*RequestConfig defaultRequestConfig = RequestConfig.custom() .setConnectTimeout(5000) .setConnectionRequestTimeout(5000) .build();*/ //CloseableHttpClient httpclient = HttpClients.custom().setMaxConnTotal(800).setMaxConnPerRoute(800).setDefaultRequestConfig(defaultRequestConfig).build(); CloseableHttpClient httpclient = HttpClients.createDefault(); //DefaultHttpClient httpClientOld = new DefaultHttpClient();// 创建httpClient对象 try { HttpGet httpget = new HttpGet(url); //System.out.println("executing request " + httpget.getURI()); ResponseHandler<String> responseHandler = new ResponseHandler<String>() { public String handleResponse(final HttpResponse response) throws ClientProtocolException, IOException { int status = response.getStatusLine().getStatusCode(); //System.out.println("========responseStatusCode:"+status + " "+url); if (status == 200) { HttpEntity entity = response.getEntity(); if(entity == null){ System.out.println("========entity is null:"+status + " "+url); return null; }else{ String content = EntityUtils.toString(entity); if(charset != null){ content = new String(content.getBytes("ISO-8859-1"),charset); } return content; } } else { throw new ClientProtocolException("HttpClientUtil Unexpected response status: " + status); } } }; String responseBody = httpclient.execute(httpget, responseHandler); return responseBody; } catch (ClientProtocolException e) { System.out.println("========HttpClientUtil===="+e.getMessage() + " "+url); //e.printStackTrace(); closeHttpclient(httpclient); return getByUrl(url,charset); } catch (IOException e) { System.out.println("========HttpClientUtil IOException===="+e.getMessage() + " "+url); //e.printStackTrace(); closeHttpclient(httpclient); return getByUrl(url,charset); } finally { closeHttpclient(httpclient); } } /** * 关闭CloseableHttpClient * @param httpclient */ private static void closeHttpclient(CloseableHttpClient httpclient){ if(httpclient != null){ 
try { httpclient.close(); } catch (IOException e) { e.printStackTrace(); } } }
二、com.gargoylesoftware.htmlunit.WebClient(HtmlUnit)
优点:支持需要JS进行渲染的页面,功能强大。它是一款没有界面的浏览器,支持页面上的CSS渲染、ajax渲染和JS渲染,并且能够模拟页面上的事件操作。
缺点:如果开启JS渲染功能,性能大大降低。
附上代码示例:
/** * 针对普通页面 * @param url * @param charset * @param loopNum 循环请求次数 ,当页面请求出错,进行重新请求的次数 * @return */ private static String getByUrlForHtmlPage(String url,String charset,int loopNum){ WebClient wc = new WebClient(BrowserVersion.FIREFOX_38); wc.setJavaScriptTimeout(5000); wc.getOptions().setUseInsecureSSL(false); wc.getOptions().setJavaScriptEnabled(false); // 启用JS解释器,默认为true wc.getOptions().setCssEnabled(false); // 禁用css支持 wc.getOptions().setThrowExceptionOnScriptError(false); // js运行错误时,是否抛出异常 wc.getOptions().setTimeout(15000); // 设置连接超时时间 ,这里是10S。如果为0,则无限期等待 wc.getOptions().setDoNotTrackEnabled(false); //wc.setAjaxController(new NicelyResynchronizingAjaxController());//启用ajax支持 HtmlPage page = null; try { page = (HtmlPage) wc.getPage(url); /** * 获得page后执行js代码还需要一定的时间,sellp 5s以便获得最终执行js后的纯html页面。 */ //Thread.sleep(5000); } catch (Exception e) { System.out.println("========getByUrlForHtmlPage===="+e.getMessage() + " "+url); closeHttpclient(wc); if(loopNum < LOOP_NUM && page != null){ loopNum = loopNum + 1; return getByUrlForHtmlPage(url,charset,loopNum); }else{ return null; } }finally{ closeHttpclient(wc); } if(page == null){ return null; } return page.asXml(); } /** * 关闭CloseableHttpClient * @param httpclient */ private static void closeHttpclient(WebClient wc){ if(wc != null){ wc.close(); } }