看了两篇不错的博客,这里总结一下:
第一篇博客(http://hackerzhou.me/2010/08/support-ssl-proxy-post-get-the-webclient-using-httpclient-4-0-1.html):
import java.io.IOException; import java.security.KeyManagementException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; import org.apache.http.Header; import org.apache.http.HttpException; import org.apache.http.HttpHost; import org.apache.http.HttpRequest; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.params.ClientPNames; import org.apache.http.client.params.CookiePolicy; import org.apache.http.conn.routing.HttpRoute; import org.apache.http.conn.routing.HttpRoutePlanner; import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.scheme.SchemeRegistry; import org.apache.http.conn.ssl.SSLSocketFactory; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.message.BasicNameValuePair; import org.apache.http.params.CoreProtocolPNames; import org.apache.http.params.HttpConnectionParams; import org.apache.http.params.HttpParams; import org.apache.http.protocol.HTTP; import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; public class WebClient { private DefaultHttpClient httpClient = new DefaultHttpClient(); private String url; private HTTPMethod method; private byte[] content; private Map<String, String> headers = new HashMap<String, String>(); private int 
responseCode; private List<NameValuePair> postParameter = new ArrayList<NameValuePair>(); private static final Pattern pageEncodingReg = Pattern.compile( "content-type.*charset=([^\">\\\\]+)", Pattern.CASE_INSENSITIVE); private static final Pattern headerEncodingReg = Pattern.compile( "charset=(.+)", Pattern.CASE_INSENSITIVE); public static void main(String[] args) throws Exception { WebClient web = new WebClient("http://www.baidu.com/", HTTPMethod.GET); web.enableProxy("10.58.32.51", 8080, false, null, null, "127.0.0.1"); System.out.println(web.getTextContent()); System.out.println("------------------------------------------"); web.setUrl("https://mail.google.com/mail/"); System.out.println(web.getTextContent()); System.out.println("------------------------------------------"); web.setUrl("http://www.snee.com/xml/crud/posttest.cgi"); web.setMethod(HTTPMethod.POST); web.addPostParameter("fname", "ababab"); web.addPostParameter("lname", "cdcdcd"); System.out.println(web.getTextContent()); System.out.println("------------------------------------------"); } // Without proxy public WebClient(String url, HTTPMethod method) { this(url, method, false, null, 0, false, null, null, null); } // Proxy without auth public WebClient(String url, HTTPMethod method, String proxyHost, int proxyPort) { this(url, method, true, proxyHost, proxyPort, false, null, null, null); } // All in one settings public WebClient(String url, HTTPMethod method, boolean useProxy, String proxyHost, int proxyPort, boolean needAuth, String username, String password, String nonProxyReg) { setUrl(url); setMethod(method); if (useProxy) { enableProxy(proxyHost, proxyPort, needAuth, username, password, nonProxyReg); } } public void setMethod(HTTPMethod method) { this.method = method; } public void setUrl(String url) { if (isStringEmpty(url)) { throw new RuntimeException("[Error] url is empty!"); } this.url = url; headers.clear(); responseCode = 0; postParameter.clear(); content = null; if 
(url.startsWith("https://")) { enableSSL(); } else { disableSSL(); } } public Map<String, String> getRequestHeaders() { return headers; } public void addPostParameter(String name, String value) { this.postParameter.add(new BasicNameValuePair(name, value)); } public void setTimeout(int connectTimeout, int readTimeout) { HttpParams params = httpClient.getParams(); HttpConnectionParams.setConnectionTimeout(params, connectTimeout); HttpConnectionParams.setSoTimeout(params, readTimeout); } private void enableSSL() { try { SSLContext sslcontext = SSLContext.getInstance("TLS"); sslcontext.init(null, new TrustManager[] { truseAllManager }, null); SSLSocketFactory sf = new SSLSocketFactory(sslcontext); sf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER); Scheme https = new Scheme("https", sf, 443); httpClient.getConnectionManager().getSchemeRegistry() .register(https); } catch (KeyManagementException e) { e.printStackTrace(); } catch (NoSuchAlgorithmException e) { e.printStackTrace(); } } private void disableSSL() { SchemeRegistry reg = httpClient.getConnectionManager() .getSchemeRegistry(); if (reg.get("https") != null) { reg.unregister("https"); } } public void disableProxy() { httpClient.getCredentialsProvider().clear(); httpClient.setRoutePlanner(null); } public void enableProxy(final String proxyHost, final int proxyPort, boolean needAuth, String username, String password, final String nonProxyHostRegularExpression) { if (needAuth) { httpClient.getCredentialsProvider().setCredentials( new AuthScope(proxyHost, proxyPort), new UsernamePasswordCredentials(username, password)); } // Simple proxy setting, can't handle non-proxy-host // httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,new // HttpHost(proxyHost, proxyPort)); httpClient.setRoutePlanner(new HttpRoutePlanner() { @Override public HttpRoute determineRoute(HttpHost target, HttpRequest request, HttpContext contenxt) throws HttpException { HttpRoute proxyRoute = new HttpRoute(target, 
null, new HttpHost(proxyHost, proxyPort), "https" .equalsIgnoreCase(target.getSchemeName())); if (nonProxyHostRegularExpression == null) { return proxyRoute; } Pattern pattern = Pattern .compile(nonProxyHostRegularExpression, Pattern.CASE_INSENSITIVE); Matcher m = pattern.matcher(target.getHostName()); if (m.find()) { return new HttpRoute(target, null, target, "https" .equalsIgnoreCase(target.getSchemeName())); } else { return proxyRoute; } } }); } private void fetch() throws IOException { if (url == null || method == null) { throw new RuntimeException( "Fetch exception: URL and Method is null"); } httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY); HttpResponse response = null; HttpUriRequest req = null; if (method.equals(HTTPMethod.GET)) { req = new HttpGet(url); } else { req = new HttpPost(url); ((HttpPost) req).setEntity(new UrlEncodedFormEntity( this.postParameter, HTTP.UTF_8)); } for (Entry<String, String> e : headers.entrySet()) { req.addHeader(e.getKey(), e.getValue()); } // // Turn off "except" http header, some proxy server and web server do // not support it, may cause "417 Expectation Failed" // // HttpClient's doc says: 100-continue handshake should be used with // caution, as it may cause problems with HTTP servers and proxies that // do not support HTTP/1.1 protocol. 
// req.getParams().setBooleanParameter( CoreProtocolPNames.USE_EXPECT_CONTINUE, false); response = httpClient.execute(req); Header[] header = response.getAllHeaders(); headers.clear(); for (Header h : header) { headers.put(h.getName(), h.getValue()); } content = EntityUtils.toByteArray(response.getEntity()); responseCode = response.getStatusLine().getStatusCode(); } private boolean isStringEmpty(String s) { return s == null || s.length() == 0; } public int getResponseCode() throws IOException { if (responseCode == 0) { fetch(); } return responseCode; } public Map<String, String> getResponseHeaders() throws IOException { if (responseCode == 0) { fetch(); } return headers; } public byte[] getByteArrayContent() throws IOException { if (content == null) { fetch(); } return content; } public String getTextContent() throws IOException { if (content == null) { fetch(); } if (content == null) { throw new RuntimeException("[Error] Can't fetch content!"); } String headerContentType = null; if ((headerContentType = headers.get("Content-Type")) != null) { // use http header encoding Matcher m1 = headerEncodingReg.matcher(headerContentType); if (m1.find()) { return new String(content, m1.group(1)); } } // Use html's encoding String html = new String(content); Matcher m2 = pageEncodingReg.matcher(html); if (m2.find()) { html = new String(content, m2.group(1)); } return html; } public DefaultHttpClient getHttpClient() { return httpClient; } public enum HTTPMethod { GET, POST } // SSL handler (ignore untrusted hosts) private static TrustManager truseAllManager = new X509TrustManager() { @Override public X509Certificate[] getAcceptedIssuers() { return null; } @Override public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { } @Override public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { } }; }
最近研究了下HttpClient 4.0.1,主要是因为Java自己的HttpURLConnection对SSL支持的不好,而且控制起来不太方便,而且HttpClient还支持抓取非信任的站点,别的实现方式貌似需要在代码中显式导入证书。
需要的jar包:commons-logging-1.1.1.jar,httpclient-4.0.1.jar,httpcore-4.0.1.jar
Coding的时候遇到了些非常规问题:
1.HttpClient支持使用Java默认的Properties方式设置代理,不过我还是使用了HttpClient的代理设置方式。因而遇到了一个很诡异的问题,Properties方式设置的代理可以设置代理例外,即本地地址不通过代理访问,HttpClient没有简单的一句话设置的方法,必须写HttpRoutePlanner来自定义,比较繁琐。
如果使用HttpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,new HttpHost(proxyHost, proxyPort));来设置代理,则所有的请求都会往这个代理发送,没有例外,故弃之。
2.HTTP Header中的“Expect”字段引起的问题,我向一些网页直接提交POST没有问题,但如果使用squid proxy进行post的话就会出现417 Expectation Failed错误,网上查了http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html 发现如果proxy server/web server不能理解或满足Expect字段中指定的值,就会引发这个错误,而HttpClient默认会发送这个字段,只要指示request不发送这个字段即可。
3.不受信任的HTTPS站点的访问问题,通过X509TrustManager来完成,将方法重写成返回null的或者是啥都不做的,理论上要是checkServerTrusted或checkClientTrusted方法检测到不受信任的站点,会抛出异常,但如果什么都不做,则被视为通过检查。
4.写了个getTextContent方法,用来获取返回的文本,解决乱码问题的方法其实很简单,首先用正则提取HTTP Header中Content-Type里的charset,如果没有,使用默认编码分析html head中Content-Type里的charset,如果没有,使用系统默认编码。
第二篇博客(http://eyecm.com/httpclient-download/):
在之前的文章《HttpClient4.1入门教程-利用官方例子讲解httpClient4.1的用法》中介绍了HttpClient4的基本用法,本篇文章以实例的方式介绍如何使用HttpClient下载文件。
事实上我们仍然发送GET或者POST请求,但对响应写入到文件流中即可。通过这段代码,也许你就明白下载软件或采集器的写法了。
package sitemap;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;

import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * Downloads a numbered series of sitemap pages with HttpClient and stores each
 * one as {@code d:/sitemap/<index>.xml}.
 */
public class SitemapDownloader {

    /** How many times one page is requested before it is skipped. */
    private static final int MAX_ATTEMPTS_PER_PAGE = 5;

    /**
     * Fetches pages {@code min} (inclusive) to {@code max} (exclusive), pausing
     * 500 ms before every request. A page that does not answer with status 200
     * is retried; after {@link #MAX_ATTEMPTS_PER_PAGE} failed attempts it is
     * reported on stderr and skipped, so that a permanently failing page can no
     * longer stall the loop forever.
     */
    public static void main(String[] args) throws Exception {
        int min = 1;
        int max = 806;
        String url = "http://www.foxnews.com/sitemap.xml?idx=";
        int attempts = 0;
        while (min < max) {
            Thread.sleep(500);
            HttpClient httpClient1 = new DefaultHttpClient();
            try {
                HttpGet httpGet1 = new HttpGet(url + min);
                HttpResponse httpResponse1 = httpClient1.execute(httpGet1);
                StatusLine statusLine = httpResponse1.getStatusLine();
                if (statusLine.getStatusCode() == 200) {
                    File xml = new File("d:/sitemap/" + min + ".xml");
                    save(httpResponse1, xml);
                    // Report the index that was just written, then advance.
                    System.out.println("存储了XML: " + min);
                    min++;
                    attempts = 0;
                } else {
                    attempts++;
                    if (attempts >= MAX_ATTEMPTS_PER_PAGE) {
                        System.err.println("Skipping " + url + min + " after "
                                + attempts + " attempts, last status: " + statusLine);
                        min++;
                        attempts = 0;
                    }
                }
            } finally {
                // Release the connection even when the request or the write fails.
                httpClient1.getConnectionManager().shutdown();
            }
        }
    }

    /** Copies the response body into {@code target}, creating the parent directory if needed. */
    private static void save(HttpResponse response, File target) throws Exception {
        File dir = target.getParentFile();
        if (dir != null && !dir.exists()) {
            dir.mkdirs();
        }
        InputStream inputStream = response.getEntity().getContent();
        try {
            FileOutputStream outputStream = new FileOutputStream(target);
            try {
                byte[] buffer = new byte[1024];
                int read;
                while ((read = inputStream.read(buffer)) != -1) {
                    outputStream.write(buffer, 0, read);
                }
                outputStream.flush();
            } finally {
                outputStream.close();
            }
        } finally {
            inputStream.close();
        }
    }
}
package com.adobe.touchstone.plus.updownload; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; import java.security.KeyManagementException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; import org.apache.http.HttpResponse; import org.apache.http.StatusLine; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.ssl.SSLSocketFactory; import org.apache.http.impl.client.DefaultHttpClient; public class SitemapDownloader{ // SSL handler (ignore untrusted hosts) private static TrustManager truseAllManager = new X509TrustManager() { @Override public X509Certificate[] getAcceptedIssuers() { return null; } @Override public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { } @Override public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { } }; private static void enableSSL(HttpClient httpClient) { try { SSLContext sslcontext = SSLContext.getInstance("TLS"); sslcontext.init(null, new TrustManager[] { truseAllManager }, null); SSLSocketFactory sf = new SSLSocketFactory(sslcontext); sf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER); Scheme https = new Scheme("https", sf, 443); httpClient.getConnectionManager().getSchemeRegistry() .register(https); } catch (KeyManagementException e) { e.printStackTrace(); } catch (NoSuchAlgorithmException e) { e.printStackTrace(); } } public static void main(String[] args) throws Exception{ String url = "https://mail.google.com/mail/"; HttpClient httpClient1 = new DefaultHttpClient() ; enableSSL(httpClient1); HttpGet httpGet1 = new HttpGet(url); HttpResponse httpResponse1 = httpClient1.execute(httpGet1); 
StatusLine statusLine = httpResponse1.getStatusLine(); if(statusLine.getStatusCode() == 200){ File xml = new File( "d:/mail.html" ); FileOutputStream outputStream = new FileOutputStream(xml); InputStream inputStream = httpResponse1.getEntity().getContent(); byte b[] = new byte[1024]; int j = 0; while( (j = inputStream.read(b))!=-1){ System.out.println("Writing : "+b.toString()); outputStream.write(b,0,j); } outputStream.flush(); outputStream.close(); System.out.println( "存储了文件: " +xml.toString()); httpClient1.getConnectionManager(). shutdown(); } } }