如何使用HttpClient下载网络资源(包括下载ssl认证的资源)

看了两篇不错的博客,这里总结一下:

第一篇博客(http://hackerzhou.me/2010/08/support-ssl-proxy-post-get-the-webclient-using-httpclient-4-0-1.html):

import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
 
import org.apache.http.Header;
import org.apache.http.HttpException;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.routing.HttpRoutePlanner;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
 
/**
 * Small convenience wrapper around {@link DefaultHttpClient} (HttpClient 4.0.x)
 * that issues a single GET or POST request, optionally through a proxy, and
 * exposes the response status, headers and body.
 *
 * <p>The request is executed lazily: the first call to one of the
 * {@code getResponse*}/{@code get*Content} accessors triggers it, and the
 * result is cached until {@link #setUrl(String)} is called again.
 *
 * <p><b>Security note:</b> for {@code https://} URLs this class installs a
 * trust manager that accepts every certificate and disables host name
 * verification. That makes connections vulnerable to man-in-the-middle
 * attacks; only use it against hosts you control or for throw-away tooling.
 *
 * <p>Instances keep mutable per-request state and are not synchronized.
 */
public class WebClient {
    private static final Pattern pageEncodingReg = Pattern.compile(
            "content-type.*charset=([^\">\\\\]+)", Pattern.CASE_INSENSITIVE);
    private static final Pattern headerEncodingReg = Pattern.compile(
            "charset=(.+)", Pattern.CASE_INSENSITIVE);

    /**
     * Trust manager that performs no validation at all (see the class-level
     * security note).
     */
    private static final TrustManager trustAllManager = new X509TrustManager() {
        @Override
        public X509Certificate[] getAcceptedIssuers() {
            // The JSSE contract requires a non-null (possibly empty) array.
            return new X509Certificate[0];
        }

        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType)
                throws CertificateException {
            // Intentionally empty: not throwing means "trusted".
        }

        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType)
                throws CertificateException {
            // Intentionally empty: not throwing means "trusted".
        }
    };

    private final DefaultHttpClient httpClient = new DefaultHttpClient();
    private String url;
    private HTTPMethod method;
    /** Cached response body; {@code null} until a request has been executed. */
    private byte[] content;
    /**
     * Holds the request headers before the request is sent and is overwritten
     * with the response headers once the response has arrived.
     */
    private final Map<String, String> headers = new HashMap<String, String>();
    /** Cached status code; {@code 0} until a request has been executed. */
    private int responseCode;
    private final List<NameValuePair> postParameter = new ArrayList<NameValuePair>();

    /** Demo: plain GET through a proxy, an HTTPS GET, and a form POST. */
    public static void main(String[] args) throws Exception {
        WebClient web = new WebClient("http://www.baidu.com/", HTTPMethod.GET);
        web.enableProxy("10.58.32.51", 8080, false, null, null, "127.0.0.1");
        System.out.println(web.getTextContent());
        System.out.println("------------------------------------------");
        web.setUrl("https://mail.google.com/mail/");
        System.out.println(web.getTextContent());
        System.out.println("------------------------------------------");
        web.setUrl("http://www.snee.com/xml/crud/posttest.cgi");
        web.setMethod(HTTPMethod.POST);
        web.addPostParameter("fname", "ababab");
        web.addPostParameter("lname", "cdcdcd");
        System.out.println(web.getTextContent());
        System.out.println("------------------------------------------");
    }

    /** Creates a client that connects directly (no proxy). */
    public WebClient(String url, HTTPMethod method) {
        this(url, method, false, null, 0, false, null, null, null);
    }

    /** Creates a client that sends every request through an unauthenticated proxy. */
    public WebClient(String url, HTTPMethod method, String proxyHost,
            int proxyPort) {
        this(url, method, true, proxyHost, proxyPort, false, null, null, null);
    }

    /**
     * Creates a client with every option spelled out.
     *
     * @param url         target URL; must not be null or empty
     * @param method      HTTP method to use
     * @param useProxy    whether to route requests through a proxy
     * @param proxyHost   proxy host (only used when {@code useProxy} is true)
     * @param proxyPort   proxy port (only used when {@code useProxy} is true)
     * @param needAuth    whether the proxy requires credentials
     * @param username    proxy user name (only used when {@code needAuth} is true)
     * @param password    proxy password (only used when {@code needAuth} is true)
     * @param nonProxyReg regular expression matched against the target host
     *                    name; matching hosts bypass the proxy. May be null.
     */
    public WebClient(String url, HTTPMethod method, boolean useProxy,
            String proxyHost, int proxyPort, boolean needAuth, String username,
            String password, String nonProxyReg) {
        setUrl(url);
        setMethod(method);
        if (useProxy) {
            enableProxy(proxyHost, proxyPort, needAuth, username, password,
                    nonProxyReg);
        }
    }

    /** Sets the HTTP method used by the next request. */
    public void setMethod(HTTPMethod method) {
        this.method = method;
    }

    /**
     * Points the client at a new URL and discards all state of the previous
     * request (headers, POST parameters, cached status and body). The trust-all
     * "https" scheme is registered for {@code https://} URLs and removed for
     * everything else.
     *
     * @throws IllegalArgumentException if {@code url} is null or empty
     */
    public void setUrl(String url) {
        if (isStringEmpty(url)) {
            throw new IllegalArgumentException("[Error] url is empty!");
        }
        this.url = url;
        headers.clear();
        responseCode = 0;
        postParameter.clear();
        content = null;
        if (url.startsWith("https://")) {
            enableSSL();
        } else {
            disableSSL();
        }
    }

    /**
     * Returns the live header map. Before the request is executed, entries put
     * here are sent as request headers; afterwards the same map contains the
     * response headers.
     */
    public Map<String, String> getRequestHeaders() {
        return headers;
    }

    /** Adds a form field that is sent (UTF-8, URL-encoded) with a POST request. */
    public void addPostParameter(String name, String value) {
        this.postParameter.add(new BasicNameValuePair(name, value));
    }

    /**
     * Sets the connect and socket read timeouts on the underlying client.
     *
     * @param connectTimeout connection timeout in milliseconds
     * @param readTimeout    socket (SO_TIMEOUT) timeout in milliseconds
     */
    public void setTimeout(int connectTimeout, int readTimeout) {
        HttpParams params = httpClient.getParams();
        HttpConnectionParams.setConnectionTimeout(params, connectTimeout);
        HttpConnectionParams.setSoTimeout(params, readTimeout);
    }

    /**
     * Registers an "https" scheme backed by the trust-all manager and a
     * permissive host name verifier.
     *
     * @throws IllegalStateException if the TLS context cannot be created;
     *         previously this was only printed, silently leaving the client
     *         in a different SSL configuration than requested
     */
    private void enableSSL() {
        try {
            SSLContext sslcontext = SSLContext.getInstance("TLS");
            sslcontext.init(null, new TrustManager[] { trustAllManager }, null);
            SSLSocketFactory sf = new SSLSocketFactory(sslcontext);
            sf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            Scheme https = new Scheme("https", sf, 443);
            httpClient.getConnectionManager().getSchemeRegistry()
                    .register(https);
        } catch (KeyManagementException e) {
            throw new IllegalStateException("Unable to initialise SSL context", e);
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("TLS is not available", e);
        }
    }

    /** Removes the "https" scheme from the client's registry, if present. */
    private void disableSSL() {
        SchemeRegistry reg = httpClient.getConnectionManager()
                .getSchemeRegistry();
        if (reg.get("https") != null) {
            reg.unregister("https");
        }
    }

    /** Forgets stored credentials and restores the default route planner. */
    public void disableProxy() {
        httpClient.getCredentialsProvider().clear();
        httpClient.setRoutePlanner(null);
    }

    /**
     * Routes requests through the given proxy, except for hosts whose name
     * matches {@code nonProxyHostRegularExpression}.
     *
     * @param proxyHost proxy host name
     * @param proxyPort proxy port
     * @param needAuth  whether to register credentials for the proxy
     * @param username  proxy user name (only used when {@code needAuth} is true)
     * @param password  proxy password (only used when {@code needAuth} is true)
     * @param nonProxyHostRegularExpression case-insensitive regular expression
     *        searched for in the target host name; a match means "connect
     *        directly". {@code null} sends everything through the proxy.
     */
    public void enableProxy(final String proxyHost, final int proxyPort,
            boolean needAuth, String username, String password,
            final String nonProxyHostRegularExpression) {
        if (needAuth) {
            httpClient.getCredentialsProvider().setCredentials(
                    new AuthScope(proxyHost, proxyPort),
                    new UsernamePasswordCredentials(username, password));
        }
        final HttpHost proxy = new HttpHost(proxyHost, proxyPort);
        // Compile once here rather than on every request.
        final Pattern nonProxyPattern = nonProxyHostRegularExpression == null
                ? null
                : Pattern.compile(nonProxyHostRegularExpression,
                        Pattern.CASE_INSENSITIVE);
        // ConnRoutePNames.DEFAULT_PROXY would be simpler but cannot express
        // "no proxy for these hosts", hence the custom route planner.
        httpClient.setRoutePlanner(new HttpRoutePlanner() {
            @Override
            public HttpRoute determineRoute(HttpHost target,
                    HttpRequest request, HttpContext context)
                    throws HttpException {
                boolean secure = "https".equalsIgnoreCase(target
                        .getSchemeName());
                if (nonProxyPattern != null
                        && nonProxyPattern.matcher(target.getHostName()).find()) {
                    // Direct route. The four-argument constructor always
                    // builds a proxied route, so passing the target as its
                    // own "proxy" would make HTTPS requests try to CONNECT
                    // to the target.
                    return new HttpRoute(target, null, secure);
                }
                return new HttpRoute(target, null, proxy, secure);
            }
        });
    }

    /**
     * Executes the request and caches status code, headers and body.
     *
     * @throws IOException      on transport errors
     * @throws RuntimeException if URL or method has not been set
     */
    private void fetch() throws IOException {
        if (url == null || method == null) {
            throw new RuntimeException(
                    "Fetch exception: URL and Method is null");
        }
        httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY,
                CookiePolicy.BROWSER_COMPATIBILITY);
        HttpUriRequest req;
        if (method == HTTPMethod.GET) {
            req = new HttpGet(url);
        } else {
            HttpPost post = new HttpPost(url);
            post.setEntity(new UrlEncodedFormEntity(this.postParameter,
                    HTTP.UTF_8));
            req = post;
        }
        for (Entry<String, String> e : headers.entrySet()) {
            req.addHeader(e.getKey(), e.getValue());
        }

        // Turn off the "Expect: 100-continue" handshake: some proxies and web
        // servers do not support it and answer "417 Expectation Failed".
        req.getParams().setBooleanParameter(
                CoreProtocolPNames.USE_EXPECT_CONTINUE, false);

        HttpResponse response = httpClient.execute(req);
        headers.clear();
        for (Header h : response.getAllHeaders()) {
            headers.put(h.getName(), h.getValue());
        }
        // Responses such as 204/304 carry no entity; treat that as an empty
        // body instead of letting EntityUtils reject the null entity.
        if (response.getEntity() == null) {
            content = new byte[0];
        } else {
            content = EntityUtils.toByteArray(response.getEntity());
        }
        responseCode = response.getStatusLine().getStatusCode();
    }

    private static boolean isStringEmpty(String s) {
        return s == null || s.length() == 0;
    }

    /** Returns the header value for {@code name}, ignoring case, or null. */
    private String findHeader(String name) {
        String exact = headers.get(name);
        if (exact != null) {
            return exact;
        }
        for (Entry<String, String> e : headers.entrySet()) {
            if (name.equalsIgnoreCase(e.getKey())) {
                return e.getValue();
            }
        }
        return null;
    }

    /** Tells whether {@code c} may appear in a charset name. */
    private static boolean isCharsetNameChar(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9') || c == '-' || c == '_'
                || c == '.' || c == ':' || c == '+';
    }

    /**
     * Extracts the bare charset name from a raw regex capture such as
     * {@code "utf-8"; boundary=x}: leading blanks/quotes are skipped and the
     * name ends at the first character that cannot be part of a charset name.
     */
    private static String cleanCharsetName(String raw) {
        int start = 0;
        while (start < raw.length()) {
            char c = raw.charAt(start);
            if (c != '"' && c != '\'' && !Character.isWhitespace(c)) {
                break;
            }
            start++;
        }
        int end = start;
        while (end < raw.length() && isCharsetNameChar(raw.charAt(end))) {
            end++;
        }
        return raw.substring(start, end);
    }

    /**
     * Decodes {@code data} with the charset named by {@code rawCharset}.
     *
     * @return the decoded text, or null when no usable charset name could be
     *         extracted or the JVM does not support it
     */
    private static String decode(byte[] data, String rawCharset) {
        String name = cleanCharsetName(rawCharset);
        if (name.length() == 0) {
            return null;
        }
        try {
            return new String(data, name);
        } catch (java.io.UnsupportedEncodingException e) {
            return null;
        }
    }

    /** Returns the HTTP status code, executing the request if necessary. */
    public int getResponseCode() throws IOException {
        if (responseCode == 0) {
            fetch();
        }
        return responseCode;
    }

    /**
     * Returns the response headers, executing the request if necessary. Only
     * the last value is kept for headers that occur more than once.
     */
    public Map<String, String> getResponseHeaders() throws IOException {
        if (responseCode == 0) {
            fetch();
        }
        return headers;
    }

    /** Returns the raw response body, executing the request if necessary. */
    public byte[] getByteArrayContent() throws IOException {
        if (content == null) {
            fetch();
        }
        return content;
    }

    /**
     * Returns the response body as text. The charset is taken, in order, from
     * the {@code Content-Type} response header, from a {@code content-type}
     * declaration inside the body, and finally the platform default charset.
     * A declared charset that is malformed or unsupported is skipped rather
     * than reported as an error.
     *
     * @throws IOException      on transport errors while fetching
     * @throws RuntimeException if no content could be fetched
     */
    public String getTextContent() throws IOException {
        if (content == null) {
            fetch();
        }
        if (content == null) {
            throw new RuntimeException("[Error] Can't fetch content!");
        }
        String headerContentType = findHeader("Content-Type");
        if (headerContentType != null) {
            Matcher headerMatch = headerEncodingReg.matcher(headerContentType);
            if (headerMatch.find()) {
                String decoded = decode(content, headerMatch.group(1));
                if (decoded != null) {
                    return decoded;
                }
            }
        }
        // Decode with the platform charset first, just to be able to look for
        // a charset declaration inside the document itself.
        String html = new String(content);
        Matcher pageMatch = pageEncodingReg.matcher(html);
        if (pageMatch.find()) {
            String decoded = decode(content, pageMatch.group(1));
            if (decoded != null) {
                html = decoded;
            }
        }
        return html;
    }

    /** Exposes the underlying client for settings this wrapper does not cover. */
    public DefaultHttpClient getHttpClient() {
        return httpClient;
    }

    /** HTTP methods supported by {@link WebClient}. */
    public enum HTTPMethod {
        GET, POST
    }
}

最近研究了下HttpClient 4.0.1,主要是因为Java自己的HttpURLConnection对SSL支持的不好,而且控制起来不太方便,而且HttpClient还支持抓取非信任的站点,别的实现方式貌似需要在代码中显式导入证书。

需要的jar包:commons-logging-1.1.1.jar,httpclient-4.0.1.jar,httpcore-4.0.1.jar

Coding的时候遇到了些非常规问题:

1.HttpClient支持使用Java默认的Properties方式设置代理,不过我还是使用了HttpClient的代理设置方式。因而遇到了一个很诡异的问题,Properties方式设置的代理可以设置代理例外,即本地地址不通过代理访问,HttpClient没有简单的一句话设置的方法,必须写HttpRoutePlanner来自定义,比较繁琐。
如果使用HttpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,new HttpHost(proxyHost, proxyPort));来设置代理,则所有的请求都会往这个代理发送,没有例外,故弃之。

2.HTTP Header中的“Except”字段引起的问题,我向一些网页直接提交POST没有问题,但如果使用squid proxy进行post的话就会出现417 Expectation Failed错误,网上查了http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html 发现这个问题是因为proxy server/web server不能理解或满足Except字段中指定的值,则会引发这个错误,而HttpClient默认会发送这个字段,只要指示request不发送这个字段即可。

3.不受信任的HTTPS站点的访问问题,通过X509TrustManager来完成,将方法重写成返回null的或者是啥都不做的,理论上要是checkServerTrusted或checkClientTrusted方法检测到不受信任的站点,会抛出异常,但如果什么都不做,则被视为通过检查。

4.写了个getTextContent方法,用来获取返回的文本,解决乱码问题的方法其实很简单,首先用正则提取HTTP Header中Content-Type里的charset,如果没有,使用默认编码分析html head中Content-Type里的charset,如果没有,使用系统默认编码。


第二篇博客(http://eyecm.com/httpclient-download/):

使用HttpClient下载文件
2012年08月03日  ⁄ 编程整理 ⁄ 共 1268字 ⁄ +0 ⁄ 被围观 2,737+

在之前的文章《HttpClient4.1入门教程-利用官方例子讲解httpClient4.1的用法》中介绍了HttpClient4的基本用户,本篇文章以实例的方式介绍如何使用HttpClient下载文件。

事实上我们仍然发送GET或者POST请求,但对响应写入到文件流中即可。通过这段代码,也许你就明白下载软件或采集器的写法了。

package sitemap;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;

import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * Downloads the numbered sitemap pages {@code ?idx=1 .. ?idx=805} and stores
 * each one as {@code d:/sitemap/<idx>.xml}.
 */
public class SitemapDownloader
{
    /**
     * Fetches every sitemap page in turn, pausing 500 ms before each request.
     * A page that does not answer with HTTP 200 is reported on stderr and
     * skipped; previously such a page was retried forever because the index
     * only advanced on success.
     */
    public static void main(String[] args) throws Exception
    {
        int min = 1;
        int max = 806;

        String url = "http://www.foxnews.com/sitemap.xml?idx=";

        // One client for the whole run; it is shut down even if a download fails.
        HttpClient httpClient1 = new DefaultHttpClient();
        try
        {
            for (int idx = min; idx < max; idx++)
            {
                Thread.sleep(500);

                HttpGet httpGet1 = new HttpGet(url + idx);
                HttpResponse httpResponse1 = httpClient1.execute(httpGet1);

                StatusLine statusLine = httpResponse1.getStatusLine();
                if (statusLine.getStatusCode() != 200
                        || httpResponse1.getEntity() == null)
                {
                    // Release the connection so the client can be reused.
                    httpGet1.abort();
                    System.err.println("Skipped " + url + idx + ": " + statusLine);
                    continue;
                }

                File xml = new File("d:/sitemap/" + idx + ".xml");
                save(httpResponse1.getEntity().getContent(), xml);

                // Report the index that was actually written.
                System.out.println("存储了XML: " + idx);
            }
        }
        finally
        {
            httpClient1.getConnectionManager().shutdown();
        }
    }

    /**
     * Copies {@code inputStream} to {@code target}, creating the parent
     * directory if needed, and closes both streams even when copying fails.
     */
    private static void save(InputStream inputStream, File target)
            throws java.io.IOException
    {
        try
        {
            File dir = target.getParentFile();
            if (dir != null && !dir.exists())
            {
                dir.mkdirs();
            }
            FileOutputStream outputStream = new FileOutputStream(target);
            try
            {
                byte[] buffer = new byte[1024];
                int read;
                while ((read = inputStream.read(buffer)) != -1)
                {
                    outputStream.write(buffer, 0, read);
                }
                outputStream.flush();
            }
            finally
            {
                outputStream.close();
            }
        }
        finally
        {
            // Closing the content stream hands the connection back to the manager.
            inputStream.close();
        }
    }
}

最后,将第一篇博客中的ssl认证功能加到第二篇博客中,就可以下载https资源:

package com.adobe.touchstone.plus.updownload;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;

import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;


/**
 * Downloads a single HTTPS page to {@code d:/mail.html}.
 *
 * <p><b>Security note:</b> the client is configured to accept every server
 * certificate and every host name, which leaves the connection open to
 * man-in-the-middle attacks. Use only for testing or against hosts you control.
 */
public class SitemapDownloader{
    /** Trust manager that accepts every certificate chain without checking it. */
    private static final TrustManager trustAllManager = new X509TrustManager() {
        @Override
        public X509Certificate[] getAcceptedIssuers() {
            // The JSSE contract requires a non-null (possibly empty) array.
            return new X509Certificate[0];
        }

        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType)
                throws CertificateException {
            // Intentionally empty: not throwing means "trusted".
        }

        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType)
                throws CertificateException {
            // Intentionally empty: not throwing means "trusted".
        }
    };

    /**
     * Registers an "https" scheme on {@code httpClient} that uses the
     * trust-all manager and a permissive host name verifier.
     *
     * @throws IllegalStateException if the TLS context cannot be set up
     *         (previously the error was only printed and the download then
     *         ran with the client's default SSL settings)
     */
    private static void enableSSL(HttpClient httpClient) {
        try {
            SSLContext sslcontext = SSLContext.getInstance("TLS");
            sslcontext.init(null, new TrustManager[] { trustAllManager }, null);
            SSLSocketFactory sf = new SSLSocketFactory(sslcontext);
            sf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            Scheme https = new Scheme("https", sf, 443);
            httpClient.getConnectionManager().getSchemeRegistry()
                    .register(https);
        } catch (KeyManagementException e) {
            throw new IllegalStateException("Unable to initialise SSL context", e);
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("TLS is not available", e);
        }
    }

    /**
     * Fetches the page and writes the body to disk. The client is shut down
     * and both streams are closed on every path, including failures and
     * non-200 responses.
     */
    public static void main(String[] args) throws Exception{
        String url = "https://mail.google.com/mail/";

        HttpClient httpClient1 = new DefaultHttpClient();
        try {
            enableSSL(httpClient1);
            HttpGet httpGet1 = new HttpGet(url);
            HttpResponse httpResponse1 = httpClient1.execute(httpGet1);

            StatusLine statusLine = httpResponse1.getStatusLine();
            if (statusLine.getStatusCode() != 200
                    || httpResponse1.getEntity() == null) {
                System.err.println("Download failed: " + statusLine);
                return;
            }

            File xml = new File( "d:/mail.html" );
            InputStream inputStream = httpResponse1.getEntity().getContent();
            try {
                FileOutputStream outputStream = new FileOutputStream(xml);
                try {
                    byte[] b = new byte[1024];
                    int j;
                    while ((j = inputStream.read(b)) != -1) {
                        // Log the chunk size; b.toString() only printed the
                        // array's identity hash, never its contents.
                        System.out.println("Writing : " + j + " bytes");
                        outputStream.write(b, 0, j);
                    }
                    outputStream.flush();
                } finally {
                    outputStream.close();
                }
            } finally {
                inputStream.close();
            }

            System.out.println( "存储了文件: " +xml.toString());
        } finally {
            httpClient1.getConnectionManager().shutdown();
        }
    }
}


你可能感兴趣的:(如何使用HttpClient下载网络资源(包括下载ssl认证的资源))