应用htmlparser访问https网站时,部分网站会出现PKIX path building failed问题,应是ssl未认证(浏览器直接访问可以,可能是保存过对应证书),可从两个方法解决,一个是从目标服务器下载有效证书,另一个是信任所有证书。
方法一:下载有效证书,InstallCert源码如下
/* * Copyright 2006 Sun Microsystems, Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * - Neither the name of Sun Microsystems nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.security.KeyStore; import java.security.MessageDigest; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import javax.net.ssl.SSLContext; import javax.net.ssl.SSLException; import javax.net.ssl.SSLSocket; import javax.net.ssl.SSLSocketFactory; import javax.net.ssl.TrustManager; import javax.net.ssl.TrustManagerFactory; import javax.net.ssl.X509TrustManager; public class InstallCert { public static void main(String[] args) throws Exception { String host; int port; char[] passphrase; if ((args.length == 1) || (args.length == 2)) { String[] c = args[0].split(":"); host = c[0]; port = (c.length == 1) ? 443 : Integer.parseInt(c[1]); String p = (args.length == 1) ? "changeit" : args[1]; passphrase = p.toCharArray(); } else { System.out .println("Usage: java InstallCert <host>[:port] [passphrase]"); return; } File file = new File("jssecacerts"); if (file.isFile() == false) { char SEP = File.separatorChar; File dir = new File(System.getProperty("java.home") + SEP + "lib" + SEP + "security"); file = new File(dir, "jssecacerts"); if (file.isFile() == false) { file = new File(dir, "cacerts"); } } System.out.println("Loading KeyStore " + file + "..."); InputStream in = new FileInputStream(file); KeyStore ks = KeyStore.getInstance(KeyStore.getDefaultType()); ks.load(in, passphrase); in.close(); SSLContext context = SSLContext.getInstance("TLS"); TrustManagerFactory tmf = TrustManagerFactory .getInstance(TrustManagerFactory.getDefaultAlgorithm()); tmf.init(ks); X509TrustManager defaultTrustManager = (X509TrustManager) tmf .getTrustManagers()[0]; SavingTrustManager tm = new SavingTrustManager(defaultTrustManager); context.init(null, new TrustManager[] { tm }, null); SSLSocketFactory factory = context.getSocketFactory(); System.out .println("Opening connection to " + host + ":" + port + "..."); SSLSocket socket = (SSLSocket) factory.createSocket(host, port); socket.setSoTimeout(10000); try { System.out.println("Starting SSL handshake..."); socket.startHandshake(); socket.close(); System.out.println(); System.out.println("No errors, certificate is already trusted"); } catch (SSLException e) { System.out.println(); e.printStackTrace(System.out); } X509Certificate[] chain = tm.chain; if (chain == null) { System.out.println("Could not obtain server certificate chain"); return; } BufferedReader reader = new BufferedReader(new InputStreamReader( System.in)); System.out.println(); System.out.println("Server sent " + chain.length + " certificate(s):"); System.out.println(); MessageDigest sha1 = MessageDigest.getInstance("SHA1"); MessageDigest md5 = MessageDigest.getInstance("MD5"); for (int i = 0; i < chain.length; i++) { X509Certificate cert = chain[i]; System.out.println(" " + (i + 1) + " Subject " + cert.getSubjectDN()); System.out.println(" Issuer " + cert.getIssuerDN()); sha1.update(cert.getEncoded()); System.out.println(" sha1 " + toHexString(sha1.digest())); md5.update(cert.getEncoded()); System.out.println(" md5 " + toHexString(md5.digest())); System.out.println(); } System.out .println("Enter certificate to add to trusted keystore or 'q' to quit: [1]"); String line = reader.readLine().trim(); int k; try { k = (line.length() == 0) ? 0 : Integer.parseInt(line) - 1; } catch (NumberFormatException e) { System.out.println("KeyStore not changed"); return; } X509Certificate cert = chain[k]; String alias = host + "-" + (k + 1); ks.setCertificateEntry(alias, cert); OutputStream out = new FileOutputStream("jssecacerts"); ks.store(out, passphrase); out.close(); System.out.println(); System.out.println(cert); System.out.println(); System.out .println("Added certificate to keystore 'jssecacerts' using alias '" + alias + "'"); } private static final char[] HEXDIGITS = "0123456789abcdef".toCharArray(); private static String toHexString(byte[] bytes) { StringBuilder sb = new StringBuilder(bytes.length * 3); for (int b : bytes) { b &= 0xff; sb.append(HEXDIGITS[b >> 4]); sb.append(HEXDIGITS[b & 15]); sb.append(' '); } return sb.toString(); } private static class SavingTrustManager implements X509TrustManager { private final X509TrustManager tm; private X509Certificate[] chain; SavingTrustManager(X509TrustManager tm) { this.tm = tm; } public X509Certificate[] getAcceptedIssuers() { throw new UnsupportedOperationException(); } public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { throw new UnsupportedOperationException(); } public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { this.chain = chain; tm.checkServerTrusted(chain, authType); } } }编译InstallCert.java,然后执行:java InstallCert hostname,比如: java InstallCert www.rizhiyi.com
工程目录下生成jssecacerts证书文件,在parser = new Parser(url)代码前执行
System.setProperty("javax.net.ssl.trustStore", "工程目录/jssecacerts");
方法二:信任所有证书
import java.net.URL; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSession; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; import org.htmlparser.Parser; import org.htmlparser.visitors.TextExtractingVisitor; public class SslUtils { private static void trustAllHttpsCertificates() throws Exception { TrustManager[] trustAllCerts = new TrustManager[1]; TrustManager tm = new miTM(); trustAllCerts[0] = tm; SSLContext sc = SSLContext.getInstance("SSL"); sc.init(null, trustAllCerts, null); HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory()); } static class miTM implements TrustManager,X509TrustManager { public X509Certificate[] getAcceptedIssuers() { return null; } public boolean isServerTrusted(X509Certificate[] certs) { return true; } public boolean isClientTrusted(X509Certificate[] certs) { return true; } public void checkServerTrusted(X509Certificate[] certs, String authType) throws CertificateException { return; } public void checkClientTrusted(X509Certificate[] certs, String authType) throws CertificateException { return; } } /** * 忽略HTTPS请求的SSL证书 * @throws Exception */ public static void ignoreSsl() throws Exception{ HostnameVerifier hv = new HostnameVerifier() { public boolean verify(String urlHostName, SSLSession session) { System.out.println("Warning: URL Host: " + urlHostName + " vs. " + session.getPeerHost()); return true; } }; trustAllHttpsCertificates(); HttpsURLConnection.setDefaultHostnameVerifier(hv); } public static void main(String[] args) throws Exception { URL u = new URL("https://www.rizhiyi.com/"); if("https".equalsIgnoreCase(u.getProtocol())){ SslUtils.ignoreSsl(); } Parser parser; parser = new Parser(u.toString()); parser.setEncoding("UTF-8"); TextExtractingVisitor tev=new TextExtractingVisitor(); parser.visitAllNodesWith(tev); String body=tev.getExtractedText(); System.out.println(body); } }