彩票计算JAVA版(一)获取HTML页面的内容

彩票计算JAVA版(一)获取HTML页面的内容

主要用到了开源的两个包,pom.xml书写如下:
<dependency>
<groupId>org.htmlparser</groupId>
<artifactId>htmlparser</artifactId>
<version>1.6</version>
</dependency>
<dependency>
<groupId>cpdetector</groupId>
<artifactId>cpdetector</artifactId>
<version>1.0.5</version>
</dependency>
核心类HtmlParserUtil.java内容如下:
package com.sillycat.easyluck.common.html;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import cpdetector.io.ASCIIDetector;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.JChardetFacade;
import cpdetector.io.ParsingDetector;
import cpdetector.io.UnicodeDetector;
/**
 * Static helpers for downloading an HTML page, guessing its character
 * encoding, and extracting the page's body text and title.
 */
public class HtmlParserUtil {
    /* Initial capacity of the buffer that accumulates the downloaded page. */
    public static int TRANSFER_SIZE = 4096;
    /* Line separator of the current platform. */
    public static String lineSep = System.getProperty("line.separator");

    /**
     * Tries to detect the encoding of the page behind the given URL, to avoid
     * garbled (e.g. Chinese) text when the page is decoded later.
     *
     * @param url the page to inspect
     * @return the name of the detected charset, or the name of the platform
     *         default charset when detection fails or yields nothing
     */
    public static String autoDetectCharset(URL url) {
        // NOTE(review): getInstance() presumably returns a shared proxy; if so,
        // detectors are registered again on every call - confirm against cpdetector.
        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        /*
         * ParsingDetector can check the encoding of HTML, XML and similar files
         * or character streams. The constructor argument controls whether
         * details of the detection process are printed; false disables that.
         */
        detector.add(new ParsingDetector(false));
        detector.add(JChardetFacade.getInstance());
        detector.add(ASCIIDetector.getInstance());
        detector.add(UnicodeDetector.getInstance());

        Charset charset = null;
        try {
            charset = detector.detectCodepage(url);
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        }
        if (charset == null) {
            charset = Charset.defaultCharset();
        }
        return charset.name();
    }

    /**
     * Downloads the page at {@code url}, decodes it with {@code charset} and
     * parses it as standard HTML, in preparation for building an index.
     *
     * @param url     address of the page to download
     * @param charset name of the charset used to decode the page
     * @return a two-element array {@code { body, title }}; {@code body} is
     *         {@code null} when the page has no body and {@code title} falls
     *         back to {@code "Untitled"}. Returns {@code null} when the page
     *         could not be downloaded or parsed (the problem is reported on
     *         standard error).
     */
    public static String[] parseHtml(String url, String charset) {
        String content = fetchContent(url, charset);
        if (content == null) {
            return null;
        }

        String result[] = null;
        Parser myParser = Parser.createParser(content, charset);
        HtmlPage visitor = new HtmlPage(myParser);
        try {
            myParser.visitAllNodesWith(visitor);
            String body = null;
            String title = "Untitled";
            if (visitor.getBody() != null) {
                NodeList nodelist = visitor.getBody();
                body = nodelist.asString().trim();
            }
            if (visitor.getTitle() != null) {
                title = visitor.getTitle();
            }
            result = new String[] { body, title };
        } catch (ParserException pe) {
            pe.printStackTrace();
        }
        return result;
    }

    /* Reads the whole page into a string; returns null (after reporting) on any failure. */
    private static String fetchContent(String url, String charset) {
        InputStream in = null;
        try {
            in = new URL(url).openStream();
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    in, charset));
            StringBuilder temp = new StringBuilder(TRANSFER_SIZE);
            String line;
            while ((line = reader.readLine()) != null) {
                temp.append(line);
                temp.append(lineSep);
            }
            return temp.toString();
        } catch (UnsupportedEncodingException uee) {
            uee.printStackTrace();
        } catch (MalformedURLException mue) {
            System.err.println("Invalid URL : " + url);
        } catch (UnknownHostException uhe) {
            System.err.println("UnknowHost : " + url);
        } catch (SocketException se) {
            System.err.println("Socket Error : " + se.getMessage() + " " + url);
        } catch (SocketTimeoutException ste) {
            System.err.println("Socket Connection Time Out : " + url);
        } catch (FileNotFoundException fnfe) {
            // Use the exception's own message: getCause() may be null, and
            // dereferencing it would turn a broken link into a NullPointerException.
            System.err.println("broken link " + fnfe.getMessage() + " ignored");
        } catch (IOException ie) {
            ie.printStackTrace();
        } finally {
            // Release the connection on every path, including read/decode failures.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return null;
    }
}

测试用例HtmlParserUtilTest.java
package com.sillycat.easyluck.common.html;
import java.net.MalformedURLException;
import java.net.URL;
import junit.framework.TestCase;
import com.sillycat.easybase.utils.StringUtil;
/**
 * JUnit 3 test case for {@code HtmlParserUtil}. The tests call the utility
 * against the URL stored in {@code url}.
 */
public class HtmlParserUtilTest extends TestCase {
    /* Page that the tests fetch and parse. */
    private String url = "http://www.mengjiang.net/listnews.asp?dncp=small&anid=2&nid=14&Page=1";

    protected void setUp() throws Exception {
        super.setUp();
    }

    protected void tearDown() throws Exception {
        super.tearDown();
    }

    /* Trivial test that always passes. */
    public void testDumy() {
        assertTrue(true);
    }

    /* The detected charset name must not be blank. */
    public void testAutoDetectCharset() throws MalformedURLException {
        URL page = new URL(url);
        String detected = HtmlParserUtil.autoDetectCharset(page);
        boolean hasName = StringUtil.isNotBlank(detected);
        assertTrue(hasName);
    }

    /* Parsing must yield exactly two non-blank strings; the first one is printed. */
    public void testParseHtml() throws MalformedURLException {
        String charsetName = HtmlParserUtil.autoDetectCharset(new URL(url));
        String[] parsed = HtmlParserUtil.parseHtml(url, charsetName);
        assertNotNull(parsed);
        assertTrue(parsed.length == 2);
        String first = parsed[0];
        String second = parsed[1];
        assertTrue(StringUtil.isNotBlank(first));
        assertTrue(StringUtil.isNotBlank(second));
        System.out.println(first);
    }
}

你可能感兴趣的:(java,html,.net,IE,asp.net)