用 HtmlParser 与 StringEscapeUtils 配合抓取网页

用正则表达式来匹配的确很强大,但如果处理的是网页,用 HtmlParser 会更方便。由于抓取下来的内容里,中文是以 Unicode 字符实体(形如 &#20013;)的形式出现的,所以还要用到 Apache Commons Lang 包中的 StringEscapeUtils 把它还原成可读的文字。以下是代码:

import java.net.URL;
import org.apache.commons.lang3.StringEscapeUtils;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;




/**
 * Small demo that downloads an HTML page, extracts every {@code <table>} with
 * HTML Parser and prints the plain text of each row and cell to standard output.
 *
 * <p>Cell text is passed through {@link StringEscapeUtils#unescapeHtml4(String)} so
 * that character entities such as {@code &#20013;} are printed as the characters
 * they stand for.
 */
public class GetURLText {
	/** Page scraped by {@link #getText()} when no URL is supplied. */
	private static final String DEFAULT_URL = "http://localhost:8080/TestXFace/TestHtmlPaser/ABC.jsp";

	/**
	 * Scrapes the default demo page; equivalent to {@code getText(DEFAULT_URL)}.
	 *
	 * @throws Exception if the URL is malformed, the connection cannot be opened,
	 *                   or the page cannot be parsed
	 */
	public void getText() throws Exception {
		getText(DEFAULT_URL);
	}

	/**
	 * Fetches the page at {@code urlString} and prints the text of every table
	 * row and table cell found in it.
	 *
	 * @param urlString absolute URL of the page to scrape
	 * @throws Exception if the URL is malformed, the connection cannot be opened,
	 *                   or the page cannot be parsed
	 */
	public void getText(String urlString) throws Exception {
		URL url = new URL(urlString);
		Parser parser = new Parser(url.openConnection());
		parser.setEncoding("UTF-8");

		// Keep only <table> nodes; everything else on the page is ignored.
		NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
		NodeList tables = parser.parse(tableFilter);
		System.out.println(tables);

		for (int i = 0; i < tables.size(); i++) {
			TableTag table = (TableTag) tables.elementAt(i);
			for (TableRow row : table.getRows()) {
				System.out.println("<tr>  :" + row.toPlainTextString());
				for (TableColumn cell : row.getColumns()) {
					// Decode entities back into real characters. The original code
					// called escapeHtml3 here, which encodes instead of decoding and
					// therefore never produced readable text.
					String text = StringEscapeUtils.unescapeHtml4(cell.toPlainTextString());
					System.out.println("<td>  :" + text + "</td>");
				}
			}
		}
	}

	/**
	 * Runs the demo against the default page.
	 *
	 * @param args unused
	 * @throws Exception propagated from {@link #getText()}
	 */
	public static void main(String[] args) throws Exception {
		GetURLText getURLText = new GetURLText();
		getURLText.getText();
	}
}
 

 

你可能感兴趣的:(unicode,网页抓取,Htmlpaser)