HtmlParser初体验

package com.lch.parser;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

/**
 * Small HtmlParser demo: loads a page, selects the DIV elements carrying a
 * particular {@code class} attribute, prints their markup and then lists the
 * links found inside each of them.
 */
public class HtmlPT {

	/** Value of the {@code class} attribute identifying the DIV blocks of interest. */
	private static final String TARGET_DIV_CLASS = "Yaowentitle";

	/**
	 * Fetches the configured page and, for every matching DIV, prints its HTML
	 * followed by the links it contains.
	 *
	 * @param args unused
	 * @throws ParserException if the page cannot be fetched or parsed
	 */
	public static void main(String[] args) throws ParserException {
		Parser parser = new Parser();
		parser.setURL("*****************"); // address of the page to parse

		// Match on the parsed tag name and attribute value rather than on the raw
		// tag text, so quoting, letter case of the tag name and additional
		// attributes in the page source do not prevent a match.
		NodeFilter targetDivFilter = new AndFilter(
				new TagNameFilter("DIV"),
				new HasAttributeFilter("class", TARGET_DIV_CLASS));
		NodeList divs = parser.extractAllNodesThatMatch(targetDivFilter);

		for (int i = 0; i < divs.size(); i++) {
			String divHtml = divs.elementAt(i).toHtml();
			System.out.println(divHtml);
			pageLink(divHtml);
		}
	}

	/**
	 * Parses an HTML fragment and prints the target and {@code TITLE} attribute
	 * of every anchor tag found in it, at any nesting depth.
	 *
	 * @param cStr HTML fragment to scan; {@code null} or empty input prints nothing
	 * @throws ParserException if the fragment cannot be parsed
	 */
	public static void pageLink(String cStr) throws ParserException {
		if (cStr == null || cStr.length() == 0) {
			return;
		}

		// Wrap the fragment in a body element so HtmlPage collects its nodes
		// and hands them back through getBody().
		Parser parser = new Parser("<body>" + cStr + "</body>");
		HtmlPage page = new HtmlPage(parser);
		parser.visitAllNodesWith(page);

		NodeList body = page.getBody();
		if (body == null) {
			return;
		}

		// 'true' makes the extraction recurse into nested elements.
		NodeList anchors = body.extractAllNodesThatMatch(new TagNameFilter("A"), true);
		for (int i = 0; i < anchors.size(); i++) {
			Node node = anchors.elementAt(i);
			if (!(node instanceof LinkTag)) {
				// Skip anything that is not a LinkTag instead of failing the cast.
				continue;
			}
			LinkTag link = (LinkTag) node;
			System.out.println("link : " + link.getLink());
			System.out.println("title : " + link.getAttribute("TITLE"));
		}
	}
}

很强大,比自己慢慢读取解析要方便得多!

你可能感兴趣的:(HtmlParser)