HtmlParser —— 抓取百度新歌TOP100

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.Span;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;


/**
 * Downloads a Baidu MP3 chart page and prints, for every table row found on
 * the page, the text of each span, the text of each anchor and the target of
 * each link marked with {@code class="search"}.
 *
 * <p>One output line is written per table row; individual values are
 * separated by three spaces.
 */
public class test3 {
	/** Address of the chart page that is parsed. */
	private static final String CHART_URL = "http://list.mp3.baidu.com/top/top100.html";

	/** Charset name handed to the parser when re-parsing a cell's inner HTML. */
	private static final String CELL_CHARSET = "utf-8";

	/** Separator printed after every extracted value. */
	private static final String SEPARATOR = "   ";

	private static final NodeFilter TABLE_FILTER = new NodeClassFilter(TableTag.class);
	private static final NodeFilter SPAN_FILTER = new NodeClassFilter(Span.class);
	private static final NodeFilter LINK_FILTER = new TagNameFilter("A");
	private static final NodeFilter SEARCH_FILTER = new HasAttributeFilter("class", "search");

	/**
	 * Entry point: parses the chart page and prints the extracted values.
	 *
	 * @param args command-line arguments (unused)
	 * @throws ParserException if the page or one of its cells cannot be parsed
	 */
	public static void main(String[] args) throws ParserException {
		Parser pageParser = new Parser(CHART_URL);
		NodeList tables = pageParser.extractAllNodesThatMatch(TABLE_FILTER);
		for (int t = 0; t < tables.size(); t++) {
			printTable((TableTag) tables.elementAt(t));
		}
	}

	/** Prints every row of the given table, one output line per row. */
	private static void printTable(TableTag table) throws ParserException {
		for (int r = 0; r < table.getRowCount(); r++) {
			TableColumn[] columns = table.getRow(r).getColumns();
			for (int c = 0; c < columns.length; c++) {
				printCell(columns[c]);
			}
			System.out.println();
		}
	}

	/**
	 * Re-parses the inner HTML of one cell and prints its spans, anchor texts
	 * and search links, in that order.
	 */
	private static void printCell(TableColumn cell) throws ParserException {
		// A dedicated parser per cell keeps the page-level parser untouched.
		Parser cellParser = Parser.createParser(cell.getStringText(), CELL_CHARSET);

		printSpanTexts(cellParser.extractAllNodesThatMatch(SPAN_FILTER));

		// The parser has been consumed by the previous extraction; rewind it.
		cellParser.reset();
		printLinkTexts(cellParser.extractAllNodesThatMatch(LINK_FILTER));

		cellParser.reset();
		printSearchLinks(cellParser.extractAllNodesThatMatch(SEARCH_FILTER));
	}

	/** Prints the plain text of every node in the list. */
	private static void printSpanTexts(NodeList spans) {
		for (int i = 0; i < spans.size(); i++) {
			System.out.print(spans.elementAt(i).toPlainTextString() + SEPARATOR);
		}
	}

	/** Prints the link text of every anchor in the list. */
	private static void printLinkTexts(NodeList anchors) {
		for (int i = 0; i < anchors.size(); i++) {
			Object node = anchors.elementAt(i);
			if (!(node instanceof LinkTag)) {
				continue;
			}
			String text = ((LinkTag) node).getLinkText();
			// Real null check: equals(null) can never detect a null value.
			if (text != null) {
				System.out.print(text + SEPARATOR);
			}
		}
	}

	/** Prints the link target of every anchor in the list. */
	private static void printSearchLinks(NodeList candidates) {
		for (int i = 0; i < candidates.size(); i++) {
			Object node = candidates.elementAt(i);
			// The attribute filter matches any tag with class="search",
			// so only anchors may be treated as links.
			if (!(node instanceof LinkTag)) {
				continue;
			}
			String target = ((LinkTag) node).getLink();
			if (target != null) {
				System.out.print(target + SEPARATOR);
			}
		}
	}
}

 

你可能感兴趣的:(html,百度,J#)