HTMLParser初试

    blog迁移至: http://www.micmiu.com

HTMLParser是目前Java领域中应用比较广泛的HTML解析库之一。
HTMLParser的主页是 http://htmlparser.sourceforge.net/
初次接触HTMLParser,它的核心模块是org.htmlparser.Parser类。
下面介绍几种初始化Parser的方法,详见代码:
package com.htmlparser;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;

/**
 * Small demo of the HTMLParser library: shows several ways to construct an
 * {@code org.htmlparser.Parser} and how to pull tags out of a page with a
 * {@code NodeFilter}.
 *
 * @author Michael
 */
public class TestMain {

    /** Charset name used when reading local HTML files and when parsing HTML strings. */
    private static final String ENCODE = "UTF-8";

    /**
     * Entry point: fetches a search-result page and prints every anchor tag in it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        TestMain test = new TestMain();
        // Alternative URL to try:
        // "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=nero9%E5%88%BB%E5%BD%95ape&aq=f&aqi=&aql=&oq=&gs_rfai=";
        String url = "http://www.baidu.com/s?wd=nero9%BF%CC%C2%BCape&oq=nero9k&rsp=1&f=3&sugT=6679";
        test.testNodeFilter(url);
    }

    /**
     * Demonstrates several ways of initialising a {@code Parser}: from a URL
     * string, from an open connection, from the contents of a local file and
     * from an inline HTML string. The parsers are only constructed, not used.
     */
    private void testInitParser() {
        HttpURLConnection con = null;
        try {
            // 1. Empty parser that is pointed at a URL afterwards.
            Parser parser1 = new Parser();
            parser1.setURL("http://www.baidu.com");
            // Re-applies the encoding the parser itself reports.
            // NOTE(review): presumably a no-op — confirm against the HTMLParser docs.
            parser1.setEncoding(parser1.getEncoding());

            // 2. From an already opened URL connection.
            con = openConnection("http://www.baidu.com");
            Parser parser2 = new Parser(con);

            // 3. From the text of a local HTML file. readHtmlFile returns null
            // when the file cannot be read, so skip this variant in that case
            // instead of aborting the remaining demonstrations.
            String htmlString = this.readHtmlFile("d:/test/test.html");
            if (htmlString != null) {
                Parser parser3 = Parser.createParser(htmlString, ENCODE);
            }

            // 4. From an inline HTML string.
            String htmlStr1 = "<html><head><title>Test</title>"
                    + "<link href='/test01/css.css' type='text/css' rel='stylesheet'/>"
                    + "</head><body><div><a href='www.baidu.com'  target='_blank'>baidu</a></div>"
                    + "<div><a href='www.sina.com' target='_blank'>sina</a></div></body></html>";
            Parser parser4 = new Parser(htmlStr1);

        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /**
     * Downloads the page at {@code url} and prints the HTML of every {@code A}
     * tag it contains to standard output. Failures are reported via a stack
     * trace; the method itself never throws.
     *
     * @param url address of the page to fetch and scan
     */
    private void testNodeFilter(String url) {
        System.out.println("NodeFilter start...");
        HttpURLConnection con = null;
        try {
            con = openConnection(url);
            Parser parser = new Parser(con);
            // Re-applies the encoding the parser itself reports.
            // NOTE(review): presumably a no-op — confirm against the HTMLParser docs.
            parser.setEncoding(parser.getEncoding());

            NodeFilter filter = new TagNameFilter("A");
            NodeList list = parser.extractAllNodesThatMatch(filter);
            for (int i = 0; i < list.size(); i++) {
                System.out.println(list.elementAt(i).toHtml());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the underlying HTTP connection once parsing is finished.
            if (con != null) {
                con.disconnect();
            }
        }

        System.out.println("NodeFilter end");
    }

    /**
     * Opens and connects an HTTP connection to the given address.
     *
     * @param url address to connect to
     * @return the connected connection; the caller is responsible for disconnecting it
     * @throws IOException if the URL is malformed or the connection cannot be established
     */
    private static HttpURLConnection openConnection(String url) throws IOException {
        // NOTE(review): the JVM-wide default is switched to "follow redirects",
        // but this connection then opts out again, so a 3xx response is handed
        // to the parser as-is. Kept as in the original; confirm which is intended.
        HttpURLConnection.setFollowRedirects(true);
        URL netUrl = new URL(url);
        HttpURLConnection con = (HttpURLConnection) netUrl.openConnection();
        con.setInstanceFollowRedirects(false);
        con.connect();
        return con;
    }

    /**
     * Reads a whole HTML file into a string, decoding it with {@link #ENCODE}.
     * Every line is terminated with {@code '\n'} in the result so that tokens
     * on adjacent lines are not glued together.
     *
     * @param htmlFileName path of the file to read
     * @return the file contents, or {@code null} if the file could not be read
     */
    private String readHtmlFile(String htmlFileName) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(
                    new File(htmlFileName)), ENCODE));
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() strips the line terminator; put one back.
                html.append(line).append('\n');
            }
            return html.toString();
        } catch (Exception e) {
            // Report the failure instead of hiding it; callers still get null.
            e.printStackTrace();
            return null;
        } finally {
            if (null != reader) {
                try {
                    reader.close();
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                }
            }
        }
    }
}

你可能感兴趣的:(java,html,OpenSource,HtmlParser,sourceforge)