利用htmlparser抓取网页内容(一)

//转自:[url]http://www.blogjava.net/rocky/archive/2005/12/21/24997.html[/url]
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;

/**
* <br>
* 标题: <br>
* 功能概要: <br>
* 版权: cityyouth.cn (c) 2005 <br>
* 公司:上海城市青年网 <br>
* 创建时间:2005-12-21 <br>
* 修改时间: <br>
* 修改原因:
*
* @author 张伟
* @version 1.0
*/
public class TestYahoo {
         public static void testHtml() {
                 try {
                        String sCurrentLine;
                        String sTotalString;
                        sCurrentLine = "";
                        sTotalString = "";
                        java.io.InputStream l_urlStream;
                        java.net.URL l_url = new java.net.URL(
                                         "http://sports.sina.com.cn/iframe/nba/live/");
                        java.net.HttpURLConnection l_connection = (java.net.HttpURLConnection) l_url
                                        .openConnection();
                        l_connection.connect();
                        l_urlStream = l_connection.getInputStream();
                        java.io.BufferedReader l_reader = new java.io.BufferedReader(
                                        new java.io.InputStreamReader(l_urlStream));
                        while ((sCurrentLine = l_reader.readLine()) != null) {
                                sTotalString += sCurrentLine;
                        }
                        System.out.println(sTotalString);

                        System.out.println("====================");
                        String testText = extractText(sTotalString);
                        System.out.println(testText);
                } catch (Exception e) {
                        e.printStackTrace();
                }

        }

        /**
         * 抽取纯文本信息
         *
         * @param inputHtml
         * @return
         */
        public static String extractText(String inputHtml) throws Exception {
                StringBuffer text = new StringBuffer();

                Parser parser = Parser.createParser(new String(inputHtml.getBytes(),
                                "8859_1"), "8859-1");
                // 遍历所有的节点
                NodeList nodes = parser.extractAllNodesThatMatch(new NodeFilter() {
                        public boolean accept(Node node) {
                                return true;
                        }
                });
                Node node = nodes.elementAt(0);
                text.append(new String(node.toPlainTextString().getBytes("8859_1")));
                return text.toString();
        }

        /**
         * 读取文件的方式来分析内容. filePath也可以是一个Url.
         *
         * @param resource
         *                        文件/Url
         */
        public static void test5(String resource) throws Exception {
                Parser myParser = new Parser(resource);

                // 设置编码
                myParser.setEncoding("GBK");
                String filterStr = "table";
                NodeFilter filter = new TagNameFilter(filterStr);
                NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
                TableTag tabletag = (TableTag) nodeList.elementAt(11);
                        
                        System.out.println(tabletag.toHtml());
                        
                        System.out.println("==============");

        }

        /*
         * public static void main(String[] args) { TestYahoo testYahoo = new
         * TestYahoo(); testYahoo.testHtml(); }
         */
        public static void main(String[] args) throws Exception {
                test5("http://sports.yahoo.com/nba/scoreboard");
        }
}

你可能感兴趣的:(java,职场,HtmlParser,休闲)