抓取HTML网页数据

(转)HTMLParser 过滤器(filter)的使用

该类并不是一个通用的工具类,需要按自己的需求自行实现;这里只是记录了 htmlparser.jar 包的一些用法,仅此而已。
详细看这里:http://gundumw100.javaeye.com/blog/704311

 

import java.util.*;   

import org.htmlparser.Node;   

import org.htmlparser.NodeFilter;   

import org.htmlparser.Parser;   

import org.htmlparser.filters.AndFilter;   

import org.htmlparser.filters.HasAttributeFilter;   

import org.htmlparser.filters.NodeClassFilter;   

import org.htmlparser.filters.TagNameFilter;   

import org.htmlparser.tags.BodyTag;   

import org.htmlparser.tags.LinkTag;   

import org.htmlparser.util.NodeList;   

import org.htmlparser.util.ParserException;   

  

  

/**
 * Usage examples for the HTMLParser library: fetch a page and pull links, the body, or a
 * handful of specific elements out of it.
 *
 * <p>This is not a general-purpose utility. In particular {@link #termGet(String, String)}
 * uses hard-coded tag names and {@code class}/{@code id} values, so it has to be adapted to
 * the page being scraped.
 *
 * <p>All public methods report a {@link ParserException} through the logger and return
 * whatever was collected up to that point instead of propagating the failure.
 */
public class HtmlparseUtil {

    private static final java.util.logging.Logger LOG =
            java.util.logging.Logger.getLogger(HtmlparseUtil.class.getName());

    /** Character offsets of the "pre-term" value inside the trimmed text of the "box post" div. */
    private static final int PRE_TERM_START = 10;
    private static final int PRE_TERM_END = 20;

    // Project-local HTTP helper used to download the page text.
    WebHttpClient util = new WebHttpClient();

    /**
     * Collects every link of a page.
     *
     * @param url     address of the page to download
     * @param charset character set used both for the download and for parsing
     * @return map from link target (href) to its cleaned-up link text; empty when the page
     *         has no links, could not be fetched, or could not be parsed. Links sharing the
     *         same href overwrite each other, the last one wins.
     */
    public Map<String, String> linkGet(String url, String charset) {
        Map<String, String> linkMap = new HashMap<String, String>();
        String content = util.getWebContentByGet(url, charset);
        if (content == null) {
            // NOTE(review): assumes a failed download may yield null - confirm against WebHttpClient.
            return linkMap;
        }
        try {
            Parser parser = Parser.createParser(content, charset);
            // Keep only <a> tags; the class filter guarantees the cast below is safe.
            NodeList links = parser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
            for (int i = 0; i < links.size(); i++) {
                LinkTag link = (LinkTag) links.elementAt(i);
                linkMap.put(link.getLink(), processText(link.getLinkText()));
            }
        } catch (ParserException e) {
            LOG.log(java.util.logging.Level.WARNING, "Failed to extract links from " + url, e);
        }
        return linkMap;
    }

    /**
     * Returns the content of the page's {@code <body>} element.
     *
     * @param url     address of the page to download
     * @param charset character set used both for the download and for parsing
     * @return the body content as reported by {@code BodyTag.getBody()}; the empty string when
     *         there is no body tag, or the page could not be fetched or parsed. If several body
     *         tags are found, the last one wins.
     */
    public String bodyGet(String url, String charset) {
        String body = "";
        String content = util.getWebContentByGet(url, charset);
        if (content == null) {
            // NOTE(review): assumes a failed download may yield null - confirm against WebHttpClient.
            return body;
        }
        try {
            Parser parser = Parser.createParser(content, charset);
            NodeList bodies = parser.extractAllNodesThatMatch(new NodeClassFilter(BodyTag.class));
            for (int i = 0; i < bodies.size(); i++) {
                body = ((BodyTag) bodies.elementAt(i)).getBody();
            }
        } catch (ParserException e) {
            LOG.log(java.util.logging.Level.WARNING, "Failed to extract body from " + url, e);
        }
        return body;
    }

    /**
     * Extracts a fixed set of fields from a page, selected by hard-coded tag/attribute pairs.
     *
     * <p>Keys of the returned map (each present only if a matching element was found):
     * <ul>
     *   <li>{@code "term"} - text of {@code <span class="term">}</li>
     *   <li>{@code "start-time"} - text of {@code <span class="start-time">}</li>
     *   <li>{@code "end-time"} - text of {@code <span id="J_SingleEndTimeLabel">}</li>
     *   <li>{@code "pre-term"} - characters 10..20 of the trimmed text of
     *       {@code <div class="box post">}</li>
     *   <li>{@code "numbers"} - texts of the elements with class {@code "J_AwardNumber"}, each
     *       prefixed with a comma, then {@code "|"}, then texts of the elements with class
     *       {@code "blue J_AwardNumber"}, each followed by a comma</li>
     * </ul>
     * When several elements match a single-valued field, the last one wins.
     *
     * @param url     address of the page to download
     * @param charset character set used both for the download and for parsing
     * @return the extracted fields; possibly partial when parsing fails midway, and empty when
     *         the page could not be fetched
     */
    public Map<String, String> termGet(String url, String charset) {
        Map<String, String> map = new HashMap<String, String>();
        String content = util.getWebContentByGet(url, charset);
        if (content == null) {
            // NOTE(review): assumes a failed download may yield null - confirm against WebHttpClient.
            return map;
        }
        try {
            putLastText(map, "term",
                    select(content, charset, tagWithAttribute("span", "class", "term")));
            putLastText(map, "start-time",
                    select(content, charset, tagWithAttribute("span", "class", "start-time")));
            putLastText(map, "end-time",
                    select(content, charset, tagWithAttribute("span", "id", "J_SingleEndTimeLabel")));

            NodeList posts = select(content, charset, tagWithAttribute("div", "class", "box post"));
            for (int i = 0; i < posts.size(); i++) {
                String text = posts.elementAt(i).toPlainTextString().trim();
                // Only slice when the text actually reaches the expected offset; a shorter
                // text previously caused a StringIndexOutOfBoundsException.
                if (text.length() > PRE_TERM_START) {
                    int end = Math.min(PRE_TERM_END, text.length());
                    map.put("pre-term", text.substring(PRE_TERM_START, end).trim());
                }
            }

            // The award-number queries match on the class attribute alone, whatever the tag.
            StringBuilder numbers = new StringBuilder();
            NodeList plain = select(content, charset,
                    new HasAttributeFilter("class", "J_AwardNumber"));
            for (int i = 0; i < plain.size(); i++) {
                numbers.append(",").append(plain.elementAt(i).toPlainTextString());
            }
            numbers.append("|");
            NodeList blue = select(content, charset,
                    new HasAttributeFilter("class", "blue J_AwardNumber"));
            for (int i = 0; i < blue.size(); i++) {
                numbers.append(blue.elementAt(i).toPlainTextString()).append(",");
            }
            map.put("numbers", numbers.toString());
        } catch (ParserException e) {
            LOG.log(java.util.logging.Level.WARNING, "Failed to extract term data from " + url, e);
        }
        return map;
    }

    /** Builds a filter matching elements named {@code tag} whose {@code attribute} equals {@code value}. */
    private static NodeFilter tagWithAttribute(String tag, String attribute, String value) {
        return new AndFilter(new TagNameFilter(tag), new HasAttributeFilter(attribute, value));
    }

    /** Parses {@code content} from scratch with a new parser and returns the nodes accepted by {@code filter}. */
    private static NodeList select(String content, String charset, NodeFilter filter)
            throws ParserException {
        return Parser.createParser(content, charset).parse(filter);
    }

    /** Stores the plain text of each node under {@code key}, so the last node wins; no-op for an empty list. */
    private static void putLastText(Map<String, String> target, String key, NodeList nodes) {
        for (int i = 0; i < nodes.size(); i++) {
            target.put(key, nodes.elementAt(i).toPlainTextString());
        }
    }

    /**
     * Cleans up link text: trims it and removes every literal {@code &nbsp;} entity.
     *
     * @param content raw text, may be {@code null}
     * @return the cleaned text, or the empty string for {@code null} input
     */
    private String processText(String content) {
        if (content == null) {
            return "";
        }
        // Literal replacement; the pattern contains no regex metacharacters.
        return content.trim().replace("&nbsp;", "");
    }

    /** Demo: scrapes one hard-coded lottery page and prints the extracted fields. */
    public static void main(String[] str) {
        String url = "http://caipiao.taobao.com/lottery/order/lottery_dlt.htm?type=1";
        HtmlparseUtil util = new HtmlparseUtil();
        Map<String, String> map = util.termGet(url, "gb2312");
        // "term" is the plain text of an element like <span class="term">...<em>10074</em>...</span>
        System.out.println("term=" + map.get("term"));
        System.out.println("start-time=" + map.get("start-time"));
        System.out.println("end-time=" + map.get("end-time"));
        System.out.println("pre-term=" + map.get("pre-term"));
        System.out.println("numbers=" + map.get("numbers"));
    }
}

 

你可能感兴趣的:(html)