HtmlParser爬取气象信息实例

闲来无事,试用了一下 HtmlParser。本想把抓取到的内容发送到手机上,后来没有做成,目前只是把杭州的天气情况拼装成了一条文本消息。

import java.util.HashMap;
import java.util.Map;

import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Downloads the forecast page configured for {@link #PARSE_CITY}, pulls the blocks for the
 * next two days out of it and prints a one-line text summary built from {@link #TEXT_MSG}.
 *
 * <p>The page is located by CSS class names ({@code everyDay}, {@code everyDayelse},
 * {@code w365}); the positions of date, weather and temperature inside a day block are fixed
 * line indexes, so the class is tightly coupled to the layout of that page.
 */
public class MyWeatherTest {

    private static final Logger              logger         = Logger.getLogger(MyWeatherTest.class);
    private static final String              PARSE_CITY     = "杭州";
    private static final String              CLASS_NODE     = "class";
    private static final String              CLASS_CITY     = "w365";
    private static final String              CLASS_DAY      = "everyDay";
    private static final String              CLASS_DAY_ELSE = "everyDayelse";

    /** Message template; every placeholder is filled once, in order of appearance. */
    private static final String              TEXT_MSG       = "$CITY明天$DATE$STATUS,温度为$TMPT;后天$DATE$STATUS,温度为$TMPT   中央气象局 $LAST_MODIFY_TIME";
    private static final Map<String, String> cityURLMap     = new HashMap<String, String>();
    static {
        cityURLMap.put(PARSE_CITY, "http://www.nmc.gov.cn/publish/forecast/AZJ/hangzhou.html");
    }

    /**
     * Builds the forecast message for {@link #PARSE_CITY} and prints it to standard output.
     * Parsing problems (including an unexpected page layout) are logged and nothing is printed.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String url = cityURLMap.get(PARSE_CITY);
        String text = fill(TEXT_MSG, "$CITY", PARSE_CITY);

        try {
            // If today's block uses the "everyDayelse" class, start one position earlier.
            int start = 1;
            if (extractByClass(url, CLASS_DAY_ELSE).size() > 0) {
                start--;
            }

            // Take the content divs of the following two days.
            NodeList days = extractByClass(url, CLASS_DAY);
            if (days.size() < start + 2) {
                throw new ParserException("expected at least " + (start + 2) + " '" + CLASS_DAY
                        + "' blocks but found " + days.size());
            }
            for (int i = start; i < start + 2; i++) {
                text = fillForecast(text, days.elementAt(i).toPlainTextString());
            }

            // Look for the "last updated" line among the city blocks.
            NodeList cityNodes = extractByClass(url, CLASS_CITY);
            String lastModified = "";
            for (int i = 0; i < cityNodes.size(); i++) {
                // Strip whitespace and the literal "&nbsp;" entity (not its individual letters).
                String candidate = cityNodes.elementAt(i).toPlainTextString()
                        .replaceAll("[ \t\n]|&nbsp;", "");
                if (candidate.startsWith("最后更新时间")) {
                    lastModified = candidate;
                    break;
                }
            }
            // When no such line exists the placeholder is removed instead of being printed.
            text = fill(text, "$LAST_MODIFY_TIME", lastModified);

            System.out.println(text);
        } catch (ParserException e) {
            logger.error("Failed to build the weather message from " + url, e);
        }
    }

    /**
     * Fetches {@code url} with a fresh parser and returns every node whose {@code class}
     * attribute equals {@code cssClass}. A new parser is created per call because
     * {@code extractAllNodesThatMatch} consumes the page.
     */
    private static NodeList extractByClass(String url, String cssClass) throws ParserException {
        Parser parser = new Parser(url);
        parser.setEncoding(parser.getEncoding());
        NodeFilter filter = new HasAttributeFilter(CLASS_NODE, cssClass);
        return parser.extractAllNodesThatMatch(filter);
    }

    /**
     * Fills the next {@code $DATE}, {@code $STATUS} and {@code $TMPT} placeholders of
     * {@code text} from the plain text of one day block.
     *
     * @throws ParserException if the block has fewer lines than the fixed indexes require
     */
    private static String fillForecast(String text, String plainText) throws ParserException {
        String[] str = plainText.replaceAll("[ ]|[\t]", "").split("\n");
        if (str.length < 9) {
            throw new ParserException("unexpected day block layout: only " + str.length + " lines");
        }

        // Date.
        String result = fill(text, "$DATE", str[1]);

        // Weather: a single status, or "A转B" when the two status lines differ.
        String status = str[6].equals(str[7]) ? str[6] : str[6] + "转" + str[7];
        result = fill(result, "$STATUS", status);

        // Temperature: high and low are either on one line or on two consecutive lines.
        String temperature;
        if (str[8].contains("低温")) {
            temperature = str[8].replace("高温", "").replace("低温", "-");
        } else {
            if (str.length < 10) {
                throw new ParserException("unexpected day block layout: low temperature line missing");
            }
            temperature = str[8].replace("高温", "") + str[9].replace("低温", "-");
        }
        return fill(result, "$TMPT", temperature);
    }

    /**
     * Replaces the first literal occurrence of {@code placeholder} in {@code text} with
     * {@code value}. Plain string operations are used so that {@code $} or {@code \} in the
     * scraped value cannot be misread as a regex group reference.
     */
    private static String fill(String text, String placeholder, String value) {
        int at = text.indexOf(placeholder);
        if (at < 0) {
            return text;
        }
        return text.substring(0, at) + value + text.substring(at + placeholder.length());
    }

}

上面的代码可以跑起来,不过写得比较粗糙,主要参考的是下面这个 HtmlParser 用法测试类:

import java.net.URL;

import junit.framework.TestCase;  
  
import org.apache.log4j.Logger;  
import org.htmlparser.Node;  
import org.htmlparser.NodeFilter;  
import org.htmlparser.Parser;  
import org.htmlparser.Tag;  
import org.htmlparser.beans.LinkBean;  
import org.htmlparser.filters.NodeClassFilter;  
import org.htmlparser.filters.OrFilter;  
import org.htmlparser.filters.TagNameFilter;  
import org.htmlparser.tags.HeadTag;  
import org.htmlparser.tags.ImageTag;  
import org.htmlparser.tags.InputTag;  
import org.htmlparser.tags.LinkTag;  
import org.htmlparser.tags.OptionTag;  
import org.htmlparser.tags.SelectTag;  
import org.htmlparser.tags.TableColumn;  
import org.htmlparser.tags.TableRow;  
import org.htmlparser.tags.TableTag;  
import org.htmlparser.tags.TitleTag;  
import org.htmlparser.util.NodeIterator;  
import org.htmlparser.util.NodeList;  
import org.htmlparser.util.ParserException;  
import org.htmlparser.visitors.HtmlPage;  
import org.htmlparser.visitors.NodeVisitor;  
import org.htmlparser.visitors.ObjectFindingVisitor;  
  
public class ParserTestCase extends TestCase {  
  
    private static final Logger logger = Logger.getLogger(ParserTestCase.class);  
  
    public ParserTestCase(String name) {  
        super(name);  
    }  
    /* 
     * 测试ObjectFindVisitor的用法 
     */  
    public void testImageVisitor() {  
        try {  
            ImageTag imgLink;  
            ObjectFindingVisitor visitor = new ObjectFindingVisitor(  
                    ImageTag.class);  
            Parser parser = new Parser();  
            parser.setURL("http://profile.china.alibaba.com/user/dengminhui12.html");  
            parser.setEncoding(parser.getEncoding());  
            parser.visitAllNodesWith(visitor);  
            Node[] nodes = visitor.getTags();  
            logger.fatal("result of testImageVisitor : size = " + nodes.length);  
            for (int i = 0; i < nodes.length; i++) {  
                imgLink = (ImageTag) nodes[i];  
                logger.fatal("testImageVisitor() ImageURL = "  
                        + imgLink.getImageURL()); 
            }  
        }  
        catch (Exception e) {  
            e.printStackTrace();  
        }  
    }  
    /* 
     * 测试TagNameFilter用法 
     */  
    public void testNodeFilter() {  
        try {  
            NodeFilter filter = new TagNameFilter("IMG");  
            Parser parser = new Parser();  
            parser.setURL("http://profile.china.alibaba.com/user/dengminhui12.html");  
            parser.setEncoding(parser.getEncoding());  
            NodeList list = parser.extractAllNodesThatMatch(filter);  
            logger.fatal("result of testNodeFilter : size = " + list.size()); 
            for (int i = 0; i < list.size(); i++) {  
                logger.fatal("testNodeFilter() " + list.elementAt(i).toHtml());  
            }  
        } catch (Exception e) {  
            e.printStackTrace();  
        }  
  
    }  
    /* 
     * 测试NodeClassFilter用法 
     */  
    public void testLinkTag() {  
        try {  
  
            NodeFilter filter = new NodeClassFilter(LinkTag.class);  
            Parser parser = new Parser();  
            parser.setURL("http://profile.china.alibaba.com/user/dengminhui12.html");  
            parser.setEncoding(parser.getEncoding());  
            NodeList list = parser.extractAllNodesThatMatch(filter);  
            logger.fatal("result of testLinkTag : size = " + list.size());  
            for (int i = 0; i < list.size(); i++) {  
                LinkTag node = (LinkTag) list.elementAt(i);  
                logger.fatal("testLinkTag() Link is :" + node.extractLink());  
            }  
        } catch (Exception e) {  
            e.printStackTrace();  
        }  
  
    }  
    /* 
     * 测试<link href=" text=’text/css’ rel=’stylesheet’ />用法 
     */  
    public void testLinkCSS() {  
        try {  
  
            Parser parser = new Parser();  
            parser  
                    .setInputHTML("<head><title>Link Test</title>"  
                            + "<link href=’/test01/css.css’ text=’text/css’ rel=’stylesheet’ />"  
                            + "<link href=’/test02/css.css’ text=’text/css’ rel=’stylesheet’ />"  
                            + "</head>" + "<body>");  
            parser.setEncoding(parser.getEncoding());  
            NodeList nodeList = null;  
  
            for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {  
                Node node = e.nextNode();  
                logger  
                        .fatal("testLinkCSS()" + node.getText()  
                                + node.getClass());  
  
            }  
        } catch (Exception e) {  
            e.printStackTrace();  
        }  
    }  
    /* 
     * 测试OrFilter的用法 
     */  
    public void testOrFilter() {  
        NodeFilter inputFilter = new NodeClassFilter(InputTag.class);  
        NodeFilter selectFilter = new NodeClassFilter(SelectTag.class);  
        Parser myParser;  
        NodeList nodeList = null;  
  
        try {  
            Parser parser = new Parser();  
            parser  
                    .setInputHTML("<head><title>OrFilter Test</title>"  
                            + "<link href=’/test01/css.css’ text=’text/css’ rel=’stylesheet’ />"  
                            + "<link href=’/test02/css.css’ text=’text/css’ rel=’stylesheet’ />"  
                            + "</head>"  
                            + "<body>"  
                            + "<input type=’text’ value=’text1′ name=’text1′/>"  
                            + "<input type=’text’ value=’text2′ name=’text2′/>"  
                            + "<select><option id=’1′>1</option><option id=’2′>2</option><option id=’3′></option></select>"  
                            + "<a href=’http://www.yeeach.com’>yeeach.com</a>"  
                            + "</body>");  
  
            parser.setEncoding(parser.getEncoding());  
            OrFilter lastFilter = new OrFilter();  
            lastFilter.setPredicates(new NodeFilter[] { selectFilter,  
                    inputFilter });  
            nodeList = parser.parse(lastFilter);  
            for (int i = 0; i <= nodeList.size(); i++) {  
                if (nodeList.elementAt(i) instanceof InputTag) {  
                    InputTag tag = (InputTag) nodeList.elementAt(i);  
                    logger.fatal("OrFilter tag name is :" + tag.getTagName()  
                            + " ,tag value is:" + tag.getAttribute("value"));  
                }  
                if (nodeList.elementAt(i) instanceof SelectTag) {  
                    SelectTag tag = (SelectTag) nodeList.elementAt(i);  
                    NodeList list = tag.getChildren();  
  
                    for (int j = 0; j < list.size(); j++) {  
                        OptionTag option = (OptionTag) list.elementAt(j);  
                        logger  
                                .fatal("OrFilter Option"  
                                        + option.getOptionText());  
                    }  
  
                }  
            }  
  
        } catch (ParserException e) {  
            e.printStackTrace();  
        }  
    }  
    /* 
     * 测试对<table><tr><td></td></tr></table>的解析 
     */  
    public void testTable() {  
        Parser myParser;  
        NodeList nodeList = null;  
        myParser = Parser.createParser("<body> " + "<table id=’table1′ >"  
                + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"  
                + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"  
                + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>"  
                + "<table id=’table2′ >"  
                + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"  
                + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"  
                + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>"  
                + "</body>", "GBK");  
        NodeFilter tableFilter = new NodeClassFilter(TableTag.class);  
        OrFilter lastFilter = new OrFilter();  
        lastFilter.setPredicates(new NodeFilter[] { tableFilter });  
        try {  
            nodeList = myParser.parse(lastFilter);  
            for (int i = 0; i <= nodeList.size(); i++) {  
                if (nodeList.elementAt(i) instanceof TableTag) {  
                    TableTag tag = (TableTag) nodeList.elementAt(i);  
                    TableRow[] rows = tag.getRows();  
  
                    for (int j = 0; j < rows.length; j++) {  
                        TableRow tr = (TableRow) rows[j];  
                        TableColumn[] td = tr.getColumns();  
                        for (int k = 0; k < td.length; k++) {  
                            logger.fatal("<td>" + td[k].toPlainTextString());  
                        }  
  
                    }  
  
                }  
            }  
  
        } catch (ParserException e) {  
            e.printStackTrace();  
        }  
    }  
    /* 
     * 测试NodeVisitor的用法,遍历所有节点 
     */  
    public void testVisitorAll() {  
        try {  
            Parser parser = new Parser();  
            parser.setURL("http://profile.china.alibaba.com/user/dengminhui12.html");  
            parser.setEncoding(parser.getEncoding());  
            NodeVisitor visitor = new NodeVisitor() {  
                public void visitTag(Tag tag) {  
                    logger.fatal("testVisitorAll()  Tag name is :"  
                            + tag.getTagName() + " \n Class is :"  
                            + tag.getClass());  
                }  
  
            };  
  
            parser.visitAllNodesWith(visitor);  
        } catch (ParserException e) {  
            e.printStackTrace();  
        }  
    }  
    /* 
     * 测试对指定Tag的NodeVisitor的用法 
     */  
    public void testTagVisitor() {  
        try {  
  
            Parser parser = new Parser(  
                    "<head><title>dddd</title>"  
                            + "<link href=’/test01/css.css’ text=’text/css’ rel=’stylesheet’ />"  
                            + "<link href=’/test02/css.css’ text=’text/css’ rel=’stylesheet’ />"  
                            + "</head>" + "<body>"  
                            + "<a id='hah' href=’http://www.yeeach.com’>yeeach.com</a>"  
                            + "</body>");  
            NodeVisitor visitor = new NodeVisitor() {  
                public void visitTag(Tag tag) {  
                    if (tag instanceof HeadTag) {  
                        logger.fatal("visitTag() HeadTag : Tag name is :"  
                                + tag.getTagName() + " \n Class is :"  
                                + tag.getClass() + "\n Text is :"  
                                + tag.getText());  
                    } else if (tag instanceof TitleTag) {  
                        logger.fatal("visitTag() TitleTag : Tag name is :"  
                                + tag.getTagName() + " \n Class is :"  
                                + tag.getClass() + "\n Text is :"  
                                + tag.getText());  
  
  
                    } else if (tag instanceof LinkTag) {  
                        logger.fatal("visitTag() LinkTag : Tag name is :"  
                                + tag.getTagName() + " \n Class is :"  
                                + tag.getClass() + "\n Text is :"  
                                + tag.getText() + " \n getAttribute is :"  
                                + tag.getAttribute("href"));  
                    } else {  
                        logger.fatal("visitTag() : Tag name is :"  
                                + tag.getTagName() + " \n Class is :"  
                                + tag.getClass() + "\n Text is :"  
                                + tag.getText());  
                    }  
  
                }  
  
            };  
  
            parser.visitAllNodesWith(visitor);  
        } catch (Exception e) {  
            e.printStackTrace();  
        }  
    }  
    /* 
     * 测试HtmlPage的用法 
     */  
    public void testHtmlPage() {  
        String inputHTML = "<html>" + "<head>"  
                + "<title>Welcome to the HTMLParser website</title>"  
                + "</head>" + "<body>" + "Welcome to HTMLParser"  
                + "<table id=’table1′ >"  
                + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"  
                + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"  
                + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>"  
                + "<table id=’table2′ >"  
                + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"  
                + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"  
                + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>"  
                + "</body>" + "</html>";  
        Parser parser = new Parser();  
        try {  
            parser.setInputHTML(inputHTML);  
            parser.setEncoding(parser.getURL());  
            HtmlPage page = new HtmlPage(parser);  
            parser.visitAllNodesWith(page);  
            logger.fatal("testHtmlPage -title is :" + page.getTitle());  
            NodeList list = page.getBody();  
  
            for (NodeIterator iterator = list.elements(); iterator  
                    .hasMoreNodes();) {  
                Node node = iterator.nextNode();  
                logger.fatal("testHtmlPage -node  is :" + node.toHtml());  
            }  
  
        } catch (ParserException e) {  
            // TODO Auto-generated catch block  
            e.printStackTrace();  
        }  
    }  
    /* 
     * 测试LinkBean的用法 
     */  
    public void testLinkBean() {  
        Parser parser = new Parser();  
  
        LinkBean linkBean = new LinkBean();  
        linkBean.setURL("http://profile.china.alibaba.com/user/dengminhui12.html");  
        URL[] urls = linkBean.getLinks();  
  
        for (int i = 0; i < urls.length; i++) {  
            URL url = urls[i];  
            logger.fatal("testLinkBean() -url  is :" + url);  
        }  
  
    }  
  
} 
 

 

你可能感兴趣的:(apache,html,log4j,css,J#)