htmlparser使用例子(全) 转载

1.import java.net.URL;  
2.  
3.import junit.framework.TestCase;  
4.  
5.import org.apache.log4j.Logger;  
6.import org.htmlparser.Node;  
7.import org.htmlparser.NodeFilter;  
8.import org.htmlparser.Parser;  
9.import org.htmlparser.Tag;  
10.import org.htmlparser.beans.LinkBean;  
11.import org.htmlparser.filters.NodeClassFilter;  
12.import org.htmlparser.filters.OrFilter;  
13.import org.htmlparser.filters.TagNameFilter;  
14.import org.htmlparser.tags.HeadTag;  
15.import org.htmlparser.tags.ImageTag;  
16.import org.htmlparser.tags.InputTag;  
17.import org.htmlparser.tags.LinkTag;  
18.import org.htmlparser.tags.OptionTag;  
19.import org.htmlparser.tags.SelectTag;  
20.import org.htmlparser.tags.TableColumn;  
21.import org.htmlparser.tags.TableRow;  
22.import org.htmlparser.tags.TableTag;  
23.import org.htmlparser.tags.TitleTag;  
24.import org.htmlparser.util.NodeIterator;  
25.import org.htmlparser.util.NodeList;  
26.import org.htmlparser.util.ParserException;  
27.import org.htmlparser.visitors.HtmlPage;  
28.import org.htmlparser.visitors.NodeVisitor;  
29.import org.htmlparser.visitors.ObjectFindingVisitor;  
30.  
31.public class T extends TestCase {  
32.  
33.  private static final Logger logger = Logger.getLogger(T.class);  
34.  
35.  public T(String name) {  
36.    super(name);  
37.  }  
38.  
39.  /* 
40.   * 测试ObjectFindVisitor的用法 
41.   */  
42.  public void testImageVisitor() {  
43.    try {  
44.      ImageTag imgLink;  
45.      ObjectFindingVisitor visitor = new ObjectFindingVisitor(ImageTag.class);  
46.      Parser parser = new Parser();  
47.      parser.setURL("http://www.google.com");  
48.      parser.setEncoding(parser.getEncoding());  
49.      parser.visitAllNodesWith(visitor);  
50.      Node[] nodes = visitor.getTags();  
51.      for (int i = 0; i < nodes.length; i++) {  
52.        imgLink = (ImageTag) nodes[i];  
53.        logger.fatal("testImageVisitor() ImageURL = " + imgLink.getImageURL());  
54.        logger.fatal("testImageVisitor() ImageLocation = " + imgLink.extractImageLocn());  
55.        logger.fatal("testImageVisitor() SRC = " + imgLink.getAttribute("SRC"));  
56.      }  
57.    } catch (Exception e) {  
58.      e.printStackTrace();  
59.    }  
60.  }  
61.  
62.  /* 
63.   * 测试TagNameFilter用法 
64.   */  
65.  public void testNodeFilter() {  
66.    try {  
67.      NodeFilter filter = new TagNameFilter("IMG");  
68.      Parser parser = new Parser();  
69.      parser.setURL("http://www.google.com");  
70.      parser.setEncoding(parser.getEncoding());  
71.      NodeList list = parser.extractAllNodesThatMatch(filter);  
72.      for (int i = 0; i < list.size(); i++) {  
73.        logger.fatal("testNodeFilter() " + list.elementAt(i).toHtml());  
74.      }  
75.    } catch (Exception e) {  
76.      e.printStackTrace();  
77.    }  
78.  
79.  }  
80.  
81.  /* 
82.   * 测试NodeClassFilter用法 
83.   */  
84.  public void testLinkTag() {  
85.    try {  
86.  
87.      NodeFilter filter = new NodeClassFilter(LinkTag.class);  
88.      Parser parser = new Parser();  
89.      parser.setURL("http://www.google.com");  
90.      parser.setEncoding(parser.getEncoding());  
91.      NodeList list = parser.extractAllNodesThatMatch(filter);  
92.      for (int i = 0; i < list.size(); i++) {  
93.        LinkTag node = (LinkTag) list.elementAt(i);  
94.        logger.fatal("testLinkTag() Link is :" + node.extractLink());  
95.      }  
96.    } catch (Exception e) {  
97.      e.printStackTrace();  
98.    }  
99.  
100.  }  
101.  
102.  /* 
103.   * 测试<link href=" text=’text/css’ rel=’stylesheet’ />用法 
104.   */  
105.  public void testLinkCSS() {  
106.    try {  
107.  
108.      Parser parser = new Parser();  
109.      parser.setInputHTML("<head><title>Link Test</title>"  
110.          + "<link href=’/test01/css.css' text='text/css' rel='stylesheet' />"  
111.          + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />" + "</head>"  
112.          + "<body>");  
113.      parser.setEncoding(parser.getEncoding());  
114.  
115.      for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {  
116.        Node node = e.nextNode();  
117.        logger.fatal("testLinkCSS()" + node.getText() + node.getClass());  
118.  
119.      }  
120.    } catch (Exception e) {  
121.      e.printStackTrace();  
122.    }  
123.  }  
124.  
125.  /* 
126.   * 测试OrFilter的用法 
127.   */  
128.  public void testOrFilter() {  
129.    NodeFilter inputFilter = new NodeClassFilter(InputTag.class);  
130.    NodeFilter selectFilter = new NodeClassFilter(SelectTag.class);  
131.  
132.    NodeList nodeList = null;  
133.  
134.    try {  
135.      Parser parser = new Parser();  
136.      parser  
137.          .setInputHTML("<head><title>OrFilter Test</title>"  
138.              + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />"  
139.              + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />"  
140.              + "</head>"  
141.              + "<body>"  
142.              + "<input type='text' value='text1′ name='text1′/>"  
143.              + "<input type='text' value='text2′ name='text2′/>"  
144.              + "<select><option id='1′>1</option><option id='2′>2</option><option id='3′></option></select>"  
145.              + "<a href='http://www.yeeach.com'>yeeach.com</a>" + "</body>");  
146.  
147.      parser.setEncoding(parser.getEncoding());  
148.      OrFilter lastFilter = new OrFilter();  
149.      lastFilter.setPredicates(new NodeFilter[] { selectFilter, inputFilter });  
150.      nodeList = parser.parse(lastFilter);  
151.      for (int i = 0; i <= nodeList.size(); i++) {  
152.        if (nodeList.elementAt(i) instanceof InputTag) {  
153.          InputTag tag = (InputTag) nodeList.elementAt(i);  
154.          logger.fatal("OrFilter tag name is :" + tag.getTagName() + " ,tag value is:"  
155.              + tag.getAttribute("value"));  
156.        }  
157.        if (nodeList.elementAt(i) instanceof SelectTag) {  
158.          SelectTag tag = (SelectTag) nodeList.elementAt(i);  
159.          NodeList list = tag.getChildren();  
160.  
161.          for (int j = 0; j < list.size(); j++) {  
162.            OptionTag option = (OptionTag) list.elementAt(j);  
163.            logger.fatal("OrFilter Option" + option.getOptionText());  
164.          }  
165.  
166.        }  
167.      }  
168.  
169.    } catch (ParserException e) {  
170.      e.printStackTrace();  
171.    }  
172.  }  
173.  
174.  /* 
175.   * 测试对<table><tr><td></td></tr></table>的解析 
176.   */  
177.  public void testTable() {  
178.    Parser myParser;  
179.    NodeList nodeList = null;  
180.    myParser = Parser.createParser("<body> " + "<table id='table1′ >"  
181.        + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"  
182.        + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"  
183.        + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2′ >"  
184.        + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"  
185.        + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"  
186.        + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>", "GBK");  
187.    NodeFilter tableFilter = new NodeClassFilter(TableTag.class);  
188.    OrFilter lastFilter = new OrFilter();  
189.    lastFilter.setPredicates(new NodeFilter[] { tableFilter });  
190.    try {  
191.      nodeList = myParser.parse(lastFilter);  
192.      for (int i = 0; i <= nodeList.size(); i++) {  
193.        if (nodeList.elementAt(i) instanceof TableTag) {  
194.          TableTag tag = (TableTag) nodeList.elementAt(i);  
195.          TableRow[] rows = tag.getRows();  
196.  
197.          for (int j = 0; j < rows.length; j++) {  
198.            TableRow tr = (TableRow) rows[j];  
199.            TableColumn[] td = tr.getColumns();  
200.            for (int k = 0; k < td.length; k++) {  
201.              logger.fatal("<td>" + td[k].toPlainTextString());  
202.            }  
203.  
204.          }  
205.  
206.        }  
207.      }  
208.  
209.    } catch (ParserException e) {  
210.      e.printStackTrace();  
211.    }  
212.  }  
213.  
214.  /* 
215.   * 测试NodeVisitor的用法,遍历所有节点 
216.   */  
217.  public void testVisitorAll() {  
218.    try {  
219.      Parser parser = new Parser();  
220.      parser.setURL("http://www.google.com");  
221.      parser.setEncoding(parser.getEncoding());  
222.      NodeVisitor visitor = new NodeVisitor() {  
223.        public void visitTag(Tag tag) {  
224.          logger.fatal("testVisitorAll()  Tag name is :" + tag.getTagName() + " \n Class is :"  
225.              + tag.getClass());  
226.        }  
227.  
228.      };  
229.  
230.      parser.visitAllNodesWith(visitor);  
231.    } catch (ParserException e) {  
232.      e.printStackTrace();  
233.    }  
234.  }  
235.  
236.  /* 
237.   * 测试对指定Tag的NodeVisitor的用法 
238.   */  
239.  public void testTagVisitor() {  
240.    try {  
241.  
242.      Parser parser = new Parser("<head><title>dddd</title>"  
243.          + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />"  
244.          + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />" + "</head>"  
245.          + "<body>" + "<a href='http://www.yeeach.com'>yeeach.com</a>" + "</body>");  
246.      NodeVisitor visitor = new NodeVisitor() {  
247.        public void visitTag(Tag tag) {  
248.          if (tag instanceof HeadTag) {  
249.            logger.fatal("visitTag() HeadTag : Tag name is :" + tag.getTagName()  
250.                + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText());  
251.          } else if (tag instanceof TitleTag) {  
252.            logger.fatal("visitTag() TitleTag : Tag name is :" + tag.getTagName()  
253.                + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText());  
254.  
255.          } else if (tag instanceof LinkTag) {  
256.            logger.fatal("visitTag() LinkTag : Tag name is :" + tag.getTagName()  
257.                + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText()  
258.                + " \n getAttribute is :" + tag.getAttribute("href"));  
259.          } else {  
260.            logger.fatal("visitTag() : Tag name is :" + tag.getTagName() + " \n Class is :"  
261.                + tag.getClass() + "\n Text is :" + tag.getText());  
262.          }  
263.  
264.        }  
265.  
266.      };  
267.  
268.      parser.visitAllNodesWith(visitor);  
269.    } catch (Exception e) {  
270.      e.printStackTrace();  
271.    }  
272.  }  
273.  
274.  /* 
275.   * 测试HtmlPage的用法 
276.   */  
277.  public void testHtmlPage() {  
278.    String inputHTML = "<html>" + "<head>"  
279.        + "<title>Welcome to the HTMLParser website</title>" + "</head>" + "<body>"  
280.        + "Welcome to HTMLParser" + "<table id='table1′ >"  
281.        + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"  
282.        + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"  
283.        + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2′ >"  
284.        + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"  
285.        + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"  
286.        + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>" + "</html>";  
287.    Parser parser = new Parser();  
288.    try {  
289.  
290.      parser.setInputHTML(inputHTML);  
291.      parser.setEncoding(parser.getURL());  
292.      HtmlPage page = new HtmlPage(parser);  
293.      parser.visitAllNodesWith(page);  
294.      logger.fatal("testHtmlPage -title is :" + page.getTitle());  
295.      NodeList list = page.getBody();  
296.  
297.      for (NodeIterator iterator = list.elements(); iterator.hasMoreNodes();) {  
298.        Node node = iterator.nextNode();  
299.        logger.fatal("testHtmlPage -node  is :" + node.toHtml());  
300.      }  
301.  
302.    } catch (ParserException e) {  
303.      // TODO Auto-generated catch block  
304.      e.printStackTrace();  
305.    }  
306.  }  
307.  
308.  /* 
309.   * 测试LinkBean的用法 
310.   */  
311.  public void testLinkBean() {  
312.    Parser parser = new Parser();  
313.  
314.    LinkBean linkBean = new LinkBean();  
315.    linkBean.setURL("http://www.google.com");  
316.    URL[] urls = linkBean.getLinks();  
317.  
318.    for (int i = 0; i < urls.length; i++) {  
319.      URL url = urls[i];  
320.      logger.fatal("testLinkBean() -url  is :" + url);  
321.    }  
322.  
323.  }  
324.  
325.}  

你可能感兴趣的:(htmlparser使用例子(全) 转载)