曾经用HTMLParser过滤HTML, 但发现HTMLParser有时候对不规范的HTML解析不了, 并且不支持XPath,
后来在Web-Harvest开源爬虫网站找到了HtmlCleaner, 它能够帮助我们将HTML文档转化为结构化的XML文档。虽然目前已经有了类似功能的工具, 但是HtmlCleaner能够完成几乎所有的HTML转换, 而且不到30k, 这是它值得称道的地方。
1.HtmlCleaner的文档对象模型现在拥有了一些函数,处理节点和属性,所以现在在序列化之前搜索或者编辑是非常容易的。
2.提供基本 HtmlCleaner DOM的XPath支持
3. 解析后编程轻量级文档对象,能够很容易的被转换到DOM或者JDom标准文档,或者通过各种方式(压缩,打印)连续输出XML。
转换完成后, 能用JDOM或dom4j对文档进行处理
package com.citgee.webclip;

import org.htmlcleaner.*;

import java.net.*;
import java.io.*;
import java.util.*;

import org.jdom.*;
//import org.jdom.output.*;
import org.jdom.contrib.helpers.XPathHelper;
import org.jdom.filter.Filter;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;

/**
 * Utility methods for fetching a web page, cleaning its (possibly malformed)
 * HTML with HtmlCleaner, and navigating the resulting JDOM document.
 */
public class WebClipUtils {

    /**
     * Downloads the page at {@code url}, cleans the HTML with HtmlCleaner,
     * and returns it as a JDOM {@link Document}.
     *
     * @param url     the page URL to fetch
     * @param charset the character set used to decode the page content
     * @return a JDOM document built from the cleaned HTML
     * @throws MalformedURLException if {@code url} is not a valid URL
     * @throws IOException           if the page cannot be read
     */
    public static Document getDocumentByURL(String url, String charset)
            throws MalformedURLException, IOException {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        CleanerProperties props = htmlCleaner.getProperties();
        TagNode node = htmlCleaner.clean(new URL(url), charset);
        JDomSerializer jdomSerializer = new JDomSerializer(props, true);
        return jdomSerializer.createJDom(node);
    }

    /**
     * Returns every element in {@code doc} whose name equals {@code tagName},
     * collected depth-first in document order.
     */
    public static List<Element> getElementsByTagName(Document doc, String tagName) {
        List<Element> eleList = new ArrayList<Element>();
        buildList(doc.getRootElement(), tagName, eleList);
        return eleList;
    }

    /** Recursive depth-first collection of matching elements into {@code eleList}. */
    private static void buildList(Element rootEle, String tagName, List<Element> eleList) {
        if (rootEle.getName().equals(tagName)) {
            eleList.add(rootEle);
        }
        // JDOM 1.x getChildren() returns a raw List; bound it and cast each child.
        List<?> children = rootEle.getChildren();
        for (Iterator<?> iter = children.iterator(); iter.hasNext();) {
            Element ele = (Element) iter.next();
            buildList(ele, tagName, eleList);
        }
    }

    /** Pretty-prints {@code ele} to stdout using GB2312 encoding. */
    public static void printElement(Element ele) throws IOException {
        XMLOutputter outputer = new XMLOutputter();
        Format format = outputer.getFormat();
        format.setEncoding("GB2312");
        outputer.setFormat(format);
        outputer.output(ele, System.out);
    }

    /** Demo: fetch a page and dump every {@code <div>} element with its XPath. */
    public static void main(String[] args) throws Exception {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        CleanerProperties props = htmlCleaner.getProperties();
        // TagNode node = htmlCleaner.clean(new URL("http://www.baidu.com"));
        TagNode node = htmlCleaner.clean(new URL("http://www.huanqiu.com"), "UTF-8");
        // Alternative: serialize directly to XML text instead of a JDOM tree.
        // XmlSerializer xmlSerializer = new PrettyXmlSerializer(props);
        // StringWriter writer = new StringWriter();
        // xmlSerializer.writeXml(node, writer, "GB2312");
        // System.out.println(writer.toString()); // was live code referencing the
        //                                        // commented-out 'writer' -> compile error
        JDomSerializer jdomSerializer = new JDomSerializer(props, true);
        Document doc = jdomSerializer.createJDom(node);
        Element rootEle = doc.getRootElement();
        System.out.println(XPathHelper.getPathString(rootEle));
        List<Element> divs = getElementsByTagName(doc, "div");
        System.out.println(divs.size());
        for (Element ele : divs) {
            System.out.println();
            System.out.println("*****************************************");
            System.out.println(XPathHelper.getPathString(ele));
            System.out.println("*****************************************");
            printElement(ele);
        }
    }
}

/**
 * Cleans an HTML file on disk and writes it out as pretty-printed XML.
 * Package-private: Java allows only one public top-level class per file,
 * and this file's public class is {@link WebClipUtils}.
 */
class HtmlClean {

    /**
     * Reads the HTML file at {@code htmlurl}, cleans it, and writes the
     * resulting XML to {@code xmlurl}, printing elapsed times to stdout.
     *
     * @param htmlurl path of the HTML input file
     * @param xmlurl  path of the XML output file
     */
    public void cleanHtml(String htmlurl, String xmlurl) {
        try {
            long start = System.currentTimeMillis();

            HtmlCleaner cleaner = new HtmlCleaner();
            CleanerProperties props = cleaner.getProperties();
            props.setUseCdataForScriptAndStyle(true);
            props.setRecognizeUnicodeChars(true);
            props.setUseEmptyElementTags(true);
            props.setAdvancedXmlEscape(true);
            props.setTranslateSpecialEntities(true);
            props.setBooleanAttributeValues("empty");

            TagNode node = cleaner.clean(new File(htmlurl));
            System.out.println("vreme:" + (System.currentTimeMillis() - start));

            new PrettyXmlSerializer(props).writeXmlToFile(node, xmlurl);
            System.out.println("vreme:" + (System.currentTimeMillis() - start));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}