遍历html,删除节点信息demo
//author: thrillerzw public class DomUtils { public static Document parse(String str) { InputSource input = new InputSource(new StringReader(str)); DOMParser parser = new DOMParser(); try { // parser.setFeature("http://cyberneko.org/html/features/override-namespaces", // false); parser.setFeature( "http://cyberneko.org/html/features/balance-tags/ignore-outside-content", true); parser.setFeature( "http://cyberneko.org/html/features/balance-tags/document-fragment", false); parser.setFeature( "http://cyberneko.org/html/features/scanner/script/strip-comment-delims", true); parser.setFeature( "http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true); parser.setFeature( "http://cyberneko.org/html/features/scanner/style/strip-comment-delims", true); parser.setFeature( "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims", true); parser.setFeature( "http://cyberneko.org/html/features/scanner/notify-builtin-refs", true); parser.setFeature( "http://apache.org/xml/features/scanner/notify-char-refs", true); parser.setFeature( "http://apache.org/xml/features/scanner/notify-builtin-refs", true); } catch (SAXNotRecognizedException e1) { e1.printStackTrace(); } catch (SAXNotSupportedException e1) { e1.printStackTrace(); } try { // 设置网页的默认编码 parser.setProperty( "http://cyberneko.org/html/properties/default-encoding", "utf-8"); // parser.parse(input); parser.parse(str); Document d = parser.getDocument(); return d; } catch (SAXException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return null; } //<base target="_blank"> public static String[] delNodes = { "SCRIPT", "STYLE", "OBJECT", "EMBED", "INPUT","SELECT","IFRAME","LINK","BASE","TITLE"}; public static String[] clearAttrsNodes = { "BODY","HTML","STRONG", "EM", "OL", "UL", "DL","DT","DD","LI","SUB","SUP","BR","SPAN","H1","H2" }; public static String[] transPNodes = { "CAPTION","DIV"}; public static String[] transH2Nodes = { "H3","H4","H5","H6",}; // img: {$:{width:1,height:1,src:1,style:['float','width','height']}} public static String[] imgAttrs = {"src","style"}; private static boolean isDelNode(String nodeName){ List<String> list=Arrays.asList(delNodes); boolean res=list.contains(nodeName); return res; } private static boolean isClearAttrsNode(String nodeName){ List<String> list=Arrays.asList(clearAttrsNodes); boolean res=list.contains(nodeName); return res; } private static boolean isTransPNodes(String nodeName){ List<String> list=Arrays.asList(transPNodes); boolean res=list.contains(nodeName); return res; } private static boolean isTransH2Nodes(String nodeName){ List<String> list=Arrays.asList(transH2Nodes); boolean res=list.contains(nodeName); return res; } private static boolean isImgAttrs(String nodeName){ List<String> list=Arrays.asList(imgAttrs); boolean res=list.contains(nodeName.toLowerCase()); return res; } private static void toHTML(StringBuilder sb, Node node,String path) { int type = node.getNodeType(); switch (type) { case Node.DOCUMENT_NODE: { NodeList children = node.getChildNodes(); if (children != null) { int len = children.getLength(); for (int i = 0; i < len; i++) toHTML(sb, children.item(i),path); } break; } // print element with attributes case Node.ELEMENT_NODE: { //节点名字都是大写 String nodeName=node.getNodeName(); if(isDelNode(nodeName)||(!nodeName.startsWith("P:")&&nodeName.contains(":"))){ break; } sb.append("<"); if(isTransPNodes(nodeName)||nodeName.startsWith("P:")){ nodeName="P"; }else if("B".equals(nodeName)){ nodeName="STRONG"; } else if(isTransH2Nodes(nodeName)){ nodeName="H2"; } else if("H2".equals(nodeName)){ nodeName="H1"; }else if("NOBR".equals(nodeName)){ nodeName="SPAN"; } sb.append(nodeName); if (!isClearAttrsNode(nodeName)) { NamedNodeMap attrs = node.getAttributes(); for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); String key = attr.getNodeName(); String value = attr.getNodeValue(); //a : clearStyle if("A".equals(nodeName)&&"STYLE".equalsIgnoreCase(key)){ continue; } //p: {$:{id:1,style:['text-align']}}, if("P".equals(nodeName)&&(!"id".equalsIgnoreCase(key)||!"style".equalsIgnoreCase(key))){ continue; } if("P".equals(nodeName)&&"style".equalsIgnoreCase(key)){ int p1=value.indexOf("text-align"); if(p1!=-1){ int p2=value.indexOf(";", p1); value=value.substring(p1, p2); } } //img: {$:{width:1,height:1,src:1,style:['float','width','height']}} if("IMG".equals(nodeName)&&!isImgAttrs(key)){ continue; } if("IMG".equals(nodeName)&&"src".equalsIgnoreCase(key)){ value=path+value; } if("IMG".equals(nodeName)&&"style".equalsIgnoreCase(key)){ String[] cssArr={"float","width","height"}; StringBuffer cssSb=new StringBuffer(); for(int j=0;j<cssArr.length;j++){ int p1=value.indexOf(cssArr[j]); if(p1!=-1){ int p2=value.indexOf(";", p1); if(p2==-1){ p2=value.length(); } cssSb.append(value.substring(p1, p2)).append(";"); } } value=cssSb.toString(); } sb.append(" " + key + "=\"" + value + "\""); } } if (!node.hasChildNodes()) { if (!sb.toString().trim().endsWith("/>")) { sb.append("/>"); } return; } sb.append(">"); NodeList children = node.getChildNodes(); if (children != null) { int len = children.getLength(); for (int i = 0; i < len; i++) toHTML(sb, children.item(i),path); } break; } // handle entity reference nodes case Node.ENTITY_REFERENCE_NODE: { sb.append("&").append(node.getNodeName()).append(";"); break; } // print cdata sections /* case Node.CDATA_SECTION_NODE: { sb.append("<![CDATA[").append(node.getNodeValue()).append("]]>"); break; }*/ // print text case Node.TEXT_NODE: { String value=node.getNodeValue(); String name=node.getNodeName(); if(value.trim().startsWith("if(navigator")){ value=value.replace("if(navigator.userAgent.indexOf('MSIE')<0) {", "").replace("}", ""); } sb.append(value); break; } /* case Node.COMMENT_NODE: { String value=node.getNodeValue(); if(value.startsWith("[if !mso]>")){ sb.append("<!--").append(value).append("-->"); } break; }*/ } if (type == Node.ELEMENT_NODE) { String nodeName=node.getNodeName(); if (!isDelNode(nodeName)&&!(!nodeName.startsWith("P:")&&nodeName.contains(":"))) { sb.append("</"); if(isTransPNodes(nodeName)||nodeName.startsWith("P:")){ nodeName="P"; }else if("B".equals(nodeName)){ nodeName="STRONG"; } else if(isTransH2Nodes(nodeName)){ nodeName="H2"; } else if("H2".equals(nodeName)){ nodeName="H1"; }else if("NOBR".equals(nodeName)){ nodeName="SPAN"; } sb.append(nodeName); sb.append(">"); } } } /** * prase node to HTML * * @param node * @return html string */ public static String toHTML(Node node,String path) { StringBuilder sb = new StringBuilder(); toHTML(sb, node,path); return sb.toString(); } public static void main(String[] args) throws IOException { // 解析为dom节点 Node node = DomUtils.parse("http://www.baidu.com"); // Node node = DomTest.parse("D:\\tmp\\11pptx.files\\slide1.htm"); //Node node = DomTest.parse("D:\\tmp\11pptx.files\\slide1.htm"); System.out.println(DomUtils.toHTML(node,"")); } }