NekoHTML

遍历html,删除节点信息demo

//author: thrillerzw
/**
 * Parses an HTML document with NekoHTML and re-serialises the resulting DOM
 * as a simplified HTML string.
 *
 * <p>During serialisation some elements are dropped together with their
 * subtree ({@link #delNodes}), some lose all of their attributes
 * ({@link #clearAttrsNodes}), some are renamed ({@link #transPNodes},
 * {@link #transH2Nodes}, plus a few fixed renames) and the attributes of
 * {@code A}, {@code P} and {@code IMG} are filtered.
 *
 * <p>Element names are compared in upper case, which is how the parser
 * reports them in this setup (note from the original author).
 */
public class DomUtils {

    // e.g. <base target="_blank">
    /** Elements that are removed from the output, including everything inside them. */
    public static String[] delNodes = { "SCRIPT", "STYLE", "OBJECT", "EMBED", "INPUT","SELECT","IFRAME","LINK","BASE","TITLE"};

    /** Elements (after renaming) that are written without any attributes. */
    public static String[] clearAttrsNodes = { "BODY","HTML","STRONG", "EM", "OL", "UL", "DL","DT","DD","LI","SUB","SUP","BR","SPAN","H1","H2" };

    /** Elements that are written as {@code P}. */
    public static String[] transPNodes = { "CAPTION","DIV"};

    /** Elements that are written as {@code H2}. */
    public static String[] transH2Nodes = { "H3","H4","H5","H6",};

    // Filter rule this mirrors: img: {$:{width:1,height:1,src:1,style:['float','width','height']}}
    // NOTE(review): the rule also lists width/height attributes, but only src/style are
    // allowed here -- confirm whether that is intentional.
    /** Attribute names (lower case) that are kept on {@code IMG} elements. */
    public static String[] imgAttrs = {"src","style"};

    /** CSS properties kept inside the {@code style} attribute of a {@code P}. */
    private static final String[] P_STYLE_PROPS = { "text-align" };

    /** CSS properties kept inside the {@code style} attribute of an {@code IMG}. */
    private static final String[] IMG_STYLE_PROPS = { "float", "width", "height" };

    /** HTML void elements: the only ones that may be written as {@code <X/>}. */
    private static final String[] VOID_ELEMENTS = { "AREA", "BASE", "BR", "COL", "EMBED", "HR", "IMG",
            "INPUT", "LINK", "META", "PARAM", "SOURCE", "TRACK", "WBR" };

    /**
     * Parses the document identified by {@code str}.
     *
     * <p>{@code str} is handed to {@code DOMParser.parse(String)}, i.e. it is a
     * system identifier (URL / file location), not HTML markup.
     *
     * @param str location of the document to parse
     * @return the parsed document, or {@code null} if parsing failed (the
     *         exception is printed to standard error)
     */
    public static Document parse(String str) {
        DOMParser parser = new DOMParser();

        // Each feature is applied on its own so that one feature the parser
        // rejects does not silently prevent the remaining ones from being set.
        setFeatureQuietly(parser,
                "http://cyberneko.org/html/features/balance-tags/ignore-outside-content", true);
        setFeatureQuietly(parser,
                "http://cyberneko.org/html/features/balance-tags/document-fragment", false);
        setFeatureQuietly(parser,
                "http://cyberneko.org/html/features/scanner/script/strip-comment-delims", true);
        setFeatureQuietly(parser,
                "http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true);
        setFeatureQuietly(parser,
                "http://cyberneko.org/html/features/scanner/style/strip-comment-delims", true);
        setFeatureQuietly(parser,
                "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims", true);
        setFeatureQuietly(parser,
                "http://cyberneko.org/html/features/scanner/notify-builtin-refs", true);
        setFeatureQuietly(parser,
                "http://apache.org/xml/features/scanner/notify-char-refs", true);
        setFeatureQuietly(parser,
                "http://apache.org/xml/features/scanner/notify-builtin-refs", true);

        try {
            // Default encoding of the page.
            parser.setProperty(
                    "http://cyberneko.org/html/properties/default-encoding",
                    "utf-8");
            parser.parse(str);
            return parser.getDocument();
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Callers must be prepared for null: it signals a failed parse.
        return null;
    }

    /** Sets one parser feature; a rejected feature is reported and otherwise ignored. */
    private static void setFeatureQuietly(DOMParser parser, String feature, boolean value) {
        try {
            parser.setFeature(feature, value);
        } catch (SAXNotRecognizedException e) {
            e.printStackTrace();
        } catch (SAXNotSupportedException e) {
            e.printStackTrace();
        }
    }

    /** Exact-match lookup; reads the array on every call because the public arrays are mutable. */
    private static boolean contains(String[] names, String name) {
        List<String> list = Arrays.asList(names);
        return list.contains(name);
    }

    /** Case-insensitive lookup of {@code name} in {@code names}. */
    private static boolean containsIgnoreCase(String[] names, String name) {
        for (int i = 0; i < names.length; i++) {
            if (names[i].equalsIgnoreCase(name)) {
                return true;
            }
        }
        return false;
    }

    private static boolean isDelNode(String nodeName) {
        return contains(delNodes, nodeName);
    }

    private static boolean isClearAttrsNode(String nodeName) {
        return contains(clearAttrsNodes, nodeName);
    }

    private static boolean isTransPNodes(String nodeName) {
        return contains(transPNodes, nodeName);
    }

    private static boolean isTransH2Nodes(String nodeName) {
        return contains(transH2Nodes, nodeName);
    }

    private static boolean isImgAttrs(String attrName) {
        return containsIgnoreCase(imgAttrs, attrName);
    }

    /**
     * Tells whether an element is left out of the output entirely: either it is
     * listed in {@link #delNodes}, or its name carries a prefix other than
     * {@code P:} (any name containing ':' that does not start with "P:").
     */
    private static boolean isSkippedElement(String nodeName) {
        if (isDelNode(nodeName)) {
            return true;
        }
        return nodeName.contains(":") && !nodeName.startsWith("P:");
    }

    /**
     * Maps a source element name to the name written to the output.
     * The order of the checks matters: H3-H6 become H2 while a source H2
     * becomes H1.
     */
    private static String mapTagName(String nodeName) {
        if (isTransPNodes(nodeName) || nodeName.startsWith("P:")) {
            return "P";
        }
        if ("B".equals(nodeName)) {
            return "STRONG";
        }
        if (isTransH2Nodes(nodeName)) {
            return "H2";
        }
        if ("H2".equals(nodeName)) {
            return "H1";
        }
        if ("NOBR".equals(nodeName)) {
            return "SPAN";
        }
        return nodeName;
    }

    /**
     * Keeps only the declarations of {@code style} whose property name is in
     * {@code allowed}. Declarations are matched on the property name itself,
     * so e.g. {@code max-width} is not mistaken for {@code width}, and a
     * missing trailing ';' is fine.
     *
     * @return the kept declarations, each terminated by ';' (possibly empty)
     */
    private static String filterStyle(String style, String[] allowed) {
        if (style == null) {
            return "";
        }
        StringBuilder kept = new StringBuilder();
        String[] declarations = style.split(";");
        for (int i = 0; i < declarations.length; i++) {
            String declaration = declarations[i].trim();
            int colon = declaration.indexOf(':');
            if (colon <= 0) {
                continue; // empty or malformed declaration
            }
            String property = declaration.substring(0, colon).trim();
            if (containsIgnoreCase(allowed, property)) {
                kept.append(declaration).append(";");
            }
        }
        return kept.toString();
    }

    /** Escapes the characters that would break a double-quoted attribute value. */
    private static String escapeAttr(String value) {
        if (value == null) {
            return "";
        }
        return value.replace("&", "&amp;").replace("\"", "&quot;").replace("<", "&lt;");
    }

    /** Serialises every child of {@code parent} in document order. */
    private static void appendChildren(StringBuilder sb, Node parent, String path) {
        NodeList children = parent.getChildNodes();
        if (children == null) {
            return;
        }
        for (int i = 0, len = children.getLength(); i < len; i++) {
            toHTML(sb, children.item(i), path);
        }
    }

    /**
     * Writes the attributes of an element whose output name is {@code tag},
     * applying the per-element filters.
     */
    private static void appendAttributes(StringBuilder sb, String tag, NamedNodeMap attrs, String path) {
        if (attrs == null) {
            return;
        }
        for (int i = 0; i < attrs.getLength(); i++) {
            Node attr = attrs.item(i);
            String key = attr.getNodeName();
            String value = attr.getNodeValue();
            boolean isStyle = "style".equalsIgnoreCase(key);

            // a: style is removed
            if ("A".equals(tag) && isStyle) {
                continue;
            }

            // p: {$:{id:1,style:['text-align']}} -- only id and style survive,
            // and style is reduced to text-align.
            if ("P".equals(tag)) {
                if (!isStyle && !"id".equalsIgnoreCase(key)) {
                    continue;
                }
                if (isStyle) {
                    value = filterStyle(value, P_STYLE_PROPS);
                }
            }

            // img: only the attributes in imgAttrs survive; src is prefixed with
            // path and style is reduced to float/width/height.
            if ("IMG".equals(tag)) {
                if (!isImgAttrs(key)) {
                    continue;
                }
                if ("src".equalsIgnoreCase(key)) {
                    value = path + value;
                } else if (isStyle) {
                    value = filterStyle(value, IMG_STYLE_PROPS);
                }
            }

            sb.append(" ").append(key).append("=\"").append(escapeAttr(value)).append("\"");
        }
    }

    /** Writes one element (start tag, filtered attributes, children, end tag). */
    private static void appendElement(StringBuilder sb, Node node, String path) {
        String sourceName = node.getNodeName();
        if (isSkippedElement(sourceName)) {
            return;
        }

        String tag = mapTagName(sourceName);
        sb.append("<").append(tag);

        // The attribute-clearing list is checked against the output name,
        // so e.g. B (written as STRONG) loses its attributes too.
        if (!isClearAttrsNode(tag)) {
            appendAttributes(sb, tag, node.getAttributes(), path);
        }

        if (!node.hasChildNodes()) {
            if (containsIgnoreCase(VOID_ELEMENTS, tag)) {
                sb.append("/>");
            } else {
                // A self-closed non-void element (e.g. <SPAN/>) is read by HTML
                // parsers as an unclosed start tag, so write an explicit end tag.
                sb.append("></").append(tag).append(">");
            }
            return;
        }

        sb.append(">");
        appendChildren(sb, node, path);
        sb.append("</").append(tag).append(">");
    }

    /** Writes a text node, stripping one specific inline-script guard if present. */
    private static void appendText(StringBuilder sb, Node node) {
        String value = node.getNodeValue();
        // Text starting with this user-agent check has the guard line removed.
        // NOTE(review): this also removes every '}' in that text node.
        if (value.trim().startsWith("if(navigator")) {
            value = value.replace("if(navigator.userAgent.indexOf('MSIE')<0) {", "").replace("}", "");
        }
        sb.append(value);
    }

    /**
     * Appends the serialised form of {@code node} to {@code sb}.
     * Only document, element, entity-reference and text nodes produce output;
     * CDATA sections, comments and all other node types are not emitted.
     */
    private static void toHTML(StringBuilder sb, Node node, String path) {
        switch (node.getNodeType()) {
        case Node.DOCUMENT_NODE:
            appendChildren(sb, node, path);
            break;
        case Node.ELEMENT_NODE:
            appendElement(sb, node, path);
            break;
        case Node.ENTITY_REFERENCE_NODE:
            sb.append("&").append(node.getNodeName()).append(";");
            break;
        case Node.TEXT_NODE:
            appendText(sb, node);
            break;
        default:
            break;
        }
    }

    /**
     * Serialises a DOM node to a cleaned-up HTML string.
     *
     * @param node the node to serialise; {@code null} (e.g. the result of a
     *             failed {@link #parse(String)}) yields an empty string
     * @param path prefix prepended to the {@code src} of every {@code IMG};
     *             {@code null} is treated as an empty prefix
     * @return html string
     */
    public static String toHTML(Node node,String path) {
        StringBuilder sb = new StringBuilder();
        if (node != null) {
            toHTML(sb, node, path == null ? "" : path);
        }
        return sb.toString();
    }

    /**
     * Demo: parses a fixed URL and prints the cleaned HTML. (The original
     * author also ran it against local .htm files.)
     */
    public static void main(String[] args) throws IOException {

        // Parse into a DOM tree.
        Node node = DomUtils.parse("http://www.baidu.com");
        if (node == null) {
            System.err.println("Failed to parse document");
            return;
        }
        System.out.println(DomUtils.toHTML(node,""));

    }
}

 

你可能感兴趣的:(html)