HTML 标签闭合性检测

用于校验 HTML 标签是否合法、是否闭合，使用的是 HtmlParser 开源包。

package com.lhb.client.util;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;

import org.apache.commons.lang.StringUtils;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.CompositeTag;
import org.htmlparser.util.NodeIterator;


/**
 * Checks whether the tags in an HTML string are well formed and closed,
 * using the HtmlParser library to walk the node tree.
 *
 * <p>Typical use: {@code new ValidateHTML(html).validate()}.
 */
public class ValidateHTML {

	/** Charset name used for parsing the content and for decoding the sample resource. */
	private static final String CHARSET = "GBK";

	/** The raw HTML text; end tags found by the parser are compared against it. */
	private final String content;
	private final Parser parser;

	/**
	 * Creates a validator for the given HTML text.
	 *
	 * @param content the HTML text to validate
	 */
	public ValidateHTML(String content) {
		this.content = content;
		this.parser = Parser.createParser(content, CHARSET);
	}

	/**
	 * Loads the sample resource {@code content.txt} (resolved relative to this
	 * class) and decodes it as GBK. Used only by {@link #main(String[])}.
	 *
	 * @return the decoded text, or an empty string if the resource is missing
	 *         or cannot be read
	 */
	private static String getContent() {
		InputStream in = ValidateHTML.class.getResourceAsStream("content.txt");
		if (in == null) {
			// getResourceAsStream signals a missing resource with null, not an exception.
			System.err.println("Resource content.txt not found");
			return "";
		}

		try {
			// Read until EOF: available() is only an estimate and a single
			// read() call may return fewer bytes than requested.
			ByteArrayOutputStream buffer = new ByteArrayOutputStream();
			byte[] chunk = new byte[4096];
			int count;
			while ((count = in.read(chunk)) != -1) {
				buffer.write(chunk, 0, count);
			}
			return buffer.toString(CHARSET);
		} catch (IOException e) {
			// Also covers UnsupportedEncodingException, which is an IOException.
			e.printStackTrace();
			return "";
		} finally {
			try {
				in.close();
			} catch (IOException ignored) {
				// Nothing useful can be done if closing the stream fails.
			}
		}
	}

	/**
	 * Walks the nodes supplied by the iterator, descending into children, and
	 * throws as soon as a malformed tag or comment is found.
	 *
	 * @param iterator the nodes to check
	 * @throws Exception if a tag is incomplete, an unmatched end tag is found,
	 *                   a tag is not closed, a comment is not terminated, or
	 *                   the parser itself fails
	 */
	private void recursive(NodeIterator iterator) throws Exception {
		while (iterator.hasMoreNodes()) {
			Node node = iterator.nextNode();

			if (node instanceof TagNode) {
				TagNode tagNode = (TagNode) node;

				if (!isClosed(tagNode)) {
					throw new Exception("发现不完整的错误标签");
				}

				// An end tag showing up as a node of its own is treated as surplus.
				if (moreTag(tagNode)) {
					throw new Exception("发现多余的结束标签");
				}

				TagNode endTagNode = (TagNode) tagNode.getEndTag();
				if (endTagNode == null) {
					// No end tag recorded for this tag; skip it (and its children).
					continue;
				}

				if (isIgnored(endTagNode)) {
					throw new Exception("发现没有闭合的标签");
				}
			} else if (node instanceof RemarkNode) {
				RemarkNode remarkNode = (RemarkNode) node;
				if (!remarkNode.toHtml().endsWith("-->")) {
					throw new Exception("发现没有闭合的注释标签");
				}
			}

			if (node.getChildren() == null) {
				continue;
			}
			recursive(node.getChildren().elements());
		}
	}

	/**
	 * Tells whether the given end tag is absent from the original text, i.e.
	 * the text at the tag's start offset is not literally {@code </name>}
	 * (compared case-insensitively).
	 *
	 * <p>NOTE(review): presumably the parser supplies an end tag that does not
	 * exist in the source when a tag is left open; confirm against HtmlParser.
	 *
	 * @param tagNode the end tag reported by the parser
	 * @return {@code true} if the original text does not contain that end tag
	 *         at the reported position
	 */
	private boolean isIgnored(TagNode tagNode) {
		String tagName = tagNode.getTagName();
		int position = tagNode.getTagBegin();
		// "</" + name + ">" is three characters longer than the name.
		int length = tagName.length() + 3;
		String subString = StringUtils.substring(content, position, position + length);
		if (subString == null) {
			return true;
		}

		return !subString.equalsIgnoreCase("</" + tagName + ">");
	}

	/**
	 * Tells whether the node is itself an end tag.
	 *
	 * @param tagNode the tag to inspect
	 * @return {@code true} if the tag's HTML starts with {@code </}
	 */
	private boolean moreTag(TagNode tagNode) {
		return tagNode.toHtml().startsWith("</");
	}

	/**
	 * Detects incomplete or broken tags, for example {@code <script </script>}.
	 *
	 * @param tagNode the tag to inspect
	 * @return {@code true} if the tag's own text (from its begin offset to its
	 *         end offset) ends with {@code >}
	 */
	private boolean isClosed(TagNode tagNode) {
		String html = tagNode.toHtml();
		int length = tagNode.getTagEnd() - tagNode.getTagBegin();
		// Keep only the tag itself; toHtml() may also contain what follows it.
		String tag = StringUtils.substring(html, 0, length);

		return tag.endsWith(">");
	}

	/**
	 * Validates the HTML passed to the constructor. May be called repeatedly.
	 *
	 * @return {@code true} if no problem was found; {@code false} if any check
	 *         failed or the parser raised an error
	 */
	public boolean validate() {
		try {
			// Rewind first: elements() continues from the parser's current
			// position, so a second call would otherwise see no nodes and
			// wrongly report success.
			parser.reset();
			recursive(parser.elements());
			return true;
		} catch (Exception e) {
			// Any failure, including parser errors, means "not valid".
			return false;
		}
	}

	/**
	 * Demo entry point: validates the bundled {@code content.txt} and prints
	 * the result.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		String c = getContent();
		ValidateHTML app = new ValidateHTML(c);
		boolean result = app.validate();
		System.out.println(result);
	}
}


你可能感兴趣的:(apache,html,C++,c,C#)