将HTML转换成纯文本

import java.util.HashSet;
import java.util.Set;

import javax.swing.text.html.HTMLEditorKit;

import org.apache.commons.lang.StringEscapeUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.richart.Constants;
import com.richart.util.StringUtils;

public class HtmlToText extends HTMLEditorKit.ParserCallback {

	/** Class-wide SLF4J logger; declared final so it cannot be reassigned. */
	private static final Logger logger = LoggerFactory.getLogger(HtmlToText.class);

	/**
	 * Extracts the plain text contained in an HTML string.
	 *
	 * <p>The input is first passed through
	 * {@code StringEscapeUtils.unescapeHtml}, then parsed. The plain text of
	 * every childless node is trimmed, and each distinct non-blank piece is
	 * appended once, followed by {@code <br />}. Finally any run of carriage
	 * returns, line feeds or tabs left inside the collected text is replaced
	 * by a single {@code <br />}.
	 *
	 * @param inputHtml the HTML to convert; may be {@code null}
	 * @return the extracted text with pieces separated by {@code <br />};
	 *         an empty string when {@code inputHtml} is {@code null} or no
	 *         text was found. If parsing fails the error is logged and
	 *         whatever was collected so far is returned.
	 */
	public static String extractText(String inputHtml) {
		if (inputHtml == null) {
			return "";
		}

		StringBuilder text = new StringBuilder();
		// Pieces already emitted; used to drop exact repeats.
		Set<String> seen = new HashSet<String>();
		try {
			// Entities are decoded before parsing, so entity-escaped markup
			// (e.g. "&lt;p&gt;") is parsed as real tags.
			String htmlStr = StringEscapeUtils.unescapeHtml(inputHtml);
			Parser parser = Parser.createParser(htmlStr, Constants.CODING_UTF_8);

			// Accept only nodes without children. Accepting every node also
			// returns container tags, whose plain text is expected to repeat
			// the text of their descendants (per the htmlparser API), which
			// duplicated content in the output.
			NodeList nodes = parser.extractAllNodesThatMatch(new NodeFilter() {
				private static final long serialVersionUID = 1L;

				public boolean accept(Node node) {
					NodeList children = node.getChildren();
					return children == null || children.size() == 0;
				}
			});

			for (int i = 0; i < nodes.size(); i++) {
				String nodeText = nodes.elementAt(i).toPlainTextString();
				if (nodeText == null) {
					continue;
				}
				// Use the String as-is: re-encoding to UTF-8 bytes and decoding
				// with the platform default charset garbles non-ASCII text.
				nodeText = nodeText.trim();
				if (StringUtils.isValidateString(nodeText) && seen.add(nodeText)) {
					text.append(nodeText).append("<br />");
				}
			}
		} catch (Exception e) {
			// Best effort: log and fall through with the text gathered so far.
			logger.error("Failed to extract text from HTML", e);
		}
		// Plain character class: the previous pattern also matched a literal '+'.
		return text.toString().replaceAll("[\r\n\t]+", "<br />");
	}

 未处理的问题:特殊字符乱码问题,内容重复

你可能感兴趣的:(html)