Java 用正则表达式去除 HTML 标签

原文地址:https://www.aliyun.com/jiaocheng/297802.html

package com.xz.cxzy.utils; import java.util.regex.Matcher; import java.util.regex.Pattern; public class HtmlUtil { private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?</script>"; // 定义script的正则表达式 private …

package com.xz.cxzy.utils; 

import java.util.regex.Matcher; 

import java.util.regex.Pattern; 

/**
 * Regex-based helpers that reduce an HTML fragment to plain text.
 *
 * <p>This is a best-effort tag stripper, not an HTML parser or a sanitizer: it does not
 * decode character entities (other than dropping {@code &nbsp;} in
 * {@link #getTextFromHtml(String)}) and must not be relied on to make untrusted markup safe.
 */
public class HtmlUtil {

    /** Matches a whole {@code <script>} element, including its body. */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?</script\\s*>", Pattern.CASE_INSENSITIVE);

    /** Matches a whole {@code <style>} element, including its body. */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?</style\\s*>", Pattern.CASE_INSENSITIVE);

    /** Matches any single remaining tag such as {@code <p>} or {@code </div>}. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /** Matches runs of whitespace (spaces, tabs, carriage returns, line feeds). */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /**
     * Removes script elements, style elements, all remaining tags and all whitespace
     * from the given HTML.
     *
     * <p>Script and style elements are removed first, together with their bodies, so that
     * JavaScript or CSS source does not end up in the extracted text. Note that every
     * whitespace character is removed, including spaces between words.
     *
     * @param htmlStr the HTML fragment to strip; may be {@code null}
     * @return the text with tags and whitespace removed; an empty string when
     *         {@code htmlStr} is {@code null}
     */
    public static String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        String text = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll("");
        text = WHITESPACE_PATTERN.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Extracts the first sentence of plain text from the given HTML.
     *
     * <p>Tags and whitespace are removed via {@link #delHTMLTag(String)}, literal
     * {@code &nbsp;} entities are dropped, and the result is cut after the first
     * ideographic full stop ("。"). When the text contains no such full stop the whole
     * stripped text is returned rather than an empty string.
     *
     * @param htmlStr the HTML fragment to read; may be {@code null}
     * @return the text up to and including the first "。", or the whole stripped text
     *         when there is none
     */
    public static String getTextFromHtml(String htmlStr) {
        // String.replace is a literal replacement; no regex is needed for a fixed entity.
        String text = delHTMLTag(htmlStr).replace("&nbsp;", "");
        int stop = text.indexOf("。");
        return stop >= 0 ? text.substring(0, stop + 1) : text;
    }

    /**
     * Small demo that prints the text extracted from a sample fragment.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // NOTE(review): the markup of the original sample was lost when the article was
        // scraped; the tags below are a stand-in around the surviving text.
        String str = "<div style='text-align:center;'>"
                + "&nbsp;整治“四风”&nbsp;&nbsp;&nbsp;清弊除垢<br/>"
                + "<span style='font-size:14px;'>&nbsp;公司召开党的群众路线教育实践活动动员大会</span>"
                + "</div>";
        System.out.println(getTextFromHtml(str));
    }
}

你可能感兴趣的:(java用正则去除html标签)