C# 中过滤 HTML 的正则表达式

实现代码

/// <summary>
/// Removes HTML markup from a string and returns the remaining text, HTML-encoded.
/// </summary>
/// <param name="Htmlstring">Source text that may contain HTML markup; may be null.</param>
/// <returns>
/// The text with script blocks, comments, tags and stray angle brackets removed,
/// the handled character entities decoded, and the result HTML-encoded and trimmed.
/// Returns an empty string for null or empty input.
/// </returns>
public static string NoHTML(string Htmlstring)
{
  if (string.IsNullOrEmpty(Htmlstring))
  {
    return string.Empty;
  }

  // Remove script blocks together with their content; Singleline lets '.' span line breaks.
  Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);
  // Remove complete comments before the generic tag pattern, which would
  // otherwise stop at the first '>' inside a comment body.
  Htmlstring = Regex.Replace(Htmlstring, @"<!--.*?-->", "", RegexOptions.Singleline);
  // Remove the remaining tags.
  Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
  // Drop every line break together with the whitespace that follows it.
  Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
  // Remove leftover comment delimiters: a dangling close marker, and an
  // unterminated comment up to the end of its line.
  Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
  Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
  // Strings are immutable, so the result of Replace has to be assigned back.
  // Raw brackets are stripped BEFORE entities are decoded so that an escaped
  // "&lt;" in the text survives as a literal character.
  Htmlstring = Htmlstring.Replace("<", "").Replace(">", "").Replace("\r\n", "");
  // Decode the handled entities in a single pass so that text such as
  // "&amp;lt;" is decoded exactly once.
  Htmlstring = Regex.Replace(Htmlstring,
    @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
    new MatchEvaluator(DecodeNoHtmlEntity), RegexOptions.IgnoreCase);
  // WebUtility does not need an active HttpContext, so this also works
  // outside of a web request.
  return System.Net.WebUtility.HtmlEncode(Htmlstring).Trim();
}

// Maps one matched character reference to its replacement text for NoHTML.
private static string DecodeNoHtmlEntity(Match entity)
{
  switch (entity.Groups[1].Value.ToLowerInvariant())
  {
    case "quot":
    case "#34":
      return "\"";
    case "amp":
    case "#38":
      return "&";
    case "lt":
    case "#60":
      return "<";
    case "gt":
    case "#62":
      return ">";
    case "nbsp":
    case "#160":
      return "  ";
    case "iexcl":
    case "#161":
      return "\u00A1";
    case "cent":
    case "#162":
      return "\u00A2";
    case "pound":
    case "#163":
      return "\u00A3";
    case "copy":
    case "#169":
      return "\u00A9";
    default:
      // Only other numeric references reach this point; they are dropped.
      return string.Empty;
  }
}

C#过滤Html标签及空格

/// <summary>
/// Removes every HTML tag and every space character from the given text.
/// </summary>
/// <param name="HTMLStr">Text to filter; may be null or empty.</param>
/// <returns>The filtered text, or an empty string when the input is null or empty.</returns>
public static string FilterHTML(string HTMLStr)
    {
      // Nothing to filter for null or empty input.
      if (string.IsNullOrEmpty(HTMLStr))
      {
        return "";
      }

      // One alternation handles both cases: a complete tag, or a single space.
      System.Text.RegularExpressions.Regex tagOrSpace =
        new System.Text.RegularExpressions.Regex("<[^>]*>| ");
      string filtered = tagOrSpace.Replace(HTMLStr, "");
      return filtered;
    }

写一个静态方法移除HTML标签

#region
/// <summary>
/// Removes HTML tags from the given text.
/// </summary>
/// <param name="HTMLStr">Text that may contain HTML tags; may be null.</param>
/// <returns>The text without tags, or an empty string when the input is null or empty.</returns>
public static string ParseTags(string HTMLStr)
{
 // Regex.Replace throws on null input; return an empty string instead,
 // the same way FilterHTML treats missing input.
 if (string.IsNullOrEmpty(HTMLStr))
 {
  return string.Empty;
 }

 return System.Text.RegularExpressions.Regex.Replace(HTMLStr, "<[^>]*>", "");
}
#endregion

取出文本中的图片地址

#region
/// <summary>
/// Extracts the address of the first image referenced in an HTML fragment.
/// </summary>
/// <param name="HTMLStr">HTML text to search; may be null.</param>
/// <returns>
/// The value of the src attribute of the first img tag, with its original
/// casing, or an empty string when the input is null or empty or when no
/// img tag with a src attribute is found.
/// </returns>
public static string GetImgUrl(string HTMLStr)
{
 if (string.IsNullOrEmpty(HTMLStr))
 {
  return string.Empty;
 }

 // Match case-insensitively instead of lower-casing the input, because the
 // path part of a URL is case-sensitive. The "url" group captures a
 // double-quoted, single-quoted or unquoted attribute value; the leading \s
 // keeps attributes such as data-src from being mistaken for src.
 Match m = Regex.Match(
  HTMLStr,
  @"<img\b[^>]*?\ssrc\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))",
  RegexOptions.IgnoreCase);

 return m.Success ? m.Groups["url"].Value : string.Empty;
}
#endregion

提取HTML代码中文字的C#函数

///  
///  提取HTML代码中文字的C#函数
///  
///  包括HTML的源码  
///  已经去除后的文字
using System;
using System.Text.RegularExpressions;
public class StripHTMLTest
{
 public static void Main()
 {
  string s = StripHTML(
   "中国石龙信息平台faddfs龙信息平台");
  Console.WriteLine(s);
 }

 public static string StripHTML(string strHtml)
 {
  string[]aryReg =
  {
   @"]*?>.*?",

   @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\["
    "'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", @"([\r\n])[\s]+", @
    "&(quot|#34);", @"&(amp|#38);", @"&(lt|#60);", @"&(gt|#62);", @
    "&(nbsp|#160);", @"&(iexcl|#161);", @"&(cent|#162);", @"&(pound|#163);",
    @"&(copy|#169);", @"&#(\d+);", @"-->", @"
                    

你可能感兴趣的:(c#中过滤html的正则表达式)