C#字符串去除html格式

转自:http://www.cnblogs.com/grokyao/archive/2008/12/04/1347512.html

    在使用 FreeTextBox 等流行的富文本编辑器后,获得的文字内容里会掺杂一些 HTML 标记,有时需要将它们去掉。这里给出处理方法:使用正则表达式进行过滤。由于 HTML 标记都是 <...> 这种格式,此外还有类似 &nbsp; 这样的字符实体,所以分两次处理:第一次去除标记,第二次去除字符实体,最终得到不含 HTML 格式的字符串。

 简易代码:

// Sample rich-text editor output. Inside a verbatim string (@"...") every
// literal double quote must be doubled ("").
string html = @"<span lang=""EN-US"">&rdquo;</span>,用户可以随时接收喜欢的视频电视内容。<span lang=""EN-US""><a target=""_blank"" href=""http://info.tele.hc360.com/list/mobile.shtml""><span lang=""EN-US""><span lang=""EN-US"">手机</span></span></a></span>";

// Pass 1: remove every tag, i.e. anything of the form <...>.
string StrNohtml = System.Text.RegularExpressions.Regex.Replace(html, "<[^>]+>", "");

// Pass 2: remove character entities such as &nbsp;, &#160; or &#xA0;.
// The name is restricted to word characters so that a stray '&' in ordinary
// text does not swallow everything up to the next ';'.
StrNohtml = System.Text.RegularExpressions.Regex.Replace(StrNohtml, @"&#?\w+;", "");

Console.WriteLine(StrNohtml);

 

功能增强代码:

/// <summary>
/// Strips HTML markup from a string and returns HTML-encoded plain text.
/// </summary>
/// <remarks>
/// Processing order: script blocks and complete comments are removed first,
/// then all remaining tags, then a fixed set of character entities is decoded
/// (any other numeric entity is dropped), then leftover angle brackets and
/// CR/LF pairs are removed. The result is HTML-encoded again and trimmed, so
/// characters such as '&amp;' and '"' come back as entities in the output.
/// </remarks>
/// <param name="Htmlstring">The HTML fragment to clean; may be null or empty.</param>
/// <returns>
/// The cleaned, HTML-encoded text, or an empty string when
/// <paramref name="Htmlstring"/> is null or empty.
/// </returns>
public string NoHTML(string Htmlstring)
{
    // Regex.Replace throws on null input; treat "nothing in" as "nothing out".
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    const RegexOptions ignoreCase = RegexOptions.IgnoreCase;
    // Singleline lets '.' match line breaks so multi-line blocks are matched as a whole.
    const RegexOptions spanLines = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // Remove script blocks together with their content.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", spanLines);

    // Remove complete comments before generic tag stripping; otherwise a '>'
    // inside a comment ends the "tag" early and the rest of the comment leaks out.
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*?-->", "", spanLines);

    // Remove the remaining HTML tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", ignoreCase);

    // Remove a line break together with the whitespace that follows it.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", ignoreCase);

    // Clean up stray comment delimiters (e.g. an unterminated comment).
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", ignoreCase);

    // Decode the supported character entities.
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", ignoreCase);

    // Drop every other numeric entity.
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", ignoreCase);

    // Remove <img> tags that only appeared after entity decoding. The trailing
    // ';' required by the original pattern is optional here, so tags without it
    // are removed as well.
    Htmlstring = Regex.Replace(Htmlstring, @"<img[^>]*>;?", "", ignoreCase);

    // string.Replace returns a new string; the result must be assigned back.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");
    Htmlstring = Htmlstring.Replace("\r\n", "");

    // WebUtility.HtmlEncode needs no active HTTP request, unlike
    // HttpContext.Current, which is null outside of one.
    return System.Net.WebUtility.HtmlEncode(Htmlstring).Trim();
}

 

 

你可能感兴趣的:(html)