html代码转换成为纯文本

先对 HTML 字符串进行解码,再用下面这段代码将其转换为纯文本:
/// <summary>
/// Converts an HTML fragment or document into plain text using regular expressions.
/// </summary>
/// <param name="source">The HTML markup to convert. May be null or empty.</param>
/// <returns>
/// The text content of <paramref name="source"/> with the head, script and style
/// elements removed, all other tags stripped, and basic entities decoded.
/// Line breaks from &lt;br&gt; and &lt;li&gt; are emitted as a single "\r"; paragraph and
/// table-row breaks are emitted as "\r\n". Returns an empty string when
/// <paramref name="source"/> is null or empty.
/// </returns>
/// <remarks>
/// This is a heuristic, regex-based conversion, not an HTML parser: malformed markup
/// or a literal '&gt;' inside an attribute value can produce imperfect output.
/// </remarks>
public static string HtmlToText(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return string.Empty;
    }

    const RegexOptions options = RegexOptions.IgnoreCase;

    // Source-level line breaks and tabs carry no meaning in HTML; flatten them to
    // spaces. This also lets '.' in the patterns below span the whole document.
    string result = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", " ");

    // Remove the header, scripts and styles together with their content.
    // Lazy '.*?' stops at the first closing tag so text between two script
    // blocks is kept; '\b' keeps <head> from matching <header>.
    result = Regex.Replace(result, @"<\s*head\b[^>]*>.*?<\s*/\s*head\s*>", string.Empty, options);
    result = Regex.Replace(result, @"<\s*script\b[^>]*>.*?<\s*/\s*script\s*>", string.Empty, options);
    result = Regex.Replace(result, @"<\s*style\b[^>]*>.*?<\s*/\s*style\s*>", string.Empty, options);

    // Insert tabs in place of <td> tags.
    result = Regex.Replace(result, @"<\s*td\b[^>]*>", "\t", options);

    // Insert line breaks in place of <br> and <li> tags
    // (including self-closing forms and tags with attributes).
    result = Regex.Replace(result, @"<\s*br\b[^>]*>", "\r", options);
    result = Regex.Replace(result, @"<\s*li\b[^>]*>", "\r", options);

    // Insert paragraph breaks in place of <tr> and <p> tags.
    // '\b' prevents <p> from matching <pre>/<param> and <tr> from matching <track>.
    result = Regex.Replace(result, @"<\s*tr\b[^>]*>", "\r\r", options);
    result = Regex.Replace(result, @"<\s*p\b[^>]*>", "\r\r", options);

    // Remove anything that is still enclosed inside < >.
    result = Regex.Replace(result, @"<[^>]*>", string.Empty, options);

    // Decode the common entities and drop any other short entity. This is done in a
    // single pass so that "&amp;lt;" becomes the literal text "&lt;" instead of
    // being decoded a second time into "<".
    result = Regex.Replace(
        result,
        @"&(#?[a-z0-9]{2,6});",
        match =>
        {
            switch (match.Groups[1].Value.ToLowerInvariant())
            {
                case "amp":
                    return "&";
                case "nbsp":
                    return " ";
                case "lt":
                    return "<";
                case "gt":
                    return ">";
                default:
                    return string.Empty;
            }
        },
        options);

    // Collapse runs of spaces, drop spaces sandwiched between line breaks, and
    // turn each run of paragraph breaks into a single "\r\n".
    result = Regex.Replace(result, @" ( )+", " ");
    result = Regex.Replace(result, "(\r)( )+(\r)", "\r\r");
    result = Regex.Replace(result, @"(\r\r)+", "\r\n");

    return result;
}

  • 转载于:https://www.cnblogs.com/walleyekneel/archive/2011/09/20/2182635.html

    你可能感兴趣的:(html代码转换成为纯文本)