获取网页标题、文本

        //获取 title
        private string GetTitle(string strHtml)
        {
            string str = "";
            string p = @"<title>([^<]*)</title>";
            Regex reg = new Regex(p, RegexOptions.IgnoreCase | RegexOptions.Compiled);
            Match m = reg.Match(strHtml);
            str = m.Groups[1].Captures[0].ToString().Replace(":", "-").Replace(@"/", "").Replace("/", "").Replace("*", "#").Replace("/"", "").Replace("<", "(").Replace(">", ")").Replace("|", "_");
            return str;
        }

        /// <summary>
        /// 获得读取的html文挡的标题
        /// </summary>
        /// <param name="html"></param>
        /// <returns></returns>
        public static string GetTitle(string html)
        {
            Match m = Regex.Match(html, "<title>([^<]*)</title>");
            if (m.Groups.Count == 2)
                return m.Groups[1].Value;
            return "此文挡标题未知";
        }

        /// <summary>
        /// 把读取的文件中的所有的html标记去掉,把&nbsp;替换成空格
        /// </summary>
        /// <param name="html"></param>
        /// <returns></returns>
        private string ParseHtml(string html)
        {
            string temp = Regex.Replace(html, @"<script[^>]*?>.*?</script>", "");
            temp = Regex.Replace(temp, "<[^>]*>", "");
            return temp.Replace("&nbsp;", " ");
        } 

 

   /// <summary>
        /// 获取页面预处理后的信息
        /// </summary>
        /// <remarks>页面主体内容是</remarks>
        /// <returns>页面主体内容</returns>
        public static string GetBodyText(string html)
        {
            string resultText;
            Regex reg_Body = new Regex(@"<(body/b)([/s/S]*)<//s*body/s*>", RegexOptions.IgnoreCase);
            Regex reg_script_start = new Regex(@"<script/b", RegexOptions.IgnoreCase);
            Regex reg_script_end = new Regex(@"<//s*script/s*>");
            Regex reg_script = new Regex(@"<script/b[^>]*/>");
            Regex reg_targe = new Regex(@"<[^>]*>");
            int start = 0;
            int end;
            int length;
            if (reg_Body.IsMatch(html))
            {
                string body = reg_Body.Match(html).Value;

                while (reg_script_start.IsMatch(body, start))
                {
                    start = reg_script_start.Match(body, start).Index;
                    end = reg_script_end.Match(body, start).Index;
                    length = end - start;
                    if (length > 0)
                    {
                        if (reg_script.IsMatch(body, start))
                        {
                            string script = reg_script.Match(body, start).Value;
                            length = script.Length;
                            body = body.Remove(start, length);

                        }
                        else
                        {

                            body = body.Remove(start, length);

                        }

                    }
                    else
                    {
                        start += 1;
                    }
                }

                resultText = reg_targe.Replace(body, " ");
                Regex nulls = new Regex(@"/s");
                resultText = nulls.Replace(resultText, "");
                Regex reg1 = new Regex(@"/s");
                Regex reg2 = new Regex(@"/.{2,}");
                Regex reg3 = new Regex(@"&amp;");
                Regex reg4 = new Regex(@"nbsp;");
                Regex reg5 = new Regex(@"quot;");
                Regex reg6 = new Regex(@"gt;");
                Regex reg7 = new Regex(@"&{2,}");
                resultText = reg1.Replace(resultText, "");
                resultText = reg2.Replace(resultText, "");
                resultText = reg3.Replace(resultText, "");
                resultText = reg4.Replace(resultText, "");
                resultText = reg5.Replace(resultText, "");
                resultText = reg6.Replace(resultText, "");
                resultText = reg7.Replace(resultText, "");
                return resultText;

            }
            else
            {
                return "No Body Text.";

            }
        }

你可能感兴趣的:(获取网页标题、文本)