获取HTML源码(只取文字,判断编码,过滤标签)

/// <summary>
/// Click handler: reads the address typed into <c>textBox1</c>, checks that it
/// contains something shaped like <c>scheme://...</c>, downloads the page with
/// <c>GetHtml</c> and reduces it to plain text with <c>StripHTML</c>.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
        {
            string address = this.textBox1.Text;

            // Letters, "://", then any run of non-whitespace characters.
            // The character class is [a-zA-Z]; the earlier "A-z" spelling also let
            // '[', '\', ']', '^', '_' and '`' through as scheme characters.
            // The pattern is intentionally unanchored, as before.
            Regex urlPattern = new Regex(@"[a-zA-Z]+://[^\s]*");

            if (!urlPattern.IsMatch(address))
            {
                return;
            }

            try
            {
                string html = GetHtml(address);

                // GetHtml yields null when the download failed (it has already
                // told the user), so there is nothing to strip in that case.
                if (html == null)
                {
                    return;
                }

                // NOTE(review): the extracted text is not consumed anywhere in this
                // handler; presumably the caller-side display code was omitted.
                string plainText = StripHTML(html);
            }
            catch (Exception ex)
            {
                // Still best-effort (no dialog, no rethrow), but leave a trace
                // instead of discarding the failure silently.
                System.Diagnostics.Debug.WriteLine(ex);
            }
        }

 

 

1.获取HTML

GetHtml(String Url)

获取HTML源码(只取文字,判断编码,过滤标签) View Code
        /// <summary>
        /// Downloads the resource at <paramref name="Url"/> and returns its body as text.
        /// The bytes are decoded as UTF-8 first; if that text contains the Unicode
        /// replacement character (as reported by <c>isLuan</c>), the same bytes are
        /// decoded again as GB2312 and that result is returned instead.
        /// </summary>
        /// <param name="Url">Address to fetch; passed unchanged to <c>WebRequest.Create</c>.</param>
        /// <returns>
        /// The decoded text, or <c>null</c> when the download failed (the error is
        /// shown in a message box before returning).
        /// </returns>
        public String GetHtml(String Url)
        {
            // Created outside the try block on purpose: a malformed address keeps
            // surfacing to the caller as an exception, exactly as before.
            WebRequest request = WebRequest.Create(Url);
            request.Timeout = 50000; // milliseconds

            byte[] payload;
            try
            {
                // One request only: the raw bytes are buffered so they can be decoded
                // with either encoding, instead of downloading the page twice.
                // The using blocks release the response and streams on every path.
                using (WebResponse response = request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (MemoryStream buffer = new MemoryStream())
                {
                    byte[] chunk = new byte[8192];
                    int read;
                    while ((read = body.Read(chunk, 0, chunk.Length)) > 0)
                    {
                        buffer.Write(chunk, 0, read);
                    }
                    payload = buffer.ToArray();
                }
            }
            catch (WebException e)
            {
                MessageBox.Show(e.Message);
                return null;
            }
            catch (Exception e)
            {
                MessageBox.Show(e.ToString());
                return null;
            }

            string utf8Text = DecodeBody(payload, "UTF-8");
            if (!isLuan(utf8Text))
            {
                return utf8Text;
            }

            // UTF-8 decoding produced replacement characters, so fall back to GB2312.
            return DecodeBody(payload, "GB2312");
        }

        /// <summary>
        /// Decodes a downloaded body with the named encoding. A StreamReader is used
        /// (rather than Encoding.GetString) so byte-order-mark handling stays the same
        /// as when the response stream was read directly.
        /// </summary>
        private static string DecodeBody(byte[] payload, string encodingName)
        {
            using (StreamReader reader = new StreamReader(
                new MemoryStream(payload), Encoding.GetEncoding(encodingName)))
            {
                return reader.ReadToEnd();
            }
        }

  

2.去除HTML标记(正则表达式)

StripHTML(string strHtml)

获取HTML源码(只取文字,判断编码,过滤标签) View Code
 /// <summary>
        /// Removes HTML markup from a page and returns the remaining text on one line.
        /// Steps, in order: drop script and style elements with their contents, turn
        /// &amp;nbsp; into a space, delete CR/LF, strip tags, delete tabs and
        /// apostrophes, collapse runs of spaces, then strip anything still enclosed
        /// in angle brackets plus stray brackets.
        /// </summary>
        /// <param name="strHtml">HTML source text; may be null or empty.</param>
        /// <returns>The text with markup removed; an empty string for null or empty input.</returns>
        public static string StripHTML(string strHtml)
        {
            // A failed download hands null to this method; treat it as "no text".
            if (string.IsNullOrEmpty(strHtml))
            {
                return string.Empty;
            }

            // Script and style elements go first, contents included
            // ((?is) = ignore case, and let '.' span line breaks).
            string text = Regex.Replace(strHtml, "(?is)<script[^>]*>.*?</script>", "");
            text = Regex.Replace(text, "(?is)<style[^>]*>.*?</style>", "");

            // Non-breaking-space entities become ordinary spaces.
            text = Regex.Replace(text, "(?i)&nbsp;", " ");

            // Line breaks are removed before tag stripping so that a tag split
            // across several lines is still matched as a single tag.
            text = text.Replace("\r", "").Replace("\n", "");

            // Simple tags: '<', anything without angle brackets, '>'.
            text = Regex.Replace(text, "<[^<>]*>", "");

            // Tabs and apostrophes are deleted outright.
            // NOTE(review): an older comment described swapping ' for a typographic
            // quote, but the code has always deleted it; deletion is kept.
            text = text.Replace("\t", "");
            text = text.Replace("'", "");

            // Collapse every run of spaces into ONE space (the replacement used to be
            // two spaces, contradicting the documented intent).
            text = Regex.Replace(text, " +", " ");

            // Second pass for whatever is still enclosed in angle brackets
            // (e.g. leftovers from nested brackets), then drop stray brackets and
            // any &nbsp; that only formed once tags were removed.
            text = Regex.Replace(text, "<.+?>", "");
            text = text.Replace("<", "").Replace(">", "").Replace("&nbsp;", "");

            return text;
        }

 

3.判断是否为乱码(编码):在GetHtml里调用

获取HTML源码(只取文字,判断编码,过滤标签) View Code
        /// <summary>
        /// Reports whether <paramref name="txt"/> looks garbled, i.e. whether its UTF-8
        /// encoding contains the byte sequence EF BF BD (239 191 189), which is the
        /// encoding of the Unicode replacement character U+FFFD.
        /// </summary>
        /// <param name="txt">Decoded text to inspect; may be null or empty.</param>
        /// <returns><c>true</c> if the sequence occurs anywhere in the text, otherwise <c>false</c>.</returns>
        bool isLuan(string txt)
        {
            if (string.IsNullOrEmpty(txt))
            {
                return false;
            }

            byte[] bytes = Encoding.UTF8.GetBytes(txt);

            // "i + 2 < Length" lets the final three bytes be examined as well; the
            // previous bound (i < Length - 3) skipped a sequence at the very end.
            for (int i = 0; i + 2 < bytes.Length; i++)
            {
                if (bytes[i] == 239 && bytes[i + 1] == 191 && bytes[i + 2] == 189)
                {
                    return true;
                }
            }

            return false;
        }

 

 

你可能感兴趣的:(html)