asp.net 新闻采集 简单示例

在网上看了点资料,自己整理了一下。我感觉如果要实际使用,新闻地址应该用 RSS 来配置,这样会好用些。

o(∩_∩)o 哈哈

/// <summary>
/// Downloads the page at <paramref name="url"/>, extracts the fragment that
/// starts at the <paramref name="begin"/> marker and ends with the
/// <paramref name="end"/> marker (both markers included), strips the HTML
/// with <c>DelHTML</c> and shows the result in <c>txtContent</c>.
/// If either marker is null/empty or cannot be found, a script alert is
/// written to the response instead and <c>txtContent</c> is left untouched.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="begin">Literal text marking the start of the fragment.</param>
/// <param name="end">Literal text marking the end of the fragment; it is searched for after <paramref name="begin"/>.</param>
private void init2(string url,string begin,string end)
    {
        const string ruleErrorScript = "<script>alert('规则定义错误!');</script>";

        // An empty marker cannot delimit anything; report it as a rule error
        // rather than throwing or silently returning the whole page.
        if (string.IsNullOrEmpty(begin) || string.IsNullOrEmpty(end))
        {
            Response.Write(ruleErrorScript);
            return;
        }

        string content;
        HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url);

        // using blocks release the connection even when reading throws.
        using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
        using (Stream stream = webResponse.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(stream, Encoding.GetEncoding("gb2312")))
        {
            // The page is always decoded as gb2312, regardless of what the server declares.
            content = streamReader.ReadToEnd();
        }

        // Markers are markup fragments, so compare them ordinally instead of
        // with the culture-sensitive default of IndexOf(string).
        int beginIndex = content.IndexOf(begin, System.StringComparison.Ordinal);
        int endIndex = beginIndex < 0
            ? -1
            : content.IndexOf(end, beginIndex + begin.Length, System.StringComparison.Ordinal);

        if (beginIndex < 0 || endIndex < 0)
        {
            Response.Write(ruleErrorScript);
            return;
        }

        // Keep both markers in the extracted fragment, as before.
        string fragment = content.Substring(beginIndex, endIndex + end.Length - beginIndex);
        txtContent.Text = DelHTML(fragment);
    }



    // Regex patterns for HTML entities paired with their literal replacements.
    // They are applied top to bottom; "&amp;" is deliberately last so that
    // text such as "&amp;lt;" is decoded once ("&lt;") and not twice ("<").
    private static readonly string[,] HtmlEntityReplacements =
    {
        { @"&(quot|#34);", "\"" },
        { @"&(lt|#60);", "<" },
        { @"&(gt|#62);", ">" },
        { @"&(nbsp|#160);", " " },
        { @"&(iexcl|#161);", "\u00A1" },
        { @"&(cent|#162);", "\u00A2" },
        { @"&(pound|#163);", "\u00A3" },
        { @"&(copy|#169);", "\u00A9" },
        { @"&#(\d+);", "" },
        { @"&(amp|#38);", "&" }
    };

    /// <summary>
    /// Removes HTML markup from a string: script elements, tags, comment
    /// remnants, and a fixed set of entities (decoded to their characters;
    /// any other numeric entity is dropped). Any angle brackets and CRLF
    /// pairs that remain afterwards are removed as well.
    /// </summary>
    /// <param name="Htmlstring">The HTML text to clean; may be null or empty.</param>
    /// <returns>The text with markup stripped; an empty string for null or empty input.</returns>
    public static string DelHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        System.Text.RegularExpressions.RegexOptions ignoreCase =
            System.Text.RegularExpressions.RegexOptions.IgnoreCase;

        // Remove script elements. Singleline lets '.' cross line breaks so the
        // body of a multi-line script is removed together with its tags.
        Htmlstring = System.Text.RegularExpressions.Regex.Replace(
            Htmlstring,
            @"<script[^>]*?>.*?</script>",
            "",
            ignoreCase | System.Text.RegularExpressions.RegexOptions.Singleline);

        // Remove tags, then line breaks followed by whitespace, then whatever
        // is left of HTML comments.
        Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", ignoreCase);
        Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", ignoreCase);
        Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"-->", "", ignoreCase);
        Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, @"<!--.*", "", ignoreCase);

        // Removes every occurrence of the single-character pattern below.
        // NOTE(review): the pattern is a plain space here; it may originally
        // have been a non-breaking space or "&nbsp;" - confirm against the source.
        Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring, " ", "", ignoreCase);

        // Decode the known entities and drop any other numeric entity.
        for (int i = 0; i < HtmlEntityReplacements.GetLength(0); i++)
        {
            Htmlstring = System.Text.RegularExpressions.Regex.Replace(
                Htmlstring,
                HtmlEntityReplacements[i, 0],
                HtmlEntityReplacements[i, 1],
                ignoreCase);
        }

        // Strings are immutable: the result of Replace has to be assigned,
        // otherwise these calls have no effect.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        return Htmlstring;
    }

 

 

 

你可能感兴趣的:(asp.net)