C# RSS:新闻抓取正文并转TXT

如果你喜欢手机阅读

如果宿舍手机几乎没CMNET信号

如果你不想浪费手机流量

如果你只想睡前静静浏览今天的新闻

以下程序默认抓取 cnblogs 新闻的正文;cnbeta、网易深度、南方周末的首页正文抓取配置已在代码中以注释形式给出,取消注释即可启用,也可以按同样的方式添加其它网站

 
using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;

using System.Text.RegularExpressions;

using System.Net;

using System.Collections;

using System.Threading;

using System.IO;

using System.Configuration;

namespace RSS

{

    /// <summary>
    /// Console entry point: configures one <see cref="GetItem"/> crawler per news site
    /// and starts it. Each crawler runs on its own thread and appends the extracted
    /// article text to a time-stamped .txt file.
    /// </summary>
    class Program
    {
        /// <summary>Directory used when no output directory is given on the command line.</summary>
        private const string DefaultOutputDirectory = "i://";

        /// <summary>
        /// Starts the configured crawlers.
        /// </summary>
        /// <param name="args">
        /// Optional. <c>args[0]</c>, when present and non-empty, is the directory the
        /// text files are written to; otherwise <see cref="DefaultOutputDirectory"/> is used.
        /// </param>
        static void Main(string[] args)
        {
            // The drive letter was previously fixed in code, so the program could not
            // write anything on a machine without that drive. An explicit argument now wins.
            string file = (args != null && args.Length > 0 && args[0].Length > 0)
                ? args[0]
                : DefaultOutputDirectory;

            {
                // cnblogs news: list pages are pageUrl + page number, article links are site-relative.
                GetItem gi1 = new GetItem();
                gi1.pageUrl = "http://news.cnblogs.com/n/page/";
                gi1.prefix = "http://news.cnblogs.com";
                gi1.pageUrlsRegex = "\"(?<url>/n/[\\d]+?)\"";
                gi1.titleRegex = "<div id=\"news_title\"><a.*?>(?<title>.*?)</a>";
                gi1.timeRegex = "<span class=\"time\">(?<time>.*?)</span>";
                gi1.bodyRegex = "<div id=\"news_body\">(?<body>.*?)</div>";
                gi1.hostName = "CnBlogs";
                gi1.encoding = "utf-8";
                gi1.fileSave = buildSavePath(file, gi1.hostName);
                Console.WriteLine(gi1.fileSave);
                gi1.pageWantToGet = 20;
                gi1.threadStart();
            }

            // Further site configurations, disabled by default. Uncomment a block to enable it.

            // cnbeta: home page only, gb2312 encoded.
            //{
            //    GetItem gi2 = new GetItem();
            //    gi2.prefix = "http://www.cnbeta.com/";
            //    gi2.pageUrlsRegex = "\"(?<url>/articles/[\\d]+.htm?)\"";
            //    gi2.titleRegex = "id=\"news_title\">(?<title>.*?)</h3>";
            //    gi2.timeRegex = "id=\"news_author\"><span>(?<time>.*?)[|]";
            //    gi2.bodyRegex = "<div id=\"news_content\">(?<body>.*?)<!-- end newsBox news -->";
            //    gi2.hostName = "CnBeta";
            //    gi2.encoding = "gb2312";
            //    gi2.fileSave = buildSavePath(file, gi2.hostName);
            //    Console.WriteLine(gi2.fileSave);
            //    gi2.homeOnly = true;
            //    gi2.threadStart();
            //}

            // 163 focus news: absolute article links, articles may span several pages.
            //{
            //    GetItem gi2 = new GetItem();
            //    gi2.pageUrlsRegex = "\"(?<url>http://focus.news.163.com.[^>< ]*.html?)\"";
            //    gi2.prefix = "http://focus.news.163.com/";
            //    gi2.hasPrefix = false;//default:true
            //    gi2.hasManyPage = true;//default:false
            //    gi2.manyPageRegex = "<span class=\"s1 s3\">上一页</span>(?<np>.*?)下一页</a>";
            //    gi2.titleRegex = "id=\"h1title\">(?<title>.*?)</h1>";
            //    gi2.timeRegex = "<span class=\"info\">(?<time>.*?)来源";
            //    gi2.bodyRegex = "class=\"summary\"(?<body>.*?)<!-- 分页 -->";
            //    gi2.hostName = "163";
            //    gi2.encoding = "GBK";
            //    gi2.fileSave = buildSavePath(file, gi2.hostName);
            //    Console.WriteLine(gi2.fileSave);
            //    gi2.homeOnly = true;
            //    gi2.threadStart();
            //}

            // infzm: absolute article links, single-page articles.
            //{
            //    GetItem gi2 = new GetItem();
            //    gi2.pageUrlsRegex = "\"(?<url>http://www.infzm.com/content/[\\d]+?)\"";
            //    gi2.prefix = "http://www.infzm.com/";
            //    gi2.hasPrefix = false;//default:true
            //    gi2.hasManyPage = false;//default:false
            //    //gi2.manyPageRegex = "<span class=\"s1 s3\">上一页</span>(?<np>.*?)下一页</a>";
            //    gi2.titleRegex = "<div id=\"detailContent\">[\\s]*<h1>[\\s]*(?<title>.*?)[\\s]*</h1>";
            //    gi2.timeRegex = "<span class=\"pubTime\">(?<time>.*?)</span>";
            //    gi2.bodyRegex = "<div id=\"content-context\">(?<body>.*?)<!--end #text-->";
            //    gi2.hostName = "infzm";
            //    gi2.encoding = "utf-8";
            //    gi2.fileSave = buildSavePath(file, gi2.hostName);
            //    Console.WriteLine(gi2.fileSave);
            //    gi2.homeOnly = true;
            //    gi2.threadStart();
            //}
        }

        /// <summary>
        /// Builds the output path "&lt;directory&gt;/&lt;hostName&gt;_&lt;yyMMdd_HH-mm&gt;.txt"
        /// using the current local time.
        /// </summary>
        /// <param name="directory">Target directory, with or without a trailing separator.</param>
        /// <param name="hostName">Short site name used as the file-name prefix.</param>
        /// <returns>The combined file path.</returns>
        private static string buildSavePath(string directory, string hostName)
        {
            string fileName = string.Format("{0}_{1:yyMMdd_HH-mm}.txt", hostName, DateTime.Now);
            // Path.Combine only inserts a separator when the directory does not already end with one.
            return Path.Combine(directory, fileName);
        }
    }

    /// <summary>
    /// Crawler for one news site. It collects article links from the site's list pages
    /// (or from its home page), downloads every article, extracts title, time and body
    /// with the configured regular expressions, strips HTML markup from the body and
    /// appends the result to <see cref="fileSave"/>.
    /// Configure the public fields, then call <see cref="threadStart"/>.
    /// </summary>
    class GetItem
    {
        /// <summary>Base URL of the list pages; the 1-based page number is appended to it.</summary>
        public string pageUrl;

        /// <summary>When true only the page at <see cref="prefix"/> is scanned for article links.</summary>
        public bool homeOnly = false;

        /// <summary>When true, collected links are treated as site-relative and prepended with <see cref="prefix"/>.</summary>
        public bool hasPrefix = true;

        /// <summary>Number of list pages to scan (pages 1..pageWantToGet) when <see cref="homeOnly"/> is false.</summary>
        public int pageWantToGet = 1;

        /// <summary>When true, <see cref="manyPageRegex"/> is used to find continuation pages of an article.</summary>
        public bool hasManyPage = false;

        /// <summary>Regex whose named group <c>np</c> captures the pager markup of a multi-page article.</summary>
        public string manyPageRegex;

        /// <summary>Site root; also the URL scanned when <see cref="homeOnly"/> is true.</summary>
        public string prefix;

        /// <summary>Links collected so far, without duplicates, in discovery order.</summary>
        private List<string> pageUrls;

        /// <summary>Regex whose named group <c>url</c> captures an article link.</summary>
        public string pageUrlsRegex;

        /// <summary>Regex whose named group <c>title</c> captures the article title.</summary>
        public string titleRegex;

        /// <summary>Regex whose named group <c>time</c> captures the publication time.</summary>
        public string timeRegex;

        /// <summary>Regex whose named group <c>body</c> captures the article body markup.</summary>
        public string bodyRegex;

        /// <summary>Path of the text file the articles are appended to.</summary>
        public string fileSave;

        /// <summary>Short site name used in console messages.</summary>
        public string hostName;

        /// <summary>Name of the text encoding used to decode downloaded pages (e.g. "utf-8", "gb2312").</summary>
        public string encoding;

        /// <summary>Tags whose opening and closing forms are removed without a replacement.</summary>
        private static readonly string[] strippedTags =
        {
            "a", "img", "strong", "div", "b", "span", "script", "li",
            "style", "i", "h3", "h2", "font", "q"
        };

        /// <summary>HTML entities and the plain text they are replaced with.</summary>
        private static readonly string[][] entityReplacements =
        {
            new string[] { "&rdquo;", "\"" },
            new string[] { "&ldquo;", "\"" },
            new string[] { "&lsquo;", "'" },
            new string[] { "&rsquo;", "'" },
            new string[] { "&nbsp;", " " },
            new string[] { "&hellip;", "" },
            new string[] { "&ndash;", "-" },
            new string[] { "&mdash;", "" }
        };

        /// <summary>Creates a crawler with an empty link list.</summary>
        public GetItem()
        {
            pageUrls = new List<string>(50);
        }

        /// <summary>
        /// Normalizes <see cref="prefix"/> to end with "/" and starts the crawl on a new thread.
        /// </summary>
        public void threadStart()
        {
            // prefix is optional when hasPrefix and homeOnly are both false, so tolerate null.
            if (prefix != null && !prefix.EndsWith("/", StringComparison.Ordinal))
            {
                prefix += "/";
            }

            Thread worker = new Thread(new ThreadStart(start));
            worker.Start();
        }

        /// <summary>Thread body: collects the article links, then downloads every article.</summary>
        private void start()
        {
            if (homeOnly)
            {
                getPageUrls(-1);
            }
            else
            {
                for (int page = 1; page <= pageWantToGet; page++)
                {
                    getPageUrls(page);
                }
            }

            startGetAll();
        }

        /// <summary>Appends <paramref name="str"/> plus a line break to <see cref="fileSave"/>, encoded as gb2312.</summary>
        /// <param name="str">Text to append.</param>
        private void WriteFile(string str)
        {
            // using blocks make sure the file handle is released even when the write fails.
            using (FileStream fs = new FileStream(fileSave, FileMode.Append))
            using (StreamWriter writer = new StreamWriter(fs, System.Text.Encoding.GetEncoding("gb2312")))
            {
                writer.WriteLine(str);
            }
        }

        /// <summary>
        /// Converts an HTML fragment to plain text in place: removes script/style elements
        /// with their content, turns paragraph ends and line breaks into CRLF, removes the
        /// tags listed in <see cref="strippedTags"/> and replaces a few common entities.
        /// Tag matching ignores case. Tags not covered by these rules are left untouched.
        /// </summary>
        /// <param name="str">The fragment; replaced with the cleaned text.</param>
        private void deleteTag(ref string str)
        {
            const RegexOptions tagOptions = RegexOptions.IgnoreCase;

            // Script and CSS source is not article text, so drop the whole element first;
            // removing only the tags would leave the code in the output.
            foreach (string element in new string[] { "script", "style" })
            {
                str = Regex.Replace(
                    str,
                    "<[\\s]*" + element + "[^>]*>.*?</[\\s]*" + element + "[^>]*>",
                    "",
                    tagOptions | RegexOptions.Singleline);
            }

            str = Regex.Replace(str, "<[\\s]*p[^>]*>?>", "", tagOptions);
            str = Regex.Replace(str, "</[\\s]*p[\\s]*?>", "\r\n", tagOptions);

            // <br>, <br/> and </br> all stand for a line break.
            str = Regex.Replace(str, "<[\\s]*br[^>]*>?>", "\r\n", tagOptions);
            str = Regex.Replace(str, "</[\\s]*br[^>]*>?>", "\r\n", tagOptions);

            foreach (string tag in strippedTags)
            {
                str = Regex.Replace(str, "<[\\s]*" + tag + "[\\s]*[^>]*>?>", "", tagOptions);
                str = Regex.Replace(str, "</[\\s]*" + tag + "[\\s]*[^>]*>?>", "", tagOptions);
            }

            foreach (string[] entity in entityReplacements)
            {
                str = str.Replace(entity[0], entity[1]);
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and returns its text decoded with <see cref="encoding"/>.
        /// The lines of the response are concatenated without separators, so the returned
        /// string contains no line breaks.
        /// </summary>
        /// <param name="url">Absolute URL to fetch.</param>
        /// <returns>The page text as a single line.</returns>
        /// <remarks>Network, URI and encoding errors propagate to the caller.</remarks>
        private string downloadPage(string url)
        {
            HttpWebRequest req = (HttpWebRequest)HttpWebRequest.Create(url);
            req.Method = "GET";
            // Sent as before; it has no effect on a GET without a body.
            req.ContentType = "text/html;charset=utf-8";

            using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
            using (Stream respStream = resp.GetResponseStream())
            using (StreamReader reader = new StreamReader(respStream, System.Text.Encoding.GetEncoding(this.encoding)))
            {
                StringBuilder page = new StringBuilder();
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    page.Append(line);
                }
                return page.ToString();
            }
        }

        /// <summary>
        /// Turns a collected link into the URL to download: returned unchanged when
        /// <see cref="hasPrefix"/> is false, otherwise appended to <see cref="prefix"/>
        /// without doubling the slash.
        /// </summary>
        /// <param name="link">Link as captured by <see cref="pageUrlsRegex"/>.</param>
        /// <returns>The URL to request.</returns>
        private string resolveUrl(string link)
        {
            if (!hasPrefix)
            {
                return link;
            }

            // prefix already ends with "/" (see threadStart), so drop a leading slash.
            string relative = link.StartsWith("/", StringComparison.Ordinal) ? link.Substring(1) : link;
            return string.Format("{0}{1}", prefix, relative);
        }

        /// <summary>
        /// Downloads a continuation page of a multi-page article and returns its cleaned body.
        /// </summary>
        /// <param name="url">Absolute URL of the continuation page.</param>
        /// <returns>The plain-text body, or an empty string when the body is not found or the download fails.</returns>
        private string getNextPageContent(string url)
        {
            Console.WriteLine(url);

            try
            {
                string html = downloadPage(url);
                Match m = Regex.Match(html, this.bodyRegex, RegexOptions.Singleline);
                if (!m.Success)
                {
                    return "";
                }

                string body = m.Groups["body"].Value;
                deleteTag(ref body);
                Console.WriteLine("已获取下一页正文");
                return body;
            }
            catch (Exception ex)
            {
                // Best effort: a failed continuation page must not lose the rest of the article.
                Console.WriteLine("异常:{0}", ex.Message);
                return "";
            }
        }

        /// <summary>
        /// Downloads one article and appends title, "(index/total)url", time, body and any
        /// continuation pages, followed by a separator line, to the output file.
        /// Errors are reported on the console and the article is skipped.
        /// </summary>
        /// <param name="url">Absolute URL of the article.</param>
        /// <param name="index">1-based position of the article in the crawl.</param>
        /// <param name="total">Number of articles in the crawl.</param>
        private void getContent(string url, int index, int total)
        {
            Console.WriteLine(url);

            try
            {
                string html = downloadPage(url);
                StringBuilder cont = new StringBuilder();

                Match m = Regex.Match(html, this.titleRegex, RegexOptions.Singleline);
                if (m.Success)
                {
                    Console.WriteLine("title:{0}", m.Groups["title"].Value);
                    cont.AppendLine(m.Groups["title"].Value);
                }

                cont.AppendLine(string.Format("({0}/{1}){2}", index, total, url));

                m = Regex.Match(html, this.timeRegex, RegexOptions.Singleline);
                if (m.Success)
                {
                    Console.WriteLine("time:{0}", m.Groups["time"].Value);
                    cont.AppendLine(m.Groups["time"].Value);
                }

                m = Regex.Match(html, this.bodyRegex, RegexOptions.Singleline);
                if (m.Success)
                {
                    string body = m.Groups["body"].Value;
                    deleteTag(ref body);
                    Console.WriteLine("获取正文");
                    cont.AppendLine(body);
                }

                if (hasManyPage)
                {
                    appendFollowingPages(html, cont);
                }

                cont.AppendLine("--------------------------------------------------------------");
                WriteFile(cont.ToString());
            }
            catch (Exception ex)
            {
                Console.WriteLine("异常:{0},{1}", ex.Source, ex.Message);
            }
        }

        /// <summary>
        /// Finds the pager of a multi-page article in <paramref name="html"/> and appends the
        /// body of every continuation page not seen before to <paramref name="cont"/>.
        /// </summary>
        /// <param name="html">Text of the article's first page.</param>
        /// <param name="cont">Buffer receiving the continuation bodies.</param>
        private void appendFollowingPages(string html, StringBuilder cont)
        {
            Match pager = Regex.Match(html, this.manyPageRegex, RegexOptions.Singleline);
            if (!pager.Success)
            {
                return;
            }

            Console.WriteLine("存在多页..");
            MatchCollection links = Regex.Matches(pager.Groups["np"].Value, this.pageUrlsRegex, RegexOptions.Singleline);
            foreach (Match link in links)
            {
                string next = link.Groups["url"].Value;
                if (pageUrls.Contains(next))
                {
                    continue;
                }

                // Remember the link so the same continuation page is not pulled in twice.
                pageUrls.Add(next);
                // Continuation links get the same prefix handling as article links.
                cont.AppendLine(getNextPageContent(resolveUrl(next)));
            }
        }

        /// <summary>Downloads every collected article in discovery order.</summary>
        private void startGetAll()
        {
            // getContent may add continuation-page links to pageUrls. Iterate over a snapshot
            // so those pages are not downloaded again as separate articles and the total stays fixed.
            string[] articles = pageUrls.ToArray();
            for (int i = 0; i < articles.Length; i++)
            {
                getContent(resolveUrl(articles[i]), i + 1, articles.Length);
            }
        }

        /// <summary>
        /// Scans one list page for article links and adds the new ones to the link list.
        /// Errors are reported on the console; the links found so far are kept.
        /// </summary>
        /// <param name="pageIndex">List page number appended to <see cref="pageUrl"/>, or -1 to scan <see cref="prefix"/>.</param>
        private void getPageUrls(int pageIndex)
        {
            string url = pageIndex == -1
                ? prefix
                : string.Format("{0}{1}", this.pageUrl, pageIndex);
            Console.WriteLine(url);

            try
            {
                string html = downloadPage(url);
                MatchCollection links = Regex.Matches(html, this.pageUrlsRegex, RegexOptions.Singleline);
                foreach (Match link in links)
                {
                    string found = link.Groups["url"].Value;
                    if (!pageUrls.Contains(found))
                    {
                        pageUrls.Add(found);
                    }
                }
                Console.WriteLine("{0}:{1} articles.", this.hostName, pageUrls.Count);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }

            Console.WriteLine("{0} end!", this.hostName);
        }
    }

}



注:需要在项目中添加 app.config(Project -> Add New Item -> XML File,命名为 app.config),以便 HttpWebRequest 能够解析部分网站返回的不严格符合规范的响应头。
内容如下:

<?xml version="1.0" encoding="utf-8" ?>
<configuration>
  <system.net>
    <settings>
      <httpWebRequest  useUnsafeHeaderParsing= "true"  />
    </settings>
  </system.net>
</configuration>


你可能感兴趣的:(txt)