如果你喜欢手机阅读
如果宿舍手机几乎没CMNET信号
如果你不想浪费手机流量
如果你只想睡前静静浏览今天的新闻
以下程序可抓取 cnblogs、cnbeta、网易深度、南方周末的首页正文(默认只启用 cnblogs),也可按同样方式添加其它网站。
using System;
using System.Collections;
using System.Collections.Generic;
using System.Configuration;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;

namespace RSS
{
    /// <summary>
    /// Console entry point: builds one <see cref="GetItem"/> per selected site and
    /// starts each of them on its own thread.
    /// </summary>
    class Program
    {
        /// <summary>Output location used when no command-line argument is given.</summary>
        private const string DefaultOutputPrefix = "i://";

        /// <summary>
        /// Starts the crawl.
        /// </summary>
        /// <param name="args">
        /// Optional. <c>args[0]</c> is the output directory (defaults to <c>i://</c>);
        /// every further argument is a site name (<c>cnblogs</c>, <c>cnbeta</c>,
        /// <c>163</c>, <c>infzm</c>). With no site names only cnblogs is crawled.
        /// </param>
        static void Main(string[] args)
        {
            string file = DefaultOutputPrefix;
            if (args != null && args.Length > 0 && args[0].Length > 0)
            {
                file = args[0];
                // The file name is appended directly to this prefix, so it must end in a separator.
                if (!file.EndsWith("/", StringComparison.Ordinal) &&
                    !file.EndsWith("\\", StringComparison.Ordinal))
                {
                    file += Path.DirectorySeparatorChar;
                }
            }

            List<GetItem> jobs = new List<GetItem>();
            if (args == null || args.Length <= 1)
            {
                jobs.Add(CreateCnBlogs(file));
            }
            else
            {
                for (int i = 1; i < args.Length; i++)
                {
                    GetItem job = CreateSite(args[i], file);
                    if (job == null)
                    {
                        Console.WriteLine("Unknown site: {0}", args[i]);
                    }
                    else
                    {
                        jobs.Add(job);
                    }
                }
            }

            foreach (GetItem job in jobs)
            {
                Console.WriteLine(job.fileSave);
                job.threadStart();
            }
        }

        /// <summary>Maps a site name (case-insensitive) to its profile; returns null for unknown names.</summary>
        private static GetItem CreateSite(string name, string file)
        {
            switch ((name ?? "").ToLowerInvariant())
            {
                case "cnblogs": return CreateCnBlogs(file);
                case "cnbeta": return CreateCnBeta(file);
                case "163": return Create163(file);
                case "infzm": return CreateInfzm(file);
                default: return null;
            }
        }

        /// <summary>Builds "&lt;prefix&gt;&lt;host&gt;_&lt;yyMMdd_HH-mm&gt;.txt" from the current local time.</summary>
        private static string BuildFileName(string file, string hostName)
        {
            return string.Format("{0}{1}_{2:yyMMdd_HH-mm}.txt", file, hostName, DateTime.Now);
        }

        /// <summary>news.cnblogs.com: walks the first 20 list pages.</summary>
        private static GetItem CreateCnBlogs(string file)
        {
            GetItem gi = new GetItem
            {
                pageUrl = "http://news.cnblogs.com/n/page/",
                prefix = "http://news.cnblogs.com",
                pageUrlsRegex = "\"(?<url>/n/[\\d]+?)\"",
                titleRegex = "<div id=\"news_title\"><a.*?>(?<title>.*?)</a>",
                timeRegex = "<span class=\"time\">(?<time>.*?)</span>",
                bodyRegex = "<div id=\"news_body\">(?<body>.*?)</div>",
                hostName = "CnBlogs",
                encoding = "utf-8",
                pageWantToGet = 20
            };
            gi.fileSave = BuildFileName(file, gi.hostName);
            return gi;
        }

        // NOTE(review): the three profiles below existed only as commented-out code.
        // Their patterns are reproduced unchanged and have not been re-verified against
        // the current markup of those sites.

        /// <summary>cnbeta.com: home page only.</summary>
        private static GetItem CreateCnBeta(string file)
        {
            GetItem gi = new GetItem
            {
                prefix = "http://www.cnbeta.com/",
                pageUrlsRegex = "\"(?<url>/articles/[\\d]+.htm?)\"",
                titleRegex = "id=\"news_title\">(?<title>.*?)</h3>",
                timeRegex = "id=\"news_author\"><span>(?<time>.*?)[|]",
                bodyRegex = "<div id=\"news_content\">(?<body>.*?)<!-- end newsBox news -->",
                hostName = "CnBeta",
                encoding = "gb2312",
                homeOnly = true
            };
            gi.fileSave = BuildFileName(file, gi.hostName);
            return gi;
        }

        /// <summary>focus.news.163.com: home page only, absolute links, multi-page articles.</summary>
        private static GetItem Create163(string file)
        {
            GetItem gi = new GetItem
            {
                pageUrlsRegex = "\"(?<url>http://focus.news.163.com.[^>< ]*.html?)\"",
                prefix = "http://focus.news.163.com/",
                hasPrefix = false,
                hasManyPage = true,
                manyPageRegex = "<span class=\"s1 s3\">上一页</span>(?<np>.*?)下一页</a>",
                titleRegex = "id=\"h1title\">(?<title>.*?)</h1>",
                timeRegex = "<span class=\"info\">(?<time>.*?)来源",
                bodyRegex = "class=\"summary\"(?<body>.*?)<!-- 分页 -->",
                hostName = "163",
                encoding = "GBK",
                homeOnly = true
            };
            gi.fileSave = BuildFileName(file, gi.hostName);
            return gi;
        }

        /// <summary>infzm.com: home page only, absolute links.</summary>
        private static GetItem CreateInfzm(string file)
        {
            GetItem gi = new GetItem
            {
                pageUrlsRegex = "\"(?<url>http://www.infzm.com/content/[\\d]+?)\"",
                prefix = "http://www.infzm.com/",
                hasPrefix = false,
                hasManyPage = false,
                titleRegex = "<div id=\"detailContent\">[\\s]*<h1>[\\s]*(?<title>.*?)[\\s]*</h1>",
                timeRegex = "<span class=\"pubTime\">(?<time>.*?)</span>",
                bodyRegex = "<div id=\"content-context\">(?<body>.*?)<!--end #text-->",
                hostName = "infzm",
                encoding = "utf-8",
                homeOnly = true
            };
            gi.fileSave = BuildFileName(file, gi.hostName);
            return gi;
        }
    }

    /// <summary>
    /// Crawl profile for one site: collects article links from list pages, downloads
    /// each article, extracts title/time/body with the configured regular expressions,
    /// strips the markup and appends the text to <see cref="fileSave"/>.
    /// </summary>
    class GetItem
    {
        /// <summary>List-page URL to which the 1-based page number is appended (unused when <see cref="homeOnly"/>).</summary>
        public string pageUrl;
        /// <summary>When true only <see cref="prefix"/> itself is scanned for article links.</summary>
        public bool homeOnly = false;
        /// <summary>When true, matched links are relative and get <see cref="prefix"/> prepended.</summary>
        public bool hasPrefix = true;
        /// <summary>Number of list pages to scan when <see cref="homeOnly"/> is false.</summary>
        public int pageWantToGet = 1;
        /// <summary>When true, articles are checked for continuation pages via <see cref="manyPageRegex"/>.</summary>
        public bool hasManyPage = false;
        /// <summary>Pattern whose <c>np</c> group captures the pager markup of an article.</summary>
        public string manyPageRegex;
        /// <summary>Site root; a trailing "/" is added by <see cref="threadStart"/> if missing.</summary>
        public string prefix;
        /// <summary>Pattern whose <c>url</c> group captures one article link.</summary>
        public string pageUrlsRegex;
        /// <summary>Pattern whose <c>title</c> group captures the article title.</summary>
        public string titleRegex;
        /// <summary>Pattern whose <c>time</c> group captures the publication time.</summary>
        public string timeRegex;
        /// <summary>Pattern whose <c>body</c> group captures the article body markup.</summary>
        public string bodyRegex;
        /// <summary>Path of the text file the articles are appended to.</summary>
        public string fileSave;
        /// <summary>Display name of the site, used in console messages.</summary>
        public string hostName;
        /// <summary>Name of the character encoding used to decode downloaded pages.</summary>
        public string encoding;

        // Article links in discovery order.
        private List<string> pageUrls;
        // Every link seen so far (articles and continuation pages); gives O(1) de-duplication.
        private HashSet<string> knownUrls;

        // <script>/<style> elements including their content, which is never readable text.
        private static readonly Regex ScriptOrStyleBlock = new Regex(
            @"<\s*(script|style)\b[^>]*>.*?<\s*/\s*\1\s*>",
            RegexOptions.Singleline | RegexOptions.IgnoreCase);
        // <br>, <br/>, <br /> and the malformed </br>.
        private static readonly Regex LineBreakTag = new Regex(
            @"<\s*/?\s*br\b[^>]*>", RegexOptions.IgnoreCase);
        private static readonly Regex ParagraphEndTag = new Regex(
            @"<\s*/\s*p\s*>", RegexOptions.IgnoreCase);
        // Any remaining opening or closing tag.
        private static readonly Regex AnyTag = new Regex(@"<\s*/?\s*[A-Za-z][^>]*>");

        // Applied in order after the tags are gone. Characters outside gb2312 (the output
        // encoding), such as the no-break space and en dash, are mapped to plain equivalents.
        // "&amp;" is last so that text like "&amp;lt;" is not decoded twice.
        private static readonly string[][] TextReplacements =
        {
            new[] { "&rdquo;", "\"" }, new[] { "\u201D", "\"" },
            new[] { "&ldquo;", "\"" }, new[] { "\u201C", "\"" },
            new[] { "&lsquo;", "'" },  new[] { "\u2018", "'" },
            new[] { "&rsquo;", "'" },  new[] { "\u2019", "'" },
            new[] { "&nbsp;", " " },   new[] { "\u00A0", " " },
            new[] { "&hellip;", "\u2026" },
            new[] { "&ndash;", "-" },  new[] { "\u2013", "-" },
            new[] { "&mdash;", "\u2014" },
            new[] { "&quot;", "\"" },
            new[] { "&lt;", "<" },
            new[] { "&gt;", ">" },
            new[] { "&amp;", "&" }
        };

        public GetItem()
        {
            pageUrls = new List<string>(50);
            knownUrls = new HashSet<string>();
        }

        /// <summary>
        /// Normalizes <see cref="prefix"/> to end with "/" and runs the crawl on a new thread.
        /// </summary>
        public void threadStart()
        {
            // prefix may legitimately be unset when hasPrefix is false and homeOnly is false.
            if (prefix != null && !prefix.EndsWith("/", StringComparison.Ordinal))
            {
                prefix += "/";
            }

            Thread worker = new Thread(new ThreadStart(start));
            worker.Start();
        }

        /// <summary>Collects the article links, then downloads every article.</summary>
        private void start()
        {
            if (homeOnly)
            {
                getPageUrls(-1);
            }
            else
            {
                for (int page = 1; page <= pageWantToGet; page++)
                {
                    getPageUrls(page);
                }
            }

            startGetAll();
        }

        /// <summary>Appends <paramref name="str"/> plus a line break to <see cref="fileSave"/>, encoded as gb2312.</summary>
        private void WriteFile(string str)
        {
            // using blocks release the file handle even when the write throws.
            using (FileStream fs = new FileStream(fileSave, FileMode.Append))
            using (StreamWriter writer = new StreamWriter(fs, Encoding.GetEncoding("gb2312")))
            {
                writer.WriteLine(str);
            }
        }

        /// <summary>
        /// Converts an HTML fragment to plain text in place: script/style elements are
        /// removed with their content, line breaks and paragraph ends become CRLF, all
        /// other tags are dropped, and common entities and typographic characters are
        /// replaced by plain ones.
        /// </summary>
        /// <param name="str">The fragment; replaced by the cleaned text. Null or empty is left untouched.</param>
        private void deleteTag(ref string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return;
            }

            string text = ScriptOrStyleBlock.Replace(str, "");
            text = LineBreakTag.Replace(text, "\r\n");
            // Must run before AnyTag, which would otherwise swallow </p> without a line break.
            text = ParagraphEndTag.Replace(text, "\r\n");
            text = AnyTag.Replace(text, "");

            foreach (string[] pair in TextReplacements)
            {
                text = text.Replace(pair[0], pair[1]);
            }

            str = text;
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and returns its text decoded with
        /// <see cref="encoding"/>. Line terminators are dropped so the page is one line.
        /// </summary>
        /// <exception cref="Exception">Network, URI and encoding errors propagate to the caller.</exception>
        private string downloadPage(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "GET";
            // Request header only; response decoding is governed by this.encoding.
            req.ContentType = "text/html;charset=utf-8";

            using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
            using (Stream body = resp.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding(this.encoding)))
            {
                StringBuilder page = new StringBuilder();
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    page.Append(line);
                }
                return page.ToString();
            }
        }

        /// <summary>Turns a matched link into an absolute URL according to <see cref="hasPrefix"/>.</summary>
        private string resolveUrl(string found)
        {
            if (!hasPrefix)
            {
                return found;
            }

            // prefix already ends with "/", so a leading "/" on the link would be doubled.
            string relative = found.StartsWith("/", StringComparison.Ordinal) ? found.Substring(1) : found;
            return string.Format("{0}{1}", prefix, relative);
        }

        /// <summary>
        /// Downloads a continuation page of a multi-page article and returns its cleaned
        /// body text, or an empty string when the download fails or no body is found.
        /// </summary>
        private string getNextPageContent(string url)
        {
            Console.WriteLine(url);
            try
            {
                Regex bodyr = new Regex(this.bodyRegex, RegexOptions.Singleline);
                Match m = bodyr.Match(downloadPage(url));
                if (m.Success)
                {
                    string body = m.Groups["body"].Value;
                    deleteTag(ref body);
                    Console.WriteLine("已获取下一页正文");
                    return body;
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("异常:{0}", ex.Message);
            }

            return "";
        }

        /// <summary>
        /// Downloads one article, extracts title, time and body (plus continuation pages
        /// when <see cref="hasManyPage"/> is set) and appends the result to the output file.
        /// Failures are reported on the console and the article is skipped.
        /// </summary>
        /// <param name="url">Absolute article URL.</param>
        /// <param name="index">1-based position of the article, for the progress marker.</param>
        /// <param name="total">Total number of articles, for the progress marker.</param>
        private void getContent(string url, int index, int total)
        {
            Console.WriteLine(url);
            try
            {
                Regex titler = new Regex(this.titleRegex, RegexOptions.Singleline);
                Regex timer = new Regex(this.timeRegex, RegexOptions.Singleline);
                Regex bodyr = new Regex(this.bodyRegex, RegexOptions.Singleline);

                string html = downloadPage(url);
                StringBuilder cont = new StringBuilder();

                Match m = titler.Match(html);
                if (m.Success)
                {
                    Console.WriteLine("title:{0}", m.Groups["title"].Value);
                    cont.AppendLine(m.Groups["title"].Value);
                }

                cont.AppendLine(string.Format("({0}/{1}){2}", index, total, url));

                m = timer.Match(html);
                if (m.Success)
                {
                    Console.WriteLine("time:{0}", m.Groups["time"].Value);
                    cont.AppendLine(m.Groups["time"].Value);
                }

                m = bodyr.Match(html);
                if (m.Success)
                {
                    string body = m.Groups["body"].Value;
                    deleteTag(ref body);
                    Console.WriteLine("获取正文");
                    cont.AppendLine(body);
                }

                if (hasManyPage)
                {
                    appendContinuationPages(html, cont);
                }

                cont.AppendLine("--------------------------------------------------------------");
                WriteFile(cont.ToString());
            }
            catch (Exception ex)
            {
                Console.WriteLine("异常:{0},{1}", ex.Source, ex.Message);
            }
        }

        /// <summary>Appends the body text of every not-yet-seen continuation page linked from the article's pager.</summary>
        private void appendContinuationPages(string html, StringBuilder cont)
        {
            Match pager = Regex.Match(html, this.manyPageRegex, RegexOptions.Singleline);
            if (!pager.Success)
            {
                return;
            }

            Console.WriteLine("存在多页..");
            Regex linkr = new Regex(this.pageUrlsRegex, RegexOptions.Singleline);
            foreach (Match link in linkr.Matches(pager.Groups["np"].Value))
            {
                string u = link.Groups["url"].Value;
                // Recorded only in knownUrls, not in pageUrls: a continuation page belongs
                // to this article and must not be fetched again as an article of its own.
                if (knownUrls.Add(u))
                {
                    cont.AppendLine(getNextPageContent(resolveUrl(u)));
                }
            }
        }

        /// <summary>Downloads every collected article in discovery order.</summary>
        private void startGetAll()
        {
            int total = pageUrls.Count;
            for (int i = 0; i < total; i++)
            {
                getContent(resolveUrl(pageUrls[i]), i + 1, total);
            }
        }

        /// <summary>
        /// Scans one list page for article links and adds the new ones to the work list.
        /// </summary>
        /// <param name="pageIndex">1-based list page number, or -1 to scan <see cref="prefix"/> itself.</param>
        private void getPageUrls(int pageIndex)
        {
            string url = pageIndex == -1
                ? prefix
                : string.Format("{0}{1}", this.pageUrl, pageIndex);
            Console.WriteLine(url);

            try
            {
                string html = downloadPage(url);
                Regex linkr = new Regex(this.pageUrlsRegex, RegexOptions.Singleline);
                foreach (Match link in linkr.Matches(html))
                {
                    string found = link.Groups["url"].Value;
                    if (knownUrls.Add(found))
                    {
                        pageUrls.Add(found);
                    }
                }

                Console.WriteLine("{0}:{1} articles.", this.hostName, pageUrls.Count);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }

            Console.WriteLine("{0} end!", this.hostName);
        }
    }
}
<?xml version="1.0" encoding="utf-8" ?>
<!--
  .NET application configuration fragment; presumably meant to be saved as the
  program's App.config (the post does not say so explicitly).
  useUnsafeHeaderParsing="true" relaxes HttpWebRequest's validation of response
  headers. NOTE(review): presumably needed because one of the crawled sites sends
  malformed headers; confirm before removing.
-->
<configuration>
<system.net>
<settings>
<httpWebRequest useUnsafeHeaderParsing= "true" />
</settings>
</system.net>
</configuration>