discuz论坛的抓取

discuz论坛通用URL地址:

论坛地址 + /search.php?mod=my&q=关键字(关键字需按 gb2312 进行 URL 编码)

例如:http://bbs.anzhi.com/search.php?mod=my&q=htc+%CA%D6%BB%FA (其中 %CA%D6%BB%FA 是“手机”的 gb2312 编码)

请求该地址后,服务器会返回一次跳转;真实的 sId 值保存在响应头(header)的 Location 字段中,需要从中取出带 sId 的真实搜索地址。

例如:location=http://search.bbs.hiapk.com/f/search?q=%E6%89%8B%E6%9C%BA&sId=8520930&ts=1355197250&mySign=5785b7cb&searchLevel=3&menu=1&rfh=1&qs=txt.tsort.a&orderField=posted&orderType=desc

           // Build the forum's search entry URL for the prepared keyword.
           string url = string.Format("http://bbs.hiapk.com/search.php?mod=my&q={0}", crawlerModel.Keyword);
           // GetHteml reads the Location header of this first response and fetches the
           // follow-up pages itself; it returns "" when any step fails.
          HTMLContent= GetHteml(url);

/// <summary>
/// Fetches the time-sorted search result page that a Discuz! search entry URL leads to.
/// </summary>
/// <remarks>
/// Three GET requests are issued with automatic redirects disabled, sharing one cookie container:
/// 1. the entry URL, whose Location response header carries the real search address;
/// 2. that address, whose HTML contains the "按时间排序" (sort by time) link;
/// 3. the sort-by-time link, whose HTML is returned.
/// </remarks>
/// <param name="url">Absolute search entry URL.</param>
/// <returns>The HTML of the sorted result page, or "" when any step fails.</returns>
private string GetHteml(string url)
{
    // Anchor tag whose text starts with the "sort by time" label.
    // NOTE(review): the published listing showed only the bare label as the pattern, which
    // contains no href (so GetURL always produced ""); the anchor markup was presumably
    // stripped when the post was rendered. Confirm against the live page markup.
    const string sortByTimeLinkPattern = @"<a\b[^>]*>\s*按时间排序";

    // NOTE(review): the sample call above queries bbs.hiapk.com while this prefix points at
    // the anzhi search host - confirm which forum this copy is meant for.
    const string searchHost = "http://search.bbs.anzhi.com";

    CookieContainer cookies = new CookieContainer();
    string cookieHeader = string.Empty;
    try
    {
        // Hop 1: the entry page answers with a redirect; only its Location header is needed.
        Uri entryUri = new Uri(url);
        Uri redirectUri;
        using (HttpWebResponse entryResponse = OpenGet(entryUri, cookies, cookieHeader))
        {
            string location = entryResponse.Headers["location"];
            if (string.IsNullOrEmpty(location))
            {
                // No redirect was offered, so there is no search address to follow.
                return "";
            }
            // Resolves a relative Location against the entry URL; an absolute one is used as is.
            redirectUri = new Uri(entryUri, location);
        }
        cookieHeader = cookies.GetCookieHeader(entryUri);

        // Hop 2: load the result page and locate the sort-by-time link in it.
        string resultPage;
        using (HttpWebResponse resultResponse = OpenGet(redirectUri, cookies, cookieHeader))
        {
            resultPage = ReadUtf8Body(resultResponse);
        }
        cookieHeader = cookies.GetCookieHeader(redirectUri);

        Match sortLink = Regex.Match(resultPage, sortByTimeLinkPattern);
        if (!sortLink.Success)
        {
            return "";
        }
        // The href is HTML-escaped inside the page; undo the ampersand entity before use.
        string sortedPath = GetURL(sortLink.Value.Replace("&amp;", "&"));
        if (string.IsNullOrEmpty(sortedPath))
        {
            return "";
        }

        // Hop 3: the time-sorted result page is the final content.
        using (HttpWebResponse sortedResponse = OpenGet(new Uri(searchHost + sortedPath), cookies, cookieHeader))
        {
            return ReadUtf8Body(sortedResponse);
        }
    }
    catch (Exception)
    {
        // Deliberate best effort: callers treat "" as "no content", so network, URI and
        // stream failures from any of the three hops are all reported the same way.
        return "";
    }
}

/// <summary>
/// Sends one GET request without following redirects and returns its response.
/// The caller owns the response and must dispose it.
/// </summary>
private static HttpWebResponse OpenGet(Uri target, CookieContainer cookies, string cookieHeader)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(target);
    request.Method = "GET";
    request.KeepAlive = true;
    // Kept from the original flow: the previous hop's cookie string is also set by hand.
    // NOTE(review): likely redundant once CookieContainer is assigned - confirm before removing.
    request.Headers.Add("Cookie:" + cookieHeader);
    request.CookieContainer = cookies;
    request.AllowAutoRedirect = false;
    return (HttpWebResponse)request.GetResponse();
}

/// <summary>
/// Reads the whole response body as UTF-8 text and closes the reader.
/// </summary>
private static string ReadUtf8Body(HttpWebResponse response)
{
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
    {
        return reader.ReadToEnd();
    }
}

       /// <summary>
       /// Extracts the value of the first attribute written as f="..." (e.g. href="...")
       /// from an HTML fragment.
       /// </summary>
       /// <param name="urlHtml">HTML fragment to search.</param>
       /// <returns>The text between the quotes, or "" when no such attribute is found.</returns>
       private string GetURL(string urlHtml)
       {
           Match firstAttribute = Regex.Match(urlHtml, @"f="".*?""");
           if (!firstAttribute.Success)
           {
               return "";
           }

           string quoted = firstAttribute.Value;
           // Skip the leading f=" (3 characters) and drop the closing quote.
           return quoted.Substring(3, quoted.Length - 4);
       }

小例子:

using System;
using System.Collections.Generic;
using IWOMWebCrawlerDbLayer.DAL;
using IWOMWebCrawlerDbLayer.Model;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Threading;
using IWOMWebCrawlerDbLayer.Common;
using HtmlAgilityPack;
using System.Net;
using System.IO;

namespace IWOMWebCrawlerApp.Crawler
{
    /// <summary>
    /// Search crawler for thread results on www.tousue.com (search name "投诉易--帖子").
    /// </summary>
    /// <remarks>
    /// The HTML handed to <c>GetArticleByHtml</c> is not parsed directly. The method
    /// re-requests the search through <c>GetHteml</c>, which reads the redirect issued by the
    /// forum's search.php, loads the redirected result page and then follows its
    /// "按时间排序" (sort by time) link.
    /// </remarks>
    public class tousue_teizi : AbstractSearchEngine
    {
        // Forum search entry point; {0} is the prepared keyword.
        private const string SearchUrlFormat = "http://www.tousue.com/search.php?mod=my&q={0}";

        // Host prefixed to the relative sort-by-time link found in the result page.
        private const string SearchHost = "http://search.discuz.qq.com";

        // Anchor tag whose text starts with the "sort by time" label.
        // NOTE(review): the published listing showed only the bare label as the pattern, which
        // contains no href (so GetURL always produced ""); the anchor markup was presumably
        // stripped when the post was rendered. Confirm against the live page markup.
        private const string SortByTimeLinkPattern = @"<a\b[^>]*>\s*按时间排序";

        // Page number appended to the sorted result URL; only initTestUrl changes it.
        private int pageId = 0;

        public tousue_teizi()
        {
            this.SearchID = 1600;
            this.SearchName = "投诉易--帖子";
        }

        /// <summary>
        /// Builds the crawl settings (keyword, page size, position) from a task.
        /// </summary>
        protected override void initCrawlerModel(IwomTask taskItem)
        {
            crawlerModel = new CrawlerModel();
            crawlerModel.Keyword = CommonFunction.AssembledKeyword(taskItem.KeyWord, IWOMWebCrawlerDbLayer.Common.KeyWordUrlEncode.Normal, false);
            // NOTE(review): this compares the PageSize of the model created two lines above,
            // not taskItem.GetItems; a cap of 10 on the task value may have been intended.
            // Left unchanged because CrawlerModel's default PageSize is not visible here.
            crawlerModel.PageSize = crawlerModel.PageSize > 10 ? 10 : taskItem.GetItems;
            crawlerModel.Postion = taskItem.Task_Postion;
        }

        /// <summary>
        /// Returns the search entry URL for the first page and "" for every other page index.
        /// </summary>
        protected override string createUrl(int pageIndex)
        {
            if (pageIndex != 0)
            {
                return "";
            }
            return BuildSearchUrl();
        }

        /// <summary>
        /// Pauses for 500 ms between page fetches.
        /// </summary>
        protected override void PageSleep()
        {
            Thread.Sleep(500);
        }

        /// <summary>
        /// Encoding of the fetched pages (UTF-8).
        /// </summary>
        protected override Encoding getPageEncoding()
        {
            return Encoding.GetEncoding("utf-8");
        }

        /// <summary>
        /// Content-based ban check; always returns true.
        /// NOTE(review): what true means to the base class is not visible here - confirm.
        /// </summary>
        protected override bool checkContentIsForbat(string HTMLContent)
        {
            return true;
        }

        /// <summary>
        /// Content-based last-page check; always returns false.
        /// </summary>
        protected override bool checkContentIsLastPage(string HTMLContent)
        {
            return false;
        }

        /// <summary>
        /// Fetches the time-sorted result page for the current keyword and parses its entries.
        /// The incoming <paramref name="HTMLContent"/> is discarded and replaced by that page.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the generic type argument of the List return type appears to have been
        /// stripped when this listing was published; restore it from the base class declaration.
        /// </remarks>
        protected override List GetArticleByHtml(string HTMLContent, int task_ID)
        {
            HTMLContent = GetHteml(BuildSearchUrl(), pageId);

            string mainId = "result-items";
            XPathModel model = new XPathModel();
            model.listXPath = "./ul/li";
            model.titleXPath = "./h3/a";
            model.urlXPath = "./h3/a";
            model.timeXPath = "./p[3]";
            return CommonFunction.GetListByXPath(HTMLContent, task_ID, mainId, model, TitleMethod, CommonFunction.UrlMethod, TimeMethod);
        }

        /// <summary>
        /// Extracts the value of the first attribute written as f="..." (e.g. href="...")
        /// from an HTML fragment; returns "" when there is none.
        /// </summary>
        private string GetURL(string urlHtml)
        {
            Match firstAttribute = Regex.Match(urlHtml, @"f="".*?""");
            if (!firstAttribute.Success)
            {
                return "";
            }
            string quoted = firstAttribute.Value;
            // Skip the leading f=" (3 characters) and drop the closing quote.
            return quoted.Substring(3, quoted.Length - 4);
        }

        /// <summary>
        /// Walks the search redirect chain and returns the time-sorted result page.
        /// </summary>
        /// <remarks>
        /// Three GET requests are issued with automatic redirects disabled, sharing one cookie
        /// container: the entry URL (only its Location header is used), the redirected result
        /// page (searched for the sort-by-time link), and that link with "&amp;page=N" appended.
        /// </remarks>
        /// <param name="url">Absolute search entry URL.</param>
        /// <param name="pageId">Page number appended to the sorted result URL.</param>
        /// <returns>The HTML of the sorted result page, or "" when any step fails.</returns>
        private string GetHteml(string url, int pageId)
        {
            CookieContainer cookies = new CookieContainer();
            string cookieHeader = string.Empty;
            try
            {
                // Hop 1: the entry page answers with a redirect; only its Location header is needed.
                Uri entryUri = new Uri(url);
                Uri redirectUri;
                using (HttpWebResponse entryResponse = SendGet(entryUri, cookies, cookieHeader))
                {
                    string location = entryResponse.Headers["location"];
                    if (string.IsNullOrEmpty(location))
                    {
                        // No redirect was offered, so there is no search address to follow.
                        return "";
                    }
                    // Resolves a relative Location against the entry URL; an absolute one is used as is.
                    redirectUri = new Uri(entryUri, location);
                }
                cookieHeader = cookies.GetCookieHeader(entryUri);

                // Hop 2: load the result page and locate the sort-by-time link in it.
                string resultPage;
                using (HttpWebResponse resultResponse = SendGet(redirectUri, cookies, cookieHeader))
                {
                    resultPage = ReadBody(resultResponse);
                }
                cookieHeader = cookies.GetCookieHeader(redirectUri);

                Match sortLink = Regex.Match(resultPage, SortByTimeLinkPattern);
                if (!sortLink.Success)
                {
                    return "";
                }
                // The href is HTML-escaped inside the page; undo the ampersand entity before use.
                string sortedPath = GetURL(sortLink.Value.Replace("&amp;", "&"));
                if (string.IsNullOrEmpty(sortedPath))
                {
                    return "";
                }
                string sortedUrl = SearchHost + sortedPath + "&page=" + pageId.ToString();

                // Hop 3: the time-sorted result page is the final content.
                using (HttpWebResponse sortedResponse = SendGet(new Uri(sortedUrl), cookies, cookieHeader))
                {
                    return ReadBody(sortedResponse);
                }
            }
            catch (Exception)
            {
                // Deliberate best effort: callers treat "" as "no content", so network, URI and
                // stream failures from any of the three hops are all reported the same way.
                return "";
            }
        }

        /// <summary>
        /// Returns the text after the first "digit + 回答" marker on its line, or the input
        /// unchanged when the marker is absent.
        /// </summary>
        private string TitleMethod(string content)
        {
            Match afterAnswerCount = Regex.Match(content, @"(?<=[\d]回答).+");
            return afterAnswerCount.Success ? afterAnswerCount.Value : content;
        }

        /// <summary>
        /// Parses a post time. CommonFunction.GetTimeByChinese is tried first; if it yields
        /// DateTime.MinValue, the first yyyy-M-d date in the text is parsed instead.
        /// </summary>
        /// <returns>The parsed time, or DateTime.MinValue when neither approach succeeds.</returns>
        private DateTime TimeMethod(string content)
        {
            DateTime time = CommonFunction.GetTimeByChinese(content);
            if (time != DateTime.MinValue)
            {
                return time;
            }

            Match isoDate = Regex.Match(content, @"[\d]{4}-[\d]{1,2}-[\d]{1,2}");
            string candidate = isoDate.Success ? isoDate.Value : "";
            // TryParse leaves DateTime.MinValue in 'time' when the candidate cannot be parsed.
            DateTime.TryParse(candidate, out time);
            return time;
        }

        /// <summary>
        /// Configures a test run (keyword "海尔", result page 2) and returns its search URL.
        /// </summary>
        protected override string initTestUrl()
        {
            this.HaseCreateTime = true;
            this.HasePageSize = 10;
            this.HaseSiteName = true;
            this.HaseSummary = true;
            pageId = 2;
            HaseAuthor = true;
            crawlerModel.Keyword = CommonFunction.AssembledKeyword("海尔", IWOMWebCrawlerDbLayer.Common.KeyWordUrlEncode.Normal, false);
            return BuildSearchUrl();
        }

        /// <summary>
        /// Formats the forum search entry URL for the current keyword.
        /// </summary>
        private string BuildSearchUrl()
        {
            return string.Format(SearchUrlFormat, crawlerModel.Keyword);
        }

        /// <summary>
        /// Sends one GET request without following redirects and returns its response.
        /// The caller owns the response and must dispose it.
        /// </summary>
        private static HttpWebResponse SendGet(Uri target, CookieContainer cookies, string cookieHeader)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(target);
            request.Method = "GET";
            request.KeepAlive = true;
            // Kept from the original flow: the previous hop's cookie string is also set by hand.
            // NOTE(review): likely redundant once CookieContainer is assigned - confirm before removing.
            request.Headers.Add("Cookie:" + cookieHeader);
            request.CookieContainer = cookies;
            request.AllowAutoRedirect = false;
            return (HttpWebResponse)request.GetResponse();
        }

        /// <summary>
        /// Reads the whole response body as UTF-8 text and closes the reader.
        /// </summary>
        private static string ReadBody(HttpWebResponse response)
        {
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
            {
                return reader.ReadToEnd();
            }
        }
    }
}




你可能感兴趣的:(抓取)