Discuz 论坛通用的搜索 URL 格式:
论坛搜索页地址 + ?mod=my&q=关键字(关键字需按 gb2312 编码后再进行 URL 编码)
例如: 'http://bbs.anzhi.com/search.php?mod=my&q=htc+%CA%D6%BB%FA';
而真实的 sId 值保存在响应头(HTTP header)的 Location 字段中。
例如:location=http://search.bbs.hiapk.com/f/search?q=%E6%89%8B%E6%9C%BA&sId=8520930&ts=1355197250&mySign=5785b7cb&searchLevel=3&menu=1&rfh=1&qs=txt.tsort.a&orderField=posted&orderType=desc
// Build the forum search URL for the keyword, then let GetHteml walk the
// redirect chain and hand back the HTML of the result page.
const string searchUrlFormat = "http://bbs.hiapk.com/search.php?mod=my&q={0}";
string url = string.Format(searchUrlFormat, crawlerModel.Keyword);
HTMLContent = GetHteml(url);
/// <summary>
/// Runs the three-request search flow and returns the HTML of the result list
/// reached through the "按时间排序" (sort by time) link.
/// </summary>
/// <remarks>
/// 1. GET <paramref name="url"/> with automatic redirects switched off and read the
///    address from the "location" response header.
/// 2. GET that address and locate the "按时间排序" link in the returned page.
/// 3. GET the link target on http://search.bbs.anzhi.com and return its body.
/// All three requests share one <see cref="CookieContainer"/>, so cookies set by an
/// earlier response are available to the later requests; no Cookie header is added by hand.
/// </remarks>
/// <param name="url">Address of the first request (the forum search URL).</param>
/// <returns>
/// The body of the third response decoded as UTF-8, or "" when any request throws,
/// the first response carries no "location" header, or the sort link (or its href)
/// cannot be found.
/// </returns>
private string GetHteml(string url)
{
    CookieContainer cc = new CookieContainer();
    try
    {
        // Request 1: only the redirect target is of interest, the body is never read.
        string redirect;
        using (HttpWebResponse response = OpenGetResponse(url, cc))
        {
            redirect = response.Headers["location"];
        }
        if (string.IsNullOrEmpty(redirect))
        {
            return "";
        }

        // Request 2: the search page that contains the "sort by time" link.
        string searchPage = ReadUtf8Body(redirect, cc);

        // NOTE(review): the pattern found here was just the link text, from which GetURL
        // can never extract an href. It is rebuilt to capture the opening <a ...> tag that
        // directly precedes the text — confirm against the live page markup.
        string pattern = @"<a\b[^>]*>\s*按时间排序";
        Match sortLink = Regex.Match(searchPage, pattern);
        if (!sortLink.Success)
        {
            return "";
        }

        // The href sits inside HTML, so "&" arrives as the entity "&amp;".
        string href = GetURL(sortLink.Value.Replace("&amp;", "&"));
        if (href.Length == 0)
        {
            return "";
        }

        // Request 3: the time-sorted result list.
        // NOTE(review): the host is hard-coded; the usage example next to this method
        // searches bbs.hiapk.com — confirm the host matches the forum being crawled.
        return ReadUtf8Body("http://search.bbs.anzhi.com" + href, cc);
    }
    catch (Exception)
    {
        // Best effort: any failure in the chain is reported as an empty page.
        return "";
    }
}

/// <summary>
/// Sends a keep-alive GET to <paramref name="address"/> without following redirects.
/// The caller owns (and must dispose) the returned response.
/// </summary>
private static HttpWebResponse OpenGetResponse(string address, CookieContainer cookies)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
    request.Method = "GET";
    request.KeepAlive = true;
    request.CookieContainer = cookies;
    request.AllowAutoRedirect = false;
    return (HttpWebResponse)request.GetResponse();
}

/// <summary>
/// GETs <paramref name="address"/> and returns the whole response body decoded as UTF-8,
/// disposing the response and the reader even when reading fails.
/// </summary>
private static string ReadUtf8Body(string address, CookieContainer cookies)
{
    using (HttpWebResponse response = OpenGetResponse(address, cookies))
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Pulls the value of the first <c>f="..."</c> attribute fragment (in practice the tail
/// of <c>href="..."</c>) out of a piece of HTML.
/// </summary>
/// <param name="urlHtml">HTML fragment to search.</param>
/// <returns>The text between the quotes, or "" when no such fragment exists.</returns>
private string GetURL(string urlHtml)
{
    Match attribute = Regex.Match(urlHtml, @"f="".*?""");
    if (!attribute.Success)
    {
        return "";
    }

    // Strip the leading f=" (3 characters) and the closing quote (1 character).
    string quoted = attribute.Value;
    return quoted.Substring(3, quoted.Length - 4);
}
小例子:
using System;
using System.Collections.Generic;
using IWOMWebCrawlerDbLayer.DAL;
using IWOMWebCrawlerDbLayer.Model;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Threading;
using IWOMWebCrawlerDbLayer.Common;
using HtmlAgilityPack;
using System.Net;
using System.IO;
namespace IWOMWebCrawlerApp.Crawler
{
public class tousue_teizi : AbstractSearchEngine
{
/// <summary>
/// Sets the search id (1600) and the display name of this crawler.
/// </summary>
public tousue_teizi()
{
    SearchID = 1600;
    SearchName = "投诉易--帖子";
}

// Page number appended as the "page" query parameter of the final search request.
// Starts at 0; within this class only initTestUrl assigns a different value.
private int pageId;
/// <summary>
/// Builds the crawl parameters (keyword, page size, position) for a task.
/// </summary>
/// <param name="taskItem">Task whose KeyWord, GetItems and Task_Postion are read.</param>
protected override void initCrawlerModel(IwomTask taskItem)
{
    crawlerModel = new CrawlerModel();

    // Keyword
    crawlerModel.Keyword = CommonFunction.AssembledKeyword(taskItem.KeyWord, IWOMWebCrawlerDbLayer.Common.KeyWordUrlEncode.Normal, false);

    // Page size: the number of items requested by the task, capped at 10.
    // The cap must be tested against the task's value; the model was created a moment
    // ago, so its own PageSize says nothing about what the task asked for.
    var requestedItems = taskItem.GetItems;
    crawlerModel.PageSize = requestedItems > 10 ? 10 : requestedItems;

    crawlerModel.Postion = taskItem.Task_Postion;
}
/// <summary>
/// Builds the search URL for the current keyword.
/// </summary>
/// <param name="pageIndex">Zero-based page index.</param>
/// <returns>The search URL for page index 0; "" for every other index.</returns>
protected override string createUrl(int pageIndex)
{
    if (pageIndex == 0)
    {
        return string.Format("http://www.tousue.com/search.php?mod=my&q={0}", crawlerModel.Keyword);
    }

    return "";
}
/// <summary>
/// Pause inserted after each crawled page: blocks the calling thread for 500 ms.
/// </summary>
protected override void PageSleep()
{
    const int pauseMilliseconds = 500;
    Thread.Sleep(pauseMilliseconds);
}
/// <summary>
/// Encoding of the crawled pages.
/// </summary>
/// <returns>The encoding registered under the name "utf-8".</returns>
protected override Encoding getPageEncoding()
{
    Encoding pageEncoding = Encoding.GetEncoding("utf-8");
    return pageEncoding;
}
/// <summary>
/// Ban check based on the page content. This crawler does not inspect the content.
/// </summary>
/// <param name="HTMLContent">Downloaded page; not examined.</param>
/// <returns>
/// Always true. NOTE(review): whether true means "blocked" or "content is usable" is
/// decided by AbstractSearchEngine — confirm there.
/// </returns>
protected override bool checkContentIsForbat(string HTMLContent)
{
    const bool result = true;
    return result;
}
/// <summary>
/// Last-page check based on the page content. This crawler does not inspect the content.
/// </summary>
/// <param name="HTMLContent">Downloaded page; not examined.</param>
/// <returns>Always false.</returns>
protected override bool checkContentIsLastPage(string HTMLContent)
{
    const bool isLastPage = false;
    return isLastPage;
}
/// <summary>
/// Produces the article list for the current keyword.
/// </summary>
/// <remarks>
/// The HTML passed in is not used: the result page is downloaded again through
/// GetHteml (search URL for the keyword plus the current pageId) and that HTML is parsed.
/// NOTE(review): the return type appears to have lost its generic type argument in this
/// copy of the file — restore it from the AbstractSearchEngine declaration.
/// </remarks>
/// <param name="HTMLContent">Overwritten with the HTML returned by GetHteml.</param>
/// <param name="task_ID">Task id passed through to CommonFunction.GetListByXPath.</param>
/// <returns>The list returned by CommonFunction.GetListByXPath.</returns>
protected override List GetArticleByHtml(string HTMLContent, int task_ID)
{
    HTMLContent = GetHteml(
        string.Format("http://www.tousue.com/search.php?mod=my&q={0}", crawlerModel.Keyword),
        pageId);

    // XPaths are relative: list items under the container, then title/link/time per item.
    XPathModel model = new XPathModel
    {
        listXPath = "./ul/li",
        titleXPath = "./h3/a",
        urlXPath = "./h3/a",
        timeXPath = "./p[3]"
    };

    return CommonFunction.GetListByXPath(HTMLContent, task_ID, "result-items", model, TitleMethod, CommonFunction.UrlMethod, TimeMethod);
}
/// <summary>
/// Extracts the value of the first <c>f="..."</c> attribute fragment (in practice the
/// tail of <c>href="..."</c>) from a piece of HTML.
/// </summary>
/// <param name="urlHtml">HTML fragment to search.</param>
/// <returns>The text between the quotes, or "" when no such fragment exists.</returns>
private string GetURL(string urlHtml)
{
    Match attribute = Regex.Match(urlHtml, @"f="".*?""");
    if (!attribute.Success)
    {
        return "";
    }

    // Drop the leading f=" (3 characters) and the closing quote (1 character).
    string quoted = attribute.Value;
    return quoted.Substring(3, quoted.Length - 4);
}
/// <summary>
/// Runs the three-request search flow and returns the HTML of one page of the result
/// list reached through the "按时间排序" (sort by time) link.
/// </summary>
/// <remarks>
/// 1. GET <paramref name="url"/> with automatic redirects switched off and read the
///    address from the "location" response header.
/// 2. GET that address and locate the "按时间排序" link in the returned page.
/// 3. GET the link target on http://search.discuz.qq.com with
///    <c>&amp;page=</c><paramref name="pageId"/> appended, and return its body.
/// All three requests share one <see cref="CookieContainer"/>, so cookies set by an
/// earlier response are available to the later requests; no Cookie header is added by hand.
/// </remarks>
/// <param name="url">Address of the first request (the forum search URL).</param>
/// <param name="pageId">Value of the "page" query parameter of the third request.</param>
/// <returns>
/// The body of the third response decoded as UTF-8, or "" when any request throws,
/// the first response carries no "location" header, or the sort link (or its href)
/// cannot be found.
/// </returns>
private string GetHteml(string url, int pageId)
{
    CookieContainer cc = new CookieContainer();
    try
    {
        // Request 1: only the redirect target is of interest, the body is never read.
        string redirect;
        using (HttpWebResponse response = OpenGetResponse(url, cc))
        {
            redirect = response.Headers["location"];
        }
        if (string.IsNullOrEmpty(redirect))
        {
            return "";
        }

        // Request 2: the search page that contains the "sort by time" link.
        string searchPage = ReadUtf8Body(redirect, cc);

        // NOTE(review): the pattern found here was just the link text, from which GetURL
        // can never extract an href. It is rebuilt to capture the opening <a ...> tag that
        // directly precedes the text — confirm against the live page markup.
        string pattern = @"<a\b[^>]*>\s*按时间排序";
        Match sortLink = Regex.Match(searchPage, pattern);
        if (!sortLink.Success)
        {
            return "";
        }

        // The href sits inside HTML, so "&" arrives as the entity "&amp;".
        string href = GetURL(sortLink.Value.Replace("&amp;", "&"));
        if (href.Length == 0)
        {
            return "";
        }

        // Request 3: the requested page of the time-sorted result list.
        return ReadUtf8Body("http://search.discuz.qq.com" + href + "&page=" + pageId.ToString(), cc);
    }
    catch (Exception)
    {
        // Best effort: any failure in the chain is reported as an empty page.
        return "";
    }
}

/// <summary>
/// Sends a keep-alive GET to <paramref name="address"/> without following redirects.
/// The caller owns (and must dispose) the returned response.
/// </summary>
private static HttpWebResponse OpenGetResponse(string address, CookieContainer cookies)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
    request.Method = "GET";
    request.KeepAlive = true;
    request.CookieContainer = cookies;
    request.AllowAutoRedirect = false;
    return (HttpWebResponse)request.GetResponse();
}

/// <summary>
/// GETs <paramref name="address"/> and returns the whole response body decoded as UTF-8,
/// disposing the response and the reader even when reading fails.
/// </summary>
private static string ReadUtf8Body(string address, CookieContainer cookies)
{
    using (HttpWebResponse response = OpenGetResponse(address, cookies))
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Cleans a raw title: when the text contains a digit followed by "回答" (answers),
/// everything after the first such marker is returned.
/// </summary>
/// <param name="content">Raw title text.</param>
/// <returns>The text after the marker, or <paramref name="content"/> unchanged when
/// the marker is absent or nothing follows it.</returns>
private string TitleMethod(string content)
{
    Match afterAnswerCount = Regex.Match(content, @"(?<=[\d]回答).+");
    return afterAnswerCount.Success ? afterAnswerCount.Value : content;
}
/// <summary>
/// Extracts the publication time from a piece of text.
/// </summary>
/// <remarks>
/// CommonFunction.GetTimeByChinese is tried first. If it yields DateTime.MinValue, the
/// first date shaped like yyyy-M-d in the text is parsed with DateTime.TryParse.
/// </remarks>
/// <param name="content">Text that may contain a date.</param>
/// <returns>The recognised time, or DateTime.MinValue when neither approach succeeds.</returns>
private DateTime TimeMethod(string content)
{
    DateTime fromChinese = CommonFunction.GetTimeByChinese(content);
    if (fromChinese != DateTime.MinValue)
    {
        return fromChinese;
    }

    Match dateText = Regex.Match(content, @"[\d]{4}-[\d]{1,2}-[\d]{1,2}");

    // TryParse leaves DateTime.MinValue in the out variable when the text is not a date
    // (including the empty string used when nothing matched).
    DateTime parsed;
    DateTime.TryParse(dateText.Success ? dateText.Value : "", out parsed);
    return parsed;
}
/// <summary>
/// Prepares a test run: sets the Hase* expectation members, selects page 2, uses the
/// keyword "海尔" and returns the matching search URL.
/// </summary>
/// <returns>The search URL for the test keyword.</returns>
protected override string initTestUrl()
{
    HaseCreateTime = true;
    HasePageSize = 10;
    HaseSiteName = true;
    HaseSummary = true;
    pageId = 2;
    HaseAuthor = true;

    // Assumes crawlerModel has already been created when this runs — TODO confirm.
    crawlerModel.Keyword = CommonFunction.AssembledKeyword("海尔", IWOMWebCrawlerDbLayer.Common.KeyWordUrlEncode.Normal, false);

    return string.Format("http://www.tousue.com/search.php?mod=my&q={0}", crawlerModel.Keyword);
}
}
}