// A small C# crawler that scrapes bid / procurement-announcement information.
using System;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.IO.Compression;
using System.Security.Cryptography.X509Certificates;
using System.Net.Security;
using System.Globalization;
namespace crawler
{
/// <summary>
/// Main window of the crawler. Downloads procurement-announcement list pages from
/// www.sczfcg.com, filters the rows by keyword and date range, and shows the hits as plain
/// text (<c>textBox1</c>) and as HTML (<c>webBrowser1</c>).
/// </summary>
public partial class JunXiang : Form
{
    /// <summary>Exclusive upper bound of the page loop when <c>textBox2</c> holds no valid number.</summary>
    private const int DefaultPageLimit = 10;

    /// <summary>List URL; the 1-based page number is appended.</summary>
    private const string ListUrlPrefix = "http://www.sczfcg.com/CmsNewsController.do?method=recommendBulletinList&moreType=provincebuyBulletinMore&channelCode=shiji_cggg&rp=25&page=";

    // NOTE(review): the next two patterns look as if HTML tags were lost from the literals
    // (as written, the second one can only produce empty matches). They are reproduced
    // unchanged because the intended tags cannot be determined from this file - restore
    // them from the original project before relying on the crawler.
    private static readonly Regex RowPattern = new Regex(@"(.|\s)*? ", RegexOptions.IgnoreCase);
    private static readonly Regex DatePattern = new Regex(@"(.|\s)*?", RegexOptions.IgnoreCase);

    /// <summary>Matches the first <c>href="..."</c> attribute of a row.</summary>
    private static readonly Regex LinkPattern = new Regex("href=[\\\"\\\'](http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?[\\\"\\\']");

    /// <summary>Matches the <c>title="..."</c> attribute that is followed by <c>target</c>.</summary>
    private static readonly Regex TitlePattern = new Regex(@" title=""(.|\s)*?"" target");

    /// <summary>Matches any markup tag; used to reduce the date cell to its text.</summary>
    private static readonly Regex TagPattern = new Regex("<[^>]+>");

    public JunXiang()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Crawls the list pages and fills <c>textBox1</c> / <c>webBrowser1</c> with every row that
    /// contains the announcement marker, a link, the keyword from <c>textBox3</c> (if any) and a
    /// date inside the range selected by the two date pickers.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        int pageLimit = ReadPageLimit();

        // Only the calendar date matters. Taking .Date avoids the original round trip through
        // ToShortDateString(), which depended on the current culture matching "yyyy-MM-dd".
        DateTime rangeStart = dateTimePicker1.Value.Date;
        DateTime rangeEnd = dateTimePicker2.Value.Date;
        string keyword = textBox3.Text;

        DateTimeFormatInfo siteDateFormat = new DateTimeFormatInfo();
        siteDateFormat.ShortDatePattern = "yyyy-MM-dd";

        webBrowser1.Document.OpenNew(false);
        textBox1.Text = "";

        HttpHelper http = new HttpHelper();
        StringBuilder htmlOut = new StringBuilder();
        StringBuilder textOut = new StringBuilder();
        bool reachedOlderEntries = false;

        // The loop header was truncated in the source ("for(int i=1;iif ..."); it is restored
        // as 1 <= page < pageLimit, which matches "page = int.Parse(...) + 1" in the original.
        for (int page = 1; page < pageLimit && !reachedOlderEntries; page++)
        {
            HttpItem item = new HttpItem()
            {
                URL = ListUrlPrefix + page,
                Method = "get",
                ContentType = "text/html",
            };
            string html = http.GetHtml(item).Html ?? string.Empty;

            foreach (Match row in RowPattern.Matches(html))
            {
                string rowHtml = row.Value;
                if (rowHtml.IndexOf("采购公告") < 0)
                {
                    continue;
                }

                MatchCollection links = LinkPattern.Matches(rowHtml);
                if (links.Count == 0)
                {
                    continue;
                }
                if (keyword.Length != 0 && rowHtml.IndexOf(keyword) < 0)
                {
                    continue;
                }

                MatchCollection dateCells = DatePattern.Matches(rowHtml);
                if (dateCells.Count == 0)
                {
                    continue; // the original indexed [0] unguarded and would have thrown here
                }

                // NOTE(review): the original called Replace("", "") here, which always throws
                // ArgumentException (empty oldValue). The lost arguments were presumably the
                // opening/closing tag of the date cell, so all markup is stripped instead - confirm.
                string dateText = TagPattern.Replace(dateCells[0].Value, "");
                DateTime published;
                if (!DateTime.TryParse(dateText, siteDateFormat, DateTimeStyles.AllowWhiteSpaces, out published))
                {
                    continue; // row without a readable date: skip it instead of aborting the whole run
                }

                // Assumes the site lists the newest entries first (the original stop flag implies
                // the same - TODO confirm). An entry older than the range therefore ends the crawl.
                // In the original this flag could never be set, because the preceding combined
                // check had already left the loop.
                if (published < rangeStart)
                {
                    reachedOlderEntries = true;
                    break;
                }
                // Entries newer than the range are skipped individually, so later rows of the
                // same page that do fall inside the range are still reported.
                if (published > rangeEnd)
                {
                    continue;
                }

                Match title = TitlePattern.Match(rowHtml);

                htmlOut.Append(rowHtml);
                textOut.Append(dateText).Append("\r\n");
                textOut.Append(links[0].Value).Append("\r\n");
                textOut.Append(title.Success ? title.Value : string.Empty).Append("\r\n");
                textOut.Append("***** 第").Append(page).Append("页 *****\r\n");
            }
        }

        // Attribute noise is removed once at the end instead of after every row.
        string text = textOut.ToString()
            .Replace("href=", "")
            .Replace("\"", "")
            .Replace(" title=", "")
            .Replace(" target", "")
            .Replace("/view", "http://www.sczfcg.com/view");

        textBox1.Text = text;
        // NOTE(review): the original appended an empty string literal to the HTML before writing
        // it; that literal probably held closing markup that was lost - confirm.
        webBrowser1.Document.Write(htmlOut.ToString().Replace("/view", "http://www.sczfcg.com/view"));
    }

    /// <summary>
    /// Reads the number of pages from <c>textBox2</c> and converts it to the exclusive upper bound
    /// of the page loop: n + 1, or 9999 when that would exceed 10000. Falls back to
    /// <see cref="DefaultPageLimit"/> when the text is not a valid integer.
    /// </summary>
    private int ReadPageLimit()
    {
        int requested;
        if (!int.TryParse(textBox2.Text, out requested))
        {
            return DefaultPageLimit;
        }
        // Comparing before adding avoids the int overflow of "requested + 1".
        return requested >= 10000 ? 9999 : requested + 1;
    }

    private void textBox1_TextChanged(object sender, EventArgs e)
    {
    }

    /// <summary>Loads a blank page so that <c>webBrowser1.Document</c> can be written to later.</summary>
    private void Form1_Load(object sender, EventArgs e)
    {
        webBrowser1.Navigate("about:blank");
        webBrowser1.Document.OpenNew(true);
    }

    private void textBox3_TextChanged(object sender, EventArgs e)
    {
    }

    private void linkLabel1_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
    {
    }

    private void textBox2_TextChanged(object sender, EventArgs e)
    {
    }

    /// <summary>Clears both result views.</summary>
    private void button2_Click(object sender, EventArgs e)
    {
        webBrowser1.Document.OpenNew(false);
        textBox1.Text = "";
        webBrowser1.Document.Write("");
    }
}
public class HttpHelper
{
#region 预定义方变量
// Encoding used to decode the response body. SetRequest overwrites it with HttpItem.Encoding
// for every request; when that is null, SetEncoding detects one from the response.
private Encoding encoding = Encoding.Default;
// Encoding used to turn POST data into bytes (replaced by HttpItem.PostEncoding when set).
private Encoding postencoding = Encoding.Default;
// The request currently being configured / sent.
private HttpWebRequest request = null;
// The response currently being read.
private HttpWebResponse response = null;
// Local endpoint (IP and port) handed out by BindIPEndPointCallback.
private IPEndPoint _IPEndPoint = null;
#endregion
#region Public
/// <summary>
/// Sends the request described by <paramref name="item"/> and collects the response.
/// </summary>
/// <param name="item">Description of the request (URL, method, headers, ...).</param>
/// <returns>
/// The collected response. If configuring the request fails, or sending it fails without a
/// server response, <see cref="HttpResult.Html"/> carries the exception message instead.
/// </returns>
public HttpResult GetHtml(HttpItem item)
{
    HttpResult outcome = new HttpResult();

    // Step 1: build and configure the request.
    try
    {
        SetRequest(item);
    }
    catch (Exception setupError)
    {
        HttpResult failed = new HttpResult();
        failed.Cookie = string.Empty;
        failed.Header = null;
        failed.Html = setupError.Message;
        failed.StatusDescription = "配置参数时出错:" + setupError.Message;
        return failed;
    }

    // Step 2: send it and read the answer.
    try
    {
        using (response = (HttpWebResponse)request.GetResponse())
        {
            GetData(item, outcome);
        }
    }
    catch (WebException webError)
    {
        if (webError.Response == null)
        {
            outcome.Html = webError.Message;
        }
        else
        {
            // The server did answer (with an error status); read that response like any other.
            using (response = (HttpWebResponse)webError.Response)
            {
                GetData(item, outcome);
            }
        }
    }
    catch (Exception otherError)
    {
        outcome.Html = otherError.Message;
    }

    if (item.IsToLower)
    {
        outcome.Html = outcome.Html.ToLower();
    }
    return outcome;
}
#endregion
#region GetData
/// <summary>
/// Copies status, headers, cookies and the decoded body of the current response into
/// <paramref name="result"/>. Does nothing when there is no current response.
/// </summary>
/// <param name="item">The request description; passed on to <c>SetEncoding</c>.</param>
/// <param name="result">The object that receives the response data.</param>
private void GetData(HttpItem item, HttpResult result)
{
    if (response == null)
    {
        return;
    }

    // Status line, headers and the final URI.
    result.StatusCode = response.StatusCode;
    result.StatusDescription = response.StatusDescription;
    result.Header = response.Headers;
    result.ResponseUri = response.ResponseUri.ToString();

    // Cookies: both the parsed collection and the raw Set-Cookie header, when present.
    CookieCollection cookies = response.Cookies;
    if (cookies != null)
    {
        result.CookieCollection = cookies;
    }
    string rawSetCookie = response.Headers["set-cookie"];
    if (rawSetCookie != null)
    {
        result.Cookie = rawSetCookie;
    }

    // Body: raw bytes first, then text in the configured or detected encoding.
    byte[] payload = GetByte();
    if (payload == null || payload.Length == 0)
    {
        result.Html = string.Empty;
        return;
    }
    SetEncoding(item, result, payload);
    result.Html = encoding.GetString(payload);
}
/// <summary>
/// Stores the raw bytes in <paramref name="result"/> when requested and, if the caller did not
/// specify an encoding, picks one for decoding the body.
/// </summary>
/// <remarks>
/// Detection order: the charset declared in an HTML meta tag, then the character set reported
/// by the response, then UTF-8. A name that .NET does not recognise no longer raises an
/// exception from the fallback path; the next candidate is tried instead.
/// </remarks>
/// <param name="item">The request description (read for <c>ResultType</c>).</param>
/// <param name="result">Receives <c>ResultByte</c> when <c>ResultType.Byte</c> is requested.</param>
/// <param name="ResponseByte">The raw response body.</param>
private void SetEncoding(HttpItem item, HttpResult result, byte[] ResponseByte)
{
    if (item.ResultType == ResultType.Byte)
    {
        result.ResultByte = ResponseByte;
    }

    // An explicitly configured encoding (HttpItem.Encoding) always wins.
    if (encoding != null)
    {
        return;
    }

    // NOTE(review): the pattern literal was missing from the source (the line held an
    // unterminated string). It is restored as a meta-charset pattern whose first group holds
    // the charset, which is what the Groups[1] access and the quote/semicolon clean-up below
    // expect - confirm against the original helper.
    Match meta = Regex.Match(Encoding.Default.GetString(ResponseByte), "<meta[^<]*charset=([^<]*)[\"']", RegexOptions.IgnoreCase);
    string declared = meta.Success ? meta.Groups[1].Value.ToLower().Trim() : string.Empty;

    Encoding detected = null;
    if (declared.Length > 2)
    {
        // Pages declaring iso-8859-1 are decoded as gbk, as in the original code.
        string cleaned = declared.Replace("\"", string.Empty).Replace("'", "").Replace(";", "").Replace("iso-8859-1", "gbk").Trim();
        detected = TryGetEncoding(cleaned);
    }
    if (detected == null)
    {
        detected = TryGetEncoding(response.CharacterSet);
    }
    encoding = detected ?? Encoding.UTF8;
}

/// <summary>Returns the encoding with the given name, or null if the name is empty or unknown.</summary>
private static Encoding TryGetEncoding(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }
    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        return null;
    }
    catch (NotSupportedException)
    {
        return null;
    }
}
/// <summary>
/// Reads the complete body of the current response, decompressing it when the response
/// declares gzip content encoding.
/// </summary>
/// <returns>The body bytes; an empty array when there is no body.</returns>
private byte[] GetByte()
{
    // Both the network stream and the decompression wrapper are disposed here; the original
    // left them open until the response itself was disposed.
    using (Stream body = response.GetResponseStream())
    using (MemoryStream buffer = new MemoryStream())
    {
        if (body == null)
        {
            return buffer.ToArray();
        }

        bool isGzip = "gzip".Equals(response.ContentEncoding, StringComparison.OrdinalIgnoreCase);
        if (isGzip)
        {
            using (GZipStream unzip = new GZipStream(body, CompressionMode.Decompress))
            {
                unzip.CopyTo(buffer, 10240);
            }
        }
        else
        {
            body.CopyTo(buffer, 10240);
        }
        return buffer.ToArray();
    }
}
#endregion
#region SetRequest
/// <summary>
/// Creates the HttpWebRequest for <paramref name="item"/> and copies every setting of the item
/// onto it. The order matters: SetCer creates the request object, and SetPostData (which
/// writes the request body) runs after the headers and most other settings have been applied.
/// </summary>
/// <param name="item">The request description.</param>
private void SetRequest(HttpItem item)
{
// Create the request (and attach client certificates, if any).
SetCer(item);
if (item.IPEndPoint != null)
{
_IPEndPoint = item.IPEndPoint;
// Bind outgoing connections to the requested local IP and port.
request.ServicePoint.BindIPEndPointDelegate = new BindIPEndPoint(BindIPEndPointCallback);
}
// Copy the custom headers.
if (item.Header != null && item.Header.Count > 0) foreach (string key in item.Header.AllKeys)
{
request.Headers.Add(key, item.Header[key]);
}
// Proxy settings.
SetProxy(item);
if (item.ProtocolVersion != null) request.ProtocolVersion = item.ProtocolVersion;
request.ServicePoint.Expect100Continue = item.Expect100Continue;
// HTTP method (GET, POST, ...).
request.Method = item.Method;
request.Timeout = item.Timeout;
request.KeepAlive = item.KeepAlive;
request.ReadWriteTimeout = item.ReadWriteTimeout;
if (!string.IsNullOrWhiteSpace(item.Host))
{
request.Host = item.Host;
}
if (item.IfModifiedSince != null) request.IfModifiedSince = Convert.ToDateTime(item.IfModifiedSince);
// Accept header.
request.Accept = item.Accept;
// Content-Type header of the request.
request.ContentType = item.ContentType;
// User-Agent header.
request.UserAgent = item.UserAgent;
// Response encoding; may be null, in which case SetEncoding detects it from the response.
encoding = item.Encoding;
// Credentials.
request.Credentials = item.ICredentials;
// Cookies.
SetCookie(item);
// Referer header.
request.Referer = item.Referer;
// Whether redirects are followed automatically.
request.AllowAutoRedirect = item.Allowautoredirect;
if (item.MaximumAutomaticRedirections > 0)
{
request.MaximumAutomaticRedirections = item.MaximumAutomaticRedirections;
}
// Write the request body (non-GET methods only).
SetPostData(item);
// Maximum number of connections for this service point.
if (item.Connectionlimit > 0) request.ServicePoint.ConnectionLimit = item.Connectionlimit;
}
/// <summary>
/// Creates the request object for <c>item.URL</c> and attaches client certificates: the
/// collection from the item and, when <c>CerPath</c> is set, the certificate file it names.
/// </summary>
/// <param name="item">The request description.</param>
private void SetCer(HttpItem item)
{
    bool hasCertificateFile = !string.IsNullOrWhiteSpace(item.CerPath);

    if (hasCertificateFile)
    {
        // Must be installed before the request is created.
        // NOTE(review): this replaces the process-wide validation callback with
        // CheckValidationResult, which accepts every server certificate.
        ServicePointManager.ServerCertificateValidationCallback = new System.Net.Security.RemoteCertificateValidationCallback(CheckValidationResult);
    }

    request = (HttpWebRequest)WebRequest.Create(item.URL);
    SetCerList(item);

    if (hasCertificateFile)
    {
        request.ClientCertificates.Add(new X509Certificate(item.CerPath));
    }
}
/// <summary>
/// Adds every certificate of <c>item.ClentCertificates</c> to the current request.
/// </summary>
/// <param name="item">The request description.</param>
private void SetCerList(HttpItem item)
{
    X509CertificateCollection extras = item.ClentCertificates;
    if (extras == null || extras.Count == 0)
    {
        return;
    }

    foreach (X509Certificate certificate in extras)
    {
        request.ClientCertificates.Add(certificate);
    }
}
/// <summary>
/// Applies the cookie settings of <paramref name="item"/>: the raw Cookie header and, when the
/// item asks for a cookie collection, a fresh cookie container seeded with the item's cookies.
/// </summary>
/// <param name="item">The request description.</param>
private void SetCookie(HttpItem item)
{
    string rawCookie = item.Cookie;
    if (!string.IsNullOrEmpty(rawCookie))
    {
        request.Headers[HttpRequestHeader.Cookie] = rawCookie;
    }

    if (item.ResultCookieType != ResultCookieType.CookieCollection)
    {
        return;
    }

    request.CookieContainer = new CookieContainer();
    CookieCollection seed = item.CookieCollection;
    if (seed != null && seed.Count > 0)
    {
        request.CookieContainer.Add(seed);
    }
}
/// <summary>
/// Writes the request body for non-GET requests. The body comes from, in this order:
/// <c>PostdataByte</c> (when <c>PostDataType.Byte</c> and non-empty), the file named by
/// <c>Postdata</c> (when <c>PostDataType.FilePath</c>), or the <c>Postdata</c> string itself.
/// Nothing is written when none of them yields data.
/// </summary>
/// <param name="item">The request description.</param>
private void SetPostData(HttpItem item)
{
    // Any method whose name contains "get" carries no body.
    if (request.Method.Trim().ToLower().Contains("get"))
    {
        return;
    }

    if (item.PostEncoding != null)
    {
        postencoding = item.PostEncoding;
    }

    byte[] buffer = null;
    if (item.PostDataType == PostDataType.Byte && item.PostdataByte != null && item.PostdataByte.Length > 0)
    {
        buffer = item.PostdataByte;
    }
    else if (item.PostDataType == PostDataType.FilePath && !string.IsNullOrWhiteSpace(item.Postdata))
    {
        // Postdata is a file path here; "using" closes the file even if reading fails.
        using (StreamReader reader = new StreamReader(item.Postdata, postencoding))
        {
            buffer = postencoding.GetBytes(reader.ReadToEnd());
        }
    }
    else if (!string.IsNullOrWhiteSpace(item.Postdata))
    {
        buffer = postencoding.GetBytes(item.Postdata);
    }

    if (buffer == null)
    {
        return;
    }

    request.ContentLength = buffer.Length;
    // The request stream has to be closed once the body is written; the original never
    // closed it.
    using (Stream body = request.GetRequestStream())
    {
        body.Write(buffer, 0, buffer.Length);
    }
}
/// <summary>
/// Chooses the proxy for the current request from <c>item.ProxyIp</c>:
/// blank - use <c>item.WebProxy</c>; contains "ieproxy" - leave the request's proxy as it is;
/// anything else - an explicit proxy ("host:port" or a plain address) with the item's
/// proxy user name and password.
/// </summary>
/// <param name="item">The request description.</param>
private void SetProxy(HttpItem item)
{
    string proxyAddress = item.ProxyIp;

    if (string.IsNullOrWhiteSpace(proxyAddress))
    {
        request.Proxy = item.WebProxy;
        return;
    }

    if (proxyAddress.ToLower().Contains("ieproxy"))
    {
        // Nothing is assigned, so whatever proxy the request already has stays in effect.
        return;
    }

    WebProxy explicitProxy;
    if (proxyAddress.Contains(":"))
    {
        string[] hostAndPort = proxyAddress.Split(':');
        explicitProxy = new WebProxy(hostAndPort[0].Trim(), Convert.ToInt32(hostAndPort[1].Trim()));
    }
    else
    {
        explicitProxy = new WebProxy(proxyAddress, false);
    }

    explicitProxy.Credentials = new NetworkCredential(item.ProxyUserName, item.ProxyPwd);
    request.Proxy = explicitProxy;
}
#endregion
#region private main
/// <summary>
/// Server-certificate validation callback installed by <c>SetCer</c>. Accepts every certificate.
/// </summary>
/// <param name="sender">Unused.</param>
/// <param name="certificate">Unused.</param>
/// <param name="chain">Unused.</param>
/// <param name="errors">Unused; policy errors are ignored.</param>
/// <returns>Always <c>true</c>.</returns>
private bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
{
    // NOTE(review): accepting unconditionally disables server-certificate checking.
    return true;
}
/// <summary>
/// Callback for <c>ServicePoint.BindIPEndPointDelegate</c>: supplies the local endpoint that
/// outgoing connections are bound to.
/// </summary>
/// <param name="servicePoint">Unused.</param>
/// <param name="remoteEndPoint">Unused.</param>
/// <param name="retryCount">Unused.</param>
/// <returns>The endpoint stored by <c>SetRequest</c> from <c>HttpItem.IPEndPoint</c>.</returns>
private IPEndPoint BindIPEndPointCallback(ServicePoint servicePoint, IPEndPoint remoteEndPoint, int retryCount)
{
    IPEndPoint localEndPoint = _IPEndPoint;
    return localEndPoint;
}
#endregion
}
#region public calss
/// <summary>
/// Describes one HTTP request for <c>HttpHelper.GetHtml</c>.
/// </summary>
public class HttpItem
{
    // Backing fields for the properties that have a non-default initial value.
    private string _method = "GET";
    private int _timeout = 100000;
    private int _readWriteTimeout = 30000;
    private bool _keepAlive = true;
    private string _accept = "text/html, application/xhtml+xml, */*";
    private string _contentType = "text/html";
    private string _userAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
    private PostDataType _postDataType = PostDataType.String;
    private bool _isToLower = false;
    private bool _allowAutoRedirect = false;
    private int _connectionLimit = 1024;
    private ResultType _resultType = ResultType.String;
    private WebHeaderCollection _header = new WebHeaderCollection();
    private bool _expect100Continue = false;
    private ResultCookieType _resultCookieType = ResultCookieType.String;
    private ICredentials _credentials = CredentialCache.DefaultCredentials;
    private DateTime? _ifModifiedSince = null;
    private IPEndPoint _localEndPoint = null;

    /// <summary>Request URL. Required.</summary>
    public string URL { get; set; }

    /// <summary>
    /// HTTP method; defaults to "GET". A body is only written for methods whose name does not
    /// contain "get".
    /// </summary>
    public string Method
    {
        get { return _method; }
        set { _method = value; }
    }

    /// <summary>Value assigned to the request's <c>Timeout</c>; defaults to 100000.</summary>
    public int Timeout
    {
        get { return _timeout; }
        set { _timeout = value; }
    }

    /// <summary>Value assigned to the request's <c>ReadWriteTimeout</c>; defaults to 30000.</summary>
    public int ReadWriteTimeout
    {
        get { return _readWriteTimeout; }
        set { _readWriteTimeout = value; }
    }

    /// <summary>Host header; only applied when not blank.</summary>
    public string Host { get; set; }

    /// <summary>Whether a persistent connection is requested; defaults to true.</summary>
    public Boolean KeepAlive
    {
        get { return _keepAlive; }
        set { _keepAlive = value; }
    }

    /// <summary>Accept header; defaults to "text/html, application/xhtml+xml, */*".</summary>
    public string Accept
    {
        get { return _accept; }
        set { _accept = value; }
    }

    /// <summary>Content-Type header of the request; defaults to "text/html".</summary>
    public string ContentType
    {
        get { return _contentType; }
        set { _contentType = value; }
    }

    /// <summary>
    /// User-Agent header; defaults to
    /// "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)".
    /// </summary>
    public string UserAgent
    {
        get { return _userAgent; }
        set { _userAgent = value; }
    }

    /// <summary>
    /// Encoding used to decode the response body. Null (the default) lets the helper detect it
    /// from the response.
    /// </summary>
    public Encoding Encoding { get; set; }

    /// <summary>How the request body is supplied; defaults to <c>PostDataType.String</c>.</summary>
    public PostDataType PostDataType
    {
        get { return _postDataType; }
        set { _postDataType = value; }
    }

    /// <summary>
    /// Request body as a string, or the path of the file holding it when
    /// <see cref="PostDataType"/> is <c>FilePath</c>.
    /// </summary>
    public string Postdata { get; set; }

    /// <summary>Request body as bytes; used when <see cref="PostDataType"/> is <c>Byte</c>.</summary>
    public byte[] PostdataByte { get; set; }

    /// <summary>
    /// Cookies added to the request's cookie container when <see cref="ResultCookieType"/> is
    /// <c>CookieCollection</c>.
    /// </summary>
    public CookieCollection CookieCollection { get; set; }

    /// <summary>Raw Cookie header value.</summary>
    public string Cookie { get; set; }

    /// <summary>Referer header.</summary>
    public string Referer { get; set; }

    /// <summary>Path of a client certificate file to attach to the request.</summary>
    public string CerPath { get; set; }

    /// <summary>
    /// Proxy object assigned to the request when <see cref="ProxyIp"/> is blank (null is
    /// assigned as well).
    /// </summary>
    public WebProxy WebProxy { get; set; }

    /// <summary>Whether the returned HTML is converted to lower case; defaults to false.</summary>
    public Boolean IsToLower
    {
        get { return _isToLower; }
        set { _isToLower = value; }
    }

    /// <summary>Whether redirects are followed automatically; defaults to false.</summary>
    public Boolean Allowautoredirect
    {
        get { return _allowAutoRedirect; }
        set { _allowAutoRedirect = value; }
    }

    /// <summary>Connection limit of the service point; defaults to 1024, applied when positive.</summary>
    public int Connectionlimit
    {
        get { return _connectionLimit; }
        set { _connectionLimit = value; }
    }

    /// <summary>User name for the explicit proxy.</summary>
    public string ProxyUserName { get; set; }

    /// <summary>Password for the explicit proxy.</summary>
    public string ProxyPwd { get; set; }

    /// <summary>
    /// Proxy address ("host:port" or a plain address). A value containing "ieproxy" leaves the
    /// request's proxy untouched.
    /// </summary>
    public string ProxyIp { get; set; }

    /// <summary>
    /// Whether the raw bytes are returned in addition to the text; defaults to
    /// <c>ResultType.String</c>.
    /// </summary>
    public ResultType ResultType
    {
        get { return _resultType; }
        set { _resultType = value; }
    }

    /// <summary>Additional request headers; starts as an empty collection.</summary>
    public WebHeaderCollection Header
    {
        get { return _header; }
        set { _header = value; }
    }

    /// <summary>HTTP protocol version for the request; only applied when not null.</summary>
    public Version ProtocolVersion { get; set; }

    /// <summary>Whether 100-Continue behaviour is used; defaults to false in this class.</summary>
    public Boolean Expect100Continue
    {
        get { return _expect100Continue; }
        set { _expect100Continue = value; }
    }

    /// <summary>Client certificates added to the request.</summary>
    public X509CertificateCollection ClentCertificates { get; set; }

    /// <summary>Encoding of the request body; when null the helper keeps its current one.</summary>
    public Encoding PostEncoding { get; set; }

    /// <summary>
    /// Whether a cookie container is attached to the request; defaults to
    /// <c>ResultCookieType.String</c> (no container).
    /// </summary>
    public ResultCookieType ResultCookieType
    {
        get { return _resultCookieType; }
        set { _resultCookieType = value; }
    }

    /// <summary>Credentials for the request; defaults to <c>CredentialCache.DefaultCredentials</c>.</summary>
    public ICredentials ICredentials
    {
        get { return _credentials; }
        set { _credentials = value; }
    }

    /// <summary>Maximum number of redirects to follow; only applied when positive.</summary>
    public int MaximumAutomaticRedirections { get; set; }

    /// <summary>If-Modified-Since value; null (the default) leaves the request's value unchanged.</summary>
    public DateTime? IfModifiedSince
    {
        get { return _ifModifiedSince; }
        set { _ifModifiedSince = value; }
    }

    /// <summary>
    /// Local endpoint (IP and port) to bind outgoing connections to, for example
    /// <c>new IPEndPoint(IPAddress.Parse("192.168.1.1"), 80)</c>. Null (the default) means no
    /// binding is configured.
    /// </summary>
    public IPEndPoint IPEndPoint
    {
        get { return _localEndPoint; }
        set { _localEndPoint = value; }
    }
}
/// <summary>
/// Holds what <c>HttpHelper.GetHtml</c> collected from a response.
/// </summary>
public class HttpResult
{
    private string _html = string.Empty;

    /// <summary>Raw value of the response's Set-Cookie header, if any.</summary>
    public string Cookie { get; set; }

    /// <summary>Cookies of the response as a collection.</summary>
    public CookieCollection CookieCollection { get; set; }

    /// <summary>
    /// The decoded response body, or an error message when the request failed. Starts as an
    /// empty string.
    /// </summary>
    public string Html
    {
        get { return _html; }
        set { _html = value; }
    }

    /// <summary>The raw response body; only filled when <c>ResultType.Byte</c> was requested.</summary>
    public byte[] ResultByte { get; set; }

    /// <summary>Response headers.</summary>
    public WebHeaderCollection Header { get; set; }

    /// <summary>Status description of the response.</summary>
    public string StatusDescription { get; set; }

    /// <summary>
    /// Status code of the response. Stays at <c>default(HttpStatusCode)</c> until a response
    /// has been read.
    /// </summary>
    public HttpStatusCode StatusCode { get; set; }

    /// <summary>The URI that finally answered the request.</summary>
    public string ResponseUri { get; set; }

    /// <summary>
    /// The redirect target taken from the Location header. A relative value is resolved against
    /// <see cref="ResponseUri"/>. Returns an empty string when there is no Location header or a
    /// relative value cannot be resolved.
    /// </summary>
    public string RedirectUrl
    {
        get
        {
            if (Header == null)
            {
                return string.Empty;
            }

            // Header lookup is case-insensitive, so no scan over the key names is needed. The
            // original scan also matched names such as Content-Location and then relied on a
            // swallowed NullReferenceException to end up with the empty result.
            string location = Header["Location"];
            if (location == null)
            {
                return string.Empty;
            }

            location = location.Trim();
            bool isAbsolute = location.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || location.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (isAbsolute || string.IsNullOrWhiteSpace(location))
            {
                return location;
            }

            Uri baseUri;
            Uri resolved;
            if (Uri.TryCreate(ResponseUri, UriKind.Absolute, out baseUri)
                && Uri.TryCreate(baseUri, location, out resolved))
            {
                return resolved.AbsoluteUri;
            }
            return string.Empty;
        }
    }
}
/// <summary>
/// Selects what <c>HttpHelper</c> returns for the response body.
/// </summary>
public enum ResultType
{
    /// <summary>Only the decoded text (<c>HttpResult.Html</c>) is returned.</summary>
    String = 0,

    /// <summary>
    /// The raw bytes (<c>HttpResult.ResultByte</c>) are returned in addition to the decoded text.
    /// </summary>
    Byte = 1
}
/// <summary>
/// Selects where the request body comes from. The default is <see cref="String"/>.
/// </summary>
public enum PostDataType
{
    /// <summary>The body is the <c>HttpItem.Postdata</c> string.</summary>
    String = 0,

    /// <summary>The body is the <c>HttpItem.PostdataByte</c> array.</summary>
    Byte = 1,

    /// <summary><c>HttpItem.Postdata</c> is the path of a file whose content is the body.</summary>
    FilePath = 2
}
/// <summary>
/// Selects how cookies are handled for a request.
/// </summary>
public enum ResultCookieType
{
    /// <summary>Cookies are only exchanged as raw header strings.</summary>
    String = 0,

    /// <summary>
    /// A cookie container is attached to the request, so cookies are also available as a
    /// <c>CookieCollection</c>.
    /// </summary>
    CookieCollection = 1
}
#endregion
}