C# 小爬虫

一个用 C#(WinForms)编写的小爬虫,用于按页码、日期范围和关键字爬取政府采购网站上的标书(采购公告)信息。

using System;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.IO.Compression;
using System.Security.Cryptography.X509Certificates;
using System.Net.Security;
using System.Globalization;

namespace crawler
{
    /// <summary>
    /// Main form of the crawler. Downloads the bulletin list pages of
    /// www.sczfcg.com, keeps the entries that look like procurement
    /// announcements inside the selected date range (optionally filtered by a
    /// keyword), and shows them both as plain text and as HTML.
    /// </summary>
    public partial class JunXiang : Form
    {
        /// <summary>List URL; the 1-based page number is appended to it.</summary>
        private const string ListUrlPrefix = "http://www.sczfcg.com/CmsNewsController.do?method=recommendBulletinList&moreType=provincebuyBulletinMore&channelCode=shiji_cggg&rp=25&page=";

        /// <summary>Absolute prefix substituted for the site-relative "/view" links.</summary>
        private const string SiteViewPrefix = "http://www.sczfcg.com/view";

        /// <summary>Only list entries containing this text are considered.</summary>
        private const string AnnouncementKeyword = "采购公告";

        /// <summary>Exclusive page bound used when the page box is not a number.</summary>
        private const int DefaultPageBound = 10;

        /// <summary>Largest number of pages a single click may request.</summary>
        private const int MaxPageCount = 9999;

        // NOTE(review): every HTML tag literal below was stripped from the
        // published listing by the HTML extractor (the original text read
        // Replace("", ""), which throws ArgumentException). The tags are a
        // reconstruction -- confirm them against the markup of the list page.
        private const string ListOpenTag = "<ul>";
        private const string ListCloseTag = "</ul>";

        /// <summary>One entry of the bulletin list (reconstructed tag, see note above).</summary>
        private static readonly Regex ListItemRegex =
            new Regex(@"<li>.*?</li>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        /// <summary>Element holding the publication date (reconstructed tag, see note above).</summary>
        private static readonly Regex DateRegex =
            new Regex(@"<span>.*?</span>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        /// <summary>The href attribute of the entry link.</summary>
        private static readonly Regex HrefRegex =
            new Regex("href=[\\\"\\\'](http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?[\\\"\\\']");

        /// <summary>The title attribute of the entry link, up to the following "target".</summary>
        private static readonly Regex TitleRegex =
            new Regex(@" title="".*?"" target", RegexOptions.Singleline);

        /// <summary>Any HTML tag; used to reduce the date element to its text.</summary>
        private static readonly Regex TagRegex = new Regex(@"<[^>]+>");

        /// <summary>Creates the form and its designer-generated controls.</summary>
        public JunXiang()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Crawls the list pages and fills <c>textBox1</c> (text) and
        /// <c>webBrowser1</c> (HTML) with the matching entries.
        /// </summary>
        /// <param name="sender">The clicked button.</param>
        /// <param name="e">Unused event data.</param>
        private void button1_Click(object sender, EventArgs e)
        {
            int pageBound = ParsePageBound(textBox2.Text);

            // Compare calendar days only; .Date replaces the original
            // culture-dependent ToShortDateString()/Convert.ToDateTime round trip.
            DateTime start = dateTimePicker1.Value.Date;
            DateTime end = dateTimePicker2.Value.Date;
            string keyword = textBox3.Text;

            DateTimeFormatInfo listDateFormat = new DateTimeFormatInfo();
            listDateFormat.ShortDatePattern = "yyyy-MM-dd";

            webBrowser1.Document.OpenNew(false);
            textBox1.Text = "";

            HttpHelper http = new HttpHelper();
            StringBuilder htmlOut = new StringBuilder(ListOpenTag);
            StringBuilder textOut = new StringBuilder();
            bool reachedOlderEntries = false;

            // NOTE(review): the loop header was truncated in the listing
            // ("for(int i=1;iif"); "i < page" is reconstructed from the
            // "parsed + 1" bound computed before it.
            for (int i = 1; i < pageBound && !reachedOlderEntries; i++)
            {
                HttpItem item = new HttpItem()
                {
                    URL = ListUrlPrefix + i,
                    Method = "get",
                    ContentType = "text/html",
                };
                string html = http.GetHtml(item).Html ?? string.Empty;

                foreach (Match listItem in ListItemRegex.Matches(html))
                {
                    string entry = listItem.Value;
                    if (entry.IndexOf(AnnouncementKeyword, StringComparison.Ordinal) < 0)
                    {
                        continue;
                    }

                    Match href = HrefRegex.Match(entry);
                    if (!href.Success)
                    {
                        continue;
                    }

                    if (keyword.Length != 0 && entry.IndexOf(keyword, StringComparison.Ordinal) < 0)
                    {
                        continue;
                    }

                    // Entries without a parsable date are skipped instead of
                    // crashing the handler on matches[0] / Convert.ToDateTime.
                    Match dateElement = DateRegex.Match(entry);
                    if (!dateElement.Success)
                    {
                        continue;
                    }

                    string dateText = TagRegex.Replace(dateElement.Value, string.Empty).Trim();
                    DateTime published;
                    if (!DateTime.TryParse(dateText, listDateFormat, DateTimeStyles.None, out published))
                    {
                        continue;
                    }

                    // NOTE(review): assumes the site lists newest entries first.
                    // Entries newer than the range are skipped; the first entry
                    // older than the range ends the whole crawl. (The original
                    // "break" only left the inner loop and its "flag = false"
                    // was unreachable, so paging never stopped early.)
                    if (published > end)
                    {
                        continue;
                    }

                    if (published < start)
                    {
                        reachedOlderEntries = true;
                        break;
                    }

                    Match title = TitleRegex.Match(entry);

                    textOut.Append(dateText).Append("\r\n");
                    textOut.Append(CleanFragment(href.Value)).Append("\r\n");
                    textOut.Append(title.Success ? CleanFragment(title.Value) : string.Empty).Append("\r\n");
                    textOut.Append("***** " + "第" + i + "页" + " *****\r\n");

                    htmlOut.Append(entry);
                }
            }

            htmlOut.Append(ListCloseTag);

            // Turn the site-relative links into absolute ones once, at the end.
            textBox1.Text = textOut.ToString().Replace("/view", SiteViewPrefix);
            webBrowser1.Document.Write(htmlOut.ToString().Replace("/view", SiteViewPrefix));
        }

        /// <summary>
        /// Converts the page-count text into the exclusive upper bound of the
        /// page loop.
        /// </summary>
        /// <param name="text">User input from the page-count box.</param>
        /// <returns>
        /// <c>count + 1</c> with <c>count</c> capped at
        /// <see cref="MaxPageCount"/>, or <see cref="DefaultPageBound"/> when
        /// the text is not an integer.
        /// </returns>
        private static int ParsePageBound(string text)
        {
            int pages;
            if (!int.TryParse(text, out pages))
            {
                return DefaultPageBound;
            }

            // Cap before adding 1 so int.MaxValue cannot overflow.
            return Math.Min(pages, MaxPageCount) + 1;
        }

        /// <summary>
        /// Removes the attribute syntax around an extracted href/title value so
        /// that only the value itself is shown in the text box.
        /// </summary>
        /// <param name="fragment">Raw regex match, e.g. <c>href="/view/x.html"</c>.</param>
        /// <returns>The fragment without <c>href=</c>, quotes, <c> title=</c> and <c> target</c>.</returns>
        private static string CleanFragment(string fragment)
        {
            return fragment
                .Replace("href=", "")
                .Replace("\"", "")
                .Replace(" title=", "")
                .Replace(" target", "");
        }

        private void textBox1_TextChanged(object sender, EventArgs e)
        {
        }

        /// <summary>Loads a blank page so <c>webBrowser1.Document</c> exists before the first crawl.</summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            webBrowser1.Navigate("about:blank");
            webBrowser1.Document.OpenNew(true);
        }

        private void textBox3_TextChanged(object sender, EventArgs e)
        {
        }

        private void linkLabel1_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
        {
        }

        private void textBox2_TextChanged(object sender, EventArgs e)
        {
        }

        /// <summary>Clears both result views.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            webBrowser1.Document.OpenNew(false);
            textBox1.Text = "";
            webBrowser1.Document.Write(ListOpenTag + ListCloseTag);
        }
    }

    /// <summary>
    /// Small synchronous HTTP client built on <see cref="HttpWebRequest"/>.
    /// An instance keeps the current request/response in fields, so one
    /// instance must not be used for overlapping calls.
    /// </summary>
    public class HttpHelper
    {
        // NOTE(review): this pattern was stripped from the published listing
        // (only the opening quote survived). It is reconstructed from the
        // surviving use of Groups[1] as the charset name -- confirm.
        private static readonly Regex MetaCharsetRegex =
            new Regex("<meta[^<]*charset=([^<]*)[\"']", RegexOptions.IgnoreCase);

        // Encoding used to decode the response; null means "detect".
        private Encoding encoding = Encoding.Default;

        // Encoding used for POST data.
        private Encoding postencoding = Encoding.Default;

        // Request currently being prepared or executed.
        private HttpWebRequest request = null;

        // Response currently being read.
        private HttpWebResponse response = null;

        // Local end point to bind outgoing connections to (optional).
        private IPEndPoint _IPEndPoint = null;

        /// <summary>
        /// Executes the request described by <paramref name="item"/>.
        /// </summary>
        /// <param name="item">Request settings.</param>
        /// <returns>
        /// The response data. Failures are not thrown: the exception message is
        /// returned in <see cref="HttpResult.Html"/> instead.
        /// </returns>
        public HttpResult GetHtml(HttpItem item)
        {
            HttpResult result = new HttpResult();
            try
            {
                SetRequest(item);
            }
            catch (Exception ex)
            {
                // Configuring the request failed; report it through the result.
                return new HttpResult()
                {
                    Cookie = string.Empty,
                    Header = null,
                    Html = ex.Message,
                    StatusDescription = "配置参数时出错:" + ex.Message
                };
            }

            try
            {
                using (response = (HttpWebResponse)request.GetResponse())
                {
                    GetData(item, result);
                }
            }
            catch (WebException ex)
            {
                // Non-2xx answers arrive as WebException but still carry a body.
                if (ex.Response != null)
                {
                    using (response = (HttpWebResponse)ex.Response)
                    {
                        GetData(item, result);
                    }
                }
                else
                {
                    result.Html = ex.Message;
                }
            }
            catch (Exception ex)
            {
                result.Html = ex.Message;
            }

            if (item.IsToLower)
            {
                result.Html = result.Html.ToLower();
            }

            return result;
        }

        /// <summary>
        /// Copies status, headers, cookies and the decoded body of the current
        /// response into <paramref name="result"/>.
        /// </summary>
        /// <param name="item">Request settings (result type, encoding).</param>
        /// <param name="result">Object to fill.</param>
        private void GetData(HttpItem item, HttpResult result)
        {
            if (response == null)
            {
                return;
            }

            result.StatusCode = response.StatusCode;
            result.StatusDescription = response.StatusDescription;
            result.Header = response.Headers;
            result.ResponseUri = response.ResponseUri.ToString();
            if (response.Cookies != null)
            {
                result.CookieCollection = response.Cookies;
            }

            if (response.Headers["set-cookie"] != null)
            {
                result.Cookie = response.Headers["set-cookie"];
            }

            byte[] body = GetByte();
            if (body != null && body.Length > 0)
            {
                SetEncoding(item, result, body);
                result.Html = encoding.GetString(body);
            }
            else
            {
                result.Html = string.Empty;
            }
        }

        /// <summary>
        /// Stores the raw bytes when requested and, when no encoding was
        /// configured, picks one from the page's meta charset or the response's
        /// character set (UTF-8 as last resort).
        /// </summary>
        /// <param name="item">Request settings.</param>
        /// <param name="result">Result receiving the raw bytes.</param>
        /// <param name="ResponseByte">Raw response body.</param>
        private void SetEncoding(HttpItem item, HttpResult result, byte[] ResponseByte)
        {
            if (item.ResultType == ResultType.Byte)
            {
                result.ResultByte = ResponseByte;
            }

            if (encoding != null)
            {
                return;
            }

            Match meta = MetaCharsetRegex.Match(Encoding.Default.GetString(ResponseByte));

            // Invariant lowering: under a Turkish culture "ISO" would not
            // become "iso" and the replacement below would silently not apply.
            string charset = meta.Groups[1].Value.ToLowerInvariant().Trim();
            if (charset.Length > 2)
            {
                string name = charset
                    .Replace("\"", string.Empty)
                    .Replace("'", "")
                    .Replace(";", "")
                    .Replace("iso-8859-1", "gbk")
                    .Trim();
                try
                {
                    encoding = Encoding.GetEncoding(name);
                    return;
                }
                catch (ArgumentException)
                {
                    // Unknown charset name in the page; fall through.
                }
                catch (NotSupportedException)
                {
                    // Charset not available on this platform; fall through.
                }
            }

            encoding = GetHeaderEncoding();
        }

        /// <summary>
        /// Returns the encoding named by the response's character set, or UTF-8
        /// when it is missing or not recognised.
        /// </summary>
        private Encoding GetHeaderEncoding()
        {
            if (string.IsNullOrEmpty(response.CharacterSet))
            {
                return Encoding.UTF8;
            }

            try
            {
                return Encoding.GetEncoding(response.CharacterSet);
            }
            catch (ArgumentException)
            {
                return Encoding.UTF8;
            }
            catch (NotSupportedException)
            {
                return Encoding.UTF8;
            }
        }

        /// <summary>
        /// Reads the whole response body, transparently decompressing gzip.
        /// </summary>
        /// <returns>The body bytes (empty when there is no body stream).</returns>
        private byte[] GetByte()
        {
            using (MemoryStream buffer = new MemoryStream())
            using (Stream body = response.GetResponseStream())
            {
                if (body == null)
                {
                    return buffer.ToArray();
                }

                bool gzipped = response.ContentEncoding != null
                    && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase);
                if (gzipped)
                {
                    // The original never disposed this stream.
                    using (GZipStream unzip = new GZipStream(body, CompressionMode.Decompress))
                    {
                        unzip.CopyTo(buffer, 10240);
                    }
                }
                else
                {
                    body.CopyTo(buffer, 10240);
                }

                return buffer.ToArray();
            }
        }

        /// <summary>
        /// Creates <c>request</c> and applies every setting of
        /// <paramref name="item"/> to it. The POST body (if any) is written
        /// here as well.
        /// </summary>
        /// <param name="item">Request settings.</param>
        private void SetRequest(HttpItem item)
        {
            // Creates the request object; must run first.
            SetCer(item);

            if (item.IPEndPoint != null)
            {
                _IPEndPoint = item.IPEndPoint;
                request.ServicePoint.BindIPEndPointDelegate = new BindIPEndPoint(BindIPEndPointCallback);
            }

            if (item.Header != null && item.Header.Count > 0)
            {
                foreach (string key in item.Header.AllKeys)
                {
                    request.Headers.Add(key, item.Header[key]);
                }
            }

            SetProxy(item);

            if (item.ProtocolVersion != null)
            {
                request.ProtocolVersion = item.ProtocolVersion;
            }

            request.ServicePoint.Expect100Continue = item.Expect100Continue;
            request.Method = item.Method;
            request.Timeout = item.Timeout;
            request.KeepAlive = item.KeepAlive;
            request.ReadWriteTimeout = item.ReadWriteTimeout;
            if (!string.IsNullOrWhiteSpace(item.Host))
            {
                request.Host = item.Host;
            }

            if (item.IfModifiedSince != null)
            {
                request.IfModifiedSince = item.IfModifiedSince.Value;
            }

            request.Accept = item.Accept;
            request.ContentType = item.ContentType;
            request.UserAgent = item.UserAgent;

            // null here makes SetEncoding detect the encoding from the response.
            encoding = item.Encoding;

            request.Credentials = item.ICredentials;
            SetCookie(item);
            request.Referer = item.Referer;
            request.AllowAutoRedirect = item.Allowautoredirect;
            if (item.MaximumAutomaticRedirections > 0)
            {
                request.MaximumAutomaticRedirections = item.MaximumAutomaticRedirections;
            }

            SetPostData(item);

            if (item.Connectionlimit > 0)
            {
                request.ServicePoint.ConnectionLimit = item.Connectionlimit;
            }
        }

        /// <summary>
        /// Creates the request for <c>item.URL</c> and attaches client
        /// certificates.
        /// </summary>
        /// <param name="item">Request settings.</param>
        private void SetCer(HttpItem item)
        {
            bool hasCertificateFile = !string.IsNullOrWhiteSpace(item.CerPath);
            if (hasCertificateFile)
            {
                // Must be installed before the connection is created.
                // SECURITY NOTE(review): ServicePointManager is process-wide and
                // CheckValidationResult accepts every certificate, so setting
                // CerPath once disables server certificate validation for all
                // later HTTPS requests in this process. Kept for backward
                // compatibility -- confirm this is intended.
                ServicePointManager.ServerCertificateValidationCallback =
                    new System.Net.Security.RemoteCertificateValidationCallback(CheckValidationResult);
            }

            request = (HttpWebRequest)WebRequest.Create(item.URL);
            SetCerList(item);

            if (hasCertificateFile)
            {
                request.ClientCertificates.Add(new X509Certificate(item.CerPath));
            }
        }

        /// <summary>Adds every certificate of <c>item.ClentCertificates</c> to the request.</summary>
        /// <param name="item">Request settings.</param>
        private void SetCerList(HttpItem item)
        {
            if (item.ClentCertificates != null && item.ClentCertificates.Count > 0)
            {
                foreach (X509Certificate certificate in item.ClentCertificates)
                {
                    request.ClientCertificates.Add(certificate);
                }
            }
        }

        /// <summary>
        /// Applies the cookie header and, when cookie collections are
        /// requested, a cookie container.
        /// </summary>
        /// <param name="item">Request settings.</param>
        private void SetCookie(HttpItem item)
        {
            if (!string.IsNullOrEmpty(item.Cookie))
            {
                request.Headers[HttpRequestHeader.Cookie] = item.Cookie;
            }

            if (item.ResultCookieType == ResultCookieType.CookieCollection)
            {
                request.CookieContainer = new CookieContainer();
                if (item.CookieCollection != null && item.CookieCollection.Count > 0)
                {
                    request.CookieContainer.Add(item.CookieCollection);
                }
            }
        }

        /// <summary>
        /// Writes the request body for every method whose name does not
        /// contain "get".
        /// </summary>
        /// <param name="item">Request settings.</param>
        private void SetPostData(HttpItem item)
        {
            if (request.Method.Trim().IndexOf("get", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return;
            }

            if (item.PostEncoding != null)
            {
                postencoding = item.PostEncoding;
            }

            byte[] buffer = null;
            if (item.PostDataType == PostDataType.Byte && item.PostdataByte != null && item.PostdataByte.Length > 0)
            {
                buffer = item.PostdataByte;
            }
            else if (item.PostDataType == PostDataType.FilePath && !string.IsNullOrWhiteSpace(item.Postdata))
            {
                // Postdata is a file path here; "using" closes the file even
                // when reading fails.
                using (StreamReader reader = new StreamReader(item.Postdata, postencoding))
                {
                    buffer = postencoding.GetBytes(reader.ReadToEnd());
                }
            }
            else if (!string.IsNullOrWhiteSpace(item.Postdata))
            {
                buffer = postencoding.GetBytes(item.Postdata);
            }

            if (buffer != null)
            {
                request.ContentLength = buffer.Length;

                // The request stream has to be closed before GetResponse();
                // the original left it open.
                using (Stream body = request.GetRequestStream())
                {
                    body.Write(buffer, 0, buffer.Length);
                }
            }
        }

        /// <summary>
        /// Configures the proxy: an explicit "host[:port]" in
        /// <c>item.ProxyIp</c>, the system default when it contains "ieproxy",
        /// otherwise <c>item.WebProxy</c>.
        /// </summary>
        /// <param name="item">Request settings.</param>
        private void SetProxy(HttpItem item)
        {
            bool hasProxyIp = !string.IsNullOrWhiteSpace(item.ProxyIp);
            bool isIeProxy = hasProxyIp
                && item.ProxyIp.IndexOf("ieproxy", StringComparison.OrdinalIgnoreCase) >= 0;

            if (hasProxyIp && !isIeProxy)
            {
                WebProxy myProxy;
                if (item.ProxyIp.Contains(":"))
                {
                    string[] parts = item.ProxyIp.Split(':');
                    myProxy = new WebProxy(parts[0].Trim(), Convert.ToInt32(parts[1].Trim()));
                }
                else
                {
                    myProxy = new WebProxy(item.ProxyIp, false);
                }

                myProxy.Credentials = new NetworkCredential(item.ProxyUserName, item.ProxyPwd);
                request.Proxy = myProxy;
            }
            else if (isIeProxy)
            {
                // Leave request.Proxy untouched so the default proxy is used.
            }
            else
            {
                request.Proxy = item.WebProxy;
            }
        }

        /// <summary>
        /// Certificate validation callback that accepts every certificate.
        /// See the security note in <c>SetCer</c>.
        /// </summary>
        /// <returns>Always <c>true</c>.</returns>
        private bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            return true;
        }

        /// <summary>Binds outgoing connections to the configured local end point.</summary>
        /// <returns>The end point taken from <c>HttpItem.IPEndPoint</c>.</returns>
        private IPEndPoint BindIPEndPointCallback(ServicePoint servicePoint, IPEndPoint remoteEndPoint, int retryCount)
        {
            return _IPEndPoint;
        }
    }

    /// <summary>Settings of one HTTP request executed by <see cref="HttpHelper"/>.</summary>
    public class HttpItem
    {
        private string _Method = "GET";
        private int _Timeout = 100000;
        private int _ReadWriteTimeout = 30000;
        private Boolean _KeepAlive = true;
        private string _Accept = "text/html, application/xhtml+xml, */*";
        private string _ContentType = "text/html";
        private string _UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
        private PostDataType _PostDataType = PostDataType.String;
        private Boolean isToLower = false;
        private Boolean allowautoredirect = false;
        private int connectionlimit = 1024;
        private ResultType resulttype = ResultType.String;
        private WebHeaderCollection header = new WebHeaderCollection();
        private Boolean _expect100continue = false;
        private ResultCookieType _ResultCookieType = ResultCookieType.String;
        private ICredentials _ICredentials = CredentialCache.DefaultCredentials;
        private DateTime? _IfModifiedSince = null;
        private IPEndPoint _IPEndPoint = null;

        /// <summary>Request URL (required).</summary>
        public string URL { get; set; }

        /// <summary>HTTP method; defaults to "GET". Other methods send the configured post data.</summary>
        public string Method
        {
            get { return _Method; }
            set { _Method = value; }
        }

        /// <summary>Request timeout in milliseconds; defaults to 100000.</summary>
        public int Timeout
        {
            get { return _Timeout; }
            set { _Timeout = value; }
        }

        /// <summary>Stream read/write timeout in milliseconds; defaults to 30000.</summary>
        public int ReadWriteTimeout
        {
            get { return _ReadWriteTimeout; }
            set { _ReadWriteTimeout = value; }
        }

        /// <summary>Value of the Host header; ignored when empty.</summary>
        public string Host { get; set; }

        /// <summary>Whether to keep the connection alive; defaults to true.</summary>
        public Boolean KeepAlive
        {
            get { return _KeepAlive; }
            set { _KeepAlive = value; }
        }

        /// <summary>Accept header; defaults to "text/html, application/xhtml+xml, */*".</summary>
        public string Accept
        {
            get { return _Accept; }
            set { _Accept = value; }
        }

        /// <summary>Content-Type header of the request; defaults to "text/html".</summary>
        public string ContentType
        {
            get { return _ContentType; }
            set { _ContentType = value; }
        }

        /// <summary>User-Agent header; defaults to an MSIE 9 string.</summary>
        public string UserAgent
        {
            get { return _UserAgent; }
            set { _UserAgent = value; }
        }

        /// <summary>Encoding used to decode the response; null (default) enables detection.</summary>
        public Encoding Encoding { get; set; }

        /// <summary>Kind of post data to send; defaults to <see cref="crawler.PostDataType.String"/>.</summary>
        public PostDataType PostDataType
        {
            get { return _PostDataType; }
            set { _PostDataType = value; }
        }

        /// <summary>Post data as text, or the file path when <see cref="PostDataType"/> is FilePath.</summary>
        public string Postdata { get; set; }

        /// <summary>Post data as bytes, used when <see cref="PostDataType"/> is Byte.</summary>
        public byte[] PostdataByte { get; set; }

        /// <summary>Cookies sent when <see cref="ResultCookieType"/> is CookieCollection.</summary>
        public CookieCollection CookieCollection { get; set; }

        /// <summary>Raw Cookie header value.</summary>
        public string Cookie { get; set; }

        /// <summary>Referer header value.</summary>
        public string Referer { get; set; }

        /// <summary>Path of a client certificate file to attach to the request.</summary>
        public string CerPath { get; set; }

        /// <summary>Proxy object used when <see cref="ProxyIp"/> is not set; null is passed through as-is.</summary>
        public WebProxy WebProxy { get; set; }

        /// <summary>Whether to lower-case the returned HTML; defaults to false.</summary>
        public Boolean IsToLower
        {
            get { return isToLower; }
            set { isToLower = value; }
        }

        /// <summary>Whether redirects are followed automatically; defaults to false.</summary>
        public Boolean Allowautoredirect
        {
            get { return allowautoredirect; }
            set { allowautoredirect = value; }
        }

        /// <summary>Connection limit of the service point; defaults to 1024, values &lt;= 0 are ignored.</summary>
        public int Connectionlimit
        {
            get { return connectionlimit; }
            set { connectionlimit = value; }
        }

        /// <summary>Proxy user name.</summary>
        public string ProxyUserName { get; set; }

        /// <summary>Proxy password.</summary>
        public string ProxyPwd { get; set; }

        /// <summary>Proxy as "host" or "host:port"; a value containing "ieproxy" keeps the default proxy.</summary>
        public string ProxyIp { get; set; }

        /// <summary>Whether the raw bytes are returned in addition to the text; defaults to String.</summary>
        public ResultType ResultType
        {
            get { return resulttype; }
            set { resulttype = value; }
        }

        /// <summary>Extra request headers; defaults to an empty collection.</summary>
        public WebHeaderCollection Header
        {
            get { return header; }
            set { header = value; }
        }

        /// <summary>HTTP protocol version; left at the framework default when null.</summary>
        public Version ProtocolVersion { get; set; }

        /// <summary>Whether 100-Continue behaviour is used; defaults to false.</summary>
        public Boolean Expect100Continue
        {
            get { return _expect100continue; }
            set { _expect100continue = value; }
        }

        /// <summary>Additional client certificates to attach to the request.</summary>
        public X509CertificateCollection ClentCertificates { get; set; }

        /// <summary>Encoding of the post data; the helper's current post encoding is kept when null.</summary>
        public Encoding PostEncoding { get; set; }

        /// <summary>How cookies are exchanged; defaults to String (header only).</summary>
        public ResultCookieType ResultCookieType
        {
            get { return _ResultCookieType; }
            set { _ResultCookieType = value; }
        }

        /// <summary>Credentials of the request; defaults to <see cref="CredentialCache.DefaultCredentials"/>.</summary>
        public ICredentials ICredentials
        {
            get { return _ICredentials; }
            set { _ICredentials = value; }
        }

        /// <summary>Maximum number of redirects to follow; values &lt;= 0 keep the framework default.</summary>
        public int MaximumAutomaticRedirections { get; set; }

        /// <summary>If-Modified-Since header; not sent when null (default).</summary>
        public DateTime? IfModifiedSince
        {
            get { return _IfModifiedSince; }
            set { _IfModifiedSince = value; }
        }

        /// <summary>
        /// Local end point to bind the connection to, e.g.
        /// <c>new IPEndPoint(IPAddress.Parse("192.168.1.1"), 80)</c>.
        /// </summary>
        public IPEndPoint IPEndPoint
        {
            get { return _IPEndPoint; }
            set { _IPEndPoint = value; }
        }
    }

    /// <summary>Data returned by <see cref="HttpHelper.GetHtml"/>.</summary>
    public class HttpResult
    {
        private string _html = string.Empty;

        /// <summary>Value of the response's set-cookie header.</summary>
        public string Cookie { get; set; }

        /// <summary>Cookies of the response.</summary>
        public CookieCollection CookieCollection { get; set; }

        /// <summary>Decoded response body, or an error message when the request failed.</summary>
        public string Html
        {
            get { return _html; }
            set { _html = value; }
        }

        /// <summary>Raw response body; only filled when <see cref="ResultType.Byte"/> was requested.</summary>
        public byte[] ResultByte { get; set; }

        /// <summary>Response headers.</summary>
        public WebHeaderCollection Header { get; set; }

        /// <summary>Status description of the response.</summary>
        public string StatusDescription { get; set; }

        /// <summary>Status code of the response.</summary>
        public HttpStatusCode StatusCode { get; set; }

        /// <summary>URI that finally answered the request.</summary>
        public string ResponseUri { get; set; }

        /// <summary>
        /// Target of the Location header as an absolute URL. Relative targets
        /// are resolved against <see cref="ResponseUri"/>. Returns an empty
        /// string when there is no Location header or it cannot be resolved.
        /// </summary>
        public string RedirectUrl
        {
            get
            {
                if (Header == null || Header.Count == 0)
                {
                    return string.Empty;
                }

                string location = Header["location"];
                if (location == null)
                {
                    return string.Empty;
                }

                location = location.Trim();
                if (location.Length == 0)
                {
                    return location;
                }

                bool isAbsolute = location.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                    || location.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
                if (isAbsolute)
                {
                    return location;
                }

                Uri baseUri;
                Uri resolved;
                if (Uri.TryCreate(ResponseUri, UriKind.Absolute, out baseUri)
                    && Uri.TryCreate(baseUri, location, out resolved))
                {
                    return resolved.AbsoluteUri;
                }

                return string.Empty;
            }
        }
    }

    /// <summary>What <see cref="HttpResult"/> should contain.</summary>
    public enum ResultType
    {
        /// <summary>Only <see cref="HttpResult.Html"/> is filled.</summary>
        String,

        /// <summary>Both <see cref="HttpResult.Html"/> and <see cref="HttpResult.ResultByte"/> are filled.</summary>
        Byte
    }

    /// <summary>Form of the post data in <see cref="HttpItem"/>.</summary>
    public enum PostDataType
    {
        /// <summary><see cref="HttpItem.Postdata"/> is the text to send.</summary>
        String,

        /// <summary><see cref="HttpItem.PostdataByte"/> holds the bytes to send.</summary>
        Byte,

        /// <summary><see cref="HttpItem.Postdata"/> is the path of a file whose content is sent.</summary>
        FilePath
    }

    /// <summary>How cookies are exchanged with the server.</summary>
    public enum ResultCookieType
    {
        /// <summary>Cookies are only handled as header strings.</summary>
        String,

        /// <summary>A cookie container is attached so cookie collections are sent and returned as well.</summary>
        CookieCollection
    }
}

    C# 小爬虫_第1张图片

    你可能感兴趣的:(个人生活,信息系统)