网页爬虫【原创】【开源】

使用到了以下技术点:
1)使用 WebClient 获取网页源码;
2)使用正则表达式解析网页中需要的数据;
3)使用线程池加快网页数据的采集;
4)……
 
以前写过几次类似的,但是找不到了,又重新写了一个。
代码比较粗糙,求拍砖。
 
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;

namespace SpiderMan
{
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form. All setup is delegated to <c>InitializeComponent</c>,
        /// which is defined outside this part of the partial class.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Number of queued work items that have not finished yet: incremented once per
        /// page queued on the thread pool and decremented when that page's worker is done.
        /// Only modified through <see cref="System.Threading.Interlocked"/>.
        /// </summary>
        private static int threadCount = 0;

        /// <summary>
        /// Click handler that crawls every listing page (PageNo 1 through 125): each page is
        /// downloaded and parsed on a thread-pool thread while this handler keeps pumping
        /// window messages until all workers have finished.
        /// </summary>
        /// <param name="sender">The control that raised the click; it is disabled while the crawl runs.</param>
        /// <param name="e">Event data (not used).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            string urlPattern = "http://www.3464.com/data/zhongguochengshijingweidu/?PageNo={0}";
            int pageFirstIndex = 1;
            int pageLastIndex = 125;

            // Application.DoEvents() below dispatches pending input, so a second click would
            // re-enter this handler mid-crawl. Disabling the clicked control prevents that.
            var trigger = sender as Control;
            if (trigger != null)
            {
                trigger.Enabled = false;
            }

            try
            {
                for (int pageIndex = pageFirstIndex; pageIndex <= pageLastIndex; pageIndex++)
                {
                    string url = string.Format(urlPattern, pageIndex);
                    Log("开始读取url:" + url);

                    // Count the work item before it can possibly run, so the counter never
                    // under-reports the amount of outstanding work.
                    Interlocked.Increment(ref threadCount);

                    ThreadPool.QueueUserWorkItem(aurl =>
                    {
                        try
                        {
                            string html = GetHttpSource((string)aurl);
                            ParseHtml(html);
                        }
                        catch (Exception ex)
                        {
                            // An exception escaping a thread-pool callback would take the whole
                            // process down; report the failed page and carry on with the rest.
                            Log("采集失败:" + aurl + " " + ex.Message);
                        }
                        finally
                        {
                            // Always release the slot, otherwise the wait loop below never ends.
                            Interlocked.Decrement(ref threadCount);
                        }
                    }, url);
                }

                // Keep the UI alive while waiting: workers call back into the UI thread through
                // Invoke (see Log), so messages must keep being pumped here.
                do
                {
                    Application.DoEvents();
                    Loading();

                    // Yield briefly instead of spinning a CPU core at 100%.
                    Thread.Sleep(1);
                }
                // CompareExchange(ref x, 0, 0) is used as a fenced read of the counter.
                while (Interlocked.CompareExchange(ref threadCount, 0, 0) > 0);

                Log("数据采集结束");
            }
            finally
            {
                if (trigger != null)
                {
                    trigger.Enabled = true;
                }
            }
        }

        #region 解析html
        // Matches one data row of the listing table. Named groups, in column order:
        //   id    - record number
        //   prov  - province
        //   city  - prefecture-level city (the cell wrapped in a link)
        //   city2 - city / county
        //   py    - pinyin
        //   qh    - telephone area code
        //   yb    - postal code
        //   dj    - east longitude
        //   bw    - north latitude
        // Built once and shared; Regex instances are safe to use from several threads.
        private static readonly Regex rowPattern = new Regex(
            @"<tr[^<]*<td[^>]*>(?<id>\d*?)</td>[^>]*>(?<prov>\w*)</td>[^>]*>[^>]*>(?<city>\w*)</a></td>[^>]*>(?<city2>\w*)</td>[^>]*>(?<py>\w*)</td>[^>]*>(?<qh>\w*)</td>[^>]*>(?<yb>\w*)</td>[^>]*>(?<dj>[\d\.]*)</td>[^>]*>(?<bw>[\d\.]*)</td>[^>]*>");

        /// <summary>
        /// Extracts the data rows from a downloaded listing page and writes each row to the
        /// log as a comma-separated line. If the page is empty or does not contain the
        /// expected table, a parse error is logged and nothing else happens.
        /// </summary>
        /// <param name="html">Full HTML source of one listing page.</param>
        private void ParseHtml(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                Log("解析错误:网页内容为空");
                return;
            }

            // The table of interest runs from its "编号" header cell to the next </table>.
            // Ordinal search: these are literal markers, not linguistic text.
            int beginPos = html.IndexOf("编号", StringComparison.Ordinal);
            int endPos = beginPos < 0
                ? -1
                : html.IndexOf("</table>", beginPos, StringComparison.Ordinal);

            // Without this check a missing marker (-1) would make IndexOf/Substring throw
            // ArgumentOutOfRangeException, e.g. on an error page.
            if (endPos < 0)
            {
                Log("解析错误:未找到数据表格");
                return;
            }

            string partHtml = html.Substring(beginPos, endPos - beginPos);

            // Regex.Matches only yields successful matches, so no per-match Success check is needed.
            foreach (Match row in rowPattern.Matches(partHtml))
            {
                GroupCollection cells = row.Groups;
                string[] fields =
                {
                    cells["id"].Value,
                    cells["prov"].Value,
                    cells["city"].Value,
                    cells["city2"].Value,
                    cells["py"].Value,
                    cells["qh"].Value,
                    cells["yb"].Value,
                    cells["dj"].Value,
                    cells["bw"].Value
                };

                Log(string.Join(", ", fields));
            }
        }
        #endregion

        /// <summary>
        /// Simple progress indicator: rewrites the window title as "采集中" followed by a run
        /// of dots whose length is 100 minus the current title length, so the title changes
        /// on every call. When called off the UI thread it re-dispatches itself through
        /// <c>Invoke</c>.
        /// </summary>
        private void Loading()
        {
            if (this.InvokeRequired)
            {
                this.Invoke(new MethodInvoker(Loading));
                return;
            }

            const int maxLength = 100;

            // A title already longer than maxLength would yield a negative dot count, which
            // throws ArgumentOutOfRangeException; clamp to zero dots instead.
            int residue = Math.Max(0, maxLength - this.Text.Length);
            this.Text = "采集中" + new string('.', residue);
        }

        #region Log
        /// <summary>
        /// Minimal console-style output: appends <paramref name="msg"/> and a line break to
        /// <c>textBox1</c>. If <c>textBox1.InvokeRequired</c> reports that the caller is on
        /// another thread, the call is replayed through the form's <c>Invoke</c>.
        /// </summary>
        /// <param name="msg">Text of the line to append.</param>
        private void Log(string msg)
        {
            if (!this.textBox1.InvokeRequired)
            {
                // Already on the thread that owns the text box: write directly.
                this.textBox1.AppendText(msg);
                this.textBox1.AppendText(Environment.NewLine);
                return;
            }

            // Replay this same call via Invoke; the replayed call takes the branch above.
            MethodInvoker replay = delegate { Log(msg); };
            this.Invoke(replay);
        }
        #endregion

        #region GetHttpSource
        // One shared generator for the politeness delay. Creating a new Random per call on
        // several threads at once can hand them identical time-based seeds (and therefore
        // identical delays). Random is not thread-safe, hence the lock object.
        private static readonly Random delayRandom = new Random();
        private static readonly object delayRandomLock = new object();

        /// <summary>
        /// Downloads the source of a web page after a short random pause.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
        /// <exception cref="WebException">Propagated from <c>WebClient.DownloadString</c> when the download fails.</exception>
        private string GetHttpSource(string url)
        {
            // Be gentle with the remote site: wait 100-499 ms before each request.
            int delayMs;
            lock (delayRandomLock)
            {
                delayMs = delayRandom.Next(100, 500);
            }
            Thread.Sleep(delayMs);

            // WebClient is IDisposable; release it as soon as the download completes.
            // NOTE(review): Encoding.Default is the machine's ANSI code page - presumably
            // chosen to match the site's encoding on the author's machine; confirm.
            using (var wc = new WebClient { Encoding = Encoding.Default })
            {
                // NOTE(review): Content-Type describes a request body, which this request
                // does not send; kept to leave the outgoing request unchanged.
                wc.Headers.Add("Content-Type", "application/x-www-form-urlencoded");
                return wc.DownloadString(url);
            }
        }
        #endregion
    }
}

  

 
 
下载

你可能感兴趣的:(网页爬虫【原创】【开源】)