【原创】C# 多线程采集工具(使用 HtmlAgilityPack 工具)

我们平时或多或少都需要采集一些网络上的信息。当然,采集的方法有很多种;为了更高效地采集数据,我们基本上都要使用多线程。内容采集下来之后,最关键的还是分析网页内容:可以使用正则表达式来分析,而今天我们采用 HtmlAgilityPack 类库来解析。

使用的工具类库包括:HtmlAgilityPack,以及苏飞的一个 HttpHelper 类,开发环境用的 VisualStudio 2008,.NetFramework 2.0,最终结果如图所示:

同时也看到几个最主要的类。这儿采用工厂模式,目的是让扩展更加容易一些。CollectorFactoryManager.cs 代码如下:

using System;
using System.Collections.Generic;

namespace CollectDemo
{
    /// <summary>
    /// Collector factory manager: owns the list of page-level collector factories
    /// and runs them with at most <c>initCount</c> in flight at a time.
    /// </summary>
    public class CollectorFactoryManager
    {
        // Upper bound on the number of factories started up front by Run().
        private const int initCount = 5;

        // Private gate; lock(this) would let outside code contend on the same monitor.
        private readonly object syncRoot = new object();

        private IList<CollectorFactory> factoryList;
        private Action callback;
        private int collectFactoryIndex;

        // Number of factories that have reported completion since the last Run().
        private int finishedFactoryCount;

        /// <summary>
        /// Creates the manager and registers the factories to run.
        /// </summary>
        /// <param name="callback">Invoked once when every factory has reported completion; may be null.</param>
        public CollectorFactoryManager(Action callback)
        {
            this.callback = callback;
            this.factoryList = new List<CollectorFactory>();
            // More factories can be appended here without limit.
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/", this.CollectorFactoryCalback));
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/sitehome/p/2", this.CollectorFactoryCalback));
        }

        /// <summary>
        /// Starts collecting. Launches up to <c>initCount</c> factories; the rest are
        /// started one at a time as running factories report completion.
        /// </summary>
        public void Run()
        {
            lock (this.syncRoot)
            {
                this.collectFactoryIndex = -1;
                this.finishedFactoryCount = 0;
            }

            // Nothing to collect: report the end right away instead of never reporting it.
            if (this.factoryList.Count == 0)
            {
                this.End();
                return;
            }

            // Threads are a limited resource, so only an initial batch is started here.
            for (int index = 0; index < initCount && index < this.factoryList.Count; index++)
            {
                this.CollectorFactoryData();
            }
        }

        /// <summary>
        /// Starts the next not-yet-started factory, if there is one.
        /// </summary>
        private void CollectorFactoryData()
        {
            CollectorFactory nextFactory = null;

            lock (this.syncRoot)
            {
                if (this.collectFactoryIndex + 1 < this.factoryList.Count)
                {
                    this.collectFactoryIndex++;
                    nextFactory = this.factoryList[this.collectFactoryIndex];
                }
            }

            // Run() sleeps before spawning its thread, so it is called outside the
            // lock to avoid blocking other completion callbacks for that long.
            if (nextFactory != null)
            {
                nextFactory.Run();
            }
        }

        /// <summary>
        /// Completion callback handed to each factory. Counts the completion and
        /// either finishes the whole run (when every factory has reported) or
        /// starts the next pending factory.
        /// </summary>
        public void CollectorFactoryCalback()
        {
            bool allFinished;

            lock (this.syncRoot)
            {
                this.finishedFactoryCount++;
                // Equality (not >=) so End() fires exactly once per run.
                allFinished = this.finishedFactoryCount == this.factoryList.Count;
            }

            if (allFinished)
            {
                this.End();
            }
            else
            {
                this.CollectorFactoryData();
            }
        }

        /// <summary>
        /// Signals that collecting has finished by invoking the callback, if any.
        /// </summary>
        public void End()
        {
            if (this.callback != null)
            {
                this.callback();
            }
        }
    }
}

CollectorFactory.cs 代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;

namespace CollectDemo
{
    /// <summary>
    /// Base class for a page-level collector. A worker thread fetches the page,
    /// parses it into a list of <see cref="CollectorItem"/> objects, and runs those
    /// items with at most <c>initCount</c> in flight. The completion callback is
    /// invoked once, after every item has reported back (or immediately when the
    /// page produced no items).
    /// </summary>
    public class CollectorFactory
    {
        // Upper bound on the number of items started up front.
        private const int initCount = 10;

        // One shared generator: separate time-seeded instances created close
        // together can produce identical delays. Random is not thread-safe, so
        // every access goes through lock(random).
        private static readonly Random random = new Random();

        // Private gate; lock(this) would let outside code contend on the same monitor.
        private readonly object syncRoot = new object();

        protected string htmlText;
        protected string urlPath;
        protected IList<CollectorItem> collectorItemList;
        protected Action callback;
        protected int collectItemIndex;

        // Number of items that have reported completion for the current page.
        private int finishedItemCount;

        /// <summary>
        /// Creates a collector for one page.
        /// </summary>
        /// <param name="urlPath">Address of the page to collect.</param>
        /// <param name="callback">Invoked when this factory has finished; may be null.</param>
        public CollectorFactory(string urlPath, Action callback)
        {
            this.urlPath = urlPath;
            this.callback = callback;
        }

        /// <summary>
        /// Starts collecting: sleeps 1000-2999 ms on the calling thread, then runs
        /// <see cref="Start"/> on a new thread.
        /// </summary>
        public virtual void Run()
        {
            // Random pause so the requests look less like a crawler.
            int sleepData;
            lock (random)
            {
                sleepData = random.Next(1000, 3000);
            }
            Thread.Sleep(sleepData);

            Thread thread = new Thread(new ThreadStart(this.Start));
            thread.Start();
        }

        /// <summary>
        /// Worker-thread entry point: fetch the page, parse it, then collect its items.
        /// </summary>
        protected virtual void Start()
        {
            try
            {
                this.CreateAndGetHtmlContent();
                this.AnalysisHtmlContent();
            }
            catch (Exception ex)
            {
                // An exception escaping a thread's entry point would take the process
                // down and the completion callback would never fire. Treat a failed
                // page as a page with no items instead.
                System.Diagnostics.Trace.TraceError("CollectorFactory failed for {0}: {1}", this.urlPath, ex);
                this.collectorItemList = null;
            }

            this.CollectorPageData();
        }

        /// <summary>
        /// Issues the request and stores the page content. Empty here; subclasses override it.
        /// </summary>
        protected virtual void CreateAndGetHtmlContent()
        {
        }

        /// <summary>
        /// Parses the fetched content into <c>collectorItemList</c>. Empty here; subclasses override it.
        /// </summary>
        protected virtual void AnalysisHtmlContent()
        {
        }

        /// <summary>
        /// Starts the initial batch of items, or ends immediately when there are none.
        /// </summary>
        protected virtual void CollectorPageData()
        {
            int itemCount = (this.collectorItemList == null) ? 0 : this.collectorItemList.Count;

            lock (this.syncRoot)
            {
                this.collectItemIndex = -1;
                this.finishedItemCount = 0;
            }

            // No items means no item callback will ever arrive, so finish here;
            // otherwise the owner would wait forever for this factory.
            if (itemCount == 0)
            {
                this.End();
                return;
            }

            for (int index = 0; index < initCount && index < itemCount; index++)
            {
                this.CollectorItemData();
            }
        }

        /// <summary>
        /// Starts the next not-yet-started item, if there is one.
        /// </summary>
        public virtual void CollectorItemData()
        {
            CollectorItem nextItem = null;

            lock (this.syncRoot)
            {
                if (this.collectorItemList != null
                    && this.collectItemIndex + 1 < this.collectorItemList.Count)
                {
                    this.collectItemIndex++;
                    nextItem = this.collectorItemList[this.collectItemIndex];
                }
            }

            // CollectorItem.Run() sleeps before spawning its thread, so it is called
            // outside the lock to keep other item callbacks from queueing behind it.
            if (nextItem != null)
            {
                nextItem.Run();
            }
        }

        /// <summary>
        /// Completion callback handed to each item. Counts the completion and either
        /// ends this factory (when every item has reported) or starts the next item.
        /// </summary>
        public void CollectorItemCalback()
        {
            bool allFinished;

            lock (this.syncRoot)
            {
                this.finishedItemCount++;
                // Equality (not >=) so End() fires exactly once per page.
                allFinished = this.collectorItemList != null
                    && this.finishedItemCount == this.collectorItemList.Count;
            }

            if (allFinished)
            {
                this.End();
            }
            else
            {
                this.CollectorItemData();
            }
        }

        /// <summary>
        /// Signals that this factory has finished by invoking the callback, if any.
        /// </summary>
        public virtual void End()
        {
            if (this.callback != null)
            {
                this.callback();
            }
        }
    }
}
CollectorItem.cs 代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;

namespace CollectDemo
{
    /// <summary>
    /// Base class for a single collectable item (one URL). <see cref="Run"/> pauses,
    /// then fetches and parses the item on its own thread. Subclasses signal
    /// completion by calling <see cref="End"/>; if a step throws, this class
    /// signals completion on their behalf.
    /// </summary>
    public class CollectorItem
    {
        // One shared generator: separate time-seeded instances created close
        // together can produce identical delays. Random is not thread-safe, so
        // every access goes through lock(random).
        private static readonly Random random = new Random();

        protected string htmlText;
        protected CollectorFactory collectorFactory;
        protected string urlPath;
        protected Action callback;

        // 0 = completion not yet signalled for the current run, 1 = already signalled.
        private int ended;

        /// <summary>
        /// Creates an item.
        /// </summary>
        /// <param name="collectorFactory">The factory that owns this item.</param>
        /// <param name="urlPath">Address of the item to collect.</param>
        /// <param name="callback">Invoked when this item has finished; may be null.</param>
        public CollectorItem(CollectorFactory collectorFactory, string urlPath, Action callback)
        {
            this.collectorFactory = collectorFactory;
            this.urlPath = urlPath;
            this.callback = callback;
        }

        /// <summary>
        /// Starts collecting: sleeps 2000-5999 ms on the calling thread, then runs
        /// <see cref="Start"/> on a new thread.
        /// </summary>
        public void Run()
        {
            // Re-arm the completion flag so a re-run item can signal again.
            Interlocked.Exchange(ref this.ended, 0);

            // Random pause so the requests look less like a crawler.
            int sleepData;
            lock (random)
            {
                sleepData = random.Next(2000, 6000);
            }
            Thread.Sleep(sleepData);

            Thread thread = new Thread(new ThreadStart(this.Start));
            thread.Start();
        }

        /// <summary>
        /// Worker-thread entry point: fetch the content, then parse it.
        /// </summary>
        protected virtual void Start()
        {
            try
            {
                this.CreateAndGetHtmlContent();
                this.AnalysisHtmlContent();
            }
            catch (Exception ex)
            {
                // An exception escaping a thread's entry point would take the process
                // down, and the owner would never hear back from this item. Log it and
                // report the item as finished so the remaining items keep flowing.
                System.Diagnostics.Trace.TraceError("CollectorItem failed for {0}: {1}", this.urlPath, ex);
                this.End();
            }
        }

        /// <summary>
        /// Issues the request and stores the content. Empty here; subclasses override it.
        /// </summary>
        protected virtual void CreateAndGetHtmlContent()
        {
        }

        /// <summary>
        /// Parses the fetched content. Empty here; subclasses override it.
        /// </summary>
        protected virtual void AnalysisHtmlContent()
        {
        }

        /// <summary>
        /// Signals that this item has finished by invoking the callback, if any.
        /// Only the first call per run has an effect, so the failure path in
        /// <see cref="Start"/> cannot double-report an item that already ended.
        /// </summary>
        public virtual void End()
        {
            if (Interlocked.Exchange(ref this.ended, 1) != 0)
            {
                return;
            }

            if (this.callback != null)
            {
                this.callback();
            }
        }
    }
}
本例子采集的是博客园的前两页数据,所以我们需要一个解析两页数据链接的 CollectorFactoryOne.cs 类,代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;

namespace CollectDemo
{
    /// <summary>
    /// Collector for a listing page: downloads the page and creates one
    /// <see cref="CollectorItemOne"/> for every anchor with class <c>titlelnk</c>.
    /// </summary>
    public class CollectorFactoryOne : CollectorFactory
    {
        /// <summary>
        /// Creates a collector for one listing page.
        /// </summary>
        /// <param name="urlPath">Address of the listing page.</param>
        /// <param name="callback">Invoked when this factory has finished; may be null.</param>
        public CollectorFactoryOne(string urlPath, Action callback) : base(urlPath, callback)
        {
        }

        /// <summary>
        /// Requests the listing page and stores the returned HTML in <c>htmlText</c>.
        /// </summary>
        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);

            this.htmlText = httpResult.Html;
        }

        /// <summary>
        /// Parses the stored HTML and fills <c>collectorItemList</c> with one item per
        /// article link. The list is left empty when there is no HTML or no match.
        /// </summary>
        protected override void AnalysisHtmlContent()
        {
            this.collectorItemList = new List<CollectorItem>();

            // LoadHtml rejects null; with nothing fetched there is nothing to parse.
            if (string.IsNullOrEmpty(this.htmlText))
            {
                return;
            }

            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(this.htmlText);

            HtmlNodeCollection hrefList = htmlDocument.DocumentNode.SelectNodes("//a[@class = 'titlelnk']");
            if (hrefList == null)
            {
                return;
            }

            foreach (HtmlNode hrefNode in hrefList)
            {
                // The attribute indexer yields null when the anchor has no href.
                HtmlAttribute htmlAttribute = hrefNode.Attributes["href"];
                if (htmlAttribute == null || string.IsNullOrEmpty(htmlAttribute.Value))
                {
                    continue;
                }

                this.collectorItemList.Add(new CollectorItemOne(this, htmlAttribute.Value, this.CollectorItemCalback));
            }
        }
    }
}
还有一个解析博客园每页内容的 CollectorItemOne.cs 类,代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;
using System.IO;

namespace CollectDemo
{
    /// <summary>
    /// Collector for a single article page: downloads the page and saves its HTML
    /// to a uniquely named <c>.txt</c> file under the application's <c>txt</c> folder.
    /// </summary>
    public class CollectorItemOne : CollectorItem
    {
        /// <summary>
        /// Creates an item for one article page.
        /// </summary>
        /// <param name="collectorFactory">The factory that owns this item.</param>
        /// <param name="urlPath">Address of the article page.</param>
        /// <param name="callback">Invoked when this item has finished; may be null.</param>
        public CollectorItemOne(CollectorFactory collectorFactory, string urlPath, Action callback)
            : base(collectorFactory, urlPath, callback)
        {
        }

        /// <summary>
        /// Requests the article page and stores the returned HTML in <c>htmlText</c>.
        /// </summary>
        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
            this.htmlText = httpResult.Html;
        }

        /// <summary>
        /// Parses the stored HTML and writes it to disk as UTF-8. Always signals
        /// completion through <c>End()</c>, whether or not the page could be saved.
        /// </summary>
        protected override void AnalysisHtmlContent()
        {
            // No lock is taken: everything touched here is local to this item.
            try
            {
                // LoadHtml rejects null; with nothing fetched there is nothing to save.
                if (string.IsNullOrEmpty(this.htmlText))
                {
                    return;
                }

                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(this.htmlText);

                // Make sure the output folder exists before writing into it.
                string directoryPath = Path.Combine(System.Windows.Forms.Application.StartupPath, "txt");
                Directory.CreateDirectory(directoryPath);

                // A fresh GUID names the file, so an existing file is never overwritten.
                string filePath = Path.Combine(directoryPath, System.Guid.NewGuid() + ".txt");

                // Disposing the writer flushes and closes it.
                using (StreamWriter streamWriter = new StreamWriter(filePath, false, System.Text.Encoding.UTF8))
                {
                    streamWriter.Write(htmlDocument.DocumentNode.InnerHtml);
                }
            }
            catch (Exception ex)
            {
                // Saving is best-effort: record the failure instead of hiding it, and
                // keep the exception from escaping the worker thread.
                System.Diagnostics.Trace.TraceError("CollectorItemOne failed for {0}: {1}", this.urlPath, ex);
            }
            finally
            {
                // Completion must be reported on every path; the owning factory only
                // moves on when each item has called back.
                this.End();
            }
        }
    }
}

主要的多线程操作都已经封装好,只需要处理采集以及解析网页内容就可以实现快速扩展了。

你可能感兴趣的:(.net)