// Collector.cs


using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Net;
using System.IO;
using System.Data;
using System.Data.SqlClient;

using CnBlogCollector.Properties;

namespace CnBlogCollector
{
    /// <summary>
    /// 数据采集类
    /// </summary>
    public class Collector
    {
       #region 变量
        private string cnblogMain = "http://www.cnblogs.com/p{0}";//cnblog首页地址
        private WebClient wc = new WebClient();
        #endregion


       #region 创建目录
        /// <summary>
        /// 判断目录是否存在,若不存在则创建该目录
        /// </summary>
        /// <param name="path"></param>
        /// <returns></returns>
        public string CreateFolderIfNot(string path)
        {
            //获取该目录的完整路径
            string rtn = Path.GetFullPath(path);
            //若该目录不存在
            if (!Directory.Exists(rtn))
            {
                //创建该目录
                Directory.CreateDirectory(rtn);
            }
            return rtn;
        }
        #endregion

       #region 采集网页数据
       public void Gather(int startIndex, int endIndex)
       {

           SqlConnection con = new SqlConnection(@"Data Source=.;Initial Catalog=WordProject;User ID=misp2;Password=misp2;");
           con.Open();
           //根据startIndex和endIndex来遍历cnblog首页上文章
           for (int i = startIndex; i < endIndex; i++)
           {
               //从cnblog首页下载页面数据并将其转换成UTF8编码格式的STRING
               string mainData = Encoding.UTF8.GetString(wc.DownloadData(string.Format(cnblogMain, i.ToString())));


               int j = 1;
               //二次遍历抓取cnblog首页面上的文章链接,并顺着这些链接进入文章页面采集数据
               while (mainData.IndexOf("<a class=\"titlelnk\" href=\"") >= 0)
               {
                   try
                   {
                       mainData = mainData.Substring(mainData.IndexOf("<a class=\"titlelnk\" href=\"") + 26);

                       //获取文章页面的链接地址
                       string articleAddr = mainData.Substring(0, mainData.IndexOf("\""));

                       //获取文章标题
                       string articleTitle = mainData.Substring(mainData.IndexOf("target=\"_blank\">") + 16,
                                                                mainData.IndexOf("</a>") - mainData.IndexOf("target=\"_blank\">") - 16);

                       //下载文章页面数据
                       string articleData = Encoding.UTF8.GetString(wc.DownloadData(string.Format(articleAddr, i.ToString())));

                       //截取文章内容HTML
                       articleData = articleData.Substring(articleData.IndexOf("<div id=\"cnblogs_post_body\">") + 28);

                       articleData = articleData.Substring(0, articleData.IndexOf("if ($ != jQuery) {") - 33);

                       articleData = "\r\n"+articleTitle + "\r\n\r\n" +"\r\n"+articleData;

                       //输出数据到本地文件
                       //string pth = CreateFolderIfNot(Settings.Default.OutPath) + i + "_" + j + ".txt";
                       //if (!File.Exists(pth))
                       //    File.AppendAllText(pth,
                       //                       articleData,
                       //                       Encoding.UTF8);




                       string sqlO = @"insert into [Word_Content2](Title,Content) values ('" + articleTitle + "','" + articleData + "')";
                       SqlCommand cmd2 = new SqlCommand(sqlO, con);
                       cmd2.ExecuteNonQuery();
                       


                       j++;
                   }
                   catch (Exception ex)
                   {
                   }
               }
           }
           con.Close();
       }
       #endregion

    }
}

// You may also be interested in: (Collector)