信息采集

这两天写了一个采集新蛋网手机信息的小程序:网页数据通过 WebClient 类下载,再用正则表达式提取手机名称、原价和现价,并按页写入文本文件。使用时调用 Gather(startIndex, endIndex) 方法即可,希望能有所帮助。

代码如下:

/* 
 * Created By ChinaAgan 2012-1-18
 * 
 */
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;

using CnBlogCollector.Properties;

namespace CnBlogCollector
{
    /// <summary>
    /// Scrapes mobile-phone listings (name, original price, current price) from
    /// newegg.com.cn category pages and writes one text file per page.
    /// </summary>
    public class Collector
    {
        #region Fields
        // URL template of the category listing; {0} is replaced by the page index.
        private string cnblogMain = "http://www.newegg.com.cn/SubCategory/1043-{0}.htm";

        private WebClient wc = new WebClient();

        // The patterns never change, so they are built once instead of once per page.
        private static readonly Regex NameRegex = new Regex(
            @"<p\s+class=""info""><a\s+href=(?<url>.+?)\s+title=""(?<title>.+?)"">(?<content>.+?)</a>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        private static readonly Regex OldPriceRegex = new Regex(
            @"<p\s+class=""bypast""><span>¥(?<OldPrice>.+?)</span></p>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        private static readonly Regex NewPriceRegex = new Regex(
            @"<p\s+class=""current""><strong\s+class=""price""><span>¥</span>(?<NewPrice>\d+?\..+?)</strong></p>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);
        #endregion

        #region Directory helpers
        /// <summary>
        /// Ensures that the given directory exists, creating it when necessary.
        /// </summary>
        /// <param name="path">Relative or absolute directory path.</param>
        /// <returns>The full (absolute) form of <paramref name="path"/>.</returns>
        public string CreateFolderIfNot(string path)
        {
            string rtn = Path.GetFullPath(path);
            if (!Directory.Exists(rtn))
            {
                Directory.CreateDirectory(rtn);
            }
            return rtn;
        }
        #endregion

        #region Page scraping
        /// <summary>
        /// Downloads the listing pages with indexes from <paramref name="startIndex"/>
        /// (inclusive) up to <paramref name="endIndex"/> (exclusive), extracts the
        /// product name and prices from each page, and writes the result to
        /// "&lt;page index&gt;.txt" inside the folder configured in Settings.Default.OutPath.
        /// An existing file for the same page is overwritten.
        /// </summary>
        /// <param name="startIndex">Index of the first page to download.</param>
        /// <param name="endIndex">Index at which to stop; this page itself is not downloaded.</param>
        public void Gather(int startIndex, int endIndex)
        {
            // NOTE(review): proxy host and credentials are hard-coded placeholders;
            // they should come from configuration rather than source code.
            WebProxy webProxy = new WebProxy("proxy.cn1.global.***.com:8080");
            webProxy.Credentials = new System.Net.NetworkCredential("user", "password");
            wc.Proxy = webProxy;

            Encoding gb2312 = Encoding.GetEncoding("GB2312");

            for (int i = startIndex; i < endIndex; i++)
            {
                // Download the page, decode it as GB2312 and drop CRLF line breaks so
                // that each pattern can match across what were separate lines.
                string url = string.Format(cnblogMain, i.ToString());
                string mainData = gb2312.GetString(wc.DownloadData(url)).Replace("\r\n", "");

                List<string> nameList = ExtractGroup(NameRegex, mainData, "content");
                List<string> oldPriceList = ExtractGroup(OldPriceRegex, mainData, "OldPrice");
                List<string> newPriceList = ExtractGroup(NewPriceRegex, mainData, "NewPrice");

                StringBuilder outContent = new StringBuilder();
                for (int iLen = 0; iLen < nameList.Count; iLen++)
                {
                    // Names and prices are paired by position. A product without a price
                    // block makes a price list shorter than the name list, so missing
                    // entries are written as empty instead of throwing.
                    outContent.AppendFormat(
                        "手机名称:{0},原价:{1},现价:{2}",
                        nameList[iLen],
                        ValueOrEmpty(oldPriceList, iLen),
                        ValueOrEmpty(newPriceList, iLen));
                    outContent.Append("\r\n");
                }

                // TODO: post-process the current price and HTML entities such as "&32;".
                // Path.Combine inserts the directory separator when OutPath lacks one.
                string pth = Path.Combine(CreateFolderIfNot(Settings.Default.OutPath), i + ".txt");

                // WriteAllText creates or overwrites the file in one step.
                File.WriteAllText(pth, outContent.ToString(), gb2312);
            }
        }

        /// <summary>
        /// Returns the value of the named group for every match of
        /// <paramref name="regex"/> in <paramref name="input"/>, in match order.
        /// </summary>
        private static List<string> ExtractGroup(Regex regex, string input, string groupName)
        {
            List<string> values = new List<string>();
            foreach (Match match in regex.Matches(input))
            {
                values.Add(match.Groups[groupName].Value);
            }
            return values;
        }

        /// <summary>
        /// Returns the item at <paramref name="index"/>, or an empty string when the
        /// list has no item at that position.
        /// </summary>
        private static string ValueOrEmpty(List<string> values, int index)
        {
            return index < values.Count ? values[index] : String.Empty;
        }
        #endregion
    }
}

 

你可能感兴趣的:(信息采集)