《程序员的第一年》---------- 【抓取网页数据】定时查寻淘宝搜索结果并用excel记录下来(HttpWebRequest与正则等的使用)


最近在看一些关于HttpWebRequest与正则来抓取网页数据的知识,就做了一个搜索淘宝电商网站关键词(例如,1手机,2手机配件,3手机外壳....),

    /// 把这些关键词的结果数量,在Excel中输出

功能很简单高手路过别喷

不过查寻的速度有点慢,有哪位高手能指点一下嘛?

 


 

界面如下:


《程序员的第一年》---------- 【抓取网页数据】定时查寻淘宝搜索结果并用excel记录下来(HttpWebRequest与正则等的使用)_第1张图片




代码如下:


using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Text.RegularExpressions;

namespace 自动搜索记录
{
    public partial class Main : Form
    {
        public Main()
        {
            InitializeComponent();
        }

        private void btnAdd_Click(object sender, EventArgs e)
        {
            if (!string.IsNullOrEmpty(txtKeyWord.Text))
            {
                MyItem item = new MyItem();
                item.id = (lboxKeyWord.Items.Count+1).ToString();
                item.keyWord=txtKeyWord.Text.ToString();
                lboxKeyWord.Items.Add(item);
            //    MessageBox.Show("添加成功!");
            }
            else
            {
                MessageBox.Show("关键字不能为空!");
            }
        }

        private void button1_Click(object sender, EventArgs e)
        {
            lboxResultShow.Items.Clear();
            foreach (MyItem item in lboxKeyWord.Items)
            {
                lboxResultShow.Items.Add(GetFindResult(item.keyWord));
            }
        }

        /// 
        /// 取得查寻结果
        /// 
        /// 关键字
        private static string GetFindResult(string keyWord)
        {
            keyWord = System.Web.HttpUtility.UrlEncode(keyWord, System.Text.UnicodeEncoding.GetEncoding("GB2312")).ToUpper();
            // System.Diagnostics.Process.Start("http://s.taobao.com/search?q=" + keyWord);//弹出网页 
            comm comm = new comm();
            string url = "http://s.taobao.com/search?q=" + keyWord;//根据要求更改网址
            string html = comm.NoLoginGetHtml(url);//取得当前页HTML
            if (string.IsNullOrEmpty(html))
            {
                Console.WriteLine("查找有误");
                return "";
            }
            string resultInfo = comm.GetElementsByClass(html, "result-info")[0].ToString();
             
  • 3180078件宝贝
  • string regex = "
  • (?[^<]*)件宝贝
  • "; Regex rege = new Regex(regex); int total = Convert.ToInt32(rege.Match(resultInfo).Groups["total"].ToString()); keyWord = System.Web.HttpUtility.UrlDecode(keyWord, System.Text.UnicodeEncoding.GetEncoding("GB2312")); return keyWord + "---总共有:" + total + "个查寻结果"; // Console.WriteLine(keyWord + "---" + total + "个结果"); } private bool isCloseTime; private void btnFindByTime_Click(object sender, EventArgs e) { System.Timers.Timer t = new System.Timers.Timer(1000 * Convert.ToInt32(txtTime.Text));//实例化Timer类,设置间隔时间为5000毫秒; if (btnFindByTime.Text == "关闭定时") { isCloseTime = true; btnFindByTime.Text = "定时查寻"; } else { isCloseTime = false; this.btnAdd.Enabled = false; t.Elapsed += new System.Timers.ElapsedEventHandler(theout);//到达时间的时候执行事件; t.AutoReset = true;//设置是执行一次(false)还是一直执行(true); t.Enabled = true;//是否执行System.Timers.Timer.Elapsed事件; Control.CheckForIllegalCrossThreadCalls = false; btnFindByTime.Text = "关闭定时"; } } /// /// 定时查寻事件 /// public void theout(object source, System.Timers.ElapsedEventArgs e) { if ( isCloseTime) { System.Timers.Timer timer = (System.Timers.Timer)source; timer.Enabled = false; this.btnAdd.Enabled = true; } else { lboxResultShow.Items.Clear(); foreach (MyItem item in lboxKeyWord.Items) { lboxResultShow.Items.Add(GetFindResult(item.keyWord)); } } } private void btnSaveExcel_Click(object sender, EventArgs e) { //建立excel对象 Microsoft.Office.Interop.Excel.Application excel = new Microsoft.Office.Interop.Excel.Application(); excel.Application.Workbooks.Add(true); excel.Cells[1, 1] = "关键字"; excel.Cells[1, 2] = "查寻结果"; //填充数据 for (int i = 0; i < lboxResultShow.Items.Count; i++) { excel.Cells[i + 2, 1] = "" +this.lboxKeyWord.Items[i].ToString(); excel.Cells[i + 2, 2] = lboxResultShow.Items[i].ToString(); } excel.Visible = true; } } public class MyItem : object { public string keyWord; public string id; public override string ToString() { // TODO: 添加 MyItem.ToString 实现 return keyWord; } } } using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Drawing; using System.IO; using System.Net; using System.Text.RegularExpressions; namespace 自动搜索记录 { public class comm { /// /// 根据url取得图片 /// /// /// private Image GetImageByUrl(string url) { HttpWebRequest req = (HttpWebRequest)WebRequest.Create(new Uri(url)); HttpWebResponse res = (HttpWebResponse)req.GetResponse();//获取服务器返回的资源 Stream stream = res.GetResponseStream(); Bitmap bitmap = new Bitmap(stream); stream.Close(); return bitmap; } /// /// 取得HTML中所有图片的 URL。 /// /// HTML代码 /// 图片的URL列表 public string[] GetHtmlImageUrlList(string sHtmlText) { // 定义正则表达式用来匹配 img 标签 Regex regImg = new Regex(@"]*?\bsrc[\s]*=[\s]*[""']?[\s]*(?[^\s""'<>]*)[^<>]*?/?[\s]*>", RegexOptions.IgnoreCase); // 搜索匹配的字符串 MatchCollection matches = regImg.Matches(sHtmlText); int i = 0; string[] sUrlList = new string[matches.Count]; // 取得匹配项列表 foreach (Match match in matches) { string imgUrl = match.Groups["imgUrl"].Value; sUrlList[i++] = imgUrl; } return sUrlList; } /// /// 保存图片 /// /// public void SaveImage(string url, string path) { Image img = GetImageByUrl(url); if (!Directory.Exists(Path.GetDirectoryName(path))) { Directory.CreateDirectory(Path.GetDirectoryName(path)); } //img.Save(GetNameByTime()+".jpg"); img.Save(path); } /// /// 打开资源管理器并指定保存的文件 /// /// public void OpenExplorerFile(string Path) { System.Diagnostics.ProcessStartInfo psi = new System.Diagnostics.ProcessStartInfo("Explorer.exe"); psi.Arguments = "/e,/select," + Path; System.Diagnostics.Process.Start(psi); } /// /// 获取指定ID的标签内容 /// /// HTML源码 /// 标签ID /// public string GetElementById(string html, string id) { string pattern = @"<([a-z]+)(?:(?!id)[^<>])*id=([""']?){0}\2[^>]*>(?>(?<\1[^>]*>)|(?<-o>)|(?:(?!"; pattern = string.Format(pattern, Regex.Escape(id)); Match match = Regex.Match(html, pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase); return match.Success ? match.Value : ""; } /// /// 通过class属性获取对应标签集合 /// /// HTML源码 /// class值 /// public string[] GetElementsByClass(string html, string className) { return GetElements(html, "", className); } /// /// 根据正则获取内容 /// /// 内容 /// 正则 public string[] GetListByHtml(string text, string pat) { List list = new List(); Regex r = new Regex(pat, System.Text.RegularExpressions.RegexOptions.IgnoreCase); Match m = r.Match(text); //int matchCount = 0; while (m.Success) { list.Add(m.Value); m = m.NextMatch(); } return list.ToArray(); } /// /// 通过标签名获取标签集合 /// /// HTML源码 /// 标签名(如div) /// public string[] GetElementsByTagName(string html, string tagName) { return GetElements(html, tagName, ""); } /// /// 通过同时指定标签名+class值获取标签集合(内部方法) /// /// /// /// /// private string[] GetElements(string html, string tagName, string className) { string pattern = ""; if (tagName != "" && className != "") { pattern = @"<({0})(?:(?!class)[^<>])*class=([""']?){1}\2[^>]*>(?>(?<\1[^>]*>)|(?<-o>)|(?:(?!"; pattern = string.Format(pattern, Regex.Escape(tagName), Regex.Escape(className)); } else if (tagName != "") { pattern = @"<({0})(?:[^<>])*>(?>(?<\1[^>]*>)|(?<-o>)|(?:(?!"; pattern = string.Format(pattern, Regex.Escape(tagName)); } else if (className != "") { pattern = @"<([a-z]+)(?:(?!class)[^<>])*class=([""']?){0}\2[^>]*>(?>(?<\1[^>]*>)|(?<-o>)|(?:(?!"; pattern = string.Format(pattern, Regex.Escape(className)); } if (pattern == "") { return new string[] { }; } List list = new List(); Regex reg = new Regex(pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase); Match match = reg.Match(html); while (match.Success) { list.Add(match.Value); match = reg.Match(html, match.Index + match.Length); } return list.ToArray(); } /// /// 无需登录 直接查看网页源码 /// /// 被查看的地址 /// public string NoLoginGetHtml(string Url) { try { string URI = "http://"; if (Url.IndexOf("http://") >= 0 || Url.IndexOf("https://") >= 0) { URI = Url; } else { URI += Url; } HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URI); request.Method = "GET"; request.UserAgent = ".NET Framework Test Client"; request.KeepAlive = false; //声明一个HttpWebRequest请求 request.Timeout = 300000; //设置连接超时时间 request.Headers.Set("Pragma", "no-cache"); // 接收返回的页面 HttpWebResponse response = (HttpWebResponse)request.GetResponse(); //System.Threading.Thread.Sleep(5000); Stream responseStream = response.GetResponseStream(); StreamReader reader = new System.IO.StreamReader(responseStream, Encoding.GetEncoding("GB2312")); return reader.ReadToEnd(); } catch { return ""; } } } }




    转载请注明来处(Taomeng)









    你可能感兴趣的:(技术日志,程序员的第一年,程序人生)