最近,女友的妹妹要去网上找房产中介人信息用于招聘,自己去网上一个一个找太慢。我女友知道我是搞IT的,就问我有没有办法帮她快速找到电话号码,于是我想到了爬虫程序。但普通的爬虫代码有限制,刚好自己在搞自动化测试,就想到用WebDriver自动化测试工具编写一个爬虫工具,抓取赶集网上的经纪人信息。
自己一直用Java编写自动化测试脚本,但是对于不会编程的人来说,没有界面操作很不方便,于是我想到编写一个桌面程序。但是Java的GUI不够美观,最后想到了微软的C#,于是自学VS2010和.NET,用C#结合WebDriver的C#版本编写了一个小工具。
支持火狐浏览器和PhantomJS无界面(内存)浏览器进行抓取。
将数据查询出放在界面上
支持导出excel文件
基本的实现原理是:先计算出总页数,然后循环逐页抓取数据;抓取到的数据先临时存储在List数据类型中,再存放到界面上的数据控件里;点击导出Excel的时候,把数据控件中的数据转换成List,再导出到Excel。
界面如下:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading;
using System.Timers;
using System.Windows.Forms;
using Excel;
using Microsoft.Office;
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.Firefox;
using OpenQA.Selenium.IE;
using OpenQA.Selenium.Interactions;
using OpenQA.Selenium.Interactions.Internal;
using OpenQA.Selenium.PhantomJS;
using OpenQA.Selenium.Remote;
using OpenQA.Selenium.Support;
using ExcelInterop = Microsoft.Office.Interop.Excel;

namespace tel_search
{
    /// <summary>
    /// Main window of the agent-card scraper: drives a browser through WebDriver to collect
    /// broker cards from the ganji.com agent listing of the selected area, shows them in
    /// <c>listView1</c> and exports the list view to an Excel workbook.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Download link offered when Firefox cannot be started.</summary>
        private const string FirefoxDownloadUrl = "http://download.firefox.com.cn/releases/stub/official/zh-CN/Firefox-latest.exe";

        /// <summary>Base address of the agent listing; the area slug is appended to it.</summary>
        private const string AgentListBaseUrl = "http://sz.ganji.com/zufang/agent/";

        /// <summary>Numbered pager links (everything in the pager except "next"/"prev").</summary>
        private const string PageLinkXPath = "//div[@class='pageBox']/ul[@class='pageLink clearfix']/li/a[not(@class='next'or@class='prev')]";

        /// <summary>The "next page" button of the pager.</summary>
        private const string NextButtonXPath = "//div[@class='pageBox']/ul[@class='pageLink clearfix']/li/a[@class='next']/span";

        /// <summary>One element per broker card on a listing page.</summary>
        private const string CardXPath = "//div[@class='listBox']/ul/li/div[@class='list-mod2']";

        /// <summary>
        /// Highest pager-link index that is clicked while walking the pages.
        /// NOTE(review): presumably the pager window slides so that the following page always
        /// sits at this index once it has been reached - confirm against the live site.
        /// </summary>
        private const int MaxPagerLinkIndex = 5;

        /// <summary>Number of leading characters (the field label) dropped from each service line.</summary>
        private const int ServiceLabelLength = 6;

        /// <summary>Maps the area names shown in <c>listBox1</c> to the URL segment used by the site.</summary>
        private static readonly Dictionary<string, string> AreaSlugs = new Dictionary<string, string>
        {
            { "全深圳", "" },
            { "福田", "futian" },
            { "罗湖", "luohu" },
            { "南山", "nanshan" },
            { "宝安", "baoan" },
            { "龙岗", "longgang" },
            { "盐田", "yantian" },
            { "龙华新区", "longhuaxinqu" },
            { "光明新区", "guangmingxinqu" },
            { "坪山新区", "pingshanxinqu" },
            { "大鹏新区", "dapengxinqu" },
            { "深圳周边", "shenzhenzhoubian" }
        };

        public Form1()
        {
            InitializeComponent();
        }

        private void label1_Click(object sender, EventArgs e)
        {
        }

        private void label2_Click(object sender, EventArgs e)
        {
        }

        private void label3_Click(object sender, EventArgs e)
        {
        }

        /// <summary>Selects the PhantomJS browser option by default when the form loads.</summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            radioButton_phatom.Checked = true;
        }

        /// <summary>
        /// Runs one scrape for the area selected in <c>listBox1</c> and shows the result.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the scrape runs synchronously inside this click handler, so the window
        /// does not repaint until it finishes; moving it to <c>backgroundWorker1</c> would need
        /// marshalling of the UI updates and is left as a follow-up.
        /// </remarks>
        private void button1_Click(object sender, EventArgs e)
        {
            if (listBox1.SelectedItems.Count == 0)
            {
                MessageBox.Show("请选择地区", "温馨提示", MessageBoxButtons.OK);
                listBox1.Focus();
                return;
            }

            string areaName = listBox1.SelectedItem.ToString();
            MessageBox.Show("数据比较多,可能需要几分钟时间抓取,请耐心等待!运行过程中,请不要关闭浏览器窗口,否则程序会报错!", "温馨提示", MessageBoxButtons.OK);

            var cards = new List<string[]>();
            int pageCount = 0;
            WebDriverException failure = null;
            IWebDriver driver = null;

            // A stopwatch local to this run: a shared tick counter would carry the previous
            // run's time into the next one.
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();
            try
            {
                driver = CreateDriver();
                if (driver == null)
                {
                    // The user has already been told why no browser could be started.
                    return;
                }

                driver.Manage().Window.Maximize();
                string listUrl = BuildAgentListUrl(areaName);
                try
                {
                    driver.Navigate().GoToUrl(listUrl);
                }
                catch (WebDriverException)
                {
                    // Close the browser before blocking on the dialog.
                    QuitDriver(driver);
                    driver = null;
                    MessageBox.Show("打不开网页,您的网络可能有问题,请检查网络连接是否正常!", "温馨提示");
                    return;
                }

                pageCount = CountPages(driver);

                // Counting walks to the last page, so go back to the first one before scraping.
                driver.Navigate().GoToUrl(listUrl);
                ScrapeCards(driver, areaName, pageCount, cards);
            }
            catch (WebDriverException ex)
            {
                // Reported after the browser has been closed; cards collected so far are kept.
                failure = ex;
            }
            finally
            {
                stopwatch.Stop();
                QuitDriver(driver);
            }

            if (failure != null)
            {
                if (cards.Count > 0)
                {
                    ShowResults(cards);
                }

                MessageBox.Show("抓取过程中出错,出错前已抓取" + cards.Count + "条名片信息。错误信息:" + failure.Message, "温馨提示");
                return;
            }

            ShowResults(cards);
            time_lbl.Visible = true;
            time_lbl.Text = FormatElapsed(stopwatch.Elapsed);
            MessageBox.Show("谢谢你的耐心等待" + "成功抓取" + pageCount + "页" + cards.Count + "条名片信息,用时:" + time_lbl.Text, "恭喜");
        }

        /// <summary>
        /// Starts the browser chosen by the radio buttons.
        /// </summary>
        /// <returns>
        /// The started driver, or <c>null</c> when Firefox could not be started and the user
        /// has been offered the download link.
        /// </returns>
        private IWebDriver CreateDriver()
        {
            if (!radioButton_firefox.Checked)
            {
                // PhantomJS is the option pre-selected in Form1_Load, so it is also the
                // fallback when neither radio button is checked.
                return new PhantomJSDriver();
            }

            try
            {
                var profile = new FirefoxProfile();
                profile.SetPreference("browser.bookmarks.restore_default_bookmarks", false);
                return new FirefoxDriver(profile);
            }
            catch (Exception)
            {
                // Any start-up failure is treated as "Firefox is not installed",
                // exactly as the prompt below tells the user.
                DialogResult answer = MessageBox.Show("你的电脑没有安装火狐浏览器,是否立即下载", "温馨提示", MessageBoxButtons.YesNo);
                if (answer == DialogResult.Yes)
                {
                    listBox1.Focus();
                    System.Diagnostics.Process.Start(FirefoxDownloadUrl);
                }

                return null;
            }
        }

        /// <summary>Quits the browser, tolerating a session that has already gone away.</summary>
        /// <param name="driver">Driver to quit; may be <c>null</c>.</param>
        private static void QuitDriver(IWebDriver driver)
        {
            if (driver == null)
            {
                return;
            }

            try
            {
                driver.Quit();
            }
            catch (WebDriverException)
            {
                // The browser was closed by the user or crashed; there is nothing left to quit.
            }
        }

        /// <summary>Builds the listing URL for an area name taken from <c>listBox1</c>.</summary>
        /// <param name="areaName">Display name of the area.</param>
        /// <returns>Base URL + slug + "/"; unknown names use an empty slug, like "全深圳".</returns>
        private static string BuildAgentListUrl(string areaName)
        {
            string slug;
            if (areaName == null || !AreaSlugs.TryGetValue(areaName, out slug))
            {
                slug = string.Empty;
            }

            return AgentListBaseUrl + slug + "/";
        }

        /// <summary>
        /// Determines the number of listing pages by repeatedly jumping to the last numbered
        /// pager link until no "next" button is left.
        /// </summary>
        /// <param name="driver">Driver currently showing the first listing page.</param>
        /// <returns>The highest page number seen; 1 when the page has no pager.</returns>
        private static int CountPages(IWebDriver driver)
        {
            int highest = 0;
            while (true)
            {
                var links = driver.FindElements(By.XPath(PageLinkXPath));
                if (links.Count == 0)
                {
                    break;
                }

                // Read the label BEFORE clicking: the click reloads the page and would leave
                // a stale element behind.
                IWebElement lastLink = links[links.Count - 1];
                int number;
                bool advanced = int.TryParse(lastLink.Text.Trim(), out number) && number > highest;
                if (advanced)
                {
                    highest = number;
                }

                // Note: FindElements returns an empty collection, never null, when nothing matches.
                bool hasNext = driver.FindElements(By.XPath(NextButtonXPath)).Count != 0;
                if (!hasNext || !advanced)
                {
                    // Either the last page is reached, or the pager stopped advancing
                    // (guards against an endless loop).
                    break;
                }

                lastLink.Click();
            }

            return Math.Max(highest, 1);
        }

        /// <summary>
        /// Walks the listing pages and appends one row per broker card to <paramref name="cards"/>.
        /// </summary>
        /// <param name="driver">Driver currently showing the first listing page.</param>
        /// <param name="areaName">Area name stored in the first field of every row.</param>
        /// <param name="pageCount">Number of pages to visit.</param>
        /// <param name="cards">
        /// Receives the rows. It is filled in place so that rows collected before a WebDriver
        /// failure remain available to the caller.
        /// </param>
        private static void ScrapeCards(IWebDriver driver, string areaName, int pageCount, List<string[]> cards)
        {
            driver.Manage().Timeouts().ImplicitlyWait(new TimeSpan(0, 0, 1));

            for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
            {
                // The pager has to be located again after every click.
                var links = driver.FindElements(By.XPath(PageLinkXPath));
                int linkIndex = Math.Min(pageIndex, MaxPagerLinkIndex);
                string pageText;
                if (linkIndex < links.Count)
                {
                    IWebElement link = links[linkIndex];
                    pageText = link.Text;
                    link.Click();
                }
                else if (pageIndex == 0)
                {
                    // No pager at all: the listing is a single page that is already displayed.
                    pageText = "1";
                }
                else
                {
                    // The pager is shorter than expected; stop instead of indexing past its end.
                    break;
                }

                foreach (IWebElement card in driver.FindElements(By.XPath(CardXPath)))
                {
                    cards.Add(ReadCard(card, areaName, pageText));
                }
            }
        }

        /// <summary>Reads one broker card into a 7-field row.</summary>
        /// <param name="card">The card element.</param>
        /// <param name="areaName">Area name the scrape was started for.</param>
        /// <param name="pageText">Label of the pager link the card was found under.</param>
        /// <returns>
        /// Area, name, telephone, the three service lines (each with its label stripped),
        /// and the page label - in that order.
        /// </returns>
        private static string[] ReadCard(IWebElement card, string areaName, string pageText)
        {
            string name = card.FindElement(By.XPath("div[@class='broker-info']/span[@class='broker-name']/a")).Text;
            string tel = card.FindElement(By.XPath("div[@class='broker-info']/span[@class='broker-tel']")).Text;
            var serviceLines = card.FindElement(By.XPath("div[@class='broker-service']")).FindElements(By.TagName("p"));

            return new[]
            {
                areaName,
                name,
                tel,
                StripLabel(TextAt(serviceLines, 0)),
                StripLabel(TextAt(serviceLines, 1)),
                StripLabel(TextAt(serviceLines, 2)),
                pageText
            };
        }

        /// <summary>Returns the text of the element at <paramref name="index"/>, or "" when there is none.</summary>
        private static string TextAt(IList<IWebElement> elements, int index)
        {
            return index < elements.Count ? elements[index].Text : string.Empty;
        }

        /// <summary>
        /// Drops the leading field label (the first <see cref="ServiceLabelLength"/> characters)
        /// from a service line; text that is not longer than the label yields "".
        /// </summary>
        private static string StripLabel(string text)
        {
            if (text == null || text.Length <= ServiceLabelLength)
            {
                return string.Empty;
            }

            return text.Substring(ServiceLabelLength);
        }

        /// <summary>Replaces the content of <c>listView1</c> with the given rows, numbered from 1.</summary>
        /// <param name="cards">Rows as produced by <see cref="ReadCard"/>.</param>
        private void ShowResults(List<string[]> cards)
        {
            // Column widths.
            col_header1.Width = 30;
            col_header2.Width = 45;
            col_header3.Width = 100;
            col_header4.Width = 90;
            col_header5.Width = 350;
            col_header6.Width = 115;
            col_header7.Width = 465;
            col_header8.Width = 20;

            listView1.GridLines = true;
            listView1.MultiSelect = true;
            listView1.BeginUpdate();
            try
            {
                // Items.Clear() keeps the column headers (ListView.Clear() would drop them) and
                // stops a second search from appending rows with restarting sequence numbers.
                listView1.Items.Clear();
                for (int i = 0; i < cards.Count; i++)
                {
                    // First column is the 1-based sequence number.
                    var item = new ListViewItem((i + 1).ToString());
                    item.SubItems.AddRange(cards[i]);
                    listView1.Items.Add(item);
                }
            }
            finally
            {
                listView1.EndUpdate();
            }

            listView1.LabelEdit = true;
            listView1.FullRowSelect = true;
        }

        /// <summary>Formats an elapsed time as whole minutes and seconds, e.g. "2分5秒".</summary>
        private static string FormatElapsed(TimeSpan elapsed)
        {
            return ((int)elapsed.TotalMinutes).ToString() + "分" + elapsed.Seconds.ToString() + "秒";
        }

        private void listView1_SelectedIndexChanged(object sender, EventArgs e)
        {
        }

        /// <summary>Export button: asks for a file name and exports the list view.</summary>
        private void button4_Click(object sender, EventArgs e)
        {
            turntoexcel();
        }

        /// <summary>Asks the user for an .xls file name and exports <c>listView1</c> to it.</summary>
        private void turntoexcel()
        {
            using (var dialog = new SaveFileDialog())
            {
                dialog.DefaultExt = "xls";
                dialog.Filter = "Excel文件(*.xls)|*.xls";
                if (dialog.ShowDialog() == DialogResult.OK)
                {
                    DoExport(listView1, dialog.FileName);
                }
            }
        }

        /// <summary>
        /// Writes the column headers and all rows of a list view into a new Excel workbook
        /// and saves it.
        /// </summary>
        /// <param name="listView">List view to export.</param>
        /// <param name="strFileName">Target file path; nothing is done when it is empty.</param>
        private void DoExport(ListView listView, string strFileName)
        {
            if (listView == null || listView.Items.Count == 0)
            {
                MessageBox.Show("没有数据,无法导出!");
                return;
            }

            if (string.IsNullOrEmpty(strFileName))
            {
                return;
            }

            int rowCount = listView.Items.Count;
            int columnCount = listView.Items[0].SubItems.Count;

            ExcelInterop.Application app = null;
            ExcelInterop.Workbooks books = null;
            ExcelInterop.Workbook book = null;
            ExcelInterop.Range cells = null;
            string failure = null;
            try
            {
                // Throws COMException when Excel is not installed (it never returns null).
                app = new ExcelInterop.Application();
                app.DefaultFilePath = "";

                // The save dialog has already asked about overwriting; a second, Excel-side
                // prompt would make SaveAs throw when it is declined.
                app.DisplayAlerts = false;
                app.SheetsInNewWorkbook = 1;
                books = app.Workbooks;
                book = books.Add(true);
                cells = app.Cells;

                // Header row: the list view's column captions.
                int rowIndex = 1;
                int columnIndex = 0;
                foreach (ColumnHeader header in listView.Columns)
                {
                    columnIndex++;
                    cells[rowIndex, columnIndex] = header.Text;
                }

                // Data rows.
                for (int i = 0; i < rowCount; i++)
                {
                    rowIndex++;
                    ListViewItem item = listView.Items[i];
                    for (int j = 0; j < columnCount; j++)
                    {
                        string text = j < item.SubItems.Count ? item.SubItems[j].Text : string.Empty;

                        // The trailing tab keeps Excel from showing long digit strings
                        // (phone numbers) in scientific notation.
                        cells[rowIndex, j + 1] = text + "\t";
                    }
                }

                // xlWorkbookNormal is used because xlExcel9795 fails with HRESULT 0x800A03EC
                // on Excel 2003/2007.
                book.SaveAs(strFileName, ExcelInterop.XlFileFormat.xlWorkbookNormal, Type.Missing, Type.Missing, Type.Missing, Type.Missing, ExcelInterop.XlSaveAsAccessMode.xlExclusive, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing);
            }
            catch (System.Runtime.InteropServices.COMException ex)
            {
                failure = app == null
                    ? "无法创建excel对象,可能您的系统没有安装excel"
                    : "导出失败:" + ex.Message;
            }
            finally
            {
                CloseExcel(app, books, book, cells);
            }

            MessageBox.Show(failure ?? "恭喜导出成功!");
        }

        /// <summary>
        /// Closes the workbook, quits Excel and releases the COM wrappers so that no hidden
        /// EXCEL.EXE process is left behind. All arguments may be <c>null</c>.
        /// </summary>
        private static void CloseExcel(ExcelInterop.Application app, ExcelInterop.Workbooks books, ExcelInterop.Workbook book, ExcelInterop.Range cells)
        {
            try
            {
                if (book != null)
                {
                    book.Close(false, Type.Missing, Type.Missing);
                }

                if (app != null)
                {
                    app.Quit();
                }
            }
            catch (System.Runtime.InteropServices.COMException)
            {
                // Excel has already gone away; only the wrappers are left to release.
            }
            finally
            {
                ReleaseComObject(cells);
                ReleaseComObject(book);
                ReleaseComObject(books);
                ReleaseComObject(app);
            }
        }

        /// <summary>Releases a COM wrapper if the reference is a live COM object.</summary>
        private static void ReleaseComObject(object comObject)
        {
            if (comObject != null && System.Runtime.InteropServices.Marshal.IsComObject(comObject))
            {
                System.Runtime.InteropServices.Marshal.ReleaseComObject(comObject);
            }
        }

        /// <summary>
        /// Background work placeholder: sleeps for the number of milliseconds passed as the
        /// worker argument. Does nothing when the argument is not an <c>int</c>.
        /// </summary>
        private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
        {
            if (e.Argument is int)
            {
                Thread.Sleep((int)e.Argument);
            }
        }

        private void textBox1_TextChanged(object sender, EventArgs e)
        {
        }

        /// <summary>Opens the Firefox download link.</summary>
        private void linkLabel1_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
        {
            linkLabel1.Links[0].LinkData = FirefoxDownloadUrl;
            System.Diagnostics.Process.Start(FirefoxDownloadUrl);
        }
    }
}