用C#.NET 与Webdriver写的抓取网页信息的小工具

    最近,女友的妹妹要去网上找房产中介人信息用于招聘,自己去网上一个一个找太慢。我女友知道我是搞IT的,就请教我有没有办法帮她快速找电话号码,于是我就想到了爬虫程序。然而普通的爬虫代码有限制,刚好自己在搞自动化测试,就想到用WebDriver自动化测试工具编写一个爬虫工具,抓取赶集网上的经纪人信息。

    自己一直用的是Java代码写的自动化测试脚本,但是对于不会编程的人来说,没有界面操作很不方便,于是我就想到了编写一个桌面程序。但是Java的GUI又不美观,最后就想到微软的C#,于是自学VS2010和.NET,用C#结合WebDriver的C#版本编写了一个小工具。

 

支持火狐浏览器、PhantomJS内存(无界面)浏览器进行抓取。

将数据查询出放在界面上

支持导出excel文件

基本的实现原理就是:先计算出页码,然后循环一页一页抓取数据,抓取出的数据先临时存储在LIST数据类型中,再存放到界面上的数据控件,点击导出excel的时候,把数据控件的数据转换成LIST,再导出到excel.

界面如下:

用C#.NET 与Webdriver写的抓取网页信息的小工具_第1张图片
   用C#.NET 与Webdriver写的抓取网页信息的小工具_第2张图片

 

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using OpenQA.Selenium;
using OpenQA.Selenium.Firefox;
using OpenQA.Selenium.Interactions;
using OpenQA.Selenium.Interactions.Internal;
using OpenQA.Selenium.Support;
using System.Threading;
using Microsoft.Office;
using Excel;
using System.Drawing;
using OpenQA.Selenium.PhantomJS;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.IE;
using OpenQA.Selenium.Remote;
using System.Timers;

namespace tel_search
{
    public partial class Form1 : Form
    {
        // Elapsed-time bookkeeping for one scrape run.
        static int time_js = 0;// tick counter; OnTimedEvent increments it on every timer tick while the timer is enabled
        static System.Timers.Timer timer;// created in button1_Click with a 100 ms interval, so 10 ticks make one second
        
        /// <summary>
        /// Creates the form and builds its controls through InitializeComponent.
        /// </summary>
        public Form1()
        {
            this.InitializeComponent();
        }

        // Click handler named for label1; the body is empty, so a click performs no action.
        // NOTE(review): presumably subscribed in the designer file, which is not visible here.
        private void label1_Click(object sender, EventArgs e)
        {

        }

        // Click handler named for label2; the body is empty, so a click performs no action.
        // NOTE(review): presumably subscribed in the designer file, which is not visible here.
        private void label2_Click(object sender, EventArgs e)
        {

        }

        // Click handler named for label3; the body is empty, so a click performs no action.
        // NOTE(review): presumably subscribed in the designer file, which is not visible here.
        private void label3_Click(object sender, EventArgs e)
        {

        }

        /// <summary>
        /// Load handler for the form: pre-selects the PhantomJS radio button so that
        /// a browser choice is already made when the form first appears.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            this.radioButton_phatom.Checked = true;
        }

        // XPath of the "next page" control inside the pager of the agent list page.
        private const string NextPageXPath = "//div[@class='pageBox']/ul[@class='pageLink clearfix']/li/a[@class='next']/span";

        // XPath of the pager links, excluding the "next" and "prev" controls.
        private const string PageLinkXPath = "//div[@class='pageBox']/ul[@class='pageLink clearfix']/li/a[not(@class='next'or@class='prev')]";

        // XPath of one broker card in the result list.
        private const string CardXPath = "//div[@class='listBox']/ul/li/div[@class='list-mod2']";

        // Number of leading characters (the field caption) removed from each broker-service line.
        private const int ServiceLabelLength = 6;

        // Highest pager-link index that is clicked while walking the pages; later pages reuse it.
        private const int MaxPagerLinkIndex = 5;

        // Maps the district names offered in listBox1 to the URL path segment used by the site.
        private static readonly Dictionary<string, string> DistrictSlugs = new Dictionary<string, string>
        {
            { "全深圳", "" },
            { "福田", "futian" },
            { "罗湖", "luohu" },
            { "南山", "nanshan" },
            { "宝安", "baoan" },
            { "龙岗", "longgang" },
            { "盐田", "yantian" },
            { "龙华新区", "longhuaxinqu" },
            { "光明新区", "guangmingxinqu" },
            { "坪山新区", "pingshanxinqu" },
            { "大鹏新区", "dapengxinqu" },
            { "深圳周边", "shenzhenzhoubian" }
        };

        /// <summary>
        /// Search button handler: scrapes every page of broker cards for the district
        /// selected in listBox1, appends the rows to listView1 and reports the page
        /// count, card count and elapsed time.
        /// </summary>
        /// <remarks>
        /// The work runs synchronously on the calling thread. The browser driver and
        /// the elapsed-time timer are always released, including when scraping throws.
        /// </remarks>
        private void button1_Click(object sender, EventArgs e)
        {
            if (listBox1.SelectedItems.Count == 0)
            {
                DialogResult dr5 = MessageBox.Show("请选择地区", "温馨提示", MessageBoxButtons.OK);
                if (dr5 == DialogResult.OK)
                {
                    listBox1.Focus();
                }
                return;
            }

            DialogResult dr_search_result = MessageBox.Show("数据比较多,可能需要几分钟时间抓取,请耐心等待!运行过程中,请不要关闭浏览器窗口,否则程序会报错!", "温馨提示", MessageBoxButtons.OK);
            if (dr_search_result != DialogResult.OK)
            {
                // No driver would be created in this case, so there is nothing to scrape with.
                return;
            }

            string districtName = listBox1.SelectedItem.ToString();
            string quyu;
            if (!DistrictSlugs.TryGetValue(districtName, out quyu))
            {
                // Unknown names fall back to the city-wide listing, as an empty path segment.
                quyu = "";
            }
            string url_quyu = "http://sz.ganji.com/zufang/agent/" + quyu + "/";

            List<string[]> mp_inofs_list = null;
            int page_nums = 0;
            bool pageOpened = false;
            IWebDriver driver = null;

            StartElapsedTimer();
            try
            {
                driver = CreateSelectedDriver();
                if (driver == null)
                {
                    // The user has already been told that Firefox could not be started.
                    return;
                }

                driver.Manage().Window.Maximize();

                try
                {
                    driver.Navigate().GoToUrl(url_quyu);
                    pageOpened = true;
                }
                catch (Exception)
                {
                    // Reported to the user below, after the browser has been closed.
                }

                if (pageOpened)
                {
                    page_nums = CountResultPages(driver);

                    // Counting walks to the last page, so go back to the first one before scraping.
                    driver.Navigate().GoToUrl(url_quyu);
                    mp_inofs_list = ScrapeAllPages(driver, districtName, page_nums);
                }
            }
            finally
            {
                try
                {
                    if (driver != null)
                    {
                        driver.Quit();
                    }
                }
                finally
                {
                    StopElapsedTimer();
                }
            }

            if (!pageOpened)
            {
                MessageBox.Show("打不开网页,您的网络可能有问题,请检查网络连接是否正常!", "温馨提示");
                return;
            }

            ShowScrapedCards(mp_inofs_list);

            time_lbl.Visible = true;
            time_lbl.Text = FormatElapsed(time_js);
            MessageBox.Show("谢谢你的耐心等待" + "成功抓取" + page_nums + "页" + mp_inofs_list.Count + "条名片信息,用时:" + time_lbl.Text, "恭喜");
        }

        /// <summary>
        /// Resets the tick counter and starts the 100 ms timer that feeds it.
        /// </summary>
        private void StartElapsedTimer()
        {
            // Reset first so a second search does not add to the previous run's ticks.
            time_js = 0;
            timer = new System.Timers.Timer(100);
            timer.Elapsed += new System.Timers.ElapsedEventHandler(OnTimedEvent);
            timer.AutoReset = true;
            timer.Enabled = true;
        }

        /// <summary>
        /// Stops and disposes the timer started by StartElapsedTimer.
        /// </summary>
        private static void StopElapsedTimer()
        {
            if (timer != null)
            {
                timer.Enabled = false;
                timer.Dispose();
            }
        }

        /// <summary>
        /// Creates the browser driver chosen by the radio buttons: Firefox when its
        /// radio button is checked, PhantomJS otherwise.
        /// </summary>
        /// <returns>
        /// The driver, or null when Firefox could not be started; in that case the
        /// user has been offered the Firefox download.
        /// </returns>
        private IWebDriver CreateSelectedDriver()
        {
            if (!radioButton_firefox.Checked)
            {
                return new PhantomJSDriver();
            }

            try
            {
                FirefoxProfile profile = new FirefoxProfile();
                profile.SetPreference("browser.bookmarks.restore_default_bookmarks", false);
                return new FirefoxDriver(profile);
            }
            catch (Exception)
            {
                DialogResult dr4 = MessageBox.Show("你的电脑没有安装火狐浏览器,是否立即下载", "温馨提示", MessageBoxButtons.YesNo);
                if (dr4 == DialogResult.Yes)
                {
                    listBox1.Focus();
                    System.Diagnostics.Process.Start("http://download.firefox.com.cn/releases/stub/official/zh-CN/Firefox-latest.exe");
                }
                return null;
            }
        }

        /// <summary>
        /// Determines the number of result pages by repeatedly jumping to the last
        /// visible pager link until the page no longer offers a "next" control.
        /// </summary>
        /// <param name="driver">Driver already showing the first result page.</param>
        /// <returns>The number on the last pager link, or 1 when the page has no pager.</returns>
        private static int CountResultPages(IWebDriver driver)
        {
            while (true)
            {
                // FindElements returns an empty collection (never null) when nothing matches.
                bool hasNextPage = driver.FindElements(By.XPath(NextPageXPath)).Count != 0;
                List<IWebElement> pageLinks = new List<IWebElement>(driver.FindElements(By.XPath(PageLinkXPath)));
                if (pageLinks.Count == 0)
                {
                    return 1;
                }

                IWebElement lastLink = pageLinks[pageLinks.Count - 1];
                // Read the label before clicking: the click may reload the page and
                // invalidate the element reference.
                string lastLinkText = lastLink.Text;
                if (!hasNextPage)
                {
                    return int.Parse(lastLinkText);
                }

                lastLink.Click();
            }
        }

        /// <summary>
        /// Walks the result pages through the pager links and collects one row per broker card.
        /// </summary>
        /// <param name="driver">Driver already showing the first result page.</param>
        /// <param name="districtName">District name stored in the first column of every row.</param>
        /// <param name="pageCount">Number of pages to visit.</param>
        /// <returns>
        /// Rows of seven strings: district, name, phone, company, service area,
        /// service community, page label.
        /// </returns>
        private static List<string[]> ScrapeAllPages(IWebDriver driver, string districtName, int pageCount)
        {
            List<string[]> cards = new List<string[]>();

            // The implicit wait is a driver setting, not a pause, so it is set once up front.
            driver.Manage().Timeouts().ImplicitlyWait(new TimeSpan(0, 0, 1));

            for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
            {
                // The pager is looked up again on every iteration because each click reloads it.
                List<IWebElement> pageLinks = new List<IWebElement>(driver.FindElements(By.XPath(PageLinkXPath)));
                bool hasPager = pageLinks.Count != 0;

                string pageLabel;
                if (hasPager)
                {
                    IWebElement pageLink = pageLinks[Math.Min(pageIndex, MaxPagerLinkIndex)];
                    pageLabel = pageLink.Text;
                    pageLink.Click();
                }
                else
                {
                    // No pager on the page: scrape what is shown as a single page.
                    pageLabel = (pageIndex + 1).ToString();
                }

                foreach (IWebElement card in driver.FindElements(By.XPath(CardXPath)))
                {
                    string name = card.FindElement(By.XPath("div[@class='broker-info']/span[@class='broker-name']/a")).Text;
                    string tel = card.FindElement(By.XPath("div[@class='broker-info']/span[@class='broker-tel']")).Text;
                    IList<IWebElement> serviceLines = card.FindElement(By.XPath("div[@class='broker-service']")).FindElements(By.TagName("p"));

                    cards.Add(new string[]
                    {
                        districtName,
                        name,
                        tel,
                        ServiceLineValue(serviceLines, 0),
                        ServiceLineValue(serviceLines, 1),
                        ServiceLineValue(serviceLines, 2),
                        pageLabel
                    });
                }

                if (!hasPager)
                {
                    break;
                }
            }

            return cards;
        }

        /// <summary>
        /// Returns the text of one broker-service line with its leading caption removed.
        /// </summary>
        /// <param name="serviceLines">The paragraph elements of one card's broker-service block.</param>
        /// <param name="index">Zero-based line index.</param>
        /// <returns>
        /// The text after the first six characters; the whole text when it is shorter
        /// than that; an empty string when the line is missing.
        /// </returns>
        private static string ServiceLineValue(IList<IWebElement> serviceLines, int index)
        {
            if (index >= serviceLines.Count)
            {
                return "";
            }

            string text = serviceLines[index].Text;
            return text.Length >= ServiceLabelLength ? text.Substring(ServiceLabelLength) : text;
        }

        /// <summary>
        /// Applies the column widths and list options, then appends one numbered
        /// listView1 row per scraped card.
        /// </summary>
        private void ShowScrapedCards(List<string[]> cards)
        {
            col_header1.Width = 30;
            col_header2.Width = 45;
            col_header3.Width = 100;
            col_header4.Width = 90;
            col_header5.Width = 350;
            col_header6.Width = 115;
            col_header7.Width = 465;
            col_header8.Width = 20;

            listView1.GridLines = true;
            listView1.MultiSelect = true;
            listView1.BeginUpdate();
            for (int i = 0; i < cards.Count; i++)
            {
                // The first column is the 1-based sequence number.
                ListViewItem row = new ListViewItem((i + 1).ToString());
                foreach (string cell in cards[i])
                {
                    row.SubItems.Add(cell);
                }
                listView1.Items.Add(row);
            }
            listView1.EndUpdate();
            listView1.LabelEdit = true;
            listView1.FullRowSelect = true;
        }

        /// <summary>
        /// Converts a count of 100 ms ticks into the "minutes分seconds秒" label text.
        /// </summary>
        private static string FormatElapsed(int ticks)
        {
            int totalSeconds = ticks / 10;
            return (totalSeconds / 60).ToString() + "分" + (totalSeconds % 60).ToString() + "秒";
        }

        // Selection-changed handler named for listView1; the body is empty, so selection changes trigger no action.
        // NOTE(review): presumably subscribed in the designer file, which is not visible here.
        private void listView1_SelectedIndexChanged(object sender, EventArgs e)
        {

        }

        /// <summary>
        /// Click handler for button4: starts the export-to-Excel flow.
        /// </summary>
        private void button4_Click(object sender, EventArgs e)
        {
            this.turntoexcel();
        }
        /// <summary>
        /// Asks the user for a target .xls file and, when one is confirmed, exports
        /// the contents of listView1 to it.
        /// </summary>
        private void turntoexcel()
        {
            // SaveFileDialog is IDisposable; the using block releases it on every path.
            using (SaveFileDialog sfd = new SaveFileDialog())
            {
                sfd.DefaultExt = "xls";
                sfd.Filter = "Excel文件(*.xls)|*.xls";
                if (sfd.ShowDialog() == DialogResult.OK)
                {
                    DoExport(listView1, sfd.FileName);
                }
            }
        }
        /// <summary>
        /// Writes the column headers and all rows of a ListView into a new Excel
        /// workbook and saves it to the given file.
        /// </summary>
        /// <param name="listView">The list whose columns and items are exported.</param>
        /// <param name="strFileName">Target file path; nothing is exported when it is null or empty.</param>
        /// <remarks>
        /// The Excel application started for the export is always closed and released
        /// afterwards. Interop failures are reported with a message box instead of
        /// escaping the handler.
        /// </remarks>
        private void DoExport(ListView listView, string strFileName)
        {
            // Check the list that was passed in, not the listView1 field.
            if (listView.Items.Count == 0)
            {
                MessageBox.Show("没有数据,无法导出!");
                return;
            }
            if (string.IsNullOrEmpty(strFileName))
            {
                return;
            }

            Microsoft.Office.Interop.Excel.Application xlApp;
            try
            {
                // Creating the application throws when Excel is not installed; it never yields null.
                xlApp = new Microsoft.Office.Interop.Excel.ApplicationClass();
            }
            catch (System.Runtime.InteropServices.COMException)
            {
                MessageBox.Show("无法创建excel对象,可能您的系统没有安装excel");
                return;
            }

            Microsoft.Office.Interop.Excel.Workbook xlBook = null;
            try
            {
                xlApp.DefaultFilePath = "";
                xlApp.DisplayAlerts = true;
                xlApp.SheetsInNewWorkbook = 1;
                xlBook = xlApp.Workbooks.Add(true);

                // Row 1 holds the ListView column captions; Excel cells are 1-based.
                int rowIndex = 1;
                int columnIndex = 0;
                foreach (ColumnHeader dc in listView.Columns)
                {
                    columnIndex++;
                    xlApp.Cells[rowIndex, columnIndex] = dc.Text;
                }

                // One worksheet row per ListView item.
                foreach (ListViewItem item in listView.Items)
                {
                    rowIndex++;
                    // Use each row's own cell count so a short row cannot index out of range.
                    for (int j = 0; j < item.SubItems.Count; j++)
                    {
                        // The trailing tab keeps Excel from showing long digit strings
                        // (such as phone numbers) in scientific notation.
                        xlApp.Cells[rowIndex, j + 1] = item.SubItems[j].Text + "\t";
                    }
                }

                // xlWorkbookNormal is used because version-specific formats such as xlExcel9795
                // fail with HRESULT 0x800A03EC on Excel 2003/2007.
                xlBook.SaveAs(strFileName, Microsoft.Office.Interop.Excel.XlFileFormat.xlWorkbookNormal, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Microsoft.Office.Interop.Excel.XlSaveAsAccessMode.xlExclusive, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing);
            }
            catch (System.Runtime.InteropServices.COMException ex)
            {
                MessageBox.Show("导出失败:" + ex.Message);
                return;
            }
            finally
            {
                // Setting the references to null does not end the Excel process; close the
                // workbook, quit the application and release the COM wrappers explicitly.
                try
                {
                    if (xlBook != null)
                    {
                        xlBook.Close(false, Type.Missing, Type.Missing);
                        System.Runtime.InteropServices.Marshal.ReleaseComObject(xlBook);
                    }
                    xlApp.Quit();
                }
                catch (System.Runtime.InteropServices.COMException)
                {
                    // Best-effort cleanup: the export outcome has already been decided.
                }
                finally
                {
                    System.Runtime.InteropServices.Marshal.ReleaseComObject(xlApp);
                }
            }

            MessageBox.Show("恭喜导出成功!");
        }

        /// <summary>
        /// DoWork handler for backgroundWorker1: blocks the worker thread for the
        /// number of milliseconds passed as the work argument.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data; its Argument is unboxed as an int.</param>
        private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
        {
            // Stand-in for a long-running operation.
            Thread.Sleep((int)e.Argument);
        }

        // Text-changed handler named for textBox1; the body is empty, so edits trigger no action.
        // NOTE(review): presumably subscribed in the designer file, which is not visible here.
        private void textBox1_TextChanged(object sender, EventArgs e)
        {

        }

        /// <summary>
        /// Link-clicked handler for linkLabel1: stores the Firefox installer URL as the
        /// first link's data and opens that URL through the operating system shell.
        /// </summary>
        private void linkLabel1_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
        {
            LinkLabel.Link firstLink = linkLabel1.Links[0];
            firstLink.LinkData = "http://download.firefox.com.cn/releases/stub/official/zh-CN/Firefox-latest.exe";
            System.Diagnostics.Process.Start(firstLink.LinkData.ToString());
        }
        /// <summary>
        /// Elapsed handler for the scrape timer: counts one tick while the timer is enabled.
        /// </summary>
        /// <param name="source">Event source (unused).</param>
        /// <param name="e">Elapsed event data (unused).</param>
        private  void OnTimedEvent(Object source, ElapsedEventArgs e)
        {
            // Ignore ticks that arrive after the timer has been switched off.
            if (!timer.Enabled)
            {
                return;
            }

            time_js++;
        }

       

    }
}


你可能感兴趣的:(C#,selenium,webdriver,爬虫小程序)