使用HtmlAgilityPack解析Html(非常好用)

 /// <summary>
    /// Form that hosts a WebBrowser control, navigates it to a query page and
    /// extracts the result rows from the loaded document with HtmlAgilityPack.
    /// Per the original author's note, it is meant to be built as a separate exe
    /// to work around the WebBrowser control's memory leak.
    /// </summary>
    public partial class MainForm : Form
    {
        /// <summary>
        /// Number of "td/div" cells a table row must contain to be mapped onto a <see cref="RowData"/>.
        /// </summary>
        private const int ExpectedCellCount = 10;

        /// <summary>
        /// Whether the current query has finished processing.
        /// The WebBrowser control can only run on the UI thread, so instead of a
        /// wait handle this flag is polled by <see cref="Query"/>.
        /// </summary>
        private bool isCompleted;

        /// <summary>
        /// Rows collected for the query currently in progress.
        /// </summary>
        private List<RowData> executeResult = new List<RowData>();

        private static MainForm instance = new MainForm();

        /// <summary>
        /// Singleton instance.
        /// </summary>
        public static MainForm Instance { get { return instance; } }

        private MainForm()
        {
            InitializeComponent();
            webBrowser.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webBrowser_DocumentCompleted);
        }

        /// <summary>
        /// Handles the browser's DocumentCompleted event: once the control reports
        /// <see cref="WebBrowserReadyState.Complete"/> and the page title is "选择",
        /// the body HTML is parsed and the query is marked as completed.
        /// </summary>
        private void webBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            // The event can fire before the control as a whole is ready; ignore those notifications.
            if (this.webBrowser.ReadyState != WebBrowserReadyState.Complete)
            {
                return;
            }

            var document = webBrowser.Document;
            if (document == null || document.Title != "选择")
            {
                return;
            }

            // Body can be null (e.g. a document without a body element); treat that as "no rows".
            var body = document.Body;
            string html = body == null ? null : body.InnerHtml;

            executeResult.AddRange(ExtractData(html));
            isCompleted = true;
        }

        /// <summary>
        /// Parses the given HTML fragment and maps every "tbody/tr" row under the
        /// element whose id is "div" onto a <see cref="RowData"/>.
        /// </summary>
        /// <param name="html">HTML to parse; may be null or empty.</param>
        /// <returns>
        /// The extracted rows. Empty when the HTML is null/empty, the container element
        /// is missing, or no row matches. Rows with fewer than
        /// <see cref="ExpectedCellCount"/> "td/div" cells are skipped.
        /// </returns>
        private List<RowData> ExtractData(string html)
        {
            List<RowData> rows = new List<RowData>();
            if (string.IsNullOrEmpty(html))
            {
                return rows;
            }

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            HtmlAgilityPack.HtmlNode node = doc.GetElementbyId("div");
            if (node == null)
            {
                return rows;
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            var trNodes = node.SelectNodes("tbody/tr");
            if (trNodes == null)
            {
                return rows;
            }

            foreach (var trNode in trNodes)
            {
                var cells = trNode.SelectNodes("td/div");
                if (cells == null || cells.Count < ExpectedCellCount)
                {
                    // Header rows or malformed rows: nothing to map, skip instead of throwing.
                    continue;
                }

                RowData row = new RowData();
                row.航班 = cells[0].InnerText;
                row.出发时间 = cells[1].InnerText;
                row.到达时间 = cells[2].InnerText;
                row.机场 = cells[3].InnerText;
                row.机型 = cells[4].InnerText;
                row.头等 = cells[5].InnerText;
                row.公务 = cells[6].InnerText;
                row.全价 = cells[7].InnerText;
                row.折扣 = cells[8].InnerText;
                row.特价 = cells[9].InnerText;
                rows.Add(row);
            }

            return rows;
        }

        /// <summary>
        /// Queries ticket data by navigating the browser and waiting until the page
        /// has been processed or the timeout elapses.
        /// </summary>
        /// <param name="fromCity">Departure city code.</param>
        /// <param name="toCity">Arrival city code.</param>
        /// <param name="date">Departure date.</param>
        /// <param name="timeout">Maximum time to wait for the page to be processed.</param>
        /// <returns>
        /// A snapshot of the rows collected so far (empty if the timeout elapsed first).
        /// The returned list is a copy, so a later call does not clear it.
        /// </returns>
        [MethodImpl(MethodImplOptions.Synchronized)]
        public List<RowData> Query(string fromCity, string toCity, DateTime date, TimeSpan timeout)
        {
            isCompleted = false;
            executeResult.Clear();

            // NOTE(review): the template shown here has no {0}..{4} placeholders, so the
            // arguments are currently ignored; presumably the real URL was redacted.
            string urlTemplate = "http://www.xxx.com";
            string url = string.Format(urlTemplate, fromCity, date.Month, date.Day, date.Year, toCity);
            Navigate(url);

            // Use UTC so a DST switch or local clock change cannot distort the timeout.
            DateTime deadline = DateTime.UtcNow.Add(timeout);

            // Not finished and not timed out yet: keep waiting while pumping messages
            // so the DocumentCompleted handler gets a chance to run.
            while (!isCompleted && DateTime.UtcNow < deadline)
            {
                Thread.Sleep(100);
                Application.DoEvents();
            }

            // Hand out a copy: executeResult is cleared at the start of the next query.
            return new List<RowData>(executeResult);
        }

        /// <summary>
        /// Navigates the browser to the given URL, re-posting the call through
        /// BeginInvoke when <see cref="Control.InvokeRequired"/> is true.
        /// </summary>
        /// <param name="url">Address to load.</param>
        private void Navigate(string url)
        {
            if (InvokeRequired)
            {
                BeginInvoke(new Action<string>(Navigate), url);
                return;
            }

            webBrowser.Navigate(url);
        }
    }



    /// <summary>
    /// Represents one data row of the page.
    /// The property names are Chinese; rename them yourself if you prefer otherwise.
    /// </summary>
    public class RowData
    {
        /// <summary>"Flight" column.</summary>
        public string 航班 { get; set; }
        /// <summary>"Departure time" column.</summary>
        public string 出发时间 { get; set; }
        /// <summary>"Arrival time" column.</summary>
        public string 到达时间 { get; set; }
        /// <summary>"Airport" column.</summary>
        public string 机场 { get; set; }
        /// <summary>"Aircraft type" column.</summary>
        public string 机型 { get; set; }
        /// <summary>"First class" column.</summary>
        public string 头等 { get; set; }
        /// <summary>"Business class" column.</summary>
        public string 公务 { get; set; }
        /// <summary>"Full price" column.</summary>
        public string 全价 { get; set; }
        /// <summary>"Discount" column.</summary>
        public string 折扣 { get; set; }
        /// <summary>"Special offer" column.</summary>
        public string 特价 { get; set; }
    }

你可能感兴趣的:(html)