C# -爬虫之WebBrowser跨域跨iframe获取网页源码

前言:这里主要写用WebBrowser跨域、跨iframe获取网页源码的部分。本意是要爬取《全职高手》的有声小说,这类网站比较特殊,网页上广告一大堆,爬起来真麻烦。比如我爬取的网站,mp3源文件下载还需要密钥,而且是随时更新密钥的那种,嵌在某个iframe下,是专门设计来防止爬虫的。

上代码,获取我想要的关键iframe源码(各种百度搬砖拆砖的结果/苦笑):

using mshtml;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace DownLoadNovel
{
    /// <summary>
    /// Loads a page in a <see cref="WebBrowser"/> hosted on a dedicated STA thread and captures
    /// the inner HTML of the first iframe whose URL starts with the expected prefix, including
    /// iframes that are served from another domain.
    /// </summary>
    public class FindHtml
    {
        // Extra time Run() waits past the deadline for the worker thread to unwind.
        private const int JoinGraceMilliseconds = 1000;

        private String htmlString;
        private String url = "";
        private bool success; // true once a Run() call has completed before its deadline

        public FindHtml()
        {
            htmlString = "";
            success = false;
        }

        /// <summary>
        /// Source captured by the most recent <see cref="Run"/> call. Null when that call failed
        /// or timed out; empty when the page loaded but no matching iframe was found.
        /// </summary>
        public String ResultHtml
        {
            get
            {
                if (success == false) return null;
                return htmlString;
            }
        }

        /// <summary>
        /// Loads the given page and blocks until it has finished loading or the time limit expires.
        /// </summary>
        /// <param name="url">Absolute URL of the page to load.</param>
        /// <param name="timeOut">Time limit in milliseconds.</param>
        /// <returns>
        /// True when the page finished loading within the time limit; false on timeout, on a
        /// non-positive time limit, or when <paramref name="url"/> is not an absolute URL.
        /// </returns>
        public bool Run(String url, int timeOut = 10000)
        {
            // Forget the previous run so a failure can never report stale data.
            this.url = url;
            htmlString = "";
            success = false;

            // Validate here: an exception on the worker thread would take the whole process down.
            Uri target;
            if (timeOut <= 0 || !Uri.TryCreate(this.url, UriKind.Absolute, out target))
            {
                return false;
            }

            FindHtmlPerThread worker = new FindHtmlPerThread(target, timeOut);
            Thread newThread = new Thread(worker.Execute);
            // WebBrowser can only be instantiated on a single-threaded-apartment thread.
            newThread.SetApartmentState(ApartmentState.STA);
            // A browser whose message loop is wedged must not keep the process alive.
            newThread.IsBackground = true;
            newThread.Start();

            // The worker enforces the deadline itself and exits cooperatively; the grace period
            // only bounds the wait when its message loop is too busy to honor the deadline.
            int wait = timeOut > int.MaxValue - JoinGraceMilliseconds
                ? int.MaxValue
                : timeOut + JoinGraceMilliseconds;
            if (!newThread.Join(wait) || !worker.Succeeded)
            {
                return false;
            }

            // The thread has terminated, so its results are safe to read from this thread.
            htmlString = worker.Html;
            success = true;
            return true;
        }

        /// <summary>
        /// Worker for a single run. It owns the WebBrowser and the message loop on the STA thread
        /// and keeps the outcome to itself, so a worker that finishes late cannot overwrite the
        /// state of a newer run.
        /// </summary>
        private sealed class FindHtmlPerThread
        {
            // Placeholder prefix of the iframe URL to capture.
            private const String TargetFramePrefix = "关键iframe的链接域名,这是不会变的";

            private readonly Uri target;
            private readonly int timeOut;
            private bool succeeded;
            private String html = "";

            public FindHtmlPerThread(Uri target, int timeOut)
            {
                this.target = target;
                this.timeOut = timeOut;
            }

            /// <summary>True once the page reached the Complete state before the deadline.</summary>
            public bool Succeeded
            {
                get { return succeeded; }
            }

            /// <summary>Inner HTML of the matching iframe, or an empty string when none matched.</summary>
            public String Html
            {
                get { return html; }
            }

            /// <summary>
            /// Thread entry point: creates the browser, starts navigation and pumps messages until
            /// the page completes or the deadline timer fires. The browser is always disposed.
            /// </summary>
            public void Execute()
            {
                using (WebBrowser web = new WebBrowser())
                using (System.Windows.Forms.Timer deadline = new System.Windows.Forms.Timer())
                {
                    web.ScriptErrorsSuppressed = true;
                    // Subscribe before navigating so that no completion event can be missed.
                    web.DocumentCompleted += web_DocumentCompleted;

                    // The timer ticks on this thread's message loop, so the loop can be ended
                    // cooperatively instead of aborting the thread from outside.
                    deadline.Interval = timeOut;
                    deadline.Tick += delegate { Application.ExitThread(); };
                    deadline.Start();

                    web.Url = target;
                    Application.Run(); // returns after Application.ExitThread()
                }
            }

            private void web_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
            {
                if (succeeded)
                {
                    return;
                }

                WebBrowser web = (WebBrowser)sender;
                // The event can fire once per frame; only act when the whole page is complete.
                if (web.ReadyState != WebBrowserReadyState.Complete)
                {
                    return;
                }

                html = FindTargetFrameHtml(web);
                succeeded = true;
                Application.ExitThread();
            }

            /// <summary>
            /// Scans the top-level frames of the loaded page and returns the inner HTML of the
            /// first one whose URL starts with the target prefix; returns an empty string when
            /// no frame matches or a frame's document cannot be obtained.
            /// </summary>
            private static String FindTargetFrameHtml(WebBrowser web)
            {
                HtmlDocument page = web.Document;
                if (page == null || page.Window == null)
                {
                    return "";
                }

                HtmlWindowCollection frames = page.Window.Frames;
                if (frames == null)
                {
                    return "";
                }

                for (int i = 0; i < frames.Count; i++)
                {
                    // Key step: obtain the document even when the iframe is cross-domain.
                    IHTMLDocument3 frameDoc = CorssDomainHelper.GetDocumentFromWindow(frames[i].DomWindow as IHTMLWindow2);
                    // Read the URL through the interface rather than the HTMLDocumentClass coclass,
                    // which is unavailable when interop types are embedded.
                    IHTMLDocument2 frameInfo = frameDoc as IHTMLDocument2;
                    if (frameInfo == null)
                    {
                        continue;
                    }

                    String frameUrl = frameInfo.url;
                    if (frameUrl == null || !frameUrl.StartsWith(TargetFramePrefix, StringComparison.Ordinal))
                    {
                        continue;
                    }

                    IHTMLElement root = frameDoc.documentElement;
                    if (root == null)
                    {
                        return "";
                    }
                    return root.innerHTML ?? "";
                }
                return "";
            }
        }
    }
}
using System;
using System.Runtime.InteropServices;
using System.Windows.Forms;
using mshtml;
namespace DownLoadNovel
{
    // This is the COM IServiceProvider interface, not the System.IServiceProvider .NET interface!
    // It is declared in this namespace so that unqualified uses of IServiceProvider inside
    // DownLoadNovel bind to this COM declaration.
    /// <summary>
    /// Managed declaration of the COM service-provider interface, used by
    /// CorssDomainHelper to ask a frame window for its hosting browser object.
    /// </summary>
    [ComImport(), ComVisible(true), Guid("6D5140C1-7436-11CE-8034-00AA006009FA"),
    InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
    public interface IServiceProvider
    {
        /// <summary>
        /// Requests the service identified by <paramref name="guidService"/> and returns it
        /// through <paramref name="ppvObject"/> as the interface identified by <paramref name="riid"/>.
        /// </summary>
        /// <returns>
        /// The native result code as an int; because of [PreserveSig] a failure is NOT turned
        /// into an exception, so callers have to inspect the value themselves.
        /// </returns>
        [return: MarshalAs(UnmanagedType.I4)]
        [PreserveSig]
        int QueryService(ref Guid guidService, ref Guid riid, [MarshalAs(UnmanagedType.Interface)] out object ppvObject);
    }
    /// <summary>
    /// Command status values returned by IWebBrowser2.QueryStatusWB.
    /// Every member occupies its own bit; members are listed in ascending bit order.
    /// </summary>
    public enum OLECMDF
    {
        OLECMDF_SUPPORTED = 0x01,
        OLECMDF_ENABLED = 0x02,
        OLECMDF_LATCHED = 0x04,
        OLECMDF_NINCHED = 0x08,
        OLECMDF_INVISIBLE = 0x10,
        OLECMDF_DEFHIDEONCTXTMENU = 0x20
    }
    /// <summary>
    /// Command identifiers accepted by IWebBrowser2.QueryStatusWB and IWebBrowser2.ExecWB.
    /// Only the subset declared here is available; members are listed in ascending numeric order.
    /// </summary>
    public enum OLECMDID
    {
        OLECMDID_SAVEAS = 4,
        OLECMDID_PRINT = 6,
        OLECMDID_PRINTPREVIEW = 7,
        OLECMDID_PAGESETUP = 8,
        OLECMDID_PROPERTIES = 10
    }
    /// <summary>
    /// Execution option passed as the second argument of IWebBrowser2.ExecWB.
    /// The numeric values are spelled out so that they do not depend on declaration order.
    /// </summary>
    public enum OLECMDEXECOPT
    {
        OLECMDEXECOPT_DODEFAULT = 0,
        OLECMDEXECOPT_PROMPTUSER = 1,
        OLECMDEXECOPT_DONTPROMPTUSER = 2,
        OLECMDEXECOPT_SHOWHELP = 3
    }
    /// <summary>
    /// Managed declaration of the COM browser automation interface with IID
    /// D30C1661-CDAF-11D0-8A3E-00C04FC9E26E. CorssDomainHelper requests this interface through
    /// IServiceProvider.QueryService and reads only its <see cref="Document"/> property; the
    /// remaining members are declared so the interface definition is complete.
    /// </summary>
    /// <remarks>
    /// NOTE(review): members of a [ComImport] interface are normally bound by declaration order
    /// and DispId, so do not reorder, remove or renumber them without confirming against the
    /// native interface definition.
    /// </remarks>
    [ComImport, Guid("D30C1661-CDAF-11d0-8A3E-00C04FC9E26E"), TypeLibType(TypeLibTypeFlags.FOleAutomation | TypeLibTypeFlags.FDual | TypeLibTypeFlags.FHidden)]
    public interface IWebBrowser2
    {
        // ----- Navigation commands -----
        [DispId(100)]
        void GoBack();
        [DispId(0x65)]
        void GoForward();
        [DispId(0x66)]
        void GoHome();
        [DispId(0x67)]
        void GoSearch();
        [DispId(0x68)]
        void Navigate([In] string Url, [In] ref object flags, [In] ref object targetFrameName, [In] ref object postData, [In] ref object headers);
        [DispId(-550)]
        void Refresh();
        [DispId(0x69)]
        void Refresh2([In] ref object level);
        [DispId(0x6a)]
        void Stop();
        // ----- Object model access (returned as IDispatch-marshaled objects) -----
        [DispId(200)]
        object Application { [return: MarshalAs(UnmanagedType.IDispatch)] get; }
        [DispId(0xc9)]
        object Parent { [return: MarshalAs(UnmanagedType.IDispatch)] get; }
        [DispId(0xca)]
        object Container { [return: MarshalAs(UnmanagedType.IDispatch)] get; }
        // The property CorssDomainHelper casts to IHTMLDocument3 to reach a cross-domain frame's document.
        [DispId(0xcb)]
        object Document { [return: MarshalAs(UnmanagedType.IDispatch)] get; }
        [DispId(0xcc)]
        bool TopLevelContainer { get; }
        [DispId(0xcd)]
        string Type { get; }
        // ----- Window geometry -----
        [DispId(0xce)]
        int Left { get; set; }
        [DispId(0xcf)]
        int Top { get; set; }
        [DispId(0xd0)]
        int Width { get; set; }
        [DispId(0xd1)]
        int Height { get; set; }
        // ----- Current location and state -----
        [DispId(210)]
        string LocationName { get; }
        [DispId(0xd3)]
        string LocationURL { get; }
        [DispId(0xd4)]
        bool Busy { get; }
        [DispId(300)]
        void Quit();
        [DispId(0x12d)]
        void ClientToWindow(out int pcx, out int pcy);
        [DispId(0x12e)]
        void PutProperty([In] string property, [In] object vtValue);
        [DispId(0x12f)]
        object GetProperty([In] string property);
        [DispId(0)]
        string Name { get; }
        [DispId(-515)]
        int HWND { get; }
        [DispId(400)]
        string FullName { get; }
        [DispId(0x191)]
        string Path { get; }
        // ----- Host window UI switches -----
        [DispId(0x192)]
        bool Visible { get; set; }
        [DispId(0x193)]
        bool StatusBar { get; set; }
        [DispId(0x194)]
        string StatusText { get; set; }
        [DispId(0x195)]
        int ToolBar { get; set; }
        [DispId(0x196)]
        bool MenuBar { get; set; }
        [DispId(0x197)]
        bool FullScreen { get; set; }
        // ----- Extended navigation and command execution -----
        [DispId(500)]
        void Navigate2([In] ref object URL, [In] ref object flags, [In] ref object targetFrameName, [In] ref object postData, [In] ref object headers);
        [DispId(0x1f5)]
        OLECMDF QueryStatusWB([In] OLECMDID cmdID);
        [DispId(0x1f6)]
        void ExecWB([In] OLECMDID cmdID, [In] OLECMDEXECOPT cmdexecopt, ref object pvaIn, IntPtr pvaOut);
        [DispId(0x1f7)]
        void ShowBrowserBar([In] ref object pvaClsid, [In] ref object pvarShow, [In] ref object pvarSize);
        [DispId(-525)]
        WebBrowserReadyState ReadyState { get; }
        // ----- Behavior switches -----
        [DispId(550)]
        bool Offline { get; set; }
        [DispId(0x227)]
        bool Silent { get; set; }
        [DispId(0x228)]
        bool RegisterAsBrowser { get; set; }
        [DispId(0x229)]
        bool RegisterAsDropTarget { get; set; }
        [DispId(0x22a)]
        bool TheaterMode { get; set; }
        [DispId(0x22b)]
        bool AddressBar { get; set; }
        [DispId(0x22c)]
        bool Resizable { get; set; }
    }
    /// <summary>
    /// Helper for reading the document of a frame window, including frames whose content comes
    /// from another domain and therefore cannot be read through the window's document property.
    /// </summary>
    class CorssDomainHelper
    {
        // Kept read-only; QueryService takes its GUIDs by ref, so copies are passed instead.
        private static readonly Guid IID_IWebBrowserApp = new Guid("0002DF05-0000-0000-C000-000000000046");
        private static readonly Guid IID_IWebBrowser2 = new Guid("D30C1661-CDAF-11D0-8A3E-00C04FC9E26E");

        /// <summary>
        /// Returns the document shown in <paramref name="htmlWindow"/>. The regular
        /// <c>document</c> property is tried first; when access is denied, the frame's hosting
        /// browser object is obtained through the COM IServiceProvider and asked for its document.
        /// </summary>
        /// <param name="htmlWindow">The frame window; may be null.</param>
        /// <returns>The frame's document, or null in case of any failure.</returns>
        public static IHTMLDocument3 GetDocumentFromWindow(IHTMLWindow2 htmlWindow)
        {
            if (htmlWindow == null)
            {
                return null;
            }

            // First try the usual way to get the document.
            try
            {
                IHTMLDocument2 doc = htmlWindow.document;
                return (IHTMLDocument3)doc;
            }
            catch (COMException)
            {
                // Not expected in practice, but handled like an access failure:
                // fall through to the service-provider route below.
            }
            catch (UnauthorizedAccessException)
            {
                // Access denied: the frame holds a document from another domain.
                // Fall through to the service-provider route below.
            }
            catch (Exception)
            {
                // Any other failure is not a cross-domain restriction; give up.
                return null;
            }

            // At this point access was denied because the frame contains a document from another
            // domain; the browser blocks direct access to prevent cross-frame scripting.
            try
            {
                // Reach the browser object hosting the frame via the COM IServiceProvider.
                IServiceProvider sp = htmlWindow as IServiceProvider;
                if (sp == null)
                {
                    return null;
                }

                Guid serviceId = IID_IWebBrowserApp;
                Guid interfaceId = IID_IWebBrowser2;
                object brws;
                // QueryService is declared with [PreserveSig], so a failure comes back as a
                // negative result code instead of an exception and has to be checked here.
                int hr = sp.QueryService(ref serviceId, ref interfaceId, out brws);
                if (hr < 0)
                {
                    return null;
                }

                IWebBrowser2 browser = brws as IWebBrowser2;
                if (browser == null)
                {
                    return null;
                }

                // Get the document from the hosting browser; null when it is not an HTML document.
                return browser.Document as IHTMLDocument3;
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }
            return null;
        }
    }
}

最终的调用:

 // Load the page and wait (default time limit) for the target iframe source to be captured.
 FindHtml finder = new FindHtml();
 bool loaded = finder.Run("我爬虫的网站链接");
 if (loaded)
 {
    // From here on, work out the extraction rules from the captured source (finder.ResultHtml).
    // They differ for every site, so they are not shown here.
 }

最后还是贴一下我是怎么下载那些mp3文件的吧,很简单,用WebClient异步下载:

/// <summary>
/// Starts one asynchronous download per episode number in <paramref name="files"/>.
/// Each file is fetched from "{domain}/{episode:000}.mp3?{key}" and saved as "{episode}.mp3"
/// in the folder entered in textBox1. Nothing is downloaded when no key could be obtained.
/// </summary>
/// <param name="files">Episode numbers to download.</param>
private void DownFile(List<int> files)
        {
            if (files == null)
            {
                return;
            }

            key = GetKey();
            if (string.IsNullOrWhiteSpace(key))
            {
                MessageBox.Show("无法爬到key,%>_<%,请多试几次,如若还不行,可能资源网站已改版");
                // Without a key every request would be rejected, so do not start any download.
                return;
            }
            downNum = files.Count;
            foreach (var i in files)
            {
                // One client per file: a WebClient runs only one asynchronous operation at a time.
                WebClient client = new WebClient();
                client.DownloadProgressChanged += client_DownloadProgressChanged;
                client.DownloadFileCompleted += client_DownloadFileCompleted;
                // "000" pads to at least three digits and never truncates longer numbers.
                string furl = domain + string.Format(@"/{0}.mp3?{1}", i.ToString("000"), key);
                // The episode number travels as the user token so the completion handler can read it back.
                client.DownloadFileAsync(new Uri(furl), Path.Combine(this.textBox1.Text, i + ".mp3"), i);
            }
        }

        /// <summary>
        /// Runs when one download has finished, whether it succeeded, failed or was cancelled:
        /// updates the overall progress display, records the episode number of a successful
        /// download, releases the client, and announces completion after the last file.
        /// </summary>
        void client_DownloadFileCompleted(object sender, AsyncCompletedEventArgs e)
        {
            _dloadFile++;

            int percent = (int)(100.0 * _dloadFile / downNum);
            this.Invoke(new MethodInvoker(delegate
            {
                this.progressBar1.Value = percent;
                this.label5.Text = string.Format("已完成文件下载{0}%  {1}/{2}(文件个数)", percent, _dloadFile, downNum);
            }));

            // The episode number was supplied as the user token when the download was started.
            // Reading it here avoids depending on the optional Content-Location response header,
            // which is missing whenever the server omits it or the request failed.
            if (!e.Cancelled && e.Error == null && e.UserState != null)
            {
                aldownep.Add(Convert.ToInt32(e.UserState));
            }

            // The operation is over, so the client only needs to be released.
            WebClient client = sender as WebClient;
            if (client != null)
            {
                client.Dispose();
            }

            if (_dloadFile == downNum) { MessageBox.Show("下载完成!"); this.panel1.Visible = false; }
        }

        /// <summary>
        /// Per-download progress callback. Intentionally does nothing: overall progress is
        /// reported per finished file by client_DownloadFileCompleted. The two disabled variants
        /// below show how byte-level progress of a single download could be displayed instead.
        /// </summary>
        void client_DownloadProgressChanged(object sender, DownloadProgressChangedEventArgs e)
        {
            // Disabled variant 1: marshal to the UI thread and show percentage plus byte counts.
            //this.Invoke(new MethodInvoker(delegate
            //{
            //    this.progressBar1.Value = e.ProgressPercentage;
            //    this.label5.Text = string.Format("正在下载文件,完成进度{0}%  {1}/{2}(字节)"
            //                        , e.ProgressPercentage
            //                        , e.BytesReceived
            //                        , e.TotalBytesToReceive);
            //}));

            // Disabled variant 2: drive the progress bar directly with byte counts and label the
            // episode taken from the user token.
            //this.progressBar1.Minimum = 0;
            //this.progressBar1.Maximum = (int)e.TotalBytesToReceive;
            //this.progressBar1.Value = (int)e.BytesReceived;
            //this.label5.Text = "第" + e.UserState.ToString() + "集" + e.ProgressPercentage + "%";
        }

还有一种办法能获取到最完整的源码:CefSharp

下一篇:写python实现爬取全职高手有声小说

以上纯属个人独自研究成果,仅供参考,转载请注明出处

你可能感兴趣的:(C#,开发,编程)