前言:这里重点写用WebBrowser跨域、跨iframe获取网页源码的部分。本意是要爬取《全职高手》的有声小说,但这类网站比较特殊:网页上广告一大堆,爬起来很麻烦。比如我爬取的网站,mp3源文件的下载还需要密钥,而且是随时更新密钥的那种;密钥嵌在某个iframe下,是专门设计来防爬虫的。
上代码,获取我想要的关键iframe源码(各种百度搬砖拆砖的结果/苦笑):
using mshtml;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace DownLoadNovel
{
/// <summary>
/// Loads a page in an off-screen <see cref="WebBrowser"/> on a dedicated STA thread and
/// captures the inner HTML of the (possibly cross-domain) iframe whose URL starts with a
/// fixed prefix.
/// </summary>
/// <remarks>
/// One instance supports a single <see cref="Run"/> call at a time; overlapping calls on
/// the same instance are not supported.
/// </remarks>
public class FindHtml
{
    // Guards htmlString/success and the cancellation hand-off between Run and the worker.
    private readonly object gate = new object();
    private String htmlString;
    private bool success; // whether the most recent run published a result

    public FindHtml()
    {
        htmlString = "";
        success = false;
    }

    /// <summary>
    /// Source captured by the most recent successful <see cref="Run"/>; <c>null</c> when that
    /// run did not succeed. An empty string means the page loaded but no matching iframe
    /// was found.
    /// </summary>
    public String ResultHtml
    {
        get
        {
            lock (gate)
            {
                return success ? htmlString : null;
            }
        }
    }

    /// <summary>
    /// Loads <paramref name="url"/> and waits for the page to finish loading.
    /// </summary>
    /// <param name="url">Absolute URL of the page to load.</param>
    /// <param name="timeOut">Maximum time to wait, in milliseconds.</param>
    /// <returns>
    /// <c>true</c> when the page finished loading within the time limit; <c>false</c> on
    /// timeout or when <paramref name="url"/> is not a valid absolute URL.
    /// </returns>
    public bool Run(String url, int timeOut = 10000)
    {
        // Clear the previous run's outcome so a timeout can never report a stale success.
        lock (gate)
        {
            htmlString = "";
            success = false;
        }

        // Validate here: a bad URL thrown on the worker thread would be unhandled there.
        Uri target;
        if (!Uri.TryCreate(url, UriKind.Absolute, out target))
        {
            return false;
        }

        RunRequest request = new RunRequest(target);
        Thread newThread = new Thread(() => NewThread(request));
        // WebBrowser is an ActiveX wrapper and can only be created on an STA thread.
        newThread.SetApartmentState(ApartmentState.STA);
        // A hung browser must not keep the process alive after a timeout.
        newThread.IsBackground = true;
        newThread.Start();

        newThread.Join(Math.Max(timeOut, 0));

        lock (gate)
        {
            if (!success)
            {
                // Cooperative cancellation instead of Thread.Abort: the worker polls this
                // flag from its own message loop and shuts itself down.
                request.Cancelled = true;
            }
            return success;
        }
    }

    // Worker thread entry point: owns the browser for the lifetime of the message loop.
    private void NewThread(RunRequest request)
    {
        using (new FindHtmlPerThread(this, request))
        {
            // Pump messages until the worker calls Application.ExitThread().
            Application.Run();
        }
    }

    // Stores the worker's result unless the caller has already given up on this request.
    private void Publish(RunRequest request, String html)
    {
        lock (gate)
        {
            if (request.Cancelled)
            {
                return;
            }
            htmlString = html;
            success = true;
        }
    }

    /// <summary>
    /// Per-run state shared between <see cref="Run"/> and its worker thread.
    /// </summary>
    private sealed class RunRequest
    {
        public readonly Uri Target;
        public volatile bool Cancelled;

        public RunRequest(Uri target)
        {
            Target = target;
        }
    }

    /// <summary>
    /// Handles a single URL on the worker thread: navigates, waits for completion, extracts
    /// the target iframe's source and then ends the thread's message loop.
    /// </summary>
    class FindHtmlPerThread : IDisposable
    {
        private const int CancelPollIntervalMs = 100;
        private const String TargetFramePrefix = "关键iframe的链接域名,这是不会变的";

        private readonly FindHtml master;
        private readonly RunRequest request;
        private readonly WebBrowser web;
        private readonly System.Windows.Forms.Timer cancelPoll;
        private bool finished;

        public FindHtmlPerThread(FindHtml master, RunRequest request)
        {
            this.master = master;
            this.request = request;
            web = new WebBrowser();
            web.ScriptErrorsSuppressed = true;
            cancelPoll = new System.Windows.Forms.Timer();
            cancelPoll.Interval = CancelPollIntervalMs;

            bool started = false;
            try
            {
                // Subscribe before navigating so no completion event can be missed.
                web.DocumentCompleted += web_DocumentCompleted;
                cancelPoll.Tick += cancelPoll_Tick;
                web.Url = request.Target;
                cancelPoll.Start();
                started = true;
            }
            finally
            {
                if (!started)
                {
                    Dispose();
                }
            }
        }

        public void Dispose()
        {
            cancelPoll.Dispose();
            if (!web.IsDisposed)
            {
                web.Dispose();
            }
        }

        // Runs on the worker's message loop; notices a timeout signalled by Run.
        private void cancelPoll_Tick(object sender, EventArgs e)
        {
            if (request.Cancelled)
            {
                Finish();
            }
        }

        private void web_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            // A page with several iframes can raise this event more than once; only act
            // when the whole document reports Complete.
            if (finished || web.ReadyState != WebBrowserReadyState.Complete)
            {
                return;
            }

            master.Publish(request, FindTargetFrameHtml());
            Finish();
        }

        // Returns the inner HTML of the first frame whose URL starts with the target
        // prefix, or an empty string when there is no such frame.
        private String FindTargetFrameHtml()
        {
            HtmlDocument page = web.Document;
            HtmlWindow window = page == null ? null : page.Window;
            HtmlWindowCollection frames = window == null ? null : window.Frames;
            if (frames == null)
            {
                return "";
            }

            for (int i = 0; i < frames.Count; i++)
            {
                // Key step: obtain the document of a frame even when it is cross-domain.
                IHTMLDocument3 frameDoc = CorssDomainHelper.GetDocumentFromWindow(frames[i].DomWindow as IHTMLWindow2);
                // Query the interface rather than casting to the coclass wrapper, which
                // can fail for documents obtained through the cross-domain path.
                IHTMLDocument2 frameInfo = frameDoc as IHTMLDocument2;
                if (frameInfo == null)
                {
                    continue;
                }

                String furl = frameInfo.url;
                if (furl == null || !furl.StartsWith(TargetFramePrefix, StringComparison.Ordinal))
                {
                    continue;
                }

                IHTMLElement root = frameDoc.documentElement;
                return root == null ? "" : (root.innerHTML ?? "");
            }
            return "";
        }

        // Stops all activity and ends the worker thread's message loop exactly once.
        private void Finish()
        {
            if (finished)
            {
                return;
            }
            finished = true;
            cancelPoll.Stop();
            web.DocumentCompleted -= web_DocumentCompleted;
            Application.ExitThread();
        }
    }
}
}
using System;
using System.Runtime.InteropServices;
using System.Windows.Forms;
using mshtml;
namespace DownLoadNovel
{
/// <summary>
/// Managed declaration of the COM service-provider interface
/// (IID 6D5140C1-7436-11CE-8034-00AA006009FA), imported as an IUnknown-based interface.
/// </summary>
// This is the COM IServiceProvider interface, not System.IServiceProvider .Net interface!
[ComImport(), ComVisible(true), Guid("6D5140C1-7436-11CE-8034-00AA006009FA"),
InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
public interface IServiceProvider
{
/// <summary>
/// Asks the object for the service <paramref name="guidService"/> and, on success, hands
/// back the requested interface <paramref name="riid"/> in <paramref name="ppvObject"/>.
/// </summary>
/// <returns>
/// The raw 32-bit native result; because of [PreserveSig] it is returned as-is rather
/// than being translated into an exception.
/// </returns>
[return: MarshalAs(UnmanagedType.I4)]
[PreserveSig]
int QueryService(ref Guid guidService, ref Guid riid, [MarshalAs(UnmanagedType.Interface)] out object ppvObject);
}
/// <summary>
/// Status flags reported for an OLE command (see <c>IWebBrowser2.QueryStatusWB</c>).
/// The values are independent bits, so the enum is marked [Flags]: combined values then
/// format and parse as a list of names instead of a bare number.
/// </summary>
[Flags]
public enum OLECMDF
{
    OLECMDF_SUPPORTED = 0x01,
    OLECMDF_ENABLED = 0x02,
    OLECMDF_LATCHED = 0x04,
    OLECMDF_NINCHED = 0x08,
    OLECMDF_INVISIBLE = 0x10,
    OLECMDF_DEFHIDEONCTXTMENU = 0x20
}
/// <summary>
/// Identifiers of the OLE commands this project passes to <c>IWebBrowser2.ExecWB</c> and
/// <c>IWebBrowser2.QueryStatusWB</c>. Only the subset declared here is available.
/// </summary>
public enum OLECMDID
{
    OLECMDID_SAVEAS = 4,
    OLECMDID_PRINT = 6,
    OLECMDID_PRINTPREVIEW = 7,
    OLECMDID_PAGESETUP = 8,
    OLECMDID_PROPERTIES = 10
}
/// <summary>
/// Execution options passed alongside an <see cref="OLECMDID"/> to
/// <c>IWebBrowser2.ExecWB</c>. Values are spelled out so the numbering does not depend on
/// declaration order.
/// </summary>
public enum OLECMDEXECOPT
{
    OLECMDEXECOPT_DODEFAULT = 0,
    OLECMDEXECOPT_PROMPTUSER = 1,
    OLECMDEXECOPT_DONTPROMPTUSER = 2,
    OLECMDEXECOPT_SHOWHELP = 3
}
/// <summary>
/// Managed declaration of the browser automation COM interface with IID
/// D30C1661-CDAF-11D0-8A3E-00C04FC9E26E. In this file it is used to reach a frame's
/// document (the <see cref="Document"/> property) after obtaining the frame's browser
/// object through the COM service provider.
/// </summary>
/// <remarks>
/// NOTE(review): the declaration order and DispIds presumably mirror the native interface
/// definition; do not reorder, remove or insert members without checking it.
/// </remarks>
[ComImport, Guid("D30C1661-CDAF-11d0-8A3E-00C04FC9E26E"), TypeLibType(TypeLibTypeFlags.FOleAutomation | TypeLibTypeFlags.FDual | TypeLibTypeFlags.FHidden)]
public interface IWebBrowser2
{
// --- Navigation ---
[DispId(100)]
void GoBack();
[DispId(0x65)]
void GoForward();
[DispId(0x66)]
void GoHome();
[DispId(0x67)]
void GoSearch();
[DispId(0x68)]
void Navigate([In] string Url, [In] ref object flags, [In] ref object targetFrameName, [In] ref object postData, [In] ref object headers);
[DispId(-550)]
void Refresh();
[DispId(0x69)]
void Refresh2([In] ref object level);
[DispId(0x6a)]
void Stop();
// --- Object model and placement ---
[DispId(200)]
object Application { [return: MarshalAs(UnmanagedType.IDispatch)] get; }
[DispId(0xc9)]
object Parent { [return: MarshalAs(UnmanagedType.IDispatch)] get; }
[DispId(0xca)]
object Container { [return: MarshalAs(UnmanagedType.IDispatch)] get; }
// The member this project relies on: the document currently loaded in the browser/frame.
[DispId(0xcb)]
object Document { [return: MarshalAs(UnmanagedType.IDispatch)] get; }
[DispId(0xcc)]
bool TopLevelContainer { get; }
[DispId(0xcd)]
string Type { get; }
[DispId(0xce)]
int Left { get; set; }
[DispId(0xcf)]
int Top { get; set; }
[DispId(0xd0)]
int Width { get; set; }
[DispId(0xd1)]
int Height { get; set; }
[DispId(210)]
string LocationName { get; }
[DispId(0xd3)]
string LocationURL { get; }
[DispId(0xd4)]
bool Busy { get; }
// --- Application-level members ---
[DispId(300)]
void Quit();
[DispId(0x12d)]
void ClientToWindow(out int pcx, out int pcy);
[DispId(0x12e)]
void PutProperty([In] string property, [In] object vtValue);
[DispId(0x12f)]
object GetProperty([In] string property);
[DispId(0)]
string Name { get; }
[DispId(-515)]
int HWND { get; }
[DispId(400)]
string FullName { get; }
[DispId(0x191)]
string Path { get; }
[DispId(0x192)]
bool Visible { get; set; }
[DispId(0x193)]
bool StatusBar { get; set; }
[DispId(0x194)]
string StatusText { get; set; }
[DispId(0x195)]
int ToolBar { get; set; }
[DispId(0x196)]
bool MenuBar { get; set; }
[DispId(0x197)]
bool FullScreen { get; set; }
// --- Extended navigation, OLE commands and browser state ---
[DispId(500)]
void Navigate2([In] ref object URL, [In] ref object flags, [In] ref object targetFrameName, [In] ref object postData, [In] ref object headers);
[DispId(0x1f5)]
OLECMDF QueryStatusWB([In] OLECMDID cmdID);
[DispId(0x1f6)]
void ExecWB([In] OLECMDID cmdID, [In] OLECMDEXECOPT cmdexecopt, ref object pvaIn, IntPtr pvaOut);
[DispId(0x1f7)]
void ShowBrowserBar([In] ref object pvaClsid, [In] ref object pvarShow, [In] ref object pvarSize);
[DispId(-525)]
WebBrowserReadyState ReadyState { get; }
[DispId(550)]
bool Offline { get; set; }
[DispId(0x227)]
bool Silent { get; set; }
[DispId(0x228)]
bool RegisterAsBrowser { get; set; }
[DispId(0x229)]
bool RegisterAsDropTarget { get; set; }
[DispId(0x22a)]
bool TheaterMode { get; set; }
[DispId(0x22b)]
bool AddressBar { get; set; }
[DispId(0x22c)]
bool Resizable { get; set; }
}
/// <summary>
/// Helper for reading the document of an IE frame even when the frame hosts a page from
/// another domain.
/// </summary>
class CorssDomainHelper
{
    private static readonly Guid IID_IWebBrowserApp = new Guid("0002DF05-0000-0000-C000-000000000046");
    private static readonly Guid IID_IWebBrowser2 = new Guid("D30C1661-CDAF-11D0-8A3E-00C04FC9E26E");

    /// <summary>
    /// Gets the document hosted by <paramref name="htmlWindow"/>.
    /// </summary>
    /// <param name="htmlWindow">The frame window; may be <c>null</c>.</param>
    /// <returns>The frame's document, or <c>null</c> when it cannot be obtained.</returns>
    public static IHTMLDocument3 GetDocumentFromWindow(IHTMLWindow2 htmlWindow)
    {
        if (htmlWindow == null)
        {
            return null;
        }

        // First try the usual way to get the document.
        try
        {
            return htmlWindow.document as IHTMLDocument3;
        }
        catch (COMException)
        {
            // Not expected in practice, but treated like an access-denied failure:
            // fall through to the service-provider route below.
        }
        catch (UnauthorizedAccessException)
        {
            // E_ACCESSDENIED: the frame contains a document from another domain and IE
            // blocks direct access to prevent cross-frame scripting.
        }
        catch (Exception)
        {
            return null;
        }

        try
        {
            // Convert IHTMLWindow2 to IWebBrowser2 using the COM IServiceProvider.
            IServiceProvider provider = htmlWindow as IServiceProvider;
            if (provider == null)
            {
                return null;
            }

            // Pass copies by ref so the shared identifiers can never be overwritten.
            Guid serviceId = IID_IWebBrowserApp;
            Guid interfaceId = IID_IWebBrowser2;
            object service;
            int hr = provider.QueryService(ref serviceId, ref interfaceId, out service);
            // QueryService is [PreserveSig]: a negative HRESULT means failure and must be
            // checked explicitly instead of relying on a later null dereference.
            if (hr < 0 || service == null)
            {
                return null;
            }

            // Get the document from IWebBrowser2.
            IWebBrowser2 browser = service as IWebBrowser2;
            return browser == null ? null : (browser.Document as IHTMLDocument3);
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex);
        }
        return null;
    }
}
}
最终的调用:
// Load the page; Run reports whether loading finished before the timeout.
FindHtml content = new FindHtml();
if (content.Run("我爬虫的网站链接"))
{
    // From here on, extract what you need from the captured source. The rules differ
    // for every site, so they are not shown here.
}
最后还是上一下我怎么下载那些mp3文件的吧,很简单用WebClient异步下载
/// <summary>
/// Starts an asynchronous download of <c>{episode}.mp3</c> for every episode number in
/// <paramref name="files"/>, saving each file into the folder given by textBox1.
/// </summary>
/// <param name="files">Episode numbers to download.</param>
private void DownFile(List<int> files)
{
    key = GetKey();
    if (string.IsNullOrWhiteSpace(key))
    {
        MessageBox.Show("无法爬到key,%>_<%,请多试几次,如若还不行,可能资源网站已改版");
        // Without a key every request would be rejected, so do not start any download.
        return;
    }
    downNum = files.Count;
    foreach (var i in files)
    {
        WebClient client = new WebClient();
        client.DownloadProgressChanged += client_DownloadProgressChanged;
        client.DownloadFileCompleted += client_DownloadFileCompleted;
        // "000" pads to at least three digits and leaves longer numbers untouched.
        string furl = domain + string.Format(@"/{0}.mp3?{1}", i.ToString("000"), key);
        // The episode number travels as the user token so the completion handler can
        // identify the file; the client is disposed there.
        client.DownloadFileAsync(new Uri(furl), Path.Combine(this.textBox1.Text, i + ".mp3"), i);
    }
}
/// <summary>
/// Handles the end of one file download: updates the overall progress display, records
/// the episode when the download succeeded, and releases the client.
/// </summary>
/// <param name="sender">The <see cref="WebClient"/> that ran the download.</param>
/// <param name="e">Completion details; UserState carries the episode number given to DownloadFileAsync.</param>
void client_DownloadFileCompleted(object sender, AsyncCompletedEventArgs e)
{
    WebClient client = sender as WebClient;
    try
    {
        // Failed and cancelled downloads count as finished too, so the overall progress
        // can still reach the end.
        _dloadFile++;
        int percent = downNum > 0 ? (int)(100.0 * _dloadFile / downNum) : 100;
        this.Invoke(new MethodInvoker(delegate
        {
            // Clamp so an unexpected extra completion cannot push Value past Maximum.
            this.progressBar1.Value = Math.Min(percent, this.progressBar1.Maximum);
            this.label5.Text = string.Format("已完成文件下载{0}% {1}/{2}(文件个数)", percent, _dloadFile, downNum);
        }));

        if (e.Cancelled || e.Error != null)
        {
            // Response headers are not available for a failed request; report and move on.
            System.Diagnostics.Trace.TraceError("Download of episode {0} did not complete: {1}", e.UserState, e.Error);
        }
        else if (e.UserState != null)
        {
            // The episode number was supplied as the user token when the download was
            // started, so it does not have to be parsed back out of a response header.
            aldownep.Add(Convert.ToInt32(e.UserState));
        }

        if (_dloadFile == downNum) { MessageBox.Show("下载完成!"); this.panel1.Visible = false; }
    }
    finally
    {
        if (client != null)
        {
            client.Dispose();
        }
    }
}
/// <summary>
/// Per-file progress callback. Intentionally does nothing: overall progress is reported
/// per completed file instead.
/// </summary>
/// <remarks>
/// Earlier, now disabled, variants updated progressBar1 and label5 from
/// e.ProgressPercentage, e.BytesReceived and e.TotalBytesToReceive, optionally showing
/// the episode number from e.UserState.
/// </remarks>
void client_DownloadProgressChanged(object sender, DownloadProgressChangedEventArgs e)
{
}
还有一种办法能获取到最完整的源码:CefSharp
下一篇:写python实现爬取全职高手有声小说
以上纯属个人独自研究成果,仅供参考,转载请注明出处