前言:
本人喜欢看漫画,觉得好看就 下载下来 收藏。
所以写了个图片 下载器。
不少网站会保护自己,防止别人下载图片。
因此下载器先后改了好几次,不断进行技术升级。
早期 用WebClient 下载 HTML 分析 URL 下载。
后来 为了下载 手机网站的图片,用上了 User-Agent 特性,模拟手机浏览器。
之前遇到一个复杂些的 网站M ,页面用 JS 加载生成。无法直接获得HTML。
用Chrome一番分析后,发现其图片地址 有固定格式。文件名是 3位数数字,从1开始累加。
因此只要在浏览器 取到第一个图,后面的图片URL 生成下 也得到了。批量下载搞定。
前段时间 网站M 升级。图片展示改用 HTML5 canvas 显示,而且 图片路径 也加了随机字符,难以按规律生成了。
对于Chrome来说依然没有遮挡效果。依然可以取到 URL,可以手工保存图片, 然而 无法批量下载了。
问题来了:
无法取到 HTML分析出 图片路径,而且路径名称随机。那么如何批量下载图片?
既然 图片展示到我们面前,数据必然也已经到了 我们这边,就看我们怎么取得了。
解决办法:
WebClient不行,于是想到了 Winform 的 WebBrowser 实现浏览器,能显示必然能取到数据。
WebBrowser对于大部分网站还是可以的。然而对于 HTML5的 或者 新特性的 网站就有些 不方便了。有时候不能正常显示网站。例如,网站M.
它依赖 操作系统的 IE浏览器,而且默认是系统里老版本的 IE。可能是 IE7 吧。
一番资料查找,发现了 CefSharp。它是基于 Chromium 的开源项目 CEF(Chromium Embedded Framework)的 C# 封装,可以在 Winform、WPF 等中 嵌入浏览器。
CefSharp在网上有一些资料,不多。为了弄明白CefSharp里 取得 图片URL,图片数据。花了一番功夫,查资料,看源码。
倾情大放送,来一大波代码:
开发环境:VS2015 framework 4.5.2 x86模式 CefSharp 63.0.30
winform:
/// <summary>
/// Opens the address typed into <c>textBox1</c> in a brand-new embedded Chromium browser
/// that replaces the one currently hosted in <c>panel1</c>.
/// </summary>
/// <param name="sender">The button that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Clear the output box before opening a new address.
    richTextBox2.Text = "";

    // A new browser instance is created for every navigation instead of calling Load():
    // according to the original author, Load() made RegisterJsObject stop working
    // (possibly a CefSharp bug).
    //WebBrowser.Load(textBox1.Text);

    // Tear down the previous browser completely. Removing it from the panel alone would
    // leave the underlying browser alive and leak one instance per click.
    var previousBrowser = CefWebBrowser;
    if (previousBrowser != null)
    {
        panel1.Controls.Remove(previousBrowser);
        previousBrowser.FrameLoadStart -= WebBrowser_FrameLoadStart;
        previousBrowser.FrameLoadEnd -= WebBrowser_FrameLoadEnd;
        previousBrowser.Dispose();
    }

    CefWebBrowser = new ChromiumWebBrowser(textBox1.Text); // navigate to the address
    CefWebBrowser.FrameLoadStart += WebBrowser_FrameLoadStart; // frame load started
    CefWebBrowser.FrameLoadEnd += WebBrowser_FrameLoadEnd; // frame load finished

    // Key step: the custom request handler is what gives access to every loaded resource.
    CefWebBrowser.RequestHandler = new RequestHandler_new(CefWebBrowser.RequestHandler);

    // JavaScript <-> C# data exchange (currently disabled).
    //WebBrowser.RegisterJsObject("jsObj", new JsEvent(), new CefSharp.BindingOptions() { CamelCaseJavascriptNames = false });

    CefWebBrowser.Dock = DockStyle.Fill; // fill the hosting panel
    panel1.Controls.Add(CefWebBrowser);
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using CefSharp;
using System.IO;
namespace WFASpider
{
/// <summary>
/// Request handler that captures every image the embedded browser loads and writes it to
/// <see cref="_directory"/>. Callbacks for non-image resources are forwarded to the handler
/// passed to the constructor, when there is one.
/// </summary>
public class RequestHandler_new : CefSharp.Handler.DefaultRequestHandler //CefSharp.Example.Handlers
{
    // URL suffixes treated as images even when the resource type is not reported as Image.
    private static readonly string[] ImageExtensions = { ".jpg", ".png", ".gif", ".jpeg" };

    /// <summary>Directory the captured images are written to.</summary>
    public string _directory = "DownloadFile/";

    // In-flight response filters keyed by request identifier. An entry is added when the
    // filter is created and removed again once the resource has finished loading.
    private readonly Dictionary<ulong, MemoryStreamResponseFilter> responseDictionary = new Dictionary<ulong, MemoryStreamResponseFilter>();

    /// <summary>Optional handler that non-image callbacks are forwarded to; may be null.</summary>
    public IRequestHandler _requestHeandler;

    // Supplies the random suffix of generated file names.
    private readonly Random _rand = new Random();

    /// <summary>Creates the handler.</summary>
    /// <param name="rh">Handler to forward non-image callbacks to, or null for none.</param>
    public RequestHandler_new(IRequestHandler rh) : base()
    {
        _requestHeandler = rh;
    }

    /// <summary>
    /// Cancels requests whose URL is not a valid absolute URI, prints image URLs to the debug
    /// output and otherwise defers to the wrapped handler (or continues the request).
    /// </summary>
    /// <returns>
    /// <c>Cancel</c> for an unparsable URL, the wrapped handler's result when one is set,
    /// otherwise <c>Continue</c>.
    /// </returns>
    public override CefReturnValue OnBeforeResourceLoad(IWebBrowser browserControl, IBrowser browser, IFrame frame, IRequest request, IRequestCallback callback)
    {
        Uri url;
        if (Uri.TryCreate(request.Url, UriKind.Absolute, out url) == false)
        {
            // If we're unable to parse the Uri then cancel the request;
            // avoid throwing any exceptions here as we're being called by unmanaged code.
            return CefReturnValue.Cancel;
        }

        if (IsImageRequest(request))
        {
            System.Diagnostics.Debug.WriteLine(url); // print the image URL
        }

        // The official CefSharp example handler additionally shows how to set the referrer,
        // change request headers (e.g. User-Agent), inspect POST data and redirect a request
        // from inside this callback.

        if (_requestHeandler != null)
        {
            return _requestHeandler.OnBeforeResourceLoad(browserControl, browser, frame, request, callback);
        }
        return CefReturnValue.Continue;
    }

    /// <summary>
    /// Attaches an in-memory copying filter to image responses so that their bytes are
    /// available in <see cref="OnResourceLoadComplete"/>.
    /// </summary>
    /// <returns>
    /// A new <see cref="MemoryStreamResponseFilter"/> for images; otherwise the wrapped
    /// handler's filter, or null when there is no wrapped handler.
    /// </returns>
    public override IResponseFilter GetResourceResponseFilter(IWebBrowser browserControl, IBrowser browser, IFrame frame, IRequest request, IResponse response)
    {
        if (IsImageRequest(request))
        {
            var dataFilter = new MemoryStreamResponseFilter(); // collects the response body
            // Indexer instead of Add: a repeated identifier replaces the stale entry
            // instead of throwing into the unmanaged caller.
            responseDictionary[request.Identifier] = dataFilter;
            return dataFilter;
        }
        if (_requestHeandler != null)
        {
            return _requestHeandler.GetResourceResponseFilter(browserControl, browser, frame, request, response);
        }
        return null;
    }

    /// <summary>
    /// Saves the captured bytes of a finished image request to disk. Non-image requests are
    /// forwarded to the wrapped handler, if any.
    /// </summary>
    public override void OnResourceLoadComplete(IWebBrowser browserControl, IBrowser browser, IFrame frame, IRequest request, IResponse response, UrlRequestStatus status, long receivedContentLength)
    {
        if (IsImageRequest(request))
        {
            MemoryStreamResponseFilter filter;
            if (responseDictionary.TryGetValue(request.Identifier, out filter))
            {
                System.Diagnostics.Debug.WriteLine("responseDictionary.Count:" + responseDictionary.Count);
                // Drop the entry so finished requests do not pile up in memory.
                responseDictionary.Remove(request.Identifier);

                Uri url;
                Uri.TryCreate(request.Url, UriKind.Absolute, out url); // url stays null on failure
                SaveImage(url, filter);
            }
            return;
        }
        if (_requestHeandler != null)
        {
            _requestHeandler.OnResourceLoadComplete(browserControl, browser, frame, request, response, status, receivedContentLength);
        }
    }

    // True when the request is reported as an image or its URL ends with a known image extension.
    private static bool IsImageRequest(IRequest request)
    {
        if (request.ResourceType == ResourceType.Image)
        {
            return true;
        }
        Uri url;
        if (!Uri.TryCreate(request.Url, UriKind.Absolute, out url))
        {
            return false;
        }
        string text = url.ToString();
        foreach (string extension in ImageExtensions)
        {
            // Ordinal, case-insensitive: unlike ToLower() this does not depend on the current culture.
            if (text.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }
        return false;
    }

    // Writes the filter's data into _directory, preferring the file name from the URL and
    // falling back to a generated unique name. Never throws.
    private void SaveImage(Uri url, MemoryStreamResponseFilter filter)
    {
        byte[] data;
        try
        {
            data = filter.Data;
            if (!Directory.Exists(_directory))
            {
                Directory.CreateDirectory(_directory);
            }
        }
        catch (Exception ex)
        {
            // Called from unmanaged code: report the problem instead of letting it propagate.
            System.Diagnostics.Debug.WriteLine("Unable to prepare image download: " + ex.Message);
            return;
        }

        if (data == null || data.Length == 0)
        {
            return;
        }

        string urlName = GetFileNameFromUrl(url);
        if (urlName != null && TryWrite(urlName, data, false))
        {
            return;
        }
        TryWrite(GenerateFileName(), data, true); // save under a generated name
    }

    // Extracts a file-system-safe file name from the last path segment of the URL
    // (the query string is ignored). Returns null when no usable name is found.
    private static string GetFileNameFromUrl(Uri url)
    {
        if (url == null)
        {
            return null;
        }
        string name = url.AbsolutePath;
        int lastSlash = name.LastIndexOf('/');
        if (lastSlash >= 0)
        {
            name = name.Substring(lastSlash + 1);
        }
        name = Uri.UnescapeDataString(name);
        foreach (char invalid in Path.GetInvalidFileNameChars())
        {
            name = name.Replace(invalid, '_');
        }
        // Reject empty names and names made only of dots/spaces such as "." or "..".
        if (name.Trim('.', ' ').Length == 0)
        {
            return null;
        }
        return name;
    }

    // Timestamp plus random suffix, so that generated names are practically unique.
    private string GenerateFileName()
    {
        return DateTime.Now.ToString("yyyy-MM-dd-HH-mm-ss-fff-") + _rand.Next(99999, 999999) + ".png";
    }

    // Writes data to _directory/fileName. When overwrite is false an existing file is left
    // untouched. Returns true when the file was written; failures are logged, never thrown.
    private bool TryWrite(string fileName, byte[] data, bool overwrite)
    {
        try
        {
            string path = Path.Combine(_directory, fileName);
            if (!overwrite && File.Exists(path))
            {
                return false;
            }
            File.WriteAllBytes(path, data);
            return true;
        }
        catch (Exception ex)
        {
            System.Diagnostics.Debug.WriteLine("Unable to save '" + fileName + "': " + ex.Message);
            return false;
        }
    }
}
}
using CefSharp;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace WFASpider
{
//数据处理器,仅将 数据保存到内存。
/// <summary>
/// Response filter that passes the response body through unchanged while keeping a copy of
/// it in memory, readable through <see cref="Data"/>.
/// </summary>
public class MemoryStreamResponseFilter : IResponseFilter
{
    // Accumulates every chunk seen by Filter; null before InitFilter and after Dispose.
    private MemoryStream memoryStream;

    /// <summary>Creates the backing memory stream.</summary>
    /// <returns>Always true.</returns>
    bool IResponseFilter.InitFilter()
    {
        //NOTE: We could initialize this earlier, just one possible use of InitFilter
        memoryStream = new MemoryStream();
        return true;
    }

    /// <summary>
    /// Copies as much of <paramref name="dataIn"/> as fits into <paramref name="dataOut"/>
    /// and appends the same bytes to the in-memory copy.
    /// </summary>
    /// <param name="dataIn">Incoming chunk, or null when there is no input.</param>
    /// <param name="dataInRead">Number of bytes consumed from <paramref name="dataIn"/>.</param>
    /// <param name="dataOut">Output buffer of limited size.</param>
    /// <param name="dataOutWritten">Number of bytes written to <paramref name="dataOut"/>.</param>
    /// <returns>
    /// <c>NeedMoreData</c> when part of the input did not fit into the output buffer and
    /// must be offered again; otherwise <c>Done</c>.
    /// </returns>
    FilterStatus IResponseFilter.Filter(Stream dataIn, out long dataInRead, Stream dataOut, out long dataOutWritten)
    {
        if (dataIn == null)
        {
            dataInRead = 0;
            dataOutWritten = 0;
            return FilterStatus.Done;
        }

        if (memoryStream == null)
        {
            memoryStream = new MemoryStream();
        }

        // dataIn can be larger than dataOut; copying it wholesale would overflow the
        // output buffer, so only move what fits in this pass.
        long available = dataIn.Length;
        var buffer = new byte[(int)Math.Min(available, dataOut.Length)];

        int copied = 0;
        while (copied < buffer.Length)
        {
            int read = dataIn.Read(buffer, copied, buffer.Length - copied);
            if (read <= 0)
            {
                break;
            }
            copied += read;
        }

        // Important: forward the data to dataOut, then keep our own copy.
        dataOut.Write(buffer, 0, copied);
        memoryStream.Write(buffer, 0, copied);

        dataInRead = copied;
        dataOutWritten = copied;

        // Anything not consumed yet is delivered again in a further call.
        return copied < available ? FilterStatus.NeedMoreData : FilterStatus.Done;
    }

    /// <summary>Releases the in-memory copy. Safe to call more than once.</summary>
    void IDisposable.Dispose()
    {
        if (memoryStream != null)
        {
            memoryStream.Dispose();
            memoryStream = null;
        }
    }

    /// <summary>
    /// Snapshot of all bytes captured so far; an empty array when nothing has been captured
    /// or the filter has already been disposed.
    /// </summary>
    public byte[] Data
    {
        get { return memoryStream != null ? memoryStream.ToArray() : new byte[0]; }
    }
}
}