With Python as popular as it is now, countless people are learning web scraping, and anti-scraping measures keep multiplying. They essentially boil down to three techniques: (1) the backend limits how many requests each IP may make, which can be worked around with proxy IPs; (2) the server inspects the request's origin data to decide whether the visitor is a real browser or a bot; (3) the data is no longer sent in plain text but encrypted with JavaScript (the JS files themselves may also be obfuscated), or packed into images or other formats, so the user never gets the raw data itself.
For points 2 and 3, the approach here is to embed an open-source Chromium inside the program, letting it run all the JS encryption and pass the request-origin checks just like a normal browser, so the data can be captured.
For details, see https://blog.csdn.net/c0411034/article/details/82841696
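Before any of this works, the embedded Chromium has to be initialized once at application startup. A minimal sketch (not from the post above), assuming CefSharp.Wpf; the proxy switch is optional and only relevant to point 1, and the proxy address is a placeholder:

using System.Windows;
using CefSharp;
using CefSharp.Wpf;

// e.g. in App.xaml.cs, before the main window (and its ChromiumWebBrowser) is created
protected override void OnStartup(StartupEventArgs e)
{
    var settings = new CefSettings();

    // Optional: route all traffic through a proxy IP (point 1 above); the address is a placeholder
    settings.CefCommandLineArgs.Add("proxy-server", "http://127.0.0.1:8888");

    Cef.Initialize(settings);

    base.OnStartup(e);
}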
Scraping the data mainly comes down to analysing the network traffic behind the page: just like pressing F12 in Chrome to open the developer tools and switching to the NETWORK tab, here you can intercept every network request and response the page makes.
To intercept the traffic, derive a class from CefSharp.Handler.DefaultRequestHandler and override the relevant methods.
using System;
using System.Collections.Generic;
using System.Text;
using CefSharp;

public class RequestHandler_new : CefSharp.Handler.DefaultRequestHandler
{
private Dictionary<string, IResponseFilter> responseDictionary = new Dictionary<string, IResponseFilter>();
public IRequestHandler _requestHandler;
public RequestHandler_new(IRequestHandler rh) : base()
{
_requestHandler = rh;
}
public override CefReturnValue OnBeforeResourceLoad(IWebBrowser browserControl, IBrowser browser, IFrame frame, IRequest request, IRequestCallback callback)
{
// Validate the URL first; new Uri() would throw on a malformed address
Uri url;
if (Uri.TryCreate(request.Url, UriKind.Absolute, out url) == false)
{
return CefReturnValue.Cancel;
}
var extension = url.ToString().ToLower();
if (request.ResourceType == ResourceType.Image
|| extension.EndsWith(".jpg")
|| extension.EndsWith(".png")
|| extension.EndsWith(".gif")
|| extension.EndsWith(".jpeg"))
{
// Filter out resources we don't need, such as images, to avoid long load times
return CefReturnValue.Cancel;
}
if (_requestHandler != null)
{
return _requestHandler.OnBeforeResourceLoad(browserControl, browser, frame, request, callback);
}
return CefReturnValue.Continue;
}
public override IResponseFilter GetResourceResponseFilter(IWebBrowser browserControl, IBrowser browser, IFrame frame, IRequest request, IResponse response)
{
var url = new Uri(request.Url);
//Console.WriteLine("Finished loading " + url);
if (_requestHandler != null)
{
return _requestHandler.GetResourceResponseFilter(browserControl, browser, frame, request, response);
}
if (url.AbsoluteUri.Contains("XXX")) // put a keyword from the URL of the data you want to intercept here
{
var filter = FilterManager.CreateFilter(request.Identifier.ToString());
return filter;
}
return null;
}
Random _rand = new Random();
public override void OnResourceLoadComplete(IWebBrowser browserControl, IBrowser browser, IFrame frame, IRequest request, IResponse response, UrlRequestStatus status, long receivedContentLength)
{
var url = new Uri(request.Url);
//Console.WriteLine("Load finished with status " + status);
if (status == UrlRequestStatus.Failed)
{
if (url.AbsoluteUri.Contains("XXX")) // put a keyword from the URL you want to intercept here
{
// loading the resource failed; handle the failure here (e.g. schedule a retry)
}
}
if (url.AbsoluteUri.Contains("XXX")) // put a keyword from the URL you want to intercept here
{
if (response.StatusCode != 200)
{
// the request failed; signal the caller so it can refresh the page and fetch the data again
}
var filter = FilterManager.GetFilter(request.Identifier.ToString()) as MemoryStreamResponseFilter;
if (filter == null)
{
return;
}
byte[] data = filter.Data;
filter_NotifyData(data);
string str = System.Text.Encoding.UTF8.GetString(data); // the intercepted response body as text
if (request.PostData != null)
{
foreach (IPostDataElement item in request.PostData.Elements)
{
string postData = System.Text.Encoding.UTF8.GetString(item.Bytes);
// postData is the POST form the page submitted; it can be parsed, e.g. to find
// which page number the returned data belongs to
}
}
}
if (_requestHandler != null)
{
_requestHandler.OnResourceLoadComplete(browserControl, browser, frame, request, response, status, receivedContentLength);
}
}
public override bool OnResourceResponse(IWebBrowser browserControl, IBrowser browser, IFrame frame, IRequest request, IResponse response)
{
//NOTE: You cannot modify the response, only the request
// You can now access the headers
//var headers = response.ResponseHeaders;
try
{
// Forward the expected Content-Length to the filter created for this request (if any)
var contentLength = int.Parse(response.ResponseHeaders["Content-Length"]);
var filter = FilterManager.GetFilter(request.Identifier.ToString()) as MemoryStreamResponseFilter;
if (filter != null)
{
filter.SetContentLength(contentLength);
}
}
catch { /* Content-Length header missing or not numeric; ignore */ }
return false;
}
public event Action<byte[]> NotifyData;
void filter_NotifyData(byte[] data)
{
if (NotifyData != null)
{
NotifyData(data);
}
}
}
The class above uses a filter class named MemoryStreamResponseFilter, whose job is to stitch the intercepted data back together (by default a single read only yields roughly 427 Chinese characters' worth of data), plus a FilterManager class that manages the filters.

### MemoryStreamResponseFilter
using System;
using System.IO;
using System.Collections.Generic;
using CefSharp;

public class MemoryStreamResponseFilter : IResponseFilter
{
private MemoryStream memoryStream;
public event Action<byte[]> NotifyData;
private int contentLength = 0;
bool IResponseFilter.InitFilter()
{
//NOTE: We could initialize this earlier, just one possible use of InitFilter
memoryStream = new MemoryStream();
return true;
}
public void SetContentLength(int contentLength)
{
this.contentLength = contentLength;
}
FilterStatus IResponseFilter.Filter(Stream dataIn, out long dataInRead, Stream dataOut, out long dataOutWritten)
{
if (dataIn == null)
{
dataInRead = 0;
dataOutWritten = 0;
return FilterStatus.Done;
}
dataInRead = dataIn.Length;
dataOutWritten = Math.Min(dataInRead, dataOut.Length);
//Important: copy dataIn to dataOut so the page still receives the response
dataIn.CopyTo(dataOut);
//Also copy the chunk into memoryStream; the same filter instance is reused for every
//chunk of this response, so the full body accumulates here
dataIn.Position = 0;
dataIn.CopyTo(memoryStream);
return FilterStatus.Done;
}
void IDisposable.Dispose()
{
memoryStream.Dispose();
memoryStream = null;
}
public byte[] Data
{
get { return memoryStream.ToArray(); }
}
}
### FilterManager
public class FilterManager
{
private static Dictionary<string, IResponseFilter> dataList = new Dictionary<string, IResponseFilter>();
public static IResponseFilter CreateFilter(string guid)
{
lock (dataList)
{
var filter = new MemoryStreamResponseFilter();
dataList.Add(guid, filter);
return filter;
}
}
public static IResponseFilter GetFilter(string guid)
{
lock (dataList)
{
// TryGetValue avoids the KeyNotFoundException when no filter was created for this request
IResponseFilter filter;
dataList.TryGetValue(guid, out filter);
return filter;
}
}
}
Add the ChromiumWebBrowser control to the XAML view, as sketched below.
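A minimal sketch of that XAML, assuming a WPF window and the CefSharp.Wpf control; the window class name, title and start address are placeholders:

<Window x:Class="SpiderDemo.MainWindow"
        xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
        xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
        xmlns:cef="clr-namespace:CefSharp.Wpf;assembly=CefSharp.Wpf"
        Title="Spider" Width="1024" Height="768">
    <Grid>
        <cef:ChromiumWebBrowser x:Name="browser" Address="https://www.example.com" />
    </Grid>
</Window>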
Then, in the code-behind (xaml.cs), set this control's RequestHandler. Note that the constructor takes an inner IRequestHandler to chain to; pass null if you don't need one:
browser.RequestHandler = new RequestHandler_new(null);
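To actually consume the intercepted bytes, also subscribe to the handler's NotifyData event before assigning it. A small sketch; the lambda body is just a placeholder for your own processing:

var handler = new RequestHandler_new(null);   // null: no inner IRequestHandler to chain to
handler.NotifyData += data =>
{
    // data is the raw response body collected by the filter; decode as UTF-8 for text/JSON responses
    string text = System.Text.Encoding.UTF8.GetString(data);
    System.Diagnostics.Debug.WriteLine(text);
};
browser.RequestHandler = handler;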
Sometimes, to speed up scraping, you need to keep clicking the page's refresh button, or its next/previous page buttons, to walk through the data; that means driving the page from code, as sketched below.
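One common way to drive the page is to inject JavaScript with CefSharp's ExecuteScriptAsync. A minimal sketch, assuming the target page has a next-page button reachable via the CSS selector .next-page (the selector is a placeholder you need to adapt to the real site):

// Click the "next page" button by running JS inside the page
browser.ExecuteScriptAsync("document.querySelector('.next-page').click();");

// Or simply reload the current page to refresh the data
browser.Reload();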
For the detailed implementation, see:
https://blog.csdn.net/c0411034/article/details/82841689
https://github.com/cefsharp/CefSharp/wiki/CefSharp中文帮助文档
https://blog.csdn.net/jinxiu0406/article/details/80365140
https://www.jb51.net/article/56316.htm