C# WPF 使用CefSharp 做爬虫,破除所有前端js加密限制

1.引言

现在随着python火热,学爬虫的人不计其数,防爬虫的手段也层出不穷,但防爬虫无非通过3种手段:1.后台限制IP访问次数,这个可以通过使用代理IP破解;2.检测访问来源数据,判定访问者是真实用户还是爬虫(机器人);3.数据传输加密,不再明文传输数据,通过js加密,再加密js文件,或者直接把数据封装成图片等其他格式,不让用户获取到数据本身。
针对第2、3点,这里就实现通过在程序里面嵌入一个开源Chromium,去破解所有的js加密和访问来源限制,达到抓取数据的目的

2.CefSharp安装

详见https://blog.csdn.net/c0411034/article/details/82841696

3.爬取数据

爬取数据,主要是分析后台获取到网络包的内容,就像使用chrome浏览器F12键打开调试器,选择NETWORK一样,你可以从中截取到所有的网络包
截取数据,需要继承CefSharp.Handler.DefaultRequestHandler类,并重写(override)其中相应的方法。

/// <summary>
/// Request handler that cancels image requests, installs a capturing response filter on
/// requests whose URL contains <see cref="TargetUrlKeyword"/>, and publishes the captured
/// response body through <see cref="NotifyData"/> when the resource has finished loading.
/// An optional wrapped handler receives the calls this class does not answer itself.
/// </summary>
public class RequestHandler_new : CefSharp.Handler.DefaultRequestHandler
{
    /// <summary>
    /// Placeholder: replace with a keyword contained in the URL of the data you want to capture.
    /// </summary>
    private const string TargetUrlKeyword = "XXX";

    /// <summary>Path suffixes that are cancelled so that image downloads do not slow page loading.</summary>
    private static readonly string[] BlockedImageExtensions = { ".jpg", ".png", ".gif", ".jpeg" };

    /// <summary>Optional wrapped handler; may be null.</summary>
    public IRequestHandler _requestHeandler;

    // NOTE(review): this field is never assigned anywhere in this class, so the
    // content-length forwarding in OnResourceResponse is currently inactive.
    private MemoryStreamResponseFilter filter = null;

    /// <summary>Raised with the complete captured response body of a matching request.</summary>
    public event Action<byte[]> NotifyData;

    /// <summary>Creates the handler.</summary>
    /// <param name="rh">Handler to delegate to, or null (the default) for none.</param>
    public RequestHandler_new(IRequestHandler rh = null) : base()
    {
        _requestHeandler = rh;
    }

    /// <summary>
    /// Cancels requests whose URL is not a valid absolute URI and requests for images;
    /// otherwise delegates to the wrapped handler, or continues when there is none.
    /// </summary>
    public override CefReturnValue OnBeforeResourceLoad(IWebBrowser browserControl, IBrowser browser, IFrame frame, IRequest request, IRequestCallback callback)
    {
        // Validate before use: constructing a Uri directly would throw on a malformed URL.
        Uri url;
        if (!Uri.TryCreate(request.Url, UriKind.Absolute, out url))
        {
            return CefReturnValue.Cancel;
        }

        // Filter out resources we do not need (images) to avoid long loading times.
        if (request.ResourceType == ResourceType.Image || HasBlockedImageExtension(url))
        {
            return CefReturnValue.Cancel;
        }

        if (_requestHeandler != null)
        {
            return _requestHeandler.OnBeforeResourceLoad(browserControl, browser, frame, request, callback);
        }

        return CefReturnValue.Continue;
    }

    /// <summary>
    /// Returns the response filter for a request. When a wrapped handler exists its answer
    /// is returned as-is (no capture filter is installed in that case); otherwise a capture
    /// filter is registered under the request identifier for URLs containing the keyword.
    /// </summary>
    /// <returns>The filter to apply, or null for no filtering.</returns>
    public override IResponseFilter GetResourceResponseFilter(IWebBrowser browserControl, IBrowser browser, IFrame frame, IRequest request, IResponse response)
    {
        if (_requestHeandler != null)
        {
            return _requestHeandler.GetResourceResponseFilter(browserControl, browser, frame, request, response);
        }

        if (IsTargetUrl(request.Url))
        {
            return FilterManager.CreateFilter(request.Identifier.ToString());
        }

        return null;
    }

    /// <summary>
    /// For matching URLs, publishes the captured body via <see cref="NotifyData"/>;
    /// then always forwards the notification to the wrapped handler, if any.
    /// </summary>
    public override void OnResourceLoadComplete(IWebBrowser browserControl, IBrowser browser, IFrame frame, IRequest request, IResponse response, UrlRequestStatus status, long receivedContentLength)
    {
        if (IsTargetUrl(request.Url))
        {
            if (status == UrlRequestStatus.Failed)
            {
                // Loading the target resource failed; add retry handling here.
            }

            if (response.StatusCode != 200)
            {
                // The data request did not succeed; signal the application to refresh here.
            }

            PublishCapturedData(request);
        }

        if (_requestHeandler != null)
        {
            _requestHeandler.OnResourceLoadComplete(browserControl, browser, frame, request, response, status, receivedContentLength);
        }
    }

    /// <summary>
    /// Forwards the Content-Length response header to the current filter, when both exist.
    /// </summary>
    /// <returns>Always false.</returns>
    public override bool OnResourceResponse(IWebBrowser browserControl, IBrowser browser, IFrame frame, IRequest request, IResponse response)
    {
        // NOTE: the response cannot be modified here, only inspected.
        var headers = response.ResponseHeaders;
        if (filter != null && headers != null)
        {
            // A missing or malformed header is simply ignored instead of raising and swallowing an exception.
            int contentLength;
            if (int.TryParse(headers["Content-Length"],
                             System.Globalization.NumberStyles.Integer,
                             System.Globalization.CultureInfo.InvariantCulture,
                             out contentLength))
            {
                filter.SetContentLength(contentLength);
            }
        }

        return false;
    }

    /// <summary>True when the path of <paramref name="url"/> ends with a blocked image extension (case-insensitive).</summary>
    private static bool HasBlockedImageExtension(Uri url)
    {
        // Compare against the path only, so a query string does not hide the extension.
        string path = url.AbsolutePath;
        foreach (string extension in BlockedImageExtensions)
        {
            if (path.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>True when <paramref name="rawUrl"/> is a valid absolute URI containing the target keyword.</summary>
    private static bool IsTargetUrl(string rawUrl)
    {
        Uri url;
        return Uri.TryCreate(rawUrl, UriKind.Absolute, out url)
            && url.AbsoluteUri.Contains(TargetUrlKeyword);
    }

    /// <summary>
    /// Looks up the capture filter registered for the request, raises <see cref="NotifyData"/>
    /// with its bytes, and walks the request's POST elements (extension point).
    /// </summary>
    private void PublishCapturedData(IRequest request)
    {
        var captured = FilterManager.GetFileter(request.Identifier.ToString()) as MemoryStreamResponseFilter;
        if (captured == null)
        {
            return;
        }

        filter_NotifyData(captured.Data);

        // Requests without a body have no post data.
        var post = request.PostData;
        if (post == null)
        {
            return;
        }

        foreach (IPostDataElement item in post.Elements)
        {
            if (item.Bytes == null)
            {
                continue;
            }

            string postData = System.Text.Encoding.UTF8.GetString(item.Bytes);
            // postData is the form data submitted by the page; it can be used to
            // work out which query/page the captured response belongs to.
        }
    }

    /// <summary>Raises <see cref="NotifyData"/> if there are subscribers.</summary>
    void filter_NotifyData(byte[] data)
    {
        // Copy to a local so an unsubscribe on another thread cannot null the delegate mid-call.
        var handler = NotifyData;
        if (handler != null)
        {
            handler(data);
        }
    }
}

上面的类中用到了一个名为MemoryStreamResponseFilter的Filter类,目的是整合截取到的数据(默认获取的数据长度大约只有427个汉字);还有一个管理Filter的FilterManager类。
### MemoryStreamResponseFilter

/// <summary>
/// Response filter that passes the response through unchanged while keeping a copy of
/// every byte in an in-memory stream, exposed through <see cref="Data"/>.
/// </summary>
public class MemoryStreamResponseFilter : IResponseFilter
{
    private MemoryStream memoryStream;

    // Declared for subscribers; this class itself never raises it.
    public event Action NotifyData;

    // Stored by SetContentLength; not otherwise read by this class.
    private int contentLength = 0;

    /// <summary>Creates the backing stream.</summary>
    /// <returns>Always true.</returns>
    bool IResponseFilter.InitFilter()
    {
        //NOTE: We could initialize this earlier, just one possible use of InitFilter
        memoryStream = new MemoryStream();

        return true;
    }

    /// <summary>Records the expected content length of the response.</summary>
    public void SetContentLength(int contentLength)
    {
        this.contentLength = contentLength;
    }

    /// <summary>
    /// Copies as much of <paramref name="dataIn"/> as fits into <paramref name="dataOut"/>,
    /// mirrors the same bytes into the in-memory copy, and reports the byte counts.
    /// </summary>
    /// <returns>
    /// <c>Done</c> when all input was consumed (or there was none);
    /// <c>NeedMoreData</c> when input remains because the output buffer was smaller.
    /// </returns>
    FilterStatus IResponseFilter.Filter(Stream dataIn, out long dataInRead, Stream dataOut, out long dataOutWritten)
    {
        if (dataIn == null)
        {
            dataInRead = 0;
            dataOutWritten = 0;

            return FilterStatus.Done;
        }

        // dataIn can be larger than dataOut; never write more than the output can hold.
        int capacity = (int)Math.Min(dataIn.Length, dataOut.Length);
        var buffer = new byte[capacity];

        // Stream.Read may return fewer bytes than requested, so loop until filled or exhausted.
        int total = 0;
        while (total < capacity)
        {
            int read = dataIn.Read(buffer, total, capacity - total);
            if (read == 0)
            {
                break;
            }
            total += read;
        }

        // Important: pass the data on to the browser, then keep our own copy.
        dataOut.Write(buffer, 0, total);
        memoryStream.Write(buffer, 0, total);

        dataInRead = total;
        dataOutWritten = total;

        // Input left over: request another call so the remainder is delivered and captured.
        return total < dataIn.Length ? FilterStatus.NeedMoreData : FilterStatus.Done;
    }

    /// <summary>Releases the backing stream; safe to call more than once or before InitFilter.</summary>
    void IDisposable.Dispose()
    {
        if (memoryStream != null)
        {
            memoryStream.Dispose();
            memoryStream = null;
        }
    }

    /// <summary>
    /// All bytes captured so far; an empty array when the stream has not been created
    /// or has already been disposed.
    /// </summary>
    public byte[] Data
    {
        get { return memoryStream != null ? memoryStream.ToArray() : new byte[0]; }
    }
}

FilterManager

/// <summary>
/// Process-wide registry that maps a request identifier to the response filter created for it.
/// NOTE(review): entries are never removed by this class, so the registry grows with every
/// captured request; consider adding a removal step once a request's data has been consumed.
/// </summary>
public class FilterManager
{
    // Dedicated lock object instead of locking on the collection itself.
    private static readonly object gate = new object();

    private static readonly Dictionary<string, IResponseFilter> dataList = new Dictionary<string, IResponseFilter>();

    /// <summary>Creates a new capture filter and registers it under <paramref name="guid"/>.</summary>
    /// <param name="guid">Key identifying the request; a repeated key replaces the earlier filter.</param>
    /// <returns>The newly created filter.</returns>
    public static IResponseFilter CreateFilter(string guid)
    {
        lock (gate)
        {
            var filter = new MemoryStreamResponseFilter();
            // Indexer assignment: registering the same id twice must not throw.
            dataList[guid] = filter;

            return filter;
        }
    }

    /// <summary>Returns the filter registered under <paramref name="guid"/>.</summary>
    /// <returns>The registered filter, or null when there is none.</returns>
    public static IResponseFilter GetFileter(string guid)
    {
        lock (gate)
        {
            IResponseFilter filter;
            return dataList.TryGetValue(guid, out filter) ? filter : null;
        }
    }
}

使用方法

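在xaml界面中添加控件(需要先在窗口根元素上声明命名空间 xmlns:cefSharp="clr-namespace:CefSharp.Wpf;assembly=CefSharp.Wpf";控件名browser与下面xaml.cs中使用的名称对应):

<cefSharp:ChromiumWebBrowser x:Name="browser" />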

然后在xaml.cs中修改此控件的RequestHandler

// The constructor takes the handler to wrap; pass null when there is none to delegate to.
browser.RequestHandler = new RequestHandler_new(null);

4.操控页面加快爬取速度,和进行ajax翻页

有时候为了加快抓取数据的速度,可能需要不断点击页面上的刷新按钮,或者点击下一页、上一页来遍历数据,这就需要对页面进行操控。
具体实现方法,详见
https://blog.csdn.net/c0411034/article/details/82841689

5.参考文档

https://github.com/cefsharp/CefSharp/wiki/CefSharp中文帮助文档
https://blog.csdn.net/jinxiu0406/article/details/80365140
https://www.jb51.net/article/56316.htm

你可能感兴趣的:(c#,WPF)