使用c#实现爬虫技术

这是我的第一个爬虫项目,也是我第一次接触c# 窗体程序。
我的需求:页面中有音频文件,但只能单个下载,用户需要一个一个地点击下载按钮进行下载。我的目的:根据用户的需求筛选出相关的数据,拿到页面上用户筛选出的数据后实现批量下载,并将下载的文件存放到用户本地文件夹中,然后对下载下来的这些文件进行播放。
主要用到的插件有:CefSharp HtmlAgilityPack
将浏览器页面嵌入到winForm中

将web页面嵌入到winForm的界面中

/// <summary>
/// Form load handler: initializes CefSharp, creates the embedded Chromium browser,
/// wires its frame-load handlers and docks it into the top panel.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form1_Load(object sender, EventArgs e)
{
    CefSettings settings = new CefSettings();
    Cef.Initialize(settings);

    // The literal below is the article's placeholder for the address of the page to embed.
    // It must be delimited by plain ASCII quotes; typographic quotes do not compile.
    webbrowser = new ChromiumWebBrowser("要嵌入的web地址");
    webbrowser.Dock = DockStyle.Fill;

    // Attach the handlers before the control is added to the form, so they are
    // already in place by the time the browser can start loading.
    webbrowser.FrameLoadEnd += Webbrowser_FrameLoadEnd; // URL-specific handling after a frame loads
    webbrowser.FrameLoadEnd += SetCookie;               // visits all cookies after a frame loads

    this.pnlTop.Controls.Add(webbrowser);
}

下面是获取web页面的url地址做相应的操作

/// <summary>
/// Runs when a frame has finished loading and dispatches on the main frame's URL:
/// the first URL triggers a redirect, "url1" passes the page source to
/// <c>SavaProcess</c>, and "url2" parses the page source and locates the pager element.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Frame-load event data; frames other than the main frame are ignored.</param>
private void Webbrowser_FrameLoadEnd(object sender, FrameLoadEndEventArgs e)
{
    if (!e.Frame.IsMain)
    {
        return;
    }

    if (e.Frame.Url == "页面的url地址(不同的地址处理不同的事情)")
    {
        string listPage = "想要跳转的页面地址";
        string js = "window.location.href='" + listPage + "';";
        // Executing this script inside the page performs the redirect.
        this.webbrowser.ExecuteScriptAsync(js);
        return;
    }

    if (e.Frame.Url == "url1")
    {
        e.Frame.GetSourceAsync().ContinueWith(task =>
        {
            // Reading Result of a faulted/cancelled task throws inside the continuation,
            // where nobody observes it; report the failure explicitly instead.
            if (task.IsFaulted || task.IsCanceled)
            {
                System.Diagnostics.Trace.TraceError("GetSourceAsync did not complete: " + task.Exception);
                return;
            }

            // The fetched page source; SavaProcess analyses it and returns a file path.
            string html = task.Result;
            string filePath = SavaProcess(html); // not used further in this excerpt
        });
        return;
    }

    if (e.Frame.Url == "url2")
    {
        e.Frame.GetSourceAsync().ContinueWith(task =>
        {
            if (task.IsFaulted || task.IsCanceled)
            {
                System.Diagnostics.Trace.TraceError("GetSourceAsync did not complete: " + task.Exception);
                return;
            }

            // Load the HTML into a document so it can be queried with DOM-like/XPath methods.
            var doc = new HtmlDocument();
            doc.LoadHtml(task.Result);

            // Get the total page count.
            request requoption = new request();
            requoption.Method = "POST";

            // The XPath below follows the structure of the page that was actually scraped.
            var pageTr = doc.DocumentNode.SelectNodes(@"/html[1]/body[1]/div[3]/table[1]/tbody[1]/tr[@class='forPage']/td[1]/div[1]/div[1]");

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (pageTr != null && pageTr.Count > 0)
            {
                // All <span> descendants of the first matched node.
                var spanNodes = pageTr[0].SelectNodes(@".//span");
                // NOTE(review): the processing of spanNodes was left out of the article.
            }
        });
    }
}

设置cookie方法

/// <summary>
/// Frame-load handler that asks CefSharp's global cookie manager to visit all cookies
/// with a <c>CookieVisitor</c> whose <c>SendCookie</c> event is routed to
/// <c>Visitor_SendCookie</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Frame-load event data (unused).</param>
private void SetCookie(object sender, CefSharp.FrameLoadEndEventArgs e)
{
    var globalCookies = CefSharp.Cef.GetGlobalCookieManager();

    // NOTE(review): CookieVisitor is declared elsewhere; presumably it raises
    // SendCookie once per visited cookie — confirm against its definition.
    var cookieCollector = new CookieVisitor();
    cookieCollector.SendCookie += Visitor_SendCookie;

    globalCookies.VisitAllCookies(cookieCollector);
}

/// <summary>
/// Stores a cookie in the <c>cookies</c> dictionary, replacing any earlier value
/// recorded for the same domain and name.
/// </summary>
/// <param name="obj">The cookie to store.</param>
private void Visitor_SendCookie(CefSharp.Cookie obj)
{
    // Key format is "domain^name", with leading dots trimmed from the domain.
    string key = obj.Domain.TrimStart('.') + "^" + obj.Name;

    lock (lockObject)
    {
        // The indexer adds the entry or overwrites it in a single lookup,
        // so no separate ContainsKey check is needed.
        cookies[key] = obj.Value;
    }
}

/// <summary>
/// Converts the entries of the <c>cookies</c> dictionary into a
/// <see cref="CookieCollection"/> of <see cref="System.Net.Cookie"/> objects.
/// </summary>
/// <returns>A new collection containing one cookie per dictionary entry.</returns>
private CookieCollection GetCookieCollection()
{
    lock (lockObject)
    {
        CookieCollection cookieCollection = new CookieCollection();
        foreach (var keyValue in cookies)
        {
            // Keys have the form "domain^name". Split once, and only at the first '^',
            // so a cookie name that itself contains '^' is not truncated.
            string[] parts = keyValue.Key.Split(new[] { '^' }, 2);

            System.Net.Cookie cookie = new System.Net.Cookie();
            cookie.Domain = parts[0];
            cookie.Name = parts[1];
            cookie.Value = keyValue.Value;
            cookieCollection.Add(cookie);
        }
        return cookieCollection;
    }
}

下面是已经拿到音频文件的地址了,然后请求下载地址下载文件

/// <summary>
/// Downloads the resource at <paramref name="url"/> and saves it as
/// <paramref name="fileName"/> in the "AudioList\AMR" folder under the application's
/// base directory.
/// </summary>
/// <param name="url">Download address of the file.</param>
/// <param name="fileName">Name of the file to create inside the target folder.</param>
/// <param name="data">
/// Record whose <c>LocalPath</c> is set to the saved file's path once the download
/// has completed; it is left untouched when the download fails.
/// </param>
/// <remarks>
/// Failures are not rethrown; they are reported through
/// <see cref="System.Diagnostics.Trace"/>.
/// </remarks>
public void HttpWebRequestGet(Uri url, string fileName, DataModel data)
{
    try
    {
        HttpWebRequest AudioReq = (HttpWebRequest)WebRequest.Create(url);
        AudioReq.Method = "GET";
        AudioReq.ContentType = "application/x-www-form-urlencoded";
        AudioReq.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
        AudioReq.KeepAlive = true;
        AudioReq.Referer = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
        AudioReq.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36";

        // Let the framework advertise gzip/deflate AND decode the body. Sending the
        // Accept-Encoding header by hand would store a compressed response as-is.
        AudioReq.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        AudioReq.Headers.Set("Accept-Language", "zh-CN,zh;q=0.9");
        AudioReq.Headers.Set("Upgrade-Insecure-Requests", "1");
        AudioReq.Headers.Set("Cookie", "JSESSIONID=" + JSESSIONID + ";rememberPass=1;userAccount=" + uid + ";#pwd=" + pwd + ";loginByTwoCode=0");

        string path = System.AppDomain.CurrentDomain.BaseDirectory + @"AudioList\AMR";
        // CreateDirectory does nothing when the directory already exists.
        System.IO.Directory.CreateDirectory(path);
        var localAmrnb = path + "\\" + fileName;

        // The using blocks release the response, its stream and the file even when
        // the transfer fails part-way.
        using (HttpWebResponse rsp = (HttpWebResponse)AudioReq.GetResponse())
        using (Stream responseStream = rsp.GetResponseStream())
        using (FileStream fs = new FileStream(localAmrnb, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
        {
            responseStream.CopyTo(fs);
        }

        // Publish the path only after the whole file has been written.
        data.LocalPath = localAmrnb;
    }
    catch (Exception ex)
    {
        // Keep the original "never throws" contract, but leave a trace of the failure.
        System.Diagnostics.Trace.TraceError("Download of " + url + " failed: " + ex);
    }
}

c#序列化数据并写入文件
// Serialize the list to JSON and overwrite the data file with it.
// NOTE(review): the generic type argument was lost in the article's formatting;
// DataModel is assumed from the parameter type of HttpWebRequestGet — confirm.
List<DataModel> dataList = new List<DataModel>();
using (System.IO.StreamWriter file1 = new System.IO.StreamWriter(DownloadDataPath, false))
{
    file1.Write(new JavaScriptSerializer().Serialize(dataList));
}
从文件中读取数据并反序列化
// Read the previously saved JSON from the first line of the data file.
// NOTE(review): assumes the serialized list occupies a single line — confirm.
using (System.IO.StreamReader sr = new System.IO.StreamReader(DownloadDataPath, Encoding.UTF8))
{
    string line = sr.ReadLine();
    if (line != null)
    {
        oldData = line;
    }
}

// Deserialize BEFORE reopening the file for writing: opening it with append=false
// truncates it, so a failure here would otherwise lose the saved data.
// NOTE(review): the element type was lost in the article's formatting; DataModel is
// assumed from the parameter type of HttpWebRequestGet — confirm.
List<DataModel> oldDataList = string.IsNullOrEmpty(oldData)
    ? new List<DataModel>()
    : (new JavaScriptSerializer().Deserialize<List<DataModel>>(oldData) ?? new List<DataModel>());

// Append the previously saved records after the new ones.
dataList.AddRange(oldDataList);

using (System.IO.StreamWriter file2 = new System.IO.StreamWriter(DownloadDataPath, false))
{
    file2.Write(new JavaScriptSerializer().Serialize(dataList));
}

下面向窗体中添加 MediaPlayer 播放器组件
首先添加引用如下图所示:
使用c#实现爬虫技术_第1张图片
其次将mediaPlayer组件添加到工具箱中,菜单栏:工具—>选择工具箱选项,添加如下组件
使用c#实现爬虫技术_第2张图片
添加完之后就可以在工具箱中将组件直接拖到界面上了。

具体实现播放的代码如下所示

	/// <summary>
	/// Clears the player's current playlist and appends one media item for the
	/// <c>LocalPath</c> of every entry in <c>oldDataList</c>.
	/// </summary>
	/// <returns>Always <c>true</c>.</returns>
	public Boolean getMediaPlayData()
	{
	    this.playMedia.currentPlaylist.clear();

	    // Queue every file that should be played.
	    foreach (var entry in oldDataList)
	    {
	        var media = playMedia.newMedia(entry.LocalPath);
	        this.playMedia.currentPlaylist.appendItem(media);
	    }

	    return true;
	}
/// <summary>
/// Click handler of the "query and play" button: rebuilds the playlist and, when that
/// reports success, starts playback with auto-start on and shuffle off.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    if (!getMediaPlayData())
    {
        return;
    }

    playMedia.settings.autoStart = true;
    playMedia.settings.setMode("shuffle", false);
    playMedia.Ctlcontrols.play();
}
/// <summary>
/// Play-state handler: starts playback again whenever the player reports the
/// <c>wmppsReady</c> state, so that the next file is played once one has finished.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data carrying the new play state.</param>
private void wmp_PlayStateChange(object sender, AxWMPLib._WMPOCXEvents_PlayStateChangeEvent e)
{
    var currentState = (WMPLib.WMPPlayState)e.newState;
    if (currentState != WMPLib.WMPPlayState.wmppsReady)
    {
        return;
    }

    playMedia.Ctlcontrols.play();
}

以上不是完整的代码。
总体来说把大致的过程和用到的一些技术记录下来,加深记忆。

你可能感兴趣的:(c#后台)