(精华)2020年8月14日 C#基础知识点 爬虫专题(腾讯课堂)

#region 抓取腾讯课堂类别数据 
// Usage example 1: crawl the Tencent Classroom category data.
// CategorySearch.Crawler() downloads the course-list page, walks the category
// levels and hands the collected entities to CategoryRepository.
ISearch search = new CategorySearch();
search.Crawler();
#endregion
#region 抓取课程
// Usage example 2: crawl the courses that belong to one category.
// NOTE(review): `category` is not declared in this snippet — presumably a
// TencentCategoryEntity obtained elsewhere (e.g. from the category crawl); confirm.
ISearch search1 = new CourseSearch(category);
search1.Crawler();
#endregion
#region 获取Ajax数据 
// Usage example 3: request data from an Ajax endpoint and parse the JSON reply.
// The parameterless constructor is enough here because GetAjaxRequest() does not
// read the category field.
CourseSearch courseSearch = new CourseSearch();
courseSearch.GetAjaxRequest();
#endregion

CategorySearch–Crawler

/// <summary>
    /// Crawls the Tencent Classroom category tree (up to three levels) and saves it
    /// through <c>CategoryRepository</c>.
    /// XPath syntax reference: http://www.w3school.com.cn/xpath/index.asp
    /// </summary>
    public class CategorySearch : ISearch
    {
        /// <summary>XPath of the category entries (<c>dd</c> elements) on a course-list page.</summary>
        private const string CategoryNodePath = "//*[@id='auto-test-1']/div[1]/dl/dd";

        /// <summary>Maximum number of characters read after the query key as the category code.</summary>
        private const int CodeLength = 4;

        /// <summary>Link text of the "all" entry, which has no sub-categories to descend into.</summary>
        private const string AllCategoryName = "全部";

        private static Logger logger = new Logger(typeof(CategorySearch));

        // Next entity id. It starts at 1 per instance, so create a new CategorySearch
        // for every crawl to restart the numbering.
        private int _Count = 1;

        /// <summary>
        /// Crawls every category: downloads the course-list page, extracts the first-level
        /// entries, descends into the second and third levels, and saves the result.
        /// </summary>
        /// <remarks>
        /// Exceptions are logged and not rethrown. When the landing page cannot be downloaded
        /// or contains no category nodes, a warning is logged and nothing is saved.
        /// The number of collected categories is always written to the console.
        /// </remarks>
        public void Crawler()
        {
            List<TencentCategoryEntity> categoryList = new List<TencentCategoryEntity>();
            try
            {
                // Landing page that lists the first-level categories.
                string url = $"{Constant.TencentClassUrl}/course/list/?tuin=7e4f8b7d";
                HtmlNodeCollection nodeList = this.LoadCategoryNodes(url);
                if (nodeList == null)
                {
                    // Download failed or the page layout changed: do not overwrite stored data
                    // with an empty tree.
                    logger.Warn($"抓取{url}未解析到一级类别节点");
                    return;
                }

                foreach (HtmlNode node in nodeList)
                {
                    categoryList.AddRange(this.First(node.InnerHtml, null));
                }

                // Everything has been collected at this point; persist it.
                CategoryRepository categoryRepository = new CategoryRepository();
                categoryRepository.Save(categoryList);
            }
            catch (Exception ex)
            {
                logger.Error("CrawlerMuti出现异常", ex);
            }
            finally
            {
                Console.WriteLine($"类型数据初始化完成,共抓取类别{categoryList.Count}个");
            }
        }

        /// <summary>
        /// Builds the entity for one first-level category and collects everything below it.
        /// </summary>
        /// <param name="html">Inner HTML of the first-level <c>dd</c> node.</param>
        /// <param name="parentCode">Code of the parent category; the crawler passes null here.</param>
        /// <returns>
        /// The first-level category followed by its descendants; empty when the fragment
        /// has no <c>a</c> or <c>a/h2</c> element.
        /// </returns>
        private List<TencentCategoryEntity> First(string html, string parentCode)
        {
            List<TencentCategoryEntity> categoryList = new List<TencentCategoryEntity>();
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            HtmlNode name = doc.DocumentNode.SelectSingleNode("//a/h2");
            HtmlNode codeNode = doc.DocumentNode.SelectSingleNode("//a");
            if (name == null || codeNode == null)
            {
                logger.Warn("一级类别节点缺少名称或链接,已跳过");
                return categoryList;
            }

            string href = ReadHref(codeNode);
            string code = ExtractCode(href, "mt=");
            categoryList.Add(this.CreateCategory(1, name.InnerText, href, code, parentCode));

            if (name.InnerText != AllCategoryName && !string.IsNullOrEmpty(href))
            {
                categoryList.AddRange(this.Second(BuildChildUrl(href), code));
            }
            return categoryList;
        }

        /// <summary>
        /// Collects all second-level categories listed on <paramref name="url"/> and, for each
        /// of them except the "all" entry, their third-level categories.
        /// </summary>
        /// <param name="url">Absolute URL of the first-level category page.</param>
        /// <param name="parentCode">Code of the first-level category.</param>
        /// <returns>The categories found; empty when the page yields none.</returns>
        private List<TencentCategoryEntity> Second(string url, string parentCode)
        {
            return this.CrawlChildren(url, parentCode, 2, "st=", this.Third);
        }

        /// <summary>
        /// Collects all third-level categories listed on <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Absolute URL of the second-level category page.</param>
        /// <param name="parentCode">Code of the second-level category.</param>
        /// <returns>The categories found; empty when the page yields none.</returns>
        private List<TencentCategoryEntity> Third(string url, string parentCode)
        {
            return this.CrawlChildren(url, parentCode, 3, "tt=", null);
        }

        /// <summary>
        /// Shared implementation of <see cref="Second"/> and <see cref="Third"/>: downloads a
        /// category page and turns each category entry into an entity.
        /// </summary>
        /// <param name="url">Absolute URL of the page to download.</param>
        /// <param name="parentCode">Code stored as the parent of every entity created here.</param>
        /// <param name="level">Category level stored in the entities.</param>
        /// <param name="codeKey">Query key (including '=') that precedes the category code in the link.</param>
        /// <param name="descend">
        /// Called with (child page URL, child code) to collect the next level; null when
        /// there is no deeper level.
        /// </param>
        private List<TencentCategoryEntity> CrawlChildren(
            string url,
            string parentCode,
            int level,
            string codeKey,
            Func<string, string, List<TencentCategoryEntity>> descend)
        {
            List<TencentCategoryEntity> categoryList = new List<TencentCategoryEntity>();
            HtmlNodeCollection nodeList = this.LoadCategoryNodes(url);
            if (nodeList == null)
            {
                // One unreadable page must not abort the whole crawl.
                logger.Warn($"抓取{url}未解析到{level}级类别节点");
                return categoryList;
            }

            foreach (HtmlNode node in nodeList)
            {
                // "//a" is an absolute XPath, so the fragment is parsed on its own to keep
                // the lookup inside this entry.
                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(node.InnerHtml);
                HtmlNode codeNode = htmlDocument.DocumentNode.SelectSingleNode("//a");
                if (codeNode == null)
                {
                    logger.Warn($"{level}级类别节点缺少链接,已跳过");
                    continue;
                }

                string href = ReadHref(codeNode);
                string code = ExtractCode(href, codeKey);
                // The entity is created before descending so ids keep their depth-first order.
                categoryList.Add(this.CreateCategory(level, codeNode.InnerText, href, code, parentCode));

                if (descend != null && codeNode.InnerText != AllCategoryName && !string.IsNullOrEmpty(href))
                {
                    categoryList.AddRange(descend(BuildChildUrl(href), code));
                }
            }
            return categoryList;
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and selects its category entries.
        /// </summary>
        /// <returns>The matching nodes, or null when the download failed or nothing matched.</returns>
        private HtmlNodeCollection LoadCategoryNodes(string url)
        {
            string html = HttpHelper.DownloadUrl(url);
            if (string.IsNullOrEmpty(html))
            {
                return null;
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);
            // SelectNodes yields null (not an empty collection) when nothing matches.
            return doc.DocumentNode.SelectNodes(CategoryNodePath);
        }

        /// <summary>Creates a category entity and assigns it the next sequential id.</summary>
        private TencentCategoryEntity CreateCategory(int level, string name, string href, string code, string parentCode)
        {
            return new TencentCategoryEntity()
            {
                Id = _Count++,
                State = 1,
                CategoryLevel = level,
                Code = code,
                ParentCode = parentCode,
                Name = name,
                Url = href
            };
        }

        /// <summary>
        /// Reads the link target of <paramref name="anchor"/> with every ';' replaced by '&amp;',
        /// the same normalisation for all three levels.
        /// </summary>
        /// <returns>The normalised href, or null when the attribute is missing.</returns>
        private static string ReadHref(HtmlNode anchor)
        {
            string href = anchor.Attributes["href"]?.Value;
            return href?.Replace(";", "&");
        }

        /// <summary>
        /// Returns the characters that follow <paramref name="key"/> in <paramref name="href"/>,
        /// at most <see cref="CodeLength"/> of them.
        /// </summary>
        /// <returns>The code, or an empty string when the key is absent.</returns>
        private static string ExtractCode(string href, string key)
        {
            if (string.IsNullOrEmpty(href))
            {
                return string.Empty;
            }

            // Ordinal: the key is a machine token, not linguistic text.
            int keyIndex = href.IndexOf(key, StringComparison.Ordinal);
            if (keyIndex < 0)
            {
                return string.Empty;
            }

            int start = keyIndex + key.Length;
            // Clamp so a value shorter than CodeLength does not throw.
            int length = Math.Min(CodeLength, href.Length - start);
            return href.Substring(start, length);
        }

        /// <summary>Builds the absolute URL of the page that lists the children of a category link.</summary>
        private static string BuildChildUrl(string href)
        {
            return $"{Constant.TencentClassUrl}{href}&tuin=7e4f8b7d";
        }
    }
/// <summary>
/// Minimal HTTP download helper built on <see cref="HttpWebRequest"/>.
/// </summary>
public class HttpHelper
{
    /// <summary>Status code that is reported with its own log message and a null result.</summary>
    private const int UnusedStatusCode = 306;

    private static Logger logger = new Logger(typeof(HttpHelper));

    /// <summary>
    /// Downloads the content of <paramref name="url"/> decoded as UTF-8
    /// (an earlier version of this helper used GB2312).
    /// </summary>
    /// <param name="url">Absolute HTTP(S) address to download.</param>
    /// <returns>Same as <see cref="DownloadHtml"/>.</returns>
    public static string DownloadUrl(string url)
    {
        return DownloadHtml(url, Encoding.UTF8);
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> with a GET request and decodes it.
    /// HttpWebRequest is used because it exposes more options than WebClient.
    /// See also http://tool.sufeinet.com/HttpHelper.aspx
    /// </summary>
    /// <param name="url">Absolute HTTP(S) address to download.</param>
    /// <param name="encode">Encoding of the response body; UTF-8 is used when null.</param>
    /// <returns>
    /// The response body on success. An empty string when the server answered with a
    /// status other than 200 or the request failed with a <see cref="WebException"/>
    /// other than status 306. Null for status 306, for a failure while reading the
    /// body, for a non-HTTP address, and for any other exception.
    /// No exception is propagated; every failure is logged.
    /// </returns>
    public static string DownloadHtml(string url, Encoding encode)
    {
        string html = string.Empty;
        try
        {
            // WebRequest.Create returns a non-HTTP request type for schemes such as file://,
            // in which case the cast yields null.
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                logger.Warn(string.Format("DownloadHtml不支持的地址{0}", url));
                return null;
            }

            request.Timeout = 30 * 1000; // 30 second timeout
            request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
            request.ContentType = "text/html; charset=utf-8";
            // Further headers (Host, Cookie, Accept, Accept-Encoding, Referer) can be set through
            // request.Host / request.Headers.Add when a site requires them.
            // The container lets cookies sent by the server be collected on the response.
            request.CookieContainer = new CookieContainer();

            using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode));
                }
                else
                {
                    try
                    {
                        // `using` releases the reader even when ReadToEnd throws.
                        using (StreamReader sr = new StreamReader(response.GetResponseStream(), encode ?? Encoding.UTF8))
                        {
                            html = sr.ReadToEnd();
                        }
                    }
                    catch (Exception ex)
                    {
                        // No string.Format here: the URL would be parsed as a format string
                        // and a '{' or '}' in it would throw.
                        logger.Error($"DownloadHtml抓取{url}失败", ex);
                        html = null;
                    }
                }
            }
        }
        catch (System.Net.WebException ex)
        {
            // The status code is checked in addition to the localized message text so the
            // 306 case is recognised regardless of the machine's UI language.
            HttpWebResponse errorResponse = ex.Response as HttpWebResponse;
            bool isUnused = (errorResponse != null && (int)errorResponse.StatusCode == UnusedStatusCode)
                || ex.Message.Equals("远程服务器返回错误: (306)。");
            if (isUnused)
            {
                logger.Error("远程服务器返回错误: (306)。", ex);
                html = null;
            }
            else
            {
                // Timeouts, DNS failures, 4xx/5xx and so on: the result stays an empty string
                // as before, but the failure is no longer silent.
                logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
            }
        }
        catch (Exception ex)
        {
            logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
            html = null;
        }
        return html;
    }
}

CourseSearch–Crawler

/// <summary>
    /// Crawls the courses of one Tencent Classroom category.
    /// XPath syntax reference: http://www.w3school.com.cn/xpath/index.asp
    /// </summary>
    /// <remarks>
    /// 1. HtmlAgilityPack is convenient for this kind of parsing.
    /// 2. The XPath expressions are site-specific: each site needs its own set, but for
    ///    the same site they rarely need to change.
    /// </remarks>
    public class CourseSearch : ISearch
    {
        /// <summary>XPath of the course entries (<c>li</c> elements) on a course-list page.</summary>
        private const string CourseItemPath = "/html/body/section[1]/div/div[@class='market-bd market-bd-6 course-list course-card-list-multi-wrap js-course-list']/ul/li";

        // One shared generator: creating a Random per item can repeat values when the
        // instances are seeded from the same clock tick.
        private static readonly Random priceRandom = new Random();

        private Logger logger = new Logger(typeof(CourseSearch));
        private WarnRepository warnRepository = new WarnRepository();
        private CourseRepository courseRepository = new CourseRepository();
        private TencentCategoryEntity category = null;

        // Number of courses printed by Show since the process started (shared by all instances).
        static int count = 0;

        /// <summary>
        /// Creates a crawler without a category; only the methods that do not need one
        /// (<see cref="Show"/>, <see cref="ShowPageData"/>, <see cref="GetAjaxRequest"/>) do work.
        /// </summary>
        public CourseSearch()
        {
        }

        /// <summary>Creates a crawler for the courses of <paramref name="_category"/>.</summary>
        /// <param name="_category">Category whose Url points at the course list to crawl.</param>
        public CourseSearch(TencentCategoryEntity _category)
        {
            category = _category;
        }

        /// <summary>
        /// Crawls every page of the category's course list and saves the courses.
        /// </summary>
        /// <remarks>
        /// Does nothing except log a warning when no category was supplied. A category
        /// without a Url is recorded through the warn repository. Exceptions are logged
        /// and recorded, not rethrown.
        /// </remarks>
        public void Crawler()
        {
            if (category == null)
            {
                // Without this guard the catch block below would itself dereference null.
                logger.Warn("Crawler未指定类别,已跳过抓取");
                return;
            }

            try
            {
                if (string.IsNullOrEmpty(category.Url))
                {
                    warnRepository.SaveWarn(category, string.Format("Url为空,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                    return;
                }

                // Paged crawl. To inspect a single page instead, call Show(category.Url).
                GetPageCourseData();
            }
            catch (Exception ex)
            {
                logger.Error("CrawlerMuti出现异常", ex);
                warnRepository.SaveWarn(category, string.Format("出现异常,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
            }
        }

        /// <summary>
        /// Downloads one course-list page and prints url, id, image url and name of every
        /// course on it to the console.
        /// </summary>
        /// <param name="url">Absolute URL of the course-list page.</param>
        public void Show(string url)
        {
            HtmlNodeCollection liNodes = this.LoadCourseNodes(url);
            if (liNodes == null)
            {
                return;
            }

            foreach (var node in liNodes)
            {
                Console.WriteLine();
                Console.WriteLine();
                Console.WriteLine("************************************************");

                CourseFields fields = ReadCourseFields(node);
                if (fields == null)
                {
                    logger.Warn($"抓取{url}的课程节点结构不完整,已跳过");
                    continue;
                }

                Console.WriteLine($"课程Url:{fields.Url}");
                Console.WriteLine($"课程Id:{fields.Id}");
                Console.WriteLine($"ImageUrl:{fields.ImageUrl}");
                Console.WriteLine(fields.Name);
                Console.WriteLine($"课程名称:{fields.Name}");
                // The course price is not scraped: per the original author it cannot be read
                // with this plain approach.

                count = count + 1;
            }
        }

        /// <summary>
        /// Prints the courses of every page of the list at <paramref name="url"/> and then
        /// the running total of printed courses.
        /// </summary>
        /// <param name="url">Absolute URL of the first course-list page; "&amp;page=N" is appended to it.</param>
        public void ShowPageData(string url)
        {
            HtmlDocument document = LoadDocument(url);
            // A page without pager buttons (or a failed download) is treated as a single page.
            int maxPage = document == null
                ? 1
                : ReadPageCount(document, "/html/body/section[1]/div/div[5]/a[@class='page-btn']");

            for (int page = 1; page <= maxPage; page++)
            {
                string pageUrl = $"{url}&page={page}";
                Show(pageUrl);
            }
            Console.WriteLine($"一共抓取数据{count}条");
        }

        #region Paged crawl
        /// <summary>
        /// Crawls the category page by page: determines the page count, collects the
        /// courses of each page and saves them in one call.
        /// </summary>
        private void GetPageCourseData()
        {
            // The category Url is stored relative; make it absolute exactly once so that a
            // repeated Crawler() call does not prepend the base address again.
            string baseUrl = $"{Constant.TencentClassUrl}";
            if (!category.Url.StartsWith(baseUrl, StringComparison.Ordinal))
            {
                category.Url = $"{baseUrl}{category.Url}";
            }

            HtmlDocument document = LoadDocument(category.Url);
            if (document == null)
            {
                warnRepository.SaveWarn(category, string.Format("页面下载失败,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                return;
            }

            int pageCount = ReadPageCount(document, "/html/body/section[1]/div/div[@class='sort-page']/a[@class='page-btn']");
            List<CourseEntity> courseList = new List<CourseEntity>();

            for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
            {
                Console.WriteLine($"******************************当前是第{pageIndex}页数据************************************");
                string pageIndexUrl = $"{category.Url}&page={pageIndex}";
                courseList.AddRange(GetPageIndeData(pageIndexUrl));
            }
            courseRepository.SaveList(courseList);
        }

        /// <summary>
        /// Collects the courses of one list page.
        /// </summary>
        /// <param name="url">Absolute URL of the page.</param>
        /// <returns>The parsed courses; empty when the page could not be read or lists none.</returns>
        private List<CourseEntity> GetPageIndeData(string url)
        {
            List<CourseEntity> courseEntities = new List<CourseEntity>();
            HtmlNodeCollection liNodes = this.LoadCourseNodes(url);
            if (liNodes == null)
            {
                return courseEntities;
            }

            foreach (var node in liNodes)
            {
                CourseEntity courseEntity = GetLiData(node);
                if (courseEntity != null)
                {
                    courseEntities.Add(courseEntity);
                }
            }
            return courseEntities;
        }

        /// <summary>
        /// Converts one course <c>li</c> node into an entity and echoes its fields to the console.
        /// </summary>
        /// <param name="node">The <c>li</c> node of a course.</param>
        /// <returns>The entity, or null when the node lacks the expected elements or a numeric id.</returns>
        private CourseEntity GetLiData(HtmlNode node)
        {
            CourseFields fields = ReadCourseFields(node);
            long courseId;
            if (fields == null || !long.TryParse(fields.Id, out courseId))
            {
                logger.Warn("课程节点结构不完整或课程Id无效,已跳过");
                return null;
            }

            Console.WriteLine($"课程Url:{fields.Url}");
            Console.WriteLine($"课程Id:{fields.Id}");
            Console.WriteLine($"ImageUrl:{fields.ImageUrl}");
            Console.WriteLine($"课程名称:{fields.Name}");

            CourseEntity courseEntity = new CourseEntity();
            courseEntity.Url = fields.Url;
            courseEntity.CourseId = courseId;
            courseEntity.ImageUrl = fields.ImageUrl;
            courseEntity.Title = fields.Name;
            // Placeholder: the real price is not scraped (see Show), so a random value in
            // [100, 10000) is stored instead.
            courseEntity.Price = NextPlaceholderPrice();
            return courseEntity;
        }
        #endregion

        #region Ajax request
        /// <summary>
        /// Requests the category information endpoint and unwraps the JSON reply down to
        /// its "result" / "catInfo" member.
        /// </summary>
        /// <remarks>
        /// Steps: 1. match the page with the request URL it issues; 2. fetch the data;
        /// 3. parse the JSON (here into Hashtables). A missing reply or missing member is
        /// logged as a warning. Malformed JSON still surfaces as a Json.NET exception.
        /// </remarks>
        public void GetAjaxRequest()
        {
            string url = "https://ke.qq.com/cgi-bin/get_cat_info?bkn=449651946&r=0.36532379182727115";
            string ajaxData = HttpHelper.DownloadHtml(url, Encoding.UTF8);
            if (string.IsNullOrEmpty(ajaxData))
            {
                logger.Warn($"GetAjaxRequest抓取{url}未返回数据");
                return;
            }

            Hashtable hashtable = JsonConvert.DeserializeObject<Hashtable>(ajaxData);
            object result = hashtable == null ? null : hashtable["result"];
            if (result == null)
            {
                logger.Warn("GetAjaxRequest返回数据缺少result");
                return;
            }

            // The earlier extra deserialisation of `result` into Dictionary<string, string> was
            // dropped: its value was never used, and it presumably fails once `result` holds
            // nested objects such as catInfo.
            Hashtable hashResult = JsonConvert.DeserializeObject<Hashtable>(result.ToString());
            object catInfoValue = hashResult == null ? null : hashResult["catInfo"];
            if (catInfoValue == null)
            {
                logger.Warn("GetAjaxRequest返回数据缺少catInfo");
                return;
            }

            // JSON text of the category table. Decoding the individual categories is not
            // implemented; the original draft looked up ids "1001".."1006" and printed the
            // "n" member of each as the category name.
            string catInfo = catInfoValue.ToString();
        }
        #endregion

        /// <summary>Raw text fields of one course entry.</summary>
        private sealed class CourseFields
        {
            public string Url;
            public string Id;
            public string ImageUrl;
            public string Name;
        }

        /// <summary>
        /// Extracts link, id, image and name from a course <c>li</c> node.
        /// </summary>
        /// <returns>The fields, or null when the anchor, image or title element is missing.</returns>
        private static CourseFields ReadCourseFields(HtmlNode node)
        {
            // The XPaths below are absolute, so the node is parsed as its own document to
            // keep the lookups inside this entry.
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(node.OuterHtml);

            HtmlNode anchor = document.DocumentNode.SelectSingleNode("//*/a[1]");
            HtmlNode image = document.DocumentNode.SelectSingleNode("//*/a[1]/img");
            HtmlNode title = document.DocumentNode.SelectSingleNode("//*/h4/a[1]");
            if (anchor == null || image == null || title == null)
            {
                return null;
            }

            return new CourseFields
            {
                Url = anchor.Attributes["href"]?.Value,
                Id = anchor.Attributes["data-id"]?.Value,
                ImageUrl = image.Attributes["src"]?.Value,
                Name = title.InnerText
            };
        }

        /// <summary>Downloads and parses a page; returns null when nothing was downloaded.</summary>
        private static HtmlDocument LoadDocument(string url)
        {
            string strHtml = HttpHelper.DownloadUrl(url);
            if (string.IsNullOrEmpty(strHtml))
            {
                return null;
            }

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(strHtml);
            return document;
        }

        /// <summary>
        /// Downloads a course-list page and selects its course entries.
        /// </summary>
        /// <returns>The course nodes, or null (after a warning) when there are none.</returns>
        private HtmlNodeCollection LoadCourseNodes(string url)
        {
            HtmlDocument document = LoadDocument(url);
            // SelectNodes yields null (not an empty collection) when nothing matches.
            HtmlNodeCollection liNodes = document == null ? null : document.DocumentNode.SelectNodes(CourseItemPath);
            if (liNodes == null)
            {
                logger.Warn($"抓取{url}未解析到课程节点");
            }
            return liNodes;
        }

        /// <summary>
        /// Returns the highest page number among the pager buttons matched by
        /// <paramref name="pagePath"/>, or 1 when there are no numeric buttons.
        /// </summary>
        private static int ReadPageCount(HtmlDocument document, string pagePath)
        {
            int pageCount = 1;
            HtmlNodeCollection pageNodes = document.DocumentNode.SelectNodes(pagePath);
            if (pageNodes == null)
            {
                return pageCount;
            }

            foreach (HtmlNode pageNode in pageNodes)
            {
                int pageNumber;
                // TryParse: a non-numeric button label must not abort the crawl.
                if (int.TryParse(pageNode.InnerText, out pageNumber) && pageNumber > pageCount)
                {
                    pageCount = pageNumber;
                }
            }
            return pageCount;
        }

        /// <summary>Returns a random placeholder price in [100, 10000).</summary>
        private static int NextPlaceholderPrice()
        {
            // Random is not thread-safe and the generator is shared by all instances.
            lock (priceRandom)
            {
                return priceRandom.Next(100, 10000);
            }
        }
    }

你可能感兴趣的:(C#,c#)