// Usage example 1: crawl the Tencent Classroom category data.
// Crawler() is invoked through the ISearch interface on a new CategorySearch.
#region 抓取腾讯课堂类别数据
ISearch search = new CategorySearch();
search.Crawler();
#endregion
// Usage example 2: crawl the courses of a single category.
// NOTE(review): `category` is not declared in this excerpt; the CourseSearch
// constructor below takes a TencentCategoryEntity, so it is presumably one that
// was obtained elsewhere - confirm against the calling code.
#region 抓取课程
ISearch search1 = new CourseSearch(category);
search1.Crawler();
#endregion
// Usage example 3: fetch the data served by the Ajax endpoint.
// The parameterless constructor is used here, so no category is attached.
#region 获取Ajax数据
CourseSearch courseSearch = new CourseSearch();
courseSearch.GetAjaxRequest();
#endregion
// CategorySearch – Crawler
/// <summary>
/// Crawls the Tencent Classroom category tree (three levels) and saves the result
/// through <c>CategoryRepository</c>.
/// XPath syntax reference: http://www.w3school.com.cn/xpath/index.asp
/// </summary>
public class CategorySearch : ISearch
{
    private static Logger logger = new Logger(typeof(CategorySearch));

    /// <summary>XPath of the category entries (dd elements) on a course-list page.</summary>
    private const string CategoryNodePath = "//*[@id='auto-test-1']/div[1]/dl/dd";

    /// <summary>Query-string fragment appended to every URL that is crawled.</summary>
    private const string TrackingSuffix = "tuin=7e4f8b7d";

    /// <summary>Display name of the "all" entry, which must not be descended into.</summary>
    private const string AllCategoriesName = "全部";

    /// <summary>Maximum number of characters taken after a marker such as "mt=".</summary>
    private const int CodeLength = 4;

    // Next id to hand out. It lives on the instance, so a fresh CategorySearch
    // must be created for every crawl to restart the numbering at 1.
    private int _Count = 1;

    /// <summary>
    /// Downloads the course-list page, walks every first-level category (and, through
    /// them, the second and third levels) and saves all categories that were found.
    /// Failures are logged; the number of collected categories is always printed.
    /// </summary>
    public void Crawler()
    {
        List<TencentCategoryEntity> categoryList = new List<TencentCategoryEntity>();
        try
        {
            string url = $"{Constant.TencentClassUrl}/course/list/?{TrackingSuffix}";
            HtmlNodeCollection nodeList = this.LoadCategoryNodes(url);
            if (nodeList == null)
            {
                // Nothing to parse: LoadCategoryNodes already logged the reason.
                // The finally block still reports the (zero) count.
                return;
            }

            foreach (HtmlNode node in nodeList)
            {
                categoryList.AddRange(this.First(node.InnerHtml, null));
            }

            // The page data is complete at this point; persist it.
            CategoryRepository categoryRepository = new CategoryRepository();
            categoryRepository.Save(categoryList);
        }
        catch (Exception ex)
        {
            logger.Error("CrawlerMuti出现异常", ex);
        }
        finally
        {
            Console.WriteLine($"类型数据初始化完成,共抓取类别{categoryList.Count}个");
        }
    }

    /// <summary>
    /// Parses one first-level category entry and then crawls its second-level categories.
    /// </summary>
    /// <param name="html">Inner HTML of the dd element describing the category.</param>
    /// <param name="parentCode">Code of the parent category; null for the first level.</param>
    /// <returns>The category plus all its descendants; empty when the entry cannot be parsed.</returns>
    private List<TencentCategoryEntity> First(string html, string parentCode)
    {
        List<TencentCategoryEntity> categoryList = new List<TencentCategoryEntity>();
        HtmlDocument doc = ParseFragment(html);
        HtmlNode nameNode = doc.DocumentNode.SelectSingleNode("//a/h2");
        HtmlNode linkNode = doc.DocumentNode.SelectSingleNode("//a");
        if (nameNode == null || linkNode == null)
        {
            logger.Warn("Skipping a first-level category entry without an <a>/<h2> element");
            return categoryList;
        }

        string href = linkNode.Attributes["href"]?.Value;
        string code = string.Empty;
        // At this level the link is only normalised when it actually carries a code.
        if (href != null && href.IndexOf("mt=", StringComparison.Ordinal) != -1)
        {
            href = href.Replace(";", "&");
            code = ExtractCode(href, "mt=");
        }

        categoryList.Add(this.CreateCategory(1, code, parentCode, nameNode.InnerText, href));

        // Without a link there is no page to descend into.
        if (nameNode.InnerText != AllCategoriesName && !string.IsNullOrWhiteSpace(href))
        {
            categoryList.AddRange(this.Second($"{Constant.TencentClassUrl}{href}&{TrackingSuffix}", code));
        }

        return categoryList;
    }

    /// <summary>
    /// Crawls all second-level categories listed on the page of a first-level category.
    /// </summary>
    /// <param name="url">Absolute URL of the first-level category page.</param>
    /// <param name="parentCode">Code of the first-level category.</param>
    /// <returns>The second-level categories and their third-level descendants.</returns>
    private List<TencentCategoryEntity> Second(string url, string parentCode)
    {
        return this.ParseLevel(url, parentCode, 2, "st=", true);
    }

    /// <summary>
    /// Crawls all third-level categories listed on the page of a second-level category.
    /// </summary>
    /// <param name="url">Absolute URL of the second-level category page.</param>
    /// <param name="parentCode">Code of the second-level category.</param>
    /// <returns>The third-level categories.</returns>
    private List<TencentCategoryEntity> Third(string url, string parentCode)
    {
        return this.ParseLevel(url, parentCode, 3, "tt=", false);
    }

    /// <summary>
    /// Shared implementation of <see cref="Second"/> and <see cref="Third"/>: downloads a
    /// category page and turns each listed entry into a category entity.
    /// </summary>
    /// <param name="url">Absolute URL of the page to download.</param>
    /// <param name="parentCode">Code stored as parent of every category found.</param>
    /// <param name="level">Category level written to the entities (2 or 3).</param>
    /// <param name="marker">Query-string marker that precedes the code ("st=" or "tt=").</param>
    /// <param name="descend">When true, every entry except the "all" entry is crawled one level deeper.</param>
    private List<TencentCategoryEntity> ParseLevel(string url, string parentCode, int level, string marker, bool descend)
    {
        List<TencentCategoryEntity> categoryList = new List<TencentCategoryEntity>();
        HtmlNodeCollection nodeList = this.LoadCategoryNodes(url);
        if (nodeList == null)
        {
            return categoryList;
        }

        foreach (HtmlNode node in nodeList)
        {
            HtmlNode linkNode = ParseFragment(node.InnerHtml).DocumentNode.SelectSingleNode("//a");
            if (linkNode == null)
            {
                logger.Warn($"Skipping a level {level} category entry without an <a> element on {url}");
                continue;
            }

            string href = linkNode.Attributes["href"]?.Value;
            if (!string.IsNullOrWhiteSpace(href))
            {
                href = href.Replace(";", "&");
            }

            string code = href == null ? string.Empty : ExtractCode(href, marker);
            categoryList.Add(this.CreateCategory(level, code, parentCode, linkNode.InnerText, href));

            if (descend && linkNode.InnerText != AllCategoriesName && !string.IsNullOrWhiteSpace(href))
            {
                categoryList.AddRange(this.Third($"{Constant.TencentClassUrl}{href}&{TrackingSuffix}", code));
            }
        }

        return categoryList;
    }

    /// <summary>
    /// Downloads a page and selects its category entries.
    /// </summary>
    /// <returns>The matching nodes, or null (after logging a warning) when the download
    /// failed or the page contains no category entries.</returns>
    private HtmlNodeCollection LoadCategoryNodes(string url)
    {
        string html = HttpHelper.DownloadUrl(url);
        if (string.IsNullOrEmpty(html))
        {
            logger.Warn($"No HTML was downloaded from {url}; the page is skipped");
            return null;
        }

        HtmlNodeCollection nodeList = ParseFragment(html).DocumentNode.SelectNodes(CategoryNodePath);
        if (nodeList == null)
        {
            logger.Warn($"No category entries matched '{CategoryNodePath}' on {url}");
        }

        return nodeList;
    }

    /// <summary>Builds a category entity and assigns it the next sequential id.</summary>
    private TencentCategoryEntity CreateCategory(int level, string code, string parentCode, string name, string url)
    {
        return new TencentCategoryEntity()
        {
            Id = _Count++,
            State = 1,
            CategoryLevel = level,
            Code = code,
            ParentCode = parentCode,
            Name = name,
            Url = url
        };
    }

    /// <summary>Parses an HTML fragment into its own document so that "//" paths only see that fragment.</summary>
    private static HtmlDocument ParseFragment(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html ?? string.Empty);
        return doc;
    }

    /// <summary>
    /// Returns up to <see cref="CodeLength"/> characters that follow <paramref name="marker"/>
    /// in <paramref name="href"/>, or an empty string when the marker is absent.
    /// </summary>
    private static string ExtractCode(string href, string marker)
    {
        int start = href.IndexOf(marker, StringComparison.Ordinal);
        if (start < 0)
        {
            return string.Empty;
        }

        start += marker.Length;
        // Clamp so that a value shorter than CodeLength at the end of the link cannot throw.
        int length = Math.Min(CodeLength, href.Length - start);
        return href.Substring(start, length);
    }
}
/// <summary>
/// Small HTTP download helper built on <see cref="HttpWebRequest"/>.
/// </summary>
public class HttpHelper
{
    private static Logger logger = new Logger(typeof(HttpHelper));

    /// <summary>
    /// Downloads the content behind <paramref name="url"/> decoded as UTF-8
    /// (an earlier version used GB2312).
    /// </summary>
    /// <param name="url">Address to download.</param>
    /// <returns>See <see cref="DownloadHtml"/>.</returns>
    public static string DownloadUrl(string url)
    {
        return DownloadHtml(url, Encoding.UTF8);
    }

    /// <summary>
    /// Downloads the content behind <paramref name="url"/> with a GET request.
    /// HttpWebRequest offers more control than WebClient, which is why it is used here.
    /// </summary>
    /// <param name="url">Address to download.</param>
    /// <param name="encode">Encoding used to decode the response body.</param>
    /// <returns>
    /// The response body on success. An empty string when the server answered with a
    /// status other than 200 or when a <see cref="WebException"/> other than the
    /// "306" case occurred. Null when the URL is not an HTTP(S) address, when reading
    /// the body failed, on the "306" case, or on any other exception.
    /// Every failure is logged; no exception is propagated to the caller.
    /// </returns>
    public static string DownloadHtml(string url, Encoding encode)
    {
        string html = string.Empty;
        try
        {
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                // WebRequest.Create returns another request type for non-HTTP schemes.
                logger.Warn($"DownloadHtml: {url} is not an HTTP(S) address");
                return null;
            }

            request.Timeout = 30 * 1000; // 30 seconds
            request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
            request.ContentType = "text/html; charset=utf-8";
            // Site-specific headers (Host, Cookie, Accept, Referer) can be added here when
            // a target requires them. Never commit real cookie or session values.

            // A container makes the request accept and carry cookies set by the server.
            request.CookieContainer = new CookieContainer();

            using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode));
                }
                else
                {
                    try
                    {
                        // using guarantees the reader (and stream) is released even when
                        // ReadToEnd throws.
                        using (StreamReader sr = new StreamReader(response.GetResponseStream(), encode))
                        {
                            html = sr.ReadToEnd();
                        }
                    }
                    catch (Exception ex)
                    {
                        // Plain interpolation: wrapping it in string.Format would throw
                        // for URLs that contain '{' or '}'.
                        logger.Error($"DownloadHtml抓取{url}失败", ex);
                        html = null;
                    }
                }
            }
        }
        catch (System.Net.WebException ex)
        {
            if (ex.Message.Equals("远程服务器返回错误: (306)。"))
            {
                logger.Error("远程服务器返回错误: (306)。", ex);
                html = null;
            }
            else
            {
                // These failures used to be swallowed without a trace. They are logged
                // now; the returned value (empty string) is unchanged for existing callers.
                logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
            }
        }
        catch (Exception ex)
        {
            logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
            html = null;
        }

        return html;
    }
}
// CourseSearch – Crawler
/// <summary>
/// Crawls the courses of one Tencent Classroom category.
/// XPath syntax reference: http://www.w3school.com.cn/xpath/index.asp
/// </summary>
/// <remarks>
/// 1. HtmlAgilityPack does the HTML parsing.
/// 2. The XPath expressions are tailored to this site; every other site needs its own,
///    but for one site they rarely have to change.
/// </remarks>
public class CourseSearch : ISearch
{
    /// <summary>XPath of the course entries (li elements) on a course-list page.</summary>
    private const string CourseItemPath = "/html/body/section[1]/div/div[@class='market-bd market-bd-6 course-list course-card-list-multi-wrap js-course-list']/ul/li";

    // One shared generator instead of a new Random per course. Access is locked
    // because the generator is now shared between instances.
    private static readonly Random priceRandom = new Random();
    private static readonly object priceLock = new object();

    // Number of courses printed by Show; static, so it accumulates across instances.
    private static int count = 0;

    private Logger logger = new Logger(typeof(CourseSearch));
    private WarnRepository warnRepository = new WarnRepository();
    private CourseRepository courseRepository = new CourseRepository();
    private TencentCategoryEntity category = null;

    /// <summary>Creates a crawler without a category (sufficient for <see cref="GetAjaxRequest"/>, <see cref="Show"/> and <see cref="ShowPageData"/>).</summary>
    public CourseSearch()
    {
    }

    /// <summary>Creates a crawler for the courses of <paramref name="_category"/>.</summary>
    public CourseSearch(TencentCategoryEntity _category)
    {
        category = _category;
    }

    /// <summary>
    /// Crawls every page of the category and saves the courses. A category without URL
    /// and any failure during the crawl are recorded through the warn repository.
    /// Does nothing (apart from logging) when no category was supplied.
    /// </summary>
    public void Crawler()
    {
        if (category == null)
        {
            // Without this guard both the try and the catch block would dereference null.
            logger.Warn("CourseSearch.Crawler was called without a category; nothing to crawl");
            return;
        }

        try
        {
            if (string.IsNullOrEmpty(category.Url))
            {
                warnRepository.SaveWarn(category, string.Format("Url为空,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
                return;
            }

            GetPageCourseData();
        }
        catch (Exception ex)
        {
            logger.Error("CrawlerMuti出现异常", ex);
            warnRepository.SaveWarn(category, string.Format("出现异常,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
        }
    }

    /// <summary>
    /// Downloads one course-list page and prints URL, id, image URL and name of every
    /// course on it. Missing values are printed as empty text.
    /// </summary>
    /// <param name="url">Absolute URL of the course-list page.</param>
    public void Show(string url)
    {
        HtmlNodeCollection liNodes = LoadCourseNodes(url);
        if (liNodes == null)
        {
            return;
        }

        foreach (HtmlNode node in liNodes)
        {
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine("************************************************");

            // Each li is parsed as its own document so that "//" paths stay inside it.
            HtmlDocument lidocument = new HtmlDocument();
            lidocument.LoadHtml(node.OuterHtml);

            HtmlNode linkNode = lidocument.DocumentNode.SelectSingleNode("//*/a[1]");
            string aHref = AttributeValue(linkNode, "href");
            Console.WriteLine($"课程Url:{aHref}");
            string courseId = AttributeValue(linkNode, "data-id");
            Console.WriteLine($"课程Id:{courseId}");

            HtmlNode imgNode = lidocument.DocumentNode.SelectSingleNode("//*/a[1]/img");
            string imgUrl = AttributeValue(imgNode, "src");
            Console.WriteLine($"ImageUrl:{imgUrl}");

            HtmlNode nameNode = lidocument.DocumentNode.SelectSingleNode("//*/h4/a[1]");
            string name = nameNode?.InnerText;
            Console.WriteLine(name);
            Console.WriteLine($"课程名称:{name}");

            // The real price cannot be scraped the simple way (the site computes it with
            // its own algorithm), so it is not shown here.
            count = count + 1;
        }
    }

    /// <summary>
    /// Determines the number of pages of a course list and prints every page with
    /// <see cref="Show"/>, followed by the total number of courses printed so far.
    /// </summary>
    /// <param name="url">Absolute URL of the first page; "&amp;page=N" is appended per page.</param>
    public void ShowPageData(string url)
    {
        string strHtml = HttpHelper.DownloadUrl(url);
        if (string.IsNullOrEmpty(strHtml))
        {
            logger.Warn($"No HTML was downloaded from {url}; nothing to show");
            return;
        }

        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(strHtml);
        int maxPage = ReadPageCount(document, "/html/body/section[1]/div/div[5]/a[@class='page-btn']");
        for (int page = 1; page <= maxPage; page++)
        {
            string pageUrl = $"{url}&page={page}";
            Show(pageUrl);
        }

        Console.WriteLine($"一共抓取数据{count}条");
    }

    #region Paged crawl
    /// <summary>
    /// 1. determine the number of pages, 2. crawl every page, 3. extract the course
    /// data, 4. save all courses of the category in one call.
    /// </summary>
    /// <exception cref="InvalidOperationException">The first page could not be downloaded;
    /// <see cref="Crawler"/> turns this into a log entry and a saved warning.</exception>
    private void GetPageCourseData()
    {
        // Kept in a local: writing the absolute URL back into category.Url would
        // prefix the base URL a second time whenever the same category is crawled again.
        string categoryUrl = $"{Constant.TencentClassUrl}{category.Url}";
        string strHtml = HttpHelper.DownloadUrl(categoryUrl);
        if (string.IsNullOrEmpty(strHtml))
        {
            throw new InvalidOperationException($"No HTML was downloaded from {categoryUrl}");
        }

        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(strHtml);
        int pageCount = ReadPageCount(document, "/html/body/section[1]/div/div[@class='sort-page']/a[@class='page-btn']");

        List<CourseEntity> courseList = new List<CourseEntity>();
        for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
        {
            Console.WriteLine($"******************************当前是第{pageIndex}页数据************************************");
            string pageIndexUrl = $"{categoryUrl}&page={pageIndex}";
            courseList.AddRange(GetPageIndeData(pageIndexUrl));
        }

        courseRepository.SaveList(courseList);
    }

    /// <summary>
    /// Downloads one course-list page and converts each entry into a course entity.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The courses of the page; empty when the page could not be loaded or
    /// contains no entries. Entries that cannot be parsed are skipped.</returns>
    private List<CourseEntity> GetPageIndeData(string url)
    {
        List<CourseEntity> courseEntities = new List<CourseEntity>();
        HtmlNodeCollection liNodes = LoadCourseNodes(url);
        if (liNodes == null)
        {
            return courseEntities;
        }

        foreach (HtmlNode node in liNodes)
        {
            CourseEntity courseEntity = GetLiData(node);
            if (courseEntity != null)
            {
                courseEntities.Add(courseEntity);
            }
        }

        return courseEntities;
    }

    /// <summary>
    /// Extracts URL, id, image URL and title of one course from its li element.
    /// </summary>
    /// <param name="node">The li element of the course.</param>
    /// <returns>The course, or null when the entry has no numeric data-id.
    /// Image URL and title are left null when the page does not provide them.</returns>
    private CourseEntity GetLiData(HtmlNode node)
    {
        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(node.OuterHtml);

        HtmlNode linkNode = document.DocumentNode.SelectSingleNode("//*/a[1]");
        string aHref = AttributeValue(linkNode, "href");
        string idText = AttributeValue(linkNode, "data-id");
        long courseId;
        if (!long.TryParse(idText, out courseId))
        {
            logger.Warn($"Skipping a course entry without a numeric data-id (href={aHref}, data-id={idText})");
            return null;
        }

        CourseEntity courseEntity = new CourseEntity();
        courseEntity.Url = aHref;
        Console.WriteLine($"课程Url:{aHref}");
        Console.WriteLine($"课程Id:{idText}");
        courseEntity.CourseId = courseId;

        HtmlNode imgNode = document.DocumentNode.SelectSingleNode("//*/a[1]/img");
        string imgUrl = AttributeValue(imgNode, "src");
        courseEntity.ImageUrl = imgUrl;
        Console.WriteLine($"ImageUrl:{imgUrl}");

        HtmlNode nameNode = document.DocumentNode.SelectSingleNode("//*/h4/a[1]");
        string name = nameNode?.InnerText;
        courseEntity.Title = name;
        Console.WriteLine($"课程名称:{name}");

        // Placeholder: the real price cannot be scraped the simple way (the site
        // computes it with its own algorithm), so a random value is stored instead.
        lock (priceLock)
        {
            courseEntity.Price = priceRandom.Next(100, 10000);
        }

        return courseEntity;
    }

    /// <summary>
    /// Downloads a course-list page and selects its course entries.
    /// </summary>
    /// <returns>The matching nodes, or null (after logging a warning) when the download
    /// failed or no entry matched.</returns>
    private HtmlNodeCollection LoadCourseNodes(string url)
    {
        string strHtml = HttpHelper.DownloadUrl(url);
        if (string.IsNullOrEmpty(strHtml))
        {
            logger.Warn($"No HTML was downloaded from {url}; the page is skipped");
            return null;
        }

        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(strHtml);
        HtmlNodeCollection liNodes = document.DocumentNode.SelectNodes(CourseItemPath);
        if (liNodes == null)
        {
            logger.Warn($"No course entries were found on {url}");
        }

        return liNodes;
    }

    /// <summary>
    /// Reads the highest page number from the pager buttons selected by
    /// <paramref name="pagePath"/>. Buttons whose text is not a number are ignored.
    /// </summary>
    /// <returns>The highest page number, or 1 when there is no pager.</returns>
    private static int ReadPageCount(HtmlDocument document, string pagePath)
    {
        int pageCount = 1;
        HtmlNodeCollection pageNodes = document.DocumentNode.SelectNodes(pagePath);
        if (pageNodes == null)
        {
            return pageCount;
        }

        foreach (HtmlNode pageNode in pageNodes)
        {
            int pageNumber;
            if (int.TryParse(pageNode.InnerText, out pageNumber) && pageNumber > pageCount)
            {
                pageCount = pageNumber;
            }
        }

        return pageCount;
    }

    /// <summary>Returns the value of an attribute, or null when the node or the attribute is missing.</summary>
    private static string AttributeValue(HtmlNode node, string attributeName)
    {
        return node?.Attributes[attributeName]?.Value;
    }
    #endregion

    #region Ajax request data
    /// <summary>
    /// Fetches the category information served by the Ajax endpoint:
    /// 1. match the page with the URL it requests, 2. download the data,
    /// 3. parse the JSON into Hashtables.
    /// Returns early (with a warning) when the response or an expected key is missing.
    /// </summary>
    public void GetAjaxRequest()
    {
        string url = "https://ke.qq.com/cgi-bin/get_cat_info?bkn=449651946&r=0.36532379182727115";
        string ajaxData = HttpHelper.DownloadHtml(url, Encoding.UTF8);
        if (string.IsNullOrEmpty(ajaxData))
        {
            logger.Warn($"No data was returned by {url}");
            return;
        }

        // NOTE(review): malformed JSON makes DeserializeObject throw; that exception is
        // not handled here, as before.
        Hashtable hashtable = JsonConvert.DeserializeObject<Hashtable>(ajaxData);
        object result = hashtable?["result"];
        if (result == null)
        {
            logger.Warn($"The response of {url} has no \"result\" entry");
            return;
        }

        Hashtable hashResult = JsonConvert.DeserializeObject<Hashtable>(result.ToString());
        object catInfoValue = hashResult?["catInfo"];
        if (catInfoValue == null)
        {
            logger.Warn($"The response of {url} has no \"catInfo\" entry");
            return;
        }

        // NOTE(review): the category payload is extracted but not consumed yet. Earlier
        // experiments (removed here) deserialized it again and printed the "n" value of
        // the entries "1001" to "1006" - presumably the category names; confirm the
        // JSON shape before building on it.
        string catInfo = catInfoValue.ToString();
    }
    #endregion
}