用.Net Core写C#爬虫(HtmlAgilityPack)

.NetCore C#爬虫

最近我尝试用.netcore结合HtmlAgilityPack库来写爬虫,从网上抓取资料,效果十分理想,所以在此总结一下,原理十分简单,就是用httpClient抓取网页脚本,再通过xpath语法用HtmlAgilityPack库来解析,解析出数据之后,用Dapper框架来存储数据,简单方便使用,非常适合个人从网上获取收集资料。

1. 源代码GitHub链接

dotnetCore 爬虫脚本

2. 数据库配置代码

用了两种数据库,MySQL和SQLite,可以通过枚举来进行切换。SQLite不用部署,方便小规模存储和转移数据,但不支持并发写入,适合慢速存储数据,个人使用正好;MySQL适合用来存储大规模数据,支持高并发。


/// <summary>
/// Supported database back-ends (consumed by GetDbConn). SQLite is file-based
/// and needs no deployment but does not support concurrent writes; MySQL is
/// meant for larger, concurrent workloads.
/// </summary>
public enum DataBaseType
{
	SQLite,MySQL
}
// Active back-end used by the insert/run methods; switch here to change storage.
static DataBaseType dbType = DataBaseType.SQLite;

/// <summary>
/// Creates an unopened connection for the requested database back-end.
/// </summary>
/// <param name="dbType">Back-end to connect to.</param>
/// <returns>A new, unopened <see cref="IDbConnection"/>; the caller is responsible for opening and disposing it.</returns>
public static IDbConnection GetDbConn(DataBaseType dbType)
{
	// The original built a dictionary containing BOTH connection objects on every
	// call, leaking whichever one was not returned. A switch creates only the one
	// that is actually needed.
	// NOTE(review): credentials are hard-coded in source; move them to configuration.
	// NOTE(review): MySQL port 3506 looks like a typo for the default 3306 — confirm.
	switch (dbType)
	{
		case DataBaseType.SQLite:
			// SQLite database file lives in the process working directory.
			return new SQLiteConnection(string.Format("Data Source={0}/example_db.sqlite", System.IO.Directory.GetCurrentDirectory()));
		case DataBaseType.MySQL:
			return new MySqlConnection("server=127.0.0.1;port=3506;database=exampledb;userid=dahlin;password=123456");
		default:
			throw new NotSupportedException("Unknown DataBaseType: " + dbType);
	}
}

3. 页面抓取及解析

页面抓取用的HttpClient库,解析提取信息使用了HtmlAgilityPack,用nuget搜索安装即可。

/// <summary>
/// Downloads the HTML of the given URL as a string.
/// </summary>
/// <param name="requestUrl">Absolute URL of the page to fetch.</param>
/// <returns>The raw HTML body of the response.</returns>
public static async Task<string> HtmlRequest(string requestUrl)
{
	// NOTE(review): ideally a single static HttpClient should be reused across the
	// whole crawl (socket exhaustion); disposing per call at least stops the leak
	// the original had (client was never disposed).
	using (HttpClient httpClient = new HttpClient())
	{
		// The original added invalid header names ("Method", "KeepAlive", "UserAgent"),
		// which HttpClient sends as meaningless custom headers. GetStringAsync already
		// issues a GET, so no method header is needed at all.
		httpClient.DefaultRequestHeaders.ConnectionClose = true;   // sends "Connection: close" — do not keep the connection alive
		httpClient.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
		return await httpClient.GetStringAsync(requestUrl);
	}
}

/// <summary>
/// Parses one listing page of haolizi.net into <c>ExampleModel</c> items.
/// </summary>
/// <param name="htmlStr">Raw HTML of a listing page (as returned by HtmlRequest).</param>
/// <returns>One model per list item; an empty list when the expected markup is absent.</returns>
public  static List<ExampleModel> GetExampleData(string htmlStr)
{
	string rootUrl = @"https://www.haolizi.net";

	List<ExampleModel> examModels = new List<ExampleModel>();
	HtmlDocument htmlDoc = new HtmlDocument();
	htmlDoc.LoadHtml(htmlStr);

	// HtmlAgilityPack's SelectNodes returns null (not an empty collection) when
	// nothing matches; the original foreach would throw NullReferenceException on
	// any page-layout change or error page.
	var liNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class='content-box']/ul/li");
	if (liNodes == null)
	{
		return examModels;
	}

	foreach (HtmlNode node in liNodes)
	{
		// Per-item fields are loop-local now. The original declared them once at
		// method scope, so a missing node could silently leak the previous item's
		// value into the current model.
		List<string> tags = new List<string>();

		HtmlNode aNode = node.SelectSingleNode("./div[@class='baseinfo']/h3/a");
		string name = aNode.InnerText;
		string detailUrl = rootUrl + aNode.Attributes["href"].Value;

		HtmlNode categoryNode = node.SelectSingleNode("./div[@class='baseinfo']/a");
		string category = categoryNode.InnerText;
		string categoryUrl = rootUrl + categoryNode.Attributes["href"].Value;

		// int.TryParse instead of Convert.ToInt32: a malformed numeric cell now
		// degrades to the original sentinel defaults (-1 / -1 / 0) instead of throwing.
		HtmlNode hotNumNode = node.SelectSingleNode("./div[@class='baseinfo']/div[@class='xj']/span[@class='rq']/em");
		int hotNum;
		if (!int.TryParse(hotNumNode.InnerText, out hotNum)) hotNum = -1;

		HtmlNode downloadCountNode = node.SelectSingleNode("./div[@class='baseinfo']/div[@class='xj']/span[2]");
		int downloadCount;
		if (!int.TryParse(downloadCountNode.InnerText, out downloadCount)) downloadCount = -1;

		HtmlNode needScoreNode = node.SelectSingleNode("./div[@class='baseinfo']/div[@class='xj']/span[3]");
		int needScore;
		if (!int.TryParse(needScoreNode.InnerText, out needScore)) needScore = 0;

		HtmlNode devLanguageNode = node.SelectSingleNode("./div[@class='sinfo']/div/p[@class='fun']/span[1]");
		// The language text sits in the text node following the label span; strip
		// the surrounding spaces and the '|' separator.
		string devLanguage = devLanguageNode.NextSibling.InnerText.Replace(" ", "").Replace("|", "");

		HtmlNode downloadSizeNode = node.SelectSingleNode("./div[@class='sinfo']/div/p[@class='fun']/span[2]");
		string downloadSize = downloadSizeNode.InnerText;

		HtmlNode pubdateNode = node.SelectSingleNode("./div[@class='sinfo']/div/p[@class='fun']/span[3]");
		string pubdate = pubdateNode.InnerText;

		HtmlNode pubPersionNode = node.SelectSingleNode("./div[@class='sinfo']/div/p[@class='fun']/span[4]/a");
		string pubPersion = pubPersionNode.InnerText;

		var tagNodes = node.SelectNodes("./div[@class='sinfo']/div/p[@class='fun']/span[contains(@class , 'zwch')]");
		if (tagNodes != null)
		{
			foreach (var tnode in tagNodes)
			{
				// Hoisted: the original ran the same XPath query twice per tag.
				string tagText = tnode.SelectSingleNode("./a").InnerText;
				tags.Add(tagText);
				Console.WriteLine(name + " tag:" + tagText);
			}
		}

		examModels.Add(new ExampleModel()
		{
			Name = name,
			DetailUrl = detailUrl,
			Category = category,
			CategoryUrl = categoryUrl,
			DevLanguage = devLanguage,
			DownloadCount = downloadCount,
			// Strip the "大小:" label that is part of the cell text.
			DownloadSize = downloadSize.Replace("大小:","").Trim(),
			// DownloadUrl is never extracted from the listing page; kept empty —
			// presumably filled in later from the detail page. TODO confirm.
			DownloadUrl = string.Empty,
			HotNum = hotNum,
			NeedScore = needScore,
			Pubdate = Convert.ToDateTime(pubdate.Replace("发布时间:","").Trim()),
			PubPersion = pubPersion,
			Tags = tags
		});
	}
	return examModels;
}

4. 数据事务存储

数据解析提取完之后,就要存储数据库了,我这里使用了Dapper框架,采用了事务存储,因为数据存在一些复杂的关联。

/// <summary>
/// Persists one crawled example plus its tags and the example↔tag links inside a
/// single transaction. Examples with an already-stored name are skipped.
/// </summary>
/// <param name="examModel">The parsed model to persist.</param>
/// <returns>Rows affected by the last executed statement; -1 when nothing was inserted or an error occurred.</returns>
public static int InsertDataBase(ExampleModel examModel)
{
	int result = -1;
	// using blocks guarantee the connection and transaction are disposed even when
	// an exception escapes — the original skipped Close() if Rollback() threw.
	using (IDbConnection dbConn = GetDbConn(dbType))
	{
		dbConn.Open();
		using (IDbTransaction transaction = dbConn.BeginTransaction())
		{
			try
			{
				// Dedupe by name: only insert examples we have not stored yet.
				var existData = dbConn.Query<ExampleModel>("SELECT  * FROM ExampleModel WHERE name=@name", new { name = examModel.Name }, transaction).ToList();
				if (existData == null || existData.Count == 0)
				{
					result = dbConn.Execute("INSERT INTO ExampleModel(" +
						"name, detailUrl,category,categoryUrl,hotNum,downloadCount,needScore,devLanguage,downloadSize,pubdate,pubPersion, downloadUrl ) " +
						"VALUES (@Name, @DetailUrl,@Category,@CategoryUrl,@HotNum,@DownloadCount,@NeedScore,@DevLanguage,@DownloadSize,@Pubdate,@PubPersion, @DownloadUrl);", examModel, transaction);
					// BUG FIX: the original's else-branch reset result to -1 whenever
					// the model had no tags, even though the row insert above succeeded.
					// A tag-less example is a successful insert, not a failure.
					if (result > 0 && examModel.Tags != null && examModel.Tags.Count > 0)
					{
						foreach (var tag in examModel.Tags)
						{
							// Insert the tag only if it does not exist yet.
							var existTag = dbConn.Query<TagModel>("SELECT * FROM TagModel WHERE tagName=@tagName;", new { tagName = tag }, transaction).ToList();
							if (existTag == null || existTag.Count == 0)
							{
								result = dbConn.Execute("INSERT INTO TagModel (tagName) VALUES (@tagName);", new { tagName = tag }, transaction);
							}
							// Re-read both rows to get their generated ids for the link table.
							var em = dbConn.Query<ExampleModel>("SELECT  * FROM ExampleModel WHERE name=@name;", new { name = examModel.Name }, transaction).SingleOrDefault();
							var tm = dbConn.Query<TagModel>("SELECT * FROM TagModel WHERE tagName=@tagName;", new { tagName = tag }, transaction).SingleOrDefault();
							if (em != null && tm != null)
							{
								var existExampleMapTag = dbConn.Query<ExampleMapTag>("SELECT * FROM ExampleMapTag WHERE exampleId=@exampleId AND tagId=@tagId;", new { exampleId = em.Id, tagId = tm.Id }, transaction).ToList();
								if (existExampleMapTag == null || existExampleMapTag.Count == 0)
								{
									result = dbConn.Execute("INSERT INTO ExampleMapTag (exampleId,tagId) VALUES (@ExampleId,@TagId);", new ExampleMapTag() { ExampleId = em.Id, TagId = tm.Id }, transaction);
								}
							}
						}
					}
				}
				transaction.Commit();
			}
			catch (Exception ex)
			{
				System.Diagnostics.Debug.WriteLine(ex.Message);
				transaction.Rollback();
				result = -1;   // make the failure visible to the caller
			}
		}
	}
	return result;
}

5. 执行函数

执行函数的功能就是在找出url的索引规律后,遍历拼接url获取html脚本,再一层层往下执行调用自定义的解析方法和存储方法就好了。


// Page-index URL template for the C# examples listing; {0} is the 1-based page number.
static string urlRoot = "https://www.haolizi.net/examples/csharp_{0}.html";

/// <summary>
/// Crawls listing pages 1 .. endIndex-1: fetches each page, parses it, and stores
/// every extracted example.
/// </summary>
/// <param name="endIndex">Exclusive upper bound of the page index — pages 1..endIndex-1 are fetched.</param>
public static void RunStart(int endIndex)
{
	for (int i = 1; i < endIndex; i++)
	{
		Console.WriteLine("the page index is " + i);
		string requestUrl = string.Format(urlRoot, i);
		Console.WriteLine(requestUrl);
		try
		{
			// GetAwaiter().GetResult() instead of .Result: same blocking wait (the
			// public synchronous interface is preserved), but a failure surfaces as
			// the original exception rather than being wrapped in AggregateException.
			string html = HtmlRequest(requestUrl).GetAwaiter().GetResult();
			Thread.Sleep(1000);   // crude politeness delay between page fetches
			List<ExampleModel> exampleModels = GetExampleData(html);
			foreach (var em in exampleModels)
			{
				int result = InsertDataBase(em);
				Console.WriteLine(dbType.ToString() + " Insert the data->" + em.Name + " result->" + result);
			}
		}
		catch (Exception ex)
		{
			// Robustness: one failed page no longer aborts the whole crawl.
			Console.WriteLine("page " + i + " failed: " + ex.Message);
		}
	}
}

你可能感兴趣的:(.NET)