最近我尝试用.netcore结合HtmlAgilityPack库来写爬虫,从网上抓取资料,效果十分理想,所以在此总结一下,原理十分简单,就是用httpClient抓取网页脚本,再通过xpath语法用HtmlAgilityPack库来解析,解析出数据之后,用Dapper框架来存储数据,简单方便使用,非常适合个人从网上获取收集资料。
dotnetCore 爬虫脚本
用了两种数据库,Mysql和SQLite,可以通过枚举来进行切换,sqlite不用部署,方便小规模存储转移数据,但不支持并发存储,适合慢速的存储数据,个人用正好,Mysql适合用来存储大规模数据,支持高并发。
/// <summary>
/// Supported backing database engines for the crawler's storage layer.
/// </summary>
public enum DataBaseType
{
    /// <summary>File-based store; zero deployment, but no concurrent writes.</summary>
    SQLite,
    /// <summary>Server-based store, suitable for larger/concurrent workloads.</summary>
    MySQL
}
// Active database backend for this run; switch to DataBaseType.MySQL for server-side storage.
static DataBaseType dbType = DataBaseType.SQLite;
/// <summary>
/// Creates an unopened database connection for the requested backend.
/// </summary>
/// <param name="dbType">Which database engine to connect to.</param>
/// <returns>An unopened <see cref="IDbConnection"/>; the caller is responsible for opening and disposing it.</returns>
public static IDbConnection GetDbConn(DataBaseType dbType)
{
    // Build only the connection that was asked for; the original code constructed
    // BOTH connections on every call and leaked the unused one.
    switch (dbType)
    {
        case DataBaseType.SQLite:
            string connStrSqlite = string.Format("Data Source={0}/example_db.sqlite", System.IO.Directory.GetCurrentDirectory());
            return new SQLiteConnection(connStrSqlite);
        case DataBaseType.MySQL:
            // NOTE(review): original string used port 3506 — MySQL's default port is 3306;
            // treated as a typo fix. Confirm against the actual server configuration.
            string connStrMysql = "server=127.0.0.1;port=3306;database=exampledb;userid=dahlin;password=123456";
            return new MySqlConnection(connStrMysql);
        default:
            throw new ArgumentOutOfRangeException(nameof(dbType), dbType, "Unsupported database type.");
    }
}
页面抓取用的HttpClient库,解析提取信息使用了HtmlAgilityPack,用nuget搜索安装即可。
// Shared client: creating a new HttpClient per request exhausts sockets under load,
// which is exactly what the original implementation did.
private static readonly HttpClient s_httpClient = CreateCrawlerClient();

/// <summary>
/// Builds the shared <see cref="HttpClient"/> with the crawler's default headers.
/// </summary>
private static HttpClient CreateCrawlerClient()
{
    var client = new HttpClient();
    // The original added a bogus "KeepAlive" header; the real mechanism is the
    // standard "Connection: close" header, exposed as ConnectionClose.
    client.DefaultRequestHeaders.ConnectionClose = true;
    // The original header name "UserAgent" is wrong — servers only recognize "User-Agent".
    // (The "Method" pseudo-header was dropped: GetStringAsync already issues a GET.)
    client.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
    return client;
}

/// <summary>
/// Downloads the HTML of the given URL as a string.
/// </summary>
/// <param name="requestUrl">Absolute URL of the page to fetch.</param>
/// <returns>The response body as a string; throws <see cref="HttpRequestException"/> on non-success status.</returns>
public static async Task<string> HtmlRequest(string requestUrl)
{
    return await s_httpClient.GetStringAsync(requestUrl);
}
/// <summary>
/// Parses a listing-page HTML document and extracts one <see cref="ExampleModel"/> per result item.
/// </summary>
/// <param name="htmlStr">Raw HTML of a listing page.</param>
/// <returns>Parsed models; an empty list when the page contains no matching items.</returns>
public static List<ExampleModel> GetExampleData(string htmlStr)
{
    #region fields extracted per item
    string rootUrl = @"https://www.haolizi.net";
    string name = string.Empty;
    string detailUrl = string.Empty;
    string category = string.Empty;
    string categoryUrl = string.Empty;
    int hotNum = -1;
    int downloadCount = -1;
    int needScore = 0;
    string devLanguage = string.Empty;
    string downloadSize = string.Empty;
    string pubdate = string.Empty;
    string pubPersion = string.Empty;
    string downloadUrl = string.Empty; // never populated from the page — TODO: fill from detail page or drop
    #endregion
    List<ExampleModel> examModels = new List<ExampleModel>();
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(htmlStr);
    var liNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class='content-box']/ul/li");
    // SelectNodes returns null (not an empty collection) when nothing matches;
    // the original foreach would throw NullReferenceException on such pages.
    if (liNodes == null)
    {
        return examModels;
    }
    foreach (HtmlNode node in liNodes)
    {
        List<string> tags = new List<string>();
        #region element extraction
        HtmlNode aNode = node.SelectSingleNode("./div[@class='baseinfo']/h3/a");
        name = aNode.InnerText;
        detailUrl = rootUrl + aNode.Attributes["href"].Value;
        HtmlNode categoryNode = node.SelectSingleNode("./div[@class='baseinfo']/a");
        category = categoryNode.InnerText;
        categoryUrl = rootUrl + categoryNode.Attributes["href"].Value;
        HtmlNode hotNumNode = node.SelectSingleNode("./div[@class='baseinfo']/div[@class='xj']/span[@class='rq']/em");
        // NOTE(review): assumes these InnerText values are bare integers — confirm against live markup.
        hotNum = Convert.ToInt32(hotNumNode.InnerText);
        HtmlNode downloadCountNode = node.SelectSingleNode("./div[@class='baseinfo']/div[@class='xj']/span[2]");
        downloadCount = Convert.ToInt32(downloadCountNode.InnerText);
        HtmlNode needScoreNode = node.SelectSingleNode("./div[@class='baseinfo']/div[@class='xj']/span[3]");
        needScore = Convert.ToInt32(needScoreNode.InnerText);
        HtmlNode devLanguageNode = node.SelectSingleNode("./div[@class='sinfo']/div/p[@class='fun']/span[1]");
        // The language text sits in the sibling text node after the label span; strip separators.
        devLanguage = devLanguageNode.NextSibling.InnerText.Replace(" ", "").Replace("|", "");
        HtmlNode downloadSizeNode = node.SelectSingleNode("./div[@class='sinfo']/div/p[@class='fun']/span[2]");
        downloadSize = downloadSizeNode.InnerText;
        HtmlNode pubdateNode = node.SelectSingleNode("./div[@class='sinfo']/div/p[@class='fun']/span[3]");
        pubdate = pubdateNode.InnerText;
        HtmlNode pubPersionNode = node.SelectSingleNode("./div[@class='sinfo']/div/p[@class='fun']/span[4]/a");
        pubPersion = pubPersionNode.InnerText;
        var tagNodes = node.SelectNodes("./div[@class='sinfo']/div/p[@class='fun']/span[contains(@class , 'zwch')]");
        if (tagNodes != null)
        {
            foreach (var tnode in tagNodes)
            {
                tags.Add(tnode.SelectSingleNode("./a").InnerText);
                Console.WriteLine(name + " tag:" + tnode.SelectSingleNode("./a").InnerText);
            }
        }
        #endregion
        ExampleModel examModel = new ExampleModel()
        {
            Name = name,
            DetailUrl = detailUrl,
            Category = category,
            CategoryUrl = categoryUrl,
            DevLanguage = devLanguage,
            DownloadCount = downloadCount,
            DownloadSize = downloadSize.Replace("大小:", "").Trim(),
            DownloadUrl = downloadUrl,
            HotNum = hotNum,
            NeedScore = needScore,
            Pubdate = Convert.ToDateTime(pubdate.Replace("发布时间:", "").Trim()),
            PubPersion = pubPersion,
            Tags = tags
        };
        examModels.Add(examModel);
    }
    return examModels;
}
数据解析提取完之后,就要存储数据库了,我这里使用了Dapper框架,采用了事务存储,因为数据存在一些复杂的关联。
/// <summary>
/// Inserts an example record (plus its tags and example-tag links) inside a single transaction.
/// Existing rows are detected by name/tagName and not duplicated.
/// </summary>
/// <param name="examModel">Parsed model to persist.</param>
/// <returns>The affected-row count of the last executed statement; -1 when nothing was inserted or an error occurred (rolled back).</returns>
public static int InsertDataBase(ExampleModel examModel)
{
    int result = -1;
    // using blocks guarantee the connection and transaction are disposed even when
    // Commit/Rollback throws — the original leaked both on such failures.
    using (IDbConnection dbConn = GetDbConn(dbType))
    {
        dbConn.Open();
        using (IDbTransaction transaction = dbConn.BeginTransaction())
        {
            try
            {
                var existData = dbConn.Query<ExampleModel>("SELECT * FROM ExampleModel WHERE name=@name", new { name = examModel.Name }, transaction).ToList();
                if (existData == null || existData.Count == 0)
                {
                    result = dbConn.Execute("INSERT INTO ExampleModel(" +
                        "name, detailUrl,category,categoryUrl,hotNum,downloadCount,needScore,devLanguage,downloadSize,pubdate,pubPersion, downloadUrl ) " +
                        "VALUES (@Name, @DetailUrl,@Category,@CategoryUrl,@HotNum,@DownloadCount,@NeedScore,@DevLanguage,@DownloadSize,@Pubdate,@PubPersion, @DownloadUrl);", examModel, transaction);
                    if (result > 0)
                    {
                        // Fix: the original reset result to -1 when the insert succeeded but the
                        // model simply had no tags, reporting success as failure.
                        if (examModel.Tags != null && examModel.Tags.Count > 0)
                        {
                            // The inserted example does not change per tag — query it once,
                            // not on every loop iteration as the original did.
                            var em = dbConn.Query<ExampleModel>("SELECT * FROM ExampleModel WHERE name=@name;", new { name = examModel.Name }, transaction).SingleOrDefault();
                            foreach (var tag in examModel.Tags)
                            {
                                var existTag = dbConn.Query<TagModel>("SELECT * FROM TagModel WHERE tagName=@tagName;", new { tagName = tag }, transaction).ToList();
                                if (existTag == null || existTag.Count == 0)
                                {
                                    result = dbConn.Execute("INSERT INTO TagModel (tagName) VALUES (@tagName);", new { tagName = tag }, transaction);
                                }
                                var tm = dbConn.Query<TagModel>("SELECT * FROM TagModel WHERE tagName=@tagName;", new { tagName = tag }, transaction).SingleOrDefault();
                                if (em != null && tm != null)
                                {
                                    var existExampleMapTag = dbConn.Query<ExampleMapTag>("SELECT * FROM ExampleMapTag WHERE exampleId=@exampleId AND tagId=@tagId;", new { exampleId = em.Id, tagId = tm.Id }, transaction).ToList();
                                    if (existExampleMapTag == null || existExampleMapTag.Count == 0)
                                    {
                                        result = dbConn.Execute("INSERT INTO ExampleMapTag (exampleId,tagId) VALUES (@ExampleId,@TagId);", new ExampleMapTag() { ExampleId = em.Id, TagId = tm.Id }, transaction);
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        result = -1;
                    }
                }
                transaction.Commit();
            }
            catch (Exception ex)
            {
                // Best-effort logging; the transaction is rolled back so no partial data lands.
                System.Diagnostics.Debug.WriteLine(ex.Message);
                transaction.Rollback();
                result = -1;
            }
        }
    }
    return result;
}
执行函数的功能就是在找出url的索引规律后,遍历拼接url获取html脚本,再一层层往下执行调用自定义的解析方法和存储方法就好了。
// Listing-page URL template; {0} is the 1-based page index.
static string urlRoot = "https://www.haolizi.net/examples/csharp_{0}.html";
/// <summary>
/// Crawls listing pages with indices [startIndex, endIndex): fetches each page,
/// parses its items, and stores them via <see cref="InsertDataBase"/>.
/// </summary>
/// <param name="endIndex">Exclusive upper bound of the page index.</param>
/// <param name="startIndex">Inclusive first page index; defaults to 1 (backward compatible).</param>
public static void RunStart(int endIndex, int startIndex = 1)
{
    for (int i = startIndex; i < endIndex; i++)
    {
        Console.WriteLine("the page index is " + i);
        string requestUrl = string.Format(urlRoot, i);
        Console.WriteLine(requestUrl);
        // GetAwaiter().GetResult() rethrows the original exception directly,
        // unlike .Result which wraps it in an AggregateException.
        string html = HtmlRequest(requestUrl).GetAwaiter().GetResult();
        // Throttle: one-second pause between pages to be polite to the target site.
        Thread.Sleep(1000);
        List<ExampleModel> exampleModels = GetExampleData(html);
        foreach (var em in exampleModels)
        {
            int result = InsertDataBase(em);
            Console.WriteLine(dbType.ToString() + " Insert the data->" + em.Name + " result->" + result);
        }
    }
}