近期在项目中遇到了对外网新闻数据进行抓取的需求,经过研究和分析,最终决定用.Net Core创建一个服务结合组件HtmlAgilityPack进行实现(通过.net core定时抓取网页内容),具体方案如下
创建项目后,选择Worker Service (或 辅助角色服务),点击下一步输入项目名称,点击创建即可
通过HtmlAgilityPack抓取外网网页内容
var urlNewsList = "外网新闻url地址";
// Download the page as raw bytes so that the charset can be chosen explicitly below.
byte[] responseNewsList = await client.GetByteArrayAsync(urlNewsList);
// Decode the bytes as GB2312. The code-pages provider is registered first;
// without it .NET Core reports 'GB2312' as an unsupported encoding name.
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
string tempNewsList = Encoding.GetEncoding("GB2312").GetString(responseNewsList);
// Parse the decoded HTML text into a document.
HtmlDocument htmlNewsList = new HtmlDocument();
htmlNewsList.LoadHtml(tempNewsList);
// Use an XPath expression to pick the page node whose content is wanted.
var post_listnode = htmlNewsList.DocumentNode.SelectSingleNode("//div[@id='newsmain-ej']");
此类中包含:
a、通过HtmlAgilityPack抓取外网新闻内容
b、对页面内容编码,主要解决中文乱码问题,可参考 【.Net Core】.NET Core中使用编码GB2312报错‘GB2312‘ is not a supported encoding name解决方案
c、根据Xpath获取页面对应节点的内容可参考【Xpath】如何在谷歌浏览器中使用Xpath Helper插件获取网页节点内容
d、获取页面内容中的图片或附件,并把图片保存到本地
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace CNPCNewsAcquireService
{
    /// <summary>
    /// Scrapes a GB2312-encoded news site: reads the list page, follows every
    /// article link, captures the article HTML and hands the embedded images
    /// to <c>RemoteImageCatchUtil</c> for download.
    /// </summary>
    class AcquireNewsService1
    {
        static readonly HttpClient client = new HttpClient();

        // Resolved once: the code-pages provider must be registered before
        // "GB2312" can be looked up on .NET Core.
        private static readonly Encoding Gb2312 = CreateGb2312Encoding();

        /// <summary>
        /// Downloads the news list, then each linked article, filling in
        /// <see cref="Article.Title"/>, <see cref="Article.PageUrl"/> and
        /// <see cref="Article.Content"/>. Image URLs inside the content are
        /// rewritten to point at the local image folder.
        /// </summary>
        /// <returns>A task that completes when every article has been processed.</returns>
        /// <exception cref="HttpRequestException">A page could not be downloaded.</exception>
        public async Task GetNews()
        {
            var articles = new List<Article>();
            var urlNewsList = "外网新闻url地址";

            HtmlDocument htmlNewsList = await LoadDocumentAsync(urlNewsList);
            var post_listnode = htmlNewsList.DocumentNode.SelectSingleNode("//div[@id='newsmain-ej']");

            // Collect the title and detail-page address of every list entry.
            // The leading "." keeps the search inside the list container; a bare
            // "//" would match <li> elements anywhere in the document.
            var postitemsNodes = post_listnode?.SelectNodes(".//li[@class='ejli']");
            if (postitemsNodes != null)
            {
                foreach (var item in postitemsNodes)
                {
                    var titlenode = item.SelectSingleNode("./a");
                    var pageUrl = titlenode?.Attributes["href"]?.Value;
                    if (string.IsNullOrWhiteSpace(pageUrl))
                    {
                        // Entry without a usable link: nothing to follow.
                        continue;
                    }

                    articles.Add(new Article
                    {
                        Title = titlenode.InnerText,
                        PageUrl = pageUrl
                    });
                }
            }

            // Follow each detail page to obtain the article body and its images.
            foreach (var newsitem in articles)
            {
                HtmlDocument htm2 = await LoadDocumentAsync(newsitem.PageUrl);
                var post_itemnode2 = htm2.DocumentNode.SelectSingleNode("//div[@class='l-box']");
                var newscontentnode = post_itemnode2?.SelectSingleNode(".//div[@class='sj-main']");
                if (newscontentnode == null)
                {
                    // Page layout differs from the expected one; leave Content unset.
                    continue;
                }

                var newscontent = newscontentnode.OuterHtml;

                // Image location relative to the content node:
                // div[@class='as04']/table/tbody/tr[1]/td[@class='lan4']/div/center/img/@src
                var newsContentImagesTable = newscontentnode.SelectNodes("div[@class='as04']/table");
                if (newsContentImagesTable != null)
                {
                    var saveFolder = "image";
                    foreach (var imgitem in newsContentImagesTable)
                    {
                        var remoteImageUrl = imgitem
                            .SelectSingleNode("tbody/tr[1]/td[@class='lan4']/div/center/img")
                            ?.Attributes["src"]?.Value;
                        if (string.IsNullOrEmpty(remoteImageUrl))
                        {
                            // Table without an image in the expected cell.
                            continue;
                        }

                        var filename = remoteImageUrl.Split('/').Last();
                        if (filename.Length == 0)
                        {
                            // URL ends with '/': there is no file name to save under.
                            continue;
                        }

                        // Point the article HTML at the local copy of the image.
                        newscontent = newscontent.Replace(remoteImageUrl, saveFolder + "/" + filename);
                        // NOTE(review): the meaning of the last argument (2000000) is defined by
                        // RemoteImageCatchUtil, which is not part of this listing - confirm.
                        RemoteImageCatchUtil.Catch(remoteImageUrl, filename, saveFolder, 2000000);
                    }
                }

                newsitem.Content = newscontent;
            }

            // With title, content, etc. collected, call the downstream web API here,
            // e.g. to insert all articles into SQL Server.
        }

        /// <summary>Downloads <paramref name="url"/> as bytes, decodes them as GB2312 and parses the HTML.</summary>
        private static async Task<HtmlDocument> LoadDocumentAsync(string url)
        {
            byte[] raw = await client.GetByteArrayAsync(url);
            var document = new HtmlDocument();
            document.LoadHtml(Gb2312.GetString(raw));
            return document;
        }

        /// <summary>Registers the code-pages provider and returns the GB2312 encoding.</summary>
        private static Encoding CreateGb2312Encoding()
        {
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
            return Encoding.GetEncoding("GB2312");
        }
    }
}
using System;
using System.Collections.Generic;
using System.Text;
namespace CNPCNewsAcquireService
{
/// <summary>
/// Data holder for one scraped news article.
/// </summary>
public class Article
{
/// <summary>
/// Identifier of the article.
/// NOTE(review): not assigned by the scraping code in this listing - confirm who populates it.
/// </summary>
public string Id { get; set; }
/// <summary>
/// Title of the article.
/// </summary>
public string Title { get; set; }
/// <summary>
/// Publication date, kept as text.
/// </summary>
public string PublishDate { get; set; }
/// <summary>
/// Link to the article page.
/// </summary>
public string PageUrl { get; set; }
/// <summary>
/// Article detail (body content).
/// </summary>
public string Content { get; set; }
/// <summary>
/// Author of the article.
/// </summary>
public string Author { get; set; }
}
}
注意:需要调用UseWindowsService方法(需先通过NuGet安装包Microsoft.Extensions.Hosting.WindowsServices),用于将程序部署为Windows服务
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
namespace CNPCNewsAcquireService
{
    /// <summary>
    /// Entry point: builds the generic host, enables running as a Windows
    /// service and registers <see cref="Worker"/> as the hosted background service.
    /// </summary>
    public class Program
    {
        /// <summary>Builds the host and runs it until shutdown.</summary>
        /// <param name="args">Command-line arguments forwarded to the host builder.</param>
        public static void Main(string[] args)
        {
            CreateHostBuilder(args).Build().Run();
        }

        /// <summary>Creates the host builder with Windows-service support and the worker registration.</summary>
        /// <param name="args">Command-line arguments forwarded to the default builder.</param>
        /// <returns>The configured host builder.</returns>
        public static IHostBuilder CreateHostBuilder(string[] args) =>
            Host.CreateDefaultBuilder(args)
                .UseWindowsService()
                .ConfigureServices((hostContext, services) =>
                {
                    // AddHostedService requires the service type as a type argument.
                    services.AddHostedService<Worker>();
                });
    }
}
说明:该类中调用抓取新闻的方法,并且指定定时服务的时间间隔
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
namespace CNPCNewsAcquireService
{
    /// <summary>
    /// Background service that runs the news scraper, waits for a fixed
    /// interval and repeats until the host requests shutdown.
    /// </summary>
    public class Worker : BackgroundService
    {
        // Pause between two scraping runs: 30,000,000 ms (8 h 20 min).
        private static readonly TimeSpan PollInterval = TimeSpan.FromMilliseconds(30000000);

        // The generic ILogger<T> is what the default DI container can resolve.
        private readonly ILogger<Worker> _logger;

        /// <summary>Creates the worker.</summary>
        /// <param name="logger">Logger used to report failed scraping runs.</param>
        public Worker(ILogger<Worker> logger)
        {
            _logger = logger;
        }

        /// <summary>
        /// Runs <c>GetNews</c> and then sleeps for the poll interval, looping
        /// until <paramref name="stoppingToken"/> is cancelled.
        /// </summary>
        /// <param name="stoppingToken">Signalled by the host when the service is stopping.</param>
        protected override async Task ExecuteAsync(CancellationToken stoppingToken)
        {
            var acquireService = new AcquireNewsService1();

            while (!stoppingToken.IsCancellationRequested)
            {
                try
                {
                    await acquireService.GetNews();
                }
                catch (Exception ex)
                {
                    // A failed run (site down, changed layout, ...) must not end the
                    // loop for good; log it and try again after the normal interval.
                    _logger.LogError(ex, "News acquisition failed; next attempt in {Interval}", PollInterval);
                }

                await Task.Delay(PollInterval, stoppingToken);
            }
        }
    }
}
至此所有代码都介绍完了,下一步咱们开始来部署项目吧
通过工具-》命令行-》开发者PowerShell,打开PowerShell窗口(或是在项目文件夹中,按住Shift并点击右键打开PowerShell也可以)
注意:必须要在项目目录下发布,要不会出现拒绝访问的现象
在PowerShell窗口中执行命令dotnet publish -c Release -o C:\GetNewsService
说明:以管理员身份打开cmd或powershell,运行命令
sc.exe create GetNewsWorkService binPath= C:\GetNewsService\CNPCNewsAcquireService.exe
注意:binPath= C:\ ,等号后面必须要有空格
创建成功后,可以通过命令 sc.exe query GetNewsWorkService 查看一下服务的状态
通过命令sc.exe start GetNewsWorkService 启动Windows服务
启动后在windows 服务中可以看到该服务已经启动
停止服务:sc.exe stop GetNewsWorkService
删除服务:sc.exe delete GetNewsWorkService