DotNetSpider

1. 架构:.NET Core 2.0 控制台应用程序 + DotNetSpider + NLog 日志

架构:
引用DotNetSpider2.Core 以及 DotNetSpider2.Extension
框架是:.NET Core 2.0 控制台应用程序


2.创建实体类 JuzimiListEntity 继承 SpiderEntity类
using DotnetSpider.Extension.Model;
using System;
using System.Collections.Generic;
using System.Text;

namespace Ex003
{
    /// <summary>
    /// Entity holding a single scraped sentence.
    /// </summary>
    public class JuzimiListEntity : SpiderEntity
    {
        /// <summary>Text of the scraped sentence.</summary>
        public string xlistju { get; set; }

        /// <summary>
        /// Returns the sentence text prefixed with the fixed "句子迷:" label.
        /// </summary>
        public override string ToString() => "句子迷:" + xlistju;
    }
}


3. 创建用于处理页面数据的类,该类继承 BasePageProcessor 类
using DotnetSpider.Core;
using DotnetSpider.Core.Processor;
using System;
using System.Collections.Generic;
using System.Text;

namespace Ex003
{
    /// <summary>
    /// Page processor that extracts sentence entries from a downloaded listing page.
    /// </summary>
    class JuzimiProcessor : BasePageProcessor
    {
        /// <summary>
        /// Collects the inner text of every <c>a.xlistju</c> anchor found beneath a
        /// <c>div.views-field-phpcode</c> node, strips line breaks and tabs, and stores
        /// the resulting list on the page under the "JuzimiList" key.
        /// </summary>
        /// <param name="page">The page whose content is being extracted.</param>
        protected override void Handle(Page page)
        {
            List<JuzimiListEntity> list = new List<JuzimiListEntity>();
            var modelHtmlList = page.Selectable.XPath(".//div[@class='views-field-phpcode']").Nodes();

            foreach (var modelHtml in modelHtmlList)
            {
                var rawText = modelHtml.XPath(".//a[@class='xlistju']").GetValue(DotnetSpider.Core.Selector.ValueOption.InnerText);
                if (rawText == null)
                {
                    // No anchor text available for this node; skip it instead of
                    // dereferencing null in the clean-up chain below.
                    continue;
                }

                // Remove carriage returns, line feeds and tabs so each sentence is a single line.
                var cleaned = rawText.Trim()
                    .Replace("\r", string.Empty)
                    .Replace("\n", string.Empty)
                    .Replace("\t", string.Empty);

                JuzimiListEntity entity = new JuzimiListEntity();
                entity.xlistju = cleaned;
                list.Add(entity);
            }

            page.AddResultItem("JuzimiList", list);
        }
    }
}

4.创建管道类 继承 BasePipeline类
using DotnetSpider.Core;
using DotnetSpider.Core.Pipeline;
using NLog;
using System;
using System.Collections.Generic;
using System.Text;

namespace Ex003
{
    /// <summary>
    /// Pipeline that prints every scraped sentence to the console and writes it to the NLog logger.
    /// </summary>
    class JuzimiPipe : BasePipeline
    {
        // Declared with `new`, as in the original, to hide an inherited member of the same name.
        private new static readonly Logger Logger = LogManager.GetCurrentClassLogger();

        /// <summary>
        /// For each result set, reads the list stored under the "JuzimiList" key,
        /// prints its element count, then prints and logs every entry.
        /// </summary>
        /// <param name="resultItems">Result sets produced by the page processor.</param>
        /// <param name="spider">The spider that produced the results (not used here).</param>
        public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
        {
            foreach (var resultItem in resultItems)
            {
                // Look the list up once and make sure the cast succeeded before using it.
                List<JuzimiListEntity> entities = resultItem.Results["JuzimiList"] as List<JuzimiListEntity>;
                if (entities == null)
                {
                    continue;
                }

                Console.WriteLine(entities.Count);
                foreach (var item in entities)
                {
                    Console.WriteLine(item);
                    Logger.Info("爬取的内容:" + item);
                }
            }
        }
    }
}


5.执行爬虫

using DotnetSpider.Core;
using DotnetSpider.Core.Scheduler;
using System;
using System.Collections.Generic;

namespace Ex003
{
    /// <summary>
    /// Console entry point that configures and runs the crawler.
    /// </summary>
    class Program
    {
        // Number of listing pages to request; the original note states 29 pages.
        private const int PageCount = 29;

        /// <summary>
        /// Crawls the juzimi.com recommended original-sentence listing pages.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Never populated in this program; kept so the AddStartRequests call below is unchanged.
            // NOTE(review): element type assumed to be DotnetSpider.Core.Request — confirm against the library version in use.
            List<Request> resList = new List<Request>();
            var site = new DotnetSpider.Core.Site() { EncodingName = "UTF-8" };

            // Queue pages 1..PageCount inclusive. The previous bound (i < 29) requested only 28 pages.
            for (int i = 1; i <= PageCount; i++)
            {
                site.AddStartUrl($"http://www.juzimi.com/original/recommend?page={i}");
            }

            var spider = Spider.Create(site, new QueueDuplicateRemovedScheduler(), new JuzimiProcessor())
                .AddStartRequests(resList.ToArray())
                .AddPipeline(new JuzimiPipe());
            spider.ThreadNum = 1;
            spider.Run();
        }
    }
}

6.源码自取


你可能感兴趣的:(DotNetspider)