.NET 开源的跨平台爬虫框架 DotnetSpider

项目详细介绍

 

DotnetSpider 是开源的 .NET 跨平台数据采集爬虫框架,使用时需要 Scheduler、Downloader、Processor、Pipeline 四个部分。下面是一个基本的使用示例:

  public static void Main()

    {

        HttpClientDownloader downloader = new HttpClientDownloader();

 

        Core.Spider spider = Core.Spider.Create(new MyPageProcessor(), new QueueDuplicateRemovedScheduler()).AddPipeline(new MyPipeline()).SetThreadNum(1);

        var site = new Site() { EncodingName = "UTF-8" };

        for (int i = 1; i < 5; ++i)

        {

            site.AddStartUrl("http://www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_1.html");

        }

        spider.Site = site;

        spider.Start();

    }

 

    private class MyPipeline : IPipeline

    {

        public void Process(ResultItems resultItems, ISpider spider)

        {

            foreach (YoukuVideo entry in resultItems.Results["VideoResult"])

            {

                Console.WriteLine($"{entry.Name}:{entry.Click}");

            }

 

            //May be you want to save to database

            // 

        }

 

        public void Dispose()

        {

        }

    }

 

    private class MyPageProcessor : IPageProcessor

    {

        public void Process(Page page)

        {

            var totalVideoElements = page.Selectable.SelectList(Selectors.XPath("//div[@class='yk-col3']")).Nodes();

            List results = new List();

            foreach (var videoElement in totalVideoElements)

            {

                var video = new YoukuVideo();

                video.Name = videoElement.Select(Selectors.XPath("/div[4]/div[1]/a")).Value;

                video.Click = int.Parse(videoElement.Select(Selectors.Css("p-num")).Value.ToString());

                results.Add(video);

            }

            page.AddResultItem("VideoResult", results);

        }

 

        public Site Site => new Site { SleepTime = 0 };

    }

 

    public class YoukuVideo

    {

        public string Name { getset; }

        public string Click { getset; }

    }

 

  

添加配置文件:将 app.conf 添加到你的项目中,内容如下:

redisServer:your redis server 

redisPassword:your redis password

 

添加爬虫上下文类:

/// <summary>
/// Spider context definition for a JD list-page crawl whose extracted items are mapped
/// onto the nested <see cref="Product"/> entity.
/// </summary>
public class JdSkuSpider : ISpiderContext
{
    /// <summary>
    /// Sets the log task id, assembles the <c>SpiderContext</c> (site, scheduler, start URLs,
    /// pipeline, downloader) and wraps it in a builder bound to <see cref="Product"/>.
    /// </summary>
    /// <returns>A builder created from the context and <c>typeof(Product)</c>.</returns>
    public SpiderContextBuilder GetBuilder()
    {
        Log.TaskId = "JD SKU Weekly";

        SpiderContext context = new SpiderContext
        {
            SpiderName = "JD SKU " + DateTimeUtils.MONDAY_RUN_ID,
            CachedSize = 1,
            ThreadNum = 8,
            Site = new Site
            {
                EncodingName = "UTF-8"
            },
            Scheduler = new RedisScheduler()
            {
                Host = "redis",
                Port = 6379,
                Password = ""
            },
            // Each start URL carries a bag of extra values. The "name" and "cat3" keys
            // match the Expression values of the Enviroment-typed properties on Product.
            StartUrls = new Dictionary<string, Dictionary<string, object>>
            {
                {
                    "http://list.jd.com/list.html?cat=9987,653,655&page=1&go=0&JL=6_0_0&ms=5",
                    new Dictionary<string, object> { { "name", "手机" }, { "cat3", "9987" } }
                },
            },
            Pipeline = new MysqlPipeline()
            {
                ConnectString = ""
            },
            Downloader = new HttpDownloader()
        };

        return new SpiderContextBuilder(context, typeof(Product));
    }

    /// <summary>
    /// Entity describing one item matched by the <c>j-sku-item</c> XPath; each property
    /// declares its storage column and, where present, its extraction expression.
    /// </summary>
    [Schema("jd", "sku_v2", Suffix = TableSuffix.Monday)]
    [TargetUrl(new[] { @"page=[0-9]+" }, "//*[@id=\"J_bottomPage\"]")]
    [TypeExtractBy(Expression = "//div[contains(@class,'j-sku-item')]", Multi = true)]
    [Indexes(Primary = "sku")]
    public class Product : ISpiderEntity
    {
        // First day of the current month, computed once in the static constructor.
        // It is not read anywhere inside this class.
        private static readonly DateTime runId;

        static Product()
        {
            DateTime dt = DateTime.Now;
            runId = new DateTime(dt.Year, dt.Month, 1);
        }

        [StoredAs("category", DataType.String, 20)]
        [PropertyExtractBy(Expression = "name", Type = ExtractType.Enviroment)]
        public string CategoryName { get; set; }

        [StoredAs("cat3", DataType.String, 20)]
        [PropertyExtractBy(Expression = "cat3", Type = ExtractType.Enviroment)]
        public int CategoryId { get; set; }

        [StoredAs("url", DataType.Text)]
        [PropertyExtractBy(Expression = "./div[1]/a/@href")]
        public string Url { get; set; }

        [StoredAs("sku", DataType.String, 25)]
        [PropertyExtractBy(Expression = "./@data-sku")]
        public string Sku { get; set; }

        [StoredAs("commentscount", DataType.String, 20)]
        [PropertyExtractBy(Expression = "./div[@class='p-commit']/strong/a")]
        public long CommentsCount { get; set; }

        [StoredAs("shopname", DataType.String, 100)]
        [PropertyExtractBy(Expression = "./div[@class='p-shop hide']/span[1]/a[1]")]
        public string ShopName { get; set; }

        [StoredAs("name", DataType.String, 50)]
        [PropertyExtractBy(Expression = "./div[@class='p-name']/a/em")]
        public string Name { get; set; }

        // No extraction expression is declared for this column.
        [StoredAs("shopid", DataType.String, 25)]
        public string ShopId { get; set; }

        [StoredAs("venderid", DataType.String, 25)]
        [PropertyExtractBy(Expression = "./@venderid")]
        public string VenderId { get; set; }

        [StoredAs("jdzy_shop_id", DataType.String, 25)]
        [PropertyExtractBy(Expression = "./@jdzy_shop_id")]
        public string JdzyShopId { get; set; }

        // Read-only: evaluates DateTime.Now on every access.
        [StoredAs("cdate", DataType.Time)]
        [PropertyExtractBy(Expression = "now", Type = ExtractType.Enviroment)]
        public DateTime CDate => DateTime.Now;
    }
}

你可能感兴趣的:(#,abp,#,net框架)