// MongoDB CRUD operations — excerpt of the collection (scraping) code

using System;

using System.Collections.Generic;

using System.ComponentModel.Design;

using System.Linq;

using System.Text;

using System.Text.RegularExpressions;

using System.Threading.Tasks;

using CDPWIB.DAL;

using CDPWIB.Data;

using CommonUtility;

using HtmlAgilityPack;

using MongoDB.Driver;

using MongoDB.Driver.Builders;

using MongoDB.Driver.Linq;

using Newtonsoft.Json;

using Newtonsoft.Json.Linq;

using WebKit;



namespace CDPWIB.WebCollection

{

    internal class QiDianCol : INovalCollect

    {

        // Numeric id of this scraper's source: NovalSource.QiDian converted to an int.
        // It is written into every document built by this class and used to filter queries.
        private int Source = Convert.ToInt32(NovalSource.QiDian);



        // NovalTempBase collection handle obtained from MongoConnectionFactory under the "Noval" name.
        // NOTE(review): no method visible in this class reads this field — confirm it is still needed.
        private readonly MongoCollection<NovalTempBase> Novalcol =

            MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof (NovalTempBase));





        /// <summary>
        /// Downloads the QiDian category and sub-category scripts, parses the arrays embedded in
        /// them, and replaces the stored <c>NovalTypeTemp</c> and <c>NovalSubType</c> documents of
        /// this source with the freshly parsed ones.
        /// </summary>
        /// <remarks>
        /// Both arrays are parsed and validated before anything is removed from the database, so a
        /// script that no longer has the expected shape cannot leave the categories replaced while
        /// the sub-categories are stale.
        /// </remarks>
        /// <exception cref="InvalidOperationException">
        /// One of the downloaded scripts does not contain the expected array; nothing was written.
        /// </exception>
        public void GetNovalTypeTemp()
        {
            const RegexOptions scanOptions =
                RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline;

            var typecol = MongoConnectionFactory.GetMongoCollction<NovalTypeTemp>("Noval", typeof (NovalTypeTemp));
            var subcol = MongoConnectionFactory.GetMongoCollction<NovalSubType>("Noval", typeof (NovalSubType));

            // Top-level categories: http://www.qidian.com/Javascript/qidian.bookstore.js?t=20130917
            // Slashes are stripped from this script only, before the common clean-up.
            string typeshtml = StripScriptNoise(
                NetHelper.HttpGet("http://www.qidian.com/Javascript/qidian.bookstore.js?t=20130917")
                    .Replace("/", ""));

            // NOTE(review): this URL literal ends with a space — confirm that is intentional.
            string subtypes = StripScriptNoise(
                NetHelper.HttpGet("http://script.cmfu.com/script/BookStore.js "));

            // The lazy capture stops at the first "]]", which is consumed by the match and
            // therefore appended again before parsing.
            JArray typearr = ParseCapturedArray(
                Regex.Match(typeshtml, "CategoryArr:(.*?)]]", scanOptions), "]]", "CategoryArr");

            JArray subarr = ParseCapturedArray(
                Regex.Match(subtypes, "SubCategoryArr=(.*?);", scanOptions), "", "SubCategoryArr");

            // CategoryArr: [["全部", "-1"], ["玄幻", "21"], ["奇幻", "1"], ["武侠", "2"], ...
            // Each entry is [name, web number]; "-1" is the "全部" (all) pseudo-category.
            var lstypes = new List<NovalTypeTemp>(10);
            foreach (var entry in typearr)
            {
                string webNum = entry[1].ToString();
                if (webNum == "-1")
                {
                    continue;
                }

                lstypes.Add(new NovalTypeTemp()
                {
                    WebNum = webNum.ToInt(),
                    Name = entry[0].ToString(),
                    Source = Source
                });
            }

            // Each sub-category entry is [parent web number, web number, name]; entries are
            // collected parent by parent, in the order of the category list.
            var subtypels = new List<NovalSubType>(300);
            foreach (var parentType in lstypes)
            {
                string parentWebNum = parentType.WebNum.ToString();
                foreach (var entry in subarr)
                {
                    if (entry[0].ToString() != parentWebNum)
                    {
                        continue;
                    }

                    subtypels.Add(new NovalSubType()
                    {
                        Name = entry[2].ToString(),
                        ParentWebNum = parentType.WebNum,
                        WebNum = entry[1].ToString().ToInt(),
                        Source = Source
                    });
                }
            }

            // Everything parsed: now replace the stored documents of this source.
            typecol.Remove(Query<NovalTypeTemp>.EQ(p => p.Source, Source));
            typecol.InsertBatch(lstypes);

            subcol.Remove(Query<NovalSubType>.EQ(p => p.Source, Source));
            subcol.InsertBatch(subtypels);
        }

        // Removes the "&nbsp" fragments, line breaks, tabs, pipes and spaces from a downloaded script.
        private static string StripScriptNoise(string script)
        {
            return script
                .Replace("&nbsp", "")
                .Replace("\r", "")
                .Replace("\n", "")
                .Replace("\t", "")
                .Replace("|", "")
                .Replace(" ", "");
        }

        // Parses the first capture group of a script-array match (plus the suffix the pattern
        // consumed) into a JArray; throws when the array is missing or is not a JSON array.
        private static JArray ParseCapturedArray(Match match, string closingSuffix, string arrayName)
        {
            if (!match.Success)
            {
                throw new InvalidOperationException(arrayName + " was not found in the downloaded script.");
            }

            var parsed = JsonConvert.DeserializeObject(match.Groups[1].Value + closingSuffix) as JArray;
            if (parsed == null)
            {
                throw new InvalidOperationException(arrayName + " in the downloaded script is not a JSON array.");
            }

            return parsed;
        }



        /// <summary>
        /// Collects novels from the monthly click ranking (pages 1 to 10) and hands the
        /// de-duplicated list to <c>PublicMethod.InsertAndUpdateNovalTmp</c>.
        /// </summary>
        /// <remarks>
        /// A page without the ranking table, a page with fewer rows than expected, or a row whose
        /// cells or book number cannot be read is skipped instead of aborting the whole run.
        /// </remarks>
        public void GetNovals()
        {
            const int firstPage = 1;
            const int lastPage = 10;
            // XPath positions are 1-based; the book rows of a page are tr[2] .. tr[51] (50 per page).
            const int firstRow = 2;
            const int lastRow = 51;

            var htmldocc = new HtmlDocument();
            var qdls = new List<NovalTempBase>(500);
            // Book numbers already collected; the same book may appear on more than one page.
            var seenBookNums = new HashSet<int>();

            for (int page = firstPage; page <= lastPage; page++)
            {
                string sourcehtml =
                    NetHelper.HttpGet("http://top.qidian.com/Book/TopDetail.aspx?TopType&Time=2&PageIndex=" + page);
                htmldocc.LoadHtml(sourcehtml);

                var doc = htmldocc.GetElementbyId("textlist");
                if (doc == null)
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "GetNovals: ranking page {0} has no 'textlist' element; page skipped.", page);
                    continue;
                }

                for (int row = firstRow; row <= lastRow; row++)
                {
                    var trdoc = doc.SelectSingleNode("tr[" + row + "]");
                    if (trdoc == null)
                    {
                        // Fewer rows than expected: nothing more to read on this page.
                        break;
                    }

                    var tdtype = trdoc.SelectSingleNode("td[2]/a");
                    var tdbook = trdoc.SelectSingleNode("td[3]/a[1]");
                    var tdclick = trdoc.SelectSingleNode("td[4]");
                    var tdauth = trdoc.SelectSingleNode("td[5]/a");
                    if (tdtype == null || tdbook == null || tdclick == null || tdauth == null)
                    {
                        continue;
                    }

                    Match bookmatch = Regex.Match(tdbook.OuterHtml, "Book/(\\d*?).aspx");
                    if (bookmatch.Groups[1].Value.Length == 0)
                    {
                        // No book number in the link: the row cannot be keyed, so it is dropped.
                        continue;
                    }

                    int booknum = bookmatch.Groups[1].Value.ToInt();
                    if (!seenBookNums.Add(booknum))
                    {
                        continue;
                    }

                    Match typematch = Regex.Match(tdtype.OuterHtml, "ChannelId=(\\d*?)&SubCategoryId=(\\d*?)'");
                    Match authmatch = Regex.Match(tdauth.OuterHtml, "id=(\\d*?)\"");

                    qdls.Add(new NovalTempBase()
                    {
                        AuthName = tdauth.InnerText.Trim(),
                        AuthId = authmatch.Groups[1].Value.ToInt(),
                        SubType = typematch.Groups[2].Value.ToInt(),
                        // Cover image, e.g. http://image.cmfu.com/books/3127618/3127618.jpg
                        TitleImg = "http://image.cmfu.com/books/" + booknum + "/" + booknum + ".jpg",
                        Title = tdbook.InnerText.Trim(),
                        TotalClick = tdclick.InnerText.ToInt(),
                        TotalComment = 0,
                        Type = typematch.Groups[1].Value.ToInt(),
                        SourceWebNum = booknum,
                        Source = Source
                    });
                }
            }

            PublicMethod.InsertAndUpdateNovalTmp(qdls, Source);
        }



        //public void GetNovalsByType()

        //{

        //}

        /// <summary>
        /// Fetches the chapters of every stored novel of this source. Some sources also
        /// organise chapters into volumes.
        /// </summary>
        public void GetNovalChapers()
        {
            // Reference URL left by the original author (a qq.com chapter-paging endpoint):
            // http://sight.qq.com/book/chapterpage?uin=0&g_tk=5381&callback=_Callback&pagesize=100&pageno=2&bid=16043&_r=0.6934567329008132
            MongoCollection<NovalTempBase> storedNovels =
                MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof (NovalTempBase));

            // Materialise the novels of this source first, then process them one by one.
            List<NovalTempBase> ownNovels = storedNovels.AsQueryable().Where(p => p.Source == Source).ToList();

            ownNovels.ForEach(novel => GetSingleNovalChapers(novel.SourceWebNum));
        }



        /// <summary>
        /// Downloads the table-of-contents page of one novel, replaces the stored volumes of that
        /// novel and passes its chapters to <c>PublicMethod.InsertChapterTempToSQL</c>.
        /// </summary>
        /// <param name="novalwebnum">Book number used in the reader URL of the novel.</param>
        /// <remarks>
        /// Best effort: a failure is written to the trace output and the method returns without
        /// throwing, so a caller iterating over many novels keeps going.
        /// </remarks>
        public void GetSingleNovalChapers(int novalwebnum)
        {
            IMongoQuery q2 = Query<NovalVolumeTemp>.EQ(p => p.Source, Source);
            IMongoQuery q1 = Query<NovalVolumeTemp>.EQ(p => p.NovalWebNum, novalwebnum);
            IMongoQuery[] qarray = { q1, q2 };
            IMongoQuery query = Query.And(qarray);

            var volumecol = MongoConnectionFactory.GetMongoCollction<NovalVolumeTemp>("Noval", typeof (NovalVolumeTemp));
            var lschapters = new List<NovalChapterTemp>(1000);
            var lsvolumes = new List<NovalVolumeTemp>(10);
            int chapterorder = 1;
            int volumeorder = 1;

            // e.g. http://read.qidian.com/BookReader/3127618.aspx
            string url = "http://read.qidian.com/BookReader/" + novalwebnum + ".aspx";
            try
            {
                var htmldocc = new HtmlDocument();
                htmldocc.LoadHtml(NetHelper.HttpGet(url));

                var doc = htmldocc.GetElementbyId("content");
                if (doc == null)
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "GetSingleNovalChapers: no 'content' element for book {0} ({1}).", novalwebnum, url);
                    return;
                }

                // The child divs are read in pairs: div[i] is a volume header and div[i + 1]
                // lists the chapters that belong to it.
                for (int i = 1; ; i += 2)
                {
                    var topdoc = doc.SelectSingleNode("div[" + i + "]");
                    if (topdoc == null)
                    {
                        break;
                    }

                    // VIP volumes have no header link; their chapters get volume id 0.
                    int topnum = 0;
                    var topa = topdoc.SelectSingleNode("div/a");
                    if (topa != null)
                    {
                        // href="http://www.qidian.com/BookReader/vol,107580,486625.aspx"
                        Match m = Regex.Match(topa.OuterHtml, ",(\\d*?).aspx");
                        topnum = m.Groups[1].Value.ToInt();

                        lsvolumes.Add(new NovalVolumeTemp()
                        {
                            Sort = volumeorder,
                            WebNum = topnum,
                            Name = ExtractVolumeName(topdoc.SelectSingleNode("div/b")),
                            NovalWebNum = novalwebnum,
                            Source = Source
                        });
                        volumeorder++;
                    }

                    // <a itemprop='url' href="http://read.qidian.com/BookReader/107580,20901221.aspx" ...>
                    var contextdoc = doc.SelectSingleNode("div[" + (i + 1) + "]");
                    var chaperas = contextdoc == null ? null : contextdoc.SelectNodes("div/ul/li/a");
                    if (chaperas == null)
                    {
                        // SelectNodes yields null when nothing matches: a volume without chapter
                        // links must not discard the rest of the book.
                        continue;
                    }

                    foreach (var chapera in chaperas)
                    {
                        Match chapmatchwebnum = Regex.Match(chapera.OuterHtml, ",(\\d*?).aspx");
                        lschapters.Add(new NovalChapterTemp()
                        {
                            Name = chapera.InnerText.Trim(),
                            Sort = chapterorder,
                            WebNum = chapmatchwebnum.Groups[1].Value.ToInt(),
                            VolumeId = topnum,
                            NovalWebNum = novalwebnum,
                            Source = Source
                        });
                        chapterorder++;
                    }
                }

                volumecol.Remove(query);
                // NOTE(review): an empty batch is skipped because some driver versions are
                // believed to reject it — confirm against the driver version in use.
                if (lsvolumes.Count > 0)
                {
                    volumecol.InsertBatch(lsvolumes);
                }

                PublicMethod.InsertChapterTempToSQL(lschapters, Source, novalwebnum);
            }
            catch (Exception ex)
            {
                // Still best effort, but no longer silent.
                System.Diagnostics.Trace.TraceError(
                    "GetSingleNovalChapers failed for book {0} ({1}): {2}", novalwebnum, url, ex);
            }
        }

        // Reads the volume title: the text after the first ';' once "&nbsp" has been removed,
        // or the whole cleaned text when there is no ';'. A missing node gives an empty name.
        private static string ExtractVolumeName(HtmlNode titleNode)
        {
            if (titleNode == null)
            {
                return string.Empty;
            }

            string[] parts = titleNode.InnerText.Trim().Replace("&nbsp", "").Split(';');
            return parts.Length > 1 ? parts[1] : parts[0];
        }





        /// <summary>
        /// Refreshes <c>TotalClick</c> of every stored novel of this source from the novel's
        /// detail page and saves the novel back to the collection.
        /// </summary>
        /// <remarks>
        /// A novel whose page lacks the click cell, or whose click text contains no number, is
        /// skipped with a trace warning; the remaining novels are still processed.
        /// Comment counts are not collected. The original author noted
        /// http://forum.qidian.com/NewForum/List.aspx?BookId=3106580 and
        /// http://c.pingba.qidian.com/BookComment.aspx?BookId=3106580 as candidate pages.
        /// </remarks>
        public void GetNovalCilckComment()
        {
            var novalcol = MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof(NovalTempBase));
            var books = novalcol.AsQueryable().Where(p => p.Source == Source).ToList();
            var htmldocc = new HtmlDocument();

            foreach (var novalTempBase in books)
            {
                // e.g. http://www.qidian.com/Book/3106580.aspx
                string url = "http://www.qidian.com/Book/" + novalTempBase.SourceWebNum + ".aspx";
                htmldocc.LoadHtml(NetHelper.HttpGet(url));

                var cliclickdiv = htmldocc.GetElementbyId("contentdiv");
                var clickcell = cliclickdiv == null
                    ? null
                    : cliclickdiv.SelectSingleNode("div/div[1]/table/tr/td[1]");
                if (clickcell == null)
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "GetNovalCilckComment: click cell not found for {0}; novel skipped.", url);
                    continue;
                }

                // Drop the "总点击" (total clicks) label, then take the first run of digits.
                // The previous Replace("", "") call always threw, because String.Replace does
                // not accept an empty search string.
                string clicktext = clickcell.InnerText.Replace("总点击", "");
                Match digits = Regex.Match(clicktext, "[0-9][0-9,]*");

                int click;
                if (!digits.Success || !int.TryParse(digits.Value.Replace(",", ""), out click))
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "GetNovalCilckComment: no click count in '{0}' for {1}; novel skipped.",
                        clicktext.Trim(), url);
                    continue;
                }

                novalTempBase.TotalClick = click;
                novalcol.Save(novalTempBase);
            }
        }



    

    }

}

 

// Related topic: (mongodb)