Daily Report 2012/11/07 陈伯雄(step 8)

  今天针对PIPE组对数据表的修改,对建立倒排索引的程序做了系统性的修改。由于表DOC、VEDIO、QUESTION(由QAPAIR修改为QUESTION)的属性并不完全相同,处理数据的方法也相应做了少量修改:

  DOC表和VEDIO表具有的相同属性:title;

  DOC独有属性:author,keywords;

  QUESTION独有属性:question;

  3个表最后得到的倒排索引结构是相同的,都是WORDLIST和对应的ID;

  以下功能整合到分词模块和更新倒排索引模块中

//分词

        /// <summary>
        /// Builds the word list for the row the reader is currently positioned on.
        /// </summary>
        /// <param name="type">
        /// Selects which columns are read: 0 reads the title, keywords and author
        /// columns; 1 reads only the title column; any other value reads the
        /// question column.
        /// </param>
        /// <param name="reader">Data reader positioned on the row to process.</param>
        /// <returns>
        /// For type 0, the de-duplicated union of the segmented title, the keyword
        /// tokens and the author tokens; otherwise whatever the segmenter returns
        /// for the single column that was read.
        /// </returns>
        static private List<string> getWords(int type, SqlDataReader reader)
        {
            switch (type)
            {
                case 0:
                {
                    string titleText = reader[_Title].ToString();
                    string keywordText = reader[_KeyWords].ToString();
                    string authorText = reader[_Author].ToString();

                    // Keywords are delimited by spaces or colons, authors by spaces or dots.
                    char[] keywordSeparators = { ' ', ':' };
                    char[] authorSeparators = { ' ', '.' };

                    // Only the title goes through the segmenter; the other two columns
                    // are already token lists and are simply split.
                    IEnumerable<string> merged = ChineseWordSegmentation.word_segmentation(titleText);
                    merged = merged.Union(keywordText.Split(keywordSeparators, StringSplitOptions.RemoveEmptyEntries));
                    merged = merged.Union(authorText.Split(authorSeparators, StringSplitOptions.RemoveEmptyEntries));
                    return merged.ToList();
                }

                case 1:
                    return ChineseWordSegmentation.word_segmentation(reader[_Title].ToString());

                default:
                    return ChineseWordSegmentation.word_segmentation(reader[_Question].ToString());
            }
        }



        /// <summary>
        /// Updates the inverted index for one source row: each word is either
        /// inserted as a new entry pointing at <paramref name="ID"/>, or has
        /// "," + ID appended to the value already stored for it.
        /// </summary>
        /// <param name="words">Words extracted from the source row; null or empty is a no-op.</param>
        /// <param name="connection">Open connection on which the commands are executed.</param>
        /// <param name="ID">Identifier of the source row that contains the words.</param>
        /// <remarks>
        /// NOTE(review): assumes table index3 has exactly two columns, [key] and
        /// [value], in that order — the schema is not visible here; confirm.
        /// </remarks>
        private static void updateIndex(List<string> words, SqlConnection connection, string ID)
        {
            if (words == null || words.Count == 0)
            {
                return;
            }

            using (SqlCommand cmd = new SqlCommand())
            {
                cmd.Connection = connection;

                foreach (string word in words)
                {
                    if (word == null)
                    {
                        continue;
                    }

                    // Values are passed as parameters: embedding the identifiers in the
                    // SQL text would make the server look for columns named word/ID.
                    cmd.Parameters.Clear();
                    cmd.Parameters.AddWithValue("@word", word);

                    // KEY is a reserved word in T-SQL, hence the brackets.
                    cmd.CommandText = "SELECT [value] FROM index3 WHERE [key] = @word";
                    object val = cmd.ExecuteScalar();

                    if (val == null)
                    {
                        // ExecuteScalar yields null (not DBNull) when no row matched:
                        // the word is new, so add it to the index.
                        cmd.Parameters.AddWithValue("@id", ID);
                        cmd.CommandText = "INSERT INTO index3 VALUES(@word, @id)";
                    }
                    else
                    {
                        // The word already exists; DBNull here means the row exists
                        // but its stored value is NULL, so start the list afresh.
                        string existing = val == System.DBNull.Value ? string.Empty : val.ToString();
                        string newValue = existing.Length == 0 ? ID : existing + "," + ID;
                        cmd.Parameters.AddWithValue("@newValue", newValue);
                        cmd.CommandText = "UPDATE index3 SET [value] = @newValue WHERE [key] = @word";
                    }

                    cmd.ExecuteNonQuery();
                }
            }
        }

主函数部分:

 // Main routine: creates the three inverted-index tables, then scans each
 // source table, segments every row into words and feeds them to updateIndex.
 List<Result> resultList = new List<Result>();
 string connectionString = GetConnectionString();                        // SQL Server connection string
 using (SqlConnection connection = new SqlConnection(connectionString))  // disposed when the block exits
 {
     connection.Open();

     // Create the inverted-index tables. KEY is a reserved word in T-SQL,
     // so the column name has to be bracketed.
     // NOTE(review): updateIndex as shown in this post targets a table named
     // index3 with a [value] column, not these tables — confirm which is intended.
     string sqlstr = "CREATE table index_doc([key] varchar(50) primary key, ID varchar(50))";
     SqlCommand cmd = new SqlCommand();
     cmd.Connection = connection;
     cmd.CommandText = sqlstr;
     cmd.ExecuteNonQuery();

     sqlstr = "CREATE table index_vedio([key] varchar(50) primary key, ID varchar(50))";
     cmd.CommandText = sqlstr;
     cmd.ExecuteNonQuery();

     sqlstr = "CREATE table index_question([key] varchar(50) primary key, ID varchar(50))";
     cmd.CommandText = sqlstr;
     cmd.ExecuteNonQuery();

     for (int i = 0; i < 3; i++)
     {
         // i doubles as the "type" argument of getWords: 0, 1, then anything else.
         string table;
         if (i == 0)
         {
             table = _TableDoc;
         }
         else if (i == 1)
         {
             table = _TableVideo;
         }
         else
         {
             table = _TableQuestion;
         }

         // Read the source table. The space after FROM is required; without it
         // the keyword and the table name run together.
         sqlstr = "SELECT * FROM " + table;
         cmd.CommandText = sqlstr;

         // Buffer (ID, words) pairs first: updateIndex issues its own commands on
         // this connection, which is not allowed while a reader is still open
         // unless MARS is enabled.
         List<KeyValuePair<string, List<string>>> pending = new List<KeyValuePair<string, List<string>>>();
         using (SqlDataReader reader = cmd.ExecuteReader())
         {
             while (reader.Read())
             {
                 string ID = reader[_ID].ToString();
                 // Word segmentation for the current row.
                 List<string> words = getWords(i, reader);
                 pending.Add(new KeyValuePair<string, List<string>>(ID, words));
             }
         }

         // The reader is closed now; add the collected words to the inverted index.
         foreach (KeyValuePair<string, List<string>> entry in pending)
         {
             updateIndex(entry.Value, connection, entry.Key);
         }
     }
     // NOTE(review): the listing in the post ends here; the closing brace of the
     // using block above is not shown.

你可能感兴趣的:(port)