Lucene与Tag图

Tag图是采用了Tag作为文章管理工具的网站经常需要呈现的一种视图。利用Lucene的优异性能,可以出色地完成这一功能。

生成一个Tag图,首先需要知道用户一共使用了哪些Tag,其次需要知道每个Tag被使用的次数。

对于这两个功能,都可以使用Lucene.Net.Index.IndexReader.Terms方法。这个方法返回索引目录下的所有Term,以及它们在全部文档中被使用的次数。这就为我们生成Tag图提供了必要的基础。但是Terms方法返回的TermEnum是按照FieldName、text的顺序排序的,而不是按照docfreq排序的,所以还需要自己实现一个排序算法。

首先是索引的结构。我设计了如下的索引结构:

docurl:文档的url

contents:文档的内容,以便全文索引

doctags:文档相关的所有tag。各tag之间以空格或逗号分隔,可以使用单独的Analyzer进行解析,具体可以参考Analyzer以及PerFieldAnalyzerWrapper两个类。
排序算法使用一个按使用次数从高到低排列的链表来保存Tag。它的两个方法可以帮助我们设定Tag图中需要包含的Tag:GetList(int top)返回使用次数最高的前top个Tag,Top(int freq)返回使用次数不低于freq的全部Tag。TermFreq是每个Tag的数据内容:TermFreq.term是Tag的内容,TermFreq.freq是被使用的次数,据此就可以设定Tag的显示样式了。链表借助一个以freq为键的SortedList作为辅助索引,以便提高排序的效率。经过测试,这个排序算法对200M的TermFreq只需要11秒的时间。

  1  internal   class  TermFreq
  2          {
  3               public   string  term;
  4               public   int  freq  =   0 ;
  5          }
  6           internal   class  TermFreqCompare : System.Collections.IComparer
  7          {
  8               #region  IComparer 成员
  9 
 10               public   int  Compare( object  x,  object  y)
 11              {
 12                  TermFreq f1  =  x  as  TermFreq;
 13                  TermFreq f2  =  y  as  TermFreq;
 14                   int  compareResult  =  f1.freq.CompareTo(f2.freq);
 15                   // if(compareResult==0) return f2.term.CompareTo(f1.term);
 16                   return  compareResult;
 17              }
 18 
 19               #endregion
 20 
 21          }
 22           internal   class  TermFreqSortedList
 23          {
 24               private  Element root;
 25               private  System.Collections.IComparer comparer;
 26               private  System.Collections.SortedList list;
 27               internal   class  Element
 28              {
 29                   public  Element prev;
 30                   public  Element next;
 31                   public  TermFreq current;
 32              }
 33               public  TermFreqSortedList(System.Collections.IComparer comparer)
 34              {
 35                  root  =   new  Element();
 36                  root.current  =   new  TermFreq();
 37                   this .comparer  =  comparer;
 38                  list  =   new  System.Collections.SortedList();
 39              }
 40               private  Element GetStartElement( int  freq)
 41              {
 42                  Element ele  =   null ;
 43                   if (list.ContainsKey(freq))
 44                  {
 45                      ele  =  list[freq]  as  Element;
 46                  }
 47                   else
 48                  {
 49                      list.Add(freq, null );
 50                       int  index  =  list.IndexOfKey(freq) - 1 ;
 51                       if (index < 0 ) ele  =  list[ 0 as  Element;
 52                       else  ele  =  list[index]  as  Element;
 53                  }
 54                   return  ele;
 55              }
 56               public   void  Add(TermFreq o)
 57              {
 58                  Element ele  =  GetStartElement(o.freq);
 59                   if (ele == null ) ele  =  root;
 60                  Element oEle  =   new  Element();
 61                  oEle.current  =  o;
 62                  list[oEle.current.freq]  =  oEle;
 63                   while (ele  != null )
 64                  {
 65                       int  compareResult  =  comparer.Compare(ele.current,oEle.current);
 66                       if (compareResult > 0 )
 67                      {
 68                           if (ele.next == null )
 69                          {
 70                              ele.next  =  oEle;
 71                              oEle.prev  =  ele;
 72                               break ;
 73                          }
 74                           else   if (comparer.Compare(ele.next.current,oEle.current) < 0 )
 75                          {
 76                              ele.next.prev  =  oEle;
 77                              oEle.next  =  ele.next;
 78                              ele.next  =  oEle;
 79                              oEle.prev  =  ele;
 80                               break ;
 81                          }
 82                           else
 83                          {
 84                              ele  =  ele.next;
 85                               continue ;
 86                          }
 87                      }
 88                       else   if (compareResult < 0 )
 89                      {
 90                           if (ele.prev == null )
 91                          {
 92                              ele.prev  =  oEle;
 93                              oEle.next  =  ele;
 94                              root  =  oEle;
 95                               break ;
 96                          }
 97                           else   if (comparer.Compare(ele.prev.current,oEle.current) > 0 )
 98                          {
 99                              ele.prev.next  =  oEle;
100                              oEle.prev  =  ele.prev;
101 
102                              ele.prev  =  oEle;
103                              oEle.next  =  ele;
104                               break ;
105                          }
106                           else
107                          {
108                              ele  =  ele.prev;
109                               continue ;
110                          }
111                      }
112                       if (ele.prev != null )
113                      {
114                          ele.prev.next  =  oEle;
115                          oEle.prev  =  ele.prev;
116                      }
117                       else
118                      {
119                          root  =  oEle;
120                      }
121                      oEle.next  =  ele;
122                      ele.prev  =  oEle;
123                       break ;
124                  }
125              }
126               public  System.Collections.ArrayList GetList( int  top)
127              {
128                  System.Collections.ArrayList list  =   new  System.Collections.ArrayList();
129                  Element ele  =  root;
130                   int  i = 0 ;
131                   while ((i ++ ) < top)
132                  {
133                      list.Add(ele.current);
134                       if (ele.next  ==   null )
135                      {
136                           return  list;
137                      }
138                      ele  =  ele.next;
139                  }
140                   return  list;
141              }
142               public  System.Collections.ArrayList Top( int  freq)
143              {
144                  System.Collections.ArrayList list  =   new  System.Collections.ArrayList();
145                  Element ele  =  root;
146                   while (ele.current.freq  >=  freq)
147                  {
148                      list.Add(ele.current);;
149                       if (ele.next == null )
150                           return  list;
151                      ele  =  ele.next;
152                  }
153                   return  list;
154              }
155          }

文档生成的代码:

// Build the index document for one article: its URL, its body text and its tags.
Document doc = new Document();

Field urlField = Field.Keyword(" docurl ", docurl);
doc.Add(urlField);

Field bodyField = Field.Text(" contents ", contents);
doc.Add(bodyField);

// storeTermVector is true so that the number of times each tag was applied to
// this document can be read back later through TermFreqVector, which is what a
// per-document tag cloud needs.
Field tagField = Field.Text(" doctags ", reader, true);
doc.Add(tagField);

测试代码:

 1  Lucene.Net.Index.TermEnum enu  =  reader.Terms( new  Term( " contents " , " _ " ));
 2                  TermFreqSortedList list  =   new  TermFreqSortedList( new  TermFreqCompare());
 3                  
 4                   while (enu.Next())
 5                  {
 6                      Lucene.Net.Index.Term t  =  enu.Term();
 7                      
 8                      TermFreq f  =   new  TermFreq();
 9                      f.freq  =  enu.DocFreq();
10                      f.term  =  t.Text();
11                      list.Add(f);
12                  }
13                   for (System.Collections.IEnumerator ienu  =  list.GetList( 5 ).GetEnumerator();ienu.MoveNext();)
14                  {
15                      TermFreq ff  =  ienu.Current  as  TermFreq;
16                      
17                      Console.WriteLine( string .Format( " Term:{0}.\t\t\tDocFreq:{1} " ,
18                          ff.term,
19                          ff.freq));
20                  }
21                   for (System.Collections.IEnumerator ienu  =  list.Top( 3 ).GetEnumerator();ienu.MoveNext();)
22                  {
23                      TermFreq ff  =  ienu.Current  as  TermFreq;
24                      
25                      Console.WriteLine( string .Format( " Term:{0}.\t\t\tDocFreq:{1} " ,
26                          ff.term,
27                          ff.freq));
28                  }


 

你可能感兴趣的:(Lucene)