Tag图是采用了Tag作为文章管理工具的网站经常需要呈现的一种视图。利用Lucene的优异性能,可以出色地完成这一功能。
生成一个Tag图,首先需要知道用户一共使用了哪些Tag,其次需要知道每个Tag被使用的次数。
对于这两个功能,都可以使用Lucene.Index.IndexReader.Terms方法。这个方法返回索引目录下所有Term,以及它们在全部文档中被使用的次数。这就为我们生成Tag提供了必要的基础。但是Terms方法返回的TermEnum的排序算法是按照FieldName,text的方式排序的,而不是按照docfreq排序的,所以还需要实现一个排序算法。
首先是索引的结构。我设计了如下的索引结构:
docurl:文档的url
contents:文档的内容,以便全文索引
doctags:文档相关的所有tags。tag以空格或逗号作为分隔,可以使用单独的Analyzer进行解析。可以参考Analyzer以及PerFieldAnalyzerWrapper两个类。
排序算法,使用一个链表作为保存Tag的形式。它的两个方法GetList(int top)和Top(int freq)可以帮助我们设定Tag图中需要包含的Tag。TermFreq是每个Tag的数据内容。TermFreq.term是Tag的内容。TermFreq.freq是被使用的次数,这样就可以设定Tag的显示样式了。链表通过一个SortedList作为帮助信息,以便提高排序的效率。经过测试,这个排序算法对200M的TermFreq只需要11秒的时间。
/// <summary>
/// One tag (index term) together with the number of times it is used.
/// </summary>
internal class TermFreq
{
    /// <summary>Text of the tag/term. Null until assigned.</summary>
    public string term;

    /// <summary>Usage count of the term. Starts at 0.</summary>
    public int freq = 0;
}
/// <summary>
/// Compares two <see cref="TermFreq"/> objects by their <c>freq</c> value,
/// in ascending order. Entries with the same frequency compare as equal;
/// the term text is not used as a tie-breaker.
/// </summary>
internal class TermFreqCompare : System.Collections.IComparer
{
    #region IComparer Members

    /// <summary>
    /// Compares <paramref name="x"/> and <paramref name="y"/> by frequency.
    /// </summary>
    /// <param name="x">First <see cref="TermFreq"/>, or null.</param>
    /// <param name="y">Second <see cref="TermFreq"/>, or null.</param>
    /// <returns>
    /// A negative value if <paramref name="x"/> has the lower frequency, a
    /// positive value if it has the higher one, 0 if both are equal. A null
    /// reference sorts before any non-null object.
    /// </returns>
    /// <exception cref="System.ArgumentException">
    /// A non-null argument is not a <see cref="TermFreq"/>.
    /// </exception>
    public int Compare(object x, object y)
    {
        // Same instance (this also covers "both null") is trivially equal.
        if (object.ReferenceEquals(x, y))
        {
            return 0;
        }
        if (x == null)
        {
            return -1;
        }
        if (y == null)
        {
            return 1;
        }

        TermFreq f1 = x as TermFreq;
        TermFreq f2 = y as TermFreq;
        if (f1 == null || f2 == null)
        {
            // Fail with a descriptive error instead of a NullReferenceException.
            throw new System.ArgumentException("Both arguments must be TermFreq instances.");
        }

        return f1.freq.CompareTo(f2.freq);
    }

    #endregion
}
/// <summary>
/// Keeps <see cref="TermFreq"/> entries in a doubly linked list ordered from
/// the highest-ranked entry (as judged by the supplied comparer) down to the
/// lowest. A <see cref="System.Collections.SortedList"/> keyed by frequency
/// remembers the most recently inserted node for each frequency so that an
/// insertion can start its search next to the right position instead of at
/// the head of the list.
/// </summary>
internal class TermFreqSortedList
{
    // Head of the linked list (highest-ranked entry); null while the list is empty.
    private Element root;

    // Decides the ordering of the linked list.
    private System.Collections.IComparer comparer;

    // Maps a frequency to the Element most recently inserted with that frequency.
    private System.Collections.SortedList list;

    /// <summary>Node of the doubly linked list.</summary>
    internal class Element
    {
        public Element prev;
        public Element next;
        public TermFreq current;
    }

    /// <summary>
    /// Creates an empty list ordered by <paramref name="comparer"/>.
    /// </summary>
    /// <param name="comparer">
    /// Comparer invoked with two <see cref="TermFreq"/> objects. Entries that
    /// compare greater are placed closer to the head of the list.
    /// </param>
    public TermFreqSortedList(System.Collections.IComparer comparer)
    {
        // No placeholder head node is created: a dummy entry would show up
        // in the results of GetList/Top as a tag with a null term.
        this.comparer = comparer;
        list = new System.Collections.SortedList();
    }

    /// <summary>
    /// Finds a node from which the search for the insert position of a new
    /// entry with frequency <paramref name="freq"/> should start.
    /// </summary>
    /// <remarks>
    /// When the frequency is not indexed yet, a null placeholder is stored
    /// under it so that its sorted position can be looked up; the caller is
    /// expected to overwrite the placeholder with the new node.
    /// </remarks>
    /// <returns>A nearby node, or null if no hint is available.</returns>
    private Element GetStartElement(int freq)
    {
        if (list.ContainsKey(freq))
        {
            return list[freq] as Element;
        }

        list.Add(freq, null);
        int index = list.IndexOfKey(freq);

        // SortedList's indexer looks entries up by KEY; positional access
        // has to go through GetByIndex.
        if (index > 0)
        {
            return list.GetByIndex(index - 1) as Element;
        }
        if (index + 1 < list.Count)
        {
            return list.GetByIndex(index + 1) as Element;
        }
        return null;
    }

    /// <summary>
    /// Inserts <paramref name="o"/> at its ordered position. Entries that
    /// compare equal to an existing entry are placed in front of it.
    /// </summary>
    /// <param name="o">Entry to insert; must not be null.</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="o"/> is null.
    /// </exception>
    public void Add(TermFreq o)
    {
        if (o == null)
        {
            throw new System.ArgumentNullException("o");
        }

        Element start = GetStartElement(o.freq);
        Element node = new Element();
        node.current = o;

        if (root == null)
        {
            root = node;
        }
        else
        {
            // Without a usable hint fall back to the head of the list.
            Link(start != null ? start : root, node);
        }

        // Register the node only once it is linked, so the index never
        // points at a node that is not part of the chain.
        list[o.freq] = node;
    }

    /// <summary>
    /// Walks from <paramref name="ele"/> towards the position of
    /// <paramref name="node"/> and links it into the chain there.
    /// </summary>
    private void Link(Element ele, Element node)
    {
        while (true)
        {
            int compareResult = comparer.Compare(ele.current, node.current);
            if (compareResult > 0)
            {
                // ele ranks higher: the new node belongs somewhere after it.
                if (ele.next == null || comparer.Compare(ele.next.current, node.current) < 0)
                {
                    InsertAfter(ele, node);
                    return;
                }
                ele = ele.next;
            }
            else if (compareResult < 0)
            {
                // ele ranks lower: the new node belongs somewhere before it.
                if (ele.prev == null || comparer.Compare(ele.prev.current, node.current) > 0)
                {
                    InsertBefore(ele, node);
                    return;
                }
                ele = ele.prev;
            }
            else
            {
                // Equal rank: place the new node directly in front.
                InsertBefore(ele, node);
                return;
            }
        }
    }

    /// <summary>Links <paramref name="node"/> directly in front of <paramref name="anchor"/>.</summary>
    private void InsertBefore(Element anchor, Element node)
    {
        node.prev = anchor.prev;
        node.next = anchor;
        if (anchor.prev != null)
        {
            anchor.prev.next = node;
        }
        else
        {
            // anchor was the head, so the new node becomes the head.
            root = node;
        }
        anchor.prev = node;
    }

    /// <summary>Links <paramref name="node"/> directly behind <paramref name="anchor"/>.</summary>
    private static void InsertAfter(Element anchor, Element node)
    {
        node.next = anchor.next;
        node.prev = anchor;
        if (anchor.next != null)
        {
            anchor.next.prev = node;
        }
        anchor.next = node;
    }

    /// <summary>
    /// Returns at most <paramref name="top"/> entries, starting at the head
    /// of the list.
    /// </summary>
    /// <param name="top">Maximum number of entries; values of 0 or less give an empty result.</param>
    /// <returns>An <see cref="System.Collections.ArrayList"/> of <see cref="TermFreq"/>.</returns>
    public System.Collections.ArrayList GetList(int top)
    {
        System.Collections.ArrayList result = new System.Collections.ArrayList();
        Element ele = root;
        while (ele != null && result.Count < top)
        {
            result.Add(ele.current);
            ele = ele.next;
        }
        return result;
    }

    /// <summary>
    /// Returns the leading entries whose <c>freq</c> is at least
    /// <paramref name="freq"/>, stopping at the first entry below it.
    /// </summary>
    /// <remarks>
    /// This yields every entry at or above the threshold only when the
    /// comparer orders entries by frequency.
    /// </remarks>
    /// <param name="freq">Minimum frequency an entry must have.</param>
    /// <returns>An <see cref="System.Collections.ArrayList"/> of <see cref="TermFreq"/>.</returns>
    public System.Collections.ArrayList Top(int freq)
    {
        System.Collections.ArrayList result = new System.Collections.ArrayList();
        Element ele = root;
        while (ele != null && ele.current.freq >= freq)
        {
            result.Add(ele.current);
            ele = ele.next;
        }
        return result;
    }
}
文档生成的代码:
// Build the Lucene document: URL, full-text contents, and the tag field.
Document doc = new Document();
doc.Add(Field.Keyword("docurl", docurl));
doc.Add(Field.Text("contents", contents));
// storeTermVector == true: this lets us later read, through TermFreqVector,
// how many times each tag was applied to a document, so that a tag chart
// for a single document can be generated.
doc.Add(Field.Text("doctags", reader, true));
测试代码:
// Collect every term of the "contents" field, starting at "_", together
// with its document frequency.
Lucene.Net.Index.TermEnum enu = reader.Terms(new Term("contents", "_"));
TermFreqSortedList list = new TermFreqSortedList(new TermFreqCompare());

try
{
    // Terms(Term) returns an enumerator that is already positioned on the
    // first matching term, so that term must be read BEFORE calling Next().
    do
    {
        Lucene.Net.Index.Term t = enu.Term();

        // Terms are ordered by field name, then text: stop once the
        // enumeration is exhausted or runs into the next field.
        if (t == null || t.Field() != "contents")
        {
            break;
        }

        TermFreq f = new TermFreq();
        f.freq = enu.DocFreq();
        f.term = t.Text();
        list.Add(f);
    }
    while (enu.Next());
}
finally
{
    // Release the enumerator even if collecting the terms fails.
    enu.Close();
}

// The first five entries of the list.
foreach (TermFreq ff in list.GetList(5))
{
    Console.WriteLine("Term:{0}.\t\t\tDocFreq:{1}", ff.term, ff.freq);
}

// Every leading entry with a document frequency of at least 3.
foreach (TermFreq ff in list.Top(3))
{
    Console.WriteLine("Term:{0}.\t\t\tDocFreq:{1}", ff.term, ff.freq);
}