本文简单介绍用Lucene.Net实现GroupBy效果的方法,与《Lucene.Net 按类别统计搜索结果数》一文类似。注意,这种用法很影响效率,特别是在命中结果较多的情况下。这段代码基于2.3.1版本修改而来,其它版本可能会与此有差别。
改造思路仍然是扩展IndexSearcher的行为,但这里不再直接修改类库源码,而是通过继承在自己的代码中实现。
扩充IndexSearcher类
/// <summary>
/// An IndexSearcher with an added GroupBy capability.
/// </summary>
public class IndexSearcherExtension : IndexSearcher
{
    /// <summary>
    /// Name of the field to group by; assigned through GroupBy.
    /// </summary>
    private string groupField;

    /// <summary>
    /// Only this constructor is provided here; the remaining base-class
    /// constructors are not listed.
    /// </summary>
    /// <param name="path">Forwarded unchanged to the base IndexSearcher constructor.</param>
    public IndexSearcherExtension(string path)
        : base(path)
    {
    }

    /// <summary>
    /// The group-by field name, read by TopDocCollectorExtension.Collect.
    /// </summary>
    public string FieldName
    {
        get { return this.groupField; }
    }

    /// <summary>
    /// Sets the group-by field. Call this before Search; the collector reads
    /// the field name from this searcher while collecting hits.
    /// </summary>
    /// <param name="fieldName">Name of the field to group by.</param>
    public void GroupBy(string fieldName)
    {
        groupField = fieldName;
    }

    /// <summary>
    /// Overrides Search so that hits are gathered by a TopDocCollectorExtension
    /// bound to this searcher.
    /// </summary>
    /// <param name="weight">Forwarded to the collector-based Search overload.</param>
    /// <param name="filter">Forwarded to the collector-based Search overload.</param>
    /// <param name="nDocs">Number of hits to collect; must be greater than zero.</param>
    /// <returns>The TopDocs produced by the collector.</returns>
    /// <exception cref="System.ArgumentException">Thrown when nDocs is not positive.</exception>
    public override TopDocs Search(Weight weight, Filter filter, int nDocs)
    {
        if (nDocs > 0)
        {
            TopDocCollectorExtension groupingCollector = new TopDocCollectorExtension(nDocs, this);
            Search(weight, filter, groupingCollector);
            return groupingCollector.TopDocs();
        }

        throw new System.ArgumentException("nDocs must be > 0");
    }
}
HitQueueExtension类:实现与HitQueue类完全一致,只因为这里无法使用类库提供的构造函数。
/// <summary>
/// A priority queue of ScoreDoc entries. The implementation mirrors the
/// library's HitQueue; it exists only because the library's constructor
/// cannot be used from this code.
/// </summary>
public class HitQueueExtension : PriorityQueue
{
    /// <summary>
    /// Creates the queue and initializes it with the given size.
    /// </summary>
    /// <param name="size">Size passed to Initialize.</param>
    internal HitQueueExtension(int size)
    {
        Initialize(size);
    }

    /// <summary>
    /// Orders two ScoreDoc entries: the lower score is "less"; when scores are
    /// equal, the entry with the larger doc number is "less".
    /// </summary>
    /// <param name="a">First entry; cast to ScoreDoc.</param>
    /// <param name="b">Second entry; cast to ScoreDoc.</param>
    /// <returns>True when <paramref name="a"/> sorts before <paramref name="b"/>.</returns>
    public override bool LessThan(System.Object a, System.Object b)
    {
        ScoreDoc left = (ScoreDoc) a;
        ScoreDoc right = (ScoreDoc) b;

        return left.score == right.score
            ? left.doc > right.doc
            : left.score < right.score;
    }
}
/// <summary>
/// A HitCollector that keeps the top-scoring hits and, when the owning
/// IndexSearcherExtension has a group-by field set, keeps only the first hit
/// for each distinct (trimmed) value of that field. It is implemented as a
/// separate HitCollector subclass instead of deriving from TopDocCollector.
/// </summary>
public class TopDocCollectorExtension : HitCollector
{
    private ScoreDoc reusableSD;
    internal int totalHits;
    internal PriorityQueue hq;

    /// <summary>
    /// Searcher supplying the group-by field name and the index reader.
    /// Null when the collector was created without a searcher, in which case
    /// no de-duplication is performed.
    /// </summary>
    private IndexSearcherExtension searcher;

    /// <summary>
    /// Group-by values that have already produced a hit. The values themselves
    /// are stored (not hash codes), so distinct values are never merged by a
    /// hash collision and lookups do not need a linear scan.
    /// </summary>
    private Dictionary<string, bool> seenValues = new Dictionary<string, bool>();

    /// <summary>
    /// Construct to collect a given number of hits, without grouping.
    /// </summary>
    /// <param name="numHits">the maximum number of hits to collect</param>
    public TopDocCollectorExtension(int numHits)
        : this(numHits, new HitQueueExtension(numHits))
    {
    }

    /// <summary>
    /// Construct to collect a given number of hits, grouping by the field
    /// configured on <paramref name="searcher"/>.
    /// </summary>
    /// <param name="numHits">the maximum number of hits to collect</param>
    /// <param name="searcher">searcher whose FieldName and index reader are used for grouping</param>
    public TopDocCollectorExtension(int numHits, IndexSearcherExtension searcher)
        : this(numHits)
    {
        this.searcher = searcher;
    }

    /// <summary>
    /// Construct with a caller-supplied priority queue.
    /// </summary>
    /// <param name="numHits">not used by this constructor</param>
    /// <param name="hq">queue that receives the collected hits</param>
    internal TopDocCollectorExtension(int numHits, PriorityQueue hq)
    {
        this.hq = hq;
    }

    /// <summary>
    /// Collects one hit. Hits whose score is not greater than zero are ignored,
    /// as are hits whose group-by value has been seen before; ignored hits are
    /// not counted in the total.
    /// </summary>
    /// <param name="doc">document number of the hit</param>
    /// <param name="score">score of the hit</param>
    public override void Collect(int doc, float score)
    {
        // Written as a negated '>' so that a NaN score is skipped, exactly as
        // the original 'if (score > 0.0f)' did.
        if (!(score > 0.0f))
        {
            return;
        }

        if (IsDuplicateGroup(doc))
        {
            return;
        }

        totalHits++;
        if (reusableSD == null)
        {
            reusableSD = new ScoreDoc(doc, score);
        }
        else if (score >= reusableSD.score)
        {
            // reusableSD holds the last "rejected" entry, so, if
            // this new score is not better than that, there's no
            // need to try inserting it
            reusableSD.doc = doc;
            reusableSD.score = score;
        }
        else
        {
            return;
        }

        reusableSD = (ScoreDoc)hq.InsertWithOverflow(reusableSD);
    }

    /// <summary>
    /// Returns true when grouping is active and the document's group-by value
    /// has already been recorded; otherwise records the value and returns false.
    /// A document that lacks the field is grouped together with documents whose
    /// value is empty.
    /// </summary>
    /// <param name="doc">document number to inspect</param>
    private bool IsDuplicateGroup(int doc)
    {
        // No searcher (public single-argument constructor) or no field set:
        // grouping is off, so nothing is a duplicate.
        if (searcher == null)
        {
            return false;
        }

        string groupField = searcher.FieldName;
        if (string.IsNullOrEmpty(groupField))
        {
            return false;
        }

        Document document = searcher.GetIndexReader().Document(doc);

        // Get may return null when the field is absent; guard before Trim.
        string rawValue = document.Get(groupField);
        string groupKey = rawValue == null ? string.Empty : rawValue.Trim();

        if (seenValues.ContainsKey(groupKey))
        {
            return true;
        }

        seenValues.Add(groupKey, true);
        return false;
    }

    /// <summary>
    /// The total number of documents that matched this query and were counted
    /// by Collect.
    /// </summary>
    public virtual int GetTotalHits()
    {
        return totalHits;
    }

    /// <summary>
    /// The top-scoring hits. Calling this empties the underlying queue.
    /// </summary>
    public virtual TopDocs TopDocs()
    {
        ScoreDoc[] scoreDocs = new ScoreDoc[hq.Size()];

        // Pop yields the least entry first, so fill the array from the back
        // to end up with the best hit at index 0.
        for (int i = hq.Size() - 1; i >= 0; i--)
        {
            scoreDocs[i] = (ScoreDoc)hq.Pop();
        }

        // Test the array length, not totalHits, so an empty queue can never
        // cause an out-of-range access on scoreDocs[0].
        float maxScore = (scoreDocs.Length == 0)
            ? System.Single.NegativeInfinity
            : scoreDocs[0].score;

        return new TopDocs(totalHits, scoreDocs, maxScore);
    }
}
OK,生产者完成了,下面看看消费者如何调用。
/// <summary>
/// Demo: writes three identical documents to the index at e:\index, then
/// searches it with grouping on "field" and prints the stored value of each hit.
/// </summary>
/// <param name="args">Unused.</param>
static void Main(string[] args)
{
    IndexWriter writer = new IndexWriter("e:\\index", new StandardAnalyzer(), true);
    try
    {
        Document doc = new Document();
        doc.Add(new Field("field", "query value!", Field.Store.YES, Field.Index.TOKENIZED));

        // The same document is added three times on purpose, to show that
        // grouping collapses the duplicates.
        writer.AddDocument(doc);
        writer.AddDocument(doc);
        writer.AddDocument(doc);
    }
    finally
    {
        // Always close the writer so the index is not left open if indexing fails.
        writer.Close();
    }

    IndexSearcherExtension searcher = new IndexSearcherExtension("e:\\index");
    try
    {
        searcher.GroupBy("field");
        Query q = new QueryParser("field", new StandardAnalyzer()).Parse("query");
        Hits docs = searcher.Search(q);
        for (int i = 0; i < docs.Length(); i++)
        {
            Console.WriteLine(docs.Doc(i).Get("field"));
        }
    }
    finally
    {
        // Close the searcher even when parsing or searching throws.
        searcher.Close();
    }

    Console.ReadKey();
}
添加了三个相同的文档,结果只查询到一个结果,从而达到了目的。这段修改比较简单,应该还可以设计出更加高效的算法。好长时间没写博客有些生疏了~~!