On IndexSearcher, the core class for search.
IndexSearcher is the most central implementation class of Lucene's search machinery. It extends the abstract class Searcher, which implements several of the core search methods. Searcher in turn implements the Searchable interface, the abstract protocol for search: because Searchable extends java.rmi.Remote, this protocol can also be used to search an index directory on a remote server.
The JDK documentation describes java.rmi.Remote; in short, an interface that extends java.rmi.Remote has the following properties:
1. The Remote interface serves to identify interfaces whose methods may be invoked from a non-local virtual machine;
2. An interface that extends java.rmi.Remote is available remotely;
3. A class that implements a sub-interface of java.rmi.Remote can manage remote objects.
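To make the remote capability concrete, here is a minimal sketch of publishing and consuming a searcher over RMI using Lucene's RemoteSearchable wrapper (which extends UnicastRemoteObject and implements Searchable). The registry port, bind name, and index path are illustrative assumptions:
import java.rmi.Naming;
import java.rmi.registry.LocateRegistry;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.RemoteSearchable;
import org.apache.lucene.search.Searchable;
import org.apache.lucene.search.Searcher;
public class RemoteSearchDemo {
// Server side: export a local IndexSearcher through RMI
public static void startServer() throws Exception {
Searchable local = new IndexSearcher("E:\\Lucene\\index"); // assumed index path
LocateRegistry.createRegistry(1099); // default RMI registry port
Naming.rebind("//localhost/Searchable", new RemoteSearchable(local));
}
// Client side: look up the remote Searchable and search it like a local one
public static Searcher connect() throws Exception {
Searchable remote = (Searchable) Naming.lookup("//localhost/Searchable");
return new MultiSearcher(new Searchable[] { remote });
}
}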
Below is an overview of the search-related interfaces and abstract classes; it will help with the later study of their implementation classes.
The Searchable interface
The Searchable interface is defined as follows:
package org.apache.lucene.search;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.CorruptIndexException;
import java.io.IOException;
public interface Searchable extends java.rmi.Remote {
/* The core search method, taking a Weight and a Filter. Since the return type is void, the matching Documents are handed to the HitCollector, which collects every document whose score is greater than 0. */
void search(Weight weight, Filter filter, HitCollector results)
throws IOException;
// Releases the resources associated with this searcher
void close() throws IOException;
// Returns the number of documents containing the given term
int docFreq(Term term) throws IOException;
// Returns, for each term in the given array, the number of documents containing it
int[] docFreqs(Term[] terms) throws IOException;
// Returns an integer one greater than the largest possible document number
int maxDoc() throws IOException;
// Search method that returns the top n scoring documents as a TopDocs
TopDocs search(Weight weight, Filter filter, int n) throws IOException;
// Returns the Document numbered i (note: this is the internal document number; in the earlier test program, System.out.println(searcher.doc(24)); prints Document<stored/uncompressed,indexed<path:E:\Lucene\txt1\mytxt\FAQ.txt> stored/uncompressed,indexed<modified:200604130754>>)
Document doc(int i) throws CorruptIndexException, IOException;
// Returns the Document numbered n; FieldSelector acts like a field filter, with a single method FieldSelectorResult accept(String fieldName);
Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException;
// Rewrites the given Query into primitive queries suitable for scoring
Query rewrite(Query query) throws IOException;
// Returns an Explanation describing how the given document was scored against the Weight
Explanation explain(Weight weight, int doc) throws IOException;
// Returns the top n scoring documents, ordered by the given Sort
TopFieldDocs search(Weight weight, Filter filter, int n, Sort sort)
throws IOException;
}
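Before moving on, a short sketch that exercises a few of these methods against a local index; the index path and field name are illustrative assumptions:
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
public class SearchableDemo {
public static void main(String[] args) throws Exception {
IndexSearcher searcher = new IndexSearcher("E:\\Lucene\\index"); // assumed path
Term term = new Term("contents", "lucene");
System.out.println("docFreq = " + searcher.docFreq(term)); // documents containing the term
System.out.println("maxDoc = " + searcher.maxDoc()); // one beyond the largest doc number
System.out.println(searcher.doc(0)); // fetch a stored Document by internal number
searcher.close(); // release the searcher's resources
}
}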
The Searcher abstract class
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.document.Document;
// This abstract class implements the Searchable interface
public abstract class Searcher implements Searchable {
// Returns the documents matching the given Query as a Hits instance, a rich result container
public final Hits search(Query query) throws IOException {
return search(query, (Filter)null); // delegates to search(Query, Filter) below
}
public Hits search(Query query, Filter filter) throws IOException {
return new Hits(this, query, filter);
}
// With a Sort specified
public Hits search(Query query, Sort sort)
throws IOException {
return new Hits(this, query, null, sort);
}
// With both a Filter and a Sort specified
public Hits search(Query query, Filter filter, Sort sort)
throws IOException {
return new Hits(this, query, filter, sort);
}
// Implements the Searchable method: returns the top n scoring documents under the given sort order
public TopFieldDocs search(Query query, Filter filter, int n,
Sort sort) throws IOException {
return search(createWeight(query), filter, n, sort); // delegates to the abstract search(Weight, Filter, int, Sort) below
}
public void search(Query query, HitCollector results)
throws IOException {
search(query, (Filter)null, results);
}
public void search(Query query, Filter filter, HitCollector results)
throws IOException {
search(createWeight(query), filter, results);
}
public TopDocs search(Query query, Filter filter, int n)
throws IOException {
return search(createWeight(query), filter, n);
}
public Explanation explain(Query query, int doc) throws IOException {
return explain(createWeight(query), doc);
}
// Sets the Similarity implementation used by this Searcher
public void setSimilarity(Similarity similarity) {
this.similarity = similarity;
}
public Similarity getSimilarity() {
return this.similarity;
}
// Creates a Weight that records, for this Searcher, the state of the given Query
protected Weight createWeight(Query query) throws IOException {
return query.weight(this);
}
// Implements the Searchable method
public int[] docFreqs(Term[] terms) throws IOException {
int[] result = new int[terms.length];
for (int i = 0; i < terms.length; i++) {
result[i] = docFreq(terms[i]);
}
return result;
}
// Abstract methods, already listed in the Searchable interface
abstract public void search(Weight weight, Filter filter, HitCollector results) throws IOException;
abstract public void close() throws IOException;
abstract public int docFreq(Term term) throws IOException;
abstract public int maxDoc() throws IOException;
abstract public TopDocs search(Weight weight, Filter filter, int n) throws IOException;
abstract public Document doc(int i) throws CorruptIndexException, IOException;
abstract public Query rewrite(Query query) throws IOException;
abstract public Explanation explain(Weight weight, int doc) throws IOException;
abstract public TopFieldDocs search(Weight weight, Filter filter, int n, Sort sort) throws IOException;
}
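In everyday code one calls these convenience overloads rather than the Weight-based primitives. A sketch, with the query string, field names, and sort field assumed for illustration:
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
public class SearcherDemo {
public static void main(String[] args) throws Exception {
IndexSearcher searcher = new IndexSearcher("E:\\Lucene\\index"); // assumed path
TermQuery query = new TermQuery(new Term("contents", "lucene"));
Hits hits = searcher.search(query); // score-ordered results
for (int i = 0; i < hits.length(); i++) {
System.out.println(hits.score(i) + "\t" + hits.doc(i).get("path"));
}
Hits sorted = searcher.search(query, new Sort("modified")); // ordered by an indexed field instead
System.out.println("sorted hits: " + sorted.length());
searcher.close();
}
}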
The Weight interface
A Weight exists so that an already-configured Query instance is not modified during search, which lets the same Query be reused rather than re-created.
A Query instance is independent of any particular IndexSearcher; it is this searcher-dependent state of a Query that is recorded in a Weight.
The source of the Weight interface is shown below:
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
public interface Weight extends java.io.Serializable {
// Returns the Query this Weight was created from
Query getQuery();
// Returns the weight value computed for the Query
float getValue();
/** The sum of squared weights of contained query clauses. */
float sumOfSquaredWeights() throws IOException;
// Applies the query normalization factor to this Weight
void normalize(float norm);
// Creates a Scorer for this Weight (a Scorer computes document scores)
Scorer scorer(IndexReader reader) throws IOException;
// Computes the score of the given document; the returned Explanation records how the score was derived
Explanation explain(IndexReader reader, int doc) throws IOException;
}
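A Weight is normally obtained through Searcher.createWeight(query), which delegates to Query.weight(searcher). Internally that method rewrites the query, creates the Weight, and normalizes it; the sketch below annotates the public call with the steps that happen inside it in this version of Lucene:
// What happens inside Query.weight(Searcher):
// 1. query = searcher.rewrite(query) -- expand to primitive queries
// 2. weight = query.createWeight(searcher) -- searcher-dependent state lives here
// 3. sum = weight.sumOfSquaredWeights()
// 4. norm = searcher.getSimilarity().queryNorm(sum)
// 5. weight.normalize(norm) -- the Query itself is never modified
Weight weight = query.weight(searcher);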
The HitCollector abstract class
package org.apache.lucene.search;
// An abstract class for collecting matching documents
public abstract class HitCollector {
// Called with each matching document's number and score; implementations decide which documents to keep
public abstract void collect(int doc, float score);
}
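Since collect() is the only abstract method, writing a collector is straightforward. A minimal sketch (the class name is ours) that counts matches and remembers the best score; an instance can be passed directly to Searcher.search(Query, HitCollector):
import org.apache.lucene.search.HitCollector;
// Counts matching documents and tracks the highest score seen
public class CountingCollector extends HitCollector {
private int count = 0;
private float best = Float.NEGATIVE_INFINITY;
public void collect(int doc, float score) {
count++;
if (score > best) best = score;
}
public int getCount() { return count; }
public float getBest() { return best; }
}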
The Scorer abstract class
package org.apache.lucene.search;
import java.io.IOException;
// Manages the scores of documents matching a Query
public abstract class Scorer {
private Similarity similarity;
// Constructs a Scorer.
protected Scorer(Similarity similarity) {
this.similarity = similarity;
}
public Similarity getSimilarity() {
return this.similarity;
}
// Iterates over all matching documents, passing each to the HitCollector
public void score(HitCollector hc) throws IOException {
while (next()) {
hc.collect(doc(), score());
}
}
// Collects matching documents within a range (document numbers < max)
protected boolean score(HitCollector hc, int max) throws IOException {
while (doc() < max) {
hc.collect(doc(), score());
if (!next())
return false;
}
return true;
}
/** Advances to the next document matching the query. */
public abstract boolean next() throws IOException;
// Returns the current document number
public abstract int doc();
// Returns the score of the current matching document
public abstract float score() throws IOException;
/** Skips to the first match beyond the current whose document number is
* greater than or equal to a given target.
* <br>When this method is used the {@link #explain(int)} method should not be used.
* @param target The target document number.
* @return true iff there is such a match.
* <p>Behaves as if written: <pre>
* boolean skipTo(int target) {
* do {
* if (!next())
* return false;
* } while (target > doc());
* return true;
* }
* </pre>Most implementations are considerably more efficient than that.
*/
public abstract boolean skipTo(int target) throws IOException;
public abstract Explanation explain(int doc) throws IOException;
}
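Putting Weight and Scorer together, collecting results boils down to the loop inside score(HitCollector). A sketch, assuming weight was built as in the Weight section above and reader came from IndexSearcher.getIndexReader():
// Equivalent to scorer.score(hc), but printing instead of collecting:
Scorer scorer = weight.scorer(reader);
while (scorer.next()) {
System.out.println("doc=" + scorer.doc() + " score=" + scorer.score());
}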
The Similarity abstract class
The class is best explained by its own Javadoc, the relevant portion of which is reproduced below:
Expert: Scoring API. Subclasses implement search scoring.
The score of query q for document d correlates to the cosine-distance or dot-product between document and query vectors in a Vector Space Model (VSM) of Information Retrieval. A document whose vector is closer to the query vector in that model is scored higher. The score is computed as follows:

score(q,d) = coord(q,d) · queryNorm(q) · Σ_{t in q} ( tf(t in d) · idf(t)² · t.getBoost() · norm(t,d) )

where

tf(t in d) = frequency^(1/2)
idf(t) = 1 + log( numDocs / (docFreq + 1) )
queryNorm(q) = queryNorm(sumOfSquaredWeights) = 1 / sumOfSquaredWeights^(1/2)
sumOfSquaredWeights = q.getBoost()² · Σ_{t in q} ( idf(t) · t.getBoost() )²

norm(t,d) is computed when a document is added to the index, at which point all of its factors are multiplied together; if the document has multiple fields with the same name, all their boosts are multiplied as well:

norm(t,d) = doc.getBoost() · lengthNorm(field) · Π_{field f in d named as t} f.getBoost()
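As a quick worked example using DefaultSimilarity's implementations of these factors: a term occurring 4 times in a document gives tf = 4^(1/2) = 2; a term appearing in 10 of 1,000 documents gives idf = 1 + ln(1000 / (10 + 1)) ≈ 5.51; and a 16-token field gets lengthNorm = 1 / 16^(1/2) = 0.25.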
The source of the abstract class is shown below:
package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.SmallFloat;
import java.io.IOException;
import java.io.Serializable;
import java.util.Collection;
import java.util.Iterator;
public abstract class Similarity implements Serializable {
// DefaultSimilarity, a subclass of Similarity, is the default implementation
private static Similarity defaultImpl = new DefaultSimilarity();
public static void setDefault(Similarity similarity) {
Similarity.defaultImpl = similarity;
}
public static Similarity getDefault() {
return Similarity.defaultImpl;
}
// Table of decoded normalization factors
private static final float[] NORM_TABLE = new float[256];
static { // populated when the class is loaded
for (int i = 0; i < 256; i++)
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); // decode every possible byte value into a float
}
// Decodes a normalization factor (byte to float)
public static float decodeNorm(byte b) {
return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
// Returns the norm decoder table
public static float[] getNormDecoder() {
return NORM_TABLE;
}
// Computes the length normalization factor for a field named fieldName that contains numTokens tokens
public abstract float lengthNorm(String fieldName, int numTokens);
// Computes the normalization factor for a query, given the sum of the squared weights of its terms
public abstract float queryNorm(float sumOfSquaredWeights);
// Encodes a normalization factor for storage in the index (float to byte)
public static byte encodeNorm(float f) {
return SmallFloat.floatToByte315(f);
}
// Computes the score factor for a term's frequency within a document
public float tf(int freq) {
return tf((float)freq);
}
/** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
* the frequency that is passed to {@link #tf(float)}.
*
* <p>A phrase match with a small edit distance to a document passage more
* closely matches the document, so implementations of this method usually
* return larger values when the edit distance is small and smaller values
* when it is large.
*
* @see PhraseQuery#setSlop(int)
* @param distance the edit distance of this sloppy phrase match
* @return the frequency increment for this match
*/
public abstract float sloppyFreq(int distance);
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(Term, Searcher)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public abstract float tf(float freq);
/** Computes a score factor for a simple term.
*
* <p>The default implementation is:<pre>
* return idf(searcher.docFreq(term), searcher.maxDoc());
* </pre>
*
* Note that {@link Searcher#maxDoc()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs()} because it is proportional to
* {@link Searcher#docFreq(Term)} , i.e., when one is inaccurate,
* so is the other, and in the same direction.
*
* @param term the term in question
* @param searcher the document collection being searched
* @return a score factor for the term
*/
public float idf(Term term, Searcher searcher) throws IOException {
return idf(searcher.docFreq(term), searcher.maxDoc());
}
// Computes an idf score factor for a phrase (the sum of each term's idf)
public float idf(Collection terms, Searcher searcher) throws IOException {
float idf = 0.0f;
Iterator i = terms.iterator();
while (i.hasNext()) {
idf += idf((Term)i.next(), searcher);
}
return idf;
}
/** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
* {@link #tf(int)} factor for each term in the query and these products are
* then summed to form the initial score for a document.
*/
public abstract float idf(int docFreq, int numDocs);
/** Computes a score factor based on the fraction of all query terms that a
* document contains. This value is multiplied into scores.
*/
public abstract float coord(int overlap, int maxOverlap);
/**
* Calculate a scoring factor based on the data in the payload. Overriding implementations
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about
* what is in the byte array.
*/
public float scorePayload(byte [] payload, int offset, int length)
{
//Do nothing
return 1;
}
}
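Because every factor above is an overridable method, scoring can be tuned by subclassing. A minimal sketch (the class name is an assumption) that disables length normalization so that long fields are not penalized:
import org.apache.lucene.search.DefaultSimilarity;
// Illustrative subclass: treat every field as if it had the same length
public class NoLengthNormSimilarity extends DefaultSimilarity {
public float lengthNorm(String fieldName, int numTokens) {
return 1.0f; // DefaultSimilarity would return (float)(1.0 / Math.sqrt(numTokens))
}
}
It can be installed globally with Similarity.setDefault(new NoLengthNormSimilarity()) or per searcher via setSimilarity(). Note that lengthNorm only affects documents indexed after the change, since norms are computed at indexing time.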