Lucene索引一般以倒排拉链(term-doc)的方式存储,但是在处理搜索相关功能(如排序、高亮、摘要信息的获取)时,需要通过文档docid找到相应的正排信息。Lucene 4.0引入了一种新的字段类型DocValues,它是一种以列为主的字段,在索引时构建文档到值(document-to-value)的映射。这种方式减轻了字段缓存(FieldCache)的内存压力,并且使得sorting、faceting、grouping、function query更快。
在建索引的过程中,设置了DocValues类型的字段数据会直接以正排形式存储到内存(RAM)或磁盘(Disk)上。在读取的过程中(SegmentReader.java),首先到CloseableThreadLocal的线程本地map中查找,如果存在则直接返回;如果不存在,则通过DocValuesProducer从磁盘上读取,并将结果放入该map中缓存。
/**
 * Returns the numeric doc values for {@code field}, consulting the map obtained
 * from {@code docValuesLocal} first and falling back to the field's
 * {@link DocValuesProducer} on a miss.
 *
 * @param field name of the field whose numeric doc values are requested
 * @return the cached or freshly loaded {@link NumericDocValues}, or {@code null}
 *         when {@code getDVField} yields no {@link FieldInfo} for the field with
 *         type {@code DocValuesType.NUMERIC}
 * @throws IOException declared by the signature; presumably raised by the
 *         producer while loading -- confirm against DocValuesProducer
 */
@Override
public NumericDocValues getNumericDocValues(String field) throws IOException {
    ensureOpen();

    final Map<String,Object> cache = docValuesLocal.get();
    final Object cached = cache.get(field);
    // instanceof is already false for null, so no separate null check is needed.
    if (cached instanceof NumericDocValues) {
        return (NumericDocValues) cached;
    }

    final FieldInfo fieldInfo = getDVField(field, DocValuesType.NUMERIC);
    if (fieldInfo == null) {
        return null;
    }

    final DocValuesProducer producer = dvProducersByField.get(field);
    assert producer != null;

    // Load through the producer and remember the result for later lookups.
    final NumericDocValues loaded = producer.getNumeric(fieldInfo);
    cache.put(field, loaded);
    return loaded;
}
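DocValues可以直接通过AtomicReader.docValues(String)获取某个分片(segment)上的信息(下面的测试代码使用的是按类型区分的getNumericDocValues(String)、getBinaryDocValues(String)、getSortedDocValues(String)和getSortedSetDocValues(String)方法)。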
测试代码实现
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
/**
 * Demo of the four doc-values field types used here (numeric, binary, sorted,
 * sorted-set): indexes five documents into an in-memory directory, merges them
 * into a single segment, then reads the values back per document through an
 * {@link AtomicReader} and prints them.
 */
public class DocVauesTest {
    static final String NUMERIC_FIELD = "numeric";
    static final String BINARY_FIELD = "binary";
    static final String SORTED_FIELD = "sorted";
    static final String SORTEDSET_FIELD = "sortedset";

    // One entry per document; index i of every array describes document i.
    static long[] numericVals = new long[] {12, 13, 0, 100, 19};
    static String[] binary = new String[] {"lucene", "doc", "value", "test", "example"};
    // A null entry means the document gets no SORTED_FIELD value at all.
    static String[] sortedVals = new String[] {"lucene", "facet", "abacus", "search", null};
    // An empty inner array means the document gets no SORTEDSET_FIELD values.
    static String[][] sortedSetVals = new String[][] {{"lucene", "search"}, {"search"}, {"facet", "abacus", "search"}, {}, {}};

    // NOTE: both readers are closed before main returns; they are only valid while main is running.
    static IndexReader topReader;
    static AtomicReader atomicReader;

    /**
     * Builds the index, opens a reader on it and prints the doc values of every field type.
     *
     * @param args unused
     * @throws IOException if writing to or reading from the index fails
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources guarantees the directory and reader are released even on failure.
        try (RAMDirectory dir = new RAMDirectory()) {
            buildIndex(dir);
            try (IndexReader reader = DirectoryReader.open(dir)) {
                topReader = reader;
                // forceMerge(1) in buildIndex leaves a single segment, so leaf 0 covers all docs.
                atomicReader = reader.leaves().get(0).reader();

                printNumeric(atomicReader);
                printBinary(atomicReader);
                printSorted(atomicReader);
                printSortedSet(atomicReader);
            }
        }
    }

    /** Writes one document per entry of {@code numericVals} and merges down to one segment. */
    private static void buildIndex(RAMDirectory dir) throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_0, new StandardAnalyzer());
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            for (int i = 0; i < numericVals.length; ++i) {
                Document doc = new Document();
                doc.add(new NumericDocValuesField(NUMERIC_FIELD, numericVals[i]));
                doc.add(new BinaryDocValuesField(BINARY_FIELD, new BytesRef(binary[i])));
                if (sortedVals[i] != null) {
                    doc.add(new SortedDocValuesField(SORTED_FIELD, new BytesRef(sortedVals[i])));
                }
                for (String value : sortedSetVals[i]) {
                    doc.add(new SortedSetDocValuesField(SORTEDSET_FIELD, new BytesRef(value)));
                }
                writer.addDocument(doc);
            }
            writer.forceMerge(1);
            writer.commit();
        }
    }

    /** Prints the numeric doc value of document 0. */
    private static void printNumeric(AtomicReader reader) throws IOException {
        NumericDocValues numeric = reader.getNumericDocValues(NUMERIC_FIELD);
        System.out.println(numeric.get(0));
    }

    /** Prints the binary doc value of document 0 decoded as UTF-8. */
    private static void printBinary(AtomicReader reader) throws IOException {
        BinaryDocValues binaryValues = reader.getBinaryDocValues(BINARY_FIELD);
        System.out.println(binaryValues.get(0).utf8ToString());
    }

    /** Prints, for every document, the ordinal and then the value of the sorted field. */
    private static void printSorted(AtomicReader reader) throws IOException {
        SortedDocValues sorted = reader.getSortedDocValues(SORTED_FIELD);
        StringBuilder ordInfo = new StringBuilder();
        StringBuilder values = new StringBuilder();
        for (int i = 0; i < reader.maxDoc(); ++i) {
            ordInfo.append(sorted.getOrd(i)).append(':');
            values.append(sorted.get(i).utf8ToString()).append(':');
        }
        // Expected: 2:1:0:3:-1:   (per the original note, document 4 has no value and gets ord -1)
        System.out.println(ordInfo);
        // Expected: lucene:facet:abacus:search::   (per the original note, the missing value prints as empty)
        System.out.println(values);
    }

    /** Prints, for every document, each ordinal/value pair of the sorted-set field. */
    private static void printSortedSet(AtomicReader reader) throws IOException {
        SortedSetDocValues sortedSet = reader.getSortedSetDocValues(SORTEDSET_FIELD);
        StringBuilder info = new StringBuilder();
        for (int i = 0; i < reader.maxDoc(); ++i) {
            // Position the iterator on document i before pulling its ordinals.
            sortedSet.setDocument(i);
            info.append("Doc ").append(i);
            long ord;
            while ((ord = sortedSet.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                info.append(", ").append(ord).append('/');
                info.append(sortedSet.lookupOrd(ord).utf8ToString());
            }
            info.append(';');
        }
        // Expected: Doc 0, 2/lucene, 3/search;Doc 1, 3/search;Doc 2, 0/abacus, 1/facet, 3/search;Doc 3;Doc 4;
        System.out.println(info);
    }
}