The previous chapter took a rough look at how SortedSetDocValues are indexed; this chapter takes an equally rough look at how faceting is done at search time, again starting from the example that ships with Lucene 5.0:
//SimpleSortedSetFacetsExample
private List<FacetResult> search() throws IOException {
DirectoryReader indexReader = DirectoryReader.open(indexDir);
IndexSearcher searcher = new IndexSearcher(indexReader);
/** Builds the SortedSetDocValuesReaderState instance, which also does some pre-processing of the SortedSetDocValues */
SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(indexReader);
// Aggregates the facet counts
FacetsCollector fc = new FacetsCollector();
// MatchAllDocsQuery is for "browsing" (counts facets
// for all non-deleted docs in the index); normally
// you'd use a "normal" query:
// First run the query to collect all matching doc ids, then facet over them
FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc);
// Retrieve results
// Count the $facets field values of each matching document
Facets facets = new SortedSetDocValuesFacetCounts(state, fc);
List<FacetResult> results = new ArrayList<>();
// Aggregate the facets for the requested dimensions
results.add(facets.getTopChildren(10, "Author"));
results.add(facets.getTopChildren(10, "Publish Year"));
indexReader.close();
return results;
}
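Before digging into the read path, it helps to recall what the index side looks like. A minimal sketch, paraphrasing the indexing half of SimpleSortedSetFacetsExample (the analyzer and the concrete values are illustrative):
//Indexing-side sketch (illustrative values)
FacetsConfig config = new FacetsConfig();
IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig(new WhitespaceAnalyzer()));
Document doc = new Document();
// Each facet value is a (dim, label) pair; FacetsConfig.build() flattens it into
// the "$facets" SortedSetDocValues field discussed in the previous chapter.
doc.add(new SortedSetDocValuesFacetField("Author", "Bob"));
doc.add(new SortedSetDocValuesFacetField("Publish Year", "2010"));
indexWriter.addDocument(config.build(doc));
indexWriter.close();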
First, let's see what SortedSetDocValuesReaderState does:
/** Creates this, pulling doc values from the specified
* field. */
public DefaultSortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException {
this.field = field;
this.origReader = reader;
// We need this to create thread-safe MultiSortedSetDV
// per collector:
topReader = SlowCompositeReaderWrapper.wrap(reader);
/** Reads the dictionary data stored in the previous chapter, mainly the OrdMap, ordinalCounts and OrdinalIndex discussed there */
SortedSetDocValues dv = topReader.getSortedSetDocValues(field);
if (dv == null) {
throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
}
if (dv.getValueCount() > Integer.MAX_VALUE) {
throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount());
}
// How many distinct doc values have been stored
valueCount = (int) dv.getValueCount();
// TODO: we can make this more efficient if eg we can be
// "involved" when OrdinalMap is being created? Ie see
// each term/ord it's assigning as it goes...
String lastDim = null;
int startOrd = -1;
// TODO: this approach can work for full hierarchy?;
// TaxoReader can't do this since ords are not in
// "sorted order" ... but we should generalize this to
// support arbitrary hierarchy:
// Iterate over all stored doc values to determine the ord range each dim occupies in the stored binary data
for(int ord=0;ord<valueCount;ord++) {
final BytesRef term = dv.lookupOrd(ord);
String[] components = FacetsConfig.stringToPath(term.utf8ToString());
if (components.length != 2) {
throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.toString(components) + " " + term.utf8ToString());
}
if (!components[0].equals(lastDim)) {
if (lastDim != null) {
prefixToOrdRange.put(lastDim, new OrdRange(startOrd, ord-1));
}
startOrd = ord;
lastDim = components[0];
}
}
if (lastDim != null) {
prefixToOrdRange.put(lastDim, new OrdRange(startOrd, valueCount-1));
}
}
Recall from earlier that every dim and label are concatenated into one long string before being stored. In practice the user queries by a dim of their own choosing, e.g. "Author" or "Publish Year" in the example, so while reading the terms we need to work out which consecutive terms belong to the same dim (e.g. "Author"). prefixToOrdRange records, for every dim, the ord range it occupies among all stored doc values, which is what the later counting step relies on.
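To make this concrete, here is a toy picture of the value space for the example above (the exact values are hypothetical, but the ordering is what the lexicographic sort of the flattened dim/label strings would produce):
// Sorted, de-duplicated doc values of the "$facets" field, one flattened dim/label string per ord:
//   ord 0: Author/Bob
//   ord 1: Author/Frank
//   ord 2: Author/Lisa
//   ord 3: Author/Susan
//   ord 4: Publish Year/1999
//   ord 5: Publish Year/2010
//   ord 6: Publish Year/2012
// The loop above would then record:
//   prefixToOrdRange.put("Author",       new OrdRange(0, 3));
//   prefixToOrdRange.put("Publish Year", new OrdRange(4, 6));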
Next, let's see what SortedSetDocValuesFacetCounts does:
public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state, FacetsCollector hits)
throws IOException {
this.state = state;
this.field = state.getField();
dv = state.getDocValues();
counts = new int[state.getSize()];
//System.out.println("field=" + field);
// Count, for every term, how many matching documents it occurs in
count(hits.getMatchingDocs());
}
private final void count(List<MatchingDocs> matchingDocs) throws IOException {
//System.out.println("ssdv count");
MultiDocValues.OrdinalMap ordinalMap;
// TODO: is this right? really, we need a way to
// verify that this ordinalMap "matches" the leaves in
// matchingDocs...
if (dv instanceof MultiSortedSetDocValues && matchingDocs.size() > 1) {
ordinalMap = ((MultiSortedSetDocValues) dv).mapping;
} else {
ordinalMap = null;
}
IndexReader origReader = state.getOrigReader();
for(MatchingDocs hits : matchingDocs) {
LeafReader reader = hits.context.reader();
//System.out.println(" reader=" + reader);
// LUCENE-5090: make sure the provided reader context "matches"
// the top-level reader passed to the
// SortedSetDocValuesReaderState, else cryptic
// AIOOBE can happen:
if (ReaderUtil.getTopLevelContext(hits.context).reader() != origReader) {
throw new IllegalStateException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader");
}
SortedSetDocValues segValues = reader.getSortedSetDocValues(field);
if (segValues == null) {
continue;
}
DocIdSetIterator docs = hits.bits.iterator();
// TODO: yet another option is to count all segs
// first, only in seg-ord space, and then do a
// merge-sort-PQ in the end to only "resolve to
// global" those seg ords that can compete, if we know
// we just want top K? ie, this is the same algo
// that'd be used for merging facets across shards
// (distributed faceting). but this has much higher
// temp ram req'ts (sum of number of ords across all
// segs)
if (ordinalMap != null) {
final int segOrd = hits.context.ord;
final LongValues ordMap = ordinalMap.getGlobalOrds(segOrd);
int numSegOrds = (int) segValues.getValueCount();
if (hits.totalHits < numSegOrds/10) {
//System.out.println(" remap as-we-go");
// Remap every ord to global ord as we iterate:
int doc;
while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
//System.out.println(" doc=" + doc);
segValues.setDocument(doc);
int term = (int) segValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
//System.out.println(" segOrd=" + segOrd + " ord=" + term + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, term));
counts[(int) ordMap.get(term)]++;
term = (int) segValues.nextOrd();
}
}
} else {
//System.out.println(" count in seg ord first");
// First count in seg-ord space:
final int[] segCounts = new int[numSegOrds];
int doc;
while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
//System.out.println(" doc=" + doc);
segValues.setDocument(doc);
int term = (int) segValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
//System.out.println(" ord=" + term);
segCounts[term]++;
term = (int) segValues.nextOrd();
}
}
// Then, migrate to global ords:
for(int ord=0;ord<numSegOrds;ord++) {
int count = segCounts[ord];
if (count != 0) {
//System.out.println(" migrate segOrd=" + segOrd + " ord=" + ord + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, ord));
counts[(int) ordMap.get(ord)] += count;
}
}
}
} else {
// No ord mapping (e.g., single segment index):
// just aggregate directly into counts:
int doc;
while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
// Position the iterator at this document's ord range
segValues.setDocument(doc);
// Read the first term ord
int term = (int) segValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
// The ord is valid; increment that term's count
counts[term]++;
// Read the next term ord
term = (int) segValues.nextOrd();
}
}
}
}
}
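Both branches above compute the same thing; the second one just batches the expensive seg-ord to global-ord translation. Here is a minimal standalone sketch of that "count in seg-ord space, then migrate" idea, with a plain long[] standing in for the OrdinalMap (all data hypothetical):
//Standalone counting sketch (hypothetical data)
long[] segToGlobal = {1, 4};                     // seg ord 0 -> global ord 1, seg ord 1 -> global ord 4
int[] counts = new int[5];                       // one slot per global ord
int[] segCounts = new int[segToGlobal.length];   // one slot per segment ord
int[][] ordsPerMatchingDoc = {{0}, {0, 1}, {1}}; // ords of each matching doc in this segment
// First count cheaply in seg-ord space:
for (int[] docOrds : ordsPerMatchingDoc) {
  for (int ord : docOrds) {
    segCounts[ord]++;
  }
}
// Then migrate each non-zero seg count to its global ord once:
for (int ord = 0; ord < segCounts.length; ord++) {
  if (segCounts[ord] != 0) {
    counts[(int) segToGlobal[ord]] += segCounts[ord];
  }
}
// counts is now {0, 2, 0, 0, 2}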
The construction of SortedSetDocValuesFacetCounts is therefore essentially a per-term frequency count. Let's look more closely at the structure segValues uses to find each term:
//Lucene50DocValuesProducer
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetEntry ss = sortedSets.get(field.name);
if (ss.format == SORTED_SINGLE_VALUED) {
final SortedDocValues values = getSorted(field);
return DocValues.singleton(values);
} else if (ss.format != SORTED_WITH_ADDRESSES) {
throw new AssertionError();
}
final long valueCount = binaries.get(field.name).count;
// we keep the byte[]s and list of ords on disk, these could be large
final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
final LongValues ordinals = getNumeric(ords.get(field.name));
// but the addresses to the ord stream are in RAM
final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name));
return new RandomAccessOrds() {
long startOffset;
long offset;
long endOffset;
@Override
public long nextOrd() {
if (offset == endOffset) {
return NO_MORE_ORDS;
} else {
// Within the bounds established by setDocument, read the exact term ord this document contains; ordinals corresponds to the pending ords from the previous chapter
long ord = ordinals.get(offset);
offset++;
return ord;
}
}
@Override
public void setDocument(int docID) {
/** Use the doc id to look up this document's bounds in the ords stream; ordIndex corresponds to pendingCounts/ordCounts from the previous chapter */
startOffset = offset = ordIndex.get(docID);
endOffset = ordIndex.get(docID+1L);
}
@Override
public BytesRef lookupOrd(long ord) {
// Return the stored term bytes; the ord passed in is the term id, and the lookup is again assisted by the OrdMap
return binary.get(ord);
}
@Override
public long getValueCount() {
return valueCount;
}
@Override
public long lookupTerm(BytesRef key) {
if (binary instanceof CompressedBinaryDocValues) {
return ((CompressedBinaryDocValues)binary).lookupTerm(key);
} else {
return super.lookupTerm(key);
}
}
@Override
public TermsEnum termsEnum() {
if (binary instanceof CompressedBinaryDocValues) {
return ((CompressedBinaryDocValues)binary).getTermsEnum();
} else {
return super.termsEnum();
}
}
@Override
public long ordAt(int index) {
return ordinals.get(startOffset + index);
}
@Override
public int cardinality() {
return (int) (endOffset - startOffset);
}
};
}
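Putting the RandomAccessOrds methods above together, a consumer of SortedSetDocValues typically iterates a document's ords like this (a minimal sketch; reader and docID are assumed to be a LeafReader and a document id within it, and "$facets" is the default facet field name):
//Consumer sketch
SortedSetDocValues values = reader.getSortedSetDocValues("$facets");
values.setDocument(docID);                // position [startOffset, endOffset) via ordIndex
long ord;
while ((ord = values.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
  BytesRef term = values.lookupOrd(ord);  // resolve the ord back to the stored bytes
  System.out.println(ord + " -> " + term.utf8ToString());
}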
Finally, the aggregation step:
@Override
public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
if (topN <= 0) {
throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")");
}
if (path.length > 0) {
throw new IllegalArgumentException("path should be 0 length");
}
// The ord range this dim occupies among the stored doc values
OrdRange ordRange = state.getOrdRange(dim);
if (ordRange == null) {
throw new IllegalArgumentException("dimension \"" + dim + "\" was not indexed");
}
return getDim(dim, ordRange, topN);
}
private final FacetResult getDim(String dim, OrdRange ordRange, int topN) {
TopOrdAndIntQueue q = null;
int bottomCount = 0;
int dimCount = 0;
int childCount = 0;
TopOrdAndIntQueue.OrdAndValue reuse = null;
//System.out.println("getDim : " + ordRange.start + " - " + ordRange.end);
/** Within the ord range of the user-specified dim, look at each term's count.
Since only the topN most frequent terms are returned, a min-heap is needed;
TopOrdAndIntQueue is that top-N heap. */
for(int ord=ordRange.start; ord<=ordRange.end; ord++) {
//System.out.println(" ord=" + ord + " count=" + counts[ord]);
if (counts[ord] > 0) {
dimCount += counts[ord];
childCount++;
if (counts[ord] > bottomCount) {
if (reuse == null) {
reuse = new TopOrdAndIntQueue.OrdAndValue();
}
reuse.ord = ord;
reuse.value = counts[ord];
if (q == null) {
// Lazy init, so we don't create this for the
// sparse case unnecessarily
q = new TopOrdAndIntQueue(topN);
}
reuse = q.insertWithOverflow(reuse);
if (q.size() == topN) {
bottomCount = q.top().value;
}
}
}
}
if (q == null) {
return null;
}
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
for(int i=labelValues.length-1;i>=0;i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
final BytesRef term = dv.lookupOrd(ordAndValue.ord);
String[] parts = FacetsConfig.stringToPath(term.utf8ToString());
labelValues[i] = new LabelAndValue(parts[1], ordAndValue.value);
}
return new FacetResult(dim, new String[0], dimCount, labelValues, childCount);
}
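TopOrdAndIntQueue is a small min-heap keyed on the count, which is what makes the top-N selection above cheap. The same selection can be sketched with java.util.PriorityQueue (an illustration with hypothetical counts, not the actual Lucene class):
//Top-N selection sketch (hypothetical counts)
int topN = 2;
int[] counts = {5, 0, 3, 9, 1};  // per-ord counts inside one dim's OrdRange
// Min-heap of {ord, count}, ordered by count ascending:
PriorityQueue<int[]> heap = new PriorityQueue<int[]>((a, b) -> Integer.compare(a[1], b[1]));
for (int ord = 0; ord < counts.length; ord++) {
  if (counts[ord] == 0) continue;
  heap.offer(new int[] {ord, counts[ord]});
  if (heap.size() > topN) {
    heap.poll();                 // evict the current smallest, mirroring insertWithOverflow
  }
}
// Popping yields results in ascending count order, hence the backwards fill
// into labelValues[] in getDim() above:
while (!heap.isEmpty()) {
  int[] e = heap.poll();
  System.out.println("ord=" + e[0] + " count=" + e[1]);
}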
That wraps up the analysis of both indexing and search for SortedSetDocValues.