为了解析Lucene对索引文件搜索的过程,预先为如下几个文件建立了索引:
file01.txt: apple apples cat dog
file02.txt: apple boy cat category
file03.txt: apply dog eat etc
file04.txt: apply cat foods
代码为:
IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
其实是调用了DirectoryReader.open(Directory, IndexDeletionPolicy, IndexCommit, boolean, int) 函数,其主要作用是生成一个SegmentInfos.FindSegmentsFile对象,并用它来找到此索引文件中所有的段,并打开这些段。
SegmentInfos.FindSegmentsFile.run(IndexCommit commit)主要做以下事情:
String[] files = directory.listAll(); long genA = getCurrentSegmentGeneration(files); |
long getCurrentSegmentGeneration(String[] files) { long max = -1; for (int i = 0; i < files.length; i++) { String file = files[i]; if (file.startsWith(IndexFileNames.SEGMENTS) //"segments_N" && !file.equals(IndexFileNames.SEGMENTS_GEN)) { //"segments.gen" long gen = generationFromSegmentsFileName(file); if (gen > max) { max = gen; } } } return max; } |
IndexInput genInput = directory.openInput(IndexFileNames.SEGMENTS_GEN); int version = genInput.readInt(); long gen0 = genInput.readLong(); long gen1 = genInput.readLong(); if (gen0 == gen1) { genB = gen0; } |
if (genA > genB) gen = genA; else gen = genB; String segmentFileName = IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", gen); //segmentFileName "segments_4" |
SegmentInfos infos = new SegmentInfos(); infos.read(directory, segmentFileName); |
SegmentInfos.read(Directory, String) 代码如下: int format = input.readInt(); version = input.readLong(); counter = input.readInt(); for (int i = input.readInt(); i > 0; i--) { //读出每一个段,并构造SegmentInfo对象 add(new SegmentInfo(directory, format, input)); } |
SegmentInfo(Directory dir, int format, IndexInput input)构造函数如下: name = input.readString(); docCount = input.readInt(); delGen = input.readLong(); docStoreOffset = input.readInt(); if (docStoreOffset != -1) { docStoreSegment = input.readString(); docStoreIsCompoundFile = (1 == input.readByte()); } else { docStoreSegment = name; docStoreIsCompoundFile = false; } hasSingleNormFile = (1 == input.readByte()); int numNormGen = input.readInt(); normGen = new long[numNormGen]; for(int j=0;j<numNormGen;j++) { normGen[j] = input.readLong(); } isCompoundFile = input.readByte(); delCount = input.readInt(); hasProx = input.readByte() == 1; 其实不用多介绍,看过Lucene学习总结之三:Lucene的索引文件格式 (2)一章,就很容易明白。 |
SegmentReader[] readers = new SegmentReader[sis.size()]; for (int i = sis.size()-1; i >= 0; i--) { //打开每一个段 readers[i] = SegmentReader.get(readOnly, sis.info(i), termInfosIndexDivisor); } |
SegmentReader.get(boolean, Directory, SegmentInfo, int, boolean, int) 代码如下: instance.core = new CoreReaders(dir, si, readBufferSize, termInfosIndexDivisor); instance.core.openDocStores(si); //生成用于读取存储域和词向量的对象。 instance.loadDeletedDocs(); //读取被删除文档(.del)文件 instance.openNorms(instance.core.cfsDir, readBufferSize); //读取标准化因子(.nrm) |
CoreReaders(Directory dir, SegmentInfo si, int readBufferSize, int termsIndexDivisor)构造函数代码如下: cfsReader = new CompoundFileReader(dir, segment + "." + IndexFileNames.COMPOUND_FILE_EXTENSION, readBufferSize); //读取cfs的reader fieldInfos = new FieldInfos(cfsDir, segment + "." + IndexFileNames.FIELD_INFOS_EXTENSION); //读取段元数据信息(.fnm) TermInfosReader reader = new TermInfosReader(cfsDir, segment, fieldInfos, readBufferSize, termsIndexDivisor); //用于读取词典信息(.tii .tis) freqStream = cfsDir.openInput(segment + "." + IndexFileNames.FREQ_EXTENSION, readBufferSize); //用于读取freq proxStream = cfsDir.openInput(segment + "." + IndexFileNames.PROX_EXTENSION, readBufferSize); //用于读取prox |
FieldInfos(Directory d, String name)构造函数如下: IndexInput input = d.openInput(name); int firstInt = input.readVInt(); size = input.readVInt(); for (int i = 0; i < size; i++) { //读取域名 String name = StringHelper.intern(input.readString()); //读取域的各种标志位 byte bits = input.readByte(); boolean isIndexed = (bits & IS_INDEXED) != 0; boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0; boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; boolean omitTermFreqAndPositions = (bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0; //将读出的域生成FieldInfo对象,加入fieldinfos进行管理 addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); } |
CoreReaders.openDocStores(SegmentInfo)主要代码如下: fieldsReaderOrig = new FieldsReader(storeDir, storesSegment, fieldInfos, readBufferSize, si.getDocStoreOffset(), si.docCount); //用于读取存储域(.fdx, .fdt) termVectorsReaderOrig = new TermVectorsReader(storeDir, storesSegment, fieldInfos, readBufferSize, si.getDocStoreOffset(), si.docCount); //用于读取词向量(.tvx, .tvd, .tvf) |
在Lucene中,每个段中的文档编号都是从0开始的,而一个索引有多个段,需要重新进行编号,于是维护数组starts[],来保存每个段的文档号的偏移量,从而第i个段的文档号是从starts[i]至starts[i+1]-1(即starts[i]加上该段的maxDoc()再减1)。代码如下: private void initialize(SegmentReader[] subReaders) { this.subReaders = subReaders; starts = new int[subReaders.length + 1]; for (int i = 0; i < subReaders.length; i++) { starts[i] = maxDoc; maxDoc += subReaders[i].maxDoc(); if (subReaders[i].hasDeletions()) hasDeletions = true; } starts[subReaders.length] = maxDoc; } |
reader ReadOnlyDirectoryReader (id=466) //索引文件夹 //段元数据信息 //每个段的Reader |
从上面的过程来看,IndexReader有以下几个特性:
代码为:
IndexSearcher searcher = new IndexSearcher(reader);
其过程非常简单:
private IndexSearcher(IndexReader r, boolean closeReader) { reader = r; //当关闭searcher的时候,是否关闭其reader this.closeReader = closeReader; //对文档号进行编号 List<IndexReader> subReadersList = new ArrayList<IndexReader>(); gatherSubReaders(subReadersList, reader); subReaders = subReadersList.toArray(new IndexReader[subReadersList.size()]); docStarts = new int[subReaders.length]; int maxDoc = 0; for (int i = 0; i < subReaders.length; i++) { docStarts[i] = maxDoc; maxDoc += subReaders[i].maxDoc(); } } |
IndexSearcher表面上看起来好像仅仅是reader的一个封装,它的很多函数都是直接调用reader的相应函数,如:int docFreq(Term term),Document doc(int i),int maxDoc()。然而它提供了两个非常重要的函数:
因而在某些应用之中,只想得到某个词的倒排表的时候,最好不要用IndexSearcher,而直接用IndexReader.termDocs(Term term),则省去了打分的计算。