Clucene在IndexWriter::addDocument中对每个加入的文档,都会得到新的段名,生成一个新的段信息,并在segmentInfos数组里面加入这个新段信息;当达到合并阈值时触发段的合并操作
(1).void IndexWriter::maybeMergeSegments()方法
//检查是否合并段 void IndexWriter::maybeMergeSegments() { //初次建立时每个段就会有一个文档 int64_t targetMergeDocs = minMergeDocs; //默认值是 // find segments smaller than current target size // 不能超过最大的合并文档个数 while (targetMergeDocs <= maxMergeDocs) { int32_t minSegment = segmentInfos->size(); int32_t mergeDocs = 0; //从后面开始做循环,合并后面到前面的段里的文档 //(1).比如说有一个段已经合并了,有篇文档,在新来一篇文档,也就是一个新段,--minSegment后,因为合并的段 // 的文档数已经等于targetMergeDocs,循环直接退出,mergeDocs的值并没有增加,因此并不会触发合并; //(2).新来的篇文档,合并为一个新段;重复做,一直到有个新段出现,另外有个单独的段索引,此时新添加一个 // 文档,这篇文档,合并为一个新段,然后合并因子为,targetMergeDocs *= mergeFactor;===>100, // 然后处理这个段,每个段有篇文档,重新触发合并的操作,依次类推 while (--minSegment >= 0) { SegmentInfo* si = segmentInfos->info(minSegment); if (si->docCount >= targetMergeDocs) { break; } mergeDocs += si->docCount; } if (mergeDocs >= targetMergeDocs) { // 超过设置的最大buf内缓存文档数目,开始合并 mergeSegments(minSegment+1); } else { break; } //increase target size:在用乘法做mergeFactor:合并因子 targetMergeDocs *= mergeFactor; } } |
(2). void IndexWriter::mergeSegments()方法
void IndexWriter::mergeSegments(const uint32_t minSegment, const uint32_t end) { CLVector<SegmentReader*> segmentsToDelete(false); const char* mergedName = newSegmentName(); //段合并管理类 SegmentMerger merger(this, mergedName); //将要合并的段添加到segmentsToDelete中 for (size_t i = minSegment; i < end; i++) { SegmentInfo* si = segmentInfos->info(i); //为每一个段的信息生成一个段阅读器 SegmentReader* reader = _CLNEW SegmentReader(si); merger.add(reader); if ((reader->getDirectory() == this->directory) || // if we own the directory (reader->getDirectory() == this->ramDirectory)) { segmentsToDelete.push_back(reader); // queue segment for deletion } } //执行实际的段索引的合并操作 int32_t mergedDocCount = merger.merge(); #ifdef _CL_DEBUG_INFO fprintf(_CL_DEBUG_INFO,"/n into %s (%d docs)/n",mergedName, mergedDocCount); #endif //删除旧段索引信息 segmentInfos->clearto(minSegment);// remove old infos & add new //添加新的段索引信息 segmentInfos->add( _CLNEW SegmentInfo(mergedName, mergedDocCount, directory) ); merger.closeReaders(); LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME); LockWith2 with ( lock, commitLockTimeout,this, &segmentsToDelete, true );
{ SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync with.run(); } _CLDELETE( lock ); if (useCompoundFile) { char cmpdTmpName[CL_MAX_PATH]; strcpy(cmpdTmpName,mergedName); strcat(cmpdTmpName,".tmp"); AStringArrayWithDeletor filesToDelete; merger.createCompoundFile(cmpdTmpName, filesToDelete); LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME); LockWithCFS with ( lock,commitLockTimeout,directory, this, mergedName, &filesToDelete); { SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync with.run(); } _CLDELETE(lock); } _CLDELETE_CaARRAY( mergedName ); } |
(3). int32_t SegmentMerger::merge()方法
//执行实际的合并操作 int32_t SegmentMerger::merge() { int32_t value = mergeFields(); mergeTerms(); mergeNorms(); if (fieldInfos->hasVectors()) { mergeVectors(); } return value; } |
(4). int32_t SegmentMerger::mergeFields()方法
//Merge the stored-field data of all source readers into the new segment.
//Builds the merged FieldInfos (registering each field under the strongest
//term-vector flags any segment declared for it), writes the .fnm file, then
//copies every non-deleted document's stored fields via a FieldsWriter.
//Returns the number of documents written to the merged segment.
int32_t SegmentMerger::mergeFields() {
  //create the merged field-info table
  fieldInfos = _CLNEW FieldInfos(); // merge field names
  //Condition check to see if fieldInfos points to a valid instance
  CND_CONDITION(fieldInfos != NULL,"Memory allocation for fieldInfos failed");
  IndexReader* reader = NULL;
  int32_t docCount = 0;
  //Iterate through all readers
  for (uint32_t i = 0; i < readers.size(); i++){
    //get the i-th reader
    reader = readers[i];
    //Condition check to see if reader points to a valid instance
    CND_CONDITION(reader != NULL,"No IndexReader found");
    StringArrayWithDeletor tmp;

    //SegmentReader::getFieldNames() yields the field names per category;
    //register each group with the matching (vector, position, offset) flags
    tmp.clear();
    reader->getFieldNames(IndexReader::TERMVECTOR_WITH_POSITION_OFFSET, tmp);
    addIndexed(reader, fieldInfos, tmp, true, true, true);

    tmp.clear();
    reader->getFieldNames(IndexReader::TERMVECTOR_WITH_POSITION, tmp);
    addIndexed(reader, fieldInfos, tmp, true, true, false);

    tmp.clear();
    reader->getFieldNames(IndexReader::TERMVECTOR_WITH_OFFSET, tmp);
    addIndexed(reader, fieldInfos, tmp, true, false, true);

    tmp.clear();
    reader->getFieldNames(IndexReader::TERMVECTOR, tmp);
    addIndexed(reader, fieldInfos, tmp, true, false, false);

    tmp.clear();
    reader->getFieldNames(IndexReader::INDEXED, tmp);
    addIndexed(reader, fieldInfos, tmp, false, false, false);

    tmp.clear();
    reader->getFieldNames(IndexReader::UNINDEXED, tmp);
    if ( tmp.size() > 0 ) {
      TCHAR** arr = _CL_NEWARRAY(TCHAR*,tmp.size()+1);
      tmp.toArray(arr);
      fieldInfos->add((const TCHAR**)arr, false);
      _CLDELETE_ARRAY(arr); //no need to delete the contents, since tmp is responsible for it
    }
  }
  //write the merged field infos to the new segment's .fnm file
  const char* buf = Misc::segmentname(segment,".fnm");
  fieldInfos->write(directory, buf );
  _CLDELETE_CaARRAY(buf);

  FieldsWriter* fieldsWriter = _CLNEW FieldsWriter(directory, segment, fieldInfos);
  CND_CONDITION(fieldsWriter != NULL,"Memory allocation for fieldsWriter failed");
  try {
    //BUGFIX: the original redeclared 'IndexReader* reader' and a dead
    //'int32_t maxDoc = 0' here, shadowing the variables above; the
    //shadowed/unused copies are removed.
    //Iterate over every SegmentReader and copy its live documents
    for (uint32_t i = 0; i < readers.size(); i++) {
      reader = readers[i];
      CND_CONDITION(reader != NULL, "No IndexReader found");
      //number of document slots (including deleted ones) in this segment
      int32_t maxDoc = reader->maxDoc();
      //document buffer, re-used across iterations
      Document doc;
      //Iterate through all the documents managed by the current reader
      for (int32_t j = 0; j < maxDoc; j++){
        //Check if the j-th document has been deleted, if so skip it
        if (!reader->isDeleted(j)){
          //SegmentReader::document() fills 'doc' via its FieldsReader
          if ( reader->document(j, &doc) ){
            //Add the document to the new FieldsWriter
            fieldsWriter->addDocument( &doc );
            docCount++;
            //doc is cleared for re-use
            doc.clear();
          }
        }
      }
    }
  }_CLFINALLY(
    fieldsWriter->close();
    _CLDELETE( fieldsWriter );
  );
  //return the number of documents in the merged segment
  return docCount;
}
(5). void SegmentReader::initialize()方法
//Initialize this SegmentReader over segment 'si': open the (possibly
//compound) segment files and set up the field infos, stored-field reader,
//frequency/position streams, term dictionary, deleted-docs bitset, norms
//and (when present) the term-vector reader.
void SegmentReader::initialize(SegmentInfo* si) {
  deletedDocs = NULL;
  ones = NULL;
  deletedDocsDirty = false;
  normsDirty=false;
  undeleteAll=false;
  //name of the segment this reader serves
  segment = STRDUP_AtoA(si->name);
  //frequency (.frq) and position (.prx) input streams, opened below
  freqStream = NULL;
  proxStream = NULL;
  //instantiate a buffer large enough to hold a directory path
  char buf[CL_MAX_PATH];
  // Use compound file directory for some files, if it exists
  //getDirectory() simply returns the directory handed to the constructor
  Directory* cfsDir = getDirectory();
  SegmentName(buf, CL_MAX_PATH, ".cfs");
  if (cfsDir->fileExists(buf)) {
    //segment is stored as a compound file: route reads through its reader
    cfsReader = _CLNEW CompoundFileReader(cfsDir, buf);
    cfsDir = cfsReader;
  }else
    cfsReader = NULL;
  //Create the name of the field info file with suffix .fnm in buf
  SegmentName(buf, CL_MAX_PATH, ".fnm");
  //the FieldInfos constructor opens the stream and re-reads the field table
  fieldInfos = _CLNEW FieldInfos(cfsDir, buf );
  //Condition check to see if fieldInfos points to a valid instance
  CND_CONDITION(fieldInfos != NULL,"No memory could be allocated for fieldInfos");
  //Create the name of the frequence file with suffix .frq in buf
  SegmentName(buf,CL_MAX_PATH, ".frq");
  //Open an IndexInput freqStream to the frequency file
#ifdef LUCENE_FS_MMAP
  if ( cfsDir->getDirectoryType() == FSDirectory::DirectoryType() ){
    //plain FSDirectory: memory-map the frequency file
    FSDirectory* fsdir = (FSDirectory*)cfsDir;
    freqStream = fsdir->openMMapFile( buf );
  } else if (strcmp(cfsDir->getDirectoryType(), "CFS") == 0) {
    //todo: we should have a CFS Directory
    freqStream = cfsDir->openInput(buf,true);
  }else
#endif
  //frequency input stream (non-MMAP fallback)
  freqStream = cfsDir->openInput( buf );
  //Condition check to see if freqStream points to a valid instance and was able to open the
  //frequency file
  CND_CONDITION(freqStream != NULL, "IndexInput freqStream could not open the frequency file");
  //Create the name of the prox file with suffix .prx in buf
  SegmentName(buf, CL_MAX_PATH,".prx");
  //Open an IndexInput proxStream to the prox file
#ifdef LUCENE_FS_MMAP
  if (cfsDir->getDirectoryType() == FSDirectory::DirectoryType()) {
    FSDirectory* fsdir = (FSDirectory*)cfsDir;
    proxStream = fsdir->openMMapFile( buf );
  } else if (strcmp(cfsDir->getDirectoryType(), "CFS") == 0) {
    proxStream = cfsDir->openInput(buf,true);
  } else
#endif
  //position input stream (non-MMAP fallback)
  proxStream = cfsDir->openInput( buf );
  //Condition check to see if proxStream points to a valid instance and was able to open the
  //prox file
  CND_CONDITION(proxStream != NULL, "IndexInput proxStream could not open proximity file");
  //Instantiate a FieldsReader for reading the stored fields
  fieldsReader = _CLNEW FieldsReader(cfsDir, segment, fieldInfos);
  CND_CONDITION(fieldsReader != NULL,"No memory could be allocated for fieldsReader");
  //reader for the term dictionary
  tis = _CLNEW TermInfosReader(cfsDir, segment, fieldInfos);
  //Condition check to see if tis points to a valid instance
  CND_CONDITION(tis != NULL,"No memory could be allocated for tis");
  //Check if the segment has deletions according to the SegmentInfo instance si
  // NOTE: the bitvector is stored using the regular directory, not cfs
  if (hasDeletions(si)){
    //Create a deletion file with suffix .del
    SegmentName(buf, CL_MAX_PATH,".del");
    //Instantiate a BitVector that manages which documents have been deleted
    deletedDocs = _CLNEW BitSet(getDirectory(), buf );
  }
  openNorms(cfsDir);
  //reader for term-vector positions/offsets, only when any field has vectors
  if (fieldInfos->hasVectors()) { // open term vector files only as needed
    termVectorsReaderOrig = _CLNEW TermVectorsReader(cfsDir, segment, fieldInfos);
  }else
    termVectorsReaderOrig = NULL;
}
|
(6). void SegmentMerger::mergeTerms()方法
//Merge the term dictionaries (inverted index) of all source segments.
//Opens the new segment's .frq and .prx outputs plus a TermInfosWriter and a
//merge priority queue, then delegates the actual work to mergeTermInfos();
//the _CLFINALLY clause guarantees all outputs and the queue are closed and
//freed even when an exception is thrown.
void SegmentMerger::mergeTerms() {
  CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");
  try{
    //create a filename for the new Frequency File for segment
    const char* buf = Misc::segmentname(segment,".frq");
    //Open an IndexOutput to the new Frequency File
    freqOutput = directory->createOutput( buf );
    //Destroy the buffer of the filename
    _CLDELETE_CaARRAY(buf);
    //create a filename for the new Prox File for segment
    buf = Misc::segmentname(segment,".prx");
    //Open an IndexOutput to the new Prox File
    proxOutput = directory->createOutput( buf );
    //delete buffer
    _CLDELETE_CaARRAY( buf );
    //Instantiate a new termInfosWriter which will write in directory
    //for the segment name segment using the new merged fieldInfos
    termInfosWriter = _CLNEW TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
    //Condition check to see if termInfosWriter points to a valid instance
    CND_CONDITION(termInfosWriter != NULL,"Memory allocation for termInfosWriter failed") ;
    //skip interval as configured by the term-infos writer
    skipInterval = termInfosWriter->skipInterval;
    //priority queue ordering the per-segment term enumerations
    queue = _CLNEW SegmentMergeQueue(readers.size());
    //And merge the Term Infos
    mergeTermInfos();
  }_CLFINALLY(
    if (freqOutput != NULL)      { freqOutput->close(); _CLDELETE(freqOutput); }
    if (proxOutput != NULL)      { proxOutput->close(); _CLDELETE(proxOutput); }
    if (termInfosWriter != NULL) { termInfosWriter->close(); _CLDELETE(termInfosWriter); }
    if (queue != NULL)           { queue->close(); _CLDELETE(queue);}
  );
}
(7). void SegmentMerger::mergeTermInfos()方法
//Merge the term dictionaries term by term. Algorithm:
//(1) put each segment's first term (via its TermEnum) into a priority queue;
//(2) pop the smallest term, then keep popping while the queue's top carries
//    an equal term, collecting the matches into the 'match' array;
//(3) merge the postings of all matching entries (mergeTermInfo);
//(4) advance each matched enumeration to its next term and re-insert it into
//    the queue (the matched entries were already popped off the queue);
//(5) repeat until the queue is empty.
void SegmentMerger::mergeTermInfos() {
  CND_CONDITION(queue != NULL, "Memory allocation for queue failed");
  //base is the id of the first document in a segment
  int32_t base = 0;
  IndexReader* reader = NULL;
  SegmentMergeInfo* smi = NULL;
  //iterate through all the readers:
  //seed the priority queue with the first term of every segment
  for (uint32_t i = 0; i < readers.size(); i++) {
    //Get the i-th reader
    reader = readers[i];
    //Condition check to see if reader points to a valid instance
    CND_CONDITION(reader != NULL, "No IndexReader found");
    //Get the term enumeration of the reader
    TermEnum* termEnum = reader->terms();
    //Instantiate a new SegmentMergeInfo for the current reader and enumeration
    //(reader is actually a SegmentReader, a subclass of IndexReader)
    smi = _CLNEW SegmentMergeInfo(base, termEnum, reader);
    //Condition check to see if smi points to a valid instance
    CND_CONDITION(smi != NULL, "Memory allocation for smi failed") ;
    //Increase the base by the number of documents that have not been marked deleted
    //so base will contain a new value for the first document of the next iteration
    base += reader->numDocs();
    //Get the next current term
    //after next(), smi->term holds the enumeration's current term pointer
    if (smi->next()){
      //Store the SegmentMergeInfo smi with the initialized SegmentTermEnum TermEnum
      //into the queue
      queue->put(smi);
    }else{
      //Apparently the end of the TermEnum of the SegmentTerm has been reached so
      //close the SegmentMergeInfo smi
      smi->close();
      //And destroy the instance and set smi to NULL (It will be used later in this method)
      _CLDELETE(smi);
    }
  }
  //Instantiate an array of SegmentMergeInfo instances called match
  SegmentMergeInfo** match = _CL_NEWARRAY(SegmentMergeInfo*,readers.size()+1);
  //Condition check to see if match points to a valid instance
  CND_CONDITION(match != NULL, "Memory allocation for match failed") ;
  SegmentMergeInfo* top = NULL;
  //As long as there are SegmentMergeInfo instances stored in the queue
  //(each entry is positioned on the current term of one segment's .tis file)
  while (queue->size() > 0) {
    int32_t matchSize = 0;
    // pop matching terms
    //Pop the first SegmentMergeInfo from the queue
    match[matchSize++] = queue->pop();
    //Get the Term of match[0]
    Term* term = match[0]->term;
    //Condition check to see if term points to a valid instance
    CND_CONDITION(term != NULL,"term is NULL") ;
    //Get the current top of the queue
    top = queue->top();
    //For each SegmentMergeInfo still in the queue:
    //while its term equals the popped one, collect it into 'match'; every
    //entry in 'match' carries the same term, and matchSize is the number of
    //segments in which this term occurs
    while (top != NULL && term->equals(top->term) ) {
      //A match has been found so add the matching SegmentMergeInfo to the match array
      match[matchSize++] = queue->pop();
      //Get the next SegmentMergeInfo
      top = queue->top();
    }
    match[matchSize]=NULL;
    //add new TermInfo
    mergeTermInfo(match); //matchSize
    //Restore the SegmentTermInfo instances in the match array back into the queue
    while (matchSize > 0){
      smi = match[--matchSize];
      //Condition check to see if smi points to a valid instance
      CND_CONDITION(smi != NULL,"smi is NULL") ;
      //Move to the next term in the enumeration of SegmentMergeInfo smi
      if (smi->next()){
        //There still are some terms so restore smi in the queue
        queue->put(smi);
      }else{
        //Done with a segment
        //No terms anymore so close this SegmentMergeInfo instance
        smi->close();
        _CLDELETE( smi );
      }
    }
  }
  _CLDELETE_ARRAY(match);
}
(8). void SegmentMerger::mergeTermInfo()方法
//合并段中相同的词条 void SegmentMerger::mergeTermInfo( SegmentMergeInfo** smis) { CND_PRECONDITION(smis != NULL, "smis is NULL"); CND_PRECONDITION(freqOutput != NULL, "freqOutput is NULL"); CND_PRECONDITION(proxOutput != NULL, "proxOutput is NULL"); //Get the file pointer of the IndexOutput to the Frequency File int64_t freqPointer = freqOutput->getFilePointer(); //Get the file pointer of the IndexOutput to the Prox File int64_t proxPointer = proxOutput->getFilePointer(); //Process postings from multiple segments all positioned on the same term. int32_t df = appendPostings(smis); int64_t skipPointer = writeSkip(); //df contains the number of documents across all segments where this term was found if (df > 0) { //add an entry to the dictionary with pointers to prox and freq files termInfo.set(df, freqPointer, proxPointer, (int32_t)(skipPointer - freqPointer)); //Precondition check for to be sure that the reference to //smis[0]->term will be valid CND_PRECONDITION(smis[0]->term != NULL, "smis[0]->term is NULL"); //Write a new TermInfo termInfosWriter->add(smis[0]->term, &termInfo); } } |
(9). int32_t SegmentMerger::appendPostings()方法
//Append the postings (doc ids, frequencies and positions) of one term from
//all matching segments to the new .frq/.prx outputs, remapping doc ids
//around deletions and shifting them into the merged doc-id space.
//Returns the term's total document frequency across all segments.
int32_t SegmentMerger::appendPostings(SegmentMergeInfo** smis) {
  CND_PRECONDITION(smis != NULL, "smis is NULL");
  CND_PRECONDITION(freqOutput != NULL, "freqOutput is NULL");
  CND_PRECONDITION(proxOutput != NULL, "proxOutput is NULL");
  int32_t lastDoc = 0;
  int32_t df = 0; //document counter: doc freq accumulated over all segments
  resetSkip();
  SegmentMergeInfo* smi = NULL;
  //Iterate through all SegmentMergeInfo instances in smis
  int32_t i = 0;
  while ( (smi=smis[i]) != NULL ){
    //Get the i-th SegmentMergeInfo
    //Condition check to see if smi points to a valid instance
    CND_PRECONDITION(smi!=NULL," is NULL");
    //Get the term positions
    TermPositions* postings = smi->getPositions();
    //Get the base of this segment
    int32_t base = smi->base;
    //Get the docMap so we can see which documents have been deleted
    int32_t* docMap = smi->getDocMap();
    //Seek the termpost
    postings->seek(smi->termEnum);
    while (postings->next()) {
      int32_t doc = postings->doc();
      //Check if there are deletions
      //remap this doc id around documents deleted in its source segment
      if (docMap != NULL) {
        doc=docMap[doc]; // map around deletions
      }
      doc+= base; // convert to merged space
      //Condition check to see doc is eaqual to or bigger than lastDoc
      CND_CONDITION(doc >= lastDoc,"docs out of order");
      //Increase the total frequency over all segments
      df++;
      //every skipInterval docs, buffer a skip-list entry
      if ((df % skipInterval) == 0) {
        bufferSkip(lastDoc);
      }
      //Calculate a new docCode
      //use low bit to flag freq=1
      int32_t docCode = (doc - lastDoc) << 1;
      lastDoc = doc;
      //Get the frequency of the Term
      int32_t freq = postings->freq();
      if (freq == 1){
        //write doc & freq=1
        freqOutput->writeVInt(docCode | 1);
      }else{
        //write doc
        freqOutput->writeVInt(docCode);
        //write frequency in doc
        freqOutput->writeVInt(freq);
      }
      int32_t lastPosition = 0; // write position deltas
      for (int32_t j = 0; j < freq; j++) {
        //Get the next position (position iterator over this doc's postings)
        int32_t position = postings->nextPosition();
        //Write the difference between position and the last position
        proxOutput->writeVInt(position - lastPosition);
        lastPosition = position;
      }
    }
    i++;
  }
  //Return total number of documents across all segments where term was found
  return df;
}