An Anatomy of CLucene Index Merging

  In CLucene's IndexWriter::addDocument, every added document gets a fresh segment name and a new SegmentInfo, which is appended to the segmentInfos list. Once the merge threshold is reached, a segment merge is triggered.
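
  To fix the overall picture before reading the code, here is a minimal standalone sketch of this flow. The names (SegmentInfo, segmentInfos, addDocumentModel) are illustrative stand-ins rather than CLucene's real API; the only point is that every added document is first flushed as its own one-document segment, and the merge check runs afterwards.

#include <string>
#include <vector>

// Toy stand-ins for CLucene's SegmentInfo / segmentInfos (names are illustrative).
struct SegmentInfo { std::string name; int docCount; };
std::vector<SegmentInfo> segmentInfos;
int segmentCounter = 0;

std::string newSegmentName() { return "_" + std::to_string(segmentCounter++); }

// Conceptual shape of addDocument(): flush the document as a fresh 1-doc segment,
// register it, then check whether a merge is due (method (1) below).
void addDocumentModel(/* const Document& doc */) {
    segmentInfos.push_back({newSegmentName(), /*docCount=*/1});
    // maybeMergeSegments();
}

int main() {
    for (int i = 0; i < 3; ++i) addDocumentModel();
    return 0;   // segmentInfos now holds three 1-doc segments: _0, _1, _2
}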

  (1). void IndexWriter::maybeMergeSegments() method

//Check whether segments need to be merged

void IndexWriter::maybeMergeSegments()

{

    //Right after indexing, each freshly flushed segment holds a single document

    int64_t targetMergeDocs = minMergeDocs;  // starts at minMergeDocs (10 by default)

    // find segments smaller than current target size

    // never merge more than maxMergeDocs documents at once

    while (targetMergeDocs <= maxMergeDocs)

    {

        int32_t minSegment = segmentInfos->size();

        int32_t mergeDocs = 0;

        //Walk the segments from the back, accumulating the documents of the trailing small segments

        //(1). Example: one merged segment of 10 documents already exists and a new document arrives as

        //     a new 1-doc segment. --minSegment stops at that merged segment, whose docCount already

        //     equals targetMergeDocs, so the inner loop breaks; mergeDocs stays below the target and

        //     no merge is triggered.

        //(2). Every 10 newly added documents are merged into one new 10-doc segment. Once ten such

        //     10-doc segments exist, the target is raised (targetMergeDocs *= mergeFactor ===> 100)

        //     and those 100 documents are merged into a single 100-doc segment, and so on, level by level

        while (--minSegment >= 0)

        {

            SegmentInfo* si = segmentInfos->info(minSegment);

            if (si->docCount >= targetMergeDocs)

            {

                break;

            }

            mergeDocs += si->docCount;

        }

        if (mergeDocs >= targetMergeDocs)

        {

            // Enough trailing small segments have accumulated for this level; merge them into one

            mergeSegments(minSegment+1);

        }

        else

        {

            break;

        }

        //increase target size: multiply by mergeFactor for the next merge level

        targetMergeDocs *= mergeFactor;

    }

}
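
  To see when mergeSegments(minSegment+1) actually fires, the loop can be replayed outside CLucene. The sketch below is a self-contained simulation, not library code: it uses a toy SegmentInfo list and assumes the usual defaults minMergeDocs = 10 and mergeFactor = 10 (consistent with the targetMergeDocs *= mergeFactor ===> 100 step noted above). Adding 100 documents one at a time prints a merge into a 10-doc segment after every 10 buffered documents and a second-level merge into a single 100-doc segment at document 100.

#include <cstdio>
#include <vector>

struct SegmentInfo { int docCount; };            // toy stand-in for CLucene's SegmentInfo

static std::vector<SegmentInfo> segmentInfos;
static const int minMergeDocs = 10;              // assumed default
static const int maxMergeDocs = 0x7FFFFFFF;
static const int mergeFactor  = 10;              // assumed default

// Stand-in for the real merge: collapse segments [minSegment, end) into one segment.
static void mergeSegments(size_t minSegment) {
    int merged = 0;
    for (size_t i = minSegment; i < segmentInfos.size(); ++i) merged += segmentInfos[i].docCount;
    segmentInfos.resize(minSegment);
    segmentInfos.push_back({merged});
    std::printf("  merged into a %d-doc segment, %zu segment(s) left\n", merged, segmentInfos.size());
}

// Same control flow as IndexWriter::maybeMergeSegments() above.
static void maybeMergeSegments() {
    long long targetMergeDocs = minMergeDocs;
    while (targetMergeDocs <= maxMergeDocs) {
        int minSegment = (int)segmentInfos.size();
        int mergeDocs = 0;
        while (--minSegment >= 0) {
            if (segmentInfos[minSegment].docCount >= targetMergeDocs) break;
            mergeDocs += segmentInfos[minSegment].docCount;
        }
        if (mergeDocs >= targetMergeDocs) mergeSegments(minSegment + 1);
        else break;
        targetMergeDocs *= mergeFactor;
    }
}

int main() {
    for (int doc = 1; doc <= 100; ++doc) {
        segmentInfos.push_back({1});             // addDocument: one new 1-doc segment
        maybeMergeSegments();
    }
    return 0;
}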

  (2). void IndexWriter::mergeSegments() method

void IndexWriter::mergeSegments(const uint32_t minSegment, const uint32_t end)

{

    CLVector<SegmentReader*> segmentsToDelete(false);

    const char* mergedName = newSegmentName();

    //SegmentMerger drives the actual merge into the new segment mergedName

    SegmentMerger merger(this, mergedName);

    //Add a reader for each segment to the merger; readers over our own directories are queued for deletion

    for (size_t i = minSegment; i < end; i++)

    {

        SegmentInfo* si = segmentInfos->info(i);

        //Open a SegmentReader over each segment's info

        SegmentReader* reader = _CLNEW SegmentReader(si);

        merger.add(reader);

        if ((reader->getDirectory() == this->directory) || // if we own the directory

            (reader->getDirectory() == this->ramDirectory))

        {

                segmentsToDelete.push_back(reader);   // queue segment for deletion

        }

    }

    //Perform the actual merge of the segment indexes

    int32_t mergedDocCount = merger.merge();

#ifdef _CL_DEBUG_INFO

    fprintf(_CL_DEBUG_INFO,"\n into %s (%d docs)\n",mergedName, mergedDocCount);

#endif

    //Drop the infos of the segments that have just been merged

    segmentInfos->clearto(minSegment);// remove old infos & add new

    //Register the info of the newly merged segment

    segmentInfos->add( _CLNEW SegmentInfo(mergedName, mergedDocCount, directory) );

    merger.closeReaders();

    LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME);

    LockWith2 with ( lock, commitLockTimeout,this, &segmentsToDelete, true );

 

    {

        SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync

            with.run();

    }

    _CLDELETE( lock );

    if (useCompoundFile)

    {

        char cmpdTmpName[CL_MAX_PATH];

        strcpy(cmpdTmpName,mergedName);

        strcat(cmpdTmpName,".tmp");

        AStringArrayWithDeletor filesToDelete;

        merger.createCompoundFile(cmpdTmpName, filesToDelete);

        LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME);

        LockWithCFS with ( lock,commitLockTimeout,directory, this, mergedName, &filesToDelete);

        {

            SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync

                with.run();

        }

        _CLDELETE(lock);

    }

    _CLDELETE_CaARRAY( mergedName );

}
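
  Both locked sections follow the same pattern: prepare the new data without holding the commit lock, then publish it while holding the lock. In the classic Lucene design that CLucene follows, LockWith2 rewrites the segments file and deletes the replaced segment files, and LockWithCFS renames the temporary compound file to <segment>.cfs and deletes the files it absorbed. The sketch below is only a simplified, self-contained model of that publish step (std::mutex instead of a lock file, std::filesystem instead of Directory); none of it is CLucene code.

#include <filesystem>
#include <fstream>
#include <mutex>
#include <string>
#include <vector>

namespace fs = std::filesystem;

static std::mutex commitLock;   // stand-in for the COMMIT_LOCK lock file

// Publish an already-built compound file: rename the .tmp and drop the files it replaced.
void commitCompound(const fs::path& dir, const std::string& mergedName,
                    const std::vector<std::string>& filesToDelete) {
    fs::path tmp = dir / (mergedName + ".tmp");
    fs::path cfs = dir / (mergedName + ".cfs");
    std::lock_guard<std::mutex> guard(commitLock);   // roughly SCOPED_LOCK_MUTEX + LockWithCFS::run()
    fs::rename(tmp, cfs);                            // make the compound file visible
    for (const std::string& f : filesToDelete)       // remove the now-redundant per-segment files
        fs::remove(dir / f);
}

int main() {
    fs::path dir = fs::temp_directory_path() / "clucene_merge_demo";
    fs::create_directories(dir);
    std::ofstream(dir / "_3.tmp") << "compound data";
    std::ofstream(dir / "_3.frq") << "old postings";
    commitCompound(dir, "_3", {"_3.frq"});
    return 0;
}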

  (3). int32_t SegmentMerger::merge() method

//Perform the actual merge

int32_t SegmentMerger::merge()

{

    int32_t value = mergeFields();

    mergeTerms();

    mergeNorms();

    if (fieldInfos->hasVectors())

    {

        mergeVectors();

    }

    return value;

}

  (4). int32_t SegmentMerger::mergeFields() method

int32_t SegmentMerger::mergeFields()

{

    //Build the merged FieldInfos

    fieldInfos = _CLNEW FieldInfos();         // merge field names

    //Condition check to see if fieldInfos points to a valid instance

    CND_CONDITION(fieldInfos != NULL,"Memory allocation for fieldInfos failed");

    IndexReader* reader = NULL;

    int32_t docCount = 0;

    //Iterate through all readers

    for (uint32_t i = 0; i < readers.size(); i++){

        //get the i-th reader

        reader = readers[i];

        //Condition check to see if reader points to a valid instance

        CND_CONDITION(reader != NULL,"No IndexReader found");

        StringArrayWithDeletor tmp;

        tmp.clear();

        //Calls the segment reader's getFieldNames(); SegmentReader is a subclass of IndexReader

        reader->getFieldNames(IndexReader::TERMVECTOR_WITH_POSITION_OFFSET, tmp);

        addIndexed(reader, fieldInfos, tmp, true, true, true);

        tmp.clear();

        reader->getFieldNames(IndexReader::TERMVECTOR_WITH_POSITION, tmp);

        addIndexed(reader, fieldInfos, tmp, true, true, false);

        tmp.clear();

        reader->getFieldNames(IndexReader::TERMVECTOR_WITH_OFFSET, tmp);

        addIndexed(reader, fieldInfos, tmp, true, false, true);

        tmp.clear();

        reader->getFieldNames(IndexReader::TERMVECTOR, tmp);

        addIndexed(reader, fieldInfos, tmp, true, false, false);

        tmp.clear();

        reader->getFieldNames(IndexReader::INDEXED, tmp);

        addIndexed(reader, fieldInfos, tmp, false, false, false);

        tmp.clear();

        reader->getFieldNames(IndexReader::UNINDEXED, tmp);

        if ( tmp.size() > 0 )

        {

            TCHAR** arr = _CL_NEWARRAY(TCHAR*,tmp.size()+1);

            tmp.toArray(arr);

            fieldInfos->add((const TCHAR**)arr, false);

            _CLDELETE_ARRAY(arr); //no need to delete the contents, since tmp is responsible for it

        }

    }

    const char* buf = Misc::segmentname(segment,".fnm");

    //Write out the merged field infos (.fnm)

    fieldInfos->write(directory, buf );

    _CLDELETE_CaARRAY(buf);

    FieldsWriter* fieldsWriter = _CLNEW FieldsWriter(directory, segment, fieldInfos);

    CND_CONDITION(fieldsWriter != NULL,"Memory allocation for fieldsWriter failed");

    try

    { 

        IndexReader* reader = NULL;

        int32_t maxDoc          = 0;

        //Iterate over the SegmentReader instances

        for (uint32_t i = 0; i < readers.size(); i++) {

            reader = readers[i];

            CND_CONDITION(reader != NULL, "No IndexReader found");

            //After merging, the single new segment will hold the documents of all source segments

            int32_t maxDoc = reader->maxDoc();

            //document buffer

            Document doc;

            //Iterate through all the documents managed by the current reader

            for (int32_t j = 0; j < maxDoc; j++){

                //Check if the j-th document has been deleted, if so skip it

                //Only copy the document if it has not been deleted in its source segment

                if (!reader->isDeleted(j)){

                    //Get the document

                    if ( reader->document(j, &doc) ){  //see bool SegmentReader::document(), which reads stored fields through its FieldsReader

                        //Add the document to the new FieldsWriter

                        //Write the document's stored field values into the merged segment

                        fieldsWriter->addDocument( &doc );

                        docCount++;

                        //doc is cleared for re-use

                        doc.clear();

                    }

                }

            }

        }

    }_CLFINALLY(        fieldsWriter->close();

    _CLDELETE( fieldsWriter );

    );

    //Return the number of documents in the merged segment

    return docCount;

}
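
  Note that skipping deleted documents here silently renumbers the survivors, which is exactly why appendPostings() later needs a per-segment docMap. A minimal standalone sketch of how such a map can be derived from a deletion bitset (names are illustrative, not CLucene's):

#include <cstdio>
#include <vector>

// Map old (per-segment) doc ids to new, gap-free ids; deleted docs map to -1.
// This mirrors the role of SegmentMergeInfo::getDocMap() used in appendPostings().
std::vector<int> buildDocMap(const std::vector<bool>& deleted) {
    std::vector<int> docMap(deleted.size());
    int newId = 0;
    for (size_t oldId = 0; oldId < deleted.size(); ++oldId)
        docMap[oldId] = deleted[oldId] ? -1 : newId++;
    return docMap;
}

int main() {
    // Segment with 5 docs where docs 1 and 3 are deleted: survivors become 0, 1, 2.
    std::vector<int> m = buildDocMap({false, true, false, true, false});
    for (size_t i = 0; i < m.size(); ++i)
        std::printf("old %zu -> new %d\n", i, m[i]);
    return 0;
}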

  (5). void SegmentReader::initialize() method

//Initialize the segment reader's state

void SegmentReader::initialize(SegmentInfo* si)

{

    deletedDocs      = NULL;

    ones               = NULL;

    deletedDocsDirty = false;

    normsDirty=false;

    undeleteAll=false;

    //Segment name

    segment          = STRDUP_AtoA(si->name);

    //Frequency stream and position (prox) stream

    freqStream       = NULL;

    proxStream       = NULL;

    //instantiate a buffer large enough to hold a directory path

    char buf[CL_MAX_PATH];

    // Use compound file directory for some files, if it exists

    Directory* cfsDir = getDirectory();  //the directory is passed to the constructor; getDirectory() simply returns directory

    SegmentName(buf, CL_MAX_PATH, ".cfs");

    if (cfsDir->fileExists(buf)) {

        cfsReader = _CLNEW CompoundFileReader(cfsDir, buf);

        cfsDir = cfsReader;

    }else

        cfsReader = NULL;

    //Create the name of the field info file with suffix .fnm in buf

    //Build the .fnm file name from the segment prefix to open the field-info stream

    SegmentName(buf, CL_MAX_PATH, ".fnm");

    //The FieldInfos constructor opens an input stream on that file and reads the field infos back

    fieldInfos = _CLNEW FieldInfos(cfsDir, buf );

    //Condition check to see if fieldInfos points to a valid instance

    CND_CONDITION(fieldInfos != NULL,"No memory could be allocated for fieldInfos");

    //Create the name of the frequence file with suffix .frq in buf

    SegmentName(buf,CL_MAX_PATH, ".frq");

    //Open an IndexInput freqStream to the frequency file

#ifdef LUCENE_FS_MMAP

    if ( cfsDir->getDirectoryType() == FSDirectory::DirectoryType() ){

        FSDirectory* fsdir = (FSDirectory*)cfsDir;

        freqStream = fsdir->openMMapFile( buf );

    } else if (strcmp(cfsDir->getDirectoryType(), "CFS") == 0) { //todo: we should have a CFS Directory

        freqStream = cfsDir->openInput(buf,true);

    }else

#endif

        //Frequency input stream

        freqStream = cfsDir->openInput( buf );

    //Condition check to see if freqStream points to a valid instance and was able to open the

    //frequency file

    CND_CONDITION(freqStream != NULL, "IndexInput freqStream could not open the frequency file");

    //Create the name of the prox file with suffix .prx in buf

    SegmentName(buf, CL_MAX_PATH,".prx");

    //Open an IndexInput proxStream to the prox file

#ifdef LUCENE_FS_MMAP

    if (cfsDir->getDirectoryType() == FSDirectory::DirectoryType()) {

        FSDirectory* fsdir = (FSDirectory*)cfsDir;

        proxStream = fsdir->openMMapFile( buf );

    } else if (strcmp(cfsDir->getDirectoryType(), "CFS") == 0) {

        proxStream = cfsDir->openInput(buf,true);

    } else

#endif

        //Position (prox) input stream

        proxStream = cfsDir->openInput( buf );

    //Condition check to see if proxStream points to a valid instance and was able to open the

    //prox file

    CND_CONDITION(proxStream != NULL, "IndexInput proxStream could not open proximity file");

    //Instantiate a FieldsReader for reading the Field Info File

    //Create the stored-fields reader

    fieldsReader = _CLNEW FieldsReader(cfsDir, segment, fieldInfos);

    CND_CONDITION(fieldsReader != NULL,"No memory could be allocated for fieldsReader");

    //Create the term-dictionary reader

    tis = _CLNEW TermInfosReader(cfsDir, segment, fieldInfos);

    //Condition check to see if tis points to a valid instance

    CND_CONDITION(tis != NULL,"No memory could be allocated for tis");

 

    //Check if the segment has deletions according to the SegmentInfo instance si

    // NOTE: the bitvector is stored using the regular directory, not cfs

    if (hasDeletions(si)){

        //Build the name of the deletion file with suffix .del

        SegmentName(buf, CL_MAX_PATH,".del");

        //Instantiate a BitVector that manages which documents have been deleted

        deletedDocs = _CLNEW BitSet(getDirectory(), buf );

    }

    openNorms(cfsDir);

    //Create the term vector (positions/offsets) reader only when needed

    if (fieldInfos->hasVectors()) { // open term vector files only as needed

        termVectorsReaderOrig = _CLNEW TermVectorsReader(cfsDir, segment, fieldInfos);

    }else

        termVectorsReaderOrig = NULL;

}
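
  For orientation, the block below summarizes, in code form, the per-segment files that this initializer (together with FieldsReader, TermInfosReader and TermVectorsReader) ends up reading. The list reflects the classic Lucene/CLucene file format; the stored-field, term-dictionary, norm and term-vector extensions are not shown in the snippet above and are included here from that format description, so treat them as background rather than something proven by this code.

// Per-segment files of the classic Lucene/CLucene index format, opened by
// SegmentReader::initialize() either directly or through the .cfs compound file.
static const char* const kSegmentFiles[][2] = {
    {".fnm",               "field infos (names and flags)"},
    {".fdx / .fdt",        "stored fields, read via FieldsReader"},
    {".tii / .tis",        "term dictionary, read via TermInfosReader"},
    {".frq",               "per-term document/frequency postings (freqStream)"},
    {".prx",               "per-term position postings (proxStream)"},
    {".fN",                "field norms, one file per indexed field (openNorms)"},
    {".tvx / .tvd / .tvf", "term vectors, read via TermVectorsReader when present"},
    {".del",               "deleted-document bitvector, always kept outside the .cfs"},
    {".cfs",               "compound file wrapping the per-segment files when enabled"},
};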

 

  (6). void SegmentMerger::mergeTerms() method

//Merge the terms

void SegmentMerger::mergeTerms()

{

    CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");

    try{

        //create a filename for the new Frequency File for segment

        const char* buf = Misc::segmentname(segment,".frq");

        //Open an IndexOutput to the new Frequency File

        freqOutput = directory->createOutput( buf );

        //Destroy the buffer of the filename

        _CLDELETE_CaARRAY(buf);

        //create a filename for the new Prox File for segment

        buf = Misc::segmentname(segment,".prx");

        //Open an IndexOutput to the new Prox File

        proxOutput = directory->createOutput( buf );

        //delete buffer

        _CLDELETE_CaARRAY( buf );

        //Instantiate  a new termInfosWriter which will write in directory

        //for the segment name segment using the new merged fieldInfos

        termInfosWriter = _CLNEW TermInfosWriter(directory, segment, fieldInfos, termIndexInterval); 

        //Condition check to see if termInfosWriter points to a valid instance

        CND_CONDITION(termInfosWriter != NULL,"Memory allocation for termInfosWriter failed")    ;

        //Use the configured skip interval

        skipInterval = termInfosWriter->skipInterval;

        //Priority queue over the segments being merged

        queue = _CLNEW SegmentMergeQueue(readers.size());

        //And merge the Term Infos

        mergeTermInfos();        

    }_CLFINALLY(

        if (freqOutput != NULL)         { freqOutput->close(); _CLDELETE(freqOutput); }

        if (proxOutput != NULL)         { proxOutput->close(); _CLDELETE(proxOutput); }

        if (termInfosWriter != NULL)    { termInfosWriter->close(); _CLDELETE(termInfosWriter); }

        if (queue != NULL)            { queue->close(); _CLDELETE(queue);}

        );

}

(7). void SegmentMerger::mergeTermInfos() method

 

//The actual term-merging procedure (a standalone sketch of this loop follows the function below):

//(1). Take the first term of every segment and put it into the priority queue;

//(2). Pop the top of the queue; while the new top holds the same term, keep popping and collect

//     all of them in the match array;

//(3). Merge the postings of these identical terms (mergeTermInfo);

//(4). Advance each matched segment to its next term and push it back into the queue

//     (the matched entries were already popped);

//(5). Repeat until the queue is empty

void SegmentMerger::mergeTermInfos()

{

    CND_CONDITION(queue != NULL, "Memory allocation for queue failed");

    //base is the id of the first document in a segment

    int32_t base = 0;

    IndexReader* reader = NULL;

    SegmentMergeInfo* smi = NULL;

    //iterate through all the readers

    //Take the first term of each segment and seed the priority queue

    for (uint32_t i = 0; i < readers.size(); i++)

    {

        //Get the i-th reader

        reader = readers[i];

        //Condition check to see if reader points to a valid instance

        CND_CONDITION(reader != NULL, "No IndexReader found");

        //Get the term enumeration of the reader

        //Get the term enumerator for this reader

        TermEnum* termEnum = reader->terms();

        //Instantiate a new SegmentMerginfo for the current reader and enumeration

        //Construct a SegmentMergeInfo from the enumerator

        //(reader is actually a SegmentReader, a subclass of IndexReader)

        smi = _CLNEW SegmentMergeInfo(base, termEnum, reader);

        //Condition check to see if smi points to a valid instance

        CND_CONDITION(smi != NULL, "Memory allocation for smi failed")  ;

        //Increase the base by the number of documents that have not been marked deleted

        //so base will contain a new value for the first document of the next iteration

        base += reader->numDocs();

        //Get the next current term

        //After next(), SegmentMergeInfo::term holds the segment's first term, fetched through the enumerator

        if (smi->next()){

            //Store the SegmentMergeInfo smi with the initialized SegmentTermEnum TermEnum

            //into the queue

            queue->put(smi);

        }else{

            //Apparently the end of the TermEnum of the SegmentTerm has been reached so

            //close the SegmentMergeInfo smi

            smi->close();

            //And destroy the instance and set smi to NULL (It will be used later in this method)

            _CLDELETE(smi);

        }

    }

    //Instantiate an array of SegmentMergeInfo instances called match

    SegmentMergeInfo** match = _CL_NEWARRAY(SegmentMergeInfo*,readers.size()+1);

    //Condition check to see if match points to a valid instance

    CND_CONDITION(match != NULL, "Memory allocation for match failed")  ;

    SegmentMergeInfo* top = NULL;

    //As long as there are SegmentMergeInfo instances stored in the queue

    //Loop: repeatedly process the smallest current term drawn from each segment's .tis term dictionary

    while (queue->size() > 0) {

        int32_t matchSize = 0;           

        // pop matching terms

        //Pop the first SegmentMergeInfo from the queue

        match[matchSize++] = queue->pop();

        //Get the Term of match[0]

        Term* term = match[0]->term;

        //Condition check to see if term points to a valid instance

        CND_CONDITION(term != NULL,"term is NULL")  ;

        //Get the current top of the queue

        top = queue->top();

        //For each SegmentMergInfo still in the queue

        //Check if term matches the term of the SegmentMergeInfo instances in the queue

        //Every SegmentMergeInfo currently positioned on the same term is collected into the match

        //array, so all entries in match share one term; the size of match is the number of segments

        //that contain this term (the term's document frequency is computed later in appendPostings)

        while (top != NULL && term->equals(top->term) )

        {

            //A match has been found so add the matching SegmentMergeInfo to the match array

            match[matchSize++] = queue->pop();

            //Get the next SegmentMergeInfo

            top = queue->top();

        }

        match[matchSize]=NULL;

        //add new TermInfo

        mergeTermInfo(match); //matchSize 

        //Restore the SegmentTermInfo instances in the match array back into the queue

        //Advance each matched segment to its next term; push it back if it still has terms

        while (matchSize > 0){

            smi = match[--matchSize];

            //Condition check to see if smi points to a valid instance

            CND_CONDITION(smi != NULL,"smi is NULL")    ;

            //Move to the next term in the enumeration of SegmentMergeInfo smi

            //so the next iteration can process it

            if (smi->next()){

                //There still are some terms so restore smi in the queue

                queue->put(smi);

            }else{

                //Done with a segment

                //No terms anymore so close this SegmentMergeInfo instance

                smi->close();                

                _CLDELETE( smi );

            }

        }

    }

    _CLDELETE_ARRAY(match);

}
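
  The pop-equal-terms / merge / advance / re-insert cycle described above can be shown with a self-contained example built on std::priority_queue over plain strings. It is only an analogy for SegmentMergeQueue and SegmentMergeInfo, not CLucene code, but the control flow is the same: the queue always exposes the smallest current term, every segment positioned on that term is pulled into a match array, and each matched segment is advanced and re-inserted.

#include <cstdio>
#include <queue>
#include <string>
#include <vector>

// One "segment": a sorted term list plus a cursor, playing the role of SegmentMergeInfo.
struct Cursor {
    const std::vector<std::string>* terms;
    size_t pos;
    const std::string& term() const { return (*terms)[pos]; }
    bool next() { return ++pos < terms->size(); }    // advance; false once exhausted
};

// Order the heap so the smallest term is on top (what SegmentMergeQueue does).
struct ByTerm {
    bool operator()(const Cursor& a, const Cursor& b) const { return a.term() > b.term(); }
};

int main() {
    std::vector<std::string> seg0 = {"apple", "cat", "dog"};
    std::vector<std::string> seg1 = {"cat", "zebra"};
    std::priority_queue<Cursor, std::vector<Cursor>, ByTerm> queue;
    queue.push(Cursor{&seg0, 0});
    queue.push(Cursor{&seg1, 0});

    while (!queue.empty()) {
        std::vector<Cursor> match;                   // all segments positioned on the same term
        match.push_back(queue.top()); queue.pop();
        while (!queue.empty() && queue.top().term() == match[0].term()) {
            match.push_back(queue.top()); queue.pop();
        }
        std::printf("term %-5s found in %zu segment(s)\n",
                    match[0].term().c_str(), match.size());
        // mergeTermInfo(match) would append the merged postings here.
        for (Cursor& c : match)                      // advance and re-insert the survivors
            if (c.next()) queue.push(c);
    }
    return 0;
}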

(8). void SegmentMerger::mergeTermInfo() method

//Merge a term that several segments have in common

void SegmentMerger::mergeTermInfo( SegmentMergeInfo** smis)

{

    CND_PRECONDITION(smis != NULL, "smis is NULL");

    CND_PRECONDITION(freqOutput != NULL, "freqOutput is NULL");

    CND_PRECONDITION(proxOutput != NULL, "proxOutput is NULL");

    //Get the file pointer of the IndexOutput to the Frequency File

    int64_t freqPointer = freqOutput->getFilePointer();

    //Get the file pointer of the IndexOutput to the Prox File

    int64_t proxPointer = proxOutput->getFilePointer();

    //Process postings from multiple segments all positioned on the same term.

    int32_t df = appendPostings(smis); 

    int64_t skipPointer = writeSkip();

    //df contains the number of documents across all segments where this term was found

    if (df > 0) {

        //add an entry to the dictionary with pointers to prox and freq files

        termInfo.set(df, freqPointer, proxPointer, (int32_t)(skipPointer - freqPointer));

        //Precondition check for to be sure that the reference to

        //smis[0]->term will be valid

        CND_PRECONDITION(smis[0]->term != NULL, "smis[0]->term is NULL");

        //Write a new TermInfo

        termInfosWriter->add(smis[0]->term, &termInfo);

    }

}
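
  What termInfosWriter->add() records for each surviving term is essentially a small dictionary entry. The plain struct below is an illustrative restatement of the four values passed to termInfo.set() above, not CLucene's actual TermInfo class.

#include <cstdint>

// Illustrative layout of the per-term dictionary entry written by TermInfosWriter::add().
struct TermDictEntry {
    int32_t docFreq;      // df: documents containing the term across all merged segments
    int64_t freqPointer;  // offset of this term's postings in the new .frq file
    int64_t proxPointer;  // offset of this term's positions in the new .prx file
    int32_t skipOffset;   // start of the skip data, stored relative to freqPointer
};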

(9). int32_t SegmentMerger::appendPostings() method

//Several segments carry postings for the same term; append their docs, frequencies and positions

int32_t SegmentMerger::appendPostings(SegmentMergeInfo** smis)

{

    CND_PRECONDITION(smis != NULL, "smis is NULL");

    CND_PRECONDITION(freqOutput != NULL, "freqOutput is NULL");

    CND_PRECONDITION(proxOutput != NULL, "proxOutput is NULL");

    int32_t lastDoc = 0;

    int32_t df = 0;       //document counter: docs seen for this term across all segments

    resetSkip();

    SegmentMergeInfo* smi = NULL;

    //Iterate through all SegmentMergeInfo instances in smis

    int32_t i = 0;

    while ( (smi=smis[i]) != NULL ){

        //Get the i-th SegmentMergeInfo

        //Condition check to see if smi points to a valid instance

        CND_PRECONDITION(smi!=NULL,"smi is NULL");

        //Get the term positions

        TermPositions* postings = smi->getPositions();

        //Get the base of this segment

        int32_t base = smi->base;

        //Get the docMap so we can see which documents have been deleted

        int32_t* docMap = smi->getDocMap();

        //Position the TermPositions on the current term

        postings->seek(smi->termEnum);

        while (postings->next())

        {

            int32_t doc = postings->doc();

            //Check if there are deletions

            //If this segment has deletions, remap the doc id around them

            if (docMap != NULL)

            {

                doc=docMap[doc]; // map around deletions

            }

            doc+= base;           // convert to merged space

            //Condition check to see doc is equal to or bigger than lastDoc

            CND_CONDITION(doc >= lastDoc,"docs out of order");

            //Increase the document frequency: one more document contains this term

            df++;

            if ((df % skipInterval) == 0)

            {

                bufferSkip(lastDoc);

            }

            //Calculate a new docCode

            //use low bit to flag freq=1

            int32_t docCode = (doc - lastDoc) << 1;  

            lastDoc = doc;

            //Get the frequency of the Term

            int32_t freq = postings->freq();

            if (freq == 1){

                //write doc & freq=1

                freqOutput->writeVInt(docCode | 1);  

            }else{

                //write doc

                freqOutput->writeVInt(docCode);  

                //write frequency in doc

                freqOutput->writeVInt(freq);         

            }

            int32_t lastPosition = 0;            

            // write position deltas

            for (int32_t j = 0; j < freq; j++)

            {

                //Get the next position

                int32_t position = postings->nextPosition(); //next position from the positions iterator

                //Write the difference between position and the last position

                proxOutput->writeVInt(position - lastPosition); //write the position delta

                lastPosition = position;

            }

        }

        i++;

    }

    //Return total number of documents across all segments where term was found   

    //Return the number of documents, across all segments, in which this term occurs

    return df;

}
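
  The docCode trick used above (delta-encode the doc id, reuse the low bit as a freq == 1 flag, then write variable-length integers) can be demonstrated in isolation. The writeVInt below follows the standard Lucene variable-byte scheme, seven data bits per byte with a continuation bit; it is a self-contained sketch, not CLucene's IndexOutput.

#include <cstdint>
#include <cstdio>
#include <vector>

// Lucene-style VInt: low 7 bits first, high bit set on every byte except the last.
static void writeVInt(std::vector<uint8_t>& out, uint32_t v) {
    while (v > 0x7F) { out.push_back((uint8_t)((v & 0x7F) | 0x80)); v >>= 7; }
    out.push_back((uint8_t)v);
}

// Encode one posting exactly the way appendPostings() writes the .frq stream.
static void writePosting(std::vector<uint8_t>& out, int doc, int lastDoc, int freq) {
    uint32_t docCode = (uint32_t)(doc - lastDoc) << 1;   // doc delta, low bit kept free
    if (freq == 1) writeVInt(out, docCode | 1);          // low bit flags freq == 1
    else { writeVInt(out, docCode); writeVInt(out, (uint32_t)freq); }
}

int main() {
    std::vector<uint8_t> frq;
    int lastDoc = 0;
    writePosting(frq, /*doc=*/3,  lastDoc, /*freq=*/1); lastDoc = 3;    // emits VInt(7)
    writePosting(frq, /*doc=*/10, lastDoc, /*freq=*/4); lastDoc = 10;   // emits VInt(14), VInt(4)
    for (uint8_t b : frq) std::printf("%02X ", b);                      // prints: 07 0E 04
    std::printf("\n");
    return 0;
}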

 
