
  CLucene 的 IndexWriter::addDocument 每加入一篇文档,都会取得一个新的段名并生成对应的段信息,再把这个新段信息加入 segmentInfos 数组;当达到合并阈值时,就会触发段的合并操作。

  (1).void IndexWriter::maybeMergeSegments()方法


// Decides whether small segments should be merged. Starting from
// minMergeDocs, it walks segmentInfos from the last segment backwards,
// summing docCount, and multiplies the target size by mergeFactor while the
// target does not exceed maxMergeDocs.
// NOTE(review): this listing has no braces and several statements appear to
// have been lost (both `if` statements below have no visible body), so it
// does not compile as shown -- restore them from the original source.
void IndexWriter::maybeMergeSegments()



    int64_t targetMergeDocs = minMergeDocs;  // default value is ... (the number is missing from the original note)

    // find segments smaller than current target size

    // must not exceed the maximum number of documents allowed in a merge

    while (targetMergeDocs <= maxMergeDocs)


        int32_t minSegment = segmentInfos->size();

        int32_t mergeDocs = 0;



        //    (original note, numerals missing) ... the segment's doc count already equals
        //    targetMergeDocs, so the loop exits right away; mergeDocs has not grown, so no
        //    merge is triggered;


        //    (original note, numerals missing) ... documents, these documents are merged into
        //    one new segment; then, with the merge factor, targetMergeDocs *= mergeFactor ===> 100,

        //    then these segments are processed, each holding ... documents, which triggers the
        //    merge again, and so on

        while (--minSegment >= 0)


            SegmentInfo* si = segmentInfos->info(minSegment);

            // NOTE(review): no statement follows this condition in the listing; the
            // original note above says the loop exits here -- confirm against the source.
            if (si->docCount >= targetMergeDocs)




            mergeDocs += si->docCount;


        // NOTE(review): the body of this check (the actual merge call) is not visible here.
        if (mergeDocs >= targetMergeDocs)


            // the configured maximum number of documents buffered in memory has been exceeded; start merging







        //increase target size: scaled by multiplication with mergeFactor, the merge factor

        targetMergeDocs *= mergeFactor;



  (2). void IndexWriter::mergeSegments()方法

// Merges the segments with indexes in [minSegment, end) into one new segment.
// Visible steps: obtain a new segment name, open a SegmentReader per source
// segment, run SegmentMerger::merge(), replace the old SegmentInfo entries
// from minSegment onwards with one entry for the merged segment, and, when
// useCompoundFile is set, build a compound file for it.
// NOTE(review): braces and some statements are missing from this listing
// (e.g. nothing visibly hands the readers to `merger`, and the lock objects
// are constructed but never visibly run) -- confirm against the source.
void IndexWriter::mergeSegments(const uint32_t minSegment, const uint32_t end)


    // Readers queued here are the ones whose segments are to be deleted.
    CLVector<SegmentReader*> segmentsToDelete(false);

    // Released at the end of the function with _CLDELETE_CaARRAY.
    const char* mergedName = newSegmentName();


    SegmentMerger merger(this, mergedName);


    for (size_t i = minSegment; i < end; i++)


        SegmentInfo* si = segmentInfos->info(i);


        SegmentReader* reader = _CLNEW SegmentReader(si);


        if ((reader->getDirectory() == this->directory) || // if we own the directory

            (reader->getDirectory() == this->ramDirectory))


                segmentsToDelete.push_back(reader);   // queue segment for deletion




    int32_t mergedDocCount = merger.merge();


    // NOTE(review): the format string contains "/n"; "\n" was presumably intended -- confirm.
    fprintf(_CL_DEBUG_INFO,"/n into %s (%d docs)/n",mergedName, mergedDocCount);



    segmentInfos->clearto(minSegment);// remove old infos & add new


    segmentInfos->add( _CLNEW SegmentInfo(mergedName, mergedDocCount, directory) );


    LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME);

    LockWith2 with ( lock, commitLockTimeout,this, &segmentsToDelete, true );



        SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync



    _CLDELETE( lock );

    if (useCompoundFile)


        // NOTE(review): cmpdTmpName is not visibly filled in before it is passed to
        // createCompoundFile; the statement that builds the name was presumably lost.
        char cmpdTmpName[CL_MAX_PATH];



        AStringArrayWithDeletor filesToDelete;

        merger.createCompoundFile(cmpdTmpName, filesToDelete);

        // NOTE(review): this second lock is not visibly deleted in the listing.
        LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME);

        LockWithCFS with ( lock,commitLockTimeout,directory, this, mergedName, &filesToDelete);


            SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync





    _CLDELETE_CaARRAY( mergedName );


  (3). int32_t SegmentMerger::merge()方法


// Runs the merge and returns the value produced by mergeFields().
// NOTE(review): the listing is abridged; the hasVectors() check below has no
// visible body and no other merge step is shown -- confirm against the source.
int32_t SegmentMerger::merge()


    int32_t value = mergeFields();



    // Extra work for term vectors presumably goes here; the statement is missing.
    if (fieldInfos->hasVectors())




    return value;


  (4). int32_t SegmentMerger::mergeFields()方法

// Merges field metadata and stored documents from all readers.
// Visible steps: collect the field names of every reader into a fresh
// FieldInfos, write it to the segment's ".fnm" file, then copy every
// non-deleted document of every reader through a FieldsWriter.
// Returns docCount.
// NOTE(review): braces and some statements are missing from this listing.
// In particular docCount is never modified in the visible lines, so as shown
// the function would return 0 -- the increment was presumably lost; confirm.
int32_t SegmentMerger::mergeFields()



    fieldInfos = _CLNEW FieldInfos();         // merge field names

    //Condition check to see if fieldInfos points to a valid instance

    CND_CONDITION(fieldInfos != NULL,"Memory allocation for fieldInfos failed");

    IndexReader* reader = NULL;

    int32_t docCount = 0;

    //Iterate through all readers

    for (uint32_t i = 0; i < readers.size(); i++){

        //get the i-th reader

        reader = readers[i];

        //Condition check to see if reader points to a valid instance

        CND_CONDITION(reader != NULL,"No IndexReader found");

        StringArrayWithDeletor tmp;


        //Calls the segment reader's getFieldNames() method: class SegmentReader: public IndexReader
        //NOTE(review): tmp is reused for every query below and is never visibly cleared between
        //calls -- the clearing statements were presumably lost; confirm.

        reader->getFieldNames(IndexReader::TERMVECTOR_WITH_POSITION_OFFSET, tmp);

        addIndexed(reader, fieldInfos, tmp, true, true, true);


        reader->getFieldNames(IndexReader::TERMVECTOR_WITH_POSITION, tmp);

        addIndexed(reader, fieldInfos, tmp, true, true, false);


        reader->getFieldNames(IndexReader::TERMVECTOR_WITH_OFFSET, tmp);

        addIndexed(reader, fieldInfos, tmp, true, false, true);


        reader->getFieldNames(IndexReader::TERMVECTOR, tmp);

        addIndexed(reader, fieldInfos, tmp, true, false, false);


        reader->getFieldNames(IndexReader::INDEXED, tmp);

        addIndexed(reader, fieldInfos, tmp, false, false, false);


        reader->getFieldNames(IndexReader::UNINDEXED, tmp);

        if ( tmp.size() > 0 )


            //NOTE(review): arr is allocated but never visibly filled from tmp before being
            //passed to fieldInfos->add(); the copy statement was presumably lost.
            TCHAR** arr = _CL_NEWARRAY(TCHAR*,tmp.size()+1);


            fieldInfos->add((const TCHAR**)arr, false);

            _CLDELETE_ARRAY(arr); //no need to delete the contents, since tmp is responsible for it



    //NOTE(review): buf is not visibly released after the write below.
    const char* buf = Misc::segmentname(segment,".fnm");


    fieldInfos->write(directory, buf );


    FieldsWriter* fieldsWriter = _CLNEW FieldsWriter(directory, segment, fieldInfos);

    CND_CONDITION(fieldsWriter != NULL,"Memory allocation for fieldsWriter failed");



        //NOTE(review): the _CLFINALLY further down implies these lines sit inside a try block
        //whose opening is missing; without it this redeclares `reader` from above.
        IndexReader* reader = NULL;

        //NOTE(review): this outer maxDoc is shadowed by the loop-local maxDoc below and is
        //never read in the visible lines.
        int32_t maxDoc          = 0;


        for (uint32_t i = 0; i < readers.size(); i++) {

            reader = readers[i];

            CND_CONDITION(reader != NULL, "No IndexReader found");


            int32_t maxDoc = reader->maxDoc();

            //document buffer

            Document doc;

            //Iterate through all the documents managed by the current reader

            for (int32_t j = 0; j < maxDoc; j++){

                //Check if the j-th document has been deleted, if so skip it


                if (!reader->isDeleted(j)){

                    //Get the document

                    if ( reader->document(j, &doc) ){  //see bool SegmentReader::document, which uses its FieldsReader* fieldsReader

                        //Add the document to the new FieldsWriter


                        fieldsWriter->addDocument( &doc );


                        //doc is cleared for re-use (the clearing statement is not visible here)






    //Cleanup path: close the FieldsWriter. The closing of the macro argument list is missing.
    }_CLFINALLY(        fieldsWriter->close();

    _CLDELETE( fieldsWriter );



    return docCount;


   (5). void SegmentReader::initialize()方法


// Opens everything a SegmentReader needs for the segment described by si:
// the field infos (".fnm"), the frequency (".frq") and proximity (".prx")
// streams, a FieldsReader, a TermInfosReader, the deleted-docs bit set
// (".del") when the segment has deletions, and a TermVectorsReader when
// fieldInfos reports vectors. If a ".cfs" file exists for the segment, the
// files are read through a CompoundFileReader instead of the plain directory.
// NOTE(review): braces and several `else` keywords are missing from this
// listing; assignments that look unconditional (e.g. `cfsReader = NULL`)
// presumably belong to lost else-branches -- confirm against the source.
void SegmentReader::initialize(SegmentInfo* si)


    deletedDocs      = NULL;

    ones               = NULL;

    deletedDocsDirty = false;




    // Keeps its own copy of the segment name.
    segment          = STRDUP_AtoA(si->name);


    freqStream       = NULL;

    proxStream       = NULL;

    //instantiate a buffer large enough to hold a directory path

    char buf[CL_MAX_PATH];

    // Use compound file directory for some files, if it exists

    Directory* cfsDir = getDirectory();  //the directory is passed in through the constructor; getDirectory() simply returns directory

    SegmentName(buf, CL_MAX_PATH, ".cfs");

    if (cfsDir->fileExists(buf)) {

        cfsReader = _CLNEW CompoundFileReader(cfsDir, buf);

        cfsDir = cfsReader;


        //NOTE(review): presumably the else-branch (no compound file); the `} else` is missing.
        cfsReader = NULL;

    //Create the name of the field info file with suffix .fnm in buf


    SegmentName(buf, CL_MAX_PATH, ".fnm");


    fieldInfos = _CLNEW FieldInfos(cfsDir, buf );

    //Condition check to see if fieldInfos points to a valid instance

    CND_CONDITION(fieldInfos != NULL,"No memory could be allocated for fieldInfos");

    //Create the name of the frequence file with suffix .frq in buf

    SegmentName(buf,CL_MAX_PATH, ".frq");

    //Open an IndexInput freqStream to the frequency file
    //Three ways of opening it are visible: openMMapFile for an FSDirectory,
    //openInput(buf,true) for a "CFS" directory, and plain openInput otherwise.


    if ( cfsDir->getDirectoryType() == FSDirectory::DirectoryType() ){

        FSDirectory* fsdir = (FSDirectory*)cfsDir;

        freqStream = fsdir->openMMapFile( buf );

    } else if (strcmp(cfsDir->getDirectoryType(), "CFS") == 0) { //todo: we should have a CFS Directory

        freqStream = cfsDir->openInput(buf,true);




        //NOTE(review): presumably the final else-branch; the `} else` is missing in the listing.
        freqStream = cfsDir->openInput( buf );

    //Condition check to see if freqStream points to a valid instance and was able to open the

    //frequency file

    CND_CONDITION(freqStream != NULL, "IndexInput freqStream could not open the frequency file");

    //Create the name of the prox file with suffix .prx in buf

    SegmentName(buf, CL_MAX_PATH,".prx");

    //Open an IndexInput proxStream to the prox file (same three-way choice as for freqStream)


    if (cfsDir->getDirectoryType() == FSDirectory::DirectoryType()) {

        FSDirectory* fsdir = (FSDirectory*)cfsDir;

        proxStream = fsdir->openMMapFile( buf );

    } else if (strcmp(cfsDir->getDirectoryType(), "CFS") == 0) {

        proxStream = cfsDir->openInput(buf,true);

    } else



        proxStream = cfsDir->openInput( buf );

    //Condition check to see if proxStream points to a valid instance and was able to open the

    //prox file

    CND_CONDITION(proxStream != NULL, "IndexInput proxStream could not open proximity file");

    //Instantiate a FieldsReader for reading the Field Info File


    fieldsReader = _CLNEW FieldsReader(cfsDir, segment, fieldInfos);

    CND_CONDITION(fieldsReader != NULL,"No memory could be allocated for fieldsReader");


    tis = _CLNEW TermInfosReader(cfsDir, segment, fieldInfos);

    //Condition check to see if tis points to a valid instance

    CND_CONDITION(tis != NULL,"No memory could be allocated for tis");


    //Check if the segment has deletion according to the SegmentInfo instance si->

    // NOTE: the bitvector is stored using the regular directory, not cfs

    if (hasDeletions(si)){

        //Create a deletion file with suffix .del         

        SegmentName(buf, CL_MAX_PATH,".del");

        //Instantiate a BitVector that manages which documents have been deleted

        deletedDocs = _CLNEW BitSet(getDirectory(), buf );




    if (fieldInfos->hasVectors()) { // open term vector files only as needed

        termVectorsReaderOrig = _CLNEW TermVectorsReader(cfsDir, segment, fieldInfos);


        //NOTE(review): presumably the else-branch (no vectors); the `} else` is missing.
        termVectorsReaderOrig = NULL;



  (6). void SegmentMerger::mergeTerms()方法


// Sets up the outputs needed to merge terms -- the ".frq" and ".prx" files,
// a TermInfosWriter and a SegmentMergeQueue sized to the number of readers --
// and afterwards closes and deletes each of them.
// Precondition: fieldInfos must already be set.
// NOTE(review): braces are missing, and the call that performs the merge
// (after "And merge the Term Infos") is not visible; the cleanup lines at the
// end presumably belong to a finally-style block -- confirm against the source.
void SegmentMerger::mergeTerms()


    CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");


        //create a filename for the new Frequency File for segment

        const char* buf = Misc::segmentname(segment,".frq");

        //Open an IndexOutput to the new Frequency File

        freqOutput = directory->createOutput( buf );

        //Destroy the buffer of the filename
        //NOTE(review): no delete statement is visible here, yet buf is reassigned below;
        //as shown the ".frq" name buffer would leak -- the statement was presumably lost.


        //create a filename for the new Prox File for segment

        buf = Misc::segmentname(segment,".prx");

        //Open an IndexOutput to the new Prox File

        proxOutput = directory->createOutput( buf );

        //delete buffer

        _CLDELETE_CaARRAY( buf );

        //Instantiate  a new termInfosWriter which will write in directory

        //for the segment name segment using the new merged fieldInfos

        termInfosWriter = _CLNEW TermInfosWriter(directory, segment, fieldInfos, termIndexInterval); 

        //Condition check to see if termInfosWriter points to a valid instance

        CND_CONDITION(termInfosWriter != NULL,"Memory allocation for termInfosWriter failed")    ;


        //Reuse the writer's skip interval for this merger.
        skipInterval = termInfosWriter->skipInterval;


        queue = _CLNEW SegmentMergeQueue(readers.size());

        //And merge the Term Infos



        //Cleanup: close and delete every output that was created.
        if (freqOutput != NULL)         { freqOutput->close(); _CLDELETE(freqOutput); }

        if (proxOutput != NULL)         { proxOutput->close(); _CLDELETE(proxOutput); }

        if (termInfosWriter != NULL)    { termInfosWriter->close(); _CLDELETE(termInfosWriter); }

        if (queue != NULL)            { queue->close(); _CLDELETE(queue);}



(7). void SegmentMerger::mergeTermInfos()方法





//    放到匹配的数组中;




// Merges the term dictionaries of all readers.
// Visible steps: wrap each reader's TermEnum in a SegmentMergeInfo whose base
// is the running sum of numDocs() of the preceding readers; then, while the
// queue is non-empty, pop every SegmentMergeInfo positioned on the same term
// into `match`, hand the group to mergeTermInfo(), and advance each popped
// entry with next().
// NOTE(review): braces and several statements are missing from this listing
// (the queue put/close calls in the branches of both `if (smi->next())`
// checks, the terminator for `match`, and the release of `match`) -- confirm
// against the source.
void SegmentMerger::mergeTermInfos()


    CND_CONDITION(queue != NULL, "Memory allocation for queue failed");

    //base is the id of the first document in a segment

    int32_t base = 0;

    IndexReader* reader = NULL;

    SegmentMergeInfo* smi = NULL;

    //iterate through all the readers


    for (uint32_t i = 0; i < readers.size(); i++)


        //Get the i-th reader

        reader = readers[i];

        //Condition check to see if reader points to a valid instance

        CND_CONDITION(reader != NULL, "No IndexReader found");

        //Get the term enumeration of the reader


        TermEnum* termEnum = reader->terms();

        //Instantiate a new SegmentMerginfo for the current reader and enumeration



        smi = _CLNEW SegmentMergeInfo(base, termEnum, reader);

        //Condition check to see if smi points to a valid instance

        CND_CONDITION(smi != NULL, "Memory allocation for smi failed")  ;

        //Increase the base by the number of documents that have not been marked deleted

        //so base will contain a new value for the first document of the next iteration

        base += reader->numDocs();

        //Get the next current term


        if (smi->next()){

            //Store the SegmentMergeInfo smi with the initialized SegmentTermEnum TermEnum

            //into the queue



            //Apparently the end of the TermEnum of the SegmentTerm has been reached so

            //close the SegmentMergeInfo smi


            //And destroy the instance and set smi to NULL (It will be used later in this method)




    //Instantiate an array of SegmentMergeInfo instances called match
    //(one slot more than the number of readers)

    SegmentMergeInfo** match = _CL_NEWARRAY(SegmentMergeInfo*,readers.size()+1);

    //Condition check to see if match points to a valid instance

    CND_CONDITION(match != NULL, "Memory allocation for match failed")  ;

    SegmentMergeInfo* top = NULL;

    //As long as there are SegmentMergeInfo instances stored in the queue


    while (queue->size() > 0) {

        int32_t matchSize = 0;           

        // pop matching terms

        //Pop the first SegmentMergeInfo from the queue

        match[matchSize++] = queue->pop();

        //Get the Term of match[0]

        Term* term = match[0]->term;

        //Condition check to see if term points to a valid instance

        CND_CONDITION(term != NULL,"term is NULL")  ;

        //Get the current top of the queue

        top = queue->top();

        //For each SegmentMergInfo still in the queue

        //Check if term matches the term of the SegmentMergeInfo instances in the queue



        while (top != NULL && term->equals(top->term) )


            //A match has been found so add the matching SegmentMergeInfo to the match array

            match[matchSize++] = queue->pop();

            //Get the next SegmentMergeInfo

            top = queue->top();



        //add new TermInfo
        //NOTE(review): appendPostings() walks this array until it meets a NULL entry, but no
        //statement that writes the terminator (match[matchSize]=NULL) is visible here.

        mergeTermInfo(match); //matchSize 

        //Restore the SegmentTermInfo instances in the match array back into the queue


        while (matchSize > 0){

            smi = match[--matchSize];

            //Condition check to see if smi points to a valid instance

            CND_CONDITION(smi != NULL,"smi is NULL")    ;

            //Move to the next term in the enumeration of SegmentMergeInfo smi


            if (smi->next()){

                //There still are some terms so restore smi in the queue



                //Done with a segment

                //No terms anymore so close this SegmentMergeInfo instance


                _CLDELETE( smi );






(8). void SegmentMerger::mergeTermInfo()方法


// Writes the merged postings and dictionary entry for one term.
// smis is an array of SegmentMergeInfo pointers all positioned on the same
// term. The function records the current positions of the frequency and prox
// outputs, appends the postings via appendPostings(), writes skip data via
// writeSkip(), and -- if at least one document contains the term -- adds a
// TermInfo for smis[0]->term to termInfosWriter.
// Preconditions: smis, freqOutput and proxOutput are non-NULL.
// NOTE(review): closing braces are missing from this listing.
void SegmentMerger::mergeTermInfo( SegmentMergeInfo** smis)


    CND_PRECONDITION(smis != NULL, "smis is NULL");

    CND_PRECONDITION(freqOutput != NULL, "freqOutput is NULL");

    CND_PRECONDITION(proxOutput != NULL, "proxOutput is NULL");

    //Get the file pointer of the IndexOutput to the Frequency File

    int64_t freqPointer = freqOutput->getFilePointer();

    //Get the file pointer of the IndexOutput to the Prox File

    int64_t proxPointer = proxOutput->getFilePointer();

    //Process postings from multiple segments all positioned on the same term.

    int32_t df = appendPostings(smis); 

    int64_t skipPointer = writeSkip();

    //df contains the number of documents across all segments where this term was found

    if (df > 0) {

        //add an entry to the dictionary with pointers to prox and freq files
        //the last argument is the skip pointer expressed relative to freqPointer

        termInfo.set(df, freqPointer, proxPointer, (int32_t)(skipPointer - freqPointer));

        //Precondition check for to be sure that the reference to

        //smis[0]->term will be valid

        CND_PRECONDITION(smis[0]->term != NULL, "smis[0]->term is NULL");

        //Write a new TermInfo

        termInfosWriter->add(smis[0]->term, &termInfo);



(9). int32_t SegmentMerger::appendPostings()方法


// Appends the postings of one term from every SegmentMergeInfo in smis (a
// NULL-terminated array) to the frequency and prox outputs.
// For each posting the document number is remapped through the segment's
// docMap when one exists, shifted by the segment's base, and written as a
// delta against the previous document; positions are written as deltas to
// the prox output. Returns df.
// Preconditions: smis, freqOutput and proxOutput are non-NULL.
// NOTE(review): braces and several statements are missing from this listing.
// In particular df is never incremented in the visible lines, `i` is never
// advanced, and the write path for freq != 1 is absent -- confirm against the
// source.
int32_t SegmentMerger::appendPostings(SegmentMergeInfo** smis)


    CND_PRECONDITION(smis != NULL, "smis is NULL");

    CND_PRECONDITION(freqOutput != NULL, "freqOutput is NULL");

    CND_PRECONDITION(proxOutput != NULL, "proxOutput is NULL");

    int32_t lastDoc = 0;

    int32_t df = 0;       //document counter


    SegmentMergeInfo* smi = NULL;

    //Iterate through all SegmentMergeInfo instances in smis

    int32_t i = 0;

    while ( (smi=smis[i]) != NULL ){

        //Get the i-th SegmentMergeInfo

        //Condition check to see if smi points to a valid instance

        CND_PRECONDITION(smi!=NULL,"    is NULL");

        //Get the term positions

        TermPositions* postings = smi->getPositions();

        //Get the base of this segment

        int32_t base = smi->base;

        //Get the docMap so we can see which documents have been deleted

        int32_t* docMap = smi->getDocMap();

        //Seek the termpost (the seek statement itself is not visible in this listing)


        while (postings->next())


            int32_t doc = postings->doc();

            //Check if there are deletions


            if (docMap != NULL)


                doc=docMap[doc]; // map around deletions


            doc+= base;           // convert to merged space

            //Condition check to see doc is eaqual to or bigger than lastDoc

            CND_CONDITION(doc >= lastDoc,"docs out of order");

            //Increase the total frequency over all segments
            //NOTE(review): the df increment this comment describes is missing.


            //NOTE(review): the body of this check is missing; presumably skip data is
            //buffered every skipInterval documents -- confirm.
            if ((df % skipInterval) == 0)




            //Calculate a new docCode

            //use low bit to flag freq=1

            int32_t docCode = (doc - lastDoc) << 1;  

            lastDoc = doc;

            //Get the frequency of the Term

            int32_t freq = postings->freq();

            if (freq == 1){

                //write doc & freq=1

                freqOutput->writeVInt(docCode | 1);  


                //write doc


                //write frequency in doc



            int32_t lastPosition = 0;            

            // write position deltas

            for (int32_t j = 0; j < freq; j++)


                //Get the next position

                int32_t position = postings->nextPosition(); //position iterator

                //Write the difference between position and the last position

                proxOutput->writeVInt(position - lastPosition); //write the position delta

                lastPosition = position;





    //Return total number of documents across all segments where term was found   


    return df;


