一、 IndexWriter分配了存储100个文档的缓冲(每个文档又分配了存储字段的缓冲)
此过程在void IndexWriter::defineSchema()函数中完成;
此函数包括的过程:
分配可存储100个文档的缓冲
…
// Allocate the document cache: one Document object per slot, each built
// against the index's document schema.
m_ppCachedDocs = new FX_NS(document)::Document*[m_nNumCachedDocs];
// m_nNumCachedDocs defaults to 100
// NOTE: the loop condition was truncated in the original notes; it must
// iterate over all cache slots.
for(int32_t i = 0; i < m_nNumCachedDocs; i++)
{
m_ppCachedDocs[i] = new FX_NS(document)::Document(m_pIndex->getDocSchema());
}
…
定义文档的字段集合(有哪些字段)
void Document::setSchema(DocumentSchema* pSchema)
根据字段集合在文档内分配存储字段的缓冲
…
void Document::setSchema(DocumentSchema* pSchema)
…
取得一个文档指针以填充文档内容
FX_NS(document)::Document* IndexWriter::getDoc()
如果缓冲已满(无法加入更多的文档),则将缓冲中的文档处理(索引并写入磁盘)
如果缓冲未满,则从缓冲取文档填充数据
二、往文档的字段加入数据
void Document::addField(FX_NS2::fieldid_t id,const char* value);//第几个字段/字段内容
从文档的字段缓冲中取得字段加入数据,如果字段ID已经超出文档中的字段集合数,则将字段缓冲扩充
…
// Grow the field buffer when it is full.
if(m_nNumFields == m_nMaxFields)
growFields();
// Store the field in the next free slot.
// NOTE(review): the increment of m_nNumFields is elided in this excerpt —
// confirm against the full source.
m_ppFields[m_nNumFields].field = pField;
…
三、将缓冲中的文档刷入磁盘
void IndexWriter::flushDocuments()
满足条件二选一则启动flushDocuments过程
如果缓冲溢满
用户显式调用flushDocuments过程
此函数包含对每个缓冲中的文档采取两个步骤:
analyzeDoc(m_ppCachedDocs[i]);//分词过程
indexDoc(m_ppCachedDocs[i]); //索引过程
四、分词过程
IndexWriter::analyzeDoc(FX_NS(document)::Document* pParsedDoc)
可为每个字段配以不同的分词器
// m_pAnalyzerMapper->getAnalyzer(pField->getID())//为字段选择相应的分词器
标准分词器
Tokens* StandardAnalyzer::nextTokensInternal(Tokens* pUnAnalyzedTokens)
将分词后的词项映射成int32_t类型(根据词典)
C. 分词结果存储在相应的字段Field中
pField->tokensValue(pTokens,false);
定义m_pTokens为存储分词后结果
FX_NS(analyzer)::Tokens* m_pTokens; ///analyzed tokens
五、索引文档过程
void IndexWriter::indexDoc(FX_NS(document)::Document* pAnalyzedDoc)
a. 如果桶索引器为空则新建一个(createBarrelWriter),
/// Creates the in-memory barrel writer used to index the cached documents.
/// Documents are stored in batches; each batch forms an index "barrel".
void IndexWriter::createBarrelWriter()
{
// batch document storage: document barrels
if(m_pBarrelsInfo->getBarrelCount() > 0)//append to an existing index
{
BarrelInfo* pLastBarrel = m_pBarrelsInfo->getLastBarrel();
m_pBarrelsInfo->addBarrel(m_pBarrelsInfo->newBarrel().c_str(),0,0);
m_pCurBarrelInfo = m_pBarrelsInfo->getLastBarrel();
// base doc id of the new barrel = previous barrel's base id + its doc count
m_pCurBarrelInfo->setBaseDocID(pLastBarrel->getBaseDocID()+pLastBarrel->getNumDocs());
}
…
// default Index.memory = 128000000;
m_pMemPool = new MemPool((size_t)getConfigurator()->index.memory);
m_pIndexBarrelWriter = new IndexBarrelWriter(m_pIndex->getDirectory(),m_pBarrelsInfo,m_pMemPool,m_pCurBarrelInfo->getName().c_str());
….
ComponentProducerMapper* pFactories = m_pIndex->getComponentProducerMapper();
// besides defining the field set, creates a dedicated indexer per field
m_pIndexBarrelWriter->setSchema(m_pIndex->getDocSchema());
// creates one DictHashIndexer for every field
m_pIndexBarrelWriter->setComponentProducerMapper(pFactories);
// selects the field set
m_pBarrelsInfo->setSchema(m_pIndex->getDocSchema());
// chooses the merge policy (no merge is performed at this point)
createMerger();
}
IndexBarrelWriter桶索引器:
{
…
初始化对象时生成一些索引器
m_pFieldsInfo = new FieldsInfo();
m_pStoredFieldsWriter = new StoredFieldsWriter(pDirectory,name,pBarrelsInfo,m_pFieldsInfo);
m_pNormsWriter = new NormsWriter(pDirectory,name);//归一化因子索引器
m_pForwardIndexWriter = NULL;// 正向文档索引器
…
}
b. 为桶索引器分配内存做缓冲
m_pMemPool = new MemPool((size_t)getConfigurator()->index.memory);//分配内存
c. 定义倒排文档索引器FieldIndexer**
void IndexWriter::createBarrelWriter()
{
…
// 为每个字段生成一个DictHashIndexer,由m_ppFieldIndexers[nIndex]指向
m_pIndexBarrelWriter->setComponentProducerMapper(pFactories)
// 除了定义字段集合外,为每个字段新建一个专属字段索引器
m_pIndexBarrelWriter->setSchema(m_pIndex->getDocSchema());
…
}
FieldIndexer** m_ppFieldIndexers;作为IndexBarrelWriter的配属缓冲,存储临时倒排文档,其实质是DictHashIndexer,其缓冲内存从m_pMemPool内存池中取得
d. 索引过程一
/// Adds one analyzed document to the in-memory barrel index.
/// NOTE(review): this excerpt is incomplete — `fid` is used undeclared, the
/// while-loop/function braces do not balance, and the last four lines look
/// like a fragment pasted from the forward-index writer's file-creation
/// code; confirm against the full source.
void IndexBarrelWriter::addDocument(FX_NS(document)::Document* pDoc)
{
FX_NS(document)::Document::Iterator iter = pDoc->iterator();
FX_NS(document)::Field* pField = NULL;
while(iter.hasNext())
{ // iterate over every field of the document
// analysis has already been done: terms are mapped to term-id values
pField = iter.next();//fetch the field
if( pField->isIndexed())//only fields marked for indexing
{
// fid is the field's ordinal (field id)
fid = pField->getID();
//add the field's terms to build posting lists
// m_ppFieldIndexers holds one DictHashIndexer per field
// posting lists are written into the in-memory buffer
// key step of indexing
m_ppFieldIndexers[fid]->addField(pDoc->getDocID(),pField);//index into this field's dedicated buffer
// length-normalization factors
m_pNormsWriter->addField(pDoc->getDocID(),pField);
}
m_pStoredFieldsWriter->addDocument(pDoc); //stores the original (raw) document
// forward index: original document content
// DictHashFreqVector
m_pForwardIndexWriter->addDocument(pDoc); //purpose unclear in the notes — TODO confirm
{m_pTVIOutput = m_pDirectory->createOutput(s + _T(".fim"));
m_pTVDOutput = m_pDirectory->createOutput(s + _T(".fid"));
m_pTVVOutput = m_pDirectory->createOutput(s + _T(".fiv"));
}
e. 索引过程二
Fid为字段ID,每个字段都有专属倒排文档索引缓冲区
m_ppFieldIndexers[fid]->addField(pDoc->getDocID(),pField);//对应进行索引,索引结果至相应字段的缓冲中
m_ppFieldIndexers[fid]实质为DictHashIndexer
/// Indexes one field of a document; dispatches on the configured index
/// level. Only the WORD_LEVEL branch is shown in this excerpt.
/// NOTE(review): `tids`/`tcnt` (the field's term-id array and count) are
/// produced by elided code — confirm against the full source.
void DictHashIndexer::addField(docid_t did,FX_NS(document)::Field* pField)
{
… //index level: word level
if(getIndexLevel() == WORD_LEVEL)
{
wordLevelIndexing(did,tids,tcnt);
}
…
}
…
/// Word-level indexing: appends (docid, position) for every term of one
/// field to that term's posting list.
/// NOTE(review): `nPosInc` and `curPosting` are declared in elided code
/// above this excerpt — confirm against the full source.
inline void DictHashIndexer::wordLevelIndexing(docid_t did,termid_t* tids,int32_t nNumTerms)
{
for (int32_t i = 0; i < nNumTerms; i++ )
{
if(tids[i] <= 0 )
{
nPosInc++;
continue; ///stop word: advance the position but emit nothing
}
// posting list of this term; if absent, allocate a new one whose memory
// comes from the shared memory pool
curPosting = (PostingWriter_WL*)m_array[tids[i]];
if(curPosting == NULL)
{ //tids[i] is the term id
curPosting = new PostingWriter_WL(m_pMemPool,getFieldInfo());
m_array[tids[i]] = curPosting;
}
// nPosInc is the term's position within the field
// open question from the notes: if the memory pool is exhausted while
// adding, there is no reallocation strategy here
// (PooledByteSliceWriter does have a reallocation strategy)
curPosting->addLocation( did, nPosInc++ );
}//end for
}
f. 至此倒排文档已经生成,且缓存于m_array中
索引格式大致是:
每个桶索引器IndexBarrelWriter将所有字段的索引存储于m_ppFieldIndexers
m_ppFieldIndexers[0];
m_ppFieldIndexers[1];
…
m_ppFieldIndexers[n];
每个字段索引将所有的词项的索引存储于m_array
m_array[0];
m_array[1];
…
m_array[n];
n个词项,不一定所有的m_array[n]都有值(可为空),每个m_array都是一个倒排表集合PostingWriter_WL(其内存依旧从内存池中取得)
//存储内容为did文档编号,nPosInc词项位置,当然包括m_nCurTermFreq (TF)
curPosting->addLocation( did, nPosInc++ );
六、写索引过程
a.
/// Flushes the cached (in-memory) index of the current barrel to disk.
void IndexWriter::writeCachedIndex()
{
…
// write barrel metadata - the "barrels" file is just an XML file
m_pBarrelsInfo->write(m_pIndex->getDirectory());
// close() writes the in-memory index to disk
m_pIndexBarrelWriter->close();
…
}
/// Finalizes the barrel: flushes any cached index data to disk, closes
/// the output files, and resets the in-memory cache.
void IndexBarrelWriter::close()
{
    // Nothing to do when the in-memory cache holds no data.
    if (cacheEmpty())
        return;
    writeCache();   // persist the cached index
    closeFiles();
    resetCache();
}
/// Flushes every field's in-memory index into the barrel's .voc/.dfp/.pop
/// files and records per-field offsets in the fields-info (.fdi) data.
/// NOTE(review): the function's closing brace is elided in this excerpt.
void IndexBarrelWriter::writeCache()
{
tstring s = m_barrelName +_T(".voc"); //vocabulary (term dictionary)
FX_NS(store)::CIndexOutput* pVocOutput = m_pDirectory->createOutput(s.c_str());
s = m_barrelName + _T(".dfp");
FX_NS(store)::CIndexOutput* pDOutput = m_pDirectory->createOutput(s.c_str());
s = m_barrelName + _T(".pop");
FX_NS(store)::CIndexOutput* pPOutput = m_pDirectory->createOutput(s.c_str());
OutputDescriptor desc(pVocOutput,pDOutput,pPOutput,true);
fileoffset_t vocOff1,vocOff2,dfiOff1,dfiOff2,ptiOff1,ptiOff2;
fileoffset_t vocOffset;
FieldIndexer* pFieldIndexer;
for (FX_NS2::fieldid_t i = 0;i<(FX_NS2::fieldid_t)m_nNumIndexers;i++)
{//iterate over every field
pFieldIndexer = m_ppFieldIndexers[i];
vocOff1 = pVocOutput->getFilePointer();
dfiOff1 = pDOutput->getFilePointer();
ptiOff1 = pPOutput->getFilePointer();
m_pFieldsInfo->setDistinctNumTerms((FX_NS2::fieldid_t)i,pFieldIndexer->distinctNumTerms());///distinct term count of this field
// DictHashIndexer::write
vocOffset = pFieldIndexer->write(&desc);///write field index data to disk
m_pFieldsInfo->setFieldOffset((FX_NS2::fieldid_t)i,vocOffset);///set offset of vocabulary descriptor
//capture the offsets after writing
vocOff2 = pVocOutput->getFilePointer();
dfiOff2 = pDOutput->getFilePointer();
ptiOff2 = pPOutput->getFilePointer();
//write field info
// NOTE(review): the .fdi output is created and written once per loop
// iteration and never deleted in this excerpt; it likely belongs after
// the loop — confirm against the full source.
s = m_barrelName + _T(".fdi");
FX_NS(store)::CIndexOutput* fdiOutput = m_pDirectory->createOutput(s.c_str());
m_pFieldsInfo->write(fdiOutput);
}
/// Writes this field's term dictionary and posting lists through the
/// output descriptor. Term ids and posting offsets are delta-encoded
/// against the previous non-empty term. Returns the file offset of the
/// vocabulary descriptor (length + term count) appended at the end.
fileoffset_t DictHashIndexer::write(OutputDescriptor* pWriterDesc)
{
m_nVocFilePointer = pWriterDesc->getVocOutput()->getFilePointer();
FX_NS(store)::CIndexOutput* pVocWriter = pWriterDesc->getVocOutput();
fileoffset_t nPOffset;
termid_t tid;
fileoffset_t nLastOffset = 0;
termid_t nLastTermID= 0;
int32_t nTermCount = 0;
PostingWriter* pPosting;
fileoffset_t vocOffset = pVocWriter->getFilePointer();
DynPostingArray::array_iterator aiter = m_array.elements();
while(aiter.next())
{
pPosting = aiter.element();
if(!pPosting->isEmpty())
{
tid = (termid_t)aiter.position();
// delta-encode term ids within this field
pVocWriter->writeVInt(tid - nLastTermID); ///write term id
pVocWriter->writeVInt(pPosting->getDocFreq()); ///write df
//write the posting (doc/freq/position) data
nPOffset = pPosting->writeTo(pWriterDesc); ///write posting data
pVocWriter->writeVLong(nPOffset - nLastOffset); ///write offset of posting descriptor
pPosting->reset(); ///clear posting data
nLastTermID= tid;
nLastOffset = nPOffset;
nTermCount++;
}
}
fileoffset_t vocDescOffset = pVocWriter->getFilePointer();
int64_t vocLength = vocDescOffset - vocOffset;
///begin write vocabulary descriptor
pVocWriter->writeVLong(vocLength); ///vocabulary length in bytes
pVocWriter->writeVLong(nTermCount); ///number of distinct terms
///end write vocabulary descriptor
return vocDescOffset;
}
/// Serializes this word-level posting list: doc/frequency chunk data to
/// the .dfp output, position data to the .pop output, then the posting
/// descriptor. Returns the offset of the posting descriptor in the .dfp
/// output.
fileoffset_t PostingWriter_WL::writeTo(OutputDescriptor* pOutputDescriptor)
{
///flush last document
flushLastDoc();
CIndexOutput* pDOutput = pOutputDescriptor->getDPostingOutput();
///write chunk data (only when the term occurs in more than one document)
if(m_nDF > 1)
writeDPosting(pDOutput);
///save the offset of posting descriptor
fileoffset_t poffset = pDOutput->getFilePointer();
fileoffset_t positionPointer;
CIndexOutput* pPOutput = pOutputDescriptor->getPPostingOutput();
if(m_nCTF > 1)
{
///write position posting data
positionPointer = writePPosting(pPOutput);
}
else
{
positionPointer = pPOutput->getFilePointer();
}
///write descriptors
writeDescriptor(pDOutput,positionPointer);
#ifdef POSTING_CHECK
PostingDecoder* pDecoder = createDecoder();
pDecoder->check();
delete pDecoder;
#endif
return poffset;
}
合并过程
IndexWriter::optimizeIndex(bool bGC=false)
IndexWriter::mergeAndWriteCachedIndex()
/// Merges all existing index barrels.
/// NOTE(review): the signature below was transcribed from a debugger (the
/// pointer value appears as a "default argument") — it is not valid C++;
/// confirm the real declaration in the full source.
IndexMerger::merge(firtex::index::BarrelsInfo * pBarrelsInfo=0x003b7eb8)
{
…
CMergeBarrel mb(pBarrelsInfo->getBarrelCount());
///put all index barrel into mb
BarrelsInfo::Iterator iter = pBarrelsInfo->iterator();
BarrelInfo* pBaInfo;
// read in every barrel's info
while (iter.hasNext())
{
pBaInfo = iter.next();
mb.put(new MergeBarrelEntry(m_pDirectory,pBaInfo));
}
// merge loop
while (mb.size() > 0)
{
addBarrel(mb.pop());
}
endMerge();
…
}
/// Adds one barrel entry to its size level; when a level accumulates m_nC
/// entries, those entries are merged into a bigger barrel.
/// NOTE(review): `iter`, `pLevel`, `nLevel` and `m_nCurLevelSize` come from
/// elided code above this excerpt; the signature was transcribed from a
/// debugger — confirm against the full source.
OptimizeMerger::addBarrel(firtex::index::MergeBarrelEntry * pEntry=0x00839d20)
{
…
if(iter != m_levelsMap.end())
{
pLevel = iter->second;
pLevel->m_nLevelSize += m_nCurLevelSize;
pLevel->add(pEntry);
if((int32_t)pLevel->m_pMergeBarrel->size() >= m_nC)
// m_nC is set to 5:
// OptimizeMerger::OptimizeMerger(Directory* pSrcDirectory):m_nC(5)
///collision,trigger a merge event
{
m_nCurLevelSize = pLevel->m_nLevelSize;
pLevel->m_nLevelSize = 0;
mergeBarrel(pLevel->m_pMergeBarrel); //do the merge
pLevel->increaseMergeTimes();
m_nCurLevelSize = 1;
}
}
else
{
pLevel = new OptimizeMergeTreeLevel(nLevel,m_nCurLevelSize,m_nC);
pLevel->add(pEntry);
m_levelsMap.insert(make_pair(nLevel,pLevel));
}
}
IndexMerger::mergeBarrel(firtex::index::CMergeBarrel * pBarrel=0x00839e60)
///合并过程
/// Merges the barrels collected in pBarrel into one new barrel.
/// NOTE(review): this excerpt elides several declarations (`bFinish`,
/// `fieldid`, `pFieldInfo`, `pFieldMerger`, `fieldsInfo`, `bHasPPosting`)
/// and the closing brace of the `while (!bFinish)` loop — confirm against
/// the full source.
void IndexMerger::mergeBarrel(CMergeBarrel* pBarrel)
{
//////////////////////////////////////////////////////////////////////////
if(!m_pSpeedProbe)
m_pSpeedProbe = new SpeedProbe;
FIRTEX_CLOG(level::info) << _T("Begin merge: ") << FIRTEX_END;
BarrelInfo* pBaInfo;
// log the doc count (and deleted-doc count) of every barrel being merged
for(size_t i = 0;i < pBarrel->size();i++)
{
pBaInfo = pBarrel->getAt(i)->m_pBarrelInfo;
FIRTEX_CLOG(level::info) << _T("/t") << (int)i << _T(":") << pBaInfo->getNumDocs() << FIRTEX_END;
if(pBaInfo->getNumDeletedDocs() > 0)
FIRTEX_CLOG(level::info) << _T("(") << pBaInfo->getNumDeletedDocs() << _T(")") << FIRTEX_END;
}
FIRTEX_CLOG(level::info) << FIRTEX_ENDL;
m_pSpeedProbe->begin();
//////////////////////////////////////////////////////////////////////////
bool bGC = m_bForceGC;
if(bGC == false)
bGC = isGC(pBarrel);
pBarrel->load(bGC);
tstring sNewBarrelName = pBarrel->getIdentifier();
BarrelInfo* pNewBarrelInfo = new BarrelInfo(sNewBarrelName,0,0);
/// the file name of new index barrel
tstring name = sNewBarrelName + _T(".voc");/// field vocabulary (term dictionary)
FX_NS(store)::CIndexOutput* pVocStream = m_pDirectory->createOutput(name);
name = sNewBarrelName + _T(".dfp"); /// document postings and term frequencies
FX_NS(store)::CIndexOutput* pDStream = m_pDirectory->createOutput(name);
name = sNewBarrelName + _T(".pop"); /// term position data
FX_NS(store)::CIndexOutput* pPStream = m_pDirectory->createOutput(name);
OutputDescriptor* pOutputDesc = new OutputDescriptor(pVocStream,pDStream,pPStream,true);
barrel_vector vDeletedBarrels;
MergeBarrelEntry* pEntry = NULL;
if(bGC)
{
///
}
size_t nEntry;
df_t nNumDocs = 0;
df_t nNumDelDocs = 0;
size_t nEntryCount = pBarrel->size();
///update min doc id of index barrels,let doc id continuous
for(nEntry = 0;nEntry < nEntryCount;nEntry++)
{ // barrels are in descending order of document count
pEntry = pBarrel->getAt(nEntry);
pEntry->m_pBarrelInfo->setBaseDocID(nNumDocs);
nNumDocs += pEntry->m_pBarrelInfo->getNumDocs();
nNumDelDocs += pEntry->m_pBarrelInfo->getNumDeletedDocs();
}
fileoffset_t nVocOff1,nVocOff2,nDocOff1,nDocOff2,nPosOff1 = 0,nPosOff2 = 0;
fileoffset_t nVocOffset = 0;
while (!bFinish)
{
for(nEntry = 0;nEntry < nEntryCount;nEntry++)
{
pEntry = pBarrel->getAt(nEntry);
if((FX_NS2::fieldid_t)pEntry->m_pFieldsInfo->getNumFields() > fieldid)
{
pFieldInfo = pEntry->m_pFieldsInfo->getField(fieldid);///get field information
if(pFieldInfo)
{
if(pFieldInfo->isIndexed())///it's a index field
{
if(pFieldMerger == NULL)
{
pFieldMerger = m_pProducerMapper->getProducer(fieldid)->createMerger();
pFieldMerger->setDirectory(m_pDirectory);
}
pFieldMerger->addField(pEntry->m_pBarrelInfo,pFieldInfo,pEntry->m_pDocFilter);///add to field merger
}
}
}
}
///close files of index writer (if the in-memory cache still holds index data)
///merge stored fields
mergeStoredFields(pBarrel);
// produces temporary sfv/sfm files (_mid_0_0.sfm / _mid_0_0.sfv)
// sfm file: index into the stored-field data
// sfv file: read/written only for fields whose FieldFlag in .fdi has the stored attribute
///merge norms data
mergeNorms(pBarrel); // produces temporary norm files _mid_0_0.nx, where x is the field ordinal
//merge bitmap of deleted documents
mergeDeletedDocs(pBarrel,bGC);
//merge term vector
bool bHasTermVector = false;
if(bHasTermVector == true)
{
mergeTermVector(pBarrel);
}
//delete all merged (pre-merge) barrels
// write position info
nEntryCount = pBarrel->size();
for(nEntry = 0;nEntry < nEntryCount;nEntry++)
{
pEntry = pBarrel->getAt(nEntry);
IndexBarrelWriter* pWriter = pEntry->m_pBarrelInfo->getWriter();
if(pWriter)///clear in-memory index
{
pWriter->resetCache(true);
///borrow buffer from indexer
setBuffer((char*)pWriter->getMemPool()->getBegin(),pWriter->getMemPool()->getSize());
m_bBorrowedBuffer = true;
}
///delete all pre-merge index barrels: m_pBarrelsInfo->removeBarrel(m_pDirectory,pEntry->m_pBarrelInfo->getName());///delete merged barrels
}
m_pBarrelsInfo->addBarrel(pNewBarrelInfo,false);
continueDocIDs(m_pBarrelsInfo);///let doc ids in a continuous form
///TODO:UNLOCK
if(m_pMergeBarrels)
{
removeMergedBarrels(pBarrel);
}
pBarrel->clear();
name = sNewBarrelName + _T(".fdi");
FX_NS(store)::CIndexOutput* fieldsStream = m_pDirectory->createOutput(name);
fieldsInfo.write(fieldsStream);//field information
delete fieldsStream; // CIndexOutput flushes to disk in its destructor - .fdi file
if(bHasPPosting == false)
{
name = sNewBarrelName + _T(".pop");
m_pDirectory->deleteFile(name);
}
delete pOutputDesc; // CIndexOutput flushes to disk in its destructor - flushes the dictionary
pOutputDesc = NULL;
}
索引文件简析
sfm文件
每个文档存储8个字节(偏移地址: int64_t i)
存储sfv存储起始位置
sfv
// 循环存储每一个文档
int32_t I 需要存储的域个数
//只写需要存储属性的字段
//循环
int32_t fid
//以下按不同类型存储
//例如Field::TEXT
writeVInt(length); //int32_t 类型 字段长度
writeChars(s.c_str(), 0, length); //字段内容,字节流
voc文件
// 遍历字段
// 遍历词项
int32_t (tid - nLastTermID) //相邻词项编码的差值 // 差值只计算字段内,不跨字段
int32_t df //该词项文档频率
int64_t (nPOffset - nLastOffset) //
// 遍历结束
//
Int64_t vocLength // 词汇表长度(字节)
Int64_t nTermCount // 词项个数
pop文件
// 遍历所有位置
Int32_t location // 位置
//
Int64_t 位置数据长度(字节)
dfp文件
m_nCTF 所有文档的累加出现次数
m_nDF DF值,出现文档计数
// 遍历倒排表,所有该词项的文档集合
Int32_t (docid - m_nLastDocID) //相邻文档差值
Int32_t Tf ///m_nCurTermFreq
// 如果m_nDF == 1
// 如果m_nCTF > 1 仅在一篇文档出现且次数超过一
{
Int32_t ( (m_nLastDocID << 1) + 1)
Int64_t m_nCTF
Int64_t offset
}
//如果m_nCTF == 1
// 如果DF大于一
pDOutput->writeVLong(m_nCTF); ///
pDOutput->writeVLong(m_pDocFreqWriter->getLength());///
pDOutput->writeVLong(poffset); ///
// 以一个间隔记录偏移文档
if( m_pSkipListWriter && m_pSkipListWriter->getNumLevels() > 0) ///m_nDF > m_nSkipInterval
{
pDOutput->writeVInt( (m_nLastDocID << 1) + 1);
pDOutput->writeByte(m_pSkipListWriter->getNumLevels());
m_pSkipListWriter->writeTo(pDOutput); ///write skip list data
}
else
{
pDOutput->writeVInt(m_nLastDocID << 1);
}