we will build an output buffer ourselves and then use O_DIRECT
we could be in read lock for this
for very large objects write directly to redo log in situ?
we could be unlocked (the main db lock, that is...) for this, with sufficient care, but there is some complexity: we would have to handle falling behind, which would use too much RAM (going back into a read lock would suffice to stop that). For now (1.7.5/1.8.0) we are in read lock, which is not ideal.
actually write to the database data files in this phase. currently done by memcpy'ing the writes back to the non-private MMF. alternatively one could write to the files the traditional way; however the way our storage engine works that isn't any faster (actually measured a tiny bit slower).
we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want to be too frequent.
there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will be required. so doing these remaps fractionally is helpful.
// Timed entry point for the "prepare log buffer" phase: checks the commit lock, makes sure
// the journal file is open, delegates to _PREPLOGBUFFER, and adds the elapsed microseconds
// to the current stats.
void PREPLOGBUFFER(/*out*/ JSectHeader& h, AlignedBuilder& ab) {
    assertLockedForCommitting();

    Timer elapsed;

    // the journal file has to be open first so that fileId is set
    j.assureLogFileOpen();

    // all of the real work is done by the helper
    _PREPLOGBUFFER(h, ab);

    stats.curr->_prepLogBufferMicros += elapsed.micros();
}
// Fills bb with the journal section for this group commit: resets the buffer (which adds the
// JSectHeader), serializes every DurOp queued on commitJob, then appends the basic writes.
static void _PREPLOGBUFFER(JSectHeader& h, AlignedBuilder& bb) { …… resetLogBuffer(/*out*/h, bb); // adds JSectHeader; one section corresponds to one group commit // ops other than basic writes (DurOp's) -- NOTE(review): what these ops are used for is not yet clear
{ for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) { (*i)->serialize(bb); } } prepBasicWrites(bb); // the basic writes are likewise appended into bb
return; }
// Walks the commit job's sorted write intents, merging each intent that overlaps the one
// being accumulated and emitting every merged intent through prepBasicWrite_inlock.
static void prepBasicWrites(AlignedBuilder& bb) { …… RelativePath lastDbPath; …… const vector<WriteIntent>& _intents = commitJob.getIntentsSorted(); // fetch the commit job's write intents, sorted
…… WriteIntent last; for( vector<WriteIntent>::const_iterator i = _intents.begin(); i != _intents.end(); i++ ) { // presumably a default-constructed last never overlaps, so the first pass ends up at last = *i -- TODO confirm
if( i->start() < last.end() ) { // this intent overlaps the accumulated one: merge the two into one via absorb
last.absorb(*i); } else { // no overlap: emit the accumulated intent
if( i != _intents.begin() ) prepBasicWrite_inlock(bb, &last, lastDbPath); // process a single (merged) intent; skipped for the first element
last = *i; } } prepBasicWrite_inlock(bb, &last, lastDbPath); } // the final accumulated intent is emitted after the loop
// Appends one write intent to bb: flags the intent's mapped file as needing a remap, then
// writes an optional JDbContext, the JEntry, and finally the intent's bytes.
static void prepBasicWrite_inlock(AlignedBuilder&bb, const WriteIntent *i, RelativePath& lastDbPath) { size_t ofs = 1; DurableMappedFile *mmf = findMMF_inlock(i->start(), /*out*/ofs);// look up the memory-mapped file for the intent's start address (ofs is an out-param); presumably this is the private view -- TODO confirm
if( unlikely(!mmf->willNeedRemap()) ) { // tag this mmf as needed a remap of its private view later. // usually it will already be dirty/already set, so we do the if above first // to avoid possibility of cpu cache line contention
mmf->willNeedRemap() = true; // mark that a remap is needed later
} JEntry e; // a JEntry represents a single write operation within a group commit; an entry is applied either entirely or not at all
e.len = min(i->length(), (unsigned)(mmf->length() - ofs)); // don't write past end of file: the length is capped at what remains of the mmf after ofs
…… e.setFileNo( mmf->fileSuffixNo() ); if( mmf->relativePath() == local ) { e.setLocalDbContextBit(); } else if( mmf->relativePath() != lastDbPath ) { lastDbPath = mmf->relativePath(); JDbContext c; bb.appendStruct(c); // append the db context to bb
} bb.appendStruct(e);// append the JEntry to the journal buffer
…… bb.appendBuf(i->start(), e.len); // append the write intent's data to bb
…… }
// Timed entry point for the "write to journal" phase: hands the section header and the
// uncompressed buffer to Journal::journal and adds the elapsed microseconds to the stats.
void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed) {
    Timer elapsed;

    // this is Journal::journal
    j.journal(h, uncompressed);

    stats.curr->_writeToJournalMicros += elapsed.micros();
}
void Journal::journal(const JSectHeader& h, const AlignedBuilder& uncompressed) { …… static AlignedBuilder b(32*1024*1024); //分配一个值用于写文件的buf
const unsigned headTailSize = sizeof(JSectHeader) + sizeof(JSectFooter); //section头尾的大小
const unsigned max = maxCompressedLength(uncompressed.len()) + headTailSize;//获取buffer未压缩的所有大小
{ …… b.appendStruct(h);//写入section头到b这个buffer上
} size_t compressedLength = 0; rawCompress(uncompressed.buf(), uncompressed.len(), b.cur(), &compressedLength); //把带job的buffer放到b中,返回压缩后的长度
…… b.skip(compressedLength);//跳过compressedLength的大小,准备写下来的写入
unsigned L = 0xffffffff; { …… JSectFooter f(b.buf(), b.len()); // computes checksum
…… b.skip(L - lenUnpadded);//跳过尾的大小
…… } try { SimpleMutex::scoped_lock lk(_curLogFileMutex); …… _curLogFile->synchronousAppend((constvoid *) b.buf(), L); //写入数据到日志文件,LogFile::synchronousAppend
…… } catch(std::exception& e) { …… } }
// Applies the just-journaled section to the data files by feeding the uncompressed buffer to
// RecoveryJob::processSection (no footer is passed), with a level-3 log line on either side.
static void WRITETODATAFILES_Impl1(const JSectHeader& h, AlignedBuilder& uncompressed) {
    LOG(3) << "journal WRITETODATAFILES 1" << endl;

    // this is where the write-to-data-files work is entered
    RecoveryJob::get().processSection(
        &h,
        uncompressed.buf(),
        uncompressed.len(),
        0);

    LOG(3) << "journal WRITETODATAFILES 2" << endl;
}
// Parses one journal section into ParsedJournalEntry items and applies them. In the
// non-recovering (WRITETODATAFILES) case the buffer p/len is wrapped directly in a
// JournalSectionIterator.
// Fixes: the parameter type was garbled as `constvoid *p` (it is `const void *p`), and the
// excerpt never closed the function body; a trailing "…… }" now marks the elided remainder.
void RecoveryJob::processSection(const JSectHeader *h, const void *p, unsigned len, const JSectFooter *f) { …… auto_ptr<JournalSectionIterator> i; if( _recovering /* true when recovering, false for WRITETODATAFILES */ ) { …… } else { // the WRITETODATAFILES case
i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, /*after header*/p, /*w/out header*/len)); // wrap the buffer in a JournalSectionIterator
} static vector<ParsedJournalEntry> entries; // the parsed JEntry items are collected in entries
entries.clear(); ParsedJournalEntry e; while( !i->atEof() ) { i->next(e); // decode the next item of the buffer into a ParsedJournalEntry
} …… // got all the entries for one group commit. apply them:
applyEntries(entries); // apply these entries
…… }
// Passes every parsed journal entry, in order, to applyEntry.
void RecoveryJob::applyEntries(const vector<ParsedJournalEntry> &entries) { …… for( vector<ParsedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i ) { applyEntry(last, *i, apply, dump); // apply the entries one at a time
} …… }
// Handles a single parsed journal entry; when the entry carries a basic write (entry.e) and
// `apply` is set, the write is performed through write().
// Fix: the collapsed first line had swallowed `if( apply ) {` into a `//` comment, which left
// the braces unbalanced; the statement is back on its own line as code.
void RecoveryJob::applyEntry(Last& last, const ParsedJournalEntry& entry, bool apply, bool dump) { if( entry.e ) { // entry.e is set: this entry is a basic write operation
…… if( apply ) { // the apply path, as used by WRITETODATAFILES
write(last, entry); // the writes are performed here, one entry at a time
} } …… }
// Copies one journal entry's data into the write view of its mapped file, provided the
// entry's offset plus length fits within the file.
void RecoveryJob::write(Last& last, const ParsedJournalEntry& entry) { …… DurableMappedFile *mmf = last.newEntry(entry, *this); // get the mapped file this entry targets
if ((entry.e->ofs + entry.e->len) <= mmf->length()) { …… void* dest = (char*)mmf->view_write() + entry.e->ofs;// destination address inside the write view (_view_write)
memcpy(dest, entry.e->srcData(), entry.e->len); // memcpy the journaled bytes into _view_write, i.e. the data file
…… } …… }
写完了datafile之后,要对private view(_view_private)做重新映射
void REMAPPRIVATEVIEW() {//重新映射privare_view
Timer t; _REMAPPRIVATEVIEW(); //直接进入
stats.curr->_remapPrivateViewMicros += t.micros(); }
static void _REMAPPRIVATEVIEW() { …… set<MongoFile*>& files = MongoFile::getAllFiles(); //获取所有文件准备重新映射
…… constset<MongoFile*>::iterator b = files.begin(); constset<MongoFile*>::iterator e = files.end(); Timer t; for( unsigned x = 0; x < ntodo; x++ ) { …… if( (*i)->isDurableMappedFile() ) { //判断是不是DurableMappedFile,继承在DurableMappedFile中重写
DurableMappedFile *mmf = (DurableMappedFile*) *i; //有继承关系所以可以直接把MongoFile转化为DurableMappedFile
verify(mmf); if( mmf->willNeedRemap() ) { //如果需要重新映射
mmf->willNeedRemap() = false; mmf->remapThePrivateView(); //重新映射
} i++; if( i == e ) i = b; } } …… }
// Replaces _view_private with the view returned by remapPrivateView, keeping the previous
// address in `old`.
void DurableMappedFile::remapThePrivateView() { …… void *old = _view_private; _view_private = remapPrivateView(_view_private); // remap the private view, i.e. _view_private
…… }
// Maps a new view of maphandle with MapViewOfFileEx, requesting the address of the old
// private view, and returns the new view.
void* MemoryMappedFile::remapPrivateView(void *oldPrivateAddr) { …… void* newPrivateView = MapViewOfFileEx( // map the view again
maphandle, // file mapping handle
FILE_MAP_READ, // access
0, 0, // file offset, high and low
0, // bytes to map, 0 == all
oldPrivateAddr ); // we want the same address we had before
…… return newPrivateView; }
可以发现,进行了重新映射,但是里面有个maphandle,为了check一下手册里面说的,private view重新映射到shared view(_view_write)我们继续往下看。
// Creates the durable mapped file `fname`: records the path, maps the file to obtain
// _view_write (passing SEQUENTIAL when sequentialHint is set), then returns the result of
// finishOpening().
// Fix: the type of `len` was garbled as `unsigned longlong&`; it is `unsigned long long&`.
bool DurableMappedFile::create(const std::string& fname, unsigned long long& len, bool sequentialHint) {
    LOG(3) << "mmf create " << fname << endl;
    setPath(fname);
    // the view returned by map() is kept as the write view
    _view_write = map(fname.c_str(), len, sequentialHint ? SEQUENTIAL : 0);
    return finishOpening();
}
void* MemoryMappedFile::map(constchar *filenameIn, unsigned longlong &length, int options) { …… DWORD flProtect = PAGE_READWRITE; //(options & READONLY)?PAGE_READONLY:PAGE_READWRITE;
maphandle = CreateFileMappingW(fd, NULL, flProtect, length >> 32/*maxsizehigh*/, (unsigned) length /*maxsizelow*/, NULL/*lpName*/); //在map数据文件的时候把返回结果复制给maphandle
…… void *view = 0; { …… view = MapViewOfFileEx(//创建了map,最后返回view
maphandle, // file mapping handle
access, // access
0, 0, // file offset, high and low
0, // bytes to map, 0 == all
thisAddress ); // address to place file
…… } …… return view; }
所以这个maphandle是_view_write的maphandle也就是shared view(数据库文件)。