Ceph Version : Kraken
Component:FileStore
mkjournal()函数用来初始化OSD的事务日志,即格式化事务日志的头部。事务日志既可以使用裸块设备(已分区),也可以是一个普通文件。
mkjournal函数实现:
1.读取OSD的fsid信息(从fsid文件中)
2.创建FileJournal类型实例。
3.检查事务日志是否已经存在;若存在,则读取事务日志头,检查其中记录的fsid是否与OSD的fsid相等。
4.若事务日志不存在或者fsid不相等,则初始化事务日志(添加事务日志头)。
mkjournal代码实现:
//初始化journal
int FileStore::mkjournal()
{
// read fsid
int ret;
char fn[PATH_MAX];
snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str());
int fd = ::open(fn, O_RDONLY, 0644);
if (fd < 0) {
¦ int err = errno;
¦ derr << __FUNC__ << ": open error: " << cpp_strerror(err) << dendl;
¦ return -err;
}
ret = read_fsid(fd, &fsid);
if (ret < 0) {
¦ derr << __FUNC__ << ": read error: " << cpp_strerror(ret) << dendl;
¦ VOID_TEMP_FAILURE_RETRY(::close(fd));
¦ return ret;
}
VOID_TEMP_FAILURE_RETRY(::close(fd));
ret = 0;
//创建FileJournal对象
new_journal();
if (journal) {
¦ ret = journal->check();
¦ if (ret < 0) {
//创建初始化日志设备,或者构建日志文件
¦ ¦ ret = journal->create();
¦ ¦ if (ret)
derr << __FUNC__ << ": error creating journal on " << journalpath
<< ": " << cpp_strerror(ret) << dendl;
¦ ¦ else
dout(0) << __FUNC__ << ": created journal on " << journalpath << dendl;
¦ }
¦ delete journal;
¦ journal = 0;
}
return ret;
}
事务日志既可以使用裸块分区,也可以是一个文件。下面的create()函数主要完成以下工作:如果使用裸分区记录日志,则检查该分区的大小是否满足要求,并格式化事务日志头;如果使用文件记录日志,则创建该文件,检查其大小是否满足要求,并格式化事务日志头。
//创建日志文件或初始日志设备,并初始化日志头
int FileJournal::create()
{
void *buf = 0;
int64_t needed_space;
int ret;
buffer::ptr bp;
dout(2) << "create " << fn << " fsid " << fsid << dendl;
//检查日志块
ret = _open(true, true);
if (ret)
¦ goto done;
//构建空的日志头
// write empty header
header = header_t();
header.flags = header_t::FLAG_CRC; // enable crcs on any new journal.
header.fsid = fsid;
header.max_size = max_size;
header.block_size = block_size;
if (cct->_conf->journal_block_align || directio)
¦ header.alignment = block_size;
else
¦ header.alignment = 16; // at least stay word aligned on 64bit machines...
header.start = get_top(); //设置日志头之后的第一个日志块位置
header.start_seq = 0;
print_header(header);
// static zeroed buffer for alignment padding
delete [] zero_buf;
zero_buf = new char[header.alignment];
memset(zero_buf, 0, header.alignment);
bp = prepare_header();
//将日志头信息写入到日志的开始处。
if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) {
¦ ret = -errno;
¦ derr << "FileJournal::create : create write header error "
¦ ¦ ¦ ¦<< cpp_strerror(ret) << dendl;
¦ goto close_fd;
}
// zero first little bit, too.
ret = posix_memalign(&buf, block_size, block_size);
if (ret) {
¦ ret = -ret;
¦ derr << "FileJournal::create: failed to allocate " << block_size
¦<< " bytes of memory: " << cpp_strerror(ret) << dendl;
¦ goto close_fd;
}
memset(buf, 0, block_size);
//将头之后的第一个日志块,初始化为0.
if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) {
¦ ret = -errno;
¦ derr << "FileJournal::create: error zeroing first " << block_size
¦<< " bytes " << cpp_strerror(ret) << dendl;
¦ goto free_buf;
}
//判断当前日志的空间是否满足最小日志大小要求。
needed_space = ((int64_t)cct->_conf->osd_max_write_size) << 20;
needed_space += (2 * sizeof(entry_header_t)) + get_top();
//注:这里的 header.max_size 不应该在减去 header.start(日志头所占用的空间大小),因为
//needed_space已经将其包含在内。
if (header.max_size - header.start < needed_space) {
¦ derr << "FileJournal::create: OSD journal is not large enough to hold "
¦<< "osd_max_write_size bytes!" << dendl;
¦ ret = -ENOSPC;
¦ goto free_buf;
}
dout(2) << "create done" << dendl;
ret = 0;
free_buf:
free(buf);
buf = 0;
close_fd:
if (TEMP_FAILURE_RETRY(::close(fd)) < 0) {
¦ ret = -errno;
¦ derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret)
¦<< dendl;
}
done:
fd = -1;
return ret;
}
check()函数:检查日志头中记录的fsid与当前OSD的fsid是否相等。
//检查日志头中记录的fsid与current/fsid的值是否相等。
// This can not be used on an active journal
int FileJournal::check()
{
int ret;
assert(fd == -1);
//打开日志文件
ret = _open(false, false);
if (ret)
¦ return ret;
//读取日志头信息
ret = read_header(&header);
if (ret < 0)
¦ goto done;
//判断当前OSD的fsid与日志头中记录的fsid是否相等
if (header.fsid != fsid) {
¦ derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
¦<< ", invalid (someone else's?) journal" << dendl;
¦ ret = -EINVAL;
¦ goto done;
}
dout(1) << "check: header looks ok" << dendl;
ret = 0;
done:
close();
return ret;
}
// Serialise the in-memory journal header into a freshly allocated
// page-aligned buffer of get_top() bytes and return that buffer.  The bytes
// following the encoded header are zero-filled.
bufferptr FileJournal::prepare_header()
{
  bufferlist encoded;

  // Snapshot the journaled sequence into the header under finisher_lock.
  {
    Mutex::Locker guard(finisher_lock);
    header.committed_up_to = journaled_seq;
  }
  ::encode(header, encoded);

  bufferptr page = buffer::create_page_aligned(get_top());

  // don't use page.zero() here, because it also invalidates
  // crc cache (which is not yet populated anyway)
  char *dst = page.c_str();
  memcpy(dst, encoded.c_str(), encoded.length());
  // Zero the tail of the buffer that the encoded header did not cover.
  memset(dst + encoded.length(), 0, page.length() - encoded.length());
  return page;
}
/*
 * journal header
 *
 * Versioned, encodable description of a journal: feature flags, fsid,
 * block size and alignment, ring-buffer size, offset of the first entry,
 * and the committed / starting sequence numbers.
 *
 * NOTE(review): the remainder of this struct is elided in this excerpt.
 */
struct header_t {
¦ enum {
¦ ¦ FLAG_CRC = (1<<0),
¦ ¦ // NOTE: remove kludgey weirdness in read_header() next time a flag is added.
¦ };
¦ uint64_t flags;
¦ uuid_d fsid;
¦ __u32 block_size;
¦ __u32 alignment;
¦ int64_t max_size; // max size of journal ring buffer
¦ int64_t start; // offset of first entry
¦ uint64_t committed_up_to; // committed up to
¦ /**
¦ ¦* start_seq
¦ ¦*
¦ ¦* entry at header.start has sequence >= start_seq
¦ ¦*
¦ ¦* Generally, the entry at header.start will have sequence
¦ ¦* start_seq if it exists. The only exception is immediately
¦ ¦* after journal creation since the first sequence number is
¦ ¦* not known.
¦ ¦*
¦ ¦* If the first read on open fails, we can assume corruption
¦ ¦* if start_seq > committed_up_to because the entry would have
¦ ¦* a sequence >= start_seq and therefore > committed_up_to.
¦ ¦*/
¦ uint64_t start_seq;
  // Encode as: a __u32 version (currently 4) followed by a nested
  // bufferlist holding every field in declaration order.
¦ void encode(bufferlist& bl) const {
¦ ¦ __u32 v = 4;
¦ ¦ ::encode(v, bl);
¦ ¦ bufferlist em;
¦ ¦ {
::encode(flags, em);
::encode(fsid, em);
::encode(block_size, em);
::encode(alignment, em);
::encode(max_size, em);
::encode(start, em);
::encode(committed_up_to, em);
::encode(start_seq, em);
¦ ¦ }
¦ ¦ ::encode(em, bl);
¦ }
  // Decode a header of any supported version.  Versions below 2 use the
  // legacy flat layout; later versions use the nested-bufferlist layout
  // written by encode().  Fields that a given version does not carry are
  // set to 0.
¦ void decode(bufferlist::iterator& bl) {
¦ ¦ __u32 v;
¦ ¦ ::decode(v, bl);
¦ ¦ if (v < 2) { // normally 0, but conceivably 1
// decode old header_t struct (pre v0.40).
bl.advance(4); // skip __u32 flags (it was unused by any old code)
flags = 0;
// The legacy fsid is a single 64-bit value; it is copied into both
// 8-byte halves of the uuid.
uint64_t tfsid;
::decode(tfsid, bl);
*(uint64_t*)&fsid.bytes()[0] = tfsid;
*(uint64_t*)&fsid.bytes()[8] = tfsid;
::decode(block_size, bl);
::decode(alignment, bl);
::decode(max_size, bl);
::decode(start, bl);
committed_up_to = 0;
start_seq = 0;
return;
¦ ¦ }
¦ ¦ bufferlist em;
¦ ¦ ::decode(em, bl);
¦ ¦ bufferlist::iterator t = em.begin();
¦ ¦ ::decode(flags, t);
¦ ¦ ::decode(fsid, t);
¦ ¦ ::decode(block_size, t);
¦ ¦ ::decode(alignment, t);
¦ ¦ ::decode(max_size, t);
¦ ¦ ::decode(start, t);
    // committed_up_to is only present when the version is greater than 2.
¦ ¦ if (v > 2)
::decode(committed_up_to, t);
¦ ¦ else
committed_up_to = 0;
    // start_seq is only present when the version is greater than 3.
¦ ¦ if (v > 3)
::decode(start_seq, t);
¦ ¦ else
start_seq = 0;
¦ }
。。。
}