FileStore::mkjournal()&&FileJournal::create()

Ceph Version : Kraken
Component:FileStore
mkjournal()函数用来初始化OSD的事务日志,即格式化事务日志头。事务日志既可以使用裸块设备(已分区),也可以使用普通文件。
mkjournal函数实现:
1.读取OSD的fsid信息(从fsid文件中)
2.创建FileJournal类型实例。
3.检查事务日志是否已经存在,若存在读取事务日志头,检查是否跟OSD fsid相等。
4.若事务日志不存在或者fsid不相等,则初始化事务日志(添加事务日志头)。
mkjournal代码实现:

//初始化journal
int FileStore::mkjournal()
{
  // read fsid
  int ret;
  char fn[PATH_MAX];
  snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str());
  int fd = ::open(fn, O_RDONLY, 0644);
  if (fd < 0) {
  ¦ int err = errno;
  ¦ derr << __FUNC__ << ": open error: " << cpp_strerror(err) << dendl;
  ¦ return -err;
  }
  ret = read_fsid(fd, &fsid);
  if (ret < 0) {
  ¦ derr << __FUNC__ << ": read error: " << cpp_strerror(ret) << dendl;
  ¦ VOID_TEMP_FAILURE_RETRY(::close(fd));
  ¦ return ret;
  }
  VOID_TEMP_FAILURE_RETRY(::close(fd));

  ret = 0;
//创建FileJournal对象
  new_journal();
  if (journal) {
  ¦ ret = journal->check();
  ¦ if (ret < 0) {
  //创建初始化日志设备,或者构建日志文件
  ¦ ¦ ret = journal->create();
  ¦ ¦ if (ret)
        derr << __FUNC__ << ": error creating journal on " << journalpath
                << ": " << cpp_strerror(ret) << dendl;
  ¦ ¦ else
        dout(0) << __FUNC__ << ": created journal on " << journalpath << dendl;
  ¦ }
  ¦ delete journal;
  ¦ journal = 0;
  }
  return ret;
}           

初始化事务日志

事务日志可以使用裸块分区,也可以是一个文件。下面这个函数的主要逻辑是:如果使用裸分区记录日志,则检查该分区的大小是否满足要求,并格式化事务日志头;如果使用文件记录日志,则创建该文件,检查其大小是否满足要求,并格式化事务日志头。

//创建日志文件或初始日志设备,并初始化日志头                                             
int FileJournal::create()                                                        
{                                                                                
  void *buf = 0;                                                                 
  int64_t needed_space;                                                          
  int ret;                                                                       
  buffer::ptr bp;                                                                
  dout(2) << "create " << fn << " fsid " << fsid << dendl;                       
//检查日志块                                                                     
  ret = _open(true, true);                                                       
  if (ret)                                                                       
  ¦ goto done;                                                                   
   //构建空的日志头                                                                              
  // write empty header                                                          
  header = header_t();                                                           
  header.flags = header_t::FLAG_CRC;  // enable crcs on any new journal.         
  header.fsid = fsid;                                                            
  header.max_size = max_size;                                                    
  header.block_size = block_size;                                                
  if (cct->_conf->journal_block_align || directio)                               
  ¦ header.alignment = block_size;                                               
  else                                                                           
  ¦ header.alignment = 16;  // at least stay word aligned on 64bit machines...   

  header.start = get_top();  //设置日志头之后的第一个日志块位置                                                    
  header.start_seq = 0;                                                          

  print_header(header);                                                          

  // static zeroed buffer for alignment padding                                  
  delete [] zero_buf;                                                            
  zero_buf = new char[header.alignment];                                         
  memset(zero_buf, 0, header.alignment);                                         
  bp = prepare_header();   
//将日志头信息写入到日志的开始处。  
  if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) {        
  ¦ ret = -errno;                                                                
  ¦ derr << "FileJournal::create : create write header error "                   
  ¦ ¦ ¦ ¦<< cpp_strerror(ret) << dendl;                                          
  ¦ goto close_fd;                                                               
  }                                                                              

  // zero first little bit, too.                                                 
  ret = posix_memalign(&buf, block_size, block_size);                            
  if (ret) {                                                                     
  ¦ ret = -ret;                                                                  
  ¦ derr << "FileJournal::create: failed to allocate " << block_size             
        ¦<< " bytes of memory: " << cpp_strerror(ret) << dendl;                  
  ¦ goto close_fd;                                                               
  }                                                                              
  memset(buf, 0, block_size); 
//将头之后的第一个日志块,初始化为0.  
  if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) {        
  ¦ ret = -errno;                                                                
  ¦ derr << "FileJournal::create: error zeroing first " << block_size            
        ¦<< " bytes " << cpp_strerror(ret) << dendl;                             
  ¦ goto free_buf;                                                               
  }                                                                              
//判断当前日志的空间是否满足最小日志大小要求。                                                                                
  needed_space = ((int64_t)cct->_conf->osd_max_write_size) << 20;                
  needed_space += (2 * sizeof(entry_header_t)) + get_top();  
//注:这里的  header.max_size 不应该在减去 header.start(日志头所占用的空间大小),因为
//needed_space已经将其包含在内。
  if (header.max_size - header.start < needed_space) {                           
  ¦ derr << "FileJournal::create: OSD journal is not large enough to hold "      
        ¦<< "osd_max_write_size bytes!" << dendl;                                
  ¦ ret = -ENOSPC;                                                               
  ¦ goto free_buf;                                                               
  }                                                                              

  dout(2) << "create done" << dendl;                                             
  ret = 0;                                                                       

free_buf:                                                                        
  free(buf);                                                                     
  buf = 0;                                                                       
close_fd:                                                                        
  if (TEMP_FAILURE_RETRY(::close(fd)) < 0) {                                     
  ¦ ret = -errno;                                                                
  ¦ derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret)       
        ¦<< dendl;                                                               
  }                                                                              
done:                                                                            
  fd = -1;                                                                       
  return ret;                                                                    
}

检查事务日志

检查日志头中记录的fsid与当前OSD的fsid(mkjournal()从 <basedir>/fsid 文件读取)是否相等

//检查日志头中记录的fsid与current/fsid的值是否相等。
// This can not be used on an active journal
int FileJournal::check()
{
  int ret;

  assert(fd == -1);
  //打开日志文件
  ret = _open(false, false);
  if (ret)
  ¦ return ret;
//读取日志头信息
  ret = read_header(&header);
  if (ret < 0)
  ¦ goto done;
//判断当前OSD的fsid与日志头中记录的fsid是否相等
  if (header.fsid != fsid) {
  ¦ derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
        ¦<< ", invalid (someone else's?) journal" << dendl;
  ¦ ret = -EINVAL;
  ¦ goto done;
  }

  dout(1) << "check: header looks ok" << dendl;
  ret = 0;

 done:
  close();
  return ret;
}

序列化事务日志头

/**
 * Serialise the in-memory journal header into a page-aligned buffer of
 * get_top() bytes and return it.
 *
 * header.committed_up_to is refreshed from journaled_seq (under
 * finisher_lock) before encoding. The bytes after the encoded header are
 * zero-filled.
 *
 * @return buffer pointer holding the encoded header followed by zero padding.
 */
bufferptr FileJournal::prepare_header()
{
  bufferlist bl;
  {
    Mutex::Locker l(finisher_lock);
    header.committed_up_to = journaled_seq;
  }
  ::encode(header, bl);
  bufferptr bp = buffer::create_page_aligned(get_top());
  // The encoded header must fit in the reserved area; otherwise the memcpy
  // below would overrun bp and the padding length would underflow.
  assert(bl.length() <= bp.length());
  // don't use bp.zero() here, because it also invalidates
  // crc cache (which is not yet populated anyway)
  char* data = bp.c_str();
  memcpy(data, bl.c_str(), bl.length());
  data += bl.length();
  memset(data, 0, bp.length()-bl.length());
  return bp;
}

事务日志头信息:

 /*
  ¦* journal header
  ¦*/   
  struct header_t {
  ¦ enum {
  ¦ ¦ FLAG_CRC = (1<<0),
  ¦ ¦ // NOTE: remove kludgey weirdness in read_header() next time a flag is added.
  ¦ };

  ¦ uint64_t flags;
  ¦ uuid_d fsid;
  ¦ __u32 block_size;
  ¦ __u32 alignment;
  ¦ int64_t max_size;   // max size of journal ring buffer
  ¦ int64_t start;      // offset of first entry
  ¦ uint64_t committed_up_to; // committed up to

  ¦ /**
  ¦ ¦* start_seq
  ¦ ¦*
  ¦ ¦* entry at header.start has sequence >= start_seq
  ¦ ¦*
  ¦ ¦* Generally, the entry at header.start will have sequence
  ¦ ¦* start_seq if it exists.  The only exception is immediately
  ¦ ¦* after journal creation since the first sequence number is
  ¦ ¦* not known.
  ¦ ¦*
  ¦ ¦* If the first read on open fails, we can assume corruption
  ¦ ¦* if start_seq > committed_up_to because the entry would have
  ¦ ¦* a sequence >= start_seq and therefore > committed_up_to.
  ¦ ¦*/
  ¦ uint64_t start_seq;
 ¦ void encode(bufferlist& bl) const {
  ¦ ¦ __u32 v = 4;
  ¦ ¦ ::encode(v, bl);
  ¦ ¦ bufferlist em;
  ¦ ¦ {
        ::encode(flags, em);
        ::encode(fsid, em);
        ::encode(block_size, em);
        ::encode(alignment, em);
        ::encode(max_size, em);
        ::encode(start, em);
        ::encode(committed_up_to, em);
        ::encode(start_seq, em);
  ¦ ¦ }
  ¦ ¦ ::encode(em, bl);
  ¦ }
  ¦ void decode(bufferlist::iterator& bl) {
  ¦ ¦ __u32 v;
  ¦ ¦ ::decode(v, bl);
  ¦ ¦ if (v < 2) {  // normally 0, but concievably 1
        // decode old header_t struct (pre v0.40).
        bl.advance(4); // skip __u32 flags (it was unused by any old code)
        flags = 0;
        uint64_t tfsid;
        ::decode(tfsid, bl);
        *(uint64_t*)&fsid.bytes()[0] = tfsid;
        *(uint64_t*)&fsid.bytes()[8] = tfsid;
        ::decode(block_size, bl);
        ::decode(alignment, bl);
        ::decode(max_size, bl);
        ::decode(start, bl);
        committed_up_to = 0;
        start_seq = 0;
        return;
  ¦ ¦ }
  ¦ ¦ bufferlist em;
  ¦ ¦ ::decode(em, bl);
  ¦ ¦ bufferlist::iterator t = em.begin();
  ¦ ¦ ::decode(flags, t);
  ¦ ¦ ::decode(fsid, t);
  ¦ ¦ ::decode(block_size, t);
  ¦ ¦ ::decode(alignment, t);
  ¦ ¦ ::decode(max_size, t);
  ¦ ¦ ::decode(start, t);

  ¦ ¦ if (v > 2)
        ::decode(committed_up_to, t);
  ¦ ¦ else
        committed_up_to = 0;

  ¦ ¦ if (v > 3)
        ::decode(start_seq, t);
  ¦ ¦ else
        start_seq = 0;
  ¦ }

。。。
}

你可能感兴趣的:(ceph)