Analysis of the Ceph RBD Request Path
RBD is the client-side service in Ceph that provides block storage. It is called a client-side service relative to RADOS: RBD is a block storage service built on top of the librados API.
This article walks through a single RBD write operation to show how RBD issues requests to RADOS.
Taking an import of a local file into RADOS via rbd as the example, the whole request path is as follows:
rbd -p {pool_name} import {local_file_path} {rbd_image}
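For example, a concrete invocation might look like the following; the pool, file path, and image name are purely illustrative:
rbd -p rbd import /tmp/disk.img my_image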
rbd is an executable binary; running this command starts a process that performs the file import. On startup the program opens the configuration file (ceph.conf) in the default location to load its settings,
parses the arguments passed on the command line, and then runs sanity checks on them. Once the necessary checks have passed, it dispatches to the concrete request handling for the parsed command:
The main variables:
librbd::RBD rbd;                        // the RBD instance used to operate on images
librados::IoCtx io_ctx, dest_io_ctx;    // I/O contexts associated with pools
librbd::Image image;                    // the image handle
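Because RBD sits directly on top of librados, the flow the rbd tool goes through can be sketched with the public C++ API. The program below is illustrative only (pool name, image name, and sizes are placeholders), but it mirrors the initialize / connect / ioctx / create / open / write sequence traced step by step below:

#include <rados/librados.hpp>
#include <rbd/librbd.hpp>
#include <sys/types.h>
#include <iostream>
#include <string>

int main() {
  librados::Rados rados;
  if (rados.init("admin") < 0)                       // authenticate as client.admin
    return 1;
  rados.conf_read_file("/etc/ceph/ceph.conf");       // load the cluster configuration
  if (rados.connect() < 0)                           // connect to the monitors
    return 1;
  {
    librados::IoCtx io_ctx;
    if (rados.ioctx_create("rbd", io_ctx) < 0)       // bind the context to the "rbd" pool
      return 1;

    librbd::RBD rbd;
    int order = 0;                                   // 0 lets librbd pick the default object size
    if (rbd.create(io_ctx, "demo_image", 1ull << 24, &order) < 0)  // 16 MiB image
      return 1;

    librbd::Image image;
    if (rbd.open(io_ctx, image, "demo_image", NULL) < 0)
      return 1;

    ceph::bufferlist bl;
    bl.append(std::string(4096, 'a'));
    ssize_t written = image.write(0, bl.length(), bl);  // synchronous write at offset 0
    std::cout << "wrote " << written << " bytes" << std::endl;
  }  // image and io_ctx are closed here by their destructors
  rados.shutdown();
  return 0;
}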
1. Initialize the rados object:
if (rados.init_with_context(g_ceph_context) < 0) {
  cerr << "rbd: couldn't initialize rados!" << std::endl;
  return EXIT_FAILURE;
}
2. Connect to the RADOS cluster:
if (talk_to_cluster && rados.connect() < 0) {
  cerr << "rbd: couldn't connect to the cluster!" << std::endl;
  return EXIT_FAILURE;
}
3. Initialize io_ctx; an IoCtx corresponds to a pool and holds the state associated with that pool.
if (talk_to_cluster && opt_cmd != OPT_IMPORT) {
  r = rados.ioctx_create(poolname, io_ctx);
  if (r < 0) {
    cerr << "rbd: error opening pool " << poolname << ": "
         << cpp_strerror(-r) << std::endl;
    return -r;
  }
}
4. Open the local file to be uploaded:
if ((fd = open(path, O_RDONLY)) < 0) {
  r = -errno;
  cerr << "rbd: error opening " << path << std::endl;
  goto done2;
}
5. First create the image:
r = do_create(rbd, io_ctx, imgname, size, order, format, features,
              stripe_unit, stripe_count);
static int do_create(librbd::RBD &rbd, librados::IoCtx& io_ctx,
                     const char *imgname, uint64_t size, int *order,
                     int format, uint64_t features,
                     uint64_t stripe_unit, uint64_t stripe_count)
{
  int r;
  if (format == 1) {
    // weird striping not allowed with format 1!
    if ((stripe_unit || stripe_count) &&
        (stripe_unit != (1ull << *order) && stripe_count != 1)) {
      cerr << "non-default striping not allowed with format 1; use --image-format 2"
           << std::endl;
      return -EINVAL;
    }
    r = rbd.create(io_ctx, imgname, size, order);
  } else {
    r = rbd.create3(io_ctx, imgname, size, features, order,
                    stripe_unit, stripe_count);
  }
  if (r < 0)
    return r;
  return 0;
}
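For reference, a hypothetical call into do_create() for a 1 GiB format 2 image with the layering feature and default striping, reusing the rbd and io_ctx handles from the variables listed above (the image name and sizes are made up for illustration):

int order = 22;                                  // 4 MiB objects (1ull << 22)
uint64_t features = RBD_FEATURE_LAYERING;        // feature bit defined in rbd/librbd.h
int r = do_create(rbd, io_ctx, "demo_image", 1ull << 30, &order,
                  2 /* format */, features,
                  0 /* stripe_unit: use the default */,
                  0 /* stripe_count: use the default */);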
6. Open the image; during this step an ImageCtx instance bound to the image is created:
int RBD::open(IoCtx& io_ctx, Image& image, const char *name,
              const char *snap_name)
{
  ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, false);
  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
  tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only);
  if (image.ctx != NULL) {
    close_image(reinterpret_cast<ImageCtx*>(image.ctx));
    image.ctx = NULL;
  }
  int r = librbd::open_image(ictx);
  if (r < 0) {
    tracepoint(librbd, open_image_exit, r);
    return r;
  }
  image.ctx = (image_ctx_t) ictx;
  tracepoint(librbd, open_image_exit, 0);
  return 0;
}
7. Read data from the opened local file in a loop and write it into the opened image.
while ((readlen = ::read(fd, p + blklen, reqlen)) >= 0) {
  blklen += readlen;
  // if read was short, try again to fill the block before writing
  if (readlen && ((size_t)readlen < reqlen)) {
    reqlen -= readlen;
    continue;
  }
  if (!from_stdin)
    pc.update_progress(image_pos, size);
  bufferlist bl(blklen);
  bl.append(p, blklen);
  // resize output image by binary expansion as we go for stdin
  if (from_stdin && (image_pos + (size_t)blklen) > size) {
    size *= 2;
    r = image.resize(size);
    if (r < 0) {
      cerr << "rbd: can't resize image during import" << std::endl;
      goto done;
    }
  }
  // write as much as we got; perhaps less than imgblklen
  // but skip writing zeros to create sparse images
  if (!bl.is_zero()) {
    // the actual data write is issued from the AioImportContext constructor
    new AioImportContext(*throttle, image, bl, image_pos);
  }
  // done with whole block, whether written or not
  image_pos += blklen;
  // if read had returned 0, we're at EOF and should quit
  if (readlen == 0)
    break;
  blklen = 0;
  reqlen = imgblklen;
}
...
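One detail worth noting in the loop above: the sparse import relies on bufferlist::is_zero(), which is true only when every byte in the list is zero, so all-zero blocks are simply skipped rather than written out. A minimal stand-alone illustration:

#include <rados/librados.hpp>   // pulls in ceph::bufferlist
#include <cassert>
#include <string>

int main() {
  ceph::bufferlist zeros, data;
  zeros.append(std::string(4096, '\0'));   // a 4 KiB block of zero bytes
  data.append("not empty");
  assert(zeros.is_zero());                 // all-zero block: the import loop would skip it
  assert(!data.is_zero());                 // non-zero block: it gets written out
  return 0;
}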
8. Write the data that was just read, m_bufferlist, into the image at offset m_offset:
int r = image.aio_write2(m_offset, m_bufferlist.length(), m_bufferlist,
m_aio_completion, op_flags);
...
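For comparison, roughly the same asynchronous write issued through the public librbd API. This is only a sketch: it reuses the image handle opened in step 6 and uses aio_write, the variant without op_flags:

librbd::RBD::AioCompletion *comp =
    new librbd::RBD::AioCompletion(NULL, NULL);    // no user callback; we wait explicitly
ceph::bufferlist bl;
bl.append(std::string(4096, 'a'));
int r = image.aio_write(0 /* image offset */, bl.length(), bl, comp);
if (r == 0) {
  comp->wait_for_complete();                       // block until the write is acknowledged
  r = comp->get_return_value();                    // 0 on success, negative errno on failure
}
comp->release();                                   // drop the reference to the completion object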
9. Submit the asynchronous write request:
submit_aio_write(ictx, off, len, bl.c_str(), get_aio_completion(c),
                 op_flags);

void submit_aio_write(librbd::ImageCtx *ictx, uint64_t off, size_t len,
                      const char *buf, librbd::AioCompletion *c, int op_flags) {
  if (ictx->cct->_conf->rbd_non_blocking_aio) {
    ictx->aio_work_queue->queue(new C_AioWriteWQ(ictx, off, len, buf, c,
                                                 op_flags));
  } else {
    librbd::aio_write(ictx, off, len, buf, c, op_flags);
  }
}
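As the branch above shows, whether the write is queued onto the work queue or executed inline in the caller's thread is governed by the rbd_non_blocking_aio client option. For example, to force the inline path in ceph.conf (illustrative snippet):

[client]
    rbd non blocking aio = false   # handle aio_write inline instead of via the work queue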
10. librbd carries out the write operation:
librbd::aio_write(ictx, off, len, buf, c, op_flags);
11. Map this write onto the corresponding objects:
vector<ObjectExtent> extents;
if (len > 0) {
  Striper::file_to_extents(ictx->cct, ictx->format_string,
                           &ictx->layout, off, clip_len, 0, extents);
}
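To make the mapping concrete: with default striping (stripe_count == 1 and stripe_unit equal to the object size, i.e. 1 << order), Striper::file_to_extents degenerates to simple shift-and-mask arithmetic. The following is an illustrative stand-alone calculation, not the actual Striper code:

#include <cstdint>
#include <cstdio>

int main() {
  const int order = 22;                          // 4 MiB objects (the usual default)
  const uint64_t object_size = 1ull << order;
  const uint64_t off = 9ull << 20;               // image offset 9 MiB

  uint64_t objectno = off >> order;              // lands in object 2
  uint64_t obj_off  = off & (object_size - 1);   // at offset 1 MiB inside that object

  printf("object %llu, in-object offset %llu\n",
         (unsigned long long)objectno, (unsigned long long)obj_off);
  return 0;
}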
12. Assemble the data to be written on a per-object basis:
...
for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
  ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
                 << " from " << p->buffer_extents << dendl;
  // assemble extent
  bufferlist bl;
  for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
       q != p->buffer_extents.end();
       ++q) {
    bl.append(buf + q->first, q->second);
  }
...
13. Create the completion callback object for the asynchronous write request:
C_AioWrite *req_comp = new C_AioWrite(cct, c);
14. rbd supports both cached and non-cached write paths:
if (ictx->object_cacher) {
  c->add_request();
  ictx->write_to_cache(p->oid, bl, p->length, p->offset, req_comp, op_flags);
} else {
  AioWrite *req = new AioWrite(ictx, p->oid.name, p->objectno, p->offset,
                               bl, snapc, req_comp);
  c->add_request();
  req->set_op_flags(op_flags);
  req->send();
}
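ictx->object_cacher is only instantiated when the client-side rbd cache is enabled, so which branch runs here is ultimately a configuration decision. For example, in ceph.conf (illustrative snippet):

[client]
    rbd cache = true   # create the ObjectCacher so writes go through write_to_cache()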
15. Taking the non-cached write path as the example, create the asynchronous write request object:
AioWrite *req = new AioWrite(ictx, p->oid.name, p->objectno, p->offset,
bl, snapc, req_comp);
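req->send() ultimately turns into an asynchronous object write against RADOS. A rough, hypothetical equivalent expressed through the public librados API is shown below; the object name, offset, bufferlist, and io_ctx are placeholders standing in for the values computed above, and the real request additionally carries the snapshot context and op flags:

std::string oid = "rbd_data.<image id>.0000000000000002";   // placeholder data-object name
uint64_t obj_off = 0;                                        // offset within that object

librados::ObjectWriteOperation wr_op;
wr_op.write(obj_off, bl);                                    // write the assembled extent data

librados::AioCompletion *rc = librados::Rados::aio_create_completion();
io_ctx.aio_operate(oid, rc, &wr_op);                         // hand the operation to the OSDs
rc->wait_for_complete();
rc->release();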
To be continued...