Ceph 0.94.1
QEMU 2.4.0
block/rbd.c
In qemu_rbd_open, an important data structure is set up: struct BDRVRBDState, which holds the state the rest of the driver relies on.
typedef struct BDRVRBDState {
    rados_t cluster;                      /* handle to the cluster */
    rados_ioctx_t io_ctx;                 /* IO context within the cluster */
    rbd_image_t image;                    /* handle to the opened rbd image */
    char name[RBD_MAX_IMAGE_NAME_SIZE];   /* name of the rbd image */
    char *snap;                           /* name of the rbd image snapshot */
} BDRVRBDState;
Analysis of the qemu_rbd_open function:
static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
{
    BDRVRBDState *s = bs->opaque;
    char pool[RBD_MAX_POOL_NAME_SIZE];
    char snap_buf[RBD_MAX_SNAP_NAME_SIZE];
    char conf[RBD_MAX_CONF_SIZE];
    char clientname_buf[RBD_MAX_CONF_SIZE];
    char *clientname;
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;
    int r;

    /* This part handles the runtime option parsing */
    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        qemu_opts_del(opts);
        return -EINVAL;
    }

    filename = qemu_opt_get(opts, "filename");

    /* Parse the filename and fill in the individual fields */
    if (qemu_rbd_parsename(filename, pool, sizeof(pool),
                           snap_buf, sizeof(snap_buf),
                           s->name, sizeof(s->name),
                           conf, sizeof(conf), errp) < 0) {
        r = -EINVAL;
        goto failed_opts;
    }
    /*
     * pool:     name of the pool
     * snap_buf: name of the image snapshot
     * s->name:  name of the rbd image
     * conf:     configuration string
     */

    clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
    /* clientname_buf: name of the client */

    r = rados_create(&s->cluster, clientname);   /* create the cluster handle */
    if (r < 0) {
        error_setg(errp, "error initializing");
        goto failed_opts;
    }

    /* Copy the contents of snap_buf into s->snap */
    s->snap = NULL;
    if (snap_buf[0] != '\0') {
        s->snap = g_strdup(snap_buf);
    }

    /* Read the configuration file; fall back to the default location
     * if the user did not specify one */
    if (strstr(conf, "conf=") == NULL) {
        /* try default location, but ignore failure */
        rados_conf_read_file(s->cluster, NULL);
    } else if (conf[0] != '\0') {
        r = qemu_rbd_set_conf(s->cluster, conf, true, errp);
        if (r < 0) {
            goto failed_shutdown;
        }
    }

    /* Apply the remaining key/value options; the pass above only
     * read the configuration file */
    if (conf[0] != '\0') {
        r = qemu_rbd_set_conf(s->cluster, conf, false, errp);
        if (r < 0) {
            goto failed_shutdown;
        }
    }

    /*
     * Fallback to more conservative semantics if setting cache
     * options fails. Ignore errors from setting rbd_cache because the
     * only possible error is that the option does not exist, and
     * librbd defaults to no caching. If write through caching cannot
     * be set up, fall back to no caching.
     */
    /* Configure the cache behaviour */
    if (flags & BDRV_O_NOCACHE) {
        rados_conf_set(s->cluster, "rbd_cache", "false");
    } else {
        rados_conf_set(s->cluster, "rbd_cache", "true");
    }

    r = rados_connect(s->cluster);               /* connect to the cluster */
    if (r < 0) {
        error_setg(errp, "error connecting");
        goto failed_shutdown;
    }

    r = rados_ioctx_create(s->cluster, pool, &s->io_ctx);   /* create the IO context */
    if (r < 0) {
        error_setg(errp, "error opening pool %s", pool);
        goto failed_shutdown;
    }

    /* With the IO context in hand, open the rbd image */
    r = rbd_open(s->io_ctx, s->name, &s->image, s->snap);
    if (r < 0) {
        error_setg(errp, "error reading header from %s", s->name);
        goto failed_open;
    }

    bs->read_only = (s->snap != NULL);

    qemu_opts_del(opts);
    return 0;

failed_open:
    rados_ioctx_destroy(s->io_ctx);
failed_shutdown:
    rados_shutdown(s->cluster);
    g_free(s->snap);
failed_opts:
    qemu_opts_del(opts);
    return r;
}
As the code shows, once qemu_rbd_open has finished, all the information about the rbd image lives in bs->opaque, where the other driver callbacks can reach it.
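For example, qemu_rbd_getlength in the same file (reproduced here slightly simplified) simply casts bs->opaque back to a BDRVRBDState and queries the image handle:

static int64_t qemu_rbd_getlength(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;   /* the state filled in by qemu_rbd_open */
    rbd_image_info_t info;
    int r;

    r = rbd_stat(s->image, &info, sizeof(info));
    if (r < 0) {
        return r;
    }

    return info.size;   /* size of the image in bytes */
}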
The qemu_rbd_close function is very simple, so it is not reproduced here.
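In outline it just undoes what qemu_rbd_open set up; a rough sketch (not the verbatim source) looks like this:

static void qemu_rbd_close(BlockDriverState *bs)
{
    BDRVRBDState *s = bs->opaque;

    rbd_close(s->image);              /* release the image handle */
    rados_ioctx_destroy(s->io_ctx);   /* drop the IO context */
    g_free(s->snap);
    rados_shutdown(s->cluster);       /* disconnect from the cluster */
}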
QEMU appears to issue all of its rbd reads and writes asynchronously. The read, write and flush functions are analysed below:
/* Asynchronous read */
static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
                                      int64_t sector_num,
                                      QEMUIOVector *qiov,
                                      int nb_sectors,
                                      BlockCompletionFunc *cb,
                                      void *opaque)
{
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
                         RBD_AIO_READ);
}

/* Asynchronous write */
static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
                                       int64_t sector_num,
                                       QEMUIOVector *qiov,
                                       int nb_sectors,
                                       BlockCompletionFunc *cb,
                                       void *opaque)
{
    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
                         RBD_AIO_WRITE);
}

/* Flush */
#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
                                      BlockCompletionFunc *cb,
                                      void *opaque)
{
    return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
}
#else
static int qemu_rbd_co_flush(BlockDriverState *bs)
{
#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1)
    /* rbd_flush added in 0.1.1 */
    BDRVRBDState *s = bs->opaque;
    return rbd_flush(s->image);
#else
    return 0;
#endif
}
#endif

#ifdef LIBRBD_SUPPORTS_DISCARD
static BlockAIOCB *qemu_rbd_aio_discard(BlockDriverState *bs,
                                        int64_t sector_num,
                                        int nb_sectors,
                                        BlockCompletionFunc *cb,
                                        void *opaque)
{
    return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque,
                         RBD_AIO_DISCARD);
}
#endif
As we can see, reads, writes and flushes all go through rbd_start_aio (the flush case only when librbd supports asynchronous flush). So what does rbd_start_aio itself look like?
/*
 * Start an asynchronous IO operation.
 * In the parameter list, sector_num, qiov, nb_sectors and cb all describe
 * the guest disk request being issued by QEMU.
 */
static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
                                 int64_t sector_num,
                                 QEMUIOVector *qiov,
                                 int nb_sectors,
                                 BlockCompletionFunc *cb,
                                 void *opaque,
                                 RBDAIOCmd cmd)
{
    RBDAIOCB *acb;
    RADOSCB *rcb = NULL;
    rbd_completion_t c;
    int64_t off, size;
    char *buf;
    int r;

    BDRVRBDState *s = bs->opaque;

    acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
    acb->cmd = cmd;
    acb->qiov = qiov;
    if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
        acb->bounce = NULL;
    } else {
        acb->bounce = qemu_try_blockalign(bs, qiov->size);
        if (acb->bounce == NULL) {
            goto failed;
        }
    }
    acb->ret = 0;
    acb->error = 0;
    acb->s = s;
    acb->bh = NULL;

    if (cmd == RBD_AIO_WRITE) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
    }

    buf = acb->bounce;

    off = sector_num * BDRV_SECTOR_SIZE;
    size = nb_sectors * BDRV_SECTOR_SIZE;

    rcb = g_new(RADOSCB, 1);
    rcb->acb = acb;
    rcb->buf = buf;
    rcb->s = acb->s;
    rcb->size = size;
    r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c);
    if (r < 0) {
        goto failed;
    }
    /*
     * acb and rcb describe this particular disk IO: the starting offset,
     * the transfer size, the bounce buffer, and so on. The code above
     * fills them in.
     */

    /* Dispatch the operation on the image according to the command type */
    switch (cmd) {
    case RBD_AIO_WRITE:
        r = rbd_aio_write(s->image, off, size, buf, c);
        break;
    case RBD_AIO_READ:
        r = rbd_aio_read(s->image, off, size, buf, c);
        break;
    case RBD_AIO_DISCARD:
        r = rbd_aio_discard_wrapper(s->image, off, size, c);
        break;
    case RBD_AIO_FLUSH:
        r = rbd_aio_flush_wrapper(s->image, c);
        break;
    default:
        r = -EINVAL;
    }

    if (r < 0) {
        goto failed_completion;
    }

    return &acb->common;

failed_completion:
    rbd_aio_release(c);
failed:
    g_free(rcb);
    qemu_vfree(acb->bounce);
    qemu_aio_unref(acb);
    return NULL;
}
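The completion side is worth a look as well. librbd calls rbd_finish_aiocb from one of its own threads; roughly (this is a from-memory sketch of the QEMU 2.4 code, not a verbatim copy), it records the return value and hands the rest of the work back to QEMU's AIO context via a bottom half:

/* Sketch of the completion callback; the exact 2.4.0 source may differ slightly. */
static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
{
    RBDAIOCB *acb = rcb->acb;

    rcb->ret = rbd_aio_get_return_value(c);   /* result of the librbd operation */
    rbd_aio_release(c);

    /* Finish up in QEMU's own AIO context rather than in librbd's thread */
    acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs),
                         rbd_finish_bh, rcb);
    qemu_bh_schedule(acb->bh);
}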
Throughout this walk through rbd.c we can see that every request, whether a read or a write, ultimately turns into an operation on the image handle.
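For reference, the image-level entry points used above are declared in src/include/rbd/librbd.h roughly as follows (quoted from memory, so verify against the 0.94.1 header):

CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
                               const char *buf, rbd_completion_t c);
CEPH_RBD_API int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
                              char *buf, rbd_completion_t c);
CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
                                 rbd_completion_t c);
CEPH_RBD_API int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);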
So let us look at how rbd_open uses the IO context and the snapshot name to open an rbd image.
The Ceph documentation does not describe how to use the C/C++ flavour of librbd, which is a real shame. The only option is to work it out by reading the QEMU and Ceph sources side by side.
Ceph source version: 0.94.1
The C interface of librbd is declared in src/include/rbd/librbd.h.
rbd_open is declared as follows:
CEPH_RBD_API int rbd_open(rados_ioctx_t io, const char *name, rbd_image_t *image, const char *snap_name);
and rbd_close as follows:
CEPH_RBD_API int rbd_close(rbd_image_t image);
Our goal is to connect a single image to two pools, an SSD pool and an HDD pool, while modifying the source code as little as possible. My current idea is:
keep a notion of the image's current default pool and map the image to that pool; the pool behind the image may then change at run time, which means the code paths that read and write the image do not have to be rewritten.
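To make the idea concrete, here is a purely hypothetical sketch on the QEMU side; none of these fields or helpers exist in the real sources, they only illustrate the "one image, two pools, one current pointer" shape:

/* Hypothetical extension of BDRVRBDState -- not part of QEMU or Ceph. */
typedef struct BDRVRBDStateTwoPools {
    rados_t cluster;
    rados_ioctx_t io_ctx_ssd;     /* IO context for the SSD pool */
    rados_ioctx_t io_ctx_hdd;     /* IO context for the HDD pool */
    rados_ioctx_t *io_ctx_cur;    /* whichever pool the image currently lives in */
    rbd_image_t image;
    char name[RBD_MAX_IMAGE_NAME_SIZE];
    char *snap;
} BDRVRBDStateTwoPools;

/* Switching the default pool then only touches this one pointer; the
 * read/write paths that always go through io_ctx_cur stay unchanged. */
static void switch_to_ssd(BDRVRBDStateTwoPools *s)
{
    s->io_ctx_cur = &s->io_ctx_ssd;
}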
The functions declared in librbd.h are defined in src/librbd/librbd.cc:
extern "C" int rbd_open(rados_ioctx_t p, const char *name, rbd_image_t *image, const char *snap_name) { librados::IoCtx io_ctx; librados::IoCtx::from_rados_ioctx_t(p, io_ctx); librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, false); //创建镜像上下文 tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); //创建回溯点? int r = librbd::open_image(ictx); //打开rbd镜像 if (r >= 0) *image = (rbd_image_t)ictx; 将镜像上下文赋给image,提供给函数的调用者使用 tracepoint(librbd, open_image_exit, r); return r; } extern "C" int rbd_close(rbd_image_t image) { librbd::ImageCtx *ctx = (librbd::ImageCtx *)image; tracepoint(librbd, close_image_enter, ctx, ctx->name.c_str(), ctx->id.c_str()); librbd::close_image(ctx); tracepoint(librbd, close_image_exit); return 0; }