5.4 QEMU block & AIO


5.4.1 QEMU block driver

(1) Block driver registration

bdrv_register is used to register a block driver; raw-posix.c implements the raw block driver.

block_init(bdrv_file_init); block_init is the common registration entry point.

bdrv_file_init ==> bdrv_register(&bdrv_file);

void bdrv_register(BlockDriver *bdrv)
{
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        if (!bdrv->bdrv_aio_readv) {
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

bdrv_co_readv/bdrv_co_writev are the low-level read/write implementation functions.

static BlockDriver bdrv_file = {
    .format_name = "file",
    .protocol_name = "file",
    .instance_size = sizeof(BDRVRawState),
    .bdrv_probe = NULL, /* no probe for protocols */
    .bdrv_file_open = raw_open,
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit = raw_reopen_commit,
    .bdrv_reopen_abort = raw_reopen_abort,
    .bdrv_close = raw_close,
    .bdrv_create = raw_create,
    .bdrv_co_discard = raw_co_discard,
    .bdrv_co_is_allocated = raw_co_is_allocated,

    .bdrv_aio_readv = raw_aio_readv,
    .bdrv_aio_writev = raw_aio_writev,
    .bdrv_aio_flush = raw_aio_flush,

    .bdrv_truncate = raw_truncate,
    .bdrv_getlength = raw_getlength,
    .bdrv_get_allocated_file_size = raw_get_allocated_file_size,

    .create_options = raw_create_options,
};

bdrv_file reads from and writes to the backing file directly.

 

Another commonly used block driver is bdrv_qcow2, which implements the qcow2 image file format used for KVM virtual machine disks.

 

(2) Block driver matching

Section 5.2 traced the path main (vl.c) ==> drive_init_func ==> drive_init (blockdev.c); this section analyzes the flow of that function.

a)  Read the block device options with qemu_opt_get_number and related helpers, for example:

b)  file = qemu_opt_get(opts, "file"); then check the validity of these parameters and initialize the BlockIOLimit io_limits.

c)  If the user specified a format on the command line, derive drv from it:
drv = bdrv_find_whitelisted_format(buf); otherwise drv stays NULL. We analyze the NULL case here.

d)  Finally, store these parameters into DriveInfo *dinfo:
QTAILQ_INSERT_TAIL(&drives, dinfo, next);
bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
bdrv_set_io_limits(dinfo->bdrv, &io_limits);

e)  Open drv according to the file parameter:
ret = bdrv_open(dinfo->bdrv, file, bdrv_flags, drv);

Inside bdrv_open:

a)  If BDRV_O_SNAPSHOT is specified, handle that case first.

b)  When drv was not specified, call find_image_format to determine drv.

find_image_format:

If the device is a SCSI-generic device or has no medium inserted, return the raw driver directly:

if (bs->sg || !bdrv_is_inserted(bs)) {
    bdrv_delete(bs);
    drv = bdrv_find_format("raw");
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

Otherwise, read the beginning of the file and probe every registered driver:

QLIST_FOREACH(drv1, &bdrv_drivers, list) {
    if (drv1->bdrv_probe) {
        score = drv1->bdrv_probe(buf, ret, filename);
        if (score > score_max) {
            score_max = score;
            drv = drv1;
        }
    }
}

The driver with the highest probe score is chosen as the block driver.
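To make the scoring concrete, here is a hypothetical, self-contained probe function written in the style of qcow2's: it inspects the magic bytes at the start of the image and returns a high score on a match. The helper name and macro below are illustrative, not the exact QEMU source.

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>   /* ntohl */

/* "QFI\xfb" - the qcow2 magic number */
#define EXAMPLE_QCOW_MAGIC ((uint32_t)(('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb))

/* Hypothetical probe: look at the image header and return a confidence
 * score; find_image_format() keeps the driver with the highest score. */
static int example_qcow2_probe(const uint8_t *buf, int buf_size,
                               const char *filename)
{
    uint32_t magic, version;

    (void)filename;              /* format probes usually ignore the name */
    if (buf_size < 8) {
        return 0;
    }
    memcpy(&magic, buf, 4);
    memcpy(&version, buf + 4, 4);
    if (ntohl(magic) == EXAMPLE_QCOW_MAGIC && ntohl(version) >= 2) {
        return 100;              /* strong match: this is a qcow2 image */
    }
    return 0;                    /* no match: let other drivers try */
}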

 

5.4.2 Block driver read/write

Let us first review how IDE DMA reads and writes are issued:

ide_dma_cb (hw/ide/core.c) ==>
s->bus->dma->aiocb = dma_bdrv_read(s->bs, &s->sg, sector_num, ide_dma_cb, s);

dma_bdrv_read ==> return dma_bdrv_io(bs, sg, sector, bdrv_aio_readv, cb, opaque,
                                     DMA_DIRECTION_FROM_DEVICE); ==> dma_bdrv_cb

 

dma_bdrv_io ==> dbs = qemu_aio_get   // dbs->cb = ide_dma_cb
           ==> dma_bdrv_cb(dbs, 0) ==>
dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov,
                        dbs->iov.size / 512, dma_bdrv_cb, dbs);
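For reference, dma_bdrv_io in dma-helpers.c of this QEMU generation looks roughly like the sketch below (field and pool names are from memory and may differ slightly between versions): it allocates the DMAAIOCB, records the scatter-gather list and the I/O function, and kicks off the first dma_bdrv_cb.

BlockDriverAIOCB *dma_bdrv_io(
    BlockDriverState *bs, QEMUSGList *sg, uint64_t sector_num,
    DMAIOFunc *io_func, BlockDriverCompletionFunc *cb,
    void *opaque, DMADirection dir)
{
    /* cb == ide_dma_cb for the IDE example above */
    DMAAIOCB *dbs = qemu_aio_get(&dma_aio_pool, bs, cb, opaque);

    dbs->acb = NULL;
    dbs->bs = bs;
    dbs->sg = sg;
    dbs->sector_num = sector_num;
    dbs->sg_cur_index = 0;
    dbs->sg_cur_byte = 0;
    dbs->dir = dir;
    dbs->io_func = io_func;        /* bdrv_aio_readv or bdrv_aio_writev */
    dbs->bh = NULL;
    qemu_iovec_init(&dbs->iov, sg->nsg);
    dma_bdrv_cb(dbs, 0);           /* map the first sg entries and submit */
    return &dbs->common;
}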

 

Here io_func = bdrv_aio_readv, which goes through bdrv_co_aio_rw_vector:

    a. acb = qemu_aio_get   // acb->cb = dma_bdrv_cb
    b. co = qemu_coroutine_create(bdrv_co_do_rw);
       qemu_coroutine_enter(co, acb);

A coroutine is a lightweight cooperative-execution mechanism (QEMU implements it with fibers on Windows and with ucontext or threads on Linux). bdrv_co_do_rw is the function body that runs inside the coroutine, and qemu_coroutine_enter starts executing it.
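As a rough illustration of the coroutine API used here (a sketch against the QEMU 1.x coroutine header, not buildable outside the QEMU tree):

#include "qemu-coroutine.h"   /* qemu/coroutine.h in newer trees */

/* The entry function runs inside the coroutine and can yield at any point,
 * e.g. while an asynchronous request is in flight. */
static void coroutine_fn example_co_fn(void *opaque)
{
    int *step = opaque;

    *step = 1;
    qemu_coroutine_yield();   /* suspend: control returns to the caller */
    *step = 2;                /* resumed by the next qemu_coroutine_enter() */
}

static void example_run(void)
{
    int step = 0;
    Coroutine *co = qemu_coroutine_create(example_co_fn);

    qemu_coroutine_enter(co, &step);  /* runs until the yield; step == 1 */
    qemu_coroutine_enter(co, NULL);   /* resumes after the yield; step == 2 */
}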

 

static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

 

bdrv_co_do_readv/bdrv_co_do_writev are the functions that start the asynchronous read/write.

A bottom half is used to deliver the completion callback: qemu_bh_new(bdrv_co_em_bh, acb);
when acb->bh runs, it executes bdrv_co_em_bh.
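For completeness, bdrv_co_em_bh in block.c of this era is roughly the following: it hands the result to the caller's callback (dma_bdrv_cb in our example) and releases the acb.

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    /* acb->common.cb == dma_bdrv_cb for the IDE DMA example */
    acb->common.cb(acb->common.opaque, acb->req.error);

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}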

 

bdrv_co_do_readv ==> drv->bdrv_co_readv. For bdrv_file this was set to bdrv_co_readv_em in
bdrv_register, and bdrv_co_readv_em ==> bdrv_co_io_em:

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }
    .......
}
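After submitting the request the coroutine yields until the request completes; bdrv_co_io_em_complete is the completion callback that re-enters it. From the same QEMU generation it is roughly:

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    /* wake the coroutine that is parked in bdrv_co_io_em() */
    qemu_coroutine_enter(co->coroutine, NULL);
}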

drv->bdrv_aio_readv = raw_aio_readv ==> raw_aio_submit ==> paio_submit, which submits the asynchronous I/O:

BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_paiocb *acb;

    acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
    acb->aio_type = type;
    acb->aio_fildes = fd;
    .......
    qemu_paio_submit(acb);
    return &acb->common;
}

static void qemu_paio_submit(struct qemu_paiocb *aiocb)
{
    aiocb->ret = -EINPROGRESS;
    aiocb->active = 0;
    mutex_lock(&lock);
    if (idle_threads == 0 && cur_threads < max_threads)
        spawn_thread();
    QTAILQ_INSERT_TAIL(&request_list, aiocb, node); /* add aiocb to the AIO request list */
    mutex_unlock(&lock);
    cond_signal(&cond);
}

The whole path forms a chain of callbacks: ide_dma_cb (stored in dbs->cb), dma_bdrv_cb (stored in acb->cb), and bdrv_co_io_em_complete (passed to raw_aio_readv). The next section analyzes how the asynchronous I/O is actually executed.

 

5.4.3 QEMU AIO framework

(1) spawn_thread

spawn_thread from the previous section ==> qemu_bh_schedule(new_thread_bh);

In the AIO framework's initialization function paio_init (posix-aio-compat.c):

    qemu_pipe(fds);
    s->rfd = fds[0];
    s->wfd = fds[1];    /* the AIO manager's fds are the two ends of a pipe */
    qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);

    QTAILQ_INIT(&request_list);    /* initialize the AIO request list */
    new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL);

 

posix_aio_read is the handler used to invoke the AIO completion callbacks.

 

qemu_aio_set_fd_handler sets up an AioHandler *node:
        node->io_read = posix_aio_read;
        node->io_write = NULL;
        node->io_flush = posix_aio_flush;
        node->opaque = s;
and registers the AIO manager's node.

 

spawn_thread_bh_fn ==> do_spawn_thread ==> thread_create(&thread_id, &attr, aio_thread, NULL);

So spawn_thread ultimately ends up running aio_thread.

 

(2) aio_thread

aio_thread is the AIO worker loop; a simplified sketch follows this list.

 a. Take an aiocb off the queue: QTAILQ_REMOVE(&request_list, aiocb, node);

 b. Call handle_aiocb_rw to perform the read/write:

handle_aiocb_rw ==> handle_aiocb_rw_linear ==>
    pread(aiocb->aio_fildes, buf + offset,
          aiocb->aio_nbytes - offset,
          aiocb->aio_offset + offset);

aio_fildes is the file descriptor of the block driver's backing file, on which the actual read/write is issued.

 c. Call posix_aio_notify_event to signal completion.
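A compressed sketch of the worker loop in posix-aio-compat.c (simplified from memory: the real code uses its own pthread wrappers, handles flush/ioctl requests, and lets idle threads exit after a timeout):

static void *aio_thread(void *unused)
{
    while (1) {
        struct qemu_paiocb *aiocb;
        ssize_t ret;

        pthread_mutex_lock(&lock);
        while (QTAILQ_EMPTY(&request_list)) {
            idle_threads++;
            pthread_cond_wait(&cond, &lock);   /* wait for qemu_paio_submit */
            idle_threads--;
        }
        aiocb = QTAILQ_FIRST(&request_list);
        QTAILQ_REMOVE(&request_list, aiocb, node);
        aiocb->active = 1;
        pthread_mutex_unlock(&lock);

        ret = handle_aiocb_rw(aiocb);      /* pread/pwrite on aiocb->aio_fildes */

        pthread_mutex_lock(&lock);
        aiocb->ret = ret;                  /* result picked up by posix_aio_read */
        pthread_mutex_unlock(&lock);

        posix_aio_notify_event();          /* write a byte to the pipe to wake
                                              the main loop */
    }
    return NULL;
}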

 

(3) Bottom-half execution

So how does qemu_bh_schedule actually work?

void qemu_bh_schedule(QEMUBH *bh)
{
    if (bh->scheduled)
        return;
    bh->scheduled = 1;
    bh->idle = 0;
    qemu_notify_event(); /* wake up the main loop, e.g. SetEvent(qemu_event_handle) on Windows */
}
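qemu_bh_new simply links a new bottom half into the global first_bh list that qemu_bh_poll walks below; in the pre-AioContext async.c it is roughly:

QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque)
{
    QEMUBH *bh;

    bh = g_malloc0(sizeof(QEMUBH));
    bh->cb = cb;            /* e.g. spawn_thread_bh_fn or bdrv_co_em_bh */
    bh->opaque = opaque;
    bh->next = first_bh;    /* push onto the global bottom-half list */
    first_bh = bh;
    return bh;
}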

 

main_loop (vl.c) ==> main_loop_wait ==>

a)  qemu_iohandler_fill fills the read/write select fd sets from io_handlers.

b)  os_host_main_loop_wait calls select.

c)  qemu_iohandler_poll calls the matching io_read/io_write for every fd that select marked ready.

d)  qemu_bh_poll (async.c) walks the bottom-half list and runs each scheduled bh's handler.

This is how spawn_thread_bh_fn ends up being called.

int qemu_bh_poll(void)
{
    .....
    for (bh = first_bh; bh; bh = next) {
        next = bh->next;
        if (!bh->deleted && bh->scheduled) {
            bh->scheduled = 0;
            if (!bh->idle)
                ret = 1;
            bh->idle = 0;
            bh->cb(bh->opaque);
        }
    }

    .......
}

 

Now let us see when the AIO fd handler gets called:

aio_thread ==> posix_aio_notify_event ==> write(posix_aio_state->wfd, &byte, sizeof(byte));

So when a read/write completes, a byte is written to the pipe; the select in os_host_main_loop_wait then marks posix_aio_state->rfd as ready, and qemu_iohandler_poll calls node->io_read, i.e. posix_aio_read.

posix_aio_read ==>
    *pacb = acb->next;
    /* call the completion callback registered for this acb */
    acb->common.cb(acb->common.opaque, ret);
    qemu_aio_release(acb);

For the read/write example above, bdrv_co_io_em_complete is called first; the bottom half scheduled in bdrv_co_do_rw then runs and invokes dma_bdrv_cb, which pushes the DMA operation forward until all the data has been processed. Its code is as follows:

static void dma_bdrv_cb(void *opaque, int ret)
{
    .....
    if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
        dma_complete(dbs, ret); /* all data transferred (or error): do not submit further AIO */
        return;
    }

    while (dbs->sg_cur_index < dbs->sg->nsg) {
        cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
        cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
        mem = dma_memory_map(dbs->sg->dma, cur_addr, &cur_len, dbs->dir);
        if (!mem)
            break;
        qemu_iovec_add(&dbs->iov, mem, cur_len);
        dbs->sg_cur_byte += cur_len;
        if (dbs->sg_cur_byte == dbs->sg->sg[dbs->sg_cur_index].len) {
            dbs->sg_cur_byte = 0;
            ++dbs->sg_cur_index;
        }
    }
    .....
    /* on the first call, or while data remains, issue the next AIO
       (io_func == bdrv_aio_readv, which leads back into bdrv_co_do_rw) */
    dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov,
                            dbs->iov.size / 512, dma_bdrv_cb, dbs);
}
