tapdisk vhd block device driver
/*
 * Driver descriptor for the "tapdisk_vhd" disk type: binds each td_*
 * callback slot of struct tap_disk to the corresponding VHD handler.
 */
struct tap_disk tapdisk_vhd = {
.disk_type = "tapdisk_vhd",
.flags = 0,
/* size of the driver's private state (struct vhd_state);
 * NOTE(review): presumably allocated per disk by the tapdisk core - confirm */
.private_data_size = sizeof(struct vhd_state),
/* open / close entry points */
.td_open = _vhd_open,
.td_close = _vhd_close,
/* read / write request entry points */
.td_queue_read = vhd_queue_read,
.td_queue_write = vhd_queue_write,
/* parent-image hooks */
.td_get_parent_id = vhd_get_parent_id,
.td_validate_parent = vhd_validate_parent,
/* debug hook */
.td_debug = vhd_debug,
};
/* Layout of a dynamic disk:
*
* +-------------------------------------------------+
* | Mirror image of HD footer (hd_ftr) (512 bytes) |
* +-------------------------------------------------+
* | Sparse drive header (dd_hdr) (1024 bytes) |
* +-------------------------------------------------+
* | BAT (Block allocation table) |
* | - Array of absolute sector offsets into the |
* | file (u32). |
* | - Rounded up to a sector boundary. |
* | - Unused entries are marked as 0xFFFFFFFF |
* | - max entries in dd_hdr->max_bat_size |
* +-------------------------------------------------+
* | Data Block 0 |
* | Bitmap (padded to 512 byte sector boundary) |
* | - each bit indicates whether the associated |
* | sector within this block is used. |
* | Data |
* | - power-of-two multiple of sectors. |
* | - default 2MB (4096 * 512) |
* | - Any entries with zero in bitmap should be |
* | zero on disk |
* +-------------------------------------------------+
* | Data Block 1 |
* +-------------------------------------------------+
* | ... |
* +-------------------------------------------------+
* | Data Block n |
* +-------------------------------------------------+
* | HD Footer (511 bytes) |
* +-------------------------------------------------+
*/
_vhd_open: _vhd_open -> __vhd_open -> vhd_open , 其中 vhd_open 通过 open 函数打开 vhd 文件同时把 fd 赋值给 vhd_context,之后调用 vhd_read_footer, vhd_read_header 等读取 vhd 信息。
_vhd_close: 调用 vhd_write_footer, vhd_write_batmap 来回写 vhd 的 BAT 和 Bitmap
vhd_debug : VHD 的 bitmap 是存放在 cache 中的,类似于文件系统的 cache 机制
vhd_queue_read : 首先调用 read_bitmap_cache,读取对应 block 的 bitmap。
如果 BAT 为空 (VHD_BM_BAT_CLEAR),调用 td_forward_request
如果 bitmap 所有位都为空 (VHD_BM_BIT_CLEAR), 调用 td_forward_request
(这两种情况可能表示vhd镜像的数据是无效的,所以不执行真正的IO)
如果 bitmap 不为空 (VHD_BM_BIT_SET),调用read_bitmap_cache_span,找到bitmap为1的sector个数,然后调用schedule_data_read去读取这些sector。
schedule_data_read 构造 td_request_t 请求,通过aio_read 发出真正的IO请求。
aio_read 调用 tapdisk_prep_tiocb, tapdisk_queue_tiocb 把 tiocb 结构请求放入tapdisk的队列中
如果 bitmap 没有在 cache 中,调用 schedule_bitmap_read 读取对应 block 中的 bitmap,读取成功之后会同时把 bitmap 写入 cache。
schedule_bitmap_read 里首先构造一个操作符为 VHD_OP_BITMAP_READ 的请求,调用aio_read去读取block对应的bitmap的内容,可以看到aio_read之后调用了lock_bitmap把bitmap设置为VHD_FLAG_BM_LOCKED状态,调用install_bitmap把 bitmap置入cache,最后把bitmap设置为VHD_FLAG_BM_READ_PENDING状态。
schedule_bitmap_read之后调用__vhd_queue_request,构造一个td_request_t的IO请求,并把该请求加入到vhd_bitmap->waiting队列中,之后再次lock该bitmap。下面就是等待bitmap读取完成了。
从下面的struct vhd_bitmap的定义可以知道waiting的含义:
/*
 * State kept for one bitmap: the committed copy (map), the working copy
 * (shadow), the transaction that flushes changes, and the request lists
 * that are blocked on this bitmap.
 */
struct vhd_bitmap {
/* NOTE(review): presumably the index of the data block this bitmap
 * describes - confirm against the callers */
u32 blk;
u64 seqno; /* lru sequence number */
/* state flags; NOTE(review): presumably values such as
 * VHD_FLAG_BM_LOCKED / VHD_FLAG_BM_READ_PENDING - confirm */
vhd_flag_t status;
char *map; /* map should only be modified
* in finish_bitmap_write */
char *shadow; /* in-memory bitmap changes are
* made to shadow and copied to
* map only after having been
* flushed to disk */
struct vhd_transaction tx; /* transaction data structure
* encapsulating data, bitmap,
* and bat writes */
struct vhd_req_list queue; /* data writes waiting for next
* transaction */
struct vhd_req_list waiting; /* pending requests that cannot
* be serviced until this bitmap
* is read from disk */
/* NOTE(review): presumably the request used for I/O on the bitmap
 * itself (bitmap read/write) - confirm */
struct vhd_request req;
};
如果 bitmap 处于 VHD_BM_READ_PENDING 状态,则只调用 __vhd_queue_request 等待 bitmap 的读取完毕
vhd_queue_write 的原理和 vhd_queue_read 基本类似,不多说了。
最后提一下 vhd_complete 函数:在 tapdisk queue 的机制中,vhd_complete 被作为 callback 函数传入 struct tiocb 结构,每次这个 iocb 的 IO 返回,都会调用事先注册好的 vhd_complete。我们看下这个函数究竟做了什么:
vhd_complete中最重要的一部分如下:
/*
 * Dispatch on the request's operation code: each VHD_OP_* value is
 * handed to its matching finish_* handler.
 */
switch (req->op) {
case VHD_OP_DATA_READ:
finish_data_read(req);
break;
case VHD_OP_DATA_WRITE:
finish_data_write(req);
break;
case VHD_OP_BITMAP_READ:
finish_bitmap_read(req);
break;
case VHD_OP_BITMAP_WRITE:
finish_bitmap_write(req);
break;
case VHD_OP_ZERO_BM_WRITE:
finish_zero_bm_write(req);
break;
case VHD_OP_BAT_WRITE:
finish_bat_write(req);
break;
default:
/* any op value not listed above is treated as a programming error */
ASSERT(0);
break;
}
根据request的操作符,执行不同的finish_xxxxx函数