通用块层简介
Linux中的通用块层是一个内核组件,它负责处理来自系统中的所有块设备访问,并将块设备的访问转换为请求下发到IO调度层。这个过程中会涉及到多种数据结构的转换,下面我们来讨论通用块层所涉及的数据结构以及通用块层所做的工作。
通用块层数据结构
注:本文所涉及的数据结构均为Linux 3.0内核中的数据结构
1、bio结构
bio描述符是通用块层的核心数据结构,它描述了块设备的IO操作,包含了IO操作所涉及的磁盘的存储区标识符,与IO操作相关的内存区的段信息等。bio数据结构如下:
/*
 * Linux 3.0 struct bio: core descriptor of one block-layer I/O operation —
 * a contiguous range of disk sectors plus the memory segments backing it.
 */
struct bio {
sector_t bi_sector; /* first disk sector of this block I/O */
struct bio *bi_next; /* next bio on the request queue */
struct block_device *bi_bdev; /* target block device descriptor */
unsigned long bi_flags; /* bio status flags */
unsigned long bi_rw; /* I/O operation flags (read/write, etc.) */
unsigned short bi_vcnt; /* number of segments in the bio_vec array (its in-use length) */
unsigned short bi_idx; /* current index into the bio_vec array */
/* Number of segments in this BIO after
 * physical address coalescing is performed.
 */
unsigned int bi_phys_segments; /* physical segments in the bio after merging */
unsigned int bi_size; /* bytes still to be transferred */
/*
 * To keep track of the max segment size, we account for the
 * sizes of the first and last mergeable segments in this bio.
 */
unsigned int bi_seg_front_size; /* used by the segment-merge algorithm */
unsigned int bi_seg_back_size; /* used by the segment-merge algorithm */
unsigned int bi_max_vecs; /* maximum number of segments the bio_vec array may hold */
unsigned int bi_comp_cpu; /* completion CPU */
atomic_t bi_cnt; /* pin count */
struct bio_vec *bi_io_vec; /* pointer to the first segment of this bio's bio_vec array */
bio_end_io_t *bi_end_io; /* method invoked when the bio's I/O completes */
void *bi_private; /* private data of the bio's owner */
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
bio_destructor_t *bi_destructor; /* destructor called when the bio is freed */
/*
 * We can inline a number of vecs at the end of the bio, to avoid
 * double allocations for a small number of bio_vecs. This member
 * MUST obviously be kept at the very end of the bio.
 */
struct bio_vec bi_inline_vecs[0];
};
bio中的每个段都是由bio_vec结构表示,其中各字段如下所示。bio中的bi_io_vec字段存放bio_vec数组中的第一个元素地址,bi_vcnt字段存放bio_vec数组当前的元素个数。
/* One segment of a bio: a contiguous run of data within a single page. */
struct bio_vec {
struct page *bv_page; /* page descriptor of the page frame holding the segment */
unsigned int bv_len; /* segment length, in bytes */
unsigned int bv_offset; /* byte offset of the segment's data within the page frame */
};
2、gendisk结构
gendisk描述一个磁盘或磁盘分区。磁盘是由通用块层处理的逻辑设备,通常一个磁盘对应一个硬件设备。gendisk具体字段如下所示:
/*
 * Linux 3.0 struct gendisk: generic-block-layer representation of one
 * whole disk (or disk-like logical device), including its partition table,
 * operations vector and request queue.
 */
struct gendisk {
/* major, first_minor and minors are input parameters only,
 * don't use directly. Use disk_devt() and disk_max_parts().
 */
int major; /* major number of the disk */
int first_minor; /* first minor number associated with the disk */
int minors; /* range of minor numbers associated with the disk */
char disk_name[DISK_NAME_LEN]; /* canonical disk name */
char *(*devnode)(struct gendisk *gd, mode_t *mode);
unsigned int events; /* supported events */
unsigned int async_events; /* async events, subset of all */
/* Array of pointers to partitions indexed by partno.
 * Protected with matching bdev lock but stat and other
 * non-critical accesses use RCU. Always access through
 * helpers.
 */
struct disk_part_tbl __rcu *part_tbl; /* partition table of the disk */
struct hd_struct part0; /* partition 0 (the whole-disk partition) */
const struct block_device_operations *fops; /* disk operation methods */
struct request_queue *queue; /* pointer to the disk's request queue */
void *private_data; /* private data of the block device driver */
int flags; /* disk type flags */
struct device *driverfs_dev; // FIXME: remove
struct kobject *slave_dir;
struct timer_rand_state *random;
atomic_t sync_io; /* RAID */
struct disk_events *ev;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *integrity;
#endif
int node_id;
};
gendisk结构中fops字段是指向磁盘操作方法结构体的指针,这些方法包括open、release、ioctl等,类似于字符设备驱动程序中的fops结构;
gendisk结构中part_tbl字段指向磁盘的分区表,分区表结构具体的字段如下:
/* RCU-protected partition table of a gendisk (accessed via part_tbl). */
struct disk_part_tbl {
struct rcu_head rcu_head; /* for deferred (RCU) freeing of the table */
int len; /* number of slots in the part[] array */
struct hd_struct __rcu *last_lookup; /* cache of the most recently looked-up partition */
struct hd_struct __rcu *part[]; /* partition pointers, indexed by partition number */
};
/* Descriptor of a single disk partition. */
struct hd_struct {
sector_t start_sect; /* starting sector of the partition on the disk */
sector_t nr_sects; /* number of sectors in the partition */
sector_t alignment_offset;
unsigned int discard_alignment;
struct device __dev; /* embedded device object representing the partition */
struct kobject *holder_dir;
int policy, partno; /* read-only policy flag; partition number */
struct partition_meta_info *info;
#ifdef CONFIG_FAIL_MAKE_REQUEST
int make_it_fail;
#endif
unsigned long stamp;
atomic_t in_flight[2]; /* in-flight request counters (reads/writes) */
#ifdef CONFIG_SMP
struct disk_stats __percpu *dkstats; /* per-CPU disk statistics */
#else
struct disk_stats dkstats;
#endif
atomic_t ref; /* reference count */
struct rcu_head rcu_head;
};
通用块层工作流程
本节以do_erase函数为例讨论Linux中当向通用块层提交了一个IO操作时通用块层的处理流程。
首先附上do_erase函数代码:
/*
 * logfs (Linux 3.0): erase nr_pages pages starting at device byte offset ofs
 * by writing the filesystem's pre-built erase page repeatedly. Builds one or
 * more bios (splitting when the request exceeds what one bio may carry) and
 * submits each with submit_bio(). Always returns 0; allocation failure is a
 * BUG_ON here.
 */
static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
size_t nr_pages)
{
struct logfs_super *super = logfs_super(sb);
struct bio *bio;
struct request_queue *q = bdev_get_queue(sb->s_bdev);
/* max pages per transfer: hardware sector limit converted to pages */
unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
int i;
if (max_pages > BIO_MAX_PAGES)
max_pages = BIO_MAX_PAGES;
bio = bio_alloc(GFP_NOFS, max_pages); // allocate the bio structure
BUG_ON(!bio);
for (i = 0; i < nr_pages; i++) {
if (i >= max_pages) { // request exceeds one bio's capacity: submit in pieces
/* Block layer cannot split bios :( */
bio->bi_vcnt = i; /* segments filled so far */
bio->bi_idx = 0;
bio->bi_size = i * PAGE_SIZE;
bio->bi_bdev = super->s_bdev;
bio->bi_sector = ofs >> 9; /* byte offset -> 512-byte sector */
bio->bi_private = sb;
bio->bi_end_io = erase_end_io;
atomic_inc(&super->s_pending_writes); /* account before submission */
submit_bio(WRITE, bio);
/* advance past the part just submitted and restart filling */
ofs += i * PAGE_SIZE;
index += i;
nr_pages -= i;
i = 0;
bio = bio_alloc(GFP_NOFS, max_pages); // allocate a fresh bio for the remainder
BUG_ON(!bio);
}
/* every segment points at the same pre-filled erase page */
bio->bi_io_vec[i].bv_page = super->s_erase_page;
bio->bi_io_vec[i].bv_len = PAGE_SIZE;
bio->bi_io_vec[i].bv_offset = 0;
}
/* submit the final (or only) bio covering the remaining nr_pages pages */
bio->bi_vcnt = nr_pages;
bio->bi_idx = 0;
bio->bi_size = nr_pages * PAGE_SIZE;
bio->bi_bdev = super->s_bdev;
bio->bi_sector = ofs >> 9;
bio->bi_private = sb;
bio->bi_end_io = erase_end_io;
atomic_inc(&super->s_pending_writes);
submit_bio(WRITE, bio);
return 0;
}
上述代码可有如下流程图解释:
图1 do_erase流程图
在do_erase函数中,如果向磁盘请求的数据大小大于一次bio操作允许的最大值(i>max_pages),则会将磁盘数据请求分成多个bio进行,先完善并提交当前bio,然后申请新的bio结构并将剩余的数据请求填充到新的bio中。
接下来讨论一下提交bio请求函数submit_bio,源代码如下:
/*
 * submit_bio (Linux 3.0): entry point for handing a fully built bio to the
 * block layer. Merges the caller's rw flags into the bio, performs VM/task
 * I/O accounting for data-carrying, non-discard requests (plus optional
 * block_dump debug logging), then passes the bio to generic_make_request().
 */
void submit_bio(int rw, struct bio *bio)
{
int count = bio_sectors(bio); /* size of the transfer in sectors */
bio->bi_rw |= rw; /* fold the caller's direction/flags into the bio */
/*
 * If it's a regular read/write or a barrier with data attached,
 * go through the normal accounting stuff before submission.
 */
if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
} else {
task_io_account_read(bio->bi_size);
count_vm_events(PGPGIN, count);
}
/* block_dump: optional debug trace of every submitted request */
if (unlikely(block_dump)) {
char b[BDEVNAME_SIZE];
printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
current->comm, task_pid_nr(current),
(rw & WRITE) ? "WRITE" : "READ",
(unsigned long long)bio->bi_sector,
bdevname(bio->bi_bdev, b),
count);
}
}
generic_make_request(bio);
}
submit_bio完善一下bio信息后会调用generic_make_request函数提交bio。
generic_make_request函数源代码如下:
/*
 * generic_make_request (Linux 3.0): deliver a bio towards the device's
 * make_request_fn. Uses a per-task bio list (current->bio_list) to flatten
 * recursion: if a make_request handler re-enters this function, the new bio
 * is merely queued on the active list and processed later by the outermost
 * invocation's loop, keeping stack depth bounded.
 */
void generic_make_request(struct bio *bio)
{
struct bio_list bio_list_on_stack;
if (current->bio_list) {
/* make_request is active */
bio_list_add(current->bio_list, bio); /* defer to the outer invocation */
return;
}
BUG_ON(bio->bi_next); /* bio must not already be linked into a list */
/* outermost call: activate an on-stack list and drain it to completion */
bio_list_init(&bio_list_on_stack);
current->bio_list = &bio_list_on_stack;
do {
__generic_make_request(bio);
bio = bio_list_pop(current->bio_list); /* bios queued by recursive calls */
} while (bio);
current->bio_list = NULL; /* deactivate */
}
generic_make_request函数将bio连接到current->bio_list链表中,并调用__generic_make_request函数提交链表中所有的bio。__generic_make_request函数最终会调用块设备的请求队列中的make_request_fn成员函数将bio请求发送给I/O调度层,至此对磁盘的数据请求离开通用块层,进入下一层——I/O调度层。
通用块层总结
综上,一个磁盘数据请求在通用块层经过的流程为:
- 上层下发磁盘数据请求
- 通用块层申请bio结构,将请求的数据分段记录到bio中
- 如果请求的数据大于一个bio允许的最大数据量,则将请求分成多个bio
- 调用submit_bio提交bio请求
- submit_bio函数经过层层调用,最终调用块设备请求队列中的make_request_fn成员函数将bio提交给I/O调度层进行处理