块设备的特点是其平均访问时间较长,因此为了提高块设备的访问效率,Linux内核用了很多的笔墨来设计和块设备相关的部分,这样一来,从代码的角度来看,访问一个文件的过程变得尤其漫长……整个路径基本可以概括为:虚拟文件系统 --> 块设备上的实际文件系统 --> 通用块层 --> I/O scheduler --> 块设备驱动程序。为了提高块设备的访问效率,内核主要在两个方面下功夫:
1.引入缓存,当用户空间要访问文件时,内核不可能每次都去访问块设备,内核会将块设备的内容读取到内存中,以便下次访问时可以直接在内存中找到相应的内容,这其中又涉及到了预读等相关的问题,当然这不是现在关注的重点……
2.对于I/O请求的重排列,I/O请求并不会立即被响应,而是会放在一个队列里进行一段延迟,以期能够和后来的I/O请求进行合并或者进行排序。因为像磁盘这样的块设备,其耗时主要是因为磁头的定位,因此内核会尽量保证磁头只往一个方向移动,而不是来回移动(可以和电梯的运作进行对比),简而言之,就是将存储介质上相邻的数据请求安排在一起,对于I/O请求的处理主要包括合并和排序,具体如何处理,由I/O scheduler决定。
首先来了解一个块设备是如何表示的。描述块设备的数据结构有两个,一个是struct block_device,用来描述一个块设备或者块设备的一个分区;另一个是struct gendisk,用来描述整个块设备的特性。对于一个包含多个分区的块设备,struct block_device结构有多个(主设备和每个分区各一个),而struct gendisk结构永远只有一个。
/*
 * Describes one block device, or one partition of a block device.
 * A disk with several partitions has several of these (one per partition
 * plus one for the whole disk), all pointing at the same struct gendisk
 * through bd_disk.
 */
struct block_device {
/* device number of this device (or partition) */
dev_t bd_dev; /* not a kdev_t - it's a search key */
/* inode paired with this block_device; bdget() fills it in */
struct inode * bd_inode; /* will die */
struct super_block * bd_super;
/* open count: __blkdev_get() increments it on every successful open */
int bd_openers;
struct mutex bd_mutex; /* open/close mutex */
/* inodes referring to this device; bd_acquire() links them in via i_devices */
struct list_head bd_inodes;
void * bd_holder;
int bd_holders;
#ifdef CONFIG_SYSFS
struct list_head bd_holder_list;
#endif
/* for a partition: the block_device of the whole disk; otherwise points to itself */
struct block_device * bd_contains;
unsigned bd_block_size;
/* partition information (hd_struct) for this device */
struct hd_struct * bd_part;
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
/* when non-zero, __blkdev_get() rescans the partitions while opening the whole device */
int bd_invalidated;
/* gendisk describing the whole device */
struct gendisk * bd_disk;
/* entry in the all_bdevs list (added by bdget()) */
struct list_head bd_list;
/*
* Private data. You must have bd_claim'ed the block_device
* to use this. NOTE: bd_claim allows an owner to claim
* the same device multiple times, the owner must take special
* care to not mess up bd_private for that case.
*/
unsigned long bd_private;
/* The counter of freeze processes */
int bd_fsfreeze_count;
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
};
bd_dev:该设备(分区)的设备号
bd_inode:指向该设备文件的inode
bd_openers:一个引用计数,记录了该块设备打开的次数,或者说有多少个进程打开了该设备
bd_contains:如果该block_device描述的是一个分区,则该变量指向描述主块设备的block_device,反之,其指向本身
bd_part:如果该block_device描述的是一个分区,则该变量指向分区的信息
bd_part_count:该计数保存在主设备的block_device中,记录了该设备上的分区被打开的次数(每当因为打开分区而附带打开主设备时加1),在进行分区的重新扫描前,要保证该计数值为0
bd_disk:指向描述整个设备的gendisk结构
/*
 * Describes a whole block device (disk). There is exactly one gendisk per
 * disk no matter how many partitions it carries; every block_device of the
 * disk refers to it through bd_disk.
 */
struct gendisk {
/* major, first_minor and minors are input parameters only,
* don't use directly. Use disk_devt() and disk_max_parts().
*/
int major; /* major number of driver */
int first_minor; /* first minor number used by this disk */
int minors; /* maximum number of minors, =1 for
* disks that can't be partitioned. */
char disk_name[DISK_NAME_LEN]; /* name of major driver */
char *(*devnode)(struct gendisk *gd, mode_t *mode);
/* Array of pointers to partitions indexed by partno.
* Protected with matching bdev lock but stat and other
* non-critical accesses use RCU. Always access through
* helpers.
*/
struct disk_part_tbl *part_tbl;
struct hd_struct part0;
/* device-specific low-level operations; __blkdev_get() calls fops->open when set */
const struct block_device_operations *fops;
/* request queue of the device: requests are queued here and handled by the I/O scheduler */
struct request_queue *queue;
void *private_data;
int flags; /* __blkdev_get() tests GENHD_FL_UP here before opening a partition */
struct device *driverfs_dev; // FIXME: remove
struct kobject *slave_dir;
struct timer_rand_state *random;
atomic_t sync_io; /* RAID */
struct work_struct async_notify;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *integrity;
#endif
int node_id;
};
major:块设备的主设备号
first_minor:起始次设备号
minors:描述了该块设备最多可以使用多少个次设备号,也就决定了它最多可以有多少个分区,如果minors为1,则表示该块设备不能分区
part_tbl:整个块设备的分区信息都包含在里面,其核心结构是一个struct hd_struct的指针数组,每一项都指向一个描述分区的hd_struct结构
fops:指向特定于设备的底层操作函数集
queue:块设备的请求队列,所有针对该设备的请求都会放入该请求队列中,经过I/O scheduler的处理再进行提交
块设备的分区信息由struct hd_struct结构描述,其中最重要的信息就是分区的起始扇区号和分区的大小。所有分区信息都一起保存在gendisk的part_tbl结构中,同时每个分区的block_device也可以通过bd_part来查询对应的分区信息。
下图描述了block_device,gendisk以及分区描述符之间的关系(块设备有两个分区)
下面通过打开一个块设备的过程,来理解这些结构之间的联系。
对于块设备文件的操作,通过bdev伪文件系统来完成,open操作定义的函数为blkdev_open()
blkdev_open的主要任务有两个
1.获取设备的block_device信息
2.从gendisk中读取相关信息保存到block_device,同时建立数据结构之间的联系
/*
 * open() handler for block device files.
 *
 * Translates the relevant open flags into f_mode bits, obtains the
 * block_device behind @inode, opens it through blkdev_get() and, for an
 * exclusive open, claims it with @filp as the holder.
 *
 * Returns 0 on success, -ENOMEM when no block_device could be obtained,
 * or the error reported by blkdev_get() / bd_claim().
 */
static int blkdev_open(struct inode * inode, struct file * filp)
{
	struct block_device *bdev;
	int err;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binary needs it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;

	/* Mirror the open flags we care about into f_mode. */
	if (filp->f_flags & O_NDELAY)
		filp->f_mode |= FMODE_NDELAY;
	if (filp->f_flags & O_EXCL)
		filp->f_mode |= FMODE_EXCL;
	if ((filp->f_flags & O_ACCMODE) == 3)
		filp->f_mode |= FMODE_WRITE_IOCTL;

	/* Get the block_device instance for this inode. */
	bdev = bd_acquire(inode);
	if (!bdev)
		return -ENOMEM;

	filp->f_mapping = bdev->bd_inode->i_mapping;

	/* Pull information from the gendisk and wire the structures together. */
	err = blkdev_get(bdev, filp->f_mode);
	if (err)
		return err;

	/* Nothing more to do unless the caller asked for exclusive access. */
	if (!(filp->f_mode & FMODE_EXCL))
		return 0;

	err = bd_claim(bdev, filp);
	if (!err)
		return 0;

	/* The claim failed: undo the blkdev_get() above. */
	blkdev_put(bdev, filp->f_mode);
	return err;
}
bd_acquire()负责获取block_device的实例
static struct block_device *bd_acquire(struct inode *inode)
{
struct block_device *bdev;
spin_lock(&bdev_lock);
bdev = inode->i_bdev;//如果这个设备之前被打开过则可以直接通过i_bdev获取
if (bdev) {
atomic_inc(&bdev->bd_inode->i_count);
spin_unlock(&bdev_lock);
return bdev;
}
spin_unlock(&bdev_lock);
bdev = bdget(inode->i_rdev);//通过设备号的信息来获取block device实例
if (bdev) {
spin_lock(&bdev_lock);
if (!inode->i_bdev) {
/*
* We take an additional bd_inode->i_count for inode,
* and it's released in clear_inode() of inode.
* So, we can access it via ->i_mapping always
* without igrab().
*/
atomic_inc(&bdev->bd_inode->i_count);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
list_add(&inode->i_devices, &bdev->bd_inodes);
}
spin_unlock(&bdev_lock);
}
return bdev;
}
/*
 * Look up (or create) the block_device for device number @dev.
 *
 * The inode hash is searched for an inode matching @dev; when none is found
 * iget5_locked() creates one on blockdev_superblock. That inode is embedded
 * in a bdev_inode together with the block_device, which is how the
 * block_device is reached from it.
 *
 * Returns the block_device, or NULL when no inode could be obtained.
 */
struct block_device *bdget(dev_t dev)
{
	struct inode *inode;
	struct block_device *bdev;

	inode = iget5_locked(blockdev_superblock, hash(dev),
			     bdev_test, bdev_set, &dev);
	if (inode == NULL)
		return NULL;

	/* inode -> enclosing bdev_inode -> block_device */
	bdev = &BDEV_I(inode)->bdev;

	/* An inode that is not new has been fully set up already. */
	if (!(inode->i_state & I_NEW))
		return bdev;

	/* Initialise the block_device side ... */
	bdev->bd_contains = NULL;
	bdev->bd_inode = inode;
	bdev->bd_block_size = (1 << inode->i_blkbits);
	bdev->bd_part_count = 0;
	bdev->bd_invalidated = 0;

	/* ... and the inode side. */
	inode->i_mode = S_IFBLK;
	inode->i_rdev = dev;
	inode->i_bdev = bdev;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);
	inode->i_data.backing_dev_info = &default_backing_dev_info;

	/* Add the new device to the all_bdevs list. */
	spin_lock(&bdev_lock);
	list_add(&bdev->bd_list, &all_bdevs);
	spin_unlock(&bdev_lock);

	unlock_new_inode(inode);
	return bdev;
}
blkdev_get()函数负责从gendisk中获取信息,并建立相关数据结构之间的联系
/*
 * Open @bdev with the access mode @mode.
 *
 * Thin wrapper around __blkdev_get() that always passes for_part == 0,
 * i.e. the open is for @bdev itself rather than the nested open of a whole
 * disk performed while one of its partitions is being opened.
 *
 * Returns whatever __blkdev_get() returns.
 */
int blkdev_get(struct block_device *bdev, fmode_t mode)
{
	const int for_part = 0;

	return __blkdev_get(bdev, mode, for_part);
}
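注意blkdev_get()调用__blkdev_get()时传递的最后一个参数for_part为0,表示这次打开是直接针对bdev本身的,而不是在打开某个分区时附带打开其主设备(后一种情况下__blkdev_get()会以for_part为1递归调用自身)。至于bdev对应的是主设备还是分区,则由get_gendisk()返回的partno来判断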
获取到gendisk之后会分四种情况进行处理,也就是针对设备是不是第一次打开以及打开的设备是主设备还是分区来进行不同的处理,具体见代码注释
/*
 * Open a block_device: find its gendisk, connect the two structures and
 * bump the open counters.
 *
 * @bdev:     block_device to open (whole device or partition).
 * @mode:     FMODE_* access mode; FMODE_READ/FMODE_WRITE are turned into
 *            MAY_READ/MAY_WRITE for the device-cgroup permission check.
 * @for_part: 0 for an open of @bdev itself (see blkdev_get()); 1 when this
 *            is the recursive open of the whole disk made below while a
 *            partition is being opened.
 *
 * Four cases are handled: first open vs. repeated open, each for a whole
 * device vs. a partition.
 *
 * Returns 0 on success or a negative errno. On every failure path the
 * function calls bdput(bdev) before returning.
 */
static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
struct gendisk *disk;
int ret;
int partno;
int perm = 0;
if (mode & FMODE_READ)
perm |= MAY_READ;
if (mode & FMODE_WRITE)
perm |= MAY_WRITE;
/*
* hooks: /n/, see "layering violations".
*/
/* The permission check is skipped for the nested whole-disk open. */
if (!for_part) {
ret = devcgroup_inode_permission(bdev->bd_inode, perm);
if (ret != 0) {
bdput(bdev);
return ret;
}
}
lock_kernel();
restart:
ret = -ENXIO;
// Look up the gendisk for this device number; get_gendisk() also fills in
// partno (treated below as 0 for the whole device, non-zero for a partition).
disk = get_gendisk(bdev->bd_dev, &partno);
if (!disk)
goto out_unlock_kernel;
/*
 * for_part is passed as the second argument of mutex_lock_nested();
 * presumably it is the lock subclass that distinguishes the whole disk's
 * bd_mutex from the partition's during the recursive call - confirm.
 */
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {// first open of this block_device
bdev->bd_disk = disk;// link the block_device to its gendisk
bdev->bd_contains = bdev;
if (!partno) {// partno == 0: opening the whole device, not a partition
struct backing_dev_info *bdi;
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);// look up the hd_struct for partno (0 here) in the gendisk
if (!bdev->bd_part)
goto out_clear;
if (disk->fops->open) {// the driver supplies an open method
ret = disk->fops->open(bdev, mode);// let the driver do its device-specific open
if (ret == -ERESTARTSYS) {
/* Lost a race with 'disk' being
* deleted, try again.
* See md.c
*/
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
module_put(disk->fops->owner);
put_disk(disk);
bdev->bd_disk = NULL;
mutex_unlock(&bdev->bd_mutex);
goto restart;
}
if (ret)
goto out_clear;
}
/*
 * NOTE(review): this looks redundant with the enclosing !bd_openers test
 * unless ->open() can change bd_openers - confirm.
 */
if (!bdev->bd_openers) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);// size taken from the gendisk capacity, shifted left by 9 (x512)
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
bdi = &default_backing_dev_info;
bdev->bd_inode->i_data.backing_dev_info = bdi;
}
// The partition information held by the kernel has been invalidated
// (the partitions on the device changed): rescan the partitions.
if (bdev->bd_invalidated)
rescan_partitions(disk, bdev);
} else {// opening a partition
struct block_device *whole;
whole = bdget_disk(disk, 0);// get the block_device of the whole disk (partition number 0)
ret = -ENOMEM;
if (!whole)
goto out_clear;
/* A nested (for_part) open must always be for the whole disk. */
BUG_ON(for_part);
/* Open the whole disk first, on behalf of this partition. */
ret = __blkdev_get(whole, mode, 1);
if (ret)
goto out_clear;
bdev->bd_contains = whole;// a partition's bd_contains points at the whole-disk block_device
bdev->bd_inode->i_data.backing_dev_info =
whole->bd_inode->i_data.backing_dev_info;
bdev->bd_part = disk_get_part(disk, partno);
/* Refuse if the disk is not up, or the partition is missing or empty. */
if (!(disk->flags & GENHD_FL_UP) ||
!bdev->bd_part || !bdev->bd_part->nr_sects) {
ret = -ENXIO;
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
}
} else {// not the first open
/*
 * bdev->bd_disk was already set by the first open, so what get_gendisk()
 * handed out above is dropped again (presumably a module reference and a
 * disk reference - confirm). disk is cleared so the error path below does
 * not call module_put() a second time.
 */
module_put(disk->fops->owner);
put_disk(disk);
disk = NULL;
if (bdev->bd_contains == bdev) {// the whole device is being opened
if (bdev->bd_disk->fops->open) {
ret = bdev->bd_disk->fops->open(bdev, mode);// call the driver's open
if (ret)
goto out_unlock_bdev;
}
if (bdev->bd_invalidated)
rescan_partitions(bdev->bd_disk, bdev);
}
}
bdev->bd_openers++;// one more opener
if (for_part)// opened on behalf of a partition: count that on the whole device too
bdev->bd_part_count++;
mutex_unlock(&bdev->bd_mutex);
unlock_kernel();
return 0;
/* Error unwinding: undo the links made during a failed first open. */
out_clear:
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
/* A partition that already opened its whole disk must release it again. */
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
out_unlock_kernel:
unlock_kernel();
/*
 * Only module_put() is guarded by the NULL test; put_disk() below runs
 * unconditionally, including with disk == NULL.
 * NOTE(review): relies on put_disk() accepting NULL - confirm.
 */
if (disk)
module_put(disk->fops->owner);
put_disk(disk);
bdput(bdev);
return ret;
}