Both sys_mount and sys_open-->blkdev_open involve the block_device structure; today we analyze sys_mount and the block device layer.
static int __init genhd_device_init(void)
{
int err;
bdev_map = kobj_map_init(base_probe, &block_subsys_lock);
blk_dev_init();
err = subsystem_register(&block_subsys);
if (err < 0)
printk(KERN_WARNING "%s: subsystem_register error: %d\n",
__FUNCTION__, err);
return err;
}
subsys_initcall(genhd_device_init);
This function is called from do_initcalls.
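For reference, the base_probe fallback installed above does nothing but try to load a driver module by device number; this is the code from block/genhd.c of this era:
static struct kobject *base_probe(dev_t dev, int *part, void *data)
{
	if (request_module("block-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0)
		/* Make old-style 2.4 aliases work */
		request_module("block-major-%d", MAJOR(dev));
	return NULL;
}
Next, kobj_map_init itself: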
struct kobj_map *kobj_map_init(kobj_probe_t *base_probe, struct mutex *lock)
{
struct kobj_map *p = kmalloc(sizeof(struct kobj_map), GFP_KERNEL);
struct probe *base = kzalloc(sizeof(*base), GFP_KERNEL);
int i;
if ((p == NULL) || (base == NULL)) {
…
}
base->dev = 1;
base->range = ~0;
base->get = base_probe;
for (i = 0; i < 255; i++)
p->probes[i] = base;
p->lock = lock;
return p;
}
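For reference, the structure being initialized here is defined in drivers/base/map.c; each of the 255 probe slots initially points at the single base probe:
struct kobj_map {
	struct probe {
		struct probe *next;
		dev_t dev;
		unsigned long range;
		struct module *owner;
		kobj_probe_t *get;
		int (*lock)(dev_t, void *);
		void *data;
	} *probes[255];
	struct mutex *lock;
};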
The character device side has a similar map, called cdev_map; here we are initializing the block device one. The point of this map is to tie devices into the unified device model as an easily searchable structure, organizing the data.
int __init blk_dev_init(void)
{
int i;
kblockd_workqueue = create_workqueue("kblockd");
if (!kblockd_workqueue)
panic("Failed to create kblockd/n");
request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL);
requestq_cachep = kmem_cache_create("blkdev_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
iocontext_cachep = kmem_cache_create("blkdev_ioc", sizeof(struct io_context), 0, SLAB_PANIC, NULL);
for_each_possible_cpu(i)
INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
register_hotcpu_notifier(&blk_cpu_notifier);
blk_max_low_pfn = max_low_pfn - 1;
blk_max_pfn = max_pfn - 1;
return 0;
}
All of the above is follow-up work; it presupposes that the block device pseudo-filesystem has already been initialized and mounted, so let's look at what happened even earlier:
start_kernel-->vfs_caches_init
void __init vfs_caches_init(unsigned long mempages)
{
unsigned long reserve;
reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1);
mempages -= reserve;
names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
dcache_init(mempages);
inode_init(mempages);
files_init(mempages);
mnt_init(mempages);
bdev_cache_init();
chrdev_init();
}
void __init bdev_cache_init(void)
{
int err;
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
bd_mnt = kern_mount(&bd_type);
err = PTR_ERR(bd_mnt);
blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
}
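The pseudo-filesystem type registered here is tiny (fs/block_dev.c):
static struct file_system_type bd_type = {
	.name		= "bdev",
	.get_sb		= bd_get_sb,
	.kill_sb	= kill_anon_super,
};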
Once the code above has run, filesystem work can begin, for example mounting a new block device. Below we analyze sys_mount.
sys_mount-->do_mount-->do_new_mount
A few things deserve explanation. In do_mount, retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd) fills the nameidata variable nd with the correct mount point information before do_new_mount is called; that nd is used later in do_add_mount(mnt, nd, mnt_flags, NULL).
static int do_new_mount(struct nameidata *nd, char *type, int flags,
int mnt_flags, char *name, void *data)
{
struct vfsmount *mnt;
if (!type || !memchr(type, 0, PAGE_SIZE))
return -EINVAL;
/* we need capabilities... */
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
mnt = do_kern_mount(type, flags, name, data);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
return do_add_mount(mnt, nd, mnt_flags, NULL);
}
struct vfsmount *do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
struct file_system_type *type = get_fs_type(fstype);
struct vfsmount *mnt;
mnt = vfs_kern_mount(type, flags, name, data);
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
!mnt->mnt_sb->s_subtype)
mnt = fs_set_subtype(mnt, fstype);
put_filesystem(type);
return mnt;
}
The core call is vfs_kern_mount, but before it comes the filesystem type lookup: struct file_system_type *type = get_fs_type(fstype).
struct file_system_type *get_fs_type(const char *name)
{
struct file_system_type *fs;
const char *dot = strchr(name, '.');
unsigned len = dot ? dot - name : strlen(name);
read_lock(&file_systems_lock);
fs = *(find_filesystem(name, len));
if (fs && !try_module_get(fs->owner))
fs = NULL;
read_unlock(&file_systems_lock);
if (!fs && (request_module("%.*s", len, name) == 0)) {
read_lock(&file_systems_lock);
fs = *(find_filesystem(name, len));
if (fs && !try_module_get(fs->owner))
fs = NULL;
read_unlock(&file_systems_lock);
}
if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
put_filesystem(fs);
fs = NULL;
}
return fs;
}
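What find_filesystem actually walks is the list built by register_filesystem; for ext2 the registered file_system_type looks like this (fs/ext2/super.c), and ext2's module init simply calls register_filesystem(&ext2_fs_type):
static struct file_system_type ext2_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ext2",
	.get_sb		= ext2_get_sb,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};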
get_fs_type is shown only to illustrate how harmoniously the kernel fits together and to tie up a few loose ends from previous days; it is not today's focus. request_module simply invokes the routine that calls up into user space. Now for the key function:
struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
struct vfsmount *mnt;
char *secdata = NULL;
int error;
error = -ENOMEM;
mnt = alloc_vfsmnt(name);
if (data) {
secdata = alloc_secdata();
error = security_sb_copy_data(type, data, secdata);
}
error = type->get_sb(type, flags, name, data, mnt);
error = security_sb_kern_mount(mnt->mnt_sb, secdata);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
up_write(&mnt->mnt_sb->s_umount);
free_secdata(secdata);
return mnt;
…
}
Here another call level appears: error = type->get_sb(type, flags, name, data, mnt). For mounting an ext2 filesystem this is ext2_get_sb:
static int ext2_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt);
}
//
int get_sb_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int),
struct vfsmount *mnt)
{
struct block_device *bdev;
struct super_block *s;
int error = 0;
bdev = open_bdev_excl(dev_name, flags, fs_type); /* analyzed shortly; important, because this is where we hook up to the low-level driver and the generic disk (gendisk) */
if (IS_ERR(bdev))
return PTR_ERR(bdev);
down(&bdev->bd_mount_sem);
s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); /* store bdev in the ext2 super_block; this bdev is already tied to a gendisk, since it was just opened */
up(&bdev->bd_mount_sem);
if (IS_ERR(s))
goto error_s;
if (s->s_root) {
if ((flags ^ s->s_flags) & MS_RDONLY) {
up_write(&s->s_umount);
deactivate_super(s);
error = -EBUSY;
goto error_bdev;
}
close_bdev_excl(bdev);
} else {
char b[BDEVNAME_SIZE];
s->s_flags = flags;
strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); /* invoke the callback that fills in the superblock */
…
s->s_flags |= MS_ACTIVE;
}
return simple_set_mnt(mnt, s);
…
}
Three related calls were annotated above: open_bdev_excl, sget, and fill_super. open_bdev_excl will be analyzed in a moment; for now it is enough to know that it yields a block_device pointer. First, a look at sget:
struct super_block *sget(struct file_system_type *type,
int (*test)(struct super_block *,void *),
int (*set)(struct super_block *,void *),
void *data)
{
struct super_block *s = NULL;
struct list_head *p;
int err;
retry:
spin_lock(&sb_lock);
if (test) {
list_for_each(p, &type->fs_supers) {
struct super_block *old;
old = list_entry(p, struct super_block, s_instances);
if (!test(old, data))
continue;
if (!grab_super(old))
goto retry;
if (s)
destroy_super(s);
return old;
}
}
if (!s) {
spin_unlock(&sb_lock);
s = alloc_super(type);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
}
err = set(s, data);
if (err) {
spin_unlock(&sb_lock);
destroy_super(s);
return ERR_PTR(err);
}
s->s_type = type;
strlcpy(s->s_id, type->name, sizeof(s->s_id));
list_add_tail(&s->s_list, &super_blocks);
list_add(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(type);
return s;
}
This function boils down to alloc_super plus the set callback. alloc_super allocates the superblock with kzalloc, initializes some default fields, and returns; sget then invokes the set callback to fill in fields specific to this case, here the block device. That callback is set_bdev_super:
static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
return 0;
}
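For completeness, the matching test callback that sget uses to detect a superblock already sitting on the same bdev (fs/super.c):
static int test_bdev_super(struct super_block *s, void *data)
{
	return (void *)s->s_bdev == data;
}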
Both are trivial. Continuing with get_sb_bdev: once sget is done, the function that really fills in the superblock is called:
static int ext2_fill_super(struct super_block *sb, void *data, int silent)
{
struct buffer_head * bh;
struct ext2_sb_info * sbi;
struct ext2_super_block * es;
struct inode *root;
unsigned long block;
unsigned long sb_block = get_sb_block(&data);
unsigned long logic_sb_block;
unsigned long offset = 0;
unsigned long def_mount_opts;
int blocksize = BLOCK_SIZE;
int db_count;
int i, j;
__le32 features;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
…
sb->s_fs_info = sbi;
blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
if (!blocksize) {
…
}
if (blocksize != BLOCK_SIZE) {
logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize;
offset = (sb_block*BLOCK_SIZE) % blocksize;
} else {
logic_sb_block = sb_block;
}
if (!(bh = sb_bread(sb, logic_sb_block))) {
…
}
es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
sbi->s_es = es;
sb->s_magic = le16_to_cpu(es->s_magic);
if (sb->s_magic != EXT2_SUPER_MAGIC)
goto cantfind_ext2;
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
…
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
MS_POSIXACL : 0);
ext2_xip_verify_sb(sb);
if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
(EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U)))
printk("EXT2-fs warning: feature flags set on rev 0 fs, "
"running e2fsck is recommended/n");
features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP);
if (features) {
…
}
if (!(sb->s_flags & MS_RDONLY) &&
(features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){
…
}
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
if ((ext2_use_xip(sb)) && ((blocksize != PAGE_SIZE) ||
(sb->s_blocksize != blocksize))) {
…
}
if (sb->s_blocksize != blocksize) {
brelse(bh);
if (!sb_set_blocksize(sb, blocksize)) {
…
logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize;
offset = (sb_block*BLOCK_SIZE) % blocksize;
bh = sb_bread(sb, logic_sb_block);
if(!bh) {
…
}
es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
sbi->s_es = es;
if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) {
printk ("EXT2-fs: Magic mismatch, very weird !/n");
goto failed_mount;
}
}
sb->s_maxbytes = ext2_max_size(sb->s_blocksize_bits);
if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV) {
sbi->s_inode_size = EXT2_GOOD_OLD_INODE_SIZE;
sbi->s_first_ino = EXT2_GOOD_OLD_FIRST_INO;
} else {
sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) ||
(sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
(sbi->s_inode_size > blocksize)) {
printk ("EXT2-fs: unsupported inode size: %d/n",
sbi->s_inode_size);
goto failed_mount;
}
}
sbi->s_frag_size = EXT2_MIN_FRAG_SIZE << le32_to_cpu(es->s_log_frag_size);
if (sbi->s_frag_size == 0)
goto cantfind_ext2;
sbi->s_frags_per_block = sb->s_blocksize / sbi->s_frag_size;
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
if (EXT2_INODE_SIZE(sb) == 0)
goto cantfind_ext2;
sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0)
goto cantfind_ext2;
sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block;
sbi->s_desc_per_block = sb->s_blocksize / sizeof (struct ext2_group_desc);
sbi->s_sbh = bh;
sbi->s_mount_state = le16_to_cpu(es->s_state);
sbi->s_addr_per_block_bits =ilog2 (EXT2_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2 (EXT2_DESC_PER_BLOCK(sb));
…
if (sb->s_blocksize != bh->b_size) {
…
}
sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
le32_to_cpu(es->s_first_data_block) - 1)
/ EXT2_BLOCKS_PER_GROUP(sb)) + 1;
db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
EXT2_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
…
bgl_lock_init(&sbi->s_blockgroup_lock);
sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
…
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logic_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
if (!sbi->s_group_desc[i]) {
for (j = 0; j < i; j++)
brelse (sbi->s_group_desc[j]);
printk ("EXT2-fs: unable to read group descriptors\n");
goto failed_mount_group_desc;
}
}
…
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
percpu_counter_init(&sbi->s_freeblocks_counter, ext2_count_free_blocks(sb));
percpu_counter_init(&sbi->s_freeinodes_counter, ext2_count_free_inodes(sb));
percpu_counter_init(&sbi->s_dirs_counter, ext2_count_dirs(sb));
sb->s_op = &ext2_sops;
sb->s_export_op = &ext2_export_ops;
sb->s_xattr = ext2_xattr_handlers;
root = iget(sb, EXT2_ROOT_INO);
sb->s_root = d_alloc_root(root);
…
return 0;
…
}
This function is long. It kicks off real disk operations, reads metadata from the disk, and stores it in the superblock. When fill_super finally returns, the last call is:
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
mnt->mnt_sb = sb;
mnt->mnt_root = dget(sb->s_root);
return 0;
}
This again just sets a few fields. Now we can finally dig into open_bdev_excl.
struct block_device *open_bdev_excl(const char *path, int flags, void *holder)
{
struct block_device *bdev;
mode_t mode = FMODE_READ;
int error = 0;
bdev = lookup_bdev(path);
if (IS_ERR(bdev))
return bdev;
if (!(flags & MS_RDONLY))
mode |= FMODE_WRITE;
error = blkdev_get(bdev, mode, 0); /* open the device */
if (error)
return ERR_PTR(error);
error = -EACCES;
if (!(flags & MS_RDONLY) && bdev_read_only(bdev))
goto blkdev_put;
error = bd_claim(bdev, holder);
if (error)
goto blkdev_put;
return bdev;
…
}
Following the same depth-first approach as before: the first statement, lookup_bdev(path), finds a block_device structure.
struct block_device *lookup_bdev(const char *path)
{
struct block_device *bdev;
struct inode *inode;
struct nameidata nd;
int error;
error = path_lookup(path, LOOKUP_FOLLOW, &nd);
inode = nd.dentry->d_inode;
…
bdev = bd_acquire(inode);
…
}
path_lookup resolves the device file path, e.g. /dev/hda1, and the device file's in-memory inode is then passed to bd_acquire.
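Where did that inode's i_rdev come from? When the filesystem holding /dev read the device node, it called init_special_inode, which records the device number and installs the default block device file operations (fs/inode.c, abridged here to the char/block cases):
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
	inode->i_mode = mode;
	if (S_ISCHR(mode)) {
		inode->i_fop = &def_chr_fops;
		inode->i_rdev = rdev;
	} else if (S_ISBLK(mode)) {
		inode->i_fop = &def_blk_fops;
		inode->i_rdev = rdev;
	}
	…
}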
//
static struct block_device *bd_acquire(struct inode *inode)
{
struct block_device *bdev;
spin_lock(&bdev_lock);
bdev = inode->i_bdev;
if (bdev) {
atomic_inc(&bdev->bd_inode->i_count);
spin_unlock(&bdev_lock);
return bdev;
}
spin_unlock(&bdev_lock);
bdev = bdget(inode->i_rdev); /* the core call */
if (bdev) {
spin_lock(&bdev_lock);
if (!inode->i_bdev) {
atomic_inc(&bdev->bd_inode->i_count);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
list_add(&inode->i_devices, &bdev->bd_inodes);
}
spin_unlock(&bdev_lock);
}
return bdev;
}
//
struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
struct inode *inode;
inode = iget5_locked(bd_mnt->mnt_sb, hash(dev),
bdev_test, bdev_set, &dev); /* the core call */
if (!inode)
return NULL;
bdev = &BDEV_I(inode)->bdev;
if (inode->i_state & I_NEW) {
bdev->bd_contains = NULL;
bdev->bd_inode = inode;
bdev->bd_block_size = (1 << inode->i_blkbits);
bdev->bd_part_count = 0;
bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
inode->i_data.a_ops = &def_blk_aops;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
inode->i_data.backing_dev_info = &default_backing_dev_info;
spin_lock(&bdev_lock);
list_add(&bdev->bd_list, &all_bdevs);
spin_unlock(&bdev_lock);
unlock_new_inode(inode);
}
return bdev;
}
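The bdev_test and bdev_set callbacks handed to iget5_locked just compare and record the device number (fs/block_dev.c):
static int bdev_test(struct inode *inode, void *data)
{
	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}

static int bdev_set(struct inode *inode, void *data)
{
	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
	return 0;
}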
//
iget5_locked-->get_new_inode
//
static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
{
struct inode * inode;
inode = alloc_inode(sb);
if (inode) {
struct inode * old;
spin_lock(&inode_lock);
/* We released the lock, so.. */
old = find_inode(sb, head, test, data);
if (!old) {
if (set(inode, data))
goto set_failed;
inodes_stat.nr_inodes++;
list_add(&inode->i_list, &inode_in_use);
list_add(&inode->i_sb_list, &sb->s_inodes);
hlist_add_head(&inode->i_hash, head);
inode->i_state = I_LOCK|I_NEW;
spin_unlock(&inode_lock);
return inode;
}
__iget(old);
spin_unlock(&inode_lock);
destroy_inode(inode);
inode = old;
wait_on_inode(inode);
}
return inode;
set_failed:
spin_unlock(&inode_lock);
destroy_inode(inode);
return NULL;
}
//
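This is also the place to show what BDEV_I in bdget was doing: on the bdev pseudo-filesystem the block_device and its inode are allocated together as a single bdev_inode, so one can be recovered from the other (fs/block_dev.c):
struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}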
alloc_inode calls inode = sb->s_op->alloc_inode(sb), and the bdev pseudo-filesystem supplies its own version of that operation. Now return level by level: a block_device exists, but it is not yet connected to any real block device. Back in open_bdev_excl, the next call is blkdev_get(bdev, mode, 0)-->__blkdev_get:
static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags,
int for_part)
{
struct file fake_file = {};
struct dentry fake_dentry = {};
fake_file.f_mode = mode;
fake_file.f_flags = flags;
fake_file.f_path.dentry = &fake_dentry;
fake_dentry.d_inode = bdev->bd_inode;
return do_open(bdev, &fake_file, for_part);
}
which ultimately calls do_open(bdev, &fake_file, for_part)-->get_gendisk(bdev->bd_dev, &part):
struct gendisk *get_gendisk(dev_t dev, int *part)
{
struct kobject *kobj = kobj_lookup(bdev_map, dev, part);
return kobj ? to_disk(kobj) : NULL;
}
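Before the summary, here is a heavily condensed sketch of what do_open does with that result; the real function in fs/block_dev.c also handles partitions, recursive opens, and all the error paths, which are omitted here:
/* condensed sketch of do_open(), not the full function */
disk = get_gendisk(bdev->bd_dev, &part);	/* look up via bdev_map */
if (!bdev->bd_openers) {			/* first opener of this bdev */
	bdev->bd_disk = disk;			/* tie the bdev to the gendisk */
	bdev->bd_contains = bdev;
	if (disk->fops->open)			/* the driver's open method */
		ret = disk->fops->open(bdev->bd_inode, file);
	if (bdev->bd_invalidated)
		rescan_partitions(disk, bdev);	/* (re)read the partition table */
}
bdev->bd_openers++;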
So the bdev_map set up back in system initialization finally earns its keep. do_open points the block_device's gendisk field (bd_disk) at the gendisk returned by get_gendisk. At this point a piece of data can travel from a user application down to the gendisk; but how does it get onto the physical disk? That is the job of the device driver. The driver hangs off the gendisk, while block_device is merely a software layer linked to the gendisk to provide block buffering. Now let's look at the SCSI probe function, sd_probe:
sd_probe-->gd = alloc_disk(16)-->add_disk(gd)
void add_disk(struct gendisk *disk)
{
disk->flags |= GENHD_FL_UP;
blk_register_region(MKDEV(disk->major, disk->first_minor),
disk->minors, NULL, exact_match, exact_lock, disk);
register_disk(disk);
blk_register_queue(disk);
}
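The exact_match and exact_lock callbacks registered here are trivial (block/genhd.c):
static struct kobject *exact_match(dev_t dev, int *part, void *data)
{
	struct gendisk *p = data;
	return &p->kobj;
}

static int exact_lock(dev_t dev, void *data)
{
	struct gendisk *p = data;

	if (!get_disk(p))
		return -1;
	return 0;
}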
blk_register_region-->kobj_map(bdev_map, dev, range, module, probe, lock, data)
int kobj_map(struct kobj_map *domain, dev_t dev, unsigned long range,
struct module *module, kobj_probe_t *probe,
int (*lock)(dev_t, void *), void *data)
{
unsigned n = MAJOR(dev + range - 1) - MAJOR(dev) + 1;
unsigned index = MAJOR(dev);
unsigned i;
struct probe *p;
if (n > 255)
n = 255;
p = kmalloc(sizeof(struct probe) * n, GFP_KERNEL);
if (p == NULL)
return -ENOMEM;
for (i = 0; i < n; i++, p++) {
p->owner = module;
p->get = probe;
p->lock = lock;
p->dev = dev;
p->range = range;
p->data = data;
}
mutex_lock(domain->lock);
for (i = 0, p -= n; i < n; i++, p++, index++) {
struct probe **s = &domain->probes[index % 255];
while (*s && (*s)->range < range)
s = &(*s)->next;
p->next = *s;
*s = p;
}
mutex_unlock(domain->lock);
return 0;
}
So kobj_map inserts the gendisk into the map, keyed by device number, where get_gendisk will later find it. Next comes register_disk:
void register_disk(struct gendisk *disk)
{
struct block_device *bdev;
char *s;
int i;
struct hd_struct *p;
int err;
strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN);
s = strchr(disk->kobj.name, '/');
if (s)
*s = '!';
if ((err = kobject_add(&disk->kobj)))
return;
err = disk_sysfs_symlinks(disk);
if (err) {
kobject_del(&disk->kobj);
return;
}
disk_sysfs_add_subdirs(disk);
if (disk->minors == 1)
goto exit;
/* No such device (e.g., media were just removed) */
if (!get_capacity(disk))
goto exit;
bdev = bdget_disk(disk, 0);
if (!bdev)
goto exit;
bdev->bd_invalidated = 1;
disk->part_uevent_suppress = 1;
err = blkdev_get(bdev, FMODE_READ, 0);
disk->part_uevent_suppress = 0;
if (err < 0)
goto exit;
blkdev_put(bdev);
…
}
Note that the blkdev_get call in register_disk again ends up in do_open: the device really is opened, and along the way the partition information is read and managed.
At this point data can flow through the whole stack, from user space to hardware.
A lot of code today, browsed with my own lxr. In the newest 2.6.23 kernel I am not sure how the inode's i_bdev field is used; when reading and writing files I see the buffer_head's block_device being taken from the superblock, which is actually quite logical: a filesystem mostly lives on a single disk, and one superblock represents one filesystem.
One prerequisite question remains from the analysis above: how was the block pseudo-filesystem itself mounted? In fact, before the block initialization in vfs_caches_init there is already a call to mnt_init, which sets up an in-memory filesystem tree:
void __init mnt_init(unsigned long mempages)
{
struct list_head *d;
unsigned int nr_hash;
int i;
int err;
init_rwsem(&namespace_sem);
mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
nr_hash = PAGE_SIZE / sizeof(struct list_head);
hash_bits = 0;
do {
hash_bits++;
} while ((nr_hash >> hash_bits) != 0);
hash_bits--;
nr_hash = 1UL << hash_bits;
hash_mask = nr_hash - 1;
/* And initialize the newly allocated array */
d = mount_hashtable;
i = nr_hash;
do {
INIT_LIST_HEAD(d);
d++;
i--;
} while (i);
err = sysfs_init();
if (err)
printk(KERN_WARNING "%s: sysfs_init error: %d/n",
__FUNCTION__, err);
err = subsystem_register(&fs_subsys);
init_rootfs();
init_mount_tree();
}
We only look at init_rootfs and init_mount_tree.
int __init init_rootfs(void)
{
return register_filesystem(&rootfs_fs_type);
}
static struct file_system_type rootfs_fs_type = {
.name = "rootfs",
.get_sb = rootfs_get_sb,
.kill_sb = kill_litter_super,
};
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
struct mnt_namespace *ns;
mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
if (IS_ERR(mnt))
panic("Can't create rootfs");
ns = kmalloc(sizeof(*ns), GFP_KERNEL);
if (!ns)
panic("Can't allocate initial namespace");
atomic_set(&ns->count, 1);
INIT_LIST_HEAD(&ns->list);
init_waitqueue_head(&ns->poll);
ns->event = 0;
list_add(&mnt->mnt_list, &ns->list);
ns->root = mnt;
mnt->mnt_ns = ns;
init_task.nsproxy->mnt_ns = ns;
get_mnt_ns(ns);
set_fs_pwd(current->fs, ns->root, ns->root->mnt_root);
set_fs_root(current->fs, ns->root, ns->root->mnt_root);
}
And we are back at do_kern_mount, analyzed earlier:
do_kern_mount-->|get_fs_type
                |vfs_kern_mount-->|mnt = alloc_vfsmnt (allocates the vfsmount via kmem_cache_zalloc)
                                  |type->get_sb(type, flags, name, data, mnt) (see below)
static int rootfs_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super, mnt);
}
int get_sb_nodev(struct file_system_type *fs_type,
int flags, void *data,
int (*fill_super)(struct super_block *, void *, int),
struct vfsmount *mnt)
{
int error;
struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
if (IS_ERR(s))
return PTR_ERR(s);
s->s_flags = flags;
error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
if (error) {
up_write(&s->s_umount);
deactivate_super(s);
return error;
}
s->s_flags |= MS_ACTIVE;
return simple_set_mnt(mnt, s);
}
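Compare set_anon_super with the set_bdev_super seen earlier: with no backing device, the superblock is given an anonymous (major 0) device number instead (fs/super.c, abridged; the elided part allocates a free minor from an IDR of unnamed devices):
int set_anon_super(struct super_block *s, void *data)
{
	int dev;
	…
	s->s_dev = MKDEV(0, dev & MINORMASK);
	return 0;
}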
A look at fill_super:
static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
{
struct inode * inode;
struct dentry * root;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = RAMFS_MAGIC;
sb->s_op = &ramfs_ops;
sb->s_time_gran = 1;
inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0);
if (!inode)
return -ENOMEM;
root = d_alloc_root(inode);
if (!root) {
iput(inode);
return -ENOMEM;
}
sb->s_root = root;
return 0;
}
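ramfs_get_inode, called above for the root directory, deserves a glance: everything lives in the page cache and no disk is involved (fs/ramfs/inode.c, abridged to the directory case):
struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
{
	struct inode *inode = new_inode(sb);

	if (inode) {
		inode->i_mode = mode;
		inode->i_mapping->a_ops = &ramfs_aops;
		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
		…
		switch (mode & S_IFMT) {
		case S_IFDIR:
			inode->i_op = &ramfs_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;
			inc_nlink(inode);	/* a directory starts with two links */
			break;
		…
		}
	}
	return inode;
}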
After this, the in-memory filesystem is complete, which means VFS initialization is half done. The next heavyweight VFS initialization is that of the block pseudo-filesystem, i.e. the big pile described earlier; after that, disks can be mounted at will. So VFS initialization is itself a staged process, and everything comes down to vfs_caches_init and do_initcalls: the former sets up the static data, the latter registers the dynamic behavior.
Character devices are much simpler than block devices: a single cdev_map handles everything, since it is really just a lookup. Now that the kernel provides kobject, such a good organizer, finding a character device's file_operations by device number is trivial for devices that need no buffering. Once you have the file operations, the rest is device-specific: a read call might, for example, fetch data from a driver-private buffer that is filled by an interrupt or by polling, as in the input subsystem.
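As a taste of that simplicity, the lookup in chrdev_open mirrors get_gendisk almost line for line (fs/char_dev.c, abridged; reference counting and locking omitted):
int chrdev_open(struct inode *inode, struct file *filp)
{
	struct cdev *p = inode->i_cdev;
	int ret = 0;

	if (!p) {
		struct kobject *kobj;
		int idx;
		kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
		if (!kobj)
			return -ENXIO;
		p = container_of(kobj, struct cdev, kobj);
		…	/* cache p in inode->i_cdev, take a reference */
	}
	filp->f_op = fops_get(p->ops);	/* the driver's file_operations */
	…
	if (filp->f_op->open)
		ret = filp->f_op->open(inode, filp);
	…
	return ret;
}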