While looking at how Linux boots with an initramfs, I noticed that the archive's files have to be unpacked into the rootfs file system. I was puzzled about how those files actually get written into rootfs, so I decided to study how the rootfs root file system is read and written. When Linux initializes, the first root file system it mounts is essentially a ramfs, and a ramfs exists only in RAM. Let's first look at how this root file system is initialized.
start_kernel
----------->vfs_caches_init
-------------->mnt_init
The root file system is initialized in mnt_init:
void __init mnt_init(void)
{
unsigned u;
int err;
init_rwsem(&namespace_sem);
......
fs_kobj = kobject_create_and_add("fs", NULL);
if (!fs_kobj)
printk(KERN_WARNING "%s: kobj create error\n", __func__);
init_rootfs(); //register the rootfs file system type
init_mount_tree();//mount the root file system
}
init_rootfs registers the rootfs file system type:
static struct file_system_type rootfs_fs_type = {
.name = "rootfs",
.mount = rootfs_mount,
.kill_sb = kill_litter_super,
};
int __init init_rootfs(void)
{
int err;
err = bdi_init(&ramfs_backing_dev_info);
if (err)
return err;
err = register_filesystem(&rootfs_fs_type);
if (err){
bdi_destroy(&ramfs_backing_dev_info);
}
return err;
}
init_mount_tree mounts the rootfs file system: it sets up the mount for rootfs, creates the dentry and inode for the '/' root directory, and then assigns that root path to the current process.
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
struct mnt_namespace *ns;
struct path root;
struct file_system_type *type;
type = get_fs_type("rootfs");
if (!type)
panic("Can't find rootfs type");
mnt = vfs_kern_mount(type, 0, "rootfs", NULL);//mount the root file system
put_filesystem(type);
if (IS_ERR(mnt))
panic("Can't create rootfs");
ns = create_mnt_ns(mnt);
if (IS_ERR(ns))
panic("Can't allocate initial namespace");
init_task.nsproxy->mnt_ns = ns;
get_mnt_ns(ns);
root.mnt = mnt;//record the mount
root.dentry = mnt->mnt_root;//record the dentry of the root directory
set_fs_pwd(current->fs, &root);//set the process's current directory to the rootfs root
set_fs_root(current->fs, &root);//set the process's root directory to the rootfs root
}
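set_fs_root() and set_fs_pwd() above are the kernel-internal helpers that, roughly speaking, chroot(2) and chdir(2) also end up in for an ordinary process. As a rough user-space analogy (a sketch only; it assumes a ramfs instance is already mounted at the hypothetical path /mnt/ram and that the program runs as root):

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
        if (chroot("/mnt/ram") < 0) {   /* sys_chroot ends up in set_fs_root() */
                perror("chroot");
                return 1;
        }
        if (chdir("/") < 0) {           /* sys_chdir ends up in set_fs_pwd() */
                perror("chdir");
                return 1;
        }
        /* from here on, "/test" resolves from the ramfs root for this process,
         * just as the early init task resolves paths from the rootfs root */
        int fd = open("/test", O_CREAT | O_RDWR, 0644);
        if (fd >= 0)
                close(fd);
        return 0;
}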
The main mount work is done in vfs_kern_mount:
vfs_kern_mount
------------>mount_fs
mount_fs calls the rootfs_mount function of the rootfs file system type registered above to do the mount:
static struct dentry *rootfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
}
struct dentry *mount_nodev(struct file_system_type *fs_type,
int flags, void *data,
int (*fill_super)(struct super_block *, void *, int))
{
int error;
struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);//allocate and initialize a super_block
if (IS_ERR(s))
return ERR_CAST(s);
error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);//further initialize the superblock and allocate the dentry and inode for the root
if (error) {
deactivate_locked_super(s);
return ERR_PTR(error);
}
s->s_flags |= MS_ACTIVE;
return dget(s->s_root);
}
The fill_super callback here is ramfs_fill_super:
int ramfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct ramfs_fs_info *fsi;
struct inode *inode;
int err;
save_mount_options(sb, data);
fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
sb->s_fs_info = fsi;
if (!fsi)
return -ENOMEM;
err = ramfs_parse_options(data, &fsi->mount_opts);
if (err)
return err;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = RAMFS_MAGIC;
sb->s_op = &ramfs_ops;
sb->s_time_gran = 1;
inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);//create a new inode
sb->s_root = d_make_root(inode);//create a new dentry
if (!sb->s_root)
return -ENOMEM;
return 0;
}
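The s_magic value (RAMFS_MAGIC) written by ramfs_fill_super is what user space sees in the f_type field returned by statfs(2). A quick check, a sketch assuming a ramfs instance mounted at the hypothetical path /mnt/ram:

#include <stdio.h>
#include <sys/vfs.h>
#include <linux/magic.h>    /* RAMFS_MAGIC */

int main(void)
{
        struct statfs st;

        if (statfs("/mnt/ram", &st) < 0) {
                perror("statfs");
                return 1;
        }
        /* f_type comes from the superblock's s_magic */
        printf("f_type = 0x%lx, RAMFS_MAGIC = 0x%x\n",
               (unsigned long)st.f_type, RAMFS_MAGIC);
        return 0;
}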
Let's look at how ramfs creates a new inode:
struct inode *ramfs_get_inode(struct super_block *sb,
const struct inode *dir, umode_t mode, dev_t dev)
{
struct inode * inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_mapping->a_ops = &ramfs_aops;
inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
break;
case S_IFREG: //operations for regular files
inode->i_op = &ramfs_file_inode_operations;
inode->i_fop = &ramfs_file_operations;
break;
case S_IFDIR: //operations for directories
inode->i_op = &ramfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
break;
}
}
return inode;
}
The inode initialization above is important: the operation sets installed there are all used when the file is read and written, and they will be analysed in detail below. For a more detailed analysis of file system mounting, see this article:
https://blog.csdn.net/oqqYuJi12345678/article/details/101689334
Now let's use a concrete file to walk through the whole read/write path. Suppose we read and write the file /test, which sits directly under the root directory of the root file system. The file is opened for the first time with the create flag set, so if it does not exist yet it is created during the open. The root directory itself was already created when the root file system was mounted above. A detailed analysis of the open path can be found in the following article; the parts it already covers are not repeated here:
https://blog.csdn.net/oqqYuJi12345678/article/details/101849978
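Before going into the kernel side, this is the user-space call sequence being analysed, as a minimal sketch (/test is just the example path used throughout this article; error handling is kept short):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        char buf[16] = { 0 };

        /* open(O_CREAT) -> do_sys_open -> ... -> lookup_open/vfs_create */
        int fd = open("/test", O_CREAT | O_RDWR, 0644);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* write -> vfs_write -> do_sync_write -> generic_file_aio_write */
        if (write(fd, "hello ramfs", 11) != 11)
                perror("write");

        /* read -> vfs_read -> do_sync_read -> generic_file_aio_read */
        lseek(fd, 0, SEEK_SET);
        if (read(fd, buf, sizeof(buf) - 1) > 0)
                printf("read back: %s\n", buf);

        close(fd);
        return 0;
}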
Path lookup starts from the root directory, and as shown above, the root directory's dentry and inode already exist. So in do_sys_open, once the root directory's dentry and inode have been found and O_CREAT is set, lookup_open is called, and lookup_open creates the dentry and inode for the test file.
do_sys_open
----------->do_filp_open
-------------->path_openat
-------------->do_last
----------------->lookup_open
static int lookup_open(struct nameidata *nd, struct path *path,
struct file *file,
const struct open_flags *op,
bool got_write, int *opened)
{
struct dentry *dir = nd->path.dentry;
struct inode *dir_inode = dir->d_inode;
struct dentry *dentry;
int error;
bool need_lookup;
*opened &= ~FILE_CREATED;
//create the dentry
dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
......
//if the dentry has no inode yet and O_CREAT was set on open
if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
umode_t mode = op->mode;
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current_umask();
/*
* This write is needed to ensure that a
* rw->ro transition does not occur between
* the time when the file is created and when
* a permanent write count is taken through
* the 'struct file' in finish_open().
*/
if (!got_write) {
error = -EROFS;
goto out_dput;
}
*opened |= FILE_CREATED;
error = security_path_mknod(&nd->path, dentry, mode, 0);
if (error)
goto out_dput;
//create the inode
error = vfs_create(dir->d_inode, dentry, mode,
nd->flags & LOOKUP_EXCL);
if (error)
goto out_dput;
}
out_no_open:
path->dentry = dentry;
path->mnt = nd->path.mnt;
return 1;
out_dput:
dput(dentry);
return error;
}
Let's look at the implementation of vfs_create:
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool want_excl)
{
int error = may_create(dir, dentry);
if (error)
return error;
if (!dir->i_op->create)
return -EACCES; /* shouldn't it be ENOSYS? */
mode &= S_IALLUGO;
mode |= S_IFREG;
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
//call the create method in the parent inode's i_op operation set to create this node's inode
error = dir->i_op->create(dir, dentry, mode, want_excl);
if (!error)
fsnotify_create(dir, dentry);
return error;
}
So which function is the create method of the parent inode's i_op operation set? The parent here is the root directory, so it is the create method of the root directory's inode. From the initialization in the first part of this article we know that when a rootfs inode represents a directory, its i_op operation set is ramfs_dir_inode_operations:
static const struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
.link = simple_link,
.unlink = simple_unlink,
.symlink = ramfs_symlink,
.mkdir = ramfs_mkdir,
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
};
The create function is ramfs_create:
static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
return ramfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
static int
ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
int error = -ENOSPC;
if (inode) {
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
}
return error;
}
As we can see, ramfs_get_inode is ultimately called to create the inode for the test file. Note that while the inode is allocated and initialized in ramfs_get_inode, inode->i_mapping->a_ops is set to &ramfs_aops; that operation set is used below.
At this point both the inode and the dentry of the test file exist. Finally, finish_open binds the inode to the file descriptor.
do_last
-------->finish_open
int finish_open(struct file *file, struct dentry *dentry,
int (*open)(struct inode *, struct file *),
int *opened)
{
int error;
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
file->f_path.dentry = dentry;//bind the file to the dentry
error = do_dentry_open(file, open, current_cred());
if (!error)
*opened |= FILE_OPENED;
return error;
}
static int do_dentry_open(struct file *f,
int (*open)(struct inode *, struct file *),
const struct cred *cred)
{
static const struct file_operations empty_fops = {};
struct inode *inode;
int error;
f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
if (unlikely(f->f_flags & O_PATH))
f->f_mode = FMODE_PATH;
path_get(&f->f_path);
inode = f->f_inode = f->f_path.dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
error = __get_file_write_access(inode, f->f_path.mnt);
if (error)
goto cleanup_file;
if (!special_file(inode->i_mode))
file_take_write(f);
}
--------------------------------------------------------------(1)
f->f_mapping = inode->i_mapping;
file_sb_list_add(f, inode->i_sb);
if (unlikely(f->f_mode & FMODE_PATH)) {
f->f_op = &empty_fops;
return 0;
}
-----------------------------------------------------------------(2)
//assign the inode's i_fop to f->f_op; it is used by read and write below
f->f_op = fops_get(inode->i_fop);
error = security_file_open(f, cred);
if (error)
goto cleanup_all;
error = break_lease(inode, f->f_flags);
if (error)
goto cleanup_all;
if (!open && f->f_op)
open = f->f_op->open;
if (open) {
error = open(inode, f);//no open function is provided here
if (error)
goto cleanup_all;
}
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(inode);
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
return 0;
......
}
(1) The inode's i_mapping is assigned to f->f_mapping. i_mapping is the address_space that tracks the file's pages, and it is used during read and write below.
(2) Since test is a regular file, the inode's i_fop operation set is ramfs_file_operations, which is also used during read and write below.
Now let's see how data is written to the test file.
A write ends up in the kernel function vfs_write:
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
if (ret >= 0) {
count = ret;
file_start_write(file);
if (file->f_op->write)//first check whether file->f_op->write exists
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
inc_syscw(current);
file_end_write(file);
}
return ret;
}
The function above first checks whether file->f_op->write is set. From the open path we know that file->f_op is the ramfs_file_operations operation set:
const struct file_operations ramfs_file_operations = {
.read = do_sync_read,
.aio_read = generic_file_aio_read,
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.mmap = generic_file_mmap,
.fsync = noop_fsync,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
So do_sync_write is called to do the write:
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
return ret;
}
init_sync_kiocb is nothing fancy; the key point is that kiocb.ki_filp = filp, and the user buffer and length are packed into the iovec. It then calls the aio_write method, which here is generic_file_aio_write:
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
//when the inode was initialized, inode_init_always did:
//struct address_space *const mapping = &inode->i_data;
//mapping->host = inode;
//inode->i_mapping = mapping;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
if (err < 0 && ret > 0)
ret = err;
}
return ret;
}
This in turn calls __generic_file_aio_write:
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
loff_t pos;
ssize_t written;
ssize_t err;
ocount = 0;
err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
if (err)
return err;
count = ocount;
pos = *ppos;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
written = 0;
......
written = generic_file_buffered_write(iocb, iov, nr_segs,
pos, ppos, count, written);
out:
current->backing_dev_info = NULL;
return written ? written : err;
}
Normally the O_DIRECT flag is not set, so the unimportant code has been omitted; the key call is again to generic_file_buffered_write.
Looking at its arguments: iocb is still the same kiocb, and iov and nr_segs are unchanged (nr_segs is 1). As long as the file size limit is not exceeded, pos = *ppos is the write start position, count = iov->iov_len is the number of bytes to write, and written is 0.
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, loff_t *ppos,
size_t count, ssize_t written)
{
struct file *file = iocb->ki_filp;
ssize_t status;
struct iov_iter i;
//fill in the iov_iter: i->iov = iov; i->nr_segs = nr_segs; i->iov_offset = 0;
//i->count = count;
iov_iter_init(&i, iov, nr_segs, count, written);
status = generic_perform_write(file, &i, pos);
if (likely(status >= 0)) {
written += status;
*ppos = pos + status;
}
return written ? written : status;
}
The core function is generic_perform_write:
static ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
struct address_space *mapping = file->f_mapping;
--------------------------------------------------------(1)
const struct address_space_operations *a_ops = mapping->a_ops;
long status = 0;
ssize_t written = 0;
unsigned int flags = 0;
/*
* Copies from kernel address space cannot fail (NFSD is a big user).
*/
if (segment_eq(get_fs(), KERNEL_DS))
flags |= AOP_FLAG_UNINTERRUPTIBLE;
do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;
offset = (pos & (PAGE_CACHE_SIZE - 1));//offset of the write start within a page
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_count(i));
//iov_iter_count is i->count, i.e. the remaining length to write; compare the space left
//in the current page with it: if the data fits, bytes is the data length; otherwise the
//write spans more than one page and this round only fills the rest of the current page
again:
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*
* Not only is this an optimisation, but it is also required
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
break;
}
--------------------------------------------------------------(2)
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
if (unlikely(status))
break;
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
pagefault_disable();
---------------------------------------------------------------(3)
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
pagefault_enable();
flush_dcache_page(page);
mark_page_accessed(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status < 0))
break;
copied = status;
cond_resched();
//advance the iterator by the number of bytes copied, updating the remaining count
iov_iter_advance(i, copied);
if (unlikely(copied == 0)) {
/*
* If we were unable to copy any data at all, we must
* fall back to a single segment length write.
*
* If we didn't fallback here, we could livelock
* because not all segments in the iov can be copied at
* once without a pagefault.
*/
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_single_seg_count(i));
goto again;
}
//advance the file position past the data just written
pos += copied;
//accumulate the number of bytes written
written += copied;
balance_dirty_pages_ratelimited(mapping);
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
}
} while (iov_iter_count(i));//loop until count drops to 0, i.e. everything has been written
return written ? written : status;
}
(1) mapping->a_ops is the ramfs_aops operation set, assigned when the inode was initialized above:
const struct address_space_operations ramfs_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
.write_end = simple_write_end,
.set_page_dirty = __set_page_dirty_no_writeback,
};
(2) The write_begin function is simple_write_begin; its len argument is the length to write in this round, which is at most one page:
int simple_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct page *page;
pgoff_t index;
index = pos >> PAGE_CACHE_SHIFT;//which page of the file the write position falls in
//look up the page with this index in the mapping; if it is not there yet, allocate a new
//page and add it to the mapping; that page is where the data will be written
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
}
return 0;
}
As shown above, the index of the page to write is computed from the file position. From this we can infer that the file is managed page by page, with all of its pages kept in the mapping (address_space). If the file is 8K it needs two pages; if the write position is at 6K, the index is 1 and the write starts in the second page (a worked example of this arithmetic follows grab_cache_page_write_begin below). Let's look at grab_cache_page_write_begin in detail:
struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags)
{
int status;
gfp_t gfp_mask;
struct page *page;
gfp_t gfp_notmask = 0;
gfp_mask = mapping_gfp_mask(mapping);
if (mapping_cap_account_dirty(mapping))
gfp_mask |= __GFP_WRITE;
if (flags & AOP_FLAG_NOFS)
gfp_notmask = __GFP_FS;
repeat:
page = find_lock_page(mapping, index);//check whether the mapping already holds a page at this index
if (page)//if so, return it directly
goto found;
//otherwise allocate a new page
page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
if (!page)
return NULL;
//add the page to the mapping
status = add_to_page_cache_lru(page, mapping, index,
GFP_KERNEL & ~gfp_notmask);
if (unlikely(status)) {
page_cache_release(page);
if (status == -EEXIST)
goto repeat;
return NULL;
}
found:
wait_for_stable_page(page);
return page;
}
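To make the index/offset/bytes arithmetic above concrete, here is a small user-space sketch of the same calculation (assuming 4 KiB pages, i.e. a page shift of 12, which is what PAGE_CACHE_SHIFT is on common configurations):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        unsigned long pos   = 6 * 1024;   /* write starts at 6 KiB */
        unsigned long count = 4 * 1024;   /* want to write 4 KiB */

        unsigned long index  = pos >> PAGE_SHIFT;        /* page index in the mapping */
        unsigned long offset = pos & (PAGE_SIZE - 1);    /* offset inside that page */
        unsigned long bytes  = PAGE_SIZE - offset;       /* room left in this page */
        if (bytes > count)
                bytes = count;                           /* min(page room, remaining count) */

        printf("index=%lu offset=%lu bytes=%lu\n", index, offset, bytes);
        /* prints index=1 offset=2048 bytes=2048: the first loop iteration fills
         * the rest of the second page, the next iteration continues in page 2+1 */
        return 0;
}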
(3) iov_iter_copy_from_user_atomic does the actual copy of the data:
size_t iov_iter_copy_from_user_atomic(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes)
{
char *kaddr;
size_t copied;
BUG_ON(!in_atomic());
kaddr = kmap_atomic(page);//get the kernel virtual address of the page
if (likely(i->nr_segs == 1)) {
int left;
char __user *buf = i->iov->iov_base + i->iov_offset;//source address of the data to copy
left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);//do the copy
copied = bytes - left;
} else {
copied = __iovec_copy_from_user_inatomic(kaddr + offset,
i->iov, i->iov_offset, bytes);
}
kunmap_atomic(kaddr);
return copied;
}
That completes the write path. Now let's look at reading a file, which follows a similar pattern.
vfs_read
----------->do_sync_read
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
return ret;
}
The buf argument is the destination of the read, len is the length to read, and ppos is the starting position in the file. filp->f_op->aio_read is generic_file_aio_read:
ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *filp = iocb->ki_filp;
ssize_t retval;
unsigned long seg = 0;
size_t count;
loff_t *ppos = &iocb->ki_pos;
count = 0;
retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
if (retval)
return retval;
......
count = retval;
//nr_segs is 1
for (seg = 0; seg < nr_segs; seg++) {
read_descriptor_t desc;
loff_t offset = 0;
/*
* If we did a short DIO read we need to skip the section of the
* iov that we've already read data into.
*/
if (count) {
if (count > iov[seg].iov_len) {
count -= iov[seg].iov_len;
continue;
}
offset = count;
count = 0;
}
desc.written = 0;
desc.arg.buf = iov[seg].iov_base + offset;//destination buffer for the data
desc.count = iov[seg].iov_len - offset;//length to read
if (desc.count == 0)
continue;
desc.error = 0;
do_generic_file_read(filp, ppos, &desc, file_read_actor);
retval += desc.written;
if (desc.error) {
retval = retval ?: desc.error;
break;
}
if (desc.count > 0)
break;
}
out:
return retval;
}
Let's go straight to do_generic_file_read:
static void do_generic_file_read(struct file *filp, loff_t *ppos,
read_descriptor_t *desc, read_actor_t actor)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index;
pgoff_t last_index;
pgoff_t prev_index;
unsigned long offset; /* offset into pagecache page */
unsigned int prev_offset;
int error;
index = *ppos >> PAGE_CACHE_SHIFT;//index of the first page in the mapping the read starts from
prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;//offset within that page, i.e. which byte to start reading from
for (;;) {
struct page *page;
pgoff_t end_index;
loff_t isize;
unsigned long nr, ret;
cond_resched();
find_page:
page = find_get_page(mapping, index);//find the page to start reading from in the mapping
if (!page) {
page_cache_sync_readahead(mapping,
ra, filp,
index, last_index - index);
page = find_get_page(mapping, index);
if (unlikely(page == NULL))
goto no_cached_page;
}
if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
if (!PageUptodate(page)) {
if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
/* Did it get truncated before we got the lock? */
if (!page->mapping)
goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
desc, offset))
goto page_not_up_to_date_locked;
unlock_page(page);
}
page_ok:
/*
* i_size must be checked after we know the page is Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
//read inode->i_size, i.e. the length of the file's data
isize = i_size_read(inode);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;//index of the last page of the file's data
if (unlikely(!isize || index > end_index)) {
page_cache_release(page);
goto out;
}
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
if (index == end_index) {
nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
if (nr <= offset) {
page_cache_release(page);
goto out;
}
}
//the code above adjusts nr: on the last page nr is the number of valid bytes in that page,
//otherwise it is a whole page
nr = nr - offset;//bytes to read from this page
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
/*
* When a sequential read accesses a page several times,
* only mark it as accessed the first time.
*/
if (prev_index != index || offset != prev_offset)
mark_page_accessed(page);
prev_index = index;
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*
* The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
* we filled up (we may be padding etc), so we can only update
* "pos" here (the actor routine has to update the user buffer
* pointers and the remaining count).
*/
ret = actor(desc, page, offset, nr);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
prev_offset = offset;
page_cache_release(page);
if (ret == nr && desc->count)
continue;
goto out;
page_not_up_to_date:
/* Get exclusive access to the page ... */
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
continue;
}
/* Did somebody else fill it already? */
if (PageUptodate(page)) {
unlock_page(page);
goto page_ok;
}
readpage:
/*
* A previous I/O error may have been due to temporary
* failures, eg. multipath errors.
* PG_error will be set again if readpage fails.
*/
ClearPageError(page);
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
if (unlikely(error)) {
if (error == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
goto find_page;
}
goto readpage_error;
}
if (!PageUptodate(page)) {
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
* invalidate_mapping_pages got it
*/
unlock_page(page);
page_cache_release(page);
goto find_page;
}
unlock_page(page);
shrink_readahead_size_eio(filp, ra);
error = -EIO;
goto readpage_error;
}
unlock_page(page);
}
goto page_ok;
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
desc->error = error;
page_cache_release(page);
goto out;
no_cached_page:
/*
* Ok, it wasn't cached, so we need to create a new
* page..
*/
page = page_cache_alloc_cold(mapping);
if (!page) {
desc->error = -ENOMEM;
goto out;
}
error = add_to_page_cache_lru(page, mapping,
index, GFP_KERNEL);
if (error) {
page_cache_release(page);
if (error == -EEXIST)
goto find_page;
desc->error = error;
goto out;
}
goto readpage;
}
out:
ra->prev_pos = prev_index;
ra->prev_pos <<= PAGE_CACHE_SHIFT;
ra->prev_pos |= prev_offset;
*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
file_accessed(filp);
}
In all of the code above, what actually reads the data is the actor callback; everything else just computes index, offset, nr and so on, i.e. the callback's arguments: ret = actor(desc, page, offset, nr);
Its four arguments are the read descriptor, the page, the offset within the page, and the number of bytes to read:
int file_read_actor(read_descriptor_t *desc, struct page *page,
unsigned long offset, unsigned long size)
{
char *kaddr;
unsigned long left, count = desc->count;
if (size > count)
size = count;
/*
* Faults on the destination of a read are common, so do it before
* taking the kmap.
*/
if (!fault_in_pages_writeable(desc->arg.buf, size)) {
kaddr = kmap_atomic(page);//get the kernel virtual address of the page
left = __copy_to_user_inatomic(desc->arg.buf,
kaddr + offset, size);//copy from the file's page to the destination buffer
kunmap_atomic(kaddr);
if (left == 0)
goto success;
}
/* Do it the slow way */
kaddr = kmap(page);
left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
kunmap(page);
if (left) {
size -= left;
desc->error = -EFAULT;
}
success:
desc->count = count - size;
desc->written += size;
desc->arg.buf += size;//advance the destination pointer
return size;
}
Inside this callback, size bytes at kaddr + offset are simply copied into the descriptor's buf; desc->count is decreased and buf is advanced. With that, the open, write and read paths have all been analysed.
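Since ramfs pages live only in the page cache and are never written back to any device (note the __set_page_dirty_no_writeback in ramfs_aops above), the behaviour we analysed is easy to observe from user space: a file created on a ramfs instance is gone as soon as that instance is unmounted. A small experiment, a sketch only (it needs root and assumes an existing /mnt/ram directory):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mount.h>

int main(void)
{
        int fd;

        if (mount("none", "/mnt/ram", "ramfs", 0, NULL) < 0) {
                perror("mount");
                return 1;
        }

        fd = open("/mnt/ram/test", O_CREAT | O_RDWR, 0644); /* lives in the page cache only */
        if (fd >= 0) {
                (void)write(fd, "hello\n", 6);
                close(fd);
        }

        umount("/mnt/ram");                 /* all pages of this instance are freed */

        mount("none", "/mnt/ram", "ramfs", 0, NULL);   /* a fresh, empty instance */
        if (access("/mnt/ram/test", F_OK) < 0)
                printf("file is gone after remount: ramfs has no backing store\n");
        umount("/mnt/ram");
        return 0;
}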