【linux内核源码】 io操作之read

准备

本文所涉及的源码全部基于linux内核5.15。

ftrace

由于系统调用的路径比较复杂,当我们首次阅读内核代码不知道从哪里寻找系统调用函数入口的时候,可以通过ftrace跟踪系统函数的调用栈来获取函数的调用链。

使用ftrace来跟踪read系统调用. 以下脚本默认机器已经挂载了debugfs。具体的ftrace使用可参考 https://01.org/linuxgraphics/gfx-docs/drm/trace/ftrace.html

cd /sys/kernel/debug/tracing # enter the tracing directory for convenience; listing it shows whether ftrace is available on this system
echo 0 > tracing_on # turn tracing off first to reduce interference from other traces
echo function_graph > current_tracer # select the function_graph tracer
echo  __x64_sys_read  > set_graph_function # trace the read system call; on x64 the actual sys_read function is __x64_sys_read
echo 1 > tracing_on # turn tracing back on
head -30 trace # show the first 30 lines of the trace output

例如,下面是开启ftrace后sys_read在xfs文件系统下的内核调用栈。从调用栈中可以得知,new_sync_read会调用xfs_file_read_iter,也即具体文件系统的实现。后续的源码阅读会基于ext4文件系统做说明,下面的调用栈只是测试机器上的一个例子。

__x64_sys_read() {
  ksys_read() {
    __fdget_pos() {
      __fget_light();
    }
        vfs_read() {
          rw_verify_area() {
            security_file_permission() {
              apparmor_file_permission() {
                common_file_perm() {
                  aa_file_perm() {
                    rcu_read_unlock_strict();
                  }
                }
              }
              __fsnotify_parent();
            }
          }
          new_sync_read() {
            xfs_file_read_iter [xfs]() {
              xfs_file_buffered_aio_read [xfs]() {
                xfs_ilock [xfs]() {
                  down_read() {
                    _cond_resched() {
                      rcu_all_qs();
                    }
                  }
                }
                generic_file_read_iter() {
                  generic_file_buffered_read() {
                    _cond_resched() {
                      rcu_all_qs();
                    }
                    pagecache_get_page() {
                      find_get_entry() {
                        rcu_read_unlock_strict();
                      }
                      PageHuge();
                    }
                    mark_page_accessed();
                    _cond_resched() {
                      rcu_all_qs();
                    }
                    touch_atime() {
                      atime_needs_update();
                    }
                  }
                }
                xfs_iunlock [xfs]() {
                  up_read();
                }
              }
            }
          }
          __fsnotify_parent();
        }

系统调用sys_read

当用户调用系统调用read从文件读取数据时,会陷入内核态(32位x86上通过int 0x80软中断,x86-64上通过syscall指令),内核根据系统调用号在系统调用表中找到read的入口函数 sys_read。sys_read 的函数定义如下,SYSCALL_DEFINEx 是内核的系统调用宏定义,x表示参数的个数,例如sys_read的宏定义为 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count),表示sys_read有三个参数。

sys_read

/*
 * read(2) system-call entry point. SYSCALL_DEFINE3 takes the syscall name
 * followed by three (type, name) argument pairs; the body simply forwards
 * to ksys_read().
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    return ksys_read(fd, buf, count);
}

sys_read实际调用的是ksys_read。

ksys_read ksys_read首先会根据fd拿到struct fd信息。判断fd是否存在,fd如果错误直接返回EBADF错误码。ksys_read的核心流程为vfs_read。通过虚拟文件系统的vfs_read来完成文件的读取,对外屏蔽了不同文件系统的具体实现。

ssize_t ksys_read(unsigned intfd, char__user *buf,size_t count)
{
    struct fd f =fdget_pos(fd);
    ssize_t ret = -EBADF;
    if (f.file) {
        loff_tpos, *ppos =file_ppos(f.file); // 获取当前文件的offset
        if (ppos) {
            pos = *ppos;
            ppos = &pos;
        }
        ret =vfs_read(f.file, buf, count,ppos); // 调用虚拟文件系统vfs_read进行读取
        if (ret >= 0 && ppos)
            f.file->f_pos =pos;
            fdput_pos(f);
    }
    return ret;
}

虚拟文件系统

vfs_read 为虚拟文件系统的读操作实现,函数内部会根据具体的文件系统实现调用对应的读操作。

ssize_t vfs_read(struct file *file, char__user *buf,size_t count,loff_t *pos)
{
ssize_t ret;

    if (!(file->f_mode &FMODE_READ))
        return -EBADF;
    if (!(file->f_mode &FMODE_CAN_READ))
        return -EINVAL;
    if (unlikely(!access_ok(buf, count)))
        return -EFAULT;

    ret = rw_verify_area(READ,file,pos, count);
    if (ret)
        return ret;
    if (count >MAX_RW_COUNT)
        count =MAX_RW_COUNT;

    if (file->f_op->read)// 判断文件系统是否实现了read接口
        ret =file->f_op->read(file, buf, count,pos);
    else if (file->f_op->read_iter) // 判断文件系统是否实现了read_iter接口
        ret = new_sync_read(file, buf, count,pos);
    else
        ret = -EINVAL;
    if (ret > 0) {
fsnotify_access(file);
add_rchar(current, ret);
    }
inc_syscr(current);
    return ret;
}

vfs_read涉及到一个核心的数据结构 struct file,它是fd对应的文件句柄的实现。file包含了文件权限、inode等信息,此处需要重点介绍的是file_operations:file_operations定义了一系列文件操作的函数指针,不同文件系统通过实现这些函数指针来实现具体文件系统的io操作。

/*
 * struct file: the kernel object behind an open file descriptor.
 * The members used on the read path shown in this article are f_op
 * (operation table), f_mode (checked by vfs_read), f_pos (updated by
 * ksys_read), f_ra (readahead state) and f_mapping (page-cache lookup).
 */
struct file {
    union {
        struct llist_node   fu_llist;
        struct rcu_head     fu_rcuhead;
    } f_u;
    struct path     f_path;
    struct inode        *f_inode;   /* cached value */
    const struct file_operations    *f_op; // table of file I/O function pointers

    /*
     * Protects f_ep, f_flags.
     * Must not be taken from IRQ context.
     */
    spinlock_t      f_lock;
    enum rw_hint        f_write_hint;
    atomic_long_t       f_count;
    unsigned int        f_flags;
    fmode_t         f_mode;
    struct mutex        f_pos_lock;
    loff_t          f_pos;
    struct fown_struct  f_owner;
    const struct cred   *f_cred;
    struct file_ra_state    f_ra;

    u64         f_version;
#ifdef CONFIG_SECURITY
    void            *f_security;
#endif
    /* needed for tty driver, and maybe others */
    void            *private_data;

#ifdef CONFIG_EPOLL
    /* Used by fs/eventpoll.c to link all the hooks to this file */
    struct hlist_head   *f_ep;
#endif /* #ifdef CONFIG_EPOLL */
    struct address_space    *f_mapping; // address_space through which the file's cached pages are looked up (used by filemap_read)
    errseq_t        f_wb_err;
    errseq_t        f_sb_err; /* for syncfs */
} __randomize_layout
  __attribute__((aligned(4)));

file_operations file_operations 定义了一系列文件操作的函数指针,包括seek read write open flush等等。

/*
 * struct file_operations: table of function pointers through which file
 * operations are dispatched. vfs_read() consults ->read first and falls
 * back to ->read_iter (via new_sync_read()) when ->read is not set.
 */
struct file_operations {
    struct module *owner;
    loff_t (*llseek) (struct file *, loff_t, int);
    ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
    ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
    ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
    ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
    int (*iopoll)(struct kiocb *kiocb, bool spin);
    int (*iterate) (struct file *, struct dir_context *);
    int (*iterate_shared) (struct file *, struct dir_context *);
    __poll_t (*poll) (struct file *, struct poll_table_struct *);
    long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
    long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
    int (*mmap) (struct file *, struct vm_area_struct *);
    unsigned long mmap_supported_flags;
    int (*open) (struct inode *, struct file *);
    int (*flush) (struct file *, fl_owner_t id);
    int (*release) (struct inode *, struct file *);
    int (*fsync) (struct file *, loff_t, loff_t, int datasync);
    int (*fasync) (int, struct file *, int);
    int (*lock) (struct file *, int, struct file_lock *);
    ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
    unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
    int (*check_flags)(int);
    int (*flock) (struct file *, int, struct file_lock *);
    ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
    ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
    int (*setlease)(struct file *, long, struct file_lock **, void **);
    long (*fallocate)(struct file *file, int mode, loff_t offset,
              loff_t len);
    void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
    unsigned (*mmap_capabilities)(struct file *);
#endif
    ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
            loff_t, size_t, unsigned int);
    loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                   struct file *file_out, loff_t pos_out,
                   loff_t len, unsigned int remap_flags);
    int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;

ext4文件系统

vfs_read的实现里通过 file->f_op 的判断来获取对应的文件io实现方法。以ext4文件系统为例,ext4文件系统实现了read_iter方法,因此ext4 系统下sys_read实际会调用 new_sync_read。

if (file->f_op->read)// does the filesystem implement ->read?
        ret =file->f_op->read(file, buf, count,pos);
    else if (file->f_op->read_iter) // otherwise, does it implement ->read_iter?
        ret = new_sync_read(file, buf, count,pos);
    else
        ret = -EINVAL;

ext4_file_read_iter ext4_file_operations 为ext4 文件操作接口的具体实现,可以看到ext4只实现了read_iter方法没有实现read方法

/*
 * ext4's file_operations table. Note that only .read_iter is provided
 * (there is no .read), so vfs_read() reaches ext4 through new_sync_read().
 */
const struct file_operations ext4_file_operations = {
    .llseek     = ext4_llseek,
    .read_iter  = ext4_file_read_iter,
    .write_iter = ext4_file_write_iter,
    .iopoll     = iomap_dio_iopoll,
    .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = ext4_compat_ioctl,
#endif
    .mmap       = ext4_file_mmap,
    .mmap_supported_flags = MAP_SYNC,
    .open       = ext4_file_open,
    .release    = ext4_release_file,
    .fsync      = ext4_sync_file,
    .get_unmapped_area = thp_get_unmapped_area,
    .splice_read    = generic_file_splice_read,
    .splice_write   = iter_file_splice_write,
    .fallocate  = ext4_fallocate,
};

由于ext4实现了read_iter接口,因此vfs_read的实际调用为 new_sync_read

statics size_t new_sync_read(structfile *filp, char__user *buf,size_t len,loff_t *ppos)
{
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct kiocb kiocb;
    struct iov_iter iter;
  ssize_t ret;

 init_sync_kiocb(&kiocb,filp);
 kiocb.ki_pos = (ppos ? *ppos : 0);
 iov_iter_init(&iter,READ, &iov, 1, len);

ret =call_read_iter(filp, &kiocb, &iter);// 调用具体的read_iter实现
BUG_ON(ret == -EIOCBQUEUED);
    if (ppos)
        *ppos =kiocb.ki_pos;
    return ret;
}

new_sync_read首先会初始化iovec结构, 然后调用 call_read_iter进行文件的io读取。

call_read_iter 定义在include/linux/fs.h ,此处则通过f_op->read_iter(),调用具体的文件系统的实现。

static inline ssize_t call_read_iter(structfile *file, structkiocb *kio,
                     structiov_iter *iter)
{
    returnfile->f_op->read_iter(kio,iter);// 此处调用read_iter具体的文件系统实现,根据上面讲的,调用的具体实现即 ext4_file_read_iter
}

对应上文说到的,ext4文件系统的实现为 ext4_file_read_iter

statics size_t ext4_file_read_iter(structkiocb *iocb, structiov_iter *to)
{
    structinode *inode =file_inode(iocb->ki_filp);

    if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
        return -EIO;

    if (!iov_iter_count(to))
        return 0;/* skip atime */
#ifdef CONFIG_FS_DAX
    if (IS_DAX(inode))
        return ext4_dax_read_iter(iocb,to);
#endif
    if (iocb->ki_flags &IOCB_DIRECT)
        return ext4_dio_read_iter(iocb,to);

    return generic_file_read_iter(iocb,to); // 不考虑dio的情况,此处调用了系统默认的读取实现。
}

ext4_file_read_iter会判断文件系统是否挂载了fs_dax参数,fs_dax的含义此处不做深入介绍,本文只介绍常用情况下ext4的读操作流程。正常的读操作流程其实是调用了vfs的默认实现

generic_file_read_iter

/*
 * Generic ->read_iter implementation.
 *
 * Returns 0 for an empty request. With IOCB_DIRECT set it first deals with
 * cached data in the range (-EAGAIN under IOCB_NOWAIT if writeback is
 * needed, otherwise write-and-wait), then issues ->direct_IO; it returns
 * early on error, on a complete read, at/after i_size, or for DAX inodes.
 * In all other cases it ends in filemap_read(), passing along the number
 * of bytes already read.
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
    size_t count = iov_iter_count(iter);
    ssize_t retval = 0;

    if (!count)
        return 0; /* skip atime */

    if (iocb->ki_flags & IOCB_DIRECT) { // direct I/O requested?
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping; // the file's address_space (page-cache mapping)
        struct inode *inode = mapping->host;
        loff_t size;

        size = i_size_read(inode);
        if (iocb->ki_flags & IOCB_NOWAIT) {
            if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
                        iocb->ki_pos + count - 1))
                return -EAGAIN;
        } else {
            retval = filemap_write_and_wait_range(mapping,
                        iocb->ki_pos,
                            iocb->ki_pos + count - 1);
            if (retval < 0)
                return retval;
        }

        file_accessed(file);

        retval = mapping->a_ops->direct_IO(iocb, iter);
        if (retval >= 0) {
            iocb->ki_pos += retval;
            count -= retval;
        }
        if (retval != -EIOCBQUEUED)
            iov_iter_revert(iter, count - iov_iter_count(iter));

        /*
         * Btrfs can have a short DIO read if we encounter
         * compressed extents, so if there was an error, or if
         * we've already read everything we wanted to, or if
         * there was a short read because we hit EOF, go ahead
         * and return.  Otherwise fallthrough to buffered io for
         * the rest of the read.  Buffered reads will not work for
         * DAX files, so don't bother trying.
         */
        if (retval < 0 || !count || iocb->ki_pos >= size ||
            IS_DAX(inode))
            return retval;
    }

    return filemap_read(iocb, iter, retval); // buffered path: this is the call taken when direct I/O is not requested
}

该函数会判断是否设置了dio,dio此处不做深入解析,直接看filemap_read的具体实现。

page cache

filemap_read

/*
 * Buffered read: copy data from the page cache into @iter.
 *
 * @iocb:         carries the file and the current position (ki_pos)
 * @iter:         destination for the data
 * @already_read: bytes read before this call; added to the return value
 *
 * Loops: fetch a batch of pages with filemap_get_pages(), copy from each
 * page to @iter while advancing ki_pos, then drop the page references.
 * Stops when @iter is full, ki_pos reaches i_size, or an error occurs.
 * Returns the total byte count if non-zero, otherwise the error code.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
        ssize_t already_read)
{
    struct file *filp = iocb->ki_filp;
    struct file_ra_state *ra = &filp->f_ra;
    struct address_space *mapping = filp->f_mapping;
    struct inode *inode = mapping->host;
    struct pagevec pvec;
    int i, error = 0;
    bool writably_mapped;
    loff_t isize, end_offset;

    if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
        return 0;
    if (unlikely(!iov_iter_count(iter)))
        return 0;

    iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
    pagevec_init(&pvec);

    do {
        cond_resched();

        /*
         * If we've already successfully copied some data, then we
         * can no longer safely return -EIOCBQUEUED. Hence mark
         * an async read NOWAIT at that point.
         */
        if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
            iocb->ki_flags |= IOCB_NOWAIT;

        /* fetch a batch of page-cache pages for the current position */
        error = filemap_get_pages(iocb, iter, &pvec);
        if (error < 0)
            break;

        /*
         * i_size must be checked after we know the pages are Uptodate.
         *
         * Checking i_size after the check allows us to calculate
         * the correct value for "nr", which means the zero-filled
         * part of the page is not copied back to userspace (unless
         * another truncate extends the file - this is desired though).
         */
        isize = i_size_read(inode);
        if (unlikely(iocb->ki_pos >= isize))
            goto put_pages;
        end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

        /*
         * Once we start copying data, we don't want to be touching any
         * cachelines that might be contended:
         */
        writably_mapped = mapping_writably_mapped(mapping);

        /*
         * When a sequential read accesses a page several times, only
         * mark it as accessed the first time.
         */
        if (iocb->ki_pos >> PAGE_SHIFT !=
            ra->prev_pos >> PAGE_SHIFT)
            mark_page_accessed(pvec.pages[0]);

        for (i = 0; i < pagevec_count(&pvec); i++) {
            struct page *page = pvec.pages[i];
            size_t page_size = thp_size(page);
            size_t offset = iocb->ki_pos & (page_size - 1);
            size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
                         page_size - offset);
            size_t copied;

            if (end_offset < page_offset(page))
                break;
            if (i > 0)
                mark_page_accessed(page);
            /*
             * If users can be writing to this page using arbitrary
             * virtual addresses, take care about potential aliasing
             * before reading the page on the kernel side.
             */
            if (writably_mapped) {
                int j;

                for (j = 0; j < thp_nr_pages(page); j++)
                    flush_dcache_page(page + j);
            }

            copied = copy_page_to_iter(page, offset, bytes, iter);

            already_read += copied;
            iocb->ki_pos += copied;
            ra->prev_pos = iocb->ki_pos;

            if (copied < bytes) {
                error = -EFAULT;
                break;
            }
        }
put_pages:
        /*
         * Done with this batch: drop the reference on every page in the
         * pagevec (this releases our hold on the pages; it does not
         * insert them into the page cache), then reset the pagevec once
         * for the next iteration.
         */
        for (i = 0; i < pagevec_count(&pvec); i++)
            put_page(pvec.pages[i]);
        pagevec_reinit(&pvec);
    } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

    file_accessed(filp);

    return already_read ? already_read : error;
}

当我们从文件读取数据时,在非dio的场景下,往往是先判断文件对应的page是否存在于page cache中,如果存在并且page的内容是最新的(PageUptodate),那么就可以直接从page cache读取;通过page cache可以大大提升文件的读写性能。page cache读取的具体实现细节在 filemap_get_pages 中。

/*
 * Fill @pvec with page-cache pages for the read described by @iocb/@iter.
 *
 * Order of attempts: look up a batch in the page cache; if empty, run
 * synchronous readahead and look again; if still empty, create a single
 * page with filemap_create_page(). When a batch was found, the last page
 * is checked for the readahead flag and for being uptodate.
 *
 * Returns 0 on success or a negative errno (-EINTR on a pending fatal
 * signal, -EAGAIN where the iocb flags forbid I/O or waiting, or an error
 * from a helper). AOP_TRUNCATED_PAGE from a helper restarts the lookup.
 */
static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
        struct pagevec *pvec)
{
    struct file *filp = iocb->ki_filp;
    struct address_space *mapping = filp->f_mapping;
    struct file_ra_state *ra = &filp->f_ra;
    pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
    pgoff_t last_index;
    struct page *page;
    int err = 0;

    last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
retry:
    if (fatal_signal_pending(current))
        return -EINTR;

    filemap_get_read_batch(mapping, index, last_index, pvec); // look up a batch of pages in the page cache
    if (!pagevec_count(pvec)) {
        if (iocb->ki_flags & IOCB_NOIO)
            return -EAGAIN;
        page_cache_sync_readahead(mapping, ra, filp, index,
                last_index - index);
        filemap_get_read_batch(mapping, index, last_index, pvec);
    }
    if (!pagevec_count(pvec)) {
        if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
            return -EAGAIN;
        err = filemap_create_page(filp, mapping,
                iocb->ki_pos >> PAGE_SHIFT, pvec); // still nothing cached: allocate a page, add it to the page cache and read it in
        if (err == AOP_TRUNCATED_PAGE)
            goto retry;
        return err;
    }

    page = pvec->pages[pagevec_count(pvec) - 1];
    if (PageReadahead(page)) {
        err = filemap_readahead(iocb, filp, mapping, page, last_index); // last page carries the readahead flag: trigger further readahead
        // for sequential I/O, reading ahead lets the next read be served
        // from the page cache and reduces disk I/O
        if (err)
            goto err;
    }
    if (!PageUptodate(page)) {
        if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1)
            iocb->ki_flags |= IOCB_NOWAIT;
        err = filemap_update_page(iocb, mapping, iter, page);
        if (err)
            goto err;
    }

    return 0;
err:
    if (err < 0)
        put_page(page);
    if (likely(--pvec->nr))
        return 0;
    if (err == AOP_TRUNCATED_PAGE)
        goto retry;
    return err;
}

当文件的page不在page cache中(缓存未命中,注意这并不是CPU的缺页异常),则会进入 filemap_create_page 分配新的page并从磁盘读入数据。

/*
 * Handle a page-cache miss for @index: allocate a page, insert it into
 * @mapping, read its contents via filemap_read_page(), and add it to @pvec.
 *
 * Returns 0 on success, -ENOMEM if allocation fails, AOP_TRUNCATED_PAGE if
 * the index was already populated (-EEXIST from the insert), or the error
 * from the insert/read. On failure the page reference is dropped.
 */
static int filemap_create_page(struct file *file,
        struct address_space *mapping, pgoff_t index,
        struct pagevec *pvec)
{
    struct page *page;
    int error;

    page = page_cache_alloc(mapping); // allocate a page for the page cache
    if (!page)
        return -ENOMEM;

    /*
     * Protect against truncate / hole punch. Grabbing invalidate_lock here
     * assures we cannot instantiate and bring uptodate new pagecache pages
     * after evicting page cache during truncate and before actually
     * freeing blocks.  Note that we could release invalidate_lock after
     * inserting the page into page cache as the locked page would then be
     * enough to synchronize with hole punching. But there are code paths
     * such as filemap_update_page() filling in partially uptodate pages or
     * ->readpages() that need to hold invalidate_lock while mapping blocks
     * for IO so let's hold the lock here as well to keep locking rules
     * simple.
     */
    filemap_invalidate_lock_shared(mapping);
    error = add_to_page_cache_lru(page, mapping, index,
            mapping_gfp_constraint(mapping, GFP_KERNEL));// insert the page into the page cache (and its LRU list)
    if (error == -EEXIST)
        error = AOP_TRUNCATED_PAGE;
    if (error)
        goto error;

    error = filemap_read_page(file, mapping, page); // fill the new page through the mapping's ->readpage
    if (error)
        goto error;

    filemap_invalidate_unlock_shared(mapping);
    pagevec_add(pvec, page);
    return 0;
error:
    filemap_invalidate_unlock_shared(mapping);
    put_page(page);
    return error;
}

filemap_create_page 首先会分配内存page,分配内存page的实现为 page_cache_alloc,此处通过内核的伙伴系统(buddy allocator)分配一个page,伙伴系统的详细实现本文先不做深入探讨。

/*
 * __page_cache_alloc(): with CONFIG_NUMA the function is only declared
 * here (defined elsewhere); without it, it is an inline wrapper that
 * allocates a single order-0 page with alloc_pages().
 */
#ifdef CONFIG_NUMA
extern struct page *__page_cache_alloc(gfp_t gfp);
#else
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
    return alloc_pages(gfp, 0);
}
#endif

/*
 * Allocate a page for the address_space @x, using the gfp mask obtained
 * from mapping_gfp_mask(@x). Returns whatever __page_cache_alloc() returns.
 */
static inline struct page *page_cache_alloc(struct address_space *x)
{
    return __page_cache_alloc(mapping_gfp_mask(x));
}

[alloc_pages](https://elixir.bootlin.com/linux/v5.15/source/include/linux/gfp.h#L588)

/*
 * alloc_pages(): with CONFIG_NUMA it is declared here and defined
 * elsewhere; without it, it is an inline wrapper around alloc_pages_node()
 * for the current node (numa_node_id()).
 * NOTE(review): this is an excerpt; the upstream header may define further
 * fallbacks in the #else branch before the #endif.
 */
#ifdef CONFIG_NUMA
struct page *alloc_pages(gfp_t gfp, unsigned int order);
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
            struct vm_area_struct *vma, unsigned long addr,
            int node, bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
    alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
#else
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
    return alloc_pages_node(numa_node_id(), gfp_mask, order);
}
#endif

分配完page后,则从磁盘读取文件page filemap_read_page

/*
 * Read one page's contents through the mapping's ->readpage and wait for
 * the read to finish.
 *
 * Returns 0 if the page ends up uptodate, the error from ->readpage or
 * from the (killable) wait, or -EIO if the page is still not uptodate
 * afterwards (in which case the file's readahead state is also adjusted
 * via shrink_readahead_size_eio()).
 */
static int filemap_read_page(struct file *file, struct address_space *mapping,
        struct page *page)
{
    int error;

    /*
     * A previous I/O error may have been due to temporary failures,
     * eg. multipath errors.  PG_error will be set again if readpage
     * fails.
     */
    ClearPageError(page);
    /* Start the actual read. The read will unlock the page. */
    error = mapping->a_ops->readpage(file, page);// ->readpage is a function pointer; for ext4 it
    // resolves to ext4_readpage (see ext4_aops)
    if (error)
        return error;

    error = wait_on_page_locked_killable(page);
    if (error)
        return error;
    if (PageUptodate(page))
        return 0;
    shrink_readahead_size_eio(&file->f_ra);
    return -EIO;
}

从物理文件读取page的核心定义为 a_ops 该结构体定义了从物理文件读取page的一系列函数,不同的文件系统对应到具体不同的函数实现。

/*
 * struct address_space_operations: per-mapping table of function pointers
 * for moving data between the page cache and backing storage. On the read
 * path in this article, filemap_read_page() calls ->readpage and
 * generic_file_read_iter() calls ->direct_IO.
 */
struct address_space_operations {
    int (*writepage)(struct page *page, struct writeback_control *wbc);
    int (*readpage)(struct file *, struct page *);

    /* Write back some dirty pages from this mapping. */
    int (*writepages)(struct address_space *, struct writeback_control *);

    /* Set a page dirty.  Return true if this dirtied it */
    int (*set_page_dirty)(struct page *page);

    /*
     * Reads in the requested pages. Unlike ->readpage(), this is
     * PURELY used for read-ahead!.
     */
    int (*readpages)(struct file *filp, struct address_space *mapping,
            struct list_head *pages, unsigned nr_pages);
    void (*readahead)(struct readahead_control *);

    int (*write_begin)(struct file *, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned flags,
                struct page **pagep, void **fsdata);
    int (*write_end)(struct file *, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned copied,
                struct page *page, void *fsdata);

    /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
    sector_t (*bmap)(struct address_space *, sector_t);
    void (*invalidatepage) (struct page *, unsigned int, unsigned int);
    int (*releasepage) (struct page *, gfp_t);
    void (*freepage)(struct page *);
    ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
    /*
     * migrate the contents of a page to the specified target. If
     * migrate_mode is MIGRATE_ASYNC, it must not block.
     */
    int (*migratepage) (struct address_space *,
            struct page *, struct page *, enum migrate_mode);
    bool (*isolate_page)(struct page *, isolate_mode_t);
    void (*putback_page)(struct page *);
    int (*launder_page) (struct page *);
    int (*is_partially_uptodate) (struct page *, unsigned long,
                    unsigned long);
    void (*is_dirty_writeback) (struct page *, bool *, bool *);
    int (*error_remove_page)(struct address_space *, struct page *);

    /* swapfile support */
    int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
                sector_t *span);
    void (*swap_deactivate)(struct file *file);
};

同样以ext4为例,ext4 的 a_ops 具体定义如下,相对应的readpage实现为ext4_readpage

/*
 * An address_space_operations instance for ext4. Its ->readpage is
 * ext4_readpage, which is what filemap_read_page() ends up calling for
 * mappings that use this table.
 */
static const struct address_space_operations ext4_aops = {
    .readpage       = ext4_readpage,
    .readahead      = ext4_readahead,
    .writepage      = ext4_writepage,
    .writepages     = ext4_writepages,
    .write_begin        = ext4_write_begin,
    .write_end      = ext4_write_end,
    .set_page_dirty     = ext4_set_page_dirty,
    .bmap           = ext4_bmap,
    .invalidatepage     = ext4_invalidatepage,
    .releasepage        = ext4_releasepage,
    .direct_IO      = noop_direct_IO,
    .migratepage        = buffer_migrate_page,
    .is_partially_uptodate  = block_is_partially_uptodate,
    .error_remove_page  = generic_error_remove_page,
    .swap_activate      = ext4_iomap_swap_activate,
};

ext4_readpage

/*
 * ext4's ->readpage.
 *
 * If the inode has inline data, the inline read path is tried first. When
 * that path is not taken, or it returns -EAGAIN, the page is read through
 * ext4_mpage_readpages(). Otherwise the inline path's result is returned.
 */
static int ext4_readpage(struct file *file, struct page *page)
{
    int ret = -EAGAIN; /* default: fall through to ext4_mpage_readpages() */
    struct inode *inode = page->mapping->host;

    trace_ext4_readpage(page);

    if (ext4_has_inline_data(inode))
        ret = ext4_readpage_inline(inode, page);

    if (ret == -EAGAIN)
        return ext4_mpage_readpages(inode, NULL, page);

    return ret;
}

ext4_mpage_readpages则构造bio请求从块设备读取数据 ,通过构造bio将任务提交到io调度器,从而向块设备驱动提交读请求。至此,用户发起读系统调用请求真正进入磁盘块设备读取物理文件数据。

reference:

  1. SYSCALL_DEFINE https://blog.csdn.net/hxmhyp/article/details/22699669

你可能感兴趣的:(【linux内核源码】 io操作之read)