Linux内核读取文件流程源码及阻塞点超详解

以linux内核3.13版本为例,用户进程首先通过系统调用read()进入内核,内核执行sys_read()函数,该函数位于文件linux/fs/read_write.c中:

//linux/fs/read_write.c

/*
 * read(2) entry point: resolve fd to its open file, read through the VFS
 * starting at the file's current position, and store the new position back.
 * Returns whatever vfs_read() returns, or -EBADF when fd has no open file.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    struct fd f = fdget(fd);      // look up the file object for descriptor fd
    ssize_t ret = -EBADF;

    if (f.file) {
        loff_t pos = file_pos_read(f.file);        // fetch the file's current position
        ret = vfs_read(f.file, buf, count, &pos);  // do the actual read via vfs_read()
        if (ret >= 0)
            file_pos_write(f.file, pos);     // store the updated position (skipped on error)
        fdput(f);
    }
    return ret;
}

每个进程的进程控制块task_struct中都有一个files_struct结构体,它保存了进程所有打开的文件,以文件描述符fd为索引即可找到对应的file对象,file对象中也包含了文件当前位置的信息。
再来看vfs_read函数,同样在文件linux/fs/read_write.c中:

//linux/fs/read_write.c

/*
 * Validate a read request against the file and dispatch it to the file's
 * own read method, or to do_sync_read() when only aio_read is provided.
 *
 * Returns the byte count / error from the read method, or:
 *   -EBADF   the file was not opened for reading
 *   -EINVAL  the file has neither a read nor an aio_read method
 *   -EFAULT  the user buffer fails the access_ok() check
 *   a negative value from rw_verify_area()
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_READ))
        return -EBADF;
    if (!file->f_op->read && !file->f_op->aio_read)
        return -EINVAL;
    // a read stores data INTO buf, so the user buffer must be writable
    if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
        return -EFAULT;

    // validate the requested range; the original annotation says this checks
    // file locks -- NOTE(review): not visible from this snippet, confirm
    ret = rw_verify_area(READ, file, pos, count);
    if (ret >= 0) {
        count = ret;   // a non-negative result replaces the requested count
        if (file->f_op->read)
            ret = file->f_op->read(file, buf, count, pos);
        else
            ret = do_sync_read(file, buf, count, pos);
        if (ret > 0) {
            // only reads that returned data are notified and accounted
            fsnotify_access(file);
            add_rchar(current, ret);
        }
        inc_syscr(current);
    }

    return ret;
}

如果文件定义了read函数,则调用文件自身的read函数,否则调用do_sync_read()函数。file->f_op是从对应的inode->i_fop而来,而inode->i_fop是由对应的文件系统类型在生成这个inode时赋予的。file->f_op->read对于磁盘文件系统来说通常就等同于do_sync_read(),比如ext2文件系统。
来看一下do_sync_read()函数:

//linux/fs/read_write.c

/*
 * Synchronous read built on top of the file's aio_read method: wrap the
 * single user buffer in one iovec, issue aio_read with an on-stack kiocb,
 * and wait for completion if the request was only queued.
 * On return *ppos holds the kiocb's final position.
 */
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
    // An iovec describes one user buffer of a read/write (iov_base = start,
    // iov_len = length). A request may carry several discontiguous segments,
    // one iovec each; this path always passes exactly one (nr_segs == 1 below).
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct kiocb kiocb;
    ssize_t ret;

    // initialise the synchronous control block: file, position, byte count
    init_sync_kiocb(&kiocb, filp);
    kiocb.ki_pos = *ppos;
    kiocb.ki_nbytes = len;

    // call the filesystem's asynchronous read method
    ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
    // -EIOCBQUEUED means the request is still queued, so wait for it here.
    // NOTE(review): the original annotation says the wait puts the task in
    // TASK_UNINTERRUPTIBLE until kiocb.ki_ctx becomes valid -- not visible
    // in this snippet, confirm against wait_on_sync_kiocb().
    if (-EIOCBQUEUED == ret)
        ret = wait_on_sync_kiocb(&kiocb);
    *ppos = kiocb.ki_pos;
    return ret;
}

do_sync_read()函数里继续调用了该文件的f_op->aio_read()函数进行异步读操作;如果aio_read()返回-EIOCBQUEUED,还需要调用wait_on_sync_kiocb()函数进行同步(即wait_on_sync_kiocb()函数返回时数据已经准备好)。对于ext2文件系统,其f_op->aio_read()函数指向通用的generic_file_aio_read()。
来看一下generic_file_aio_read()函数:

//linux/mm/filemap.c

/*
 * Generic aio_read implementation.
 *
 * For O_DIRECT files it first writes back the cached range and reads
 * straight from the device via a_ops->direct_IO; any part not satisfied
 * that way (or the whole request for non-O_DIRECT files) is read through
 * the page cache, one iovec segment at a time, by do_generic_file_read().
 *
 * Returns the number of bytes read, or a negative error when nothing was
 * read before the failure.
 */
ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
        unsigned long nr_segs, loff_t pos)
{
    struct file *filp = iocb->ki_filp;
    ssize_t retval;
    unsigned long seg = 0;
    size_t count;
    loff_t *ppos = &iocb->ki_pos;

    count = 0;
    // check each segment of the user buffer for writability; nr_segs and
    // count are passed by pointer (presumably updated by the helper)
    retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
    if (retval)
        return retval;

    /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
    // Direct I/O bypasses the page cache: dirty cached data in the range is
    // written to the device first, then the filesystem's
    // address_space->direct_IO method reads from the device.
    if (filp->f_flags & O_DIRECT) {
        loff_t size;
        struct address_space *mapping;
        struct inode *inode;

        mapping = filp->f_mapping;
        inode = mapping->host;
        if (!count)
            goto out; /* skip atime */
        size = i_size_read(inode);
        if (pos < size) {
            // flush cached contents of the range to the device and wait
            retval = filemap_write_and_wait_range(mapping, pos,
                    pos + iov_length(iov, nr_segs) - 1);
            if (!retval) {
                // read from the device through the filesystem's direct_IO method
                retval = mapping->a_ops->direct_IO(READ, iocb,
                            iov, pos, nr_segs);
            }
            if (retval > 0) {
                *ppos = pos + retval;
                count -= retval;
            }

            /*
             * Btrfs can have a short DIO read if we encounter
             * compressed extents, so if there was an error, or if
             * we've already read everything we wanted to, or if
             * there was a short read because we hit EOF, go ahead
             * and return.  Otherwise fallthrough to buffered io for
             * the rest of the read.
             */
            if (retval < 0 || !count || *ppos >= size) {
                file_accessed(filp);
                goto out;
            }
        }
    }

    // from here on, count = bytes already delivered by a short direct read
    count = retval;
    // turn each iovec segment into a read_descriptor_t and hand it to
    // do_generic_file_read()
    for (seg = 0; seg < nr_segs; seg++) {
        read_descriptor_t desc;
        loff_t offset = 0;

        /*
         * If we did a short DIO read we need to skip the section of the
         * iov that we've already read data into.
         */
        if (count) {
            if (count > iov[seg].iov_len) {
                count -= iov[seg].iov_len;
                continue;
            }
            offset = count;
            count = 0;
        }

        desc.written = 0;
        desc.arg.buf = iov[seg].iov_base + offset;
        desc.count = iov[seg].iov_len - offset;
        if (desc.count == 0)
            continue;
        desc.error = 0;
        do_generic_file_read(filp, ppos, &desc);
        retval += desc.written;
        if (desc.error) {
            // report the error only if no bytes were read at all
            retval = retval ?: desc.error;
            break;
        }
        // segment not completely filled (short read): stop here
        if (desc.count > 0)
            break;
    }
out:
    return retval;
}

do_generic_file_read()函数是内核提供的一个通用的读函数,它完成一个iovec对应的连续缓冲区上的数据的读入操作(用read_descriptor_t类型的结构体描述)。do_generic_file_read()函数会判断该页是否在页缓存中,如果在则直接将数据复制到用户空间,如果不在,则先通过磁盘IO读入页缓存,再按照已经存在的情况处理。代码如下:

//linux/mm/filemap.c

/*
 * Read the range starting at *ppos and described by desc from filp through
 * the page cache, one page per loop iteration.
 *
 * For each page index the loop either finds an up-to-date page and copies
 * it out with file_read_actor(), finds a page that is not up to date and
 * reads it in via a_ops->readpage, or allocates a new page, inserts it
 * into the page cache and then reads it in.
 *
 * Errors are reported through desc->error. On exit *ppos and the file's
 * readahead state (ra->prev_pos) are updated to where the read stopped.
 */
static void do_generic_file_read(struct file *filp, loff_t *ppos,
        read_descriptor_t *desc)
{
    struct address_space *mapping = filp->f_mapping;
    struct inode *inode = mapping->host;
    struct file_ra_state *ra = &filp->f_ra;
    pgoff_t index;
    pgoff_t last_index;
    pgoff_t prev_index;
    unsigned long offset;      /* offset into pagecache page */
    unsigned int prev_offset;
    int error;

    // convert the byte range into page-cache indices: index .. last_index
    index = *ppos >> PAGE_CACHE_SHIFT;
    prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
    prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
    last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
    offset = *ppos & ~PAGE_CACHE_MASK;

    for (;;) {
        struct page *page;
        pgoff_t end_index;
        loff_t isize;
        unsigned long nr, ret;

        cond_resched();   // the task may be scheduled out here
find_page:
        page = find_get_page(mapping, index);   // look the page up in the page cache
        if (!page) {      // not cached: trigger synchronous readahead and look again
            page_cache_sync_readahead(mapping,
                    ra, filp,
                    index, last_index - index);
            page = find_get_page(mapping, index);
            if (unlikely(page == NULL))   // still missing: no page frame exists for this index yet
                goto no_cached_page;
        }
        // asynchronous readahead
        if (PageReadahead(page)) {
            page_cache_async_readahead(mapping,
                    ra, filp, page,
                    index, last_index - index);
        }
        // A cached page must also be up to date (PG_uptodate); that flag is
        // set by the bio completion callback (mpage_end_io in this file).
        if (!PageUptodate(page)) {
            if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
                    !mapping->a_ops->is_partially_uptodate)
                goto page_not_up_to_date;
            if (!trylock_page(page))
                goto page_not_up_to_date;
            /* Did it get truncated before we got the lock? */
            if (!page->mapping)
                goto page_not_up_to_date_locked;
            if (!mapping->a_ops->is_partially_uptodate(page,
                                desc, offset))
                goto page_not_up_to_date_locked;
            unlock_page(page);
        }
page_ok:
        /*
         * i_size must be checked after we know the page is Uptodate.
         *
         * Checking i_size after the check allows us to calculate
         * the correct value for "nr", which means the zero-filled
         * part of the page is not copied back to userspace (unless
         * another truncate extends the file - this is desired though).
         */

        isize = i_size_read(inode);
        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
        if (unlikely(!isize || index > end_index)) {
            page_cache_release(page);
            goto out;
        }

        /* nr is the maximum number of bytes to copy from this page */
        nr = PAGE_CACHE_SIZE;
        if (index == end_index) {
            nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
            if (nr <= offset) {
                page_cache_release(page);
                goto out;
            }
        }
        nr = nr - offset;

        /* If users can be writing to this page using arbitrary
         * virtual addresses, take care about potential aliasing
         * before reading the page on the kernel side.
         */
        if (mapping_writably_mapped(mapping))
            flush_dcache_page(page);

        /*
         * When a sequential read accesses a page several times,
         * only mark it as accessed the first time.
         */
        if (prev_index != index || offset != prev_offset)
            mark_page_accessed(page);
        prev_index = index;

        /*
         * Ok, we have the page, and it's up-to-date, so
         * now we can copy it to user space...
         *
         * The file_read_actor routine returns how many bytes were
         * actually used..
         * NOTE! This may not be the same as how much of a user buffer
         * we filled up (we may be padding etc), so we can only update
         * "pos" here (the actor routine has to update the user buffer
         * pointers and the remaining count).
         */
        // copy the data to user space
        ret = file_read_actor(desc, page, offset, nr);
        offset += ret;
        index += offset >> PAGE_CACHE_SHIFT;
        offset &= ~PAGE_CACHE_MASK;
        prev_offset = offset;

        page_cache_release(page);
        if (ret == nr && desc->count)
            continue;
        goto out;

page_not_up_to_date:
        /* Get exclusive access to the page ... */
        // Lock the page. If the page is already locked (e.g. a read of it is
        // in flight) this task blocks here until the lock is released.
        error = lock_page_killable(page);
        if (unlikely(error))
            goto readpage_error;

page_not_up_to_date_locked:
        /* Did it get truncated before we got the lock? */
        if (!page->mapping) {
            unlock_page(page);
            page_cache_release(page);
            continue;
        }

        /* Did somebody else fill it already? */
        // did someone else bring the page up to date while we waited for the lock?
        if (PageUptodate(page)) {
            unlock_page(page);
            goto page_ok;
        }

readpage:
        /*
         * A previous I/O error may have been due to temporary
         * failures, eg. multipath errors.
         * PG_error will be set again if readpage fails.
         */
        ClearPageError(page);
        /* Start the actual read. The read will unlock the page. */
        // entry point of the real read from disk
        error = mapping->a_ops->readpage(filp, page);

        if (unlikely(error)) {
            if (error == AOP_TRUNCATED_PAGE) {
                page_cache_release(page);
                goto find_page;
            }
            goto readpage_error;
        }

        // If the page is still not up to date, lock it and block. When the
        // disk read completes, the completion handler marks the page up to
        // date and unlocks it, which lets this lock attempt proceed.
        if (!PageUptodate(page)) {
            error = lock_page_killable(page);
            if (unlikely(error))
                goto readpage_error;
            if (!PageUptodate(page)) {
                if (page->mapping == NULL) {
                    /*
                     * invalidate_mapping_pages got it
                     */
                    unlock_page(page);
                    page_cache_release(page);
                    goto find_page;
                }
                unlock_page(page);
                shrink_readahead_size_eio(filp, ra);
                error = -EIO;
                goto readpage_error;
            }
            unlock_page(page);
        }

        goto page_ok;

readpage_error:
        /* UHHUH! A synchronous read error occurred. Report it */
        desc->error = error;
        page_cache_release(page);
        goto out;

no_cached_page:
        /*
         * Ok, it wasn't cached, so we need to create a new
         * page..
         */
        // allocate a page frame to cache the data about to be read
        page = page_cache_alloc_cold(mapping);
        if (!page) {
            desc->error = -ENOMEM;
            goto out;
        }
        // insert the page into the address_space's page cache (the original
        // annotation notes the cache is organised as a radix tree for lookup)
        error = add_to_page_cache_lru(page, mapping,
                        index, GFP_KERNEL);
        if (error) {
            page_cache_release(page);
            // -EEXIST: the index was populated in the meantime, so retry the lookup
            if (error == -EEXIST)
                goto find_page;
            desc->error = error;
            goto out;
        }
        goto readpage;
    }

out:
    // save where this read stopped in the readahead state
    ra->prev_pos = prev_index;
    ra->prev_pos <<= PAGE_CACHE_SHIFT;
    ra->prev_pos |= prev_offset;

    *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
    file_accessed(filp);
}

do_generic_file_read()函数的主体是一个for循环,先计算出来文件在页缓存中的索引值,然后对于每个index都有三种情况:

  1. 页缓存中存在最新数据,用file_read_actor()将页缓存中的数据复制到用户空间
  2. 页缓存中存在数据但不是最新的,则需要加锁后用address_space->a_ops->readpage()读入。ext2文件系统对应的函数是ext2_readpage()。在加锁后、启动readpage()之前,也可能因为别的进程已经完成了该页的读入操作而转入上一种情况处理。
  3. 页缓存中页帧不存在,此时需要先调用page_cache_alloc_cold()函数分配一个页帧,并加入address_space,然后转到上一条处理。

普通文件的read操作主要是和页缓存相交互的,即使要读取磁盘,通常也是先读取到页缓存中再复制到用户空间的。

如果文件数据不在页缓存中,需要调用address_space->a_ops->readpage()函数从磁盘读入对应的页:

//linux/fs/ext2/inode.c
/*
 * ext2 readpage method: delegates to the generic mpage_readpage() with
 * ext2_get_block as the block-mapping callback. The file argument is unused.
 */
static int ext2_readpage(struct file *file, struct page *page)
{
    return mpage_readpage(page, ext2_get_block);
}

//linux/fs/mpage.c
/*
 * Read a single page: build a bio for it with do_mpage_readpage() and, if a
 * bio was returned, submit it as a READ. Always returns 0.
 */
int mpage_readpage(struct page *page, get_block_t get_block)
{
    struct bio *bio = NULL;
    sector_t last_block_in_bio = 0;
    struct buffer_head map_bh;
    unsigned long first_logical_block = 0;

    map_bh.b_state = 0;
    map_bh.b_size = 0;
    // do_mpage_readpage builds the bio for this page. Per the original
    // annotation, a bio records one request's start sector, sector count,
    // direction, and the pages/offsets/lengths involved.
    bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
            &map_bh, &first_logical_block, get_block);
    if (bio)
        // hand the bio to the block layer
        mpage_bio_submit(READ, bio);
    return 0;
}

//linux/fs/mpage.c
/*
 * Install mpage_end_io as the bio's completion callback and submit it.
 * Always returns NULL.
 */
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
    // Completion callback: mpage_end_io walks every vector of the bio, marks
    // each page up to date (or in error) and unlocks it for reads; unlocking
    // is what lets a task waiting on the page lock continue.
    bio->bi_end_io = mpage_end_io;
    // pass the bio to the generic block layer (per the original annotation,
    // that layer merges and schedules incoming requests)
    submit_bio(rw, bio);
    return NULL;
}

//linux/fs/mpage.c
/*
 * bio completion callback installed by mpage_bio_submit().
 *
 * Walks the bio's vectors from last to first. For a READ bio each page is
 * marked up to date on success (or not-up-to-date plus error on failure)
 * and unlocked; for a WRITE bio failures are recorded on the page and its
 * mapping and writeback is ended. Finally drops the bio reference.
 * The err argument is unused; success is taken from BIO_UPTODATE.
 */
static void mpage_end_io(struct bio *bio, int err)
{
    const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
    struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

    do {
        struct page *page = bvec->bv_page;

        // step to the previous vector and prefetch its page flags, if any
        if (--bvec >= bio->bi_io_vec)
            prefetchw(&bvec->bv_page->flags);
        if (bio_data_dir(bio) == READ) {
            if (uptodate) {
                SetPageUptodate(page);
            } else {
                ClearPageUptodate(page);
                SetPageError(page);
            }
            // releasing the page lock lets a reader blocked on it proceed
            unlock_page(page);
        } else { /* bio_data_dir(bio) == WRITE */
            if (!uptodate) {
                SetPageError(page);
                if (page->mapping)
                    set_bit(AS_EIO, &page->mapping->flags);
            }
            end_page_writeback(page);
        }
    } while (bvec >= bio->bi_io_vec);
    bio_put(bio);
}

submit_bio()函数将bio请求转换为磁盘request请求,其中包含了合并及调度IO优化,并将request请求挂入到磁盘请求队列上。每个硬盘设备都带有一个request请求队列,用于异步地接收读写等请求,这些请求由磁盘工作队列kblockd_workqueue在后台完成,不过具体的操作还是要通过磁盘驱动器的驱动程序完成。用户发出的读写请求在挂入磁盘的request请求队列之前会进行IO调度和优化。其函数内部主要是调用了generic_make_request()函数:

//linux/block/blk-core.c
/*
 * Submit a bio to the block layer: merge rw into bio->bi_rw, do the I/O
 * accounting (and optional block_dump logging) for bios that carry data,
 * then pass the bio to generic_make_request().
 */
void submit_bio(int rw, struct bio *bio)
{
    bio->bi_rw |= rw;

    /*
     * If it's a regular read/write or a barrier with data attached,
     * go through the normal accounting stuff before submission.
     */
    if (bio_has_data(bio)) {
        unsigned int count;

        // count is in 512-byte sectors (logical block size >> 9 for WRITE_SAME)
        if (unlikely(rw & REQ_WRITE_SAME))
            count = bdev_logical_block_size(bio->bi_bdev) >> 9;
        else
            count = bio_sectors(bio);

        if (rw & WRITE) {
            count_vm_events(PGPGOUT, count);
        } else {
            task_io_account_read(bio->bi_size);
            count_vm_events(PGPGIN, count);
        }

        if (unlikely(block_dump)) {
            char b[BDEVNAME_SIZE];
            printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
            current->comm, task_pid_nr(current),
                (rw & WRITE) ? "WRITE" : "READ",
                (unsigned long long)bio->bi_sector,
                bdevname(bio->bi_bdev, b),
                count);
        }
    }

    generic_make_request(bio);
}

/*
 * Hand a bio to the make_request_fn of its device's request queue.
 *
 * Only one make_request_fn runs per task at a time: if this task is already
 * inside generic_make_request() (current->bio_list is non-NULL), the bio is
 * appended to that list and handled by the outer invocation's loop instead
 * of recursing.
 */
void generic_make_request(struct bio *bio)
{
    struct bio_list bio_list_on_stack;

    // sanity checks on the bio
    if (!generic_make_request_checks(bio))
        return;

    /*
     * We only want one ->make_request_fn to be active at a time, else
     * stack usage with stacked devices could be a problem.  So use
     * current->bio_list to keep a list of requests submited by a
     * make_request_fn function.  current->bio_list is also used as a
     * flag to say if generic_make_request is currently active in this
     * task or not.  If it is NULL, then no make_request is active.  If
     * it is non-NULL, then a make_request is active, and new requests
     * should be added at the tail
     */
    // current->bio_list is per-task: non-NULL means THIS task is already
    // running generic_make_request() (a recursive call from a
    // make_request_fn), so just queue the bio for the outer loop.
    if (current->bio_list) {
        bio_list_add(current->bio_list, bio);
        return;
    }

    /* following loop may be a bit non-obvious, and so deserves some
     * explanation.
     * Before entering the loop, bio->bi_next is NULL (as all callers
     * ensure that) so we have a list with a single bio.
     * We pretend that we have just taken it off a longer list, so
     * we assign bio_list to a pointer to the bio_list_on_stack,
     * thus initialising the bio_list of new bios to be
     * added.  ->make_request() may indeed add some more bios
     * through a recursive call to generic_make_request.  If it
     * did, we find a non-NULL value in bio_list and re-enter the loop
     * from the top.  In this case we really did just take the bio
     * of the top of the list (no pretending) and so remove it from
     * bio_list, and call into ->make_request() again.
     */
    BUG_ON(bio->bi_next);
    // no submission is active in this task: start a fresh on-stack list
    bio_list_init(&bio_list_on_stack);
    current->bio_list = &bio_list_on_stack;
    do {
        // request queue of the block device this bio targets
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        // let the queue's make_request_fn turn the bio into a request (the
        // original annotation notes this is usually blk_queue_bio())
        q->make_request_fn(q, bio);

        bio = bio_list_pop(current->bio_list);
    } while (bio);
    current->bio_list = NULL; /* deactivate */
}

generic_make_request()函数用于将bio转化为request对象并挂入到合适的请求队列上,通常有多个bio可以合并在一个request对象上。通常的磁盘请求队列处理函数make_request_fn()指向内核提供的通用函数blk_queue_bio(),在这个函数中会根据能否合并bio、当前IO请求的数量等因素,决定是生成一个新的request还是把bio添加到一个原有的request对象上,然后决定是把这个新的request对象添加到准备队列current->plug上还是直接调用__blk_run_queue()函数把request对象提交给驱动程序。代码如下:

//linux/block/blk-core.c

void blk_queue_bio(struct request_queue *q, struct bio *bio)
{
    const bool sync = !!(bio->bi_rw & REQ_SYNC);
    struct blk_plug *plug;
    int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
    struct request *req;
    unsigned int request_count = 0;

    /*
     * low level driver can indicate that it wants pages above a
     * certain limit bounced to low memory (ie for highmem, or even
     * ISA dma in theory)
     */
    blk_queue_bounce(q, &bio);

    if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
        bio_endio(bio, -EIO);
        return;
    }

    if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
        spin_lock_irq(q->queue_lock);
        where = ELEVATOR_INSERT_FLUSH;
        goto get_rq;
    }

    /*
     * Check if we can merge with the plugged list before grabbing
     * any locks.
     */
    if (blk_attempt_plug_merge(q, bio, &request_count))
        return;

    spin_lock_irq(q->queue_lock);

    //以下代码尝试能否把请求合并到电梯调度算法的请求队列上
    el_ret = elv_merge(q, &req, bio);
    if (el_ret == ELEVATOR_BACK_MERGE) {
        if (bio_attempt_back_merge(q, req, bio)) {
            elv_bio_merged(q, req, bio);
            if (!attempt_back_merge(q, req))
                elv_merged_request(q, req, el_ret);
            goto out_unlock;
        }
    } else if (el_ret == ELEVATOR_FRONT_MERGE) {
        if (bio_attempt_front_merge(q, req, bio)) {
            elv_bio_merged(q, req, bio);
            if (!attempt_front_merge(q, req))
                elv_merged_request(q, req, el_ret);
            goto out_unlock;
        }
    }

get_rq:    //下面是无法合并的情况
    /*
     * This sync check and mask will be re-done in init_request_from_bio(),
     * but we need to set it earlier to expose the sync flag to the
     * rq allocator and io schedulers.
     */
    rw_flags = bio_data_dir(bio);
    if (sync)
        rw_flags |= REQ_SYNC;

    /*
     * Grab a free request. This is might sleep but can not fail.
     * Returns with the queue unlocked.
     */
    //创建新的request对象
    req = get_request(q, rw_flags, bio, GFP_NOIO);
    if (unlikely(!req)) {
        bio_endio(bio, -ENODEV);    /* @q is dead */
        goto out_unlock;
    }

    /*
     * After dropping the lock and possibly sleeping here, our request
     * may now be mergeable after it had proven unmergeable (above).
     * We don't worry about that case for efficiency. It won't happen
     * often, and the elevators are able to handle it.
     */
    init_request_from_bio(req, bio);

    if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
        req->cpu = raw_smp_processor_id();

    //task_struct->plug成员用于暂存本进程的IO请求。出于性能方面考虑,Linux在磁盘IO上进行了许多优化,比如plug机制和IO调度机制
    plug = current->plug;
    if (plug) {     //如果plug不为空,把新的request对象暂存在这里
        /*
         * If this is the first request added after a plug, fire
         * of a plug trace.
         */
        if (!request_count)
            trace_block_plug(q);
        else {
            if (request_count >= BLK_MAX_REQUEST_COUNT) {
                //如果队列上请求数目达到一定数量,则调用blk_flush_plug_list函数将队列中的IO请求移入块设备的request_queue
                blk_flush_plug_list(plug, false);
                trace_block_plug(q);
            }
        }
        list_add_tail(&req->queuelist, &plug->list);
        blk_account_io_start(req, true);
    } else {     //如果plug为空,则直接执行
        spin_lock_irq(q->queue_lock);
        //插入块设备的request_queue队列
        add_acct_request(q, req, where);
        //通过块设备的request_fn()处理设备请求
        __blk_run_queue(q);
out_unlock:
        spin_unlock_irq(q->queue_lock);
    }
}123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115复制代码

其中blk_flush_plug_list()用于将plug请求队列上的请求转移到调度队列和磁盘设备的请求队列上,以准备向驱动程序传递。其调用序列为blk_flush_plug_list()->queue_unplugged()->__blk_run_queue(),可以看到最后也是调用了__blk_run_queue()函数来处理请求对象。

__blk_run_queue()函数用来将请求队列上的请求提交到驱动程序,提交的方式有两种,一种是在用blk_queue_bio生成请求后直接调用__blk_run_queue()函数,另一种是经过kblockd_workqueue工作队列间接调用__blk_run_queue()函数。__blk_run_queue()函数的代码如下:

//linux/block/blk-core.c

/*
 * Run the request queue unless it is stopped; the actual dispatch is done
 * by __blk_run_queue_uncond().
 */
void __blk_run_queue(struct request_queue *q)
{
    if (unlikely(blk_queue_stopped(q)))
        return;

    __blk_run_queue_uncond(q);
}

/*
 * Invoke the queue's request_fn regardless of the stopped state (a dead
 * queue is still skipped), counting the invocation in request_fn_active
 * for the duration of the call.
 */
inline void __blk_run_queue_uncond(struct request_queue *q)
{
    if (unlikely(blk_queue_dead(q)))
        return;

    /*
     * Some request_fn implementations, e.g. scsi_request_fn(), unlock
     * the queue lock internally. As a result multiple threads may be
     * running such a request function concurrently. Keep track of the
     * number of active request_fn invocations such that blk_drain_queue()
     * can wait until all these request_fn calls have finished.
     */
    q->request_fn_active++;
    // call the queue's request_fn, which is supplied by the block driver;
    // for the legacy hd driver walked through in this article that is
    // do_hd_request() -- NOTE(review): other drivers install their own
    q->request_fn(q);
    q->request_fn_active--;
}

do_hd_request()函数是IDE硬盘使用的request_fn()函数,它从全局变量hd_queue中获得一个请求request并使能相应的中断,并根据是读还是写请求设置对应的中断处理函数是read_intr()还是write_intr(),然后使用hd_out()函数向硬盘控制器发送指令并返回。到此为止文件的读取过程已经结束,剩下的只是等待磁盘中断返回并且调用中断处理函数即可。do_hd_request()的代码如下:

//linux/drivers/block/hd.c
/*
 * request_fn of the hd driver: ignores the queue argument and lets
 * hd_request() start the next request.
 */
static void do_hd_request(struct request_queue *q)
{
    hd_request();
}

/*
 * Start the next request on the hd controller.
 *
 * Does nothing while a handler is still pending (do_hd set). Otherwise it
 * takes the current request (fetching one from hd_queue if needed), checks
 * it against the disk capacity, converts the block number to
 * cylinder/head/sector and issues the read or write command through
 * hd_out(), passing the interrupt handler that will finish the transfer.
 *
 * Changes from the upstream 3.13 text: the DEBUG printk uses rq_data_dir(),
 * the same accessor as the switch below (upstream spells it req_data_dir),
 * and the unsigned int values are printed with %u instead of %d.
 */
static void hd_request(void)
{
    unsigned int block, nsect, sec, track, head, cyl;
    struct hd_i_struct *disk;
    struct request *req;

    // a command is already in flight and waiting for its interrupt
    if (do_hd)
        return;
repeat:               // stop the timer
    del_timer(&device_timer);

    if (!hd_req) {
        hd_req = blk_fetch_request(hd_queue);    // take one request from hd_queue
        if (!hd_req) {
            do_hd = NULL;
            return;
        }
    }
    req = hd_req;

    if (reset) {
        reset_hd();
        return;
    }
    disk = req->rq_disk->private_data;
    block = blk_rq_pos(req);
    nsect = blk_rq_sectors(req);
    // reject requests that start or end beyond the disk capacity
    if (block >= get_capacity(req->rq_disk) ||
        ((block+nsect) > get_capacity(req->rq_disk))) {
        printk("%s: bad access: block=%u, count=%u\n",
            req->rq_disk->disk_name, block, nsect);
        hd_end_request_cur(-EIO);
        goto repeat;
    }

    if (disk->special_op) {
        if (do_special_op(disk, req))
            goto repeat;
        return;
    }
    // linear block number -> sector (1-based) / head / cylinder
    sec   = block % disk->sect + 1;
    track = block / disk->sect;
    head  = track % disk->head;
    cyl   = track / disk->head;
#ifdef DEBUG
    printk("%s: %sing: CHS=%u/%u/%u, sectors=%u, buffer=%p\n",
        req->rq_disk->disk_name,
        rq_data_dir(req) == READ ? "read" : "writ",
        cyl, head, sec, nsect, req->buffer);
#endif
    if (req->cmd_type == REQ_TYPE_FS) {
        switch (rq_data_dir(req)) {
        case READ:
            // hd_out() is given &read_intr as the interrupt handler; per the
            // original annotation its SET_HANDLER macro stores it in the
            // global do_hd so the right handler runs when the IRQ arrives
            hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
                &read_intr);
            if (reset)
                goto repeat;
            break;
        case WRITE:
            hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE,
                &write_intr);
            if (reset)
                goto repeat;
            if (wait_DRQ()) {
                bad_rw_intr();
                goto repeat;
            }
            // push the first 256 16-bit words (one 512-byte sector) to the data port
            outsw(HD_DATA, req->buffer, 256);
            break;
        default:
            printk("unknown hd-command\n");
            hd_end_request_cur(-EIO);
            break;
        }
    }
}

接下来的硬盘中断处理函数为hd_interrupt(),由于请求可能是读或者写,所以相应的处理函数也是不同的。在执行硬盘操作之前,需要先把全局变量do_hd设置为指向read_intr()或者write_intr(),这样当中断到达时就能正确地处理中断。不过这也意味着硬盘的请求是串行处理的,即一次只能处理一个请求。hd_interrupt()函数代码如下:

//linux/drivers/block/hd.c
/*
 * hd interrupt handler: take the pending handler out of do_hd, stop the
 * device timer and run the handler under the queue lock. Falls back to
 * unexpected_hd_interrupt when no handler was pending.
 * Always returns IRQ_HANDLED.
 */
static irqreturn_t hd_interrupt(int irq, void *dev_id)
{
    void (*handler)(void) = do_hd;     // do_hd is NULL, read_intr or write_intr

    spin_lock(hd_queue->queue_lock);

    // clear the pending handler before running it
    do_hd = NULL;
    del_timer(&device_timer);       // stop the timer
    if (!handler)
        handler = unexpected_hd_interrupt;
    handler();     // run the handler for this interrupt

    spin_unlock(hd_queue->queue_lock);

    return IRQ_HANDLED;   // report the interrupt as handled
}

由于读取请求设置的do_hd等于read_intr,来看一下read_intr()函数的代码:

//linux/drivers/block/hd.c
static void read_intr(void)
{
    struct request *req;
    int i, retries = 100000;

    do {
        i = (unsigned) inb_p(HD_STATUS);    //读取硬盘状态
        if (i & BUSY_STAT)
            continue;
        if (!OK_STATUS(i))
            break;
        if (i & DRQ_STAT)
            goto ok_to_read;                //如果没有出错直接转到此处
    } while (--retries > 0);                //反复尝试retries次
    dump_status("read_intr", i);            //输出出错信息
    bad_rw_intr();
    hd_request();                           //调用hd_request()处理下一个请求
    return;

ok_to_read:
    req = hd_req;
    insw(HD_DATA, req->buffer, 256);        //读取256个u16(即512字节)
#ifdef DEBUG
    printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
           req->rq_disk->disk_name, blk_rq_pos(req) + 1,
           blk_rq_sectors(req) - 1, req->buffer+512);
#endif
    if (hd_end_request(0, 512)) {           //hd_end_request()函数完成了许多操作,包括调用bio的回调函数mpage_end_io(在mpage_bio_submit函数中设置的)
        SET_HANDLER(&read_intr);
        return;
    }

    (void) inb_p(HD_STATUS);
#if (HD_DELAY > 0)
    last_req = read_timer();
#endif
    hd_request();
}123456789101112131415161718192021222324252627282930313233343536373839复制代码

值得注意的是hd_end_request()函数,在这个函数中执行了原始bio的回调函数mpage_end_io()。在前面讲到mpage_end_io()这个函数中说过,这个函数的作用是遍历bio结构的每个向量,看相关页面是否获得最新数据,如果是,则页面标志为最新并解锁该页面。通过解锁操作可以唤醒阻塞在这个页面上的进程。回顾一下读取页面的do_generic_file_read()函数,此时进程被唤醒后会检查页面状态是否是最新,如果是就继续向下执行并最终复制到用户缓冲区。

到此,从read系统调用到磁盘驱动再从磁盘的中断处理函数到内核代码并最终返回给用户的文件读取的分析过程已经全部完成啦!由于本文着重分析的是调用流程,所以诸如plug机制、磁盘盘块号计算、IO调度算法之类的不在介绍范围,敬请谅解。



你可能感兴趣的:(Linux内核读取文件流程源码及阻塞点超详解)