linux文件系统写过程简析

Linux中文件写入磁盘的过程依次经历:VFS -> 页缓存(page cache) -> 具体的文件系统(ext2/3/4、XFS、ReiserFS等) -> Block IO -> 设备驱动 -> SCSI指令(或者其他指令)。总体来说,Linux文件写入磁盘的过程比较复杂。

1、VFS(虚拟文件系统)

      Linux中采用了VFS的方式屏蔽了多个文件系统的差别。当需要访问不同的设备或者其他文件系统时,采用挂载(mount)的方式进行访问(这里可以把文件系统理解为具体的设备)。正是因为使用了VFS,所有的文件系统和设备都通过统一的文件目录树视图访问,整个存储空间采用一个文件系统目录树来管理,屏蔽了底层多个文件系统之间的差别。当然,如果你需要把自己编写的文件系统集成到Linux内核并通过VFS进行访问,可以采用模块加载的方式,相应的文件系统模块文件需要放到系统目录/lib/modules/your-system-name/kernel/fs当中(其中your-system-name为当前内核版本,即uname -r的输出)。当然VFS的作用远不止这些,通过VFS也可以访问设备:在Linux下所有的对象都是文件,这简化了系统的访问。

     1.1 正常情况下,所有的文件操作都通过系统调用进入到VFS中;只有特殊的处理才会直接操作原始设备。文件系统写入的系统调用为:

  #include <unistd.h>

  ssize_t  write(int fd,  const void * buffer, size_t  count);

    1.2 当通过系统调用进入VFS后,接下来的处理交给VFS层。处理过程中比较重要的函数是vfs_write和generic_file_aio_write:

  

 1 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)

 2 {

 3     ssize_t ret;

 4 

 5     if (!(file->f_mode & FMODE_WRITE))

 6         return -EBADF;

 7     if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))

 8         return -EINVAL;

 9     if (unlikely(!access_ok(VERIFY_READ, buf, count)))

10         return -EFAULT;

11 

12     ret = rw_verify_area(WRITE, file, pos, count);

13     if (ret >= 0) {

14         count = ret;

15         if (file->f_op->write)

16             ret = file->f_op->write(file, buf, count, pos);

17         else

18             ret = do_sync_write(file, buf, count, pos);

19         if (ret > 0) {

20             fsnotify_modify(file->f_path.dentry);

21             add_wchar(current, ret);

22         }

23         inc_syscw(current);

24     }

25 

26     return ret;

27 }

 

 1 /**

 2  * generic_file_aio_write - write data to a file

 3  * @iocb:    IO state structure

 4  * @iov:    vector with data to write

 5  * @nr_segs:    number of segments in the vector

 6  * @pos:    position in file where to write

 7  *

 8  * This is a wrapper around __generic_file_aio_write() to be used by most

 9  * filesystems. It takes care of syncing the file in case of O_SYNC file

10  * and acquires i_mutex as needed.

11  */

12 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,

13         unsigned long nr_segs, loff_t pos)

14 {

15     struct file *file = iocb->ki_filp;

16     struct inode *inode = file->f_mapping->host;

17     ssize_t ret;

18 

19     BUG_ON(iocb->ki_pos != pos);

20 

21     mutex_lock(&inode->i_mutex);

22     ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);

23     mutex_unlock(&inode->i_mutex);

24 

25     if (ret > 0 || ret == -EIOCBQUEUED) {

26         ssize_t err;

27 

28         err = generic_write_sync(file, pos, ret);

29         if (err < 0 && ret > 0)

30             ret = err;

31     }

32     return ret;

33 }

  2、 VFS层的写入分为采用page cache和不采用page cache两种方式,下面重点介绍采用page cache的处理。

      在VFS中,每个被打开的文件在内核中都对应一个address_space数据结构,该数据结构用来表示系统中打开的文件(并管理该文件的缓存页),并且一个打开的文件只有一个address_space数据结构。

如下:   

 1 struct address_space {

 2     struct inode        *host;        /* owner: inode, block_device */

 3     struct radix_tree_root    page_tree;    /* radix tree of all pages */

 4     spinlock_t        tree_lock;    /* and lock protecting it */

 5     unsigned int        i_mmap_writable;/* count VM_SHARED mappings */

 6     struct prio_tree_root    i_mmap;        /* tree of private and shared mappings */

 7     struct list_head    i_mmap_nonlinear;/*list VM_NONLINEAR mappings */

 8     spinlock_t        i_mmap_lock;    /* protect tree, count, list */

 9     unsigned int        truncate_count;    /* Cover race condition with truncate */

10     unsigned long        nrpages;    /* number of total pages */

11     pgoff_t            writeback_index;/* writeback starts here */

12     const struct address_space_operations *a_ops;    /* methods */

13     unsigned long        flags;        /* error bits/gfp mask */

14     struct backing_dev_info *backing_dev_info; /* device readahead, etc */

15     spinlock_t        private_lock;    /* for use by the address_space */

16     struct list_head    private_list;    /* ditto */

17     struct address_space    *assoc_mapping;    /* ditto */

18     struct mutex        unmap_mutex;    /* to protect unmapping */

19 } __attribute__((aligned(sizeof(long))));

    文件内容的缓存页采用基数树(radix tree)的方式保存,存放在成员变量page_tree中,关于基数树的介绍参考[1]和[2]。下面是page cache写处理过程中几个重要的函数:

 1 ssize_t

 2 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,

 3         unsigned long nr_segs, loff_t pos, loff_t *ppos,

 4         size_t count, ssize_t written)

 5 {

 6     struct file *file = iocb->ki_filp;

 7     struct address_space *mapping = file->f_mapping;

 8     ssize_t status;

 9     struct iov_iter i;

10 

11     iov_iter_init(&i, iov, nr_segs, count, written);

12     status = generic_perform_write(file, &i, pos);

13 

14     if (likely(status >= 0)) {

15         written += status;

16         *ppos = pos + status;

17       }

18     

19     /*

20      * If we get here for O_DIRECT writes then we must have fallen through

21      * to buffered writes (block instantiation inside i_size).  So we sync

22      * the file data here, to try to honour O_DIRECT expectations.

23      */

24     if (unlikely(file->f_flags & O_DIRECT) && written)

25         status = filemap_write_and_wait_range(mapping,

26                     pos, pos + written - 1);

27 

28     return written ? written : status;

29 }

    其中generic_perform_write会调用page cache(address_space_operations)中的write_begin和write_end。

    Note: 通过VFS系统调用写入文件时,允许在文件中的任何位置写入,这其中就包括写入的起始位置不是一个block的开始位置的情况,这时需要特殊的处理(部分块写),这些处理都在write_begin这个函数的调用过程中完成。

3、ext2/3/4中文件的处理。

   当page cache的写处理进行到write_begin时,对于ext4由ext4_write_begin来处理,如下:

 1 static int ext4_write_begin(struct file *file, struct address_space *mapping,

 2                 loff_t pos, unsigned len, unsigned flags,

 3                 struct page **pagep, void **fsdata)

 4 {

 5     struct inode *inode = mapping->host;

 6     int ret, needed_blocks;

 7     handle_t *handle;

 8     int retries = 0;

 9     struct page *page;

10     pgoff_t index;

11     unsigned from, to;

12         .........

13 

14     index = pos >> PAGE_CACHE_SHIFT;

15     from = pos & (PAGE_CACHE_SIZE - 1);

16     to = from + len;

17 

18 retry:

19     handle = ext4_journal_start(inode, needed_blocks);

20     if (IS_ERR(handle)) {

21         ret = PTR_ERR(handle);

22         goto out;

23     }

24 

25     /* We cannot recurse into the filesystem as the transaction is already

26      * started */

27     flags |= AOP_FLAG_NOFS;

28 

29     page = grab_cache_page_write_begin(mapping, index, flags);

30     if (!page) {

31         ext4_journal_stop(handle);

32         ret = -ENOMEM;

33         goto out;

34     }

35     *pagep = page;

36 

37     ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,

38                 ext4_get_block);

39 

40     if (!ret && ext4_should_journal_data(inode)) {

41         ret = walk_page_buffers(handle, page_buffers(page),

42                 from, to, NULL, do_journal_get_write_access);

43     }

44 

45     if (ret) {

46         unlock_page(page);

47         page_cache_release(page);

48         /*

49          * block_write_begin may have instantiated a few blocks

50          * outside i_size.  Trim these off again. Don't need

51          * i_size_read because we hold i_mutex.

52          *

53          * Add inode to orphan list in case we crash before

54          * truncate finishes

55          */

56         if (pos + len > inode->i_size && ext4_can_truncate(inode))

57             ext4_orphan_add(handle, inode);

58 

59         ext4_journal_stop(handle);

60         if (pos + len > inode->i_size) {

61             ext4_truncate_failed_write(inode);

62             /*

63              * If truncate failed early the inode might

64              * still be on the orphan list; we need to

65              * make sure the inode is removed from the

66              * orphan list in that case.

67              */

68             if (inode->i_nlink)

69                 ext4_orphan_del(NULL, inode);

70         }

71     }

72 

73     if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))

74         goto retry;

75 out:

76     return ret;

77 }

       ext4_write_begin中包含了很多的处理功能,包括文件物理块的分配(假设ext4中的delayed allocation特性没有开启)、文件块的部分写过程的处理等。下面是在ext4_write_begin函数调用过程中比较重要的几个函数。

 1 /*

 2  * block_write_begin takes care of the basic task of block allocation and

 3  * bringing partial write blocks uptodate first.

 4  *

 5  * If *pagep is not NULL, then block_write_begin uses the locked page

 6  * at *pagep rather than allocating its own. In this case, the page will

 7  * not be unlocked or deallocated on failure.

 8  */

 9 int block_write_begin(struct file *file, struct address_space *mapping,

10             loff_t pos, unsigned len, unsigned flags,

11             struct page **pagep, void **fsdata,

12             get_block_t *get_block)

13 {

14     struct inode *inode = mapping->host;

15     int status = 0;

16     struct page *page;

17     pgoff_t index;

18     unsigned start, end;

19     int ownpage = 0;

20 

21     index = pos >> PAGE_CACHE_SHIFT;

22     start = pos & (PAGE_CACHE_SIZE - 1);

23     end = start + len;

24 

25     page = *pagep;

26     if (page == NULL) {

27         ownpage = 1;

28         page = grab_cache_page_write_begin(mapping, index, flags);

29         if (!page) {

30             status = -ENOMEM;

31             goto out;

32         }

33         *pagep = page;

34     } else

35         BUG_ON(!PageLocked(page));

36 

37     status = __block_prepare_write(inode, page, start, end, get_block);

38     if (unlikely(status)) {

39         ClearPageUptodate(page);

40 

41         if (ownpage) {

42             unlock_page(page);

43             page_cache_release(page);

44             *pagep = NULL;

45 

46             /*

47              * prepare_write() may have instantiated a few blocks

48              * outside i_size.  Trim these off again. Don't need

49              * i_size_read because we hold i_mutex.

50              */

51             if (pos + len > inode->i_size)

52                 vmtruncate(inode, inode->i_size);

53         }

54     }

55 

56 out:

57     return status;

58 }

       

 1 static int __block_prepare_write(struct inode *inode, struct page *page,

 2         unsigned from, unsigned to, get_block_t *get_block)

 3 {

 4     unsigned block_start, block_end;

 5     sector_t block;

 6     int err = 0;

 7     unsigned blocksize, bbits;

 8     struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

 9 

10     BUG_ON(!PageLocked(page));

11     BUG_ON(from > PAGE_CACHE_SIZE);

12     BUG_ON(to > PAGE_CACHE_SIZE);

13     BUG_ON(from > to);

14 

15     blocksize = 1 << inode->i_blkbits;

16     if (!page_has_buffers(page))

17         create_empty_buffers(page, blocksize, 0);

18     head = page_buffers(page);

19 

20     bbits = inode->i_blkbits;

21     block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

22 

23     for(bh = head, block_start = 0; bh != head || !block_start;

24         block++, block_start=block_end, bh = bh->b_this_page) {

25         block_end = block_start + blocksize;

26         if (block_end <= from || block_start >= to) {

27             if (PageUptodate(page)) {

28                 if (!buffer_uptodate(bh))

29                     set_buffer_uptodate(bh);

30             }

31             continue;

32         }

33         if (buffer_new(bh))

34             clear_buffer_new(bh);

35         if (!buffer_mapped(bh)) {

36             WARN_ON(bh->b_size != blocksize);

37             err = get_block(inode, block, bh, 1);

38             if (err)

39                 break;

40             if (buffer_new(bh)) {

41                 unmap_underlying_metadata(bh->b_bdev,

42                             bh->b_blocknr);

43                 if (PageUptodate(page)) {

44                     clear_buffer_new(bh);

45                     set_buffer_uptodate(bh);

46                     mark_buffer_dirty(bh);

47                     continue;

48                 }

49                 if (block_end > to || block_start < from)

50                     zero_user_segments(page,

51                         to, block_end,

52                         block_start, from);

53                 continue;

54             }

55         }

56         if (PageUptodate(page)) {

57             if (!buffer_uptodate(bh))

58                 set_buffer_uptodate(bh);

59             continue; 

60         }

61         if (!buffer_uptodate(bh) && !buffer_delay(bh) &&

62             !buffer_unwritten(bh) &&

63              (block_start < from || block_end > to)) {

64             ll_rw_block(READ, 1, &bh);

65             *wait_bh++=bh;

66         }

67     }

68     /*

69      * If we issued read requests - let them complete.

70      */

71     while(wait_bh > wait) {

72         wait_on_buffer(*--wait_bh);

73         if (!buffer_uptodate(*wait_bh))

74             err = -EIO;

75     }

76     if (unlikely(err))

77         page_zero_new_buffers(page, from, to);

78     return err;

79 }

     

 1 /**

 2  * ll_rw_block: low-level access to block devices (DEPRECATED)

 3  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)

 4  * @nr: number of &struct buffer_heads in the array

 5  * @bhs: array of pointers to &struct buffer_head

 6  *

 7  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and

 8  * requests an I/O operation on them, either a %READ or a %WRITE.  The third

 9  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers

10  * are sent to disk. The fourth %READA option is described in the documentation

11  * for generic_make_request() which ll_rw_block() calls.

12  *

13  * This function drops any buffer that it cannot get a lock on (with the

14  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be

15  * clean when doing a write request, and any buffer that appears to be

16  * up-to-date when doing read request.  Further it marks as clean buffers that

17  * are processed for writing (the buffer cache won't assume that they are

18  * actually clean until the buffer gets unlocked).

19  *

20  * ll_rw_block sets b_end_io to simple completion handler that marks

21  * the buffer up-to-date (if approriate), unlocks the buffer and wakes

22  * any waiters. 

23  *

24  * All of the buffers must be for the same device, and must also be a

25  * multiple of the current approved size for the device.

26  */

27 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])

28 {

29     int i;

30 

31     for (i = 0; i < nr; i++) {

32         struct buffer_head *bh = bhs[i];

33 

34         if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)

35             lock_buffer(bh);

36         else if (!trylock_buffer(bh))

37             continue;

38 

39         if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||

40             rw == SWRITE_SYNC_PLUG) {

41             if (test_clear_buffer_dirty(bh)) {

42                 bh->b_end_io = end_buffer_write_sync;

43                 get_bh(bh);

44                 if (rw == SWRITE_SYNC)

45                     submit_bh(WRITE_SYNC, bh);

46                 else

47                     submit_bh(WRITE, bh);

48                 continue;

49             }

50         } else {

51             if (!buffer_uptodate(bh)) {

52                 bh->b_end_io = end_buffer_read_sync;

53                 get_bh(bh);

54                 submit_bh(rw, bh);

55                 continue;

56             }

57         }

58         unlock_buffer(bh);

59     }

60 }

     在ext4的块分配过程中,管理块分配的函数实现在fs/ext4/balloc.c和fs/ext4/mballoc.c中。

   4、当page cache中的数据需要刷新到disk上的时候,这时处理的过程由Block IO接管。

      在将文件的page cache刷新到disk上的过程中,比较重要的数据结构有如下两个:buffer_head和bio。

 1 struct buffer_head {

 2     unsigned long b_state;        /* buffer state bitmap (see above) */

 3     struct buffer_head *b_this_page;/* circular list of page's buffers */

 4     struct page *b_page;        /* the page this bh is mapped to */

 5 

 6     sector_t b_blocknr;        /* start block number */

 7     size_t b_size;            /* size of mapping */

 8     char *b_data;            /* pointer to data within the page */

 9 

10     struct block_device *b_bdev;

11     bh_end_io_t *b_end_io;        /* I/O completion */

12      void *b_private;        /* reserved for b_end_io */

13     struct list_head b_assoc_buffers; /* associated with another mapping */

14     struct address_space *b_assoc_map;    /* mapping this buffer is

15                            associated with */

16     atomic_t b_count;        /* users using this buffer_head */

17 };

   

 1 /*

 2  * main unit of I/O for the block layer and lower layers (ie drivers and

 3  * stacking drivers)

 4  */

 5 struct bio {

 6     sector_t        bi_sector;    /* device address in 512 byte

 7                            sectors */

 8     struct bio        *bi_next;    /* request queue link */

 9     struct block_device    *bi_bdev;

10     unsigned long        bi_flags;    /* status, command, etc */

11     unsigned long        bi_rw;        /* bottom bits READ/WRITE,

12                          * top bits priority

13                          */

14 

15     unsigned short        bi_vcnt;    /* how many bio_vec's */

16     unsigned short        bi_idx;        /* current index into bvl_vec */

17     ...............

18 

19     /*

20      * We can inline a number of vecs at the end of the bio, to avoid

21      * double allocations for a small number of bio_vecs. This member

22      * MUST obviously be kept at the very end of the bio.

23      */

24     struct bio_vec        bi_inline_vecs[0];

25 };

   Block IO层负责基本的IO request的合并和调度处理,这一层由elevator管理,具体的调度算法有noop、deadline和anticipatory等多种,现在默认的调度算法是deadline。当然调度算法是可调的,可以根据系统的实际情况调整为最优的调度算法。

[1] 基数树(radix tree). http://blog.csdn.net/joker0910/article/details/8250085

[2] Radix Tree. http://en.wikipedia.org/wiki/Radix_tree 

你可能感兴趣的:(linux)