EXT3日志文件系统之address_space_operation

EXT3日志文件系统之address_space_operation

一.前言

Ext3文件系统具有日志功能,支持三种日志模式:journal、ordered、writeback。EXT3文件系统的磁盘结构与EXT2文件系统是一样的;EXT3的日志既可以是一个文件(此时日志inode的节点号是8),也可以保存在某个单独的分区上。

二.与具体日志模式结合的写文件函数

static const structaddress_space_operations ext3_ordered_aops = {

         .writepage                 = ext3_ordered_writepage,

         .write_end                 = ext3_ordered_write_end,

};

 

static const structaddress_space_operations ext3_writeback_aops = {

         .writepage                 = ext3_writeback_writepage,

         .write_end                 = ext3_writeback_write_end,

};

 

static const struct address_space_operationsext3_journalled_aops = {

         .writepage                 = ext3_journalled_writepage,

         .write_end                 = ext3_journalled_write_end,

         .set_page_dirty                 =ext3_journalled_set_page_dirty,

};

 

1.Ordered模式将元数据提交到日志当中,具体文件数据会在元数据提交前保存到块设备当中;

2.Writeback模式只提交元数据到日志当中;

3.Journalled模式会将元数据和具体文件数据都提交到日志中,因此速度最慢;提交时元数据和文件数据都以metadata的形式提交;

 

2.1 ext3_ordered_writepage

static int ext3_ordered_writepage(structpage *page,

                                     structwriteback_control *wbc)

{

         structinode *inode = page->mapping->host;

         structbuffer_head *page_bufs;

         handle_t*handle = NULL;

         intret = 0;

         interr;

 

         J_ASSERT(PageLocked(page));

         WARN_ON_ONCE(IS_RDONLY(inode)&&

                        !(EXT3_SB(inode->i_sb)->s_mount_state& EXT3_ERROR_FS));

 

         if(ext3_journal_current_handle())  //取当前进程中的current->journal_info

                   gotoout_fail;

 

         trace_ext3_ordered_writepage(page);

         if(!page_has_buffers(page)) {   //如果当前页没有分配buffer_head,则需要分配并且映射,会更改元数据。

                   create_empty_buffers(page,inode->i_sb->s_blocksize,

                                     (1<< BH_Dirty)|(1 << BH_Uptodate));

                   page_bufs= page_buffers(page);

         }else {//在这个分支里,数据页分配了,头缓冲也分配了,这种情况下,块设备元数据不会改变,只要提交数据就可以了,不需要写到日志文件里面去;

                   page_bufs= page_buffers(page);

                   if(!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,

                                            NULL, buffer_unmapped)) {

                           

                            returnblock_write_full_page(page, NULL, wbc);//直接提交

                   }

         }

         handle= ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));

 

         if(IS_ERR(handle)) {

                   ret= PTR_ERR(handle);

                   gotoout_fail;

         }

 

         walk_page_buffers(handle,page_bufs, 0,

                            PAGE_CACHE_SIZE,NULL, bget_one);//处理数据块之前引用加1

 

         ret= block_write_full_page(page, ext3_get_block, wbc);

 

        

         if(ret == 0) {

                   err= walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,

                                               NULL,journal_dirty_data_fn);//此处将具体文件数据提交到块设备

                   if(!ret)

                            ret= err;

         }

         walk_page_buffers(handle,page_bufs, 0,

                            PAGE_CACHE_SIZE,NULL, bput_one);//处理数据块之后,引用减1

         err= ext3_journal_stop(handle);

         if(!ret)

                   ret= err;

         returnret;

 

out_fail:

         redirty_page_for_writepage(wbc,page);

         unlock_page(page);

         returnret;

}

 

2.2 ext3_writeback_writepage

static int ext3_writeback_writepage(structpage *page,

                                     structwriteback_control *wbc)

{

         structinode *inode = page->mapping->host;

         handle_t*handle = NULL;

         intret = 0;

         interr;

 

         J_ASSERT(PageLocked(page));

         WARN_ON_ONCE(IS_RDONLY(inode)&&

                       !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));

 

         if(ext3_journal_current_handle())

                   gotoout_fail;

 

         trace_ext3_writeback_writepage(page);

         if(page_has_buffers(page)) {//page有对应的buffer_head,元数据不变;

                   if(!walk_page_buffers(NULL, page_buffers(page), 0,

                                           PAGE_CACHE_SIZE, NULL, buffer_unmapped)){

                            returnblock_write_full_page(page, NULL, wbc);

                   }

         }

 

         handle= ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));

         if(IS_ERR(handle)) {

                   ret= PTR_ERR(handle);

                   gotoout_fail;

         }

 

         ret= block_write_full_page(page, ext3_get_block, wbc);//写文件

 

         err= ext3_journal_stop(handle);

         if(!ret)

                   ret= err;

         returnret;

 

out_fail:

         redirty_page_for_writepage(wbc,page);

         unlock_page(page);

         returnret;

}

 

2.3 ext3_journalled_writepage

static int ext3_journalled_writepage(structpage *page,

                                     structwriteback_control *wbc)

{

         structinode *inode = page->mapping->host;

         handle_t*handle = NULL;

         intret = 0;

         interr;

 

         J_ASSERT(PageLocked(page));

         WARN_ON_ONCE(IS_RDONLY(inode)&&

                       !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));

 

         if(ext3_journal_current_handle())

                   gotono_write;

 

         trace_ext3_journalled_writepage(page);

         handle= ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));

         if(IS_ERR(handle)) {

                   ret= PTR_ERR(handle);

                   gotono_write;

         }

 

         if(!page_has_buffers(page) || PageChecked(page)) {//该也缺少头缓冲

                  

                   ClearPageChecked(page);

                   ret= __block_write_begin(page, 0, PAGE_CACHE_SIZE,

                                                 ext3_get_block);//分配页面,作为头缓冲

                   if(ret != 0) {

                            ext3_journal_stop(handle);

                            gotoout_unlock;

                   }

                   ret= walk_page_buffers(handle, page_buffers(page), 0,

                            PAGE_CACHE_SIZE,NULL, do_journal_get_write_access);//遍历页面,取得各个页面的写的权限

 

                   err= walk_page_buffers(handle, page_buffers(page), 0,

                                     PAGE_CACHE_SIZE,NULL, write_end_fn);//将元数据和文件具体数据都挂到metadata链表上

                   if(ret == 0)

                            ret= err;

                   ext3_set_inode_state(inode,EXT3_STATE_JDATA);

                   atomic_set(&EXT3_I(inode)->i_datasync_tid,

                               handle->h_transaction->t_tid);

                   unlock_page(page);

         }else {

        

                   ret= block_write_full_page(page, ext3_get_block, wbc);//已有页面的元数据不需要分配,则进入这个分支。再需要分配则调用ext3_get_block;

         }

         err= ext3_journal_stop(handle);

         if(!ret)

                   ret= err;

out:

         returnret;

 

no_write:

         redirty_page_for_writepage(wbc,page);

out_unlock:

         unlock_page(page);

         gotoout;

}

2.4 block_write_full_page/ block_write_full_page_endio

/*
 * block_write_full_page - write a whole page using the default completion
 * handler.
 * @page:      page to write
 * @get_block: callback used to map buffers to disk blocks
 * @wbc:       writeback control
 *
 * Thin wrapper: forwards to block_write_full_page_endio() with
 * end_buffer_async_write as the per-buffer completion handler and returns
 * its result unchanged.
 *
 * Article's note (not verifiable from this listing): buffers written back
 * this way are flagged buffer_async_write, and in that mode the page is
 * submitted before the journal is committed.
 */
int block_write_full_page(struct page *page, get_block_t *get_block,
			  struct writeback_control *wbc)
{
	return block_write_full_page_endio(page, get_block, wbc,
					   end_buffer_async_write);
}

 

/*
 * block_write_full_page_endio - write a whole page, taking i_size into
 * account, with a caller-supplied completion handler.
 * @page:      page to write
 * @get_block: callback used to map buffers to disk blocks
 * @wbc:       writeback control
 * @handler:   per-buffer I/O completion handler
 *
 * Three cases are distinguished by comparing page->index with i_size:
 *   1. the page lies entirely inside i_size: write it as is;
 *   2. the page lies entirely outside i_size: invalidate and unlock it,
 *      return 0;
 *   3. the page straddles i_size: zero the part beyond i_size, then write.
 *
 * Returns the result of __block_write_full_page(), or 0 in case 2.
 */
int block_write_full_page_endio(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc, bh_end_io_t *handler)
{
	struct inode * const inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;

	/* 1. The page is entirely within i_size */
	if (page->index < end_index)
		return __block_write_full_page(inode, page, get_block, wbc,
					       handler);

	/* 2. The page is beyond i_size: invalidate it instead of writing */
	offset = i_size & (PAGE_CACHE_SIZE - 1);
	if (page->index >= end_index + 1 || !offset) {
		do_invalidatepage(page, 0);
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * 3. The page straddles i_size.  It must be zeroed out on each and
	 * every writepage invocation because it may be mmapped.  "A file is
	 * mapped in multiples of the page size.  For a file that is not a
	 * multiple of the page size, the remaining memory is zeroed when
	 * mapped, and writes to that region are not written out to the file."
	 *
	 * In other words: the portion of the page past i_size is set to 0.
	 */
	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
	return __block_write_full_page(inode, page, get_block, wbc, handler);
}

2.5 __block_write_full_page

static int __block_write_full_page(structinode *inode, struct page *page,

                            get_block_t*get_block, struct writeback_control *wbc,

                            bh_end_io_t*handler)

{

         interr;

         sector_tblock;

         sector_tlast_block;

         structbuffer_head *bh, *head;

         constunsigned blocksize = 1 << inode->i_blkbits;

         intnr_underway = 0;

         intwrite_op = (wbc->sync_mode == WB_SYNC_ALL ?

                            WRITE_SYNC: WRITE);

 

         BUG_ON(!PageLocked(page));

 

         last_block= (i_size_read(inode) - 1) >> inode->i_blkbits;

 

         if(!page_has_buffers(page)) {

                   create_empty_buffers(page,blocksize,

                                               (1<< BH_Dirty)|(1 << BH_Uptodate));

         }

 

         /*

          * Be very careful.  We have no exclusion from__set_page_dirty_buffers

          * here, and the (potentially unmapped) buffersmay become dirty at

          * any time. If a buffer becomes dirty here after we've inspected it

          * then we just miss that fact, and the pagestays dirty.

          *

          * Buffers outside i_size may be dirtied by__set_page_dirty_buffers;

          * handle that here by just cleaning them.

          */

 

         block= (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

         head= page_buffers(page);

         bh= head;

 

         /*

          * Get all the dirty buffers mapped to diskaddresses and

          * handle any aliases from the underlyingblockdev's mapping.

          */

         do{

                   if(block > last_block) {

                            /*

                             * mapped buffers outside i_size will occur,because

                             * this page can be outside i_size when thereis a

                             * truncate in progress.

                             */

                            /*

                             * The buffer was zeroed byblock_write_full_page()

                             */

                            clear_buffer_dirty(bh);

                            set_buffer_uptodate(bh);

                   }else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&

                               buffer_dirty(bh)) {

                            WARN_ON(bh->b_size!= blocksize);

                            err= get_block(inode, block, bh, 1);

                            if(err)

                                     gotorecover;

                            clear_buffer_delay(bh);

                            if(buffer_new(bh)) {

                                     /*blockdev mappings never come here */

                                     clear_buffer_new(bh);

                                     unmap_underlying_metadata(bh->b_bdev,

                                                                 bh->b_blocknr);

                            }

                   }

                   bh= bh->b_this_page;

                   block++;

         }while (bh != head);

 

/*sync_mode有两种模式

*            WB_SYNC_NONE,   /*Don't wait on anything */

*       WB_SYNC_ALL,        /* Wait on every mapping */

*/

 

         do{

                   if(!buffer_mapped(bh))

                            continue;

                   /*

                    * If it's a fully non-blocking write attemptand we cannot

                    * lock the buffer then redirty the page.  Note that this can

                    * potentially cause a busy-wait loop fromwriteback threads

                    * and kswapd activity, but those code pathshave their own

                    * higher-level throttling.

                    */

                   if(wbc->sync_mode != WB_SYNC_NONE) {

                            lock_buffer(bh);   //这种情况下是WB_SYNC_ALL的模式,同步执行

                   }else if (!trylock_buffer(bh)) {//这个分支是WB_SYNC_NONE模式,非同步

                            redirty_page_for_writepage(wbc,page); //在这段代码里,脏页面在radix_tree//面被置位

                            continue;

                   }

                   if(test_clear_buffer_dirty(bh)) {//这种情况下是WB_SYNC_ALL的模式,设置//buffer_head的状态是buffer_async_write, 在下面的代码里面会提交页数据至块设备

                            mark_buffer_async_write_endio(bh,handler);

                   }else {

                            unlock_buffer(bh);//回应上面代码的lock_buffer(bh)

                   }

         }while ((bh = bh->b_this_page) != head);

         /*

          * The page and its buffers are protected byPageWriteback(), so we can

          * drop the bh refcounts early.

          */

         BUG_ON(PageWriteback(page));

         set_page_writeback(page);

 

         do{

                   structbuffer_head *next = bh->b_this_page;

                   if(buffer_async_write(bh)) {

                            submit_bh(write_op,bh);

                            nr_underway++;

                   }

                   bh= next;

         }while (bh != head);

         unlock_page(page);

 

         err= 0;

done:

         if(nr_underway == 0) {

                   /*

                    * The page was marked dirty, but the bufferswere

                    * clean. Someone wrote them back by hand with

                    * ll_rw_block/submit_bh.  A rare case.

                    */

                   end_page_writeback(page);

 

                   /*

                    * The page and buffer_heads can be released atany time from

                    * here on.

                    */

         }

         returnerr;

 

recover:

         /*

          * ENOSPC, or some other error.  We may already have added some

          * blocks to the file, so we need to writethese out to avoid

          * exposing stale data.

          * The page is currently locked and not markedfor writeback

          */

         bh= head;

         /*Recovery: lock and submit the mapped buffers */

         do{

                   if(buffer_mapped(bh) && buffer_dirty(bh) &&

                       !buffer_delay(bh)) {

                            lock_buffer(bh);

                            mark_buffer_async_write_endio(bh,handler);

                   }else {

                            /*

                             * The buffer may have been set dirty during

                             * attachment to a dirty page.

                             */

                            clear_buffer_dirty(bh);

                   }

         }while ((bh = bh->b_this_page) != head);

         SetPageError(page);

         BUG_ON(PageWriteback(page));

         mapping_set_error(page->mapping,err);

         set_page_writeback(page);

         do{

                   structbuffer_head *next = bh->b_this_page;

                   if(buffer_async_write(bh)) {

                            clear_buffer_dirty(bh);

                            submit_bh(write_op,bh);

                            nr_underway++;

                   }

                   bh= next;

         }while (bh != head);

         unlock_page(page);

         gotodone;

}

 

 

你可能感兴趣的:(linux内核之文件系统)