Ext3文件系统具有日志的功能,有三种日志模式,journal, ordered, writeback。但是EXT3文件系统的磁盘结构与EXT2文件系统是一样的,EXT3的日志可以是一个文件(此时inode的节点号是8),也可以保存在某个分区上。
static const structaddress_space_operations ext3_ordered_aops = {
.writepage = ext3_ordered_writepage,
.write_end = ext3_ordered_write_end,
};
static const structaddress_space_operations ext3_writeback_aops = {
.writepage = ext3_writeback_writepage,
.write_end = ext3_writeback_write_end,
};
static const struct address_space_operationsext3_journalled_aops = {
.writepage = ext3_journalled_writepage,
.write_end = ext3_journalled_write_end,
.set_page_dirty =ext3_journalled_set_page_dirty,
};
1. Ordered模式:只有元数据提交到日志;具体文件数据会在元数据提交到日志之前先写入块设备,从而保证崩溃恢复后元数据不会指向未落盘的数据;
2. Writeback模式:同样只有元数据提交到日志,但文件数据的落盘时机不受约束,可能早于也可能晚于对应元数据的提交,一致性最弱、开销最小;
3. Journalled模式:元数据和具体文件数据都提交到日志(提交时两者都以metadata块的形式记录),因此速度最慢,但一致性最强。
static int ext3_ordered_writepage(structpage *page,
structwriteback_control *wbc)
{
structinode *inode = page->mapping->host;
structbuffer_head *page_bufs;
handle_t*handle = NULL;
intret = 0;
interr;
J_ASSERT(PageLocked(page));
WARN_ON_ONCE(IS_RDONLY(inode)&&
!(EXT3_SB(inode->i_sb)->s_mount_state& EXT3_ERROR_FS));
if(ext3_journal_current_handle()) //取当前进程中的current->journal_info
gotoout_fail;
trace_ext3_ordered_writepage(page);
if(!page_has_buffers(page)) { //如果当前页没有分配buffer_head,则需要分配并且映射,会更改元数据。
create_empty_buffers(page,inode->i_sb->s_blocksize,
(1<< BH_Dirty)|(1 << BH_Uptodate));
page_bufs= page_buffers(page);
}else {//在这个分支里,数据页分配了,头缓冲也分配了,这种情况下,块设备元数据不会改变,只要提交数据就可以了,不需要写到日志文件里面去;
page_bufs= page_buffers(page);
if(!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
NULL, buffer_unmapped)) {
returnblock_write_full_page(page, NULL, wbc);//直接提交
}
}
handle= ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
if(IS_ERR(handle)) {
ret= PTR_ERR(handle);
gotoout_fail;
}
walk_page_buffers(handle,page_bufs, 0,
PAGE_CACHE_SIZE,NULL, bget_one);//处理数据块之前引用加1
ret= block_write_full_page(page, ext3_get_block, wbc);
if(ret == 0) {
err= walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
NULL,journal_dirty_data_fn);//此处将具体文件数据提交到块设备
if(!ret)
ret= err;
}
walk_page_buffers(handle,page_bufs, 0,
PAGE_CACHE_SIZE,NULL, bput_one);//处理数据块之后,引用减1;
err= ext3_journal_stop(handle);
if(!ret)
ret= err;
returnret;
out_fail:
redirty_page_for_writepage(wbc,page);
unlock_page(page);
returnret;
}
static int ext3_writeback_writepage(structpage *page,
structwriteback_control *wbc)
{
structinode *inode = page->mapping->host;
handle_t*handle = NULL;
intret = 0;
interr;
J_ASSERT(PageLocked(page));
WARN_ON_ONCE(IS_RDONLY(inode)&&
!(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
if(ext3_journal_current_handle())
gotoout_fail;
trace_ext3_writeback_writepage(page);
if(page_has_buffers(page)) {//page有对应的buffer_head,元数据不变;
if(!walk_page_buffers(NULL, page_buffers(page), 0,
PAGE_CACHE_SIZE, NULL, buffer_unmapped)){
returnblock_write_full_page(page, NULL, wbc);
}
}
handle= ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
if(IS_ERR(handle)) {
ret= PTR_ERR(handle);
gotoout_fail;
}
ret= block_write_full_page(page, ext3_get_block, wbc);//写文件
err= ext3_journal_stop(handle);
if(!ret)
ret= err;
returnret;
out_fail:
redirty_page_for_writepage(wbc,page);
unlock_page(page);
returnret;
}
static int ext3_journalled_writepage(structpage *page,
structwriteback_control *wbc)
{
structinode *inode = page->mapping->host;
handle_t*handle = NULL;
intret = 0;
interr;
J_ASSERT(PageLocked(page));
WARN_ON_ONCE(IS_RDONLY(inode)&&
!(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
if(ext3_journal_current_handle())
gotono_write;
trace_ext3_journalled_writepage(page);
handle= ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
if(IS_ERR(handle)) {
ret= PTR_ERR(handle);
gotono_write;
}
if(!page_has_buffers(page) || PageChecked(page)) {//该也缺少头缓冲
ClearPageChecked(page);
ret= __block_write_begin(page, 0, PAGE_CACHE_SIZE,
ext3_get_block);//分配页面,作为头缓冲
if(ret != 0) {
ext3_journal_stop(handle);
gotoout_unlock;
}
ret= walk_page_buffers(handle, page_buffers(page), 0,
PAGE_CACHE_SIZE,NULL, do_journal_get_write_access);//遍历页面,取得各个页面的写的权限
err= walk_page_buffers(handle, page_buffers(page), 0,
PAGE_CACHE_SIZE,NULL, write_end_fn);//将元数据和文件具体数据都挂到metadata链表上
if(ret == 0)
ret= err;
ext3_set_inode_state(inode,EXT3_STATE_JDATA);
atomic_set(&EXT3_I(inode)->i_datasync_tid,
handle->h_transaction->t_tid);
unlock_page(page);
}else {
ret= block_write_full_page(page, ext3_get_block, wbc);//已有页面的元数据不需要分配,则进入这个分支。再需要分配则调用ext3_get_block;
}
err= ext3_journal_stop(handle);
if(!ret)
ret= err;
out:
returnret;
no_write:
redirty_page_for_writepage(wbc,page);
out_unlock:
unlock_page(page);
gotoout;
}
/*
 * Write out a whole page of dirty buffers, using end_buffer_async_write
 * as the per-buffer I/O completion handler.  Buffers submitted this way
 * are marked async_write, so the data I/O is issued independently of
 * (and ahead of) any journal commit.
 */
int block_write_full_page(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc)
{
	return block_write_full_page_endio(page, get_block, wbc,
					   end_buffer_async_write);
}
/*
 * As block_write_full_page(), but with a caller-supplied buffer-I/O
 * completion handler.  Distinguishes the three positions a page can
 * occupy relative to i_size: fully inside, fully beyond, or straddling.
 */
int block_write_full_page_endio(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc, bh_end_io_t *handler)
{
	struct inode * const inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;

	/* 1. The page is fully inside i_size: write it as-is. */
	if (page->index < end_index)
		return __block_write_full_page(inode, page, get_block, wbc,
					       handler);

	/* 2. The page is fully outside i_size (e.g. a truncate is in
	 *    progress): invalidate it and report success. */
	offset = i_size & (PAGE_CACHE_SIZE-1);
	if (page->index >= end_index+1 || !offset) {
		do_invalidatepage(page, 0);
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * 3. The page straddles i_size.  It must be zeroed out on each and
	 * every writepage invocation because it may be mmapped.  "A file is
	 * mapped in multiples of the page size.  For a file that is not a
	 * multiple of the page size, the remaining memory is zeroed when
	 * mapped, and writes to that region are not written out to the
	 * file."  So the tail beyond i_size is cleared before writing.
	 */
	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
	return __block_write_full_page(inode, page, get_block, wbc, handler);
}
/*
 * Core page writeout: map any unmapped dirty buffers, mark the mapped
 * dirty buffers async_write, then submit them.  NOTE: the page is
 * unlocked while the I/O is in flight; PageWriteback protects it.
 *
 * Returns 0 on success; on get_block failure, jumps to `recover` which
 * still submits the already-mapped buffers to avoid exposing stale data.
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
			get_block_t *get_block, struct writeback_control *wbc,
			bh_end_io_t *handler)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	const unsigned blocksize = 1 << inode->i_blkbits;
	int nr_underway = 0;
	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
			WRITE_SYNC : WRITE);

	BUG_ON(!PageLocked(page));

	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;

	if (!page_has_buffers(page)) {
		create_empty_buffers(page, blocksize,
					(1 << BH_Dirty)|(1 << BH_Uptodate));
	}

	/*
	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the page stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
	 * handle that here by just cleaning them.
	 */

	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	head = page_buffers(page);
	bh = head;

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	do {
		if (block > last_block) {
			/*
			 * mapped buffers outside i_size will occur, because
			 * this page can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_page()
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
			   buffer_dirty(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			clear_buffer_delay(bh);
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/*
	 * wbc->sync_mode has two settings:
	 *   WB_SYNC_NONE - don't wait on anything
	 *   WB_SYNC_ALL  - wait on every mapping
	 */
	do {
		if (!buffer_mapped(bh))
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the page.  Note that this can
		 * potentially cause a busy-wait loop from writeback threads
		 * and kswapd activity, but those code paths have their own
		 * higher-level throttling.
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			/* WB_SYNC_ALL: block until the buffer lock is ours. */
			lock_buffer(bh);
		} else if (!trylock_buffer(bh)) {
			/* WB_SYNC_NONE: don't wait — leave the page dirty in
			 * the radix tree and move on. */
			redirty_page_for_writepage(wbc, page);
			continue;
		}
		if (test_clear_buffer_dirty(bh)) {
			/* Dirty and locked: mark async_write so the submit
			 * loop below pushes it to the block device. */
			mark_buffer_async_write_endio(bh, handler);
		} else {
			/* Clean after all: undo the lock_buffer() above. */
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The page and its buffers are protected by PageWriteback(), so we can
	 * drop the bh refcounts early.
	 */
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);

	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(write_op, bh);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);

	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The page was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * ll_rw_block/submit_bh.  A rare case.
		 */
		end_page_writeback(page);

		/*
		 * The page and buffer_heads can be released at any time from
		 * here on.
		 */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The page is currently locked and not marked for writeback.
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write_endio(bh, handler);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty page.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	mapping_set_error(page->mapping, err);
	set_page_writeback(page);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(write_op, bh);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);
	goto done;
}