注:本文分析基于3.10.0-693.el7内核版本,即CentOS 7.4
在《BDI writeback脏页回写》中我们了解了BDI的一些基本结构以及初始化和一些触发路径,现在我们要更深入了解下脏页的writeback,一个是脏页什么时候会被处理,另一个是多久会触发脏页的writeback。
脏页由写操作引入,毕竟读操作不会引起数据的不一致性。不管应用层以什么方式将数据变脏,最终都会调用__mark_inode_dirty将inode标记dirty。而调用__mark_inode_dirty主要有以下几种情况,
该情况下,当前page被标记为dirty,但是该page对应的buffer_head并不一定都是dirty状态,因此又可分为自上而下和自下而上两种方式。所谓自上而下,是指如果整个page都是dirty状态,那么这个page对应的所有buffer_head都要置为dirty;而自下而上,是指如果page里的某个buffer_head被标记为dirty,那么这个page也要标记为dirty。
自上而下一般通过set_page_dirty操作,
set_page_dirty-> ##标记整个page为dirty
__set_page_dirty_buffers-> ##把page对应的buffer_head都标记为dirty
__set_page_dirty-> ##标记page为dirty
__mark_inode_dirty ##标记inode为dirty
对于自下而上一般通过mark_buffer_dirty操作,
mark_buffer_dirty-> ##标记单个buffer为dirty
__set_page_dirty-> ##标记page为dirty
__mark_inode_dirty ##标记inode为dirty
这种情况是最常见的,比如我们通过write系统调用写文件时,最后写入page cache后,就形成了脏页,
generic_write_end->
mark_inode_dirty->
__mark_inode_dirty
这种情况一般就是更新文件的各个时间,比如access time、modify time和change time,这需要更新文件的metadata。
update_time->
mark_inode_dirty_sync->
__mark_inode_dirty
__mark_inode_dirty主要做了以下几件事,
/*
 * Handle a request to mark @inode dirty with the state bits in @flags and,
 * if the inode carried no dirty bit before, queue it on the b_dirty list of
 * its backing device.
 *
 * @inode: inode being dirtied
 * @flags: requested dirty state bits; I_DIRTY_SYNC and I_DIRTY_DATASYNC
 *         additionally trigger the filesystem's dirty_inode hook
 *
 * NOTE: this is an abridged excerpt. The "..." lines stand for code that was
 * left out, so the update of inode->i_state and the gotos targeting
 * out_unlock_inode are not visible here.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
struct backing_dev_info *bdi = NULL;
/*
 * Don't do this for I_DIRTY_PAGES - that doesn't actually
 * dirty the inode itself
 */
// The inode itself was modified: notify the filesystem through its optional
// dirty_inode hook (per the original note, this is where a journalling
// filesystem submits the change to its jbd2 transaction).
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
trace_writeback_dirty_inode_start(inode, flags);
if (sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode, flags);
trace_writeback_dirty_inode(inode, flags);
}
...
// block_dump switches on I/O debugging output; per the original note it
// records details such as the pid, inode number, file name and device name.
if (unlikely(block_dump))
block_dump___mark_inode_dirty(inode);
spin_lock(&inode->i_lock);
// Only do work if at least one of the requested bits is not set yet.
if ((inode->i_state & flags) != flags) {
// Remember whether the inode already carried any dirty bit before this call.
const int was_dirty = inode->i_state & I_DIRTY;
...
if (!was_dirty) {
bool wakeup_bdi = false;
bdi = inode_to_bdi(inode);
// Swap locks: i_lock is dropped before the bdi's list_lock is taken.
spin_unlock(&inode->i_lock);
spin_lock(&bdi->wb.list_lock);
if (bdi_cap_writeback_dirty(bdi)) {
WARN(!test_bit(BDI_registered, &bdi->state),
"bdi-%s not registered\n", bdi->name);
// wb_has_dirty_io() reports whether the bdi already has dirty inodes
// queued (per the original note: on b_dirty, b_io or b_more_io).
// If it has none, this is the first dirty inode for the device and
// the writeback worker must be woken up.
if (!wb_has_dirty_io(&bdi->wb))
wakeup_bdi = true;
}
// Timestamp the moment the inode became dirty; used later to decide
// whether the inode is old enough to be written back.
inode->dirtied_when = jiffies;
// Put the inode on the bdi's b_dirty list.
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
spin_unlock(&bdi->wb.list_lock);
if (wakeup_bdi)
// Arm the delayed (periodic) writeback work for this bdi.
bdi_wakeup_thread_delayed(bdi);
// Both locks have already been released on this path.
return;
}
}
// Common exit for the paths that still hold i_lock.
out_unlock_inode:
spin_unlock(&inode->i_lock);
}
/*
 * Schedule the writeback work of @bdi to run after the periodic writeback
 * interval has elapsed.
 *
 * @bdi: backing device whose wb.dwork is queued on bdi_wq
 *
 * Nothing is queued if the bdi is not marked BDI_registered.
 */
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
unsigned long timeout;
// dirty_writeback_interval is in centiseconds, hence the *10 to get
// milliseconds. Per the original note it is exposed as
// /proc/sys/vm/dirty_writeback_centisecs and defaults to 5s.
timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
spin_lock_bh(&bdi->wb_lock);
if (test_bit(BDI_registered, &bdi->state))
// Queue the delayed work so that it fires after `timeout` jiffies.
queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
spin_unlock_bh(&bdi->wb_lock);
}
由上可以看出,对于单个BDI设备,每隔5s会唤醒writeback进程回写脏页。
但是我们也看到此时dirty inode只挂到BDI设备的b_dirty队列,但是实际在回写的时候操作的是b_io队列,因此inode dirty之后并不是马上回写,还需要一段时间才会启动,这个时间就是由dirty_expire_centisecs控制。
所有前提准备好后,writeback进程被唤醒,此时就走到bdi_writeback_workfn上,这是在BDI初始化时注册的回写处理函数。
基本流程为:
/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * reschedules periodically and does kupdated style flushing.
 *
 * @work: the work_struct embedded (via a delayed_work) in a struct
 *        bdi_writeback; the owning bdi_writeback is recovered with
 *        container_of().
 *
 * PF_SWAPWRITE is set on the current task for the duration of the call and
 * cleared again before returning.
 */
void bdi_writeback_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(to_delayed_work(work),
struct bdi_writeback, dwork);
struct backing_dev_info *bdi = wb->bdi;
long pages_written;
set_worker_desc("flush-%s", dev_name(bdi->dev));
current->flags |= PF_SWAPWRITE;
// Normal path: taken when we are NOT running on the workqueue rescuer, or
// when the bdi is NOT (or no longer) registered. Only a rescuer running
// for a still-registered bdi takes the else branch below.
if (likely(!current_is_workqueue_rescuer() ||
!test_bit(BDI_registered, &bdi->state))) {
do {
// Process the work items queued on the bdi and write back dirty
// pages; repeat until the bdi's work_list is empty.
pages_written = wb_do_writeback(wb);
trace_writeback_pages_written(pages_written);
} while (!list_empty(&bdi->work_list));
} else {
// Rescuer path: per the original note, the workqueue could not get
// enough workers, so only a single bounded pass is done, limited to
// 1024 pages.
pages_written = writeback_inodes_wb(&bdi->wb, 1024, WB_REASON_FORKER_THREAD);
trace_writeback_pages_written(pages_written);
}
// New work was queued in the meantime: run this work again immediately
// (delay of 0).
if (!list_empty(&bdi->work_list))
mod_delayed_work(bdi_wq, &wb->dwork, 0);
// Otherwise, if dirty inodes are still pending and periodic writeback is
// enabled (dirty_writeback_interval != 0), re-arm the timer so the work
// runs again after dirty_writeback_interval (5s by default per the
// original note).
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
bdi_wakeup_thread_delayed(bdi);
current->flags &= ~PF_SWAPWRITE;
}
主要是处理已提交的回写work,同时检查并提交周期回写work和background回写work。
基本流程为:
/*
 * Retrieve work items and do the writeback they describe
 *
 * @wb: writeback context of the bdi being flushed
 *
 * After draining the queued work items, the periodic and background checks
 * are also run. Returns the sum of the values returned by wb_writeback(),
 * wb_check_old_data_flush() and wb_check_background_flush() (the caller
 * treats this as a count of pages written).
 */
static long wb_do_writeback(struct bdi_writeback *wb)
{
struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
long wrote = 0;
// Flag the bdi as having writeback in progress.
set_bit(BDI_writeback_running, &wb->bdi->state);
// Drain the bdi's queued work items, writing back dirty pages for each.
while ((work = get_next_work_item(bdi)) != NULL) {
trace_writeback_exec(bdi, work);
// Perform the writeback described by this work item.
wrote += wb_writeback(wb, work);
if (work->done)
// A waiter is attached (synchronous request): signal completion.
// The work item is not freed here in this case.
complete(work->done);
else
// No waiter: the work item is freed here.
kfree(work);
}
/*
 * Check for periodic writeback, kupdated() style
 */
// Runs a periodic writeback pass if the dirty_writeback_centisecs interval
// has elapsed.
wrote += wb_check_old_data_flush(wb);
// Per the original note: runs a background writeback pass if dirty pages
// exceed dirty_background_ratio / dirty_background_bytes (the helper is
// not shown in this excerpt).
wrote += wb_check_background_flush(wb);
clear_bit(BDI_writeback_running, &wb->bdi->state);
return wrote;
}
主要是处理各个路径下提交的回写work,
基本流程为:
/*
 * Execute one writeback work item against @wb.
 *
 * @wb:   writeback context holding the b_dirty / b_io / b_more_io lists
 * @work: description of what to write (page budget, optional superblock,
 *        for_background / for_kupdate flags, ...)
 *
 * Loops with wb->list_lock held until the page budget is used up, a
 * higher-priority work item shows up, the background threshold is no longer
 * exceeded, or nothing is left to write.
 *
 * Returns the initial work->nr_pages minus its value on exit; presumably the
 * callees decrement work->nr_pages as pages are written, making this the
 * number of pages written.
 */
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
unsigned long oldest_jif;
struct inode *inode;
long progress;
oldest_jif = jiffies;
// oldest_jif is the age cut-off handed to the work item; for periodic
// writeback it is moved into the past inside the loop below.
work->older_than_this = &oldest_jif;
spin_lock(&wb->list_lock);
for (;;) {
/*
 * Stop writeback when nr_pages has been consumed
 */
if (work->nr_pages <= 0)
break;
// Explicitly requested work (e.g. a manual sync or writeback triggered
// by memory pressure) takes priority: background and periodic
// writeback are lower priority and could otherwise run for a long
// time, so they yield as soon as other work is queued.
if ((work->for_background || work->for_kupdate) &&
!list_empty(&wb->bdi->work_list))
break;
/*
 * For background writeout, stop when we are below the
 * background dirty threshold
 */
// over_bground_thresh() is not shown in this excerpt; per the original
// note it is computed from dirty_background_ratio and
// dirty_background_bytes.
if (work->for_background && !over_bground_thresh(wb->bdi))
break;
if (work->for_kupdate) {
// Periodic writeback: move the cut-off back by dirty_expire_interval
// (centiseconds, hence *10; 30s per the original note). queue_io()
// then only picks inodes dirtied before that time, i.e. an inode is
// not written back right after becoming dirty but only once it has
// been dirty for dirty_expire_interval.
oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background)
// Background writeback: refresh the cut-off to "now" on each pass.
oldest_jif = jiffies;
trace_writeback_start(wb->bdi, work);
// Refill b_io only when it is empty.
if (list_empty(&wb->b_io))
// Moves the expired dirty inodes onto b_io.
queue_io(wb, work);
// Do the actual writeback (per the original note this ends up calling
// the filesystem's writepages); restricted to one superblock if the
// work item names one.
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb->bdi, work);
wb_update_bandwidth(wb, wb_start);
/*
 * Did we write something? Try for more
 *
 * Dirty inodes are moved to b_io for writeback in batches.
 * The completion of the current batch does not necessarily
 * mean the overall work is done. So we keep looping as long
 * as made some progress on cleaning pages or inodes.
 */
if (progress)
continue;
/*
 * No more inodes for IO, bail
 */
// No progress and b_more_io is empty as well: nothing left to do.
if (list_empty(&wb->b_more_io))
break;
// No progress was made but b_more_io still holds inodes (per the
// original note, typically inodes that could not be handled for the
// moment). Take the inode at the tail of b_more_io and sleep on it
// with list_lock dropped, then retake list_lock and loop again.
if (!list_empty(&wb->b_more_io)) {
trace_writeback_wait(wb->bdi, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
/* This function drops i_lock... */
inode_sleep_on_writeback(inode);
spin_lock(&wb->list_lock);
}
}
spin_unlock(&wb->list_lock);
return nr_pages - work->nr_pages;
}
主要流程:
/*
 * Refill wb->b_io with the inodes that are due for writeback.
 *
 * @wb:   writeback context; its list_lock must be held by the caller
 *        (asserted below)
 * @work: work item supplying the expiry cut-off used by
 *        move_expired_inodes()
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
int moved;
assert_spin_locked(&wb->list_lock);
// Everything parked on b_more_io goes back onto b_io, leaving b_more_io
// empty.
list_splice_init(&wb->b_more_io, &wb->b_io);
// Move the expired dirty inodes from b_dirty to b_io; these are the ones
// that need to be written back now.
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
trace_writeback_queue_io(wb, work, moved);
}
/*
 * Move expired (dirtied before work->older_than_this) dirty inodes from
 * @delaying_queue to @dispatch_queue.
 *
 * @delaying_queue: list to take inodes from (b_dirty when called from
 *                  queue_io())
 * @dispatch_queue: list to move them to (b_io when called from queue_io())
 * @work:           work item; if work->older_than_this is NULL no age
 *                  check is applied
 *
 * If the moved inodes span more than one superblock they are regrouped so
 * that inodes of the same superblock end up next to each other.
 *
 * Returns the number of inodes moved.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
struct wb_writeback_work *work)
{
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
struct inode *inode;
int do_sb_sort = 0;
int moved = 0;
// Walk the delaying queue from its tail, collecting expired inodes on tmp.
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
// This inode was dirtied after the cut-off, i.e. it has not expired
// yet: stop scanning.
if (work->older_than_this &&
inode_dirtied_after(inode, *work->older_than_this))
break;
// Expired: park it on the temporary list first.
list_move(&inode->i_wb_list, &tmp);
moved++;
// Inodes on the block-device superblock do not take part in the
// superblock comparison below.
if (sb_is_blkdev_sb(inode->i_sb))
continue;
// Detect whether the collected inodes belong to more than one
// superblock (i.e. more than one filesystem).
if (sb && sb != inode->i_sb)
// Different superblocks seen: grouping is required.
do_sb_sort = 1;
sb = inode->i_sb;
}
/* just one sb in list, splice to dispatch_queue and we're done */
// No grouping needed when only one superblock was seen.
if (!do_sb_sort) {
// Join the whole tmp list onto the dispatch queue in one go.
list_splice(&tmp, dispatch_queue);
goto out;
}
/* Move inodes from one superblock together */
// Several superblocks: on each outer pass take the superblock of the inode
// at the tail of tmp and move every inode of that superblock to the
// dispatch queue, so inodes of one superblock end up grouped together.
while (!list_empty(&tmp)) {
sb = wb_inode(tmp.prev)->i_sb;
list_for_each_prev_safe(pos, node, &tmp) {
inode = wb_inode(pos);
if (inode->i_sb == sb)
// Same superblock as the current group: move to the dispatch queue.
list_move(&inode->i_wb_list, dispatch_queue);
}
}
out:
return moved;
}
主要流程:
/*
 * Run a periodic (kupdate-style) writeback pass if one is due.
 *
 * @wb: writeback context; wb->last_old_flush records when the last periodic
 *      pass was started
 *
 * Returns 0 if periodic writeback is disabled, not yet due, or there are no
 * dirty pages; otherwise returns the result of wb_writeback().
 */
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
unsigned long expired;
long nr_pages;
// dirty_writeback_centisecs == 0 disables periodic writeback altogether.
if (!dirty_writeback_interval)
return 0;
// Next due time: last periodic flush plus the interval (centiseconds,
// hence *10 to get milliseconds).
expired = wb->last_old_flush +
msecs_to_jiffies(dirty_writeback_interval * 10);
// Not due yet.
if (time_before(jiffies, expired))
return 0;
// Restart the interval from now.
wb->last_old_flush = jiffies;
// Current number of dirty pages; used as the page budget of the pass.
nr_pages = get_nr_dirty_pages();
// Only run a pass if there is something to write. The work item lives on
// the stack and is executed directly rather than being queued.
if (nr_pages) {
struct wb_writeback_work work = {
.nr_pages = nr_pages,
.sync_mode = WB_SYNC_NONE,
.for_kupdate = 1,
.range_cyclic = 1,
.reason = WB_REASON_PERIODIC,
};
return wb_writeback(wb, &work);
}
return 0;
}
dirty_writeback_centisecs
控制周期回写进程的唤醒时间,默认值为500,单位是厘秒,实际内核中是*10使用,即5s,也就是每隔5秒唤醒脏页回写进程,降低这个值可以把尖峰的写操作削平成多次写操作。
dirty_expire_centisecs
控制dirty inode实际回写前的等待时间,默认值是3000,单位同样是厘秒,即30s。只有当inode变脏的时间超过这个值后,周期回写进程才会将其dirty数据回写到磁盘。