注:本文分析基于3.10.0-693.el7内核版本,即CentOS 7.4
BDI(backing device info),备用存储设备,最常见的就是硬盘存储设备。这类设备相对于内存来说,其读写速度非常慢,差别在两个数量级,因此为了提高系统整体性能,Linux系统引入了cache作为缓冲,最近用到的数据临时保存在内存里,减少对慢速BDI设备的操作。但这就需要在一定的时机把这些数据同步回BDI设备,一来内存毕竟小,无法存放太多数据,二来内存里的数据容易丢失,比如机器异常宕机或重启。进行周期性回写工作的机制就是writeback,在3.10内核上由workqueue来进行实际的回写工作。原先由一个pdflush进程统管所有磁盘的脏页回写,在磁盘数量多时很容易出现IO瓶颈,采用workqueue来回写,相当于升级为多线程,提高了IO吞吐量。
内核为了管理好BDI设备,专门为此创建了对应的结构体,
/*
 * Descriptor the kernel keeps for each backing device (excerpt: the ...
 * lines stand for fields omitted from this listing).
 */
struct backing_dev_info {
struct list_head bdi_list;
...
/*
* The base dirty throttle rate, re-calculated on every 200ms.
* All the bdi tasks' dirty rate will be curbed under it.
* @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
* in small steps and is much more smooth/stable than the latter.
*/
unsigned long dirty_ratelimit;
unsigned long balanced_dirty_ratelimit;
struct fprop_local_percpu completions;
int dirty_exceeded;
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
// Writeback object: carries the actual writeback worker and the inodes to process
struct bdi_writeback wb; /* default writeback info for this bdi*/
spinlock_t wb_lock; /* protects work_list & wb.dwork scheduling */
// List of pending wb_writeback_work requests (linked through their list member)
struct list_head work_list;
...
};
其中,work_list任务链表存放的就是所有需要writeback的work,所有需要回写的脏页都会封装成wb_writeback_work,并提交到该链表。
/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*
* Describes one writeback request. A request is linked into the work_list
* of a backing_dev_info through its list member.
*/
struct wb_writeback_work {
long nr_pages;
struct super_block *sb;
unsigned long *older_than_this;
enum writeback_sync_modes sync_mode;
unsigned int tagged_writepages:1;
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
enum wb_reason reason; /* why was writeback initiated? */
// Links this request into the work_list of a backing_dev_info
struct list_head list; /* pending work list */
// For a blocking writeback: completion used to tell the waiting caller that the work is done
struct completion *done; /* set if the caller waits*/
};
封装writeback work时还会标记回写的缘由,是周期回写或是后台回写亦或是手动刷缓存。其中的list就是用于挂接到backing_dev_info中work_list链表。
而处理wb_writeback_work的实际工作队列和要操作的inode则存放在struct bdi_writeback中,
/*
 * Writeback state embedded in a backing_dev_info: the delayed work item
 * that performs writeback plus the lists of inodes it operates on.
 */
struct bdi_writeback {
struct backing_dev_info *bdi; /* our parent bdi */
unsigned int nr;
unsigned long last_old_flush; /* last old data flush */
// Writeback worker: a delayed work item, i.e. the work plus its timer
struct delayed_work dwork; /* work item used for writeback */
// Inodes that need to be flushed are linked onto this list
struct list_head b_dirty; /* dirty inodes */
// While the writeback worker runs, inodes on b_dirty are moved to b_io for processing
struct list_head b_io; /* parked for writeback */
// When b_io is overloaded, inodes are also moved to b_more_io
struct list_head b_more_io; /* parked for more writeback */
spinlock_t list_lock; /* protects the b_* lists */
};
在内核初始化时,会创建一个bdi_wq工作队列,用于管理所有BDI设备的writeback工作,同时创建一个默认的default BDI结构,
/*
 * Init-time setup of the writeback infrastructure: allocates the bdi_wq
 * workqueue, initialises default_backing_dev_info (and registers it under
 * the name default when that succeeds), then initialises
 * noop_backing_dev_info.
 *
 * Returns -ENOMEM if the workqueue cannot be allocated, otherwise the
 * result of bdi_init(&noop_backing_dev_info).
 *
 * NOTE(review): the result of the first bdi_init() call is overwritten by
 * the second one, and the return value of bdi_register() is not checked.
 */
static int __init default_bdi_init(void)
{
int err;
// Create the bdi_wq workqueue
bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
WQ_UNBOUND | WQ_SYSFS, 0);
if (!bdi_wq)
return -ENOMEM;
// Initialise default_backing_dev_info
err = bdi_init(&default_backing_dev_info);
if (!err)
// Register the bdi device; the same function is called when a disk is added, to put the bdi on the global bdi_list
// Here it creates the /sys/kernel/debug/bdi/default directory
// Other BDI devices are usually registered under their major:minor device numbers, e.g. 253:0
bdi_register(&default_backing_dev_info, NULL, "default");
err = bdi_init(&noop_backing_dev_info);
return err;
}
/*
 * Initialise a backing_dev_info (excerpt: the ... lines stand for code
 * omitted from this listing). The step shown here sets up the embedded
 * bdi_writeback through bdi_wb_init().
 */
int bdi_init(struct backing_dev_info *bdi)
{
...
bdi_wb_init(&bdi->wb, bdi);
...
}
/*
 * Initialise the writeback state @wb and bind it to its owning @bdi:
 * zero the structure, record the parent and the current jiffies, set up
 * the three inode lists and their lock, and install the work handler.
 */
static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
// Start from an all-zero structure before filling in the fields
memset(wb, 0, sizeof(*wb));
wb->bdi = bdi;
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
spin_lock_init(&wb->list_lock);
// Install the callback for the delayed work: bdi_writeback_workfn is the worker that writes back dirty pages
INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
}
可见最终执行脏页回写的就是bdi_writeback_workfn函数。
BDI子系统使用workqueue机制进行数据回写,其通过bdi_queue_work提交具体的writeback任务,也就是把回写请求(wb_writeback_work)挂到bdi的work_list链表上,再通过bdi_wq工作队列调度wb.dwork来处理。
/*
 * Queue the writeback request @work on @bdi and kick the writeback work.
 *
 * If @bdi is not registered the request is not queued; when the caller is
 * waiting (work->done is set) it is completed immediately instead.
 */
static void bdi_queue_work(struct backing_dev_info *bdi,
struct wb_writeback_work *work)
{
trace_writeback_queue(bdi, work);
// wb_lock covers both the work_list update and the scheduling of wb.dwork
spin_lock_bh(&bdi->wb_lock);
if (!test_bit(BDI_registered, &bdi->state)) {
// Not registered: do not queue, just release a waiting caller
if (work->done)
complete(work->done);
goto out_unlock;
}
// Append the wb_writeback_work request to the work_list
list_add_tail(&work->list, &bdi->work_list);
// Schedule the delayed work on bdi_wq with zero delay so the request gets processed
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
out_unlock:
spin_unlock_bh(&bdi->wb_lock);
}
另一种情况是通过bdi_wakeup_thread唤醒回写:此时不提交新的writeback任务,只启动writeback工作队列,主要发生在写操作路径。
/*
 * Kick the writeback work of @bdi without queueing a new request.
 * Does nothing if @bdi is not registered.
 */
static void bdi_wakeup_thread(struct backing_dev_info *bdi)
{
spin_lock_bh(&bdi->wb_lock);
if (test_bit(BDI_registered, &bdi->state))
// Schedule the writeback work on bdi_wq right away (zero delay)
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
spin_unlock_bh(&bdi->wb_lock);
}
触发writeback的地方主要有以下几处,
sync->
SYSCALL_DEFINE0(sync)->
sync_inodes_one_sb->
sync_inodes_sb->
bdi_queue_work
syncfs系统调用:这个和手动执行sync类似,只不过sync是针对所有superblock,而syncfs是针对某个superblock,因此调用路径类似。
SYSCALL_DEFINE1(syncfs, int, fd)->
sync_filesystem->
__sync_filesystem->
sync_inodes_sb->
bdi_queue_work
free_more_memory->
wakeup_flusher_threads->
__bdi_start_writeback->
bdi_queue_work
__alloc_pages_nodemask->
__alloc_pages_slowpath->
__alloc_pages_direct_reclaim->
__perform_reclaim->
try_to_free_pages->
do_try_to_free_pages->
wakeup_flusher_threads->
__bdi_start_writeback->
bdi_queue_work
该路径不提交writeback任务,只启动writeback工作队列
ext4_file_write->
ext4_file_dio_write->
__generic_file_aio_write->
generic_file_buffered_write->
generic_perform_write->
balance_dirty_pages_ratelimited->
balance_dirty_pages->
bdi_start_background_writeback->
bdi_wakeup_thread