Linux内核的延迟写(writeback)机制,是指应用通过write写入数据之后,内核只是把数据写入page cache并将对应的页标记为dirty就返回,之后再通过延迟回写的方式将这些脏数据最终回写到磁盘上。本文基于Linux-3.10的源码简单学习一下相关机制。
Linux内核当前是通过bdi(backing_dev_info)及其内嵌的bdi_writeback结构来管理数据回写的,相关的初始化方式如下。
/*
 * Boot-time setup of the writeback infrastructure: allocate the
 * "writeback" workqueue and initialise the two built-in
 * backing_dev_info objects.
 *
 * Returns -ENOMEM when the workqueue cannot be allocated, otherwise the
 * result of initialising noop_backing_dev_info.  A failure to initialise
 * default_backing_dev_info only skips its registration; that error is
 * not propagated to the caller.
 */
static int __init default_bdi_init(void)
{
	int ret;

	/* All writeback work items are executed on this workqueue. */
	bdi_wq = alloc_workqueue("writeback",
				 WQ_MEM_RECLAIM | WQ_FREEZABLE |
				 WQ_UNBOUND | WQ_SYSFS,
				 0);
	if (bdi_wq == NULL)
		return -ENOMEM;

	/* Register the default bdi only if its initialisation succeeded. */
	ret = bdi_init(&default_backing_dev_info);
	if (ret == 0)
		bdi_register(&default_backing_dev_info, NULL, "default");

	return bdi_init(&noop_backing_dev_info);
}
subsys_initcall(default_bdi_init);
/*
 * Initialise a backing_dev_info: ratios, lock, list heads, the embedded
 * writeback state, the per-cpu statistics counters and the bandwidth
 * estimation fields.
 *
 * Returns 0 on success, or the error reported by percpu_counter_init() /
 * fprop_local_init_percpu().  On failure every statistics counter that
 * had already been set up is destroyed again.
 */
int bdi_init(struct backing_dev_info *bdi)
{
	int idx, ret;

	bdi->dev = NULL;

	bdi->min_ratio = 0;
	bdi->max_ratio = 100;
	bdi->max_prop_frac = FPROP_FRAC_BASE;

	spin_lock_init(&bdi->wb_lock);
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->work_list);

	/* Sets up bdi->wb, including its delayed-work callback. */
	bdi_wb_init(&bdi->wb, bdi);

	for (idx = 0; idx < NR_BDI_STAT_ITEMS; idx++) {
		ret = percpu_counter_init(&bdi->bdi_stat[idx], 0);
		if (ret)
			goto destroy_counters;
	}

	bdi->dirty_exceeded = 0;

	bdi->bw_time_stamp = jiffies;
	bdi->written_stamp = 0;

	bdi->balanced_dirty_ratelimit = INIT_BW;
	bdi->dirty_ratelimit = INIT_BW;
	bdi->write_bandwidth = INIT_BW;
	bdi->avg_write_bandwidth = INIT_BW;

	ret = fprop_local_init_percpu(&bdi->completions);
	if (!ret)
		return ret;

destroy_counters:
	/*
	 * idx is the number of counters successfully initialised so far
	 * (all of them when we fall through from above); undo them in
	 * reverse order.
	 */
	while (idx--)
		percpu_counter_destroy(&bdi->bdi_stat[idx]);

	return ret;
}
EXPORT_SYMBOL(bdi_init);
/*
 * Reset @wb to a clean state and attach it to @bdi.
 *
 * The structure is zeroed first, then its back-pointer, timestamp, lock,
 * inode lists and delayed work item are set up.  bdi_writeback_workfn is
 * installed as the handler that runs when wb->dwork fires.
 */
static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;

	spin_lock_init(&wb->list_lock);

	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);

	/* Delayed writeback is carried out by bdi_writeback_workfn(). */
	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
}
其中通过subsys_initcall宏注册的default_bdi_init函数,会在start_kernel --> rest_init所创建的kernel_init线程中,经由do_initcalls最终被调用。
fs_initcall、early_initcall、__init宏_LPSTC123的博客-CSDN博客_early_initcall
下面以sync系统调用为例,看一下主动触发数据回写的流程。
/*
 * sync(2): start writeback everywhere, then walk all superblocks and
 * block devices.  Always returns 0.
 */
SYSCALL_DEFINE0(sync)
{
	int no_wait = 0;
	int do_wait = 1;

	/* nr_pages == 0 asks the flushers to write back everything dirty. */
	wakeup_flusher_threads(0, WB_REASON_SYNC);

	iterate_supers(sync_inodes_one_sb, NULL);

	/* Two passes over the superblocks: first flag 0, then flag 1. */
	iterate_supers(sync_fs_one_sb, &no_wait);
	iterate_supers(sync_fs_one_sb, &do_wait);

	/* Block devices: a write pass followed by a wait pass. */
	iterate_bdevs(fdatawrite_one_bdev, NULL);
	iterate_bdevs(fdatawait_one_bdev, NULL);

	if (unlikely(laptop_mode))
		laptop_sync_completion();

	return 0;
}
sync中主动调用了wakeup_flusher_threads函数,由它唤醒回写(flusher)工作线程来进行数据回写的操作。
/*
 * Kick off writeback of @nr_pages pages on every bdi that currently has
 * dirty IO.  Passing 0 for @nr_pages means "everything": the count is
 * replaced by the global number of dirty file pages plus unstable NFS
 * pages.  @reason is forwarded unchanged to the queued work.
 */
void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
{
	struct backing_dev_info *bdi;

	if (nr_pages == 0)
		nr_pages = global_page_state(NR_FILE_DIRTY) +
			   global_page_state(NR_UNSTABLE_NFS);

	/* The bdi list is traversed under RCU protection. */
	rcu_read_lock();
	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
		if (bdi_has_dirty_io(bdi))
			__bdi_start_writeback(bdi, nr_pages, false, reason);
	}
	rcu_read_unlock();
}
对于每个存在脏数据的bdi(即bdi_has_dirty_io返回真),都会调用__bdi_start_writeback来发起回写。
/*
 * Queue an asynchronous (WB_SYNC_NONE) writeback request for @bdi.
 *
 * @nr_pages, @range_cyclic and @reason are copied into a freshly
 * allocated wb_writeback_work which is then handed to bdi_queue_work().
 * The allocation uses GFP_ATOMIC; if it fails no request is queued and
 * the bdi's delayed work is simply kicked with zero delay instead, so
 * that the worker still runs and can write back old dirty data.
 */
static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
		      bool range_cyclic, enum wb_reason reason)
{
	struct wb_writeback_work *req = kzalloc(sizeof(*req), GFP_ATOMIC);

	if (req == NULL) {
		trace_writeback_nowork(bdi);
		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
		return;
	}

	req->sync_mode = WB_SYNC_NONE;
	req->nr_pages = nr_pages;
	req->range_cyclic = range_cyclic;
	req->reason = reason;

	/* Hand the request over to the bdi's work list. */
	bdi_queue_work(bdi, req);
}
/*
 * Append @work to the tail of @bdi's work list (under wb_lock, with
 * bottom halves disabled) and schedule the bdi's delayed work on bdi_wq
 * with zero delay so the request is picked up.
 */
static void bdi_queue_work(struct backing_dev_info *bdi,
			   struct wb_writeback_work *work)
{
	trace_writeback_queue(bdi, work);

	spin_lock_bh(&bdi->wb_lock);
	list_add_tail(&work->list, &bdi->work_list);
	spin_unlock_bh(&bdi->wb_lock);

	/* Delay of 0: queue the writeback worker right away. */
	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
}
通过mod_delayed_work函数将bdi->wb.dwork放入bdi_wq工作队列中等待处理(此处传入的delay为0,即不再额外延迟)。
/**
 * mod_delayed_work - modify delay of or queue a delayed work
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Thin wrapper: forwards to mod_delayed_work_on() with the CPU argument
 * fixed to WORK_CPU_UNBOUND and returns its result unchanged.
 */
static inline bool mod_delayed_work(struct workqueue_struct *wq,
				    struct delayed_work *dwork,
				    unsigned long delay)
{
	const int cpu = WORK_CPU_UNBOUND;

	return mod_delayed_work_on(cpu, wq, dwork, delay);
}
/*
 * Take ownership of @dwork's pending state and (re)queue it on @wq for
 * @cpu after @delay jiffies.
 *
 * try_to_grab_pending() is retried for as long as it reports -EAGAIN.
 * On a non-negative result the work is queued and the interrupt state
 * saved by try_to_grab_pending() is restored.  The integer result is
 * returned through the bool return type, so any non-zero value
 * (including a negative error such as -ENOENT) reads as true.
 */
bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
			 struct delayed_work *dwork, unsigned long delay)
{
	unsigned long irq_flags;
	int grabbed;

	for (;;) {
		grabbed = try_to_grab_pending(&dwork->work, true, &irq_flags);
		if (likely(grabbed != -EAGAIN))
			break;
	}

	if (unlikely(grabbed < 0))
		return grabbed;	/* -ENOENT becomes %true */

	/* Put the work on the queue with the requested delay. */
	__queue_delayed_work(cpu, wq, dwork, delay);
	local_irq_restore(irq_flags);

	return grabbed;
}
EXPORT_SYMBOL_GPL(mod_delayed_work_on);
工作项被调度执行时,就会调用初始化时通过INIT_DELAYED_WORK注册的回调函数bdi_writeback_workfn。
/*
* Handle writeback of dirty data for the device backed by this bdi. Also
* reschedules periodically and does kupdated style flushing.
*
* @work: the work_struct inside a bdi_writeback's delayed work (dwork);
* the owning bdi_writeback is recovered from it with container_of().
*
* PF_SWAPWRITE is set on the current task for the duration of the call
* and cleared again before returning.
*/
void bdi_writeback_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(to_delayed_work(work),
struct bdi_writeback, dwork);
struct backing_dev_info *bdi = wb->bdi;
long pages_written;
/* Label this worker "flush-<device name>". */
set_worker_desc("flush-%s", dev_name(bdi->dev));
current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
list_empty(&bdi->bdi_list))) {
/*
* The normal path. Keep writing back @bdi until its
* work_list is empty. Note that this path is also taken
* if @bdi is shutting down even when we're running off the
* rescuer as work_list needs to be drained.
*/
do {
pages_written = wb_do_writeback(wb, 0); // process queued work items plus periodic/background checks
trace_writeback_pages_written(pages_written);
} while (!list_empty(&bdi->work_list));
} else {
/*
* bdi_wq can't get enough workers and we're running off
* the emergency worker. Don't hog it. Hopefully, 1024 is
* enough for efficient IO.
*/
pages_written = writeback_inodes_wb(&bdi->wb, 1024,
WB_REASON_FORKER_THREAD);
trace_writeback_pages_written(pages_written);
}
/*
* Re-arm the delayed work when work items are still queued, or when
* there is dirty IO and periodic writeback is enabled (non-zero
* dirty_writeback_interval).
* NOTE(review): the "* 10" before msecs_to_jiffies() suggests the
* interval is kept in centiseconds - confirm.
*/
if (!list_empty(&bdi->work_list) ||
(wb_has_dirty_io(wb) && dirty_writeback_interval))
queue_delayed_work(bdi_wq, &wb->dwork,
msecs_to_jiffies(dirty_writeback_interval * 10)); // not finished: queue again for a later run
current->flags &= ~PF_SWAPWRITE;
}
/*
* Drain the work list of @wb's bdi, then run the periodic (old data) and
* background flush checks.
*
* @wb: writeback state to operate on.
* @force_wait: when non-zero, every work item's sync_mode is overridden
* to WB_SYNC_ALL before it is executed.
*
* The BDI_writeback_running bit is set in the bdi state on entry and
* cleared before returning.
*
* Returns the sum of the values returned by wb_writeback() for each work
* item plus those of wb_check_old_data_flush() and
* wb_check_background_flush().
*/
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
long wrote = 0;
set_bit(BDI_writeback_running, &wb->bdi->state);
while ((work = get_next_work_item(bdi)) != NULL) { // fetch the next queued work item, if any
/*
* Override sync mode, in case we must wait for completion
* because this thread is exiting now.
*/
if (force_wait)
work->sync_mode = WB_SYNC_ALL;
trace_writeback_exec(bdi, work);
wrote += wb_writeback(wb, work); // execute this work item
/*
* Notify the caller of completion if this is a synchronous
* work item, otherwise just free it.
*/
if (work->done)
complete(work->done);
else
kfree(work);
}
/*
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
wrote += wb_check_background_flush(wb);
clear_bit(BDI_writeback_running, &wb->bdi->state);
return wrote;
}
/*
* Execute one writeback work item against @wb.
*
* Loops under wb->list_lock, refilling b_io via queue_io() when it is
* empty and writing inodes through writeback_sb_inodes() (when work->sb
* is set) or __writeback_inodes_wb().  The loop ends when the page budget
* in work->nr_pages is used up, when background/kupdate work must yield
* to other queued work, when background work is below the threshold, or
* when no progress was made and b_more_io is empty.
*
* work->older_than_this is pointed at a local timestamp (oldest_jif)
* that is refreshed on each iteration for kupdate and background work.
*
* Returns the amount by which work->nr_pages was reduced during the call.
*/
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
unsigned long oldest_jif;
struct inode *inode;
long progress;
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
spin_lock(&wb->list_lock);
for (;;) {
/*
* Stop writeback when nr_pages has been consumed
*/
if (work->nr_pages <= 0)
break;
/*
* Background writeout and kupdate-style writeback may
* run forever. Stop them if there is other work to do
* so that e.g. sync can proceed. They'll be restarted
* after the other works are all done.
*/
if ((work->for_background || work->for_kupdate) &&
!list_empty(&wb->bdi->work_list))
break;
/*
* For background writeout, stop when we are below the
* background dirty threshold
*/
if (work->for_background && !over_bground_thresh(wb->bdi))
break;
/*
* Kupdate and background works are special and we want to
* include all inodes that need writing. Livelock avoidance is
* handled by these works yielding to any other work so we are
* safe.
*/
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background)
oldest_jif = jiffies;
trace_writeback_start(wb->bdi, work);
if (list_empty(&wb->b_io))
queue_io(wb, work);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work); // write back inodes of this one superblock
else
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb->bdi, work);
wb_update_bandwidth(wb, wb_start);
/*
* Did we write something? Try for more
*
* Dirty inodes are moved to b_io for writeback in batches.
* The completion of the current batch does not necessarily
* mean the overall work is done. So we keep looping as long
* as made some progress on cleaning pages or inodes.
*/
if (progress)
continue;
/*
* No more inodes for IO, bail
*/
if (list_empty(&wb->b_more_io))
break;
/*
* Nothing written. Wait for some inode to
* become available for writeback. Otherwise
* we'll just busyloop.
*/
if (!list_empty(&wb->b_more_io)) {
trace_writeback_wait(wb->bdi, work);
inode = wb_inode(wb->b_more_io.prev);
/* Take i_lock before dropping list_lock, then sleep on the inode. */
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
/* This function drops i_lock... */
inode_sleep_on_writeback(inode);
spin_lock(&wb->list_lock);
}
}
spin_unlock(&wb->list_lock);
return nr_pages - work->nr_pages;
}
进入wb_writeback函数之后是一个for(;;)循环,直到nr_pages被消耗完、需要让位给其他work、或者已经没有可回写的inode等退出条件满足时才会跳出。循环中接着调用writeback_sb_inodes(或__writeback_inodes_wb)进一步写入。
/*
* Write a portion of b_io inodes which belong to @sb.
*
* @sb: superblock whose inodes are to be written.
* @wb: writeback state holding the b_io list; the function drops and
* re-acquires wb->list_lock around the actual inode writeback.
* @work: the work item; its nr_pages budget is reduced by the number of
* pages written for each inode.
*
* Return the number of pages and/or inodes written.
*/
static long writeback_sb_inodes(struct super_block *sb,
struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
/* Per-call writeback_control derived from @work, covering the whole file range. */
struct writeback_control wbc = {
.sync_mode = work->sync_mode,
.tagged_writepages = work->tagged_writepages,
.for_kupdate = work->for_kupdate,
.for_background = work->for_background,
.range_cyclic = work->range_cyclic,
.range_start = 0,
.range_end = LLONG_MAX,
};
unsigned long start_time = jiffies;
long write_chunk;
long wrote = 0; /* count both pages and inodes */
while (!list_empty(&wb->b_io)) { // keep going while b_io still holds inodes
struct inode *inode = wb_inode(wb->b_io.prev);
if (inode->i_sb != sb) {
if (work->sb) {
/*
* We only want to write back data for this
* superblock, move all inodes not belonging
* to it back onto the dirty list.
*/
redirty_tail(inode, wb);
continue;
}
/*
* The inode belongs to a different superblock.
* Bounce back to the caller to unpin this and
* pin the next superblock.
*/
break;
}
/*
* Don't bother with new inodes or inodes being freed, first
* kind does not need periodic writeout yet, and for the latter
* kind writeout is handled by the freer.
*/
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
redirty_tail(inode, wb);
continue;
}
if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
/*
* If this inode is locked for writeback and we are not
* doing writeback-for-data-integrity, move it to
* b_more_io so that writeback can proceed with the
* other inodes on s_io.
*
* We'll have another go at writing back this inode
* when we completed a full scan of b_io.
*/
spin_unlock(&inode->i_lock);
requeue_io(inode, wb);
trace_writeback_sb_inodes_requeue(inode);
continue;
}
/* From here on list_lock is dropped; i_lock is still held. */
spin_unlock(&wb->list_lock);
/*
* We already requeued the inode if it had I_SYNC set and we
* are doing WB_SYNC_NONE writeback. So this catches only the
* WB_SYNC_ALL case.
*/
if (inode->i_state & I_SYNC) {
/* Wait for I_SYNC. This function drops i_lock... */
inode_sleep_on_writeback(inode);
/* Inode may be gone, start again */
spin_lock(&wb->list_lock);
continue;
}
inode->i_state |= I_SYNC; // mark the inode as under writeback
spin_unlock(&inode->i_lock);
write_chunk = writeback_chunk_size(wb->bdi, work);
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
/*
* We use I_SYNC to pin the inode in memory. While it is set
* evict_inode() will wait so the inode cannot be freed.
*/
__writeback_single_inode(inode, &wbc); // write this inode's pages and, if needed, the inode itself
/* write_chunk - wbc.nr_to_write is the number of pages actually written. */
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote += write_chunk - wbc.nr_to_write;
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
/* An inode that is no longer dirty counts as one unit of progress. */
if (!(inode->i_state & I_DIRTY))
wrote++;
requeue_inode(inode, wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
cond_resched_lock(&wb->list_lock);
/*
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
*/
if (wrote) {
if (time_is_before_jiffies(start_time + HZ / 10UL))
break;
if (work->nr_pages <= 0)
break;
}
}
return wrote;
}
/*
* Write out one inode: first its dirty pages via do_writepages(), then,
* if I_DIRTY_SYNC or I_DIRTY_DATASYNC was set, the inode itself via
* write_inode().
*
* @inode: inode to write; I_SYNC is expected to be set already (a
* warning is emitted otherwise).
* @wbc: writeback control; with WB_SYNC_ALL the data pages are waited on
* before the inode is written.
*
* Returns the first non-zero result among do_writepages(),
* filemap_fdatawait() and write_inode(), or 0 if all succeeded.
*/
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
struct address_space *mapping = inode->i_mapping;
long nr_to_write = wbc->nr_to_write;
unsigned dirty;
int ret;
WARN_ON(!(inode->i_state & I_SYNC));
trace_writeback_single_inode_start(inode, wbc, nr_to_write);
ret = do_writepages(mapping, wbc);
/*
* Make sure to wait on the data before writing out the metadata.
* This is important for filesystems that modify metadata on data
* I/O completion.
*/
if (wbc->sync_mode == WB_SYNC_ALL) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
}
/*
* Some filesystems may redirty the inode during the writeback
* due to delalloc, clear dirty metadata flags right before
* write_inode()
*/
spin_lock(&inode->i_lock);
/* Clear I_DIRTY_PAGES if we've written out all dirty pages */
if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state &= ~I_DIRTY_PAGES;
/* Snapshot the dirty bits before the metadata flags are cleared. */
dirty = inode->i_state & I_DIRTY;
inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
spin_unlock(&inode->i_lock);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
}
trace_writeback_single_inode(inode, wbc, nr_to_write);
return ret;
}
/*
 * Write dirty pages of @mapping as described by @wbc.
 *
 * Nothing is done (and 0 is returned) when wbc->nr_to_write is not
 * positive.  Otherwise the mapping's own ->writepages operation is used
 * when the filesystem provides one, with generic_writepages() as the
 * fallback.  Returns whatever the chosen implementation returns.
 */
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	if (wbc->nr_to_write <= 0)
		return 0;

	/* No filesystem-specific hook: fall back to the generic helper. */
	if (!mapping->a_ops->writepages)
		return generic_writepages(mapping, wbc);

	return mapping->a_ops->writepages(mapping, wbc);
}
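至此,就将回写的操作交给了底层的文件系统来进行处理。以ext4为例,在Linux-3.10中如果启用了延迟分配(delalloc),a_ops->writepages对应的是ext4_da_writepages函数;如果文件系统没有实现writepages,则会通过generic_writepages逐页调用a_ops->writepage(ext4中为ext4_writepage)进行下一步数据的写入操作。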
本文主要是参考资料并跟着源码学习了一下数据回写的机制:内核通过bdi机制,借助workqueue来完成数据的回写。write写入数据时其实只写到page cache这一层就返回了,之后再由延迟回写机制将page中的数据落到底层的存储介质中。回写的触发时机可参考下文所列资料中的内容:既可以通过系统调用sync主动触发,也可以等待系统配置的定时周期到期后执行,在内存不足、需要回收内存的时候同样会触发数据回写的流程。由于本人才疏学浅,如有错误请批评指正。
page cache回写的几种触发方式_程序猿Ricky的日常干货的博客-CSDN博客_pagecache 如何回写
https://zhuanlan.zhihu.com/p/532262364
writeback bdi脏页回写原理linux内核源码解析_dongzhiyan_hjp的博客-CSDN博客_脏页回写
vm内核参数之内存脏页dirty_writeback_centisecs和dirty_expire_centisecs_Blue summer的博客-CSDN博客_dirty_writeback_centisecs
https://www.leviathan.vip/2019/06/01/Linux%E5%86%85%E6%A0%B8%E6%BA%90%E7%A0%81%E5%88%86%E6%9E%90-Page-Cache%E5%8E%9F%E7%90%86%E5%88%86%E6%9E%90/#%E5%90%8E%E5%8F%B0%E7%BA%BF%E7%A8%8B%E5%91%A8%E6%9C%9F%E5%9B%9E%E5%86%99
【linux内核源码】io操作之page cache - 简书
浅谈Linux dirty data配置 - 腾讯云开发者社区-腾讯云
Linux缓存回写——基于linux-4.15_SweeNeil的博客-CSDN博客
https://ywang-wnlo.github.io/posts/646202b9.html