The kernel source used in this article is Linux 4.15.2.
The smallest schedulable unit of the workqueue mechanism is the work item, represented by struct work_struct (include/linux/workqueue.h):
struct work_struct {
/*
 * The low bits hold the work's flags; the remaining bits store either
 * the ID of the worker_pool this work last ran in, or a pointer to its
 * pool_workqueue.
 */
atomic_long_t data;
struct list_head entry;
work_func_t func;
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
};
A variant of work_struct is delayed_work, which defers execution through a timer:
struct delayed_work {
struct work_struct work;
struct timer_list timer;
/* target workqueue and CPU ->timer uses to queue ->work */
struct workqueue_struct *wq;
int cpu;
};
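As a minimal usage sketch (all names here are hypothetical; the schedule_* APIs used below are covered later in this article), a driver declares a work item and a handler like so:

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void my_handler(struct work_struct *work)
{
	/* runs later in a kworker thread, in process context */
}

static DECLARE_WORK(my_work, my_handler);
static DECLARE_DELAYED_WORK(my_dwork, my_handler);

static void kick_off(void)
{
	schedule_work(&my_work);                                /* run as soon as possible */
	schedule_delayed_work(&my_dwork, msecs_to_jiffies(10)); /* run after ~10 ms */
}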
Work items run in kernel threads (the kworker entries seen in top). In the code these kernel threads are called workers: a worker is like an assembly-line worker, and a work item is a job on that line.
struct worker is defined in kernel/workqueue_internal.h:
/*
* The poor guys doing the actual heavy lifting. All on-duty workers are
* either serving the manager role, on idle list or on busy hash. For
* details on the locking annotation (L, I, X...), refer to workqueue.c.
*
* Only to be used in workqueue and async.
*/
struct worker {
/* on idle list while idle, on busy hash table while busy */
union {
struct list_head entry; /* L: while idle */
struct hlist_node hentry; /* L: while busy */
};
struct work_struct *current_work; /* L: work being processed */
work_func_t current_func; /* L: current_work's fn */
struct pool_workqueue *current_pwq; /* L: current_work's pwq */
bool desc_valid; /* ->desc is valid */
struct list_head scheduled; /* L: scheduled works */
/* 64 bytes boundary on 64bit, 32 on 32bit */
struct task_struct *task; /* I: worker task */
struct worker_pool *pool; /* I: the associated pool */
/* L: for rescuers */
struct list_head node; /* A: anchored at pool->workers */
/* A: runs through worker->node */
unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */
int id; /* I: worker id */
/*
* Opaque string set with work_set_desc(). Printed out with task
* dump for debugging - WARN, BUG, panic or sysrq.
*/
char desc[WORKER_DESC_LEN];
/* used only by rescuers to point to the target workqueue */
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
};
The workqueue itself is represented by struct workqueue_struct: the externally visible workqueue, which relays issued work items to the appropriate worker_pool through its pool_workqueues.
/*
* The externally visible workqueue. It relays the issued work items to
* the appropriate worker_pool through its pool_workqueues.
*/
struct workqueue_struct {
struct list_head pwqs; /* WR: all pwqs of this wq */
struct list_head list; /* PR: list of all workqueues */
struct mutex mutex; /* protects this wq */
int work_color; /* WQ: current work color */
int flush_color; /* WQ: current flush color */
atomic_t nr_pwqs_to_flush; /* flush in progress */
struct wq_flusher *first_flusher; /* WQ: first flusher */
struct list_head flusher_queue; /* WQ: flush waiters */
struct list_head flusher_overflow; /* WQ: flush overflow list */
struct list_head maydays; /* MD: pwqs requesting rescue */
struct worker *rescuer; /* I: rescue worker */
int nr_drainers; /* WQ: drain in progress */
int saved_max_active; /* WQ: saved pwq max_active */
struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */
#ifdef CONFIG_SYSFS
struct wq_device *wq_dev; /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
/*
* Destruction of workqueue_struct is sched-RCU protected to allow
* walking the workqueues list without grabbing wq_pool_mutex.
* This is used to dump all workqueues from sysrq.
*/
struct rcu_head rcu;
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};
Workqueues were introduced in kernel 2.5 and had the following problems:
on large systems with many CPUs, the number of kernel threads grew too large, consuming the system's PID space;
with that many kernel threads, concurrency was poor and scheduling overhead high, and worker threads were bound one-to-one to CPUs, so CPU resources were not well utilized.
Kernel 2.6.36 introduced CMWQ (concurrency-managed workqueues), which added the concept of a worker thread pool, defined in kernel/workqueue.c:
struct worker_pool {
spinlock_t lock; /* the pool lock */
// ID of the CPU for BOUND pools; -1 for UNBOUND pools
int cpu; /* I: the associated cpu */
// for UNBOUND workqueues, the memory (NUMA) node ID
int node; /* I: the associated node ID */
int id; /* I: pool ID */
unsigned int flags; /* X: flags */
unsigned long watchdog_ts; /* L: watchdog timestamp */
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
/* nr_idle includes the ones off idle_list for rebinding */
int nr_idle; /* L: currently idle ones */
struct list_head idle_list; /* X: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
struct timer_list mayday_timer; /* L: SOS timer for workers */
/* a worker is either on busy_hash or idle_list, or the manager */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
/* see manage_workers() for details on the two manager mutexes */
struct worker *manager; /* L: purely informational */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
struct completion *detach_completion; /* all workers detached */
struct ida worker_ida; /* worker IDs for task name */
struct workqueue_attrs *attrs; /* I: worker attributes */
struct hlist_node hash_node; /* PL: unbound_pool_hash node */
int refcnt; /* PL: refcnt for unbound pools */
/*
* The current concurrency level. As it's likely to be accessed
* from other CPUs during try_to_wake_up(), put it in a separate
* cacheline.
*/
atomic_t nr_running ____cacheline_aligned_in_smp;
/*
* Destruction of pool is sched-RCU protected to allow dereferences
* from get_work_pool().
*/
struct rcu_head rcu;
} ____cacheline_aligned_in_smp;
Each CPU has two worker-pools (one normal-priority, one high-priority).
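The two priorities correspond to nice values; a short excerpt from workqueue_init_early(), where the per-CPU pools are set up (abridged, kernel/workqueue.c):

	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
	...
	pool->attrs->nice = std_nice[i++];

(The pool->attrs->nice < 0 check in create_worker(), quoted further below, is what appends the "H" suffix to high-priority kworker names.)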
CMWQ defines a pool_workqueue data structure that connects a workqueue to a worker-pool:
struct pool_workqueue {
struct worker_pool *pool; /* I: the associated pool */
struct workqueue_struct *wq; /* I: the owning workqueue */
.......
} __aligned(1 << WORK_STRUCT_FLAG_BITS);
kworker threads as they appear in top:
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
1 root 20 0 168172 12120 8324 S 0.0 0.1 0:07.35 systemd
2 root 20 0 0 0 0 S 0.0 0.0 0:00.10 kthreadd
6 root 0 -20 0 0 0 I 0.0 0.0 0:00.00 netns
8 root 0 -20 0 0 0 I 0.0 0.0 0:00.00 kworker/0:0H-kblockd
12 root 20 0 0 0 0 S 0.0 0.0 0:00.00 rcu_tasks_trace
The code that creates a kworker is below (via kthread_create_on_node()); once created, the worker is attached to its worker_pool with worker_attach_to_pool(worker, pool):
/**
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
*
* Create and start a new worker which is attached to @pool.
*
* CONTEXT:
* Might sleep. Does GFP_KERNEL allocations.
*
* Return:
* Pointer to the newly created worker.
*/
static struct worker *create_worker(struct worker_pool *pool)
{
struct worker *worker = NULL;
int id = -1;
char id_buf[16];
/* ID is needed to determine kthread name */
id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
if (id < 0)
goto fail;
worker = alloc_worker(pool->node);
if (!worker)
goto fail;
worker->pool = pool;
worker->id = id;
if (pool->cpu >= 0)
snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
pool->attrs->nice < 0 ? "H" : "");
else
snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
"kworker/%s", id_buf);
if (IS_ERR(worker->task))
goto fail;
set_user_nice(worker->task, pool->attrs->nice);
kthread_bind_mask(worker->task, pool->attrs->cpumask);
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
/* start the newly created worker */
spin_lock_irq(&pool->lock);
worker->pool->nr_workers++;
worker_enter_idle(worker);
wake_up_process(worker->task);
spin_unlock_irq(&pool->lock);
return worker;
fail:
if (id >= 0)
ida_simple_remove(&pool->worker_ida, id);
kfree(worker);
return NULL;
}
Each CPU defines two worker-pools (NR_STD_WORKER_POOLS == 2):
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
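These per-CPU pools are iterated with a helper macro (kernel/workqueue.c):

#define for_each_cpu_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)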
create_worker() was shown above; now let's look at where create_worker() is called and how the worker_pools are populated, to trace the overall workqueue flow:
/**
* workqueue_init - bring workqueue subsystem fully online
*
* This is the latter half of two-staged workqueue subsystem initialization
* and invoked as soon as kthreads can be created and scheduled.
* Workqueues have been created and work items queued on them, but there
* are no kworkers executing the work items yet. Populate the worker pools
* with the initial workers and enable future kworker creations.
*/
int __init workqueue_init(void)
{
struct workqueue_struct *wq;
struct worker_pool *pool;
int cpu, bkt;
/*
* It'd be simpler to initialize NUMA in workqueue_init_early() but
* CPU to node mapping may not be available that early on some
* archs such as power and arm64. As per-cpu pools created
* previously could be missing node hint and unbound pools NUMA
* affinity, fix them up.
*/
wq_numa_init();
mutex_lock(&wq_pool_mutex);
for_each_possible_cpu(cpu) {
for_each_cpu_worker_pool(pool, cpu) {
pool->node = cpu_to_node(cpu);
}
}
list_for_each_entry(wq, &workqueues, list)
wq_update_unbound_numa(wq, smp_processor_id(), true);
mutex_unlock(&wq_pool_mutex);
/* create the initial workers */
for_each_online_cpu(cpu) {
for_each_cpu_worker_pool(pool, cpu) {
pool->flags &= ~POOL_DISASSOCIATED;
BUG_ON(!create_worker(pool));
}
}
hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
BUG_ON(!create_worker(pool));
wq_online = true;
wq_watchdog_init();
return 0;
}
workqueue_init() creates the kworker kernel threads according to the number of CPUs and the two worker_pools per CPU.
The kernel thread function is as follows:
/**
* worker_thread - the worker thread function
* @__worker: self
*
* The worker thread function. All workers belong to a worker_pool -
* either a per-cpu one or dynamic unbound one. These workers process all
* work items regardless of their specific target workqueue. The only
* exception is work items which belong to workqueues with a rescuer which
* will be explained in rescuer_thread().
*
* Return: 0
*/
static int worker_thread(void *__worker)
{
struct worker *worker = __worker;
struct worker_pool *pool = worker->pool;
/* tell the scheduler that this is a workqueue worker */
worker->task->flags |= PF_WQ_WORKER;
woke_up:
spin_lock_irq(&pool->lock);
/* am I supposed to die? */
if (unlikely(worker->flags & WORKER_DIE)) {
spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
worker->task->flags &= ~PF_WQ_WORKER;
set_task_comm(worker->task, "kworker/dying");
ida_simple_remove(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker, pool);
kfree(worker);
return 0;
}
worker_leave_idle(worker);
recheck:
/* no more worker necessary? */
if (!need_more_worker(pool))
goto sleep;
/* do we need to manage? */
if (unlikely(!may_start_working(pool)) && manage_workers(worker))
goto recheck;
/*
* ->scheduled list can only be filled while a worker is
* preparing to process a work or actually processing it.
* Make sure nobody diddled with it while I was sleeping.
*/
WARN_ON_ONCE(!list_empty(&worker->scheduled));
/*
* Finish PREP stage. We're guaranteed to have at least one idle
* worker or that someone else has already assumed the manager
* role. This is where @worker starts participating in concurrency
* management if applicable and concurrency management is restored
* after being rebound. See rebind_workers() for details.
*/
worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
do {
struct work_struct *work =
list_first_entry(&pool->worklist,
struct work_struct, entry);
pool->watchdog_ts = jiffies;
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */
process_one_work(worker, work);
if (unlikely(!list_empty(&worker->scheduled)))
process_scheduled_works(worker);
} else {
move_linked_works(work, &worker->scheduled, NULL);
process_scheduled_works(worker);
}
} while (keep_working(pool));
worker_set_flags(worker, WORKER_PREP);
sleep:
/*
* pool->lock is held and there's no work to process and no need to
* manage, sleep. Workers are woken up only while holding
* pool->lock or from local cpu, so setting the current state
* before releasing pool->lock is enough to prevent losing any
* event.
*/
worker_enter_idle(worker);
__set_current_state(TASK_IDLE);
spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
}
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
......
lockdep_invariant_state(true);
trace_workqueue_execute_start(work);
worker->current_func(work); /* invoke the work item's handler */
......
}
So the initialization flow is:
kernel_init ->
kernel_init_freeable ->
workqueue_init->
create_worker->
worker_thread->
process_one_work->
worker->current_func(work);
In words: at boot, workqueue_init() runs; for each CPU it creates two worker_pools (one normal, one high-priority) and one worker thread per pool (with worker_thread() as the thread function); the thread calls process_one_work(), which finally invokes worker->current_func(work).
On the user side, initialization goes through alloc_workqueue(); the body of __alloc_workqueue_key() is not listed here:
#define alloc_workqueue(fmt, flags, max_active, args...) \
__alloc_workqueue_key((fmt), (flags), (max_active), \
NULL, NULL, ##args)
Here is a usage example, from mm/memcontrol.c:
struct workqueue_struct *memcg_kmem_cache_wq;
static int __init mem_cgroup_init(void)
{
int cpu, node;
#ifndef CONFIG_SLOB
/*
* Kmem cache creation is mostly done with the slab_mutex held,
* so use a workqueue with limited concurrency to avoid stalling
* all worker threads in case lots of cgroups are created and
* destroyed simultaneously.
*/
memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); // note here: max_active is 1
BUG_ON(!memcg_kmem_cache_wq);
#endif
........
}
static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
{
struct memcg_kmem_cache_create_work *cw;
cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
if (!cw)
return;
css_get(&memcg->css);
cw->memcg = memcg;
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func); // note here: bind the work item to its handler
queue_work(memcg_kmem_cache_wq, &cw->work); // note here: queue it on the dedicated workqueue
}
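The handler bound above recovers its wrapper structure with container_of(); roughly (abridged from the same file):

static void memcg_kmem_cache_create_func(struct work_struct *w)
{
	struct memcg_kmem_cache_create_work *cw =
		container_of(w, struct memcg_kmem_cache_create_work, work);

	memcg_create_kmem_cache(cw->memcg, cw->cachep);

	css_put(&cw->memcg->css);
	kfree(cw);
}

This embed-and-container_of pattern is the standard way to pass arguments to a work handler.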
The above shows how a workqueue is created at run time: a driver calls the alloc_workqueue macro, and the workqueue is created and added to the global workqueues list.
A work item queued on a workqueue ultimately has its callback executed by a worker thread (worker_thread) from a worker-pool. Worker-pools are shared (a fixed set is created per CPU), so the workqueue must find a suitable worker-pool and have a suitable worker thread dispatched from it. Finding that worker-pool is mediated by pool_workqueue (as noted earlier: the externally visible workqueue relays issued work items to the appropriate worker_pool through its pool_workqueues).
For the details of how pool_workqueue is used, see __queue_work() in kernel/workqueue.c:
static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work)
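Its core selection logic is, in essence (excerpted):

	/* pick the pool_workqueue for the chosen cpu */
	if (!(wq->flags & WQ_UNBOUND))
		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
	else
		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));

For a bound workqueue the per-CPU pwq (and hence that CPU's worker_pool) is used; for an unbound one, the pwq of the local NUMA node is chosen.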
// creates a workqueue with (historically) one worker thread per CPU in the system
struct workqueue_struct *create_workqueue(const char *name)
// creates a workqueue served by a single worker thread for the whole system
struct workqueue_struct *create_singlethread_workqueue(const char *name)
Both functions boil down to alloc_workqueue -> __alloc_workqueue_key.
Note: create_workqueue historically created one worker thread per CPU; under CMWQ it is just a legacy wrapper around alloc_workqueue (see the #define further below).
bool queue_work(struct workqueue_struct *wq, struct work_struct *work);
bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay);
bool cancel_work(struct work_struct *work);
bool cancel_work_sync(struct work_struct *work);
bool cancel_delayed_work(struct delayed_work *dwork);
bool cancel_delayed_work_sync(struct delayed_work *dwork);
void flush_workqueue(struct workqueue_struct *wq); // waits for all work items queued on the workqueue to finish
void destroy_workqueue(struct workqueue_struct *wq);
Note that all the _sync functions may sleep, so they can only be used in process context.
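For example, a typical teardown path for a driver that owns a dedicated workqueue might look like this (a sketch; all names are hypothetical):

#include <linux/workqueue.h>

struct my_driver_data {
	struct workqueue_struct *wq;
	struct work_struct irq_work;
	struct delayed_work poll_work;
};

static void my_driver_teardown(struct my_driver_data *priv)
{
	cancel_work_sync(&priv->irq_work);          /* may sleep: waits if the handler is running */
	cancel_delayed_work_sync(&priv->poll_work); /* also deactivates the pending timer */
	flush_workqueue(priv->wq);                  /* drain anything still queued */
	destroy_workqueue(priv->wq);                /* safe now: the queue is empty */
}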
In most cases you do not need a dedicated workqueue of your own, and create_workqueue (historically one worker thread per CPU) is not economical: it ties up too many resources. Instead you can use the kernel's shared workqueue. This shared workqueue is created at boot by workqueue_init_early(); the global queue is the so-called system_wq. (In the old pre-CMWQ implementation each CPU had a dedicated thread named events/n, where n is the CPU number; under CMWQ, system_wq is served by the shared worker pools.) Work can be handed to the shared workqueue with the following APIs.
// schedule on the current CPU
bool schedule_work(struct work_struct *work);
bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
// the _on variants target a specific CPU
bool schedule_work_on(int cpu, struct work_struct *work);
bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
The source of schedule_work() is below: a thin wrapper around queue_work() that runs the work on system_wq.
static inline bool schedule_work(struct work_struct *work)
{
return queue_work(system_wq, work);
}
system_wq is created in workqueue_init_early():
system_wq = alloc_workqueue("events", 0, 0);
// flushes the shared workqueue (system_wq)
void flush_scheduled_work(void);
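A common pattern on the shared workqueue is a self-rearming delayed work for periodic polling (a sketch; names are hypothetical):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_dwork, poll_fn);

static void poll_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);

	/* ... poll the hardware here ... */

	schedule_delayed_work(dwork, msecs_to_jiffies(100)); /* run again in ~100 ms */
}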
Note that the APIs below belong to the legacy workqueue API and may be removed in newer kernels; new code should use the replacements that follow:
create_workqueue()
create_singlethread_workqueue()
create_freezable_workqueue()
The new APIs are:
#define alloc_workqueue(fmt, flags, max_active, args...)
// creates a workqueue that processes work items one at a time, in queueing (FIFO) order
#define alloc_ordered_workqueue(fmt, flags, args...)
The old-to-new mapping and the corresponding parameters can be read from the source; one example is listed here, the others are similar:
#define create_workqueue(name)					\
	alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
Comparing the old and new APIs in the source, they all rest on the same underlying function: __alloc_workqueue_key().
alloc_workqueue takes max_active and flags arguments. To illustrate max_active:
if max_active is 5, at most 5 work items from this workqueue can execute concurrently on each CPU.
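For instance, a hypothetical call:

/* a bound (per-cpu) workqueue allowing up to 5 concurrent work items per CPU */
struct workqueue_struct *my_wq = alloc_workqueue("my_wq", 0, 5);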
The flags are described as follows