This article draws on the two blog series below; the analysis is based on the 4.18 kernel.
http://www.wowotech.net/sort/irq_subsystem
https://blog.csdn.net/chenying126/article/details/78786406
This article covers two topics:
1. How a work item is queued on a workqueue
2. How work items queued on a workqueue are processed
First, a few commonly used interfaces.
1. Queue a work item on a workqueue that is not bound to a CPU:
/**
* queue_work - queue work on a workqueue
* @wq: workqueue to use
* @work: work to queue
*
* Returns %false if @work was already on a queue, %true otherwise.
*
* We queue the work to the CPU on which it was submitted, but if the CPU dies
* it can be processed by another CPU.
*/
static inline bool queue_work(struct workqueue_struct *wq,
struct work_struct *work)
{
return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}
2. Queue a work item on a workqueue, again not bound to a CPU, after a delay:
/**
* queue_delayed_work - queue work on a workqueue after delay
* @wq: workqueue to use
* @dwork: delayable work to queue
* @delay: number of jiffies to wait before queueing
*
* Equivalent to queue_delayed_work_on() but tries to use the local CPU.
*/
static inline bool queue_delayed_work(struct workqueue_struct *wq,
struct delayed_work *dwork,
unsigned long delay)
{
return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}
The delay here is implemented with a kernel timer: the timer handler is what actually puts the work on the workqueue, as analyzed in an earlier article.
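For reference, in 4.18 the timer callback of a delayed_work is delayed_work_timer_fn(), which simply forwards to __queue_work():
void delayed_work_timer_fn(struct timer_list *t)
{
	struct delayed_work *dwork = from_timer(dwork, t, timer);

	/* should have been called from irqsafe timer with irq already off */
	__queue_work(dwork->cpu, dwork->wq, &dwork->work);
}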
Let's start with queue_work_on:
1. The queue_work_on function
A module using the workqueue mechanism calls queue_work_on (there are other variants, skipped here; the idea is the same) to queue a prepared work item on a workqueue. The code:
/**
* queue_work_on - queue work on specific cpu
* @cpu: CPU number to execute work on
* @wq: workqueue to use
* @work: work to queue
*
* We queue the work to a specific CPU, the caller must ensure it
* can't go away.
*
* Return: %false if @work was already on a queue, %true otherwise.
*/
bool queue_work_on(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
bool ret = false;
unsigned long flags;
	local_irq_save(flags); /* queueing a work runs with local interrupts disabled */
	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
		__queue_work(cpu, wq, work); /* put it on the work list and notify the worker thread pool */
		ret = true;
	}
	local_irq_restore(flags); /* re-enable local interrupts */
return ret;
}
WORK_STRUCT_PENDING_BIT in work_struct's data field marks whether the work is pending or already being processed; a pending work is queued only once. Most of the logic lives in __queue_work(), and the following sections walk through it.
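A minimal usage sketch of the pending-only-once semantics (my_work, my_work_fn and trigger are made-up names, not from the kernel source):
#include <linux/workqueue.h>
#include <linux/printk.h>

static void my_work_fn(struct work_struct *work)
{
	pr_info("running in kworker context\n");
}

static DECLARE_WORK(my_work, my_work_fn);

static void trigger(void)
{
	queue_work(system_wq, &my_work); /* returns true: PENDING was clear */
	queue_work(system_wq, &my_work); /* returns false if my_work_fn has not run yet: PENDING is still set */
}
With that in mind, here is __queue_work() in full: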
static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
struct pool_workqueue *pwq;
struct worker_pool *last_pool;
struct list_head *worklist;
unsigned int work_flags;
unsigned int req_cpu = cpu;
/*
* While a work item is PENDING && off queue, a task trying to
* steal the PENDING will busy-loop waiting for it to either get
* queued or lose PENDING. Grabbing PENDING and queueing should
* happen with IRQ disabled.
*/
lockdep_assert_irqs_disabled();
debug_work_activate(work);
/* if draining, only works from the same workqueue are allowed */
if (unlikely(wq->flags & __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
retry:
if (req_cpu == WORK_CPU_UNBOUND)
cpu = wq_select_unbound_cpu(raw_smp_processor_id());
/* pwq which will be used unless @work is executing elsewhere */
if (!(wq->flags & WQ_UNBOUND))
pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
else
pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
/*
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
*/
last_pool = get_work_pool(work);
if (last_pool && last_pool != pwq->pool) {
struct worker *worker;
spin_lock(&last_pool->lock);
worker = find_worker_executing_work(last_pool, work);
if (worker && worker->current_pwq->wq == wq) {
pwq = worker->current_pwq;
} else {
/* meh... not running there, queue here */
spin_unlock(&last_pool->lock);
spin_lock(&pwq->pool->lock);
}
} else {
spin_lock(&pwq->pool->lock);
}
/*
* pwq is determined and locked. For unbound pools, we could have
* raced with pwq release and it could already be dead. If its
* refcnt is zero, repeat pwq selection. Note that pwqs never die
* without another pwq replacing it in the numa_pwq_tbl or while
* work items are executing on it, so the retrying is guaranteed to
* make forward-progress.
*/
if (unlikely(!pwq->refcnt)) {
if (wq->flags & WQ_UNBOUND) {
spin_unlock(&pwq->pool->lock);
cpu_relax();
goto retry;
}
/* oops */
WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
wq->name, cpu);
}
/* pwq determined, queue */
trace_workqueue_queue_work(req_cpu, pwq, work);
if (WARN_ON(!list_empty(&work->entry))) {
spin_unlock(&pwq->pool->lock);
return;
}
pwq->nr_in_flight[pwq->work_color]++;
work_flags = work_color_to_flags(pwq->work_color);
if (likely(pwq->nr_active < pwq->max_active)) {
trace_workqueue_activate_work(work);
pwq->nr_active++;
worklist = &pwq->pool->worklist;
if (list_empty(worklist))
pwq->pool->watchdog_ts = jiffies;
} else {
work_flags |= WORK_STRUCT_DELAYED;
worklist = &pwq->delayed_works;
}
insert_work(pwq, work, worklist, work_flags);
spin_unlock(&pwq->pool->lock);
}
2. The __WQ_DRAINING flag
At the top, __queue_work() checks the __WQ_DRAINING flag:
/* if draining, only works from the same workqueue are allowed */
if (unlikely(wq->flags & __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
__WQ_DRAINING indicates the workqueue is being drained, which mostly happens when the workqueue is being destroyed: before it may go away, all works queued on it must be processed. While the workqueue is being emptied, what if another work is queued? Normally that is not allowed, since the whole point is to empty the queue. There is one exception, detected by is_chained_work(): a work that is itself being drained (and belongs to this workqueue) queues another work, a so-called chained work; such works are still accepted.
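For reference, is_chained_work() simply checks whether the current task is a worker that is executing a work item of this very workqueue:
static bool is_chained_work(struct workqueue_struct *wq)
{
	struct worker *worker;

	worker = current_wq_worker();
	/*
	 * Return %true iff I'm a worker executing a work item on @wq.  If
	 * I'm @wq's rescuer, it's not chained.
	 */
	return worker && worker->current_pwq->wq == wq;
}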
3. Selecting the pool_workqueue
retry:
	if (req_cpu == WORK_CPU_UNBOUND) /* was a specific CPU requested for this work? */
		cpu = wq_select_unbound_cpu(raw_smp_processor_id());
	/* pwq which will be used unless @work is executing elsewhere */
	if (!(wq->flags & WQ_UNBOUND))
		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); /* bound workqueue: use this CPU's pool_workqueue */
	else
		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); /* unbound workqueue: look up the pool_workqueue of the local NUMA node */
WORK_CPU_UNBOUND means the caller did not specify a CPU; in that case the CPU the code is currently running on is chosen. Once the CPU is settled, a bound workqueue simply uses that CPU's pool_workqueue; an unbound workqueue selects by NUMA node id (cpu_to_node maps a cpu id to its node id). Note that the pool_workqueue chosen here is only a candidate: it may or may not be used, and may yet be replaced, as the next section describes.
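wq_select_unbound_cpu() normally keeps the submitting CPU as long as wq_unbound_cpumask allows it; in 4.18 it looks roughly like this (the round-robin fallback sits behind the wq_debug_force_rr_cpu debug knob):
static int wq_select_unbound_cpu(int cpu)
{
	static bool printed_dbg_warning;
	int new_cpu;

	if (likely(!wq_debug_force_rr_cpu)) {
		if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
			return cpu;
	} else if (!printed_dbg_warning) {
		pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
		printed_dbg_warning = true;
	}

	if (cpumask_empty(wq_unbound_cpumask))
		return cpu;

	/* otherwise, rotate through the allowed online CPUs */
	new_cpu = __this_cpu_read(wq_rr_cpu_last);
	new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
	if (unlikely(new_cpu >= nr_cpu_ids)) {
		new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
		if (unlikely(new_cpu >= nr_cpu_ids))
			return new_cpu;
	}
	__this_cpu_write(wq_rr_cpu_last, new_cpu);

	return new_cpu;
}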
4. Selecting the worker thread pool
Queueing on a workqueue really means queueing on a worker thread pool, because it is the thread pool that ultimately processes the works. A pool_workqueue has an associated worker thread pool (the pool member of struct pool_workqueue), so picking the pool_workqueue seems to pick the worker pool too. However, the candidate's pool is not always suitable: the work may still be executing on a worker of some other thread pool, and to keep the work's callback function from being reentered, the work should be queued on that pool instead:
/*
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
*/
	last_pool = get_work_pool(work); /* read work->data to find which worker_pool last ran this work */
	if (last_pool && last_pool != pwq->pool) { /* the pool that last ran it differs from the candidate */
		struct worker *worker;
		spin_lock(&last_pool->lock);
		worker = find_worker_executing_work(last_pool, work); /* is the work still running on last_pool? If so, this returns the worker executing it */
		if (worker && worker->current_pwq->wq == wq) {
			pwq = worker->current_pwq; /* reuse the pool_workqueue the work is executing on (keeps it cache-hot and non-reentrant) */
} else {
/* meh... not running there, queue here */
spin_unlock(&last_pool->lock);
spin_lock(&pwq->pool->lock);
}
} else {
spin_lock(&pwq->pool->lock);
}
last_pool records which worker pool processed this work last time. If last_pool is the candidate pool_workqueue's pool, all is well; otherwise last_pool must be considered, which is the more involved case because the matching pool_workqueue has to be recovered from it. find_worker_executing_work() finds the worker thread currently processing the work; if there is none (or it is executing for a different workqueue), the pool_workqueue selected in section 3 is used after all; otherwise the pool_workqueue that worker is currently serving is chosen, which also settles the thread pool.
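find_worker_executing_work() walks the pool's busy hash, keyed by the work's address, and also compares the callback to guard against a recycled work_struct address:
static struct worker *find_worker_executing_work(struct worker_pool *pool,
						 struct work_struct *work)
{
	struct worker *worker;

	hash_for_each_possible(pool->busy_hash, worker, hentry,
			       (unsigned long)work)
		if (worker->current_work == work &&
		    worker->current_func == work->func)
			return worker;

	return NULL;
}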
5. Making sure the pwq is usable
/*
* pwq is determined and locked. For unbound pools, we could have
* raced with pwq release and it could already be dead. If its
* refcnt is zero, repeat pwq selection. Note that pwqs never die
* without another pwq replacing it in the numa_pwq_tbl or while
* work items are executing on it, so the retrying is guaranteed to
* make forward-progress.
*/
if (unlikely(!pwq->refcnt)) {
if (wq->flags & WQ_UNBOUND) {
spin_unlock(&pwq->pool->lock);
			cpu_relax(); /* unbound pool_workqueues are released asynchronously; refcnt == 0 means this one is already dead, so go back to retry and select again */
goto retry;
}
	/* oops */
	/* per-cpu pwqs live as long as their workqueue, so their refcnt can never reach 0; if it does, something has gone badly wrong */
WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
wq->name, cpu);
}
/* initialize newly alloced @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
struct worker_pool *pool)
{
BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
memset(pwq, 0, sizeof(*pwq));
pwq->pool = pool;
pwq->wq = wq;
pwq->flush_color = -1;
	pwq->refcnt = 1; /* a freshly allocated pwq starts with refcnt 1 */
INIT_LIST_HEAD(&pwq->delayed_works);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
}
Adding a work to a pwq takes a reference on it:
/**
* insert_work - insert a work into a pool
* @pwq: pwq @work belongs to
* @work: work to insert
* @head: insertion point
* @extra_flags: extra WORK_STRUCT_* flags to set
*
* Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
* work_struct flags.
*
* CONTEXT:
* spin_lock_irq(pool->lock).
*/
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct list_head *head, unsigned int extra_flags)
{
struct worker_pool *pool = pwq->pool;
/* we own @work, set data and link */
set_work_pwq(work, pwq, extra_flags);
list_add_tail(&work->entry, head);
get_pwq(pwq);
/*
* Ensure either wq_worker_sleeping() sees the above
* list_add_tail() or we see zero nr_running to avoid workers lying
* around lazily while there are works to be processed.
*/
smp_mb();
if (__need_more_worker(pool))
wake_up_worker(pool);
}
/**
* get_pwq - get an extra reference on the specified pool_workqueue
* @pwq: pool_workqueue to get
*
* Obtain an extra reference on @pwq. The caller should guarantee that
* @pwq has positive refcnt and be holding the matching pool->lock.
*/
static void get_pwq(struct pool_workqueue *pwq)
{
lockdep_assert_held(&pwq->pool->lock);
WARN_ON_ONCE(pwq->refcnt <= 0);
pwq->refcnt++;
}
After a work has been executed and is to be removed from the list, the reference count is decremented:
/**
* put_pwq - put a pool_workqueue reference
* @pwq: pool_workqueue to put
*
* Drop a reference of @pwq. If its refcnt reaches zero, schedule its
* destruction. The caller should be holding the matching pool->lock.
*/
static void put_pwq(struct pool_workqueue *pwq)
{
lockdep_assert_held(&pwq->pool->lock);
	if (likely(--pwq->refcnt)) /* drop one reference */
return;
if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
return;
/*
* @pwq can't be released under pool->lock, bounce to
* pwq_unbound_release_workfn(). This never recurses on the same
* pool->lock as this path is taken only for unbound workqueues and
* the release work item is scheduled on a per-cpu workqueue. To
* avoid lockdep warning, unbound pool->locks are given lockdep
* subclass of 1 in get_unbound_pool().
*/
schedule_work(&pwq->unbound_release_work);
}
6. Choosing the list the work goes on
There are two candidate lists: the deferred list (pwq->delayed_works) and the list the thread pool actually processes (pwq->pool->worklist). Going on the pool's worklist means the work becomes active and the thread pool starts processing right away; going on the deferred list means the work stays pending:
/* pwq determined, queue */
trace_workqueue_queue_work(req_cpu, pwq, work);
if (WARN_ON(!list_empty(&work->entry))) {
spin_unlock(&pwq->pool->lock);
return;
}
pwq->nr_in_flight[pwq->work_color]++;
work_flags = work_color_to_flags(pwq->work_color);
	if (likely(pwq->nr_active < pwq->max_active)) { /* if the pwq's count of active works is below the limit, use worker_pool->worklist; otherwise defer to delayed_works */
trace_workqueue_activate_work(work);
pwq->nr_active++;
worklist = &pwq->pool->worklist;
if (list_empty(worklist))
pwq->pool->watchdog_ts = jiffies;
} else {
work_flags |= WORK_STRUCT_DELAYED;
worklist = &pwq->delayed_works;
}
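A deferred work is activated later, when earlier works retire (see pwq_dec_nr_in_flight() at the end of this article). The activation helpers move the first delayed work onto the pool's worklist and clear WORK_STRUCT_DELAYED:
static void pwq_activate_delayed_work(struct work_struct *work)
{
	struct pool_workqueue *pwq = get_work_pwq(work);

	trace_workqueue_activate_work(work);
	if (list_empty(&pwq->pool->worklist))
		pwq->pool->watchdog_ts = jiffies;
	move_linked_works(work, &pwq->pool->worklist, NULL);
	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
	pwq->nr_active++;
}

static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
{
	struct work_struct *work = list_first_entry(&pwq->delayed_works,
						    struct work_struct, entry);

	pwq_activate_delayed_work(work);
}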
7. Inserting the work at the tail of the chosen list
insert_work(pwq, work, worklist, work_flags);
spin_unlock(&pwq->pool->lock);
/**
* insert_work - insert a work into a pool
* @pwq: pwq @work belongs to
* @work: work to insert
* @head: insertion point
* @extra_flags: extra WORK_STRUCT_* flags to set
*
* Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
* work_struct flags.
*
* CONTEXT:
* spin_lock_irq(pool->lock).
*/
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct list_head *head, unsigned int extra_flags)
{
struct worker_pool *pool = pwq->pool;
/* we own @work, set data and link */
	set_work_pwq(work, pwq, extra_flags); /* store the pool_workqueue pointer and flags in work->data for later lookups */
	list_add_tail(&work->entry, head); /* append to the tail of the chosen list */
	get_pwq(pwq); /* every queued work takes a pwq reference */
/*
* Ensure either wq_worker_sleeping() sees the above
* list_add_tail() or we see zero nr_running to avoid workers lying
* around lazily while there are works to be processed.
*/
	/* Guarantees that by the time wake_up_worker() wakes a worker and __schedule()->wq_worker_sleeping() runs, the list_add_tail() above is visible; likewise __need_more_worker() below must not read nr_running before the list insertion completes. */
smp_mb();
	if (__need_more_worker(pool)) /* nr_running == 0 means no worker of this pool may be running, so force a wakeup */
wake_up_worker(pool);
}
A workqueue is just a collection of works. Workqueues fall into two groups: those created by the system and those created by users.
Either way, unless WQ_UNBOUND is specified, a workqueue is bound to the normal worker_pools by default.
1. System workqueues
At initialization the system creates a set of default workqueues: system_wq, system_highpri_wq, system_long_wq, system_unbound_wq, system_freezable_wq, system_power_efficient_wq, and system_freezable_power_efficient_wq.
/**
* workqueue_init_early - early init for workqueue subsystem
*
* This is the first half of two-staged workqueue subsystem initialization
* and invoked as soon as the bare basics - memory allocation, cpumasks and
* idr are up. It sets up all the data structures and system workqueues
* and allows early boot code to create workqueues and queue/cancel work
* items. Actual work item execution starts only after kthreads can be
* created and scheduled right before early initcalls.
*/
int __init workqueue_init_early(void)
{
	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; /* two pools per cpu, low and high priority; HIGHPRI_NICE_LEVEL is -20, i.e. prio 100, the highest a normal task can have */
int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
int i, cpu;
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
	/* initialize CPU pools: two thread pools per cpu */
for_each_possible_cpu(cpu) {
struct worker_pool *pool;
i = 0;
		for_each_cpu_worker_pool(pool, cpu) { /* two worker_pools per CPU: the per-cpu cpu_worker_pools[0] and [1] */
			BUG_ON(init_worker_pool(pool)); /* initialize the worker_pool (thread pool) */
			pool->cpu = cpu; /* bind the pool to this cpu */
			cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
			pool->attrs->nice = std_nice[i++]; /* set the nice value */
			pool->node = cpu_to_node(cpu); /* per-cpu pools use the cpu's near memory node */
/* alloc pool ID */
mutex_lock(&wq_pool_mutex);
BUG_ON(worker_pool_assign_id(pool));
mutex_unlock(&wq_pool_mutex);
}
}
/* create default unbound and ordered wq attrs */
	for (i = 0; i < NR_STD_WORKER_POOLS; i++) { /* default unbound and ordered attrs also come in two flavors each (normal and high priority) */
struct workqueue_attrs *attrs;
BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
attrs->nice = std_nice[i];
		unbound_std_wq_attrs[i] = attrs; /* attributes for unbound workqueues */
/*
* An ordered wq should have only one pwq as ordering is
* guaranteed by max_active which is enforced by pwqs.
* Turn off NUMA so that dfl_pwq is used for all nodes.
*/
BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
attrs->nice = std_nice[i];
attrs->no_numa = true;
		ordered_wq_attrs[i] = attrs; /* attributes for ordered workqueues; an ordered workqueue runs only one work item at a time */
}
	system_wq = alloc_workqueue("events", 0, 0); /* normal-priority bound workqueue */
	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); /* high-priority bound workqueue */
	system_long_wq = alloc_workqueue("events_long", 0, 0); /* for works that may take a long time */
	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, /* normal-priority unbound workqueue */
					    WQ_UNBOUND_MAX_ACTIVE);
	system_freezable_wq = alloc_workqueue("events_freezable", /* freezable workqueue: its kworkers can be frozen on system suspend */
					      WQ_FREEZABLE, 0);
	system_power_efficient_wq = alloc_workqueue("events_power_efficient", /* power-efficient workqueue */
						    WQ_POWER_EFFICIENT, 0);
	system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient", /* freezable and power-efficient workqueue */
							      WQ_FREEZABLE | WQ_POWER_EFFICIENT,
							      0);
BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
!system_unbound_wq || !system_freezable_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq);
return 0;
}
Notice that the per-cpu thread pools are initialized directly here. If you remember the earlier articles, the per-cpu thread pools are statically defined, so they need no dynamic allocation and can be initialized in place.
/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
The following helpers all use the system default system_wq:
/**
* schedule_work_on - put work task on a specific cpu
* @cpu: cpu to put the work task on
* @work: job to be done
*
* This puts a job on a specific cpu
*/
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
return queue_work_on(cpu, system_wq, work);
}
/**
* schedule_work - put work task in global workqueue
* @work: job to be done
*
* Returns %false if @work was already on the kernel-global workqueue and
* %true otherwise.
*
* This puts a job in the kernel-global workqueue if it was not already
* queued and leaves it in the same position on the kernel-global
* workqueue otherwise.
*/
static inline bool schedule_work(struct work_struct *work)
{
return queue_work(system_wq, work);
}
/**
* schedule_delayed_work - put work task in global workqueue after delay
* @dwork: job to be done
* @delay: number of jiffies to wait or 0 for immediate execution
*
* After waiting for a given time this puts a job in the kernel-global
* workqueue.
*/
static inline bool schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
return queue_delayed_work(system_wq, dwork, delay);
}
/**
* init_worker_pool - initialize a newly zalloc'd worker_pool
* @pool: worker_pool to initialize
*
* Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
*
* Return: 0 on success, -errno on failure. Even on failure, all fields
* inside @pool proper are initialized and put_unbound_pool() can be called
* on @pool safely to release it.
*/
static int init_worker_pool(struct worker_pool *pool)
{
spin_lock_init(&pool->lock);
pool->id = -1;
	pool->cpu = -1; /* -1 initially, meaning the worker_pool is unbound */
pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED;
pool->watchdog_ts = jiffies;
	INIT_LIST_HEAD(&pool->worklist); /* the pool's work list: workqueues hang works here for the pool's workers to execute */
	INIT_LIST_HEAD(&pool->idle_list); /* the pool's idle worker list */
	hash_init(pool->busy_hash); /* hash of the pool's busy workers */
	/* reaps surplus workers; fires every IDLE_WORKER_TIMEOUT (300 s) */
	timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
	/* set up mayday_timer, period MAYDAY_INTERVAL (100 ms) */
	timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
	INIT_LIST_HEAD(&pool->workers); /* the pool's worker list */
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
	pool->refcnt = 1; /* pool refcount starts at 1; each pool_workqueue attached to the pool takes another reference (see get_unbound_pool) */
/* shouldn't fail above this point */
pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
if (!pool->attrs)
return -ENOMEM;
return 0;
}
1. When does a per-cpu worker pool create worker threads?
2. When does an unbound worker pool create worker threads?
/**
* workqueue_init - bring workqueue subsystem fully online
*
* This is the latter half of two-staged workqueue subsystem initialization
* and invoked as soon as kthreads can be created and scheduled.
* Workqueues have been created and work items queued on them, but there
* are no kworkers executing the work items yet. Populate the worker pools
* with the initial workers and enable future kworker creations.
*/
int __init workqueue_init(void)
{
struct workqueue_struct *wq;
struct worker_pool *pool;
int cpu, bkt;
/*
* It'd be simpler to initialize NUMA in workqueue_init_early() but
* CPU to node mapping may not be available that early on some
* archs such as power and arm64. As per-cpu pools created
* previously could be missing node hint and unbound pools NUMA
* affinity, fix them up.
*
* Also, while iterating workqueues, create rescuers if requested.
*/
wq_numa_init();
mutex_lock(&wq_pool_mutex);
	for_each_possible_cpu(cpu) { /* fix up each pool's NUMA node id */
for_each_cpu_worker_pool(pool, cpu) {
pool->node = cpu_to_node(cpu);
}
}
	/* all workqueues in the system hang off the workqueues list; finish setting them up */
list_for_each_entry(wq, &workqueues, list) {
wq_update_unbound_numa(wq, smp_processor_id(), true);
		WARN(init_rescuer(wq), /* create the rescuer thread */
"workqueue: failed to create early rescuer for %s",
wq->name);
}
mutex_unlock(&wq_pool_mutex);
	/* create the initial workers: one per worker_pool of every online cpu */
for_each_online_cpu(cpu) {
for_each_cpu_worker_pool(pool, cpu) {
pool->flags &= ~POOL_DISASSOCIATED;
			BUG_ON(!create_worker(pool)); /* create an initial worker for the pool */
}
}
	/* also create one initial worker for each unbound pool */
	hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
		BUG_ON(!create_worker(pool));
wq_online = true;
	wq_watchdog_init(); /* initialize the wq watchdog, which detects workers that are stuck */
return 0;
}
So at system initialization, the thread pools shared by the per-cpu workqueues (2 × number of CPUs) and the pools of the unbound workqueues each get an initial worker through create_worker().
Once an initial worker starts, it runs worker_thread() to process works, and creates further workers along the way when needed.
2. When does an unbound thread pool create worker threads?
Let's look at how an unbound thread pool is set up. Unlike the per-CPU case, unbound thread pools are shared globally, so every time an unbound workqueue with new attributes is created, a pool_workqueue and its worker pool must be created too. get_unbound_pool() searches the pools already present in the system for one with matching attributes; if none matches, a new thread pool is created, immediately followed by create_worker() to start an initial worker. As with per-cpu pools, once the initial worker is running, the pool creates more workers dynamically as works are queued and processed. (And of course, as seen above, the unbound thread pools that exist at system initialization already have one worker created.)
/**
* get_unbound_pool - get a worker_pool with the specified attributes
* @attrs: the attributes of the worker_pool to get
*
* Obtain a worker_pool which has the same attributes as @attrs, bump the
* reference count and return it. If there already is a matching
* worker_pool, it will be used; otherwise, this function attempts to
* create a new one.
*
* Should be called with wq_pool_mutex held.
*
* Return: On success, a worker_pool with the same attributes as @attrs.
* On failure, %NULL.
*/
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
u32 hash = wqattrs_hash(attrs);
struct worker_pool *pool;
int node;
int target_node = NUMA_NO_NODE;
lockdep_assert_held(&wq_pool_mutex);
	/* do we already have an unbound pool with these attributes? */
hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
		if (wqattrs_equal(pool->attrs, attrs)) { /* compare attributes */
pool->refcnt++;
			return pool; /* found a match, return it */
}
}
/* if cpumask is contained inside a NUMA node, we belong to that node */
if (wq_numa_enabled) {
for_each_node(node) {
if (cpumask_subset(attrs->cpumask,
wq_numa_possible_cpumask[node])) {
target_node = node;
break;
}
}
}
	/* nope, create a new one */
pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
if (!pool || init_worker_pool(pool) < 0)
goto fail;
lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
copy_workqueue_attrs(pool->attrs, attrs);
pool->node = target_node;
/*
* no_numa isn't a worker_pool attribute, always clear it. See
* 'struct workqueue_attrs' comments for detail.
*/
pool->attrs->no_numa = false;
if (worker_pool_assign_id(pool) < 0)
goto fail;
/* create and start the initial worker */
if (wq_online && !create_worker(pool))
goto fail;
/* install */
	hash_add(unbound_pool_hash, &pool->hash_node, hash); /* add the new pool to the hash table */
return pool;
fail:
if (pool)
put_unbound_pool(pool);
return NULL;
}
3. How a worker is created. The code:
/**
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
*
* Create and start a new worker which is attached to @pool.
*
* CONTEXT:
* Might sleep. Does GFP_KERNEL allocations.
*
* Return:
* Pointer to the newly created worker.
*/
static struct worker *create_worker(struct worker_pool *pool)
{
struct worker *worker = NULL;
int id = -1;
char id_buf[16];
/* ID is needed to determine kthread name */
	id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); /* get a free id from this worker_pool's worker_ida */
if (id < 0)
goto fail;
	worker = alloc_worker(pool->node); /* allocate a worker struct */
if (!worker)
goto fail;
	worker->id = id; /* the id just allocated */
	/* build the worker's name */
if (pool->cpu >= 0)
snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
pool->attrs->nice < 0 ? "H" : "");
else
snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
	/* create the kernel thread; its main loop is worker_thread() */
worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
"kworker/%s", id_buf);
if (IS_ERR(worker->task))
goto fail;
	set_user_nice(worker->task, pool->attrs->nice); /* set the kworker's priority (nice value) */
kthread_bind_mask(worker->task, pool->attrs->cpumask);
/* successful, attach the worker to the pool */
	worker_attach_to_pool(worker, pool); /* tie the worker to the pool */
/* start the newly created worker */
spin_lock_irq(&pool->lock);
	worker->pool->nr_workers++; /* account the new worker in its worker_pool */
	worker_enter_idle(worker); /* put the worker into idle state */
	wake_up_process(worker->task); /* wake the newly created worker */
spin_unlock_irq(&pool->lock);
return worker;
fail:
if (id >= 0)
ida_simple_remove(&pool->worker_ida, id);
kfree(worker);
return NULL;
}
The code is straightforward. Whether a pool is per-CPU or unbound can be read from its binding (the cpu member of struct worker_pool): for per-CPU pools, pool->cpu >= 0. A per-CPU pool's worker is named kworker/<cpu>:<worker id>, with an H suffix for high-priority pools. An unbound pool's worker is named kworker/u<pool id>:<worker id>.
if (pool->cpu >= 0)
snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
pool->attrs->nice < 0 ? "H" : "");
else
snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
In ps output, the following are per-cpu workers:
root 4 0.0 0.0 0 0 ? S 3月15 0:00 [kworker/0:0]
root 5 0.0 0.0 0 0 ? S< 3月15 0:00 [kworker/0:0H]
root 25 0.0 0.0 0 0 ? S< 3月15 0:00 [kworker/3:0H]
root 66 0.0 0.0 0 0 ? S 3月15 0:07 [kworker/1:1]
root 67 0.0 0.0 0 0 ? S 3月15 0:07 [kworker/0:1]
and the following are unbound workers:
root 32 0.0 0.0 0 0 ? S< 3月15 0:00 [kworker/u17:0]
root 4600 0.0 0.0 0 0 ? S 13:38 0:00 [kworker/u16:2]
root 4605 0.0 0.0 0 0 ? S 13:43 0:00 [kworker/u16:0]
root 4612 1.0 0.0 0 0 ? S 14:09 0:00 [kworker/u16:1]
Here is the function first; the analysis follows:
/**
* worker_thread - the worker thread function
* @__worker: self
*
* The worker thread function. All workers belong to a worker_pool -
* either a per-cpu one or dynamic unbound one. These workers process all
* work items regardless of their specific target workqueue. The only
* exception is work items which belong to workqueues with a rescuer which
* will be explained in rescuer_thread().
*
* Return: 0
*/
static int worker_thread(void *__worker)
{
struct worker *worker = __worker;
struct worker_pool *pool = worker->pool;
/* tell the scheduler that this is a workqueue worker */
	set_pf_worker(true); /* analysis 1 */
woke_up:
spin_lock_irq(&pool->lock);
	/* am I supposed to die? (analysis 2) */
if (unlikely(worker->flags & WORKER_DIE)) {
spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
set_pf_worker(false);
set_task_comm(worker->task, "kworker/dying");
ida_simple_remove(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker);
kfree(worker);
return 0;
}
worker_leave_idle(worker);
recheck:
	/* no more worker necessary? (analysis 3) */
if (!need_more_worker(pool))
goto sleep;
	/* do we need to manage? (analysis 4) */
if (unlikely(!may_start_working(pool)) && manage_workers(worker))
goto recheck;
/*
* ->scheduled list can only be filled while a worker is
* preparing to process a work or actually processing it.
* Make sure nobody diddled with it while I was sleeping.
*/
	/* works land on worker->scheduled only while a worker is preparing to process a work or actually processing one */
WARN_ON_ONCE(!list_empty(&worker->scheduled));
	/* analysis 5
* Finish PREP stage. We're guaranteed to have at least one idle
* worker or that someone else has already assumed the manager
* role. This is where @worker starts participating in concurrency
* management if applicable and concurrency management is restored
* after being rebound. See rebind_workers() for details.
*/
worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
do {
struct work_struct *work =
list_first_entry(&pool->worklist,
struct work_struct, entry);
pool->watchdog_ts = jiffies;
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */
process_one_work(worker, work);
if (unlikely(!list_empty(&worker->scheduled)))
process_scheduled_works(worker);
} else {
move_linked_works(work, &worker->scheduled, NULL);
process_scheduled_works(worker);
}
} while (keep_working(pool));
worker_set_flags(worker, WORKER_PREP);
sleep:
/*
* pool->lock is held and there's no work to process and no need to
* manage, sleep. Workers are woken up only while holding
* pool->lock or from local cpu, so setting the current state
* before releasing pool->lock is enough to prevent losing any
* event.
*/
worker_enter_idle(worker);
__set_current_state(TASK_IDLE);
spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
}
Analysis 1:
/* tell the scheduler that this is a workqueue worker */
set_pf_worker(true);
static void set_pf_worker(bool val)
{
mutex_lock(&wq_pool_attach_mutex);
if (val)
current->flags |= PF_WQ_WORKER;
else
current->flags &= ~PF_WQ_WORKER;
mutex_unlock(&wq_pool_attach_mutex);
}
The worker thread function starts by tagging itself with PF_WQ_WORKER.
With this flag, when the scheduler is about to put a task to sleep it can check whether that task is a worker thread. If it is, the scheduler must not simply switch to another task: it also has to find the worker's thread pool and wake up one of its idle workers. Through this interaction between the workqueue code and the scheduler, when work A blocks (the worker processing it goes to sleep), the scheduler wakes another worker to handle work B, work C, and so on.
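For reference, the scheduler-side hook in 4.18 looks roughly like this: when a worker is about to sleep, it decrements the pool's nr_running and, if no running worker is left while work remains, hands back an idle worker for the scheduler to wake:
struct task_struct *wq_worker_sleeping(struct task_struct *task)
{
	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
	struct worker_pool *pool;

	/* rescuers may not have all fields set up; check NOT_RUNNING first */
	if (worker->flags & WORKER_NOT_RUNNING)
		return NULL;

	pool = worker->pool;

	/* this can only happen on the local cpu */
	if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
		return NULL;

	/* counterpart of the smp_mb() + worklist test in insert_work() */
	if (atomic_dec_and_test(&pool->nr_running) &&
	    !list_empty(&pool->worklist))
		to_wakeup = first_idle_worker(pool);
	return to_wakeup ? to_wakeup->task : NULL;
}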
Analysis 2:
woke_up:
spin_lock_irq(&pool->lock);
/* am I supposed to die? */
	if (unlikely(worker->flags & WORKER_DIE)) { /* WORKER_DIE means this worker is about to be destroyed and must not be used */
spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
set_pf_worker(false);
set_task_comm(worker->task, "kworker/dying");
ida_simple_remove(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker);
kfree(worker);
return 0;
}
	worker_leave_idle(worker); /* take the worker out of idle state */
......
/*
* pool->lock is held and there's no work to process and no need to
* manage, sleep. Workers are woken up only while holding
* pool->lock or from local cpu, so setting the current state
* before releasing pool->lock is enough to prevent losing any
* event.
*/
worker_enter_idle(worker);
__set_current_state(TASK_IDLE);
spin_unlock_irq(&pool->lock);
	schedule(); /* nothing to do: give up the CPU; when woken we resume right after schedule() */
	goto woke_up; /* reaching here means we were woken up, either to be destroyed or because work arrived */
Worker destruction is asynchronous: a worker marked for destruction no longer picks up new work, and once the works already assigned to it have been processed, it is destroyed. That is what the check above does: if the DIE flag is set and the worker really has nothing left, the worker destroys itself.
In the normal case, with no destruction flag set, the worker first clears its idle flag to indicate that it is no longer parked.
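The WORKER_DIE flag is set by destroy_worker(), which the pool's idle_timer path invokes on surplus idle workers; it marks the worker and wakes it so that the worker tears itself down in the code above:
static void destroy_worker(struct worker *worker)
{
	struct worker_pool *pool = worker->pool;

	lockdep_assert_held(&pool->lock);

	/* sanity check frenzy */
	if (WARN_ON(worker->current_work) ||
	    WARN_ON(!list_empty(&worker->scheduled)) ||
	    WARN_ON(!(worker->flags & WORKER_IDLE)))
		return;

	pool->nr_workers--;
	pool->nr_idle--;

	list_del_init(&worker->entry);
	worker->flags |= WORKER_DIE;
	wake_up_process(worker->task);
}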
Analysis 3:
recheck:
/* no more worker necessary? */
if (!need_more_worker(pool))
goto sleep;
......
sleep:
/*
* pool->lock is held and there's no work to process and no need to
* manage, sleep. Workers are woken up only while holding
* pool->lock or from local cpu, so setting the current state
* before releasing pool->lock is enough to prevent losing any
* event.
*/
	worker_enter_idle(worker); /* the worker goes idle */
	__set_current_state(TASK_IDLE); /* mark the task as sleeping */
spin_unlock_irq(&pool->lock);
	schedule(); /* schedule away, yielding the cpu */
goto woke_up;
}
How is it decided whether more workers need to run? Two conditions must hold:
(1) There is work to do: the work list of the worker pool must be non-empty; if it is empty, the worker can simply sleep.
/*
* Need to wake up a worker? Called from anything but currently
* running workers.
*
* Note that, because unbound workers never contribute to nr_running, this
* function will always return %true for unbound pools as long as the
* worklist isn't empty.
*/
static bool need_more_worker(struct worker_pool *pool)
{
return !list_empty(&pool->worklist) && __need_more_worker(pool);
}
(2) The pool is busy: nr_running counts the pool's workers that are currently running; nr_running == 0 means all the workers processing works have blocked, so a new worker must start handling the active works on the pool's worklist.
/*
* Policy functions. These define the policies on how the global worker
* pools are managed. Unless noted otherwise, these functions assume that
* they're being called with pool->lock held.
*/
static bool __need_more_worker(struct worker_pool *pool)
{
return !atomic_read(&pool->nr_running);
}
Analysis 4
recheck:
/* no more worker necessary? */
if (!need_more_worker(pool))
goto sleep;
/* do we need to manage? */
if (unlikely(!may_start_working(pool)) && manage_workers(worker))
goto recheck;
may_start_working() checks whether the pool has any idle workers. If not, manage_workers() creates some:
/* Can I start working? Called from busy but !running workers. */
static bool may_start_working(struct worker_pool *pool)
{
return pool->nr_idle;
}
manage_workers() is the function that dynamically manages the creation of workers:
/**
* manage_workers - manage worker pool
* @worker: self
*
* Assume the manager role and manage the worker pool @worker belongs
* to. At any given time, there can be only zero or one manager per
* pool. The exclusion is handled automatically by this function.
*
* The caller can safely start processing works on false return. On
* true return, it's guaranteed that need_to_create_worker() is false
* and may_start_working() is true.
*
* CONTEXT:
* spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations.
*
* Return:
* %false if the pool doesn't need management and the caller can safely
* start processing works, %true if management function was performed and
* the conditions that the caller verified before calling the function may
* no longer be true.
*/
static bool manage_workers(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
if (pool->flags & POOL_MANAGER_ACTIVE)
return false;
	pool->flags |= POOL_MANAGER_ACTIVE; /* mark the pool as being managed */
pool->manager = worker;
	maybe_create_worker(pool); /* make sure the pool has an idle worker */
pool->manager = NULL;
pool->flags &= ~POOL_MANAGER_ACTIVE;
	wake_up(&wq_manager_wait); /* wake up anyone waiting for the manager to finish */
return true;
}
In maybe_create_worker(), the while loop calls create_worker() to create a new worker thread:
/**
* maybe_create_worker - create a new worker if necessary
* @pool: pool to create a new worker for
*
* Create a new worker for @pool if necessary. @pool is guaranteed to
* have at least one idle worker on return from this function. If
* creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
* sent to all rescuers with works scheduled on @pool to resolve
* possible allocation deadlock.
*
* On return, need_to_create_worker() is guaranteed to be %false and
* may_start_working() %true.
*
* LOCKING:
* spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
*/
static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
restart:
spin_unlock_irq(&pool->lock);
/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
	/* arm mayday_timer: if creating a worker takes too long, wake the rescuer workers of the workqueues with works on this pool */
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
while (true) {
		/* exit the loop once create_worker() succeeds, or when need_to_create_worker() says no new worker is needed */
if (create_worker(pool) || !need_to_create_worker(pool))
break;
schedule_timeout_interruptible(CREATE_COOLDOWN);
		/* check again whether we still need to create a worker; if not, stop */
if (!need_to_create_worker(pool))
break;
}
	del_timer_sync(&pool->mayday_timer); /* a worker was created in time: remove the timer armed above */
spin_lock_irq(&pool->lock);
/*
* This is necessary even after a new worker was just successfully
* created as @pool->lock was dropped and the new worker might have
* already become busy.
	 * Check once more whether another worker is needed.
*/
if (need_to_create_worker(pool))
goto restart;
}
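need_to_create_worker(), used as the loop condition above, just combines the two predicates we have already seen: more workers are needed, yet there is no idle worker to pick the work up:
/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
	return need_more_worker(pool) && !may_start_working(pool);
}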
Analysis 5: the worker starts processing works
/*
* Finish PREP stage. We're guaranteed to have at least one idle
* worker or that someone else has already assumed the manager
* role. This is where @worker starts participating in concurrency
* management if applicable and concurrency management is restored
* after being rebound. See rebind_workers() for details.
*/
worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
	do {
		/* take the first work on the pool's worklist */
		struct work_struct *work =
			list_first_entry(&pool->worklist,
					 struct work_struct, entry);
pool->watchdog_ts = jiffies;
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */
			process_one_work(worker, work); /* process a single work */
			if (unlikely(!list_empty(&worker->scheduled)))
				process_scheduled_works(worker); /* process the works on worker->scheduled */
} else {
			/* WORK_STRUCT_LINKED means further works are chained after this one; move the whole series to worker->scheduled, then process them together with process_one_work() */
move_linked_works(work, &worker->scheduled, NULL);
process_scheduled_works(worker);
}
} while (keep_working(pool));
worker_set_flags(worker, WORKER_PREP);
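The do/while loop keeps running as long as keep_working() says so: there is still work on the list and at most one worker of this pool is running, so running on would not oversubscribe the CPU:
/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) &&
		atomic_read(&pool->nr_running) <= 1;
}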
In principle, processing works should be simple: take one from the pool's worklist and call process_one_work() on it. Reality is slightly more complicated, because works are not always independent: work A and work B may be linked works, and linked works must be handled by one worker. WORK_STRUCT_LINKED marks a work as part of a linked series; the worker does not process such a work directly but moves the series onto its scheduled list and calls process_scheduled_works(), which of course also feeds each scheduled work to process_one_work().
The scheduled list is not only used for linked works. There is an invariant to maintain while workers process works: the same work must not be executed concurrently by multiple workers on one CPU. If a worker finds that the work it is about to process is already being handled by another worker thread, it does not process the work itself; it merely moves it onto the scheduled list of the worker that is executing it.
move_linked_works() migrates these works to a scheduled list so they can then be processed one by one with process_one_work():
/**
* move_linked_works - move linked works to a list
* @work: start of series of works to be scheduled
* @head: target list to append @work to
* @nextp: out parameter for nested worklist walking
*
* Schedule linked works starting from @work to @head. Work series to
* be scheduled starts at @work and includes any consecutive work with
* WORK_STRUCT_LINKED set in its predecessor.
*
* If @nextp is not NULL, it's updated to point to the next work of
* the last scheduled work. This allows move_linked_works() to be
* nested inside outer list_for_each_entry_safe().
*
* CONTEXT:
* spin_lock_irq(pool->lock).
*/
static void move_linked_works(struct work_struct *work, struct list_head *head,
struct work_struct **nextp)
{
struct work_struct *n;
/*
* Linked worklist will always end before the end of the list,
* use NULL for list head.
*/
list_for_each_entry_safe_from(work, n, NULL, entry) {
list_move_tail(&work->entry, head);
if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
break;
}
/*
* If we're already inside safe list traversal and have moved
* multiple works to the scheduled queue, the next position
* needs to be updated.
*/
if (nextp)
*nextp = n;
}
/**
* process_scheduled_works - process scheduled works
* @worker: self
*
* Process all scheduled works. Please note that the scheduled list
* may change while processing a work, so this function repeatedly
* fetches a work from the top and executes it.
*
* CONTEXT:
* spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times.
*/
static void process_scheduled_works(struct worker *worker)
{
while (!list_empty(&worker->scheduled)) {
struct work_struct *work = list_first_entry(&worker->scheduled,
struct work_struct, entry);
process_one_work(worker, work);
}
}
Processing a single work:
/**
* process_one_work - process single work
* @worker: self
* @work: work to process
*
* Process @work. This function contains all the logics necessary to
* process a single work including synchronization against and
* interaction with other workers on the same cpu, queueing and
* flushing. As long as context requirement is met, any worker can
* call this function to process a work.
*
* CONTEXT:
* spin_lock_irq(pool->lock) which is released and regrabbed.
*/
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
	bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; /* a CPU_INTENSIVE workqueue gets special treatment for its worker */
int work_color;
struct worker *collision;
#ifdef CONFIG_LOCKDEP
/*
* It is permissible to free the struct work_struct from
* inside the function that is called from it, this we need to
* take into account for lockdep too. To avoid bogus "held
* lock freed" warnings as well as problems when looking into
* work->lockdep_map, make a copy and use that here.
*/
struct lockdep_map lockdep_map;
lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
/* ensure we're on the correct CPU */
WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
raw_smp_processor_id() != pool->cpu);
/*
* A single work shouldn't be executed concurrently by
* multiple workers on a single cpu. Check whether anyone is
* already processing the work. If so, defer the work to the
* currently executing one.
*/
	/* is this work already running in the pool's busy_hash? If so, move it onto the executing worker's scheduled list and bail out */
collision = find_worker_executing_work(pool, work);
if (unlikely(collision)) {
move_linked_works(work, &collision->scheduled, NULL);
return;
}
/* claim and dequeue, */
debug_work_deactivate(work);
	hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); /* add the worker to the busy hash pool->busy_hash */
	worker->current_work = work; /* record this work as the worker's current work */
worker->current_func = work->func;
worker->current_pwq = pwq;
work_color = get_work_color(work);
/*
* Record wq name for cmdline and debug reporting, may get
* overridden through set_worker_desc().
*/
strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
	list_del_init(&work->entry); /* a work removes itself from its list each time it is executed */
/*
* CPU intensive works don't participate in concurrency management.
* They're the scheduler's responsibility. This takes @worker out
* of concurrency management and the next code block will chain
* execution of the pending work items.
*/
	if (unlikely(cpu_intensive)) /* mark the worker CPU-intensive */
worker_set_flags(worker, WORKER_CPU_INTENSIVE);
/*
* Wake up another worker if necessary. The condition is always
* false for normal per-cpu workers since nr_running would always
* be >= 1 at this point. This is used to chain execution of the
* pending work items for WORKER_NOT_RUNNING workers such as the
* UNBOUND and CPU_INTENSIVE ones.
*/
	/* wake another worker if needed; wake_up_worker() wakes the pool's first idle worker. For bound pools nr_running >= 1 here, so this is normally false */
if (need_more_worker(pool))
wake_up_worker(pool);
/*
* Record the last pool and clear PENDING which should be the last
* update to @work. Also, do this inside @pool->lock so that
* PENDING and queued state changes happen together while IRQ is
* disabled.
*/
	/* clear the PENDING bit in work->data and record the last pool; an smp_wmb inside ensures the writes before clearing PENDING are visible first */
set_work_pool_and_clear_pending(work, pool->id);
spin_unlock_irq(&pool->lock);
lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
/*
* Strictly speaking we should mark the invariant state without holding
* any locks, that is, before these two lock_map_acquire()'s.
*
* However, that would result in:
*
* A(W1)
* WFC(C)
* A(W1)
* C(C)
*
* Which would create W1->C->W1 dependencies, even though there is no
* actual deadlock possible. There are two solutions, using a
* read-recursive acquire on the work(queue) 'locks', but this will then
* hit the lockdep limitation on recursive locks, or simply discard
* these locks.
*
* AFAICT there is no possible deadlock scenario between the
* flush_work() and complete() primitives (except for single-threaded
* workqueues), so hiding them isn't a problem.
*/
lockdep_invariant_state(true);
trace_workqueue_execute_start(work);
	worker->current_func(work); /* finally invoke the work's callback */
/*
* While we must be careful to not use "work" after this, the trace
* point will only record its address.
*/
trace_workqueue_execute_end(work);
lock_map_release(&lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
" last function: %pf\n",
current->comm, preempt_count(), task_pid_nr(current),
worker->current_func);
debug_show_held_locks(current);
dump_stack();
}
/*
* The following prevents a kworker from hogging CPU on !PREEMPT
* kernels, where a requeueing work item waiting for something to
* happen could deadlock with stop_machine as such work item could
* indefinitely requeue itself while all other CPUs are trapped in
* stop_machine. At the same time, report a quiescent RCU state so
* the same condition doesn't freeze RCU.
*/
cond_resched();
spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
if (unlikely(cpu_intensive))
worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
/* we're done with it, release */
	hash_del(&worker->hentry); /* clean up after the work's callback has finished */
worker->current_work = NULL;
worker->current_func = NULL;
worker->current_pwq = NULL;
pwq_dec_nr_in_flight(pwq, work_color);
}
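pwq_dec_nr_in_flight() is where the bookkeeping from section 6 is unwound: nr_active is decremented, the first delayed work (if any) is activated now that a slot is free, flush progress is updated, and the pwq reference taken in insert_work() is dropped. In 4.18 it looks roughly like this:
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
{
	/* uncolored, nothing to do */
	if (color == WORK_NO_COLOR)
		goto out_put;

	pwq->nr_in_flight[color]--;

	pwq->nr_active--;
	if (!list_empty(&pwq->delayed_works)) {
		/* one down, submit a delayed one */
		if (pwq->nr_active < pwq->max_active)
			pwq_activate_first_delayed(pwq);
	}

	/* is flush in progress and are we at the flushing tip? */
	if (likely(pwq->flush_color != color))
		goto out_put;

	/* are there still in-flight works? */
	if (pwq->nr_in_flight[color])
		goto out_put;

	/* this pwq is done, clear flush_color */
	pwq->flush_color = -1;

	/*
	 * If this was the last pwq, wake up the first flusher.  It
	 * will handle the rest.
	 */
	if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
		complete(&pwq->wq->first_flusher->done);

out_put:
	put_pwq(pwq);
}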