The basic idea of a workqueue is to hand work over to a kernel thread for execution. Workqueues run in process context, so a work item is allowed to reschedule and sleep; it is asynchronously executing process context (I don't quite understand why it is described as "asynchronously executing process context").
When a driver or kernel subsystem has a task to run asynchronously in process context, it describes that task with a work item. The work item is added to a queue (the workqueue), and a kernel thread then executes the callbacks of these work items. That kernel thread is called a worker.
Early workqueues came in multi-threaded and single-threaded variants. Tasks to be executed were put on a queue, and kernel threads took them off the queue and ran their callbacks. The early implementation had the following drawbacks:
1. Poor concurrency: worker threads were bound to a CPU. Say cpu0 has three pending works A, B and C; if the worker thread sleeps while executing A, then B and C can only wait and cannot migrate to another CPU.
The later solution, concurrency-managed workqueues (CMWQ), introduced the worker pool (worker-pool). There are two kinds of worker-pool: the BOUND type, which can be understood as per-CPU (every CPU has its own worker-pools), and the UNBOUND type, which is not bound to any specific CPU. For both kinds (bound and unbound), two pools are defined: one serving normal-priority work and the other serving high-priority work (so worker-pools also come in two priorities, normal and high). All worker threads of the same priority on one CPU together form a worker-pool, so each CPU has two worker-pools. In the current design a CPU corresponds to two pwqs, and accordingly to two worker-pools, one serving each of the two queues.
The system_wq in the figure above corresponds to a workqueue. A workqueue can have the same-priority worker-pool on a CPU execute its work (when a workqueue is created, it is linked to the pwq on every CPU). When we queue a work onto a workqueue (e.g. system_wq), we first find the pwq through the workqueue, then find the worker-pool through the pwq, and finally hang the work on the worker-pool's list. When a worker thread wakes up, it finds the worker-pool it belongs to and executes the works on the pool's list one by one (process_one_work).
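To make the driver-facing side of this concrete, here is a minimal hedged sketch of how a work item is typically used (my_work, my_work_fn and my_submit are invented names for illustration; INIT_WORK, schedule_work and system_wq are the interfaces discussed in this note):
#include <linux/workqueue.h>
#include <linux/printk.h>
static struct work_struct my_work;	/* hypothetical work item */
/* runs later in process context, in a kworker thread of a worker-pool */
static void my_work_fn(struct work_struct *work)
{
	pr_info("my_work executed\n");	/* may also sleep or take mutexes here */
}
static void my_submit(void)
{
	INIT_WORK(&my_work, my_work_fn);
	schedule_work(&my_work);	/* queue onto system_wq; returns immediately */
}
The caller returns right away and the callback runs later in a kworker thread, which is exactly the "asynchronously executing process context" mentioned above.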
The smallest scheduling unit in the workqueue mechanism is the work item:
struct work_struct {
atomic_long_t data;
struct list_head entry;//used to hang the work on a queue
work_func_t func;//the callback function
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
};
A work item runs in a kernel thread; such a kernel thread is called a worker:
struct worker {
/* on idle list while idle, on busy hash table while busy */
union {
struct list_head entry; /* L: while idle */
struct hlist_node hentry; /* L: while busy */
};
/* work currently being processed */
struct work_struct *current_work; /* L: work being processed */
/* callback of the work currently being executed */
work_func_t current_func; /* L: current_work's fn */
/* pool_workqueue of the current work */
struct pool_workqueue *current_pwq; /* L: current_work's pwq */
bool desc_valid; /* ->desc is valid */
/* all works scheduled and ready to be executed are hung on this list */
struct list_head scheduled; /* L: scheduled works */
/* 64 bytes boundary on 64bit, 32 on 32bit */
/* task_struct of this worker thread */
struct task_struct *task; /* I: worker task */
/* worker-pool this worker thread belongs to */
struct worker_pool *pool; /* I: the associated pool */
/* L: for rescuers */
unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */
/* id of the worker thread */
int id; /* I: worker id */
/*
* Opaque string set with work_set_desc(). Printed out with task
* dump for debugging - WARN, BUG, panic or sysrq.
*/
char desc[WORKER_DESC_LEN];
/* used only by rescuers to point to the target workqueue */
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
};
struct worker_pool {
/* spinlock protecting this worker-pool */
spinlock_t lock; /* the pool lock */
/* for a BOUND pool, the CPU id it is bound to; -1 for UNBOUND */
int cpu; /* I: the associated cpu */
/* for an UNBOUND pool, the id of the memory node this worker-pool belongs to */
int node; /* I: the associated node ID */
/* id of this worker-pool */
int id; /* I: pool ID */
unsigned int flags; /* X: flags */
/* pending works are hung on this list */
struct list_head worklist; /* L: list of pending works */
/* total number of worker threads */
int nr_workers; /* L: total number of workers */
/* nr_idle includes the ones off idle_list for rebinding */
/* number of worker threads in the idle state */
int nr_idle; /* L: currently idle ones */
struct list_head idle_list; /* X: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
struct timer_list mayday_timer; /* L: SOS timer for workers */
/* a workers is either on busy_hash or idle_list, or the manager */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
/* see manage_workers() for details on the two manager mutexes */
struct mutex manager_arb; /* manager arbitration */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
struct completion *detach_completion; /* all workers detached */
struct ida worker_ida; /* worker IDs for task name */
struct workqueue_attrs *attrs; /* I: worker attributes */
struct hlist_node hash_node; /* PL: unbound_pool_hash node */
int refcnt; /* PL: refcnt for unbound pools */
/*
* The current concurrency level. As it's likely to be accessed
* from other CPUs during try_to_wake_up(), put it in a separate
* cacheline.
*/
atomic_t nr_running ____cacheline_aligned_in_smp;
/*
* Destruction of pool is sched-RCU protected to allow dereferences
* from get_work_pool().
*/
struct rcu_head rcu;
} ____cacheline_aligned_in_smp;
Worker-pools are per-CPU: each CPU has two worker-pools, one for normal-priority and one for high-priority worker threads.
CMWQ also defines pool_workqueue, the hub that connects a workqueue (workqueue_struct) to a worker-pool.
It is declared __aligned(1 << WORK_STRUCT_FLAG_BITS); with WORK_STRUCT_FLAG_BITS = 8, the structure is aligned to 1 << 8 = 256 bytes.
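A side note on why this alignment matters (my own illustration in plain user-space C, not kernel code): a 256-byte-aligned pool_workqueue address always has its low 8 bits equal to zero, so work->data can pack the pwq pointer and the WORK_STRUCT_* flag bits into one word:
#include <stdio.h>
#include <stdint.h>
#define FLAG_BITS 8
#define FLAG_MASK ((1UL << FLAG_BITS) - 1)	/* analogue of WORK_STRUCT_FLAG_MASK */
int main(void)
{
	uintptr_t pwq = 0x1000;		/* pretend: address of a 256-byte-aligned pwq */
	uintptr_t flags = 0x3;		/* pretend: a couple of WORK_STRUCT_* flag bits */
	uintptr_t data = pwq | flags;	/* both fit into a single word */
	printf("pwq   = %#lx\n", (unsigned long)(data & ~FLAG_MASK));
	printf("flags = %#lx\n", (unsigned long)(data & FLAG_MASK));
	return 0;
}
This is the same trick get_work_pwq()/get_work_pool() rely on when they recover the pwq or pool from work->data.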
The worklist inside the worker-pool holds the pending work items; struct list_head workers should be the list of kernel threads (workers) attached to the pool.
struct pool_workqueue {
/* pointer to the associated worker-pool */
struct worker_pool *pool; /* I: the associated pool */
/* the owning workqueue */
struct workqueue_struct *wq; /* I: the owning workqueue */
int work_color; /* L: current color */
int flush_color; /* L: flushing color */
int refcnt; /* L: reference count */
int nr_in_flight[WORK_NR_COLORS];
/* L: nr of in_flight works */
int nr_active; /* L: nr of active works */
int max_active; /* L: max active works */
struct list_head delayed_works; /* L: delayed works */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
/*
* Release of unbound pwq is punted to system_wq. See put_pwq()
* and pwq_unbound_release_workfn() for details. pool_workqueue
* itself is also sched-RCU protected so that the first pwq can be
* determined without grabbing wq->mutex.
*/
struct work_struct unbound_release_work;
struct rcu_head rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);
struct pool_workqueue connects a workqueue to a worker-pool.
workqueue_struct is the workqueue itself.
struct workqueue_struct {
/* all pool_workqueues of this workqueue are hung on this list */
struct list_head pwqs; /* WR: all pwqs of this wq */
/* node on the global list 'workqueues' that links every workqueue in the system */
struct list_head list; /* PL: list of all workqueues */
struct mutex mutex; /* protects this wq */
int work_color; /* WQ: current work color */
int flush_color; /* WQ: current flush color */
atomic_t nr_pwqs_to_flush; /* flush in progress */
struct wq_flusher *first_flusher; /* WQ: first flusher */
struct list_head flusher_queue; /* WQ: flush waiters */
struct list_head flusher_overflow; /* WQ: flush overflow list */
/* all pool_workqueues requesting rescue are hung on this list */
struct list_head maydays; /* MD: pwqs requesting rescue */
/*
The rescuer kernel thread. Under memory pressure, creating a new worker thread
may fail; if the workqueue was created with WQ_MEM_RECLAIM, the rescuer kernel
thread takes over in that situation.
*/
struct worker *rescuer; /* I: rescue worker */
int nr_drainers; /* WQ: drain in progress */
int saved_max_active; /* WQ: saved pwq max_active */
/* attributes for UNBOUND workqueues */
struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */
/* points to the default pool_workqueue for UNBOUND workqueues */
struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */
#ifdef CONFIG_SYSFS
struct wq_device *wq_dev; /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
/* points to the per-CPU pool_workqueues */
struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
};
All workqueues in the system, including the default ones such as system_wq and system_highpri_wq as well as workqueues newly created by drivers, share the same set of worker-pools. For BOUND workqueues, each CPU has only two worker-pools; one worker-pool can serve multiple workqueues, and one workqueue maps to only a few worker-pools.
A work item is queued onto a workqueue, but in the end its callback is processed by a worker thread from a worker-pool, and the worker-pools are shared by the whole system.
This comes from the book 《奔跑吧Linux内核》 (Run Linux Kernel); the "workqueue" there corresponds to workqueue_struct.
1. All workqueues in the system share one set of worker-pools. Combined with the figure above, my understanding is that system_wq may pick the worker-pool on cpu0 or a worker from cpu1's worker-pool to execute its work; that is, a workqueue can be served by the worker threads of any worker-pool of the same priority (to be confirmed later in the code; the "same priority" part looks right, see alloc_and_link_pwqs for the reason).
A work item gets queued onto a workqueue; the workqueue then has to find a suitable worker-pool, and the worker-pool provides a suitable worker thread. pool_workqueue acts as the link between the worker-pool and the workqueue.
The relationship between these data structures seems to be: each CPU has two worker-pools, and each pool has a corresponding pwq. A work on a workqueue can only reach a worker-pool through a pwq; the worker-pool in turn contains the worker threads and the work items.
Each CPU has two worker_pools because NR_STD_WORKER_POOLS = 2:
/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
cpu_worker_pools);
#define for_each_cpu_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
(pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
(pool)++)
static int __init init_workqueues(void)
{
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
int i, cpu;
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
/*
Create a slab cache for pool_workqueue structures.
(What exactly is this used for?)
*/
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
wq_numa_init();
/* initialize CPU pools */
/* create worker_pools for every possible CPU in the system */
for_each_possible_cpu(cpu) {
struct worker_pool *pool;
i = 0;
/* each CPU has a worker_pool for each of the two priorities */
for_each_cpu_worker_pool(pool, cpu) {
BUG_ON(init_worker_pool(pool));
pool->cpu = cpu;
cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
pool->attrs->nice = std_nice[i++];//0 and HIGHPRI_NICE_LEVEL
pool->node = cpu_to_node(cpu);
/* alloc pool ID */
mutex_lock(&wq_pool_mutex);
BUG_ON(worker_pool_assign_id(pool));
mutex_unlock(&wq_pool_mutex);
}
}
/* create the initial worker */
/*
Create one worker thread for each worker_pool of every online CPU.
From the code it looks like each worker_pool starts with only one worker??
*/
for_each_online_cpu(cpu) {
struct worker_pool *pool;
for_each_cpu_worker_pool(pool, cpu) {
pool->flags &= ~POOL_DISASSOCIATED;
BUG_ON(create_and_start_worker(pool) < 0);
}
}
/* create default unbound and ordered wq attrs */
/*
Create the default attributes for unbound and ordered workqueues.
An ordered workqueue allows only one work item to run at a time.
*/
for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
struct workqueue_attrs *attrs;
BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
attrs->nice = std_nice[i];
unbound_std_wq_attrs[i] = attrs;
/*
* An ordered wq should have only one pwq as ordering is
* guaranteed by max_active which is enforced by pwqs.
* Turn off NUMA so that dfl_pwq is used for all nodes.
*/
BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
attrs->nice = std_nice[i];
attrs->no_numa = true;
ordered_wq_attrs[i] = attrs;
}
/*
Create the system default workqueues:
normal priority, high priority, unbound, freezable, ...
*/
system_wq = alloc_workqueue("events", 0, 0);
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
WQ_UNBOUND_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
WQ_POWER_EFFICIENT, 0);
system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
WQ_FREEZABLE | WQ_POWER_EFFICIENT,
0);
BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
!system_unbound_wq || !system_freezable_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq);
return 0;
}
From the init code I don't really see worker_pools being split into bound and unbound; or rather, I only see bound worker_pools being created (bound meaning tied to a CPU). It is the workqueues that come as unbound (system_unbound_wq) and bound (system_wq). But if bound/unbound is a property of the workqueue, doesn't the figure above say that an unbound workqueue gets a worker_pool allocated per memory node? I don't see that kind of worker-pool being initialized here either.
Creating worker threads
init_workqueues->create_and_start_worker
The code below creates one worker thread for each worker-pool of each CPU.
for_each_online_cpu(cpu) {
struct worker_pool *pool;
for_each_cpu_worker_pool(pool, cpu) {
pool->flags &= ~POOL_DISASSOCIATED;
BUG_ON(create_and_start_worker(pool) < 0);
}
}
static int create_and_start_worker(struct worker_pool *pool)
{
struct worker *worker;
worker = create_worker(pool);
if (worker) {
spin_lock_irq(&pool->lock);
start_worker(worker);
spin_unlock_irq(&pool->lock);
}
return worker ? 0 : -ENOMEM;
}
create_worker creates a kernel thread whose thread function is worker_thread:
static struct worker *create_worker(struct worker_pool *pool)
{
struct worker *worker = NULL;
int id = -1;
char id_buf[16];
/* ID is needed to determine kthread name */
id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
if (id < 0)
goto fail;
/* first allocate a worker object from the slab */
worker = alloc_worker();
if (!worker)
goto fail;
worker->pool = pool;
worker->id = id;
/*
A bound worker thread is named kworker/<cpuid>:<workerid>,
with an 'H' suffix if it belongs to the high-priority pool.
*/
if (pool->cpu >= 0)
snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
pool->attrs->nice < 0 ? "H" : "");
else/* unbound: kworker/u<poolid>:<workerid> */
snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
/* create a kernel thread on the local memory node; mainly allocates the task_struct */
worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
"kworker/%s", id_buf);
if (IS_ERR(worker->task))
goto fail;
set_user_nice(worker->task, pool->attrs->nice);
/* prevent userland from meddling with cpumask of workqueue workers */
worker->task->flags |= PF_NO_SETAFFINITY;
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
return worker;
fail:
if (id >= 0)
ida_simple_remove(&pool->worker_ida, id);
kfree(worker);
return NULL;
}
Back in create_and_start_worker: once the worker has been created successfully, it is put into the idle state and then woken up.
static void start_worker(struct worker *worker)
{
worker->pool->nr_workers++;
worker_enter_idle(worker);
wake_up_process(worker->task);
}
worker_enter_idle puts the worker thread into the idle state. Each worker-pool has a timer that is used to dynamically destroy excess idle worker threads.
static void worker_enter_idle(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
WARN_ON_ONCE(!list_empty(&worker->entry) &&
(worker->hentry.next || worker->hentry.pprev)))
return;
/* can't use worker_set_flags(), also called from start_worker() */
worker->flags |= WORKER_IDLE;
pool->nr_idle++;
worker->last_active = jiffies;
/* idle_list is LIFO */
list_add(&worker->entry, &pool->idle_list);
if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/*
* Sanity check nr_running. Because wq_unbind_fn() releases
* pool->lock between setting %WORKER_UNBOUND and zapping
* nr_running, the warning may trigger spuriously. Check iff
* unbind is not in progress.
*/
WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
pool->nr_workers == pool->nr_idle &&
atomic_read(&pool->nr_running));
}
pool->idle_timer.function = idle_worker_timeout;
static void idle_worker_timeout(unsigned long __pool)
{
struct worker_pool *pool = (void *)__pool;
spin_lock_irq(&pool->lock);
while (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
/* idle_list is kept in LIFO order, check the last one */
worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
if (time_before(jiffies, expires)) {
mod_timer(&pool->idle_timer, expires);
break;
}
destroy_worker(worker);
}
spin_unlock_irq(&pool->lock);
}
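For reference, too_many_workers(), which both worker_enter_idle() and the timer callback above use to decide whether idle workers should be trimmed, roughly looks like this in kernels of this era (quoted from memory, so treat the exact ratio as approximate):
static bool too_many_workers(struct worker_pool *pool)
{
	bool managing = mutex_is_locked(&pool->manager_arb);
	int nr_idle = pool->nr_idle + managing; /* the manager counts as idle */
	int nr_busy = pool->nr_workers - nr_idle;
	/* keep a couple of idle workers; beyond that, idle count must stay a fraction of busy count */
	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO < nr_busy;
}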
Creating a workqueue
#define alloc_ordered_workqueue(fmt, flags, args...) \
alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)
#define create_workqueue(name) \
alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, (name))
#define create_freezable_workqueue(name) \
alloc_workqueue("%s", WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, \
1, (name))
#define create_singlethread_workqueue(name) \
alloc_workqueue("%s", WQ_UNBOUND | WQ_MEM_RECLAIM, 1, (name))
#define alloc_workqueue(fmt, flags, max_active, args...) \
__alloc_workqueue_key((fmt), (flags), (max_active), \
NULL, NULL, ##args)
WQ_UNBOUND: work items are queued onto an unbound workqueue, which is not bound to any specific CPU. Such work needs no extra concurrency management; the unbound worker-pool tries to execute its work as soon as possible (from this sentence it sounds as if worker-pools also come in bound and unbound flavors??).
WQ_FREEZABLE: this kind of workqueue takes part in system suspend. The worker threads finish all of the currently queued work before the freeze completes, and no new work starts executing until the threads are thawed.
WQ_MEM_RECLAIM: under memory pressure, creating a new worker thread may fail; a rescuer kernel thread in the system takes over in that case.
max_active: how many work items of this workqueue may be active at the same time on each CPU (a usage sketch follows below).
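Putting the flags together: a driver that must make progress during memory reclaim might create its own workqueue roughly like this (a hedged sketch; my_wq, my_reclaim_work and my_reclaim_fn are invented names, the APIs are the ones listed above):
#include <linux/workqueue.h>
static struct workqueue_struct *my_wq;
static struct work_struct my_reclaim_work;
static void my_reclaim_fn(struct work_struct *work)
{
	/* write back / free caches; may sleep */
}
static int my_init(void)
{
	/* a rescuer-backed, CPU-bound queue; at most 1 work of this wq active per CPU */
	my_wq = alloc_workqueue("my_wq", WQ_MEM_RECLAIM, 1);
	if (!my_wq)
		return -ENOMEM;
	INIT_WORK(&my_reclaim_work, my_reclaim_fn);
	queue_work(my_wq, &my_reclaim_work);	/* handled by the local CPU's normal-priority pool */
	return 0;
}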
__alloc_workqueue_key creates a workqueue_struct and initializes it:
struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
unsigned int flags,
int max_active,
struct lock_class_key *key,
const char *lock_name, ...)
{
size_t tbl_size = 0;
va_list args;
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
flags |= WQ_UNBOUND;
/* allocate wq and format name */
if (flags & WQ_UNBOUND)
tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
if (!wq)
return NULL;
if (flags & WQ_UNBOUND) {
wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
if (!wq->unbound_attrs)
goto err_free_wq;
}
va_start(args, lock_name);
vsnprintf(wq->name, sizeof(wq->name), fmt, args);
va_end(args);
max_active = max_active ?: WQ_DFL_ACTIVE;
max_active = wq_clamp_max_active(max_active, flags, wq->name);
/* init wq */
wq->flags = flags;
wq->saved_max_active = max_active;
mutex_init(&wq->mutex);
atomic_set(&wq->nr_pwqs_to_flush, 0);
INIT_LIST_HEAD(&wq->pwqs);
INIT_LIST_HEAD(&wq->flusher_queue);
INIT_LIST_HEAD(&wq->flusher_overflow);
INIT_LIST_HEAD(&wq->maydays);
lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
INIT_LIST_HEAD(&wq->list);
if (alloc_and_link_pwqs(wq) < 0)
goto err_free_wq;
/*
* Workqueues which may be used during memory reclaim should
* have a rescuer to guarantee forward progress.
*/
if (flags & WQ_MEM_RECLAIM) {
struct worker *rescuer;
rescuer = alloc_worker();
if (!rescuer)
goto err_destroy;
rescuer->rescue_wq = wq;
rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
wq->name);
if (IS_ERR(rescuer->task)) {
kfree(rescuer);
goto err_destroy;
}
wq->rescuer = rescuer;
rescuer->task->flags |= PF_NO_SETAFFINITY;
wake_up_process(rescuer->task);
}
if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
goto err_destroy;
/*
* wq_pool_mutex protects global freeze state and workqueues list.
* Grab it, adjust max_active and add the new @wq to workqueues
* list.
*/
mutex_lock(&wq_pool_mutex);
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq)
pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
list_add(&wq->list, &workqueues);
mutex_unlock(&wq_pool_mutex);
return wq;
err_free_wq:
free_workqueue_attrs(wq->unbound_attrs);
kfree(wq);
return NULL;
err_destroy:
destroy_workqueue(wq);
return NULL;
}
About the "bound" in workqueue creation, the book explains it like this:
A bound workqueue is per-CPU; its work does not migrate from one CPU to another, and it does not want to be disturbed by the process scheduler. For a workqueue marked unbound, which CPU the work is woken up on is left to the process scheduler. Per-CPU workqueues can wake an idle CPU out of the idle state and therefore increase power consumption.
1. "A bound workqueue is per-CPU." Is the workqueue itself really per-CPU?? I only see that wq->cpu_pwqs is.
2. "For an unbound workqueue, which CPU it is woken up on is decided by the scheduler." A workqueue is neither a process nor a thread, so how can it be "woken up"?? Or is it the CPU (or rather the worker) that gets woken up??
alloc_and_link_pwqs: this seems to associate the pwqs with the workqueue_struct.
For a bound workqueue_struct, alloc_percpu(struct pool_workqueue) allocates one pwq per CPU (doesn't the book say each CPU has two worker-pools and two pwqs? Why is only one allocated here?? Ah, right: alloc_workqueue is called twice, once for normal priority and once for WQ_HIGHPRI, so each CPU ends up with two dynamically allocated pwqs, which matches);
system_wq = alloc_workqueue("events", 0, 0);
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
cpu_worker_pools is a statically defined per-CPU worker-pool; it looks like an array, and there should be two worker-pools in it, which matches the book. init_pwq and link_pwq then associate the pwq with the wq.
Also note that __alloc_workqueue_key allocates a single wq, but alloc_and_link_pwqs links it to the same-priority pwq on every CPU.
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
bool highpri = wq->flags & WQ_HIGHPRI;
int cpu, ret;
if (!(wq->flags & WQ_UNBOUND)) {
/* allocate one pwq for each CPU */
wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
if (!wq->cpu_pwqs)
return -ENOMEM;
for_each_possible_cpu(cpu) {
/* get this CPU's pwq of the workqueue */
struct pool_workqueue *pwq =
per_cpu_ptr(wq->cpu_pwqs, cpu);
/* cpu_worker_pools is the statically defined per-CPU worker-pool array */
struct worker_pool *cpu_pools =
per_cpu(cpu_worker_pools, cpu);
init_pwq(pwq, wq, &cpu_pools[highpri]);
mutex_lock(&wq->mutex);
link_pwq(pwq);
mutex_unlock(&wq->mutex);
}
return 0;
} else if (wq->flags & __WQ_ORDERED) {
.....................
} else {
........................
}
}
struct workqueue_struct {
..................................
struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
..................................
};
cpu_pwqs is also a per-CPU pointer; alloc_percpu() allocates one pool_workqueue structure for each CPU.
1. Each CPU has two worker-pools, and each worker-pool is reached through a pwq hub.
2. Each workqueue_struct has a member cpu_pwqs, which is a per-CPU pointer.
Assuming a bound workqueue: every time a workqueue is created, wq->cpu_pwqs is associated with cpu_pools[highpri] on every CPU. So a normal-priority workqueue, e.g. system_wq, is associated with cpu_pools[0] on all CPUs, which presumably is how its work can be executed on any CPU??
struct pool_workqueue *pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu);
init_pwq points wq->cpu_pwqs at the worker-pool:
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
struct worker_pool *pool)
{
BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
memset(pwq, 0, sizeof(*pwq));
pwq->pool = pool;
pwq->wq = wq;
pwq->flush_color = -1;
pwq->refcnt = 1;
INIT_LIST_HEAD(&pwq->delayed_works);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
}
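link_pwq(), which alloc_and_link_pwqs() calls right after init_pwq(), is not quoted in this note. From memory it does roughly the following (a sketch, details may differ slightly between kernel versions): it syncs the pwq's work_color and max_active with the wq and then links the pwq onto wq->pwqs:
static void link_pwq(struct pool_workqueue *pwq)
{
	struct workqueue_struct *wq = pwq->wq;
	lockdep_assert_held(&wq->mutex);
	/* may be called multiple times, ignore if already linked */
	if (!list_empty(&pwq->pwqs_node))
		return;
	/* set the matching work_color and sync max_active */
	pwq->work_color = wq->work_color;
	pwq_adjust_max_active(pwq);
	/* link in @pwq */
	list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
}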
For unbound/ordered workqueues, apply_workqueue_attrs is used instead:
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
bool highpri = wq->flags & WQ_HIGHPRI;
int cpu, ret;
if (!(wq->flags & WQ_UNBOUND)) {
............................
} else if (wq->flags & __WQ_ORDERED) {
ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
return ret;
} else {
return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
}
}
It seems that non-bound workqueues are allocated per node?? Just like the figure above: memory node 0, memory node 1.
int apply_workqueue_attrs(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
struct workqueue_attrs *new_attrs, *tmp_attrs;
struct pool_workqueue **pwq_tbl, *dfl_pwq;
int node, ret;
/* only unbound workqueues can change attributes */
if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
return -EINVAL;
/* creating multiple pwqs breaks ordering guarantee */
if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
return -EINVAL;
pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
if (!pwq_tbl || !new_attrs || !tmp_attrs)
goto enomem;
/* make a copy of @attrs and sanitize it */
copy_workqueue_attrs(new_attrs, attrs);
cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
/*
* We may create multiple pwqs with differing cpumasks. Make a
* copy of @new_attrs which will be modified and used to obtain
* pools.
*/
copy_workqueue_attrs(tmp_attrs, new_attrs);
/*
* CPUs should stay stable across pwq creations and installations.
* Pin CPUs, determine the target cpumask for each node and create
* pwqs accordingly.
*/
get_online_cpus();
mutex_lock(&wq_pool_mutex);
/*
* If something goes wrong during CPU up/down, we'll fall back to
* the default pwq covering whole @attrs->cpumask. Always create
* it even if we don't use it immediately.
*/
/* create or look up a pool_workqueue */
dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
if (!dfl_pwq)
goto enomem_pwq;
for_each_node(node) {
if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
if (!pwq_tbl[node])
goto enomem_pwq;
} else {
dfl_pwq->refcnt++;
pwq_tbl[node] = dfl_pwq;
}
}
mutex_unlock(&wq_pool_mutex);
/* all pwqs have been created successfully, let's install'em */
mutex_lock(&wq->mutex);
copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
/* save the previous pwq and install the new one */
for_each_node(node)
pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
/* @dfl_pwq might not have been used, ensure it's linked */
link_pwq(dfl_pwq);
swap(wq->dfl_pwq, dfl_pwq);
mutex_unlock(&wq->mutex);
/* put the old pwqs */
for_each_node(node)
put_pwq_unlocked(pwq_tbl[node]);
put_pwq_unlocked(dfl_pwq);
put_online_cpus();
ret = 0;
/* fall through */
out_free:
free_workqueue_attrs(tmp_attrs);
free_workqueue_attrs(new_attrs);
kfree(pwq_tbl);
return ret;
enomem_pwq:
free_unbound_pwq(dfl_pwq);
for_each_node(node)
if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
free_unbound_pwq(pwq_tbl[node]);
mutex_unlock(&wq_pool_mutex);
put_online_cpus();
enomem:
ret = -ENOMEM;
goto out_free;
}
Also, unlike bound workqueues, no corresponding worker-pool was created at init time, so if no matching pool is found here, one is created.
static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
struct worker_pool *pool;
struct pool_workqueue *pwq;
lockdep_assert_held(&wq_pool_mutex);
/* look for a worker-pool with the same attributes; create one if none exists */
pool = get_unbound_pool(attrs);
if (!pool)
return NULL;
pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
if (!pwq) {
put_unbound_pool(pool);
return NULL;
}
init_pwq(pwq, wq, pool);
return pwq;
}
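get_unbound_pool(), called above, is what answers the earlier question about where unbound worker-pools come from: it hashes the attrs, reuses a matching pool from unbound_pool_hash if one exists, and otherwise creates a new pool and gives it its first worker. An abridged sketch from memory (error handling and some field setup omitted, so details may differ):
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
	u32 hash = wqattrs_hash(attrs);
	struct worker_pool *pool;
	lockdep_assert_held(&wq_pool_mutex);
	/* do we already have a matching pool? */
	hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
		if (wqattrs_equal(pool->attrs, attrs)) {
			pool->refcnt++;
			return pool;
		}
	}
	/* no, create a new one; pool->cpu stays -1, i.e. not bound to any CPU */
	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool || init_worker_pool(pool) < 0)
		goto fail;
	copy_workqueue_attrs(pool->attrs, attrs);
	if (worker_pool_assign_id(pool) < 0)
		goto fail;
	/* the new pool gets its first worker right away */
	if (create_and_start_worker(pool) < 0)
		goto fail;
	hash_add(unbound_pool_hash, &pool->hash_node, hash);
	return pool;
fail:
	if (pool)
		put_unbound_pool(pool);
	return NULL;
}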
Scheduling a work item
1. Initialize a work item
#define INIT_WORK(_work, _func) \
do { \
__INIT_WORK((_work), (_func), 0); \
} while (0)
#define __INIT_WORK(_work, _func, _onstack) \
do { \
__init_work((_work), _onstack); \
(_work)->data = (atomic_long_t) WORK_DATA_INIT(); \
INIT_LIST_HEAD(&(_work)->entry); \
PREPARE_WORK((_work), (_func)); \
} while (0)
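A common pattern is to embed the work_struct in the driver's own data structure and recover it with container_of() in the callback. A minimal hedged sketch (my_dev, my_dev_work_fn and my_dev_setup are invented names):
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/printk.h>
struct my_dev {
	struct work_struct work;
	int pending_events;	/* whatever state the callback needs */
};
static void my_dev_work_fn(struct work_struct *work)
{
	struct my_dev *dev = container_of(work, struct my_dev, work);
	/* runs in a kworker thread, may sleep */
	pr_info("handling %d events\n", dev->pending_events);
}
static void my_dev_setup(struct my_dev *dev)
{
	INIT_WORK(&dev->work, my_dev_work_fn);
}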
2. Queue the work onto a workqueue_struct. Here the work is queued onto system_wq, the bound, normal-priority workqueue created at init time; schedule_work by default queues the work onto the system default workqueue system_wq.
static inline bool schedule_work(struct work_struct *work)
{
return queue_work(system_wq, work);
}
WORK_CPU_UNBOUND means the work is not pinned to any CPU; using the local CPU is preferred.
static inline bool queue_work(struct workqueue_struct *wq,
struct work_struct *work)
{
return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}
bool queue_work_on(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
bool ret = false;
unsigned long flags;
/* queue the work with interrupts disabled */
local_irq_save(flags);
/* if WORK_STRUCT_PENDING_BIT is already set, the work is already on a workqueue */
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
__queue_work(cpu, wq, work);
ret = true;
}
local_irq_restore(flags);
return ret;
}
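queue_work_on() can also be called directly with a real CPU number when the work must run in a specific CPU's worker-pool. A short hedged example (the CPU number and my_work are just illustrative, reusing the work item from the earlier sketch):
static void my_submit_on_cpu1(void)
{
	/* run in CPU 1's normal-priority worker-pool (CPU 1 assumed online) */
	queue_work_on(1, system_wq, &my_work);
	/*
	schedule_work(&my_work) is equivalent to
	queue_work_on(WORK_CPU_UNBOUND, system_wq, &my_work),
	in which case __queue_work() picks the local CPU.
	*/
}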
1. When req_cpu == WORK_CPU_UNBOUND, the local CPU is used.
2. Based on the type of the workqueue wq, find the corresponding pool_workqueue (the pool_workqueue is the bridge: to put a work onto a workqueue you have to find a suitable pool_workqueue).
3. Based on the pool recorded in the work (where it last ran), possibly adjust the chosen pwq.
4. Finally insert_work adds the work to the worker_pool's worklist, or to the pwq's delayed_works list. insert_work also checks whether there are still idle workers and, if more are needed, wakes one up. So the work ends up on the worker-pool's list.
Looking at the whole flow, queuing a work really means first finding a pwq (from the wq passed in, or from the pwq recorded for the work's previous run), then reaching the worker_pool through that pwq, and finally hanging the work on the worker_pool's pending list.
Also, this function shows that calling schedule_work only adds the work to the workqueue; it does not execute the work, it merely puts it on the pending list (insert_work is sketched right below, followed by __queue_work itself).
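insert_work() is not quoted in this note; from memory it does roughly the following (a sketch, details may vary between versions): record the pwq and flag bits in work->data, add the work to the chosen list, and wake an idle worker if the pool needs one:
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
			struct list_head *head, unsigned int extra_flags)
{
	struct worker_pool *pool = pwq->pool;
	/* we own @work: record the pwq and flags in work->data, then link it */
	set_work_pwq(work, pwq, extra_flags);
	list_add_tail(&work->entry, head);
	get_pwq(pwq);
	/* make the list addition visible before checking nr_running */
	smp_mb();
	if (__need_more_worker(pool))
		wake_up_worker(pool);
}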
static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
struct pool_workqueue *pwq;
struct worker_pool *last_pool;
struct list_head *worklist;
unsigned int work_flags;
unsigned int req_cpu = cpu;
/*
* While a work item is PENDING && off queue, a task trying to
* steal the PENDING will busy-loop waiting for it to either get
* queued or lose PENDING. Grabbing PENDING and queueing should
* happen with IRQ disabled.
*/
WARN_ON_ONCE(!irqs_disabled());
debug_work_activate(work);
/* if draining, only works from the same workqueue are allowed */
if (unlikely(wq->flags & __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
retry:
if (req_cpu == WORK_CPU_UNBOUND)
cpu = raw_smp_processor_id();
/* pwq which will be used unless @work is executing elsewhere */
/* for a bound workqueue, directly use the local CPU's pwq (pool_workqueue) */
if (!(wq->flags & WQ_UNBOUND))/* system_wq is a bound workqueue, tied to the CPUs */
pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
else/* for an unbound workqueue, look up the unbound pool_workqueue of the local node */
pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
/*
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
*/
/*
The work records the worker_pool it last ran on. If that pool differs from the
one chosen here, check whether the work is still running on the other pool;
if so, the pwq has to be adjusted so the work is queued there.
*/
last_pool = get_work_pool(work);
if (last_pool && last_pool != pwq->pool) {
struct worker *worker;
spin_lock(&last_pool->lock);
worker = find_worker_executing_work(last_pool, work);
if (worker && worker->current_pwq->wq == wq) {
pwq = worker->current_pwq;
} else {
/* meh... not running there, queue here */
spin_unlock(&last_pool->lock);
spin_lock(&pwq->pool->lock);
}
} else {
spin_lock(&pwq->pool->lock);
}
/*
* pwq is determined and locked. For unbound pools, we could have
* raced with pwq release and it could already be dead. If its
* refcnt is zero, repeat pwq selection. Note that pwqs never die
* without another pwq replacing it in the numa_pwq_tbl or while
* work items are executing on it, so the retrying is guaranteed to
* make forward-progress.
*/
if (unlikely(!pwq->refcnt)) {
/* for an unbound workqueue the pwq is released asynchronously, so retry */
if (wq->flags & WQ_UNBOUND) {
spin_unlock(&pwq->pool->lock);
cpu_relax();
goto retry;
}
/* oops */
WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
wq->name, cpu);
}
/* pwq determined, queue */
trace_workqueue_queue_work(req_cpu, pwq, work);
if (WARN_ON(!list_empty(&work->entry))) {
spin_unlock(&pwq->pool->lock);
return;
}
pwq->nr_in_flight[pwq->work_color]++;
work_flags = work_color_to_flags(pwq->work_color);
if (likely(pwq->nr_active < pwq->max_active)) {
trace_workqueue_activate_work(work);
pwq->nr_active++;
worklist = &pwq->pool->worklist;
} else {
work_flags |= WORK_STRUCT_DELAYED;
worklist = &pwq->delayed_works;
}
insert_work(pwq, work, worklist, work_flags);
spin_unlock(&pwq->pool->lock);
}
Worker threads processing work
As seen when the worker is created, the registered thread function is worker_thread:
static struct worker *create_worker(struct worker_pool *pool)
{
struct worker *worker = NULL;
........................
worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
"kworker/%s", id_buf);
..............................
return worker;
fail:
if (id >= 0)
ida_simple_remove(&pool->worker_ida, id);
kfree(worker);
return NULL;
}
worker_thread:
1. Remove the worker from the idle list. From the above we know the work ultimately ends up on the worker-pool's list, so the worker thread also fetches work from that list.
2. Decide whether more worker threads need to be created.
3. Call process_one_work to handle the work and invoke the work's callback.
static int worker_thread(void *__worker)
{
struct worker *worker = __worker;
struct worker_pool *pool = worker->pool;
/* tell the scheduler that this is a workqueue worker */
worker->task->flags |= PF_WQ_WORKER;
woke_up:
spin_lock_irq(&pool->lock);
/* am I supposed to die? */
if (unlikely(worker->flags & WORKER_DIE)) {
spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
worker->task->flags &= ~PF_WQ_WORKER;
set_task_comm(worker->task, "kworker/dying");
ida_simple_remove(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker, pool);
kfree(worker);
return 0;
}
/* clear the WORKER_IDLE flag and remove the worker from the idle list */
worker_leave_idle(worker);
recheck:
/* no more worker necessary? */
/* the kernel thread gets scheduled again and again; if there is no work to do, it sleeps */
if (!need_more_worker(pool))
goto sleep;
/* do we need to manage? */
/* if the pool has no idle workers, manage_workers creates more worker threads */
if (unlikely(!may_start_working(pool)) && manage_workers(worker))
goto recheck;
/*
* ->scheduled list can only be filled while a worker is
* preparing to process a work or actually processing it.
* Make sure nobody diddled with it while I was sleeping.
*/
WARN_ON_ONCE(!list_empty(&worker->scheduled));
/*
* Finish PREP stage. We're guaranteed to have at least one idle
* worker or that someone else has already assumed the manager
* role. This is where @worker starts participating in concurrency
* management if applicable and concurrency management is restored
* after being rebound. See rebind_workers() for details.
*/
worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
do {
struct work_struct *work =
list_first_entry(&pool->worklist,
struct work_struct, entry);
/*
WORK_STRUCT_LINKED set means more works are linked behind this one and need
processing too, so move_linked_works moves them all onto the scheduled list,
and they are then processed one by one via process_one_work.
*/
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */
process_one_work(worker, work);
if (unlikely(!list_empty(&worker->scheduled)))
process_scheduled_works(worker);
} else {
move_linked_works(work, &worker->scheduled, NULL);
process_scheduled_works(worker);
}
} while (keep_working(pool));/* keep going while the worker_pool still has work and at most one worker is running */
worker_set_flags(worker, WORKER_PREP, false);
sleep:
/*
* pool->lock is held and there's no work to process and no need to
* manage, sleep. Workers are woken up only while holding
* pool->lock or from local cpu, so setting the current state
* before releasing pool->lock is enough to prevent losing any
* event.
*/
worker_enter_idle(worker);
__set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
}
process_one_work invokes the callback registered for the work item: worker->current_func(work);
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
int work_color;
struct worker *collision;
#ifdef CONFIG_LOCKDEP
#endif
/*
* Ensure we're on the correct CPU. DISASSOCIATED test is
* necessary to avoid spurious warnings from rescuers servicing the
* unbound or a disassociated pool.
*/
WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
!(pool->flags & POOL_DISASSOCIATED) &&
raw_smp_processor_id() != pool->cpu);
/*
* A single work shouldn't be executed concurrently by
* multiple workers on a single cpu. Check whether anyone is
* already processing the work. If so, defer the work to the
* currently executing one.
*/
/*
A work must not run concurrently in different worker threads on the same CPU;
in that case this worker has to back out of processing it. Look it up in the
busy_hash table and return the worker that is already handling this work.
*/
collision = find_worker_executing_work(pool, work);
if (unlikely(collision)) {
/* move the work onto the scheduled list of the worker that is already running it */
move_linked_works(work, &collision->scheduled, NULL);
return;
}
/* claim and dequeue */
debug_work_deactivate(work);
hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
worker->current_work = work;
worker->current_func = work->func;
worker->current_pwq = pwq;
work_color = get_work_color(work);
list_del_init(&work->entry);
/*
* CPU intensive works don't participate in concurrency
* management. They're the scheduler's responsibility.
*/
if (unlikely(cpu_intensive))
worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
/*
* Unbound pool isn't concurrency managed and work items should be
* executed ASAP. Wake up another worker if necessary.
*/
if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
wake_up_worker(pool);
/*
* Record the last pool and clear PENDING which should be the last
* update to @work. Also, do this inside @pool->lock so that
* PENDING and queued state changes happen together while IRQ is
* disabled.
*/
set_work_pool_and_clear_pending(work, pool->id);
spin_unlock_irq(&pool->lock);
lock_map_acquire_read(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
trace_workqueue_execute_start(work);
worker->current_func(work);
/*
* While we must be careful to not use "work" after this, the trace
* point will only record its address.
*/
trace_workqueue_execute_end(work);
lock_map_release(&lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
" last function: %pf\n",
current->comm, preempt_count(), task_pid_nr(current),
worker->current_func);
debug_show_held_locks(current);
dump_stack();
}
/*
* The following prevents a kworker from hogging CPU on !PREEMPT
* kernels, where a requeueing work item waiting for something to
* happen could deadlock with stop_machine as such work item could
* indefinitely requeue itself while all other CPUs are trapped in
* stop_machine.
*/
cond_resched();
spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
if (unlikely(cpu_intensive))
worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
/* we're done with it, release */
hash_del(&worker->hentry);
worker->current_work = NULL;
worker->current_func = NULL;
worker->current_pwq = NULL;
worker->desc_valid = false;
pwq_dec_nr_in_flight(pwq, work_color);
}