新work queue工作机制
工作队列(workqueue)是Linux kernel中将工作推后执行的一种机制。这种机制和BH或Tasklets不同之处在于工作队列是把推后的工作交由一个内核线程去执行,因此工作队列的优势就在于它允许重新调度甚至睡眠。
Linux的work queue实现在2.6.0到2.6.19之间,以及之后到2.6.36,都发生过一些变化。本文主要对新版本(2.6.36之后)的实现做一些分析。
虽然自从2.6.0之后,Linux对work queue进行了优化,但是kernel里用到create_workqueue的模块越来越多,而调用create_workqueue会在每个cpu上都创建一个worker thread, 每个cpu都分配一个cpu_workqueue_struct以及workqueue_struct,如果没有work通过queue_work提交上来,这些线程根本没机会工作,这样仍然相当浪费内存资源,而且加重了cpu loading。另外,同一个work queue上的work都是串行执行的,假如其中一个work的处理函数睡眠了,那么排在后面的work也将无法执行。
自从2.6.36以后,work queue的机制发生了很大变化,所有的work queue都被合并成
一个work queue(即下文的gcwq,每个cpu各一个),work thread也不再和work queue一一关联,work何时执行仅仅按照工作的重要性以及时间紧迫性来决定。也就是说新机制是按照cpu数量来创建work thread,而不是按照work queue来创建。
下面我们还是通过代码分析吧:
初始化workqueues及创建work threads:
系统启动时调用init_workqueues()@kernel/kernel/workqueue.c
- static int __initinit_workqueues(void)
- {
- unsigned int cpu;
- int i;
-
- cpu_notifier(workqueue_cpu_callback,CPU_PRI_WORKQUEUE);
-
- /* initialize gcwqs */
- /* 前面有说过,新机制将workqueues都排到gcwq上管理了,
- 每个cpu各一个gcwq。*/
- for_each_gcwq_cpu(cpu) {
- struct global_cwq *gcwq =get_gcwq(cpu);
- spin_lock_init(&gcwq->lock);
- INIT_LIST_HEAD(&gcwq->worklist);
- gcwq->cpu = cpu;
- gcwq->flags |=GCWQ_DISASSOCIATED;
- INIT_LIST_HEAD(&gcwq->idle_list);
- for (i = 0; i <BUSY_WORKER_HASH_SIZE; i++)
- INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
- init_timer_deferrable(&gcwq->idle_timer);
- gcwq->idle_timer.function =idle_worker_timeout;
- gcwq->idle_timer.data =(unsigned long)gcwq;
- setup_timer(&gcwq->mayday_timer,gcwq_mayday_timeout,
- (unsigned long)gcwq);
- ida_init(&gcwq->worker_ida);
- gcwq->trustee_state =TRUSTEE_DONE;
- init_waitqueue_head(&gcwq->trustee_wait);
- }
-
- /* create the initial worker */
- for_each_online_gcwq_cpu(cpu) {
- struct global_cwq *gcwq =get_gcwq(cpu);
- struct worker *worker;
- if (cpu != WORK_CPU_UNBOUND)
- gcwq->flags &=~GCWQ_DISASSOCIATED;
- /* 开机启动初始化后创建workthread主要是这里实现*/
- worker = create_worker(gcwq,true);
- BUG_ON(!worker);
- spin_lock_irq(&gcwq->lock);
- start_worker(worker);
- spin_unlock_irq(&gcwq->lock);
- }
- /* 创建系统开机后默认的workqueue,平常我们调用的
- schedule_work()其实就是用的system_wq这个work queue,可
- 参考schedule_work()实现。*/
- system_wq =alloc_workqueue("events", 0, 0);
- system_long_wq =alloc_workqueue("events_long", 0, 0);
- system_nrt_wq =alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
- system_unbound_wq =alloc_workqueue("events_unbound", WQ_UNBOUND,
- WQ_UNBOUND_MAX_ACTIVE);
- system_freezable_wq =alloc_workqueue("events_freezable",
- WQ_FREEZABLE, 0);
- BUG_ON(!system_wq || !system_long_wq ||!system_nrt_wq ||
- !system_unbound_wq || !system_freezable_wq);
- return 0;
- }
- early_initcall(init_workqueues);
再看create_worker是如何创建work thread的:
- static structworker *create_worker(struct global_cwq *gcwq, bool bind)
- {
- bool on_unbound_cpu = gcwq->cpu ==WORK_CPU_UNBOUND;
- struct worker *worker = NULL;
- int id = -1;
-
- spin_lock_irq(&gcwq->lock);
- while (ida_get_new(&gcwq->worker_ida,&id)) {
- spin_unlock_irq(&gcwq->lock);
- if(!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
- goto fail;
- spin_lock_irq(&gcwq->lock);
- }
- spin_unlock_irq(&gcwq->lock);
- /* 为work分配空间,初始化worker*/
- worker = alloc_worker();
- if (!worker)
- goto fail;
- worker->gcwq = gcwq;
- worker->id = id;
- /* kthread_create_on_node和 kthread_create都可以创建worker_thread,区别在于是否和cpu绑定:前者用于绑定到具体CPU的gcwq(即!on_unbound_cpu的情况),在该CPU所在的node上创建线程,线程名为kworker/<cpu>:<id>;后者用于WORK_CPU_UNBOUND的gcwq,创建的线程不依赖于某个CPU,可以在任何CPU上运行,线程名为kworker/u:<id>。从ps命令里看到的像kworker/0:0,kworker/1:0, kworker/u:0这样的进程就是这里创建的work thread了!*/
- if (!on_unbound_cpu)
- worker->task =kthread_create_on_node(worker_thread, worker,
- cpu_to_node(gcwq->cpu),"kworker/%u:%d", gcwq->cpu, id);
- else
- worker->task =kthread_create(worker_thread, worker, "kworker/u:%d", id);
- if (IS_ERR(worker->task))
- goto fail;
-
- /*
- *A rogue worker will become a regular one if CPU comes
- *online later on. Make sure every workerhas
- *PF_THREAD_BOUND set.
- */
- if (bind && !on_unbound_cpu)
- kthread_bind(worker->task,gcwq->cpu);
- else {
- worker->task->flags |=PF_THREAD_BOUND;
- if (on_unbound_cpu)
- worker->flags |=WORKER_UNBOUND;
- }
-
- return worker;
- fail:
- if (id >= 0) {
- spin_lock_irq(&gcwq->lock);
- ida_remove(&gcwq->worker_ida,id);
- spin_unlock_irq(&gcwq->lock);
- }
- kfree(worker);
- return NULL;
- }
处理works:
由kthread_create_on_node()或 kthread_create()创建了work thread之后,它就开始运行起来了:
- static intworker_thread(void *__worker)
- {
- struct worker *worker = __worker;
- struct global_cwq *gcwq =worker->gcwq;
-
- /* tell the scheduler that this is aworkqueue worker */
- worker->task->flags |=PF_WQ_WORKER;
- /* 函数最后用goto woke_up跳回这里,表示work thread是一个无限循环。*/
- woke_up:
- spin_lock_irq(&gcwq->lock);
- /* DIE can be set only while we're idle,checking here is enough */
- if (worker->flags & WORKER_DIE) {
- spin_unlock_irq(&gcwq->lock);
- worker->task->flags &=~PF_WQ_WORKER;
- return 0;
- }
- worker_leave_idle(worker);
- recheck:
- /* no more worker necessary? */
- /* 如果有高优先级的work需要处理,而且当前已经没有空闲的work thread可以来处理掉这个高优先级work,那下一步就要创建新的work thread来处理掉,读者可自行分析need_more_worker()的实现。这里就体现了新机制对于高优先级先处理的方法。*/
- if (!need_more_worker(gcwq))
- goto sleep;
- /*新建一个work thread。可以看出,新的机制已经不像老的那样不管什么情况只要create work queue就创建work thread,浪费内存资源。调用路径是manage_workers() -> maybe_create_worker() -> create_worker(),create_worker()前面分析过了,它会创建work thread!*/
- /* do we need to manage? */
- if (unlikely(!may_start_working(gcwq))&& manage_workers(worker))
- goto recheck;
- /*
- *->scheduled list can only be filled while a worker is
- *preparing to process a work or actually processing it.
- *Make sure nobody diddled with it while I was sleeping.
- */
- BUG_ON(!list_empty(&worker->scheduled));
-
- /*
- *When control reaches this point, we're guaranteed to have
- *at least one idle worker or that someone else has already
- *assumed the manager role.
- */
- worker_clr_flags(worker, WORKER_PREP);
-
- do {
- struct work_struct *work =
- list_first_entry(&gcwq->worklist,
- struct work_struct, entry);
- /* 在创建了新的work thread去处理高优先级的work之后,终于轮到处理自己的work了。核心在process_one_work().*/
- if (likely(!(*work_data_bits(work)& WORK_STRUCT_LINKED))) {
- /* optimization path, notstrictly necessary */
- process_one_work(worker,work);
- if(unlikely(!list_empty(&worker->scheduled)))
- process_scheduled_works(worker);
- } else {
- move_linked_works(work,&worker->scheduled, NULL);
- process_scheduled_works(worker);
- }
- } while (keep_working(gcwq));
- worker_set_flags(worker, WORKER_PREP,false);
- sleep:
- /*在休眠之前,再一次判断当前是否需要管理worker(有没有新的work需要处理)。所以即使某个work睡眠了,其他work也可以继续执行,这样就不会像老的机制那样,一个work睡眠就阻塞其他work的执行。*/
- if(unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
- goto recheck;
-
- /*
- *gcwq->lock is held and there's no work to process and no
- *need to manage, sleep. Workers are wokenup only while
- *holding gcwq->lock or from local cpu, so setting the
- *current state before releasing gcwq->lock is enough to
- *prevent losing any event.
- */
- worker_enter_idle(worker);
- __set_current_state(TASK_INTERRUPTIBLE);
- spin_unlock_irq(&gcwq->lock);
- schedule();
- goto woke_up;
- }
再来看看系统如何将work给处理掉:
- static voidprocess_one_work(struct worker *worker, struct work_struct *work)
- __releases(&gcwq->lock)
- __acquires(&gcwq->lock)
- {
- struct cpu_workqueue_struct *cwq =get_work_cwq(work);
- struct global_cwq *gcwq = cwq->gcwq;
- struct hlist_head *bwh =busy_worker_head(gcwq, work);
- bool cpu_intensive = cwq->wq->flags& WQ_CPU_INTENSIVE;
- /* 取出用户driver设置的函数*/
- work_func_t f = work->func;
- int work_color;
- struct worker *collision;
- #ifdefCONFIG_LOCKDEP
- /*
- *It is permissible to free the struct work_struct from
- *inside the function that is called from it, this we need to
- *take into account for lockdep too. Toavoid bogus "held
- *lock freed" warnings as well as problems when looking into
- *work->lockdep_map, make a copy and use that here.
- */
- struct lockdep_map lockdep_map =work->lockdep_map;
- #endif
- /*
- *A single work shouldn't be executed concurrently by
- *multiple workers on a single cpu. Checkwhether anyone is
- *already processing the work. If so,defer the work to the
- *currently executing one.
- */
- collision =__find_worker_executing_work(gcwq, bwh, work);
- if (unlikely(collision)) {
- move_linked_works(work,&collision->scheduled, NULL);
- return;
- }
-
- /* claim and process */
- debug_work_deactivate(work);
- hlist_add_head(&worker->hentry,bwh);
- worker->current_work = work;
- worker->current_cwq = cwq;
- work_color = get_work_color(work);
-
- /* record the current cpu number in thework data and dequeue */
- set_work_cpu(work, gcwq->cpu);
- list_del_init(&work->entry);
-
- /*
- *If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
- *wake up another worker; otherwise, clear HIGHPRI_PENDING.
- */
- /* 如果全局的gcwq有高优先级的work需要处理,唤醒它执行!*/
- if (unlikely(gcwq->flags &GCWQ_HIGHPRI_PENDING)) {
- struct work_struct *nwork = list_first_entry(&gcwq->worklist,
- structwork_struct, entry);
- if(!list_empty(&gcwq->worklist) &&
- get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
- /*唤醒高优先级的work所对应的workthread来工作。*/
- wake_up_worker(gcwq);
- else
- gcwq->flags &=~GCWQ_HIGHPRI_PENDING;
- }
-
- /*
- *CPU intensive works don't participate in concurrency
- *management. They're the scheduler'sresponsibility.
- */
- /*如果当前work所属的workqueue设置了WQ_CPU_INTENSIVE,就给worker打上WORKER_CPU_INTENSIVE标志,使它不参与并发管理,而交由调度器负责。*/
- if (unlikely(cpu_intensive))
- worker_set_flags(worker, WORKER_CPU_INTENSIVE,true);
-
- spin_unlock_irq(&gcwq->lock);
-
- work_clear_pending(work);
- lock_map_acquire_read(&cwq->wq->lockdep_map);
- lock_map_acquire(&lockdep_map);
- trace_workqueue_execute_start(work);
- /* 历经千辛万苦,终于跑到要调用的work function pointer了!!!*/
- f(work);
- /*
- *While we must be careful to not use "work" after this, the trace
- *point will only record its address.
- */
- /* 后面就是一些删除work,资源清除释放,标志重设的工作了。*/
- trace_workqueue_execute_end(work);
- lock_map_release(&lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
-
- if (unlikely(in_atomic() ||lockdep_depth(current) > 0)) {
- printk(KERN_ERR "BUG:workqueue leaked lock or atomic: "
- "%s/0x%08x/%d\n",
- current->comm, preempt_count(),task_pid_nr(current));
- printk(KERN_ERR " last function: ");
- print_symbol("%s\n",(unsigned long)f);
- debug_show_held_locks(current);
- dump_stack();
- }
- spin_lock_irq(&gcwq->lock);
- /* clear cpu intensive status */
- if (unlikely(cpu_intensive))
- worker_clr_flags(worker,WORKER_CPU_INTENSIVE);
- /* we're done with it, release */
- hlist_del_init(&worker->hentry);
- worker->current_work = NULL;
- worker->current_cwq = NULL;
- cwq_dec_nr_in_flight(cwq, work_color,false);
- }
-
创建work queue:
Work thread如何处理掉work已经分析完了,然而对于前面init_workqueues()提到的system_wq是如何得到的还不清楚,另外一个问题:为什么说work thread不依赖于work queue了,下面我们来分析alloc_workqueue():
- #define alloc_workqueue(name, flags, max_active) \
- __alloc_workqueue_key((name), (flags),(max_active), NULL, NULL)
- structworkqueue_struct *__alloc_workqueue_key(const char *name,
- unsigned int flags,
- int max_active,
- struct lock_class_key *key,
- const char *lock_name)
- {
- struct workqueue_struct *wq;
- unsigned int cpu;
-
- /*
- *Workqueues which may be used during memory reclaim should
- *have a rescuer to guarantee forward progress.
- */
- /*WQ_MEM_RECLAIM表示无论当前内存资源是否紧张,都要保证这个workqueue上的work能够执行,因此需要为它配一个rescuer线程。*/
- if (flags & WQ_MEM_RECLAIM)
- flags |= WQ_RESCUER;
-
- /*
- *Unbound workqueues aren't concurrency managed and should be
- *dispatched to workers immediately.
- */
- /* WQ_UNBOUND 表示work不依赖于任何CPU,可以在任意CPU上运行。*/
- if (flags & WQ_UNBOUND)
- flags |= WQ_HIGHPRI;
- /* max_active 限制任意一个CPU上能同时执行的最大work数量。*/
- max_active = max_active ?: WQ_DFL_ACTIVE;
- max_active =wq_clamp_max_active(max_active, flags, name);
- /* 分配 workqueue_struct,将当前workqueue相对应的信息如name, flags等保存起来,其实我们已经知道,在workthread中,这些信息会被用到。*/
- wq = kzalloc(sizeof(*wq), GFP_KERNEL);
- if (!wq)
- goto err;
- wq->flags = flags;
- wq->saved_max_active = max_active;
- mutex_init(&wq->flush_mutex);
- atomic_set(&wq->nr_cwqs_to_flush,0);
- INIT_LIST_HEAD(&wq->flusher_queue);
- INIT_LIST_HEAD(&wq->flusher_overflow);
- wq->name = name;
- lockdep_init_map(&wq->lockdep_map,lock_name, key, 0);
- INIT_LIST_HEAD(&wq->list);
- if (alloc_cwqs(wq) < 0)
- goto err;
- /* 初始化per cpu上的cpu_workqueue_struct信息。*/
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq =get_cwq(cpu, wq);
- struct global_cwq *gcwq =get_gcwq(cpu);
- BUG_ON((unsigned long)cwq &WORK_STRUCT_FLAG_MASK);
- cwq->gcwq = gcwq;
- cwq->wq = wq;
- cwq->flush_color = -1;
- cwq->max_active = max_active;
- INIT_LIST_HEAD(&cwq->delayed_works);
- }
-
- if (flags & WQ_RESCUER) {
- struct worker *rescuer;
- if(!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
- goto err;
- wq->rescuer = rescuer =alloc_worker();
- if (!rescuer)
- goto err;
- rescuer->task =kthread_create(rescuer_thread, wq, "%s", name);
- if (IS_ERR(rescuer->task))
- goto err;
- rescuer->task->flags |=PF_THREAD_BOUND;
- wake_up_process(rescuer->task);
- }
-
- /*
- *workqueue_lock protects global freeze state and workqueues
- *list. Grab it, set max_activeaccordingly and add the new
- *workqueue to workqueues list.
- */
- spin_lock(&workqueue_lock);
- if (workqueue_freezing &&wq->flags & WQ_FREEZABLE)
- for_each_cwq_cpu(cpu, wq)
- get_cwq(cpu,wq)->max_active = 0;
- /* 将当前wq添加到workqueues里去。*/
- list_add(&wq->list,&workqueues);
- spin_unlock(&workqueue_lock);
- return wq;
- err:
- if (wq) {
- free_cwqs(wq);
- free_mayday_mask(wq->mayday_mask);
- kfree(wq->rescuer);
- kfree(wq);
- }
- return NULL;
- }
- EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
新的机制虽然仍然保留了create_workqueue()和 create_singlethread_workqueue()这两个接口,但它们的实现最终调用的其实都是alloc_workqueue(),只是传的flags不一样。如前面所说,在新机制里只有work queue的flags才会影响调度的顺序,具体属于哪个work queue已经不重要了。
- #definecreate_workqueue(name) \
- alloc_workqueue((name),WQ_MEM_RECLAIM, 1)
- #definecreate_freezable_workqueue(name) \
- alloc_workqueue((name),WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 1)
- #definecreate_singlethread_workqueue(name) \
- alloc_workqueue((name),WQ_UNBOUND | WQ_MEM_RECLAIM, 1)
挂起work:
再看work queue如何触发work:
queue_work -> queue_work_on -> __queue_work
- static void__queue_work(unsigned int cpu, struct workqueue_struct *wq,
- struct work_struct *work)
- {
- struct global_cwq *gcwq;
- struct cpu_workqueue_struct *cwq;
- struct list_head *worklist;
- unsigned int work_flags;
- unsigned long flags;
-
- debug_work_activate(work);
-
- /* if dying, only works from the sameworkqueue are allowed */
- if (unlikely(wq->flags & WQ_DYING)&&
- WARN_ON_ONCE(!is_chained_work(wq)))
- return;
-
- /* determine gcwq to use */
- /* 根据flags获取相应gcwq*/
- if (!(wq->flags & WQ_UNBOUND)) {
- struct global_cwq *last_gcwq;
-
- if (unlikely(cpu ==WORK_CPU_UNBOUND))
- cpu =raw_smp_processor_id();
-
- /*
- * It's multi cpu. If @wq is non-reentrant and @work
- * was previously on a different cpu, it mightstill
- * be running there, in which case the workneeds to
- * be queued on that cpu to guaranteenon-reentrance.
- */
- gcwq = get_gcwq(cpu);
- if (wq->flags &WQ_NON_REENTRANT &&
- (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
- struct worker *worker;
-
- spin_lock_irqsave(&last_gcwq->lock,flags);
-
- worker =find_worker_executing_work(last_gcwq, work);
-
- if (worker &&worker->current_cwq->wq == wq)
- gcwq = last_gcwq;
- else {
- /* meh... notrunning there, queue here */
- spin_unlock_irqrestore(&last_gcwq->lock,flags);
- spin_lock_irqsave(&gcwq->lock,flags);
- }
- } else
- spin_lock_irqsave(&gcwq->lock,flags);
- } else {
- gcwq = get_gcwq(WORK_CPU_UNBOUND);
- spin_lock_irqsave(&gcwq->lock,flags);
- }
-
- /* gcwq determined, get cwq and queue */
- cwq = get_cwq(gcwq->cpu, wq);
- trace_workqueue_queue_work(cpu, cwq,work);
-
- BUG_ON(!list_empty(&work->entry));
-
- cwq->nr_in_flight[cwq->work_color]++;
- work_flags =work_color_to_flags(cwq->work_color);
-
- if (likely(cwq->nr_active <cwq->max_active)) {
- trace_workqueue_activate_work(work);
- cwq->nr_active++;
- worklist =gcwq_determine_ins_pos(gcwq, cwq);
- } else {
- work_flags |= WORK_STRUCT_DELAYED;
- worklist =&cwq->delayed_works;
- }
- /* 将当前work放到队列上等待执行。*/
- insert_work(cwq, work, worklist,work_flags);
-
- spin_unlock_irqrestore(&gcwq->lock,flags);
- }
-
- static voidinsert_work(struct cpu_workqueue_struct *cwq,
- struct work_struct *work,struct list_head *head,
- unsigned int extra_flags)
- {
- struct global_cwq *gcwq = cwq->gcwq;
-
- /* we own @work, set data and link */
- set_work_cwq(work, cwq, extra_flags);
-
- /*
- *Ensure that we get the right work->data if we see the
- *result of list_add() below, see try_to_grab_pending().
- */
- smp_wmb();
-
- list_add_tail(&work->entry, head);
-
- /*
- *Ensure either worker_sched_deactivated() sees the above
- *list_add_tail() or we see zero nr_running to avoid workers
- *lying around lazily while there are works to be processed.
- */
- smp_mb();
- /* 如果__need_more_worker()判断当前还需要更多worker(例如有高优先级的work,或者没有正在运行的worker),那就唤醒一个空闲的work thread来处理。*/
- if (__need_more_worker(gcwq))
- wake_up_worker(gcwq);
- }
至此,对work queue的工作机制都分析完了。可以看出,新的机制相对来说更灵活,而且基本上不会浪费内存资源,也不会导致系统负载过重。
或许,不久的将来,create_workqueue()接口都将不复存在….
Reference:
http://lwn.net/Articles/403891/
http://gqf2008.iteye.com/blog/447060
kernel/documentation/Workqueue.txt
2012/08/10