work queue in Linux 2.3.36

                            新work queue工作机制

 

工作队列(workqueue)是Linux kernel中将工作推后执行的一种机制。这种机制和BH或Tasklets不同之处在于工作队列是把推后的工作交由一个内核线程去执行,因此工作队列的优势就在于它允许重新调度甚至睡眠。

Linux的work queue在2.6.0 到2.6.19以及到2.6.36工作队列发生了一些变化。本文主要对新版本做一些分析。

虽然自从2.6.0之后,Linux对work queue进行了优化,但是kernel用到create_workqueue的模块越来越多,而调用create_workqueue会在每个cpu上都创建一个work_thread, 每个cpu都分配一个cpu_workqueue_struct以及workqueue_struct,而如果没被queue_work的话根本没机会工作,这样仍然相当浪费内存资源,而且加重了cpu loading。另外,同一个work queue上的每个work都是按照串行执行的,假如其中一个work的调度程序睡眠了,那么后面的work也将无法工作。

自从2.6.36以后,work queue的机制发生了很大变化,所有的work queue都被合并成

一个work queue,work thread也不是和work queue一一关联,work何时工作紧紧按照工作的重要性以及时间紧迫性来划分。也就是说新机制是按照cpu数量来创建work thread,而不是work queue。

下面我们还是通过代码分析吧:

初始化workqueus及创建work threads:

系统启动时调用init_workqueus()@kernel/kernel/workqueue.c

static int __initinit_workqueues(void)
{
       unsigned int cpu;
       int i;
 
       cpu_notifier(workqueue_cpu_callback,CPU_PRI_WORKQUEUE);
 
       /* initialize gcwqs */
/* 前面有说过,新机制将workqueues都排到gcwq上管理了,
每个cpu各一个gcwq。*/
       for_each_gcwq_cpu(cpu) {
              struct global_cwq *gcwq =get_gcwq(cpu);
              spin_lock_init(&gcwq->lock);
              INIT_LIST_HEAD(&gcwq->worklist);
              gcwq->cpu = cpu;
              gcwq->flags |=GCWQ_DISASSOCIATED;
              INIT_LIST_HEAD(&gcwq->idle_list);
              for (i = 0; i <BUSY_WORKER_HASH_SIZE; i++)
                     INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
              init_timer_deferrable(&gcwq->idle_timer);
              gcwq->idle_timer.function =idle_worker_timeout;
              gcwq->idle_timer.data =(unsigned long)gcwq;
              setup_timer(&gcwq->mayday_timer,gcwq_mayday_timeout,
                         (unsigned long)gcwq);
              ida_init(&gcwq->worker_ida);
              gcwq->trustee_state =TRUSTEE_DONE;
              init_waitqueue_head(&gcwq->trustee_wait);
       }
 
       /* create the initial worker */
       for_each_online_gcwq_cpu(cpu) {
              struct global_cwq *gcwq =get_gcwq(cpu);
              struct worker *worker;
              if (cpu != WORK_CPU_UNBOUND)
                     gcwq->flags &=~GCWQ_DISASSOCIATED;
/* 开机启动初始化后创建workthread主要是这里实现*/
              worker = create_worker(gcwq,true);
              BUG_ON(!worker);
              spin_lock_irq(&gcwq->lock);
              start_worker(worker);
              spin_unlock_irq(&gcwq->lock);
       }
/* 创建系统开机后默认的workqueue,平常我们调用的
schedule_work()其实就是用的system_wq这个work queue,可
参考schedule_work()实现。*/
       system_wq =alloc_workqueue("events", 0, 0);
       system_long_wq =alloc_workqueue("events_long", 0, 0);
       system_nrt_wq =alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
       system_unbound_wq =alloc_workqueue("events_unbound", WQ_UNBOUND,
                                       WQ_UNBOUND_MAX_ACTIVE);
       system_freezable_wq =alloc_workqueue("events_freezable",
                                         WQ_FREEZABLE, 0);
       BUG_ON(!system_wq || !system_long_wq ||!system_nrt_wq ||
             !system_unbound_wq || !system_freezable_wq);
       return 0;
}
early_initcall(init_workqueues);


再看create_worker是如何创建work thread的:

static structworker *create_worker(struct global_cwq *gcwq, bool bind)
{
       bool on_unbound_cpu = gcwq->cpu ==WORK_CPU_UNBOUND;
       struct worker *worker = NULL;
       int id = -1;
 
       spin_lock_irq(&gcwq->lock);
       while (ida_get_new(&gcwq->worker_ida,&id)) {
              spin_unlock_irq(&gcwq->lock);
              if(!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
                     goto fail;
              spin_lock_irq(&gcwq->lock);
       }
       spin_unlock_irq(&gcwq->lock);
/* 为work分配空间,初始化worker*/
       worker = alloc_worker();
       if (!worker)
              goto fail;
       worker->gcwq = gcwq;
       worker->id = id;
       /* kthread_create_on_node和 kthread_create都可以创建worker_thread,区别在于是否和cpu绑定,前者别是不依赖于CPU而工作,可以在任何CPU上工作,而后者表示分别在各个CPU上创建一个workthread来工作。从ps命令里就可以看到像kworker/0:0,kworker/1:0, kworker/u:0这样的进程就是这里创建的workthread了!*/
       if (!on_unbound_cpu)
              worker->task =kthread_create_on_node(worker_thread, worker,
cpu_to_node(gcwq->cpu),"kworker/%u:%d", gcwq->cpu, id);
       else
              worker->task =kthread_create(worker_thread, worker, "kworker/u:%d", id);
       if (IS_ERR(worker->task))
              goto fail;
 
       /*
        *A rogue worker will become a regular one if CPU comes
        *online later on.  Make sure every workerhas
        *PF_THREAD_BOUND set.
        */
       if (bind && !on_unbound_cpu)
              kthread_bind(worker->task,gcwq->cpu);
       else {
              worker->task->flags |=PF_THREAD_BOUND;
              if (on_unbound_cpu)
                     worker->flags |=WORKER_UNBOUND;
       }
 
       return worker;
fail:
       if (id >= 0) {
              spin_lock_irq(&gcwq->lock);
              ida_remove(&gcwq->worker_ida,id);
              spin_unlock_irq(&gcwq->lock);
       }
       kfree(worker);
       return NULL;
}


处理works:

由kthread_create_on_node()或 kthread_create()创建了work thread之后,它就开始运行起来了:

static intworker_thread(void *__worker)
{
       struct worker *worker = __worker;
       struct global_cwq *gcwq =worker->gcwq;
 
       /* tell the scheduler that this is aworkqueue worker */
       worker->task->flags |=PF_WQ_WORKER;
/* 最后的代码用gotowoke_up来表示work thread是一个无限循环。*/
woke_up:
       spin_lock_irq(&gcwq->lock);
       /* DIE can be set only while we're idle,checking here is enough */
       if (worker->flags & WORKER_DIE) {
              spin_unlock_irq(&gcwq->lock);
              worker->task->flags &=~PF_WQ_WORKER;
              return 0;
       }
       worker_leave_idle(worker);
recheck:
       /* no more worker necessary? */
/* 如果有高优先级的work需要处理,而且当前已经没有空闲的workthread可以来处理掉这个高优先级work,那下一步就要创建新的workthread来处理掉,读者可自行分析need_more_woker()的实现。这里就体现了新机制对于高优先级先处理的方法。*/
       if (!need_more_worker(gcwq))
              goto sleep;
/*新建一个workthread,可以看出,新的机制已经不想老的那样不管如何情况只要creatework queue就创建work thread,浪费内存资源。在manage_workers() -> maybe_create_worker ()-> create_worker (), create_worker()前面分析过了,它会creatework thread!*/
       /* do we need to manage? */
       if (unlikely(!may_start_working(gcwq))&& manage_workers(worker))
              goto recheck;
       /*
        *->scheduled list can only be filled while a worker is
        *preparing to process a work or actually processing it.
        *Make sure nobody diddled with it while I was sleeping.
        */
       BUG_ON(!list_empty(&worker->scheduled));
 
       /*
        *When control reaches this point, we're guaranteed to have
        *at least one idle worker or that someone else has already
        *assumed the manager role.
        */
       worker_clr_flags(worker, WORKER_PREP);
 
       do {
              struct work_struct *work =
                     list_first_entry(&gcwq->worklist,
                                    struct work_struct, entry);
/* 在创建里新的work thead去处理高优先级的work之后,终于轮到处理自己的work了。核心在process_one_work().*/
              if (likely(!(*work_data_bits(work)& WORK_STRUCT_LINKED))) {
                     /* optimization path, notstrictly necessary */
                     process_one_work(worker,work);
                     if(unlikely(!list_empty(&worker->scheduled)))
                            process_scheduled_works(worker);
              } else {
                     move_linked_works(work,&worker->scheduled, NULL);
                     process_scheduled_works(worker);
              }
       } while (keep_working(gcwq));
       worker_set_flags(worker, WORKER_PREP,false);
sleep:
       /*在休眠之前,再一次判断当前有没有新的work需要处理。所以即使本work睡眠了,其他work也可以继续工作,这样就不会存在老的机制那样一个worksleep会阻塞其他work执行。*/
       if(unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
              goto recheck;
 
       /*
        *gcwq->lock is held and there's no work to process and no
        *need to manage, sleep.  Workers are wokenup only while
        *holding gcwq->lock or from local cpu, so setting the
        *current state before releasing gcwq->lock is enough to
        *prevent losing any event.
        */
       worker_enter_idle(worker);
       __set_current_state(TASK_INTERRUPTIBLE);
       spin_unlock_irq(&gcwq->lock);
       schedule();
       goto woke_up;
}


再来看看系统如何将work给处理掉:

static voidprocess_one_work(struct worker *worker, struct work_struct *work)
__releases(&gcwq->lock)
__acquires(&gcwq->lock)
{
       struct cpu_workqueue_struct *cwq =get_work_cwq(work);
       struct global_cwq *gcwq = cwq->gcwq;
       struct hlist_head *bwh =busy_worker_head(gcwq, work);
       bool cpu_intensive = cwq->wq->flags& WQ_CPU_INTENSIVE;
/* 取出用户driver设置的函数*/
       work_func_t f = work->func;
       int work_color;
       struct worker *collision;
#ifdefCONFIG_LOCKDEP
       /*
        *It is permissible to free the struct work_struct from
        *inside the function that is called from it, this we need to
        *take into account for lockdep too.  Toavoid bogus "held
        *lock freed" warnings as well as problems when looking into
        *work->lockdep_map, make a copy and use that here.
        */
       struct lockdep_map lockdep_map =work->lockdep_map;
#endif
       /*
        *A single work shouldn't be executed concurrently by
        *multiple workers on a single cpu.  Checkwhether anyone is
        *already processing the work.  If so,defer the work to the
        *currently executing one.
        */
       collision =__find_worker_executing_work(gcwq, bwh, work);
       if (unlikely(collision)) {
              move_linked_works(work,&collision->scheduled, NULL);
              return;
       }
 
       /* claim and process */
       debug_work_deactivate(work);
       hlist_add_head(&worker->hentry,bwh);
       worker->current_work = work;
       worker->current_cwq = cwq;
       work_color = get_work_color(work);
 
       /* record the current cpu number in thework data and dequeue */
       set_work_cpu(work, gcwq->cpu);
       list_del_init(&work->entry);
 
       /*
        *If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
        *wake up another worker; otherwise, clear HIGHPRI_PENDING.
        */
/* 如果全局的gcwq有高优先级的work需要处理,唤醒它执行!*/
       if (unlikely(gcwq->flags &GCWQ_HIGHPRI_PENDING)) {
              struct work_struct *nwork = list_first_entry(&gcwq->worklist,
                                          structwork_struct, entry);
              if(!list_empty(&gcwq->worklist) &&
                 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
                     /*唤醒高优先级的work所对应的workthread来工作。*/
                     wake_up_worker(gcwq);
              else
                     gcwq->flags &=~GCWQ_HIGHPRI_PENDING;
       }
 
       /*
        *CPU intensive works don't participate in concurrency
        *management.  They're the scheduler'sresponsibility.
        */
/*如果当前有对时间敏感的work,那么如果有空闲的workthread的话,也要唤醒相应work thread来工作。*/
       if (unlikely(cpu_intensive))
              worker_set_flags(worker, WORKER_CPU_INTENSIVE,true);
 
       spin_unlock_irq(&gcwq->lock);
 
       work_clear_pending(work);
       lock_map_acquire_read(&cwq->wq->lockdep_map);
       lock_map_acquire(&lockdep_map);
       trace_workqueue_execute_start(work);
       /* 历经千辛万苦,终于跑到要调用的work functionpointer了!!!*/
f(work);
       /*
        *While we must be careful to not use "work" after this, the trace
        *point will only record its address.
        */
       /* 后面就是一些删除work,资源清楚释放,标志重设的工作了。*/
       trace_workqueue_execute_end(work);
       lock_map_release(&lockdep_map);
       lock_map_release(&cwq->wq->lockdep_map);
 
       if (unlikely(in_atomic() ||lockdep_depth(current) > 0)) {
              printk(KERN_ERR "BUG:workqueue leaked lock or atomic: "
                     "%s/0x%08x/%d\n",
                     current->comm, preempt_count(),task_pid_nr(current));
              printk(KERN_ERR "    last function: ");
              print_symbol("%s\n",(unsigned long)f);
              debug_show_held_locks(current);
              dump_stack();
       }
       spin_lock_irq(&gcwq->lock);
       /* clear cpu intensive status */
       if (unlikely(cpu_intensive))
              worker_clr_flags(worker,WORKER_CPU_INTENSIVE);
       /* we're done with it, release */
       hlist_del_init(&worker->hentry);
       worker->current_work = NULL;
       worker->current_cwq = NULL;
       cwq_dec_nr_in_flight(cwq, work_color,false);
}
 


创建work queue:

Work thread如何处理掉work已经分析完了,然而对于前面init_workqueues()提到的system_wq是如何得到的还不清楚,另外一个问题:为什么说work thread不依赖于work queue了,下面我们来分析alloc_workqueue():

#define alloc_workqueue(name, flags, max_active)            \
       __alloc_workqueue_key((name), (flags),(max_active), NULL, NULL)
structworkqueue_struct *__alloc_workqueue_key(const char *name,
                                         unsigned int flags,
                                          int max_active,
                                          struct lock_class_key *key,
                                          const char *lock_name)
{
       struct workqueue_struct *wq;
       unsigned int cpu;
 
       /*
        *Workqueues which may be used during memory reclaim should
        *have a rescuer to guarantee forward progress.
        */
/*WQ_MEM_RECLAIM表示当前内存资源是否紧张,都要执行我这个work.*/
       if (flags & WQ_MEM_RECLAIM)
              flags |= WQ_RESCUER;
 
       /*
        *Unbound workqueues aren't concurrency managed and should be
        *dispatched to workers immediately.
        */
       /* WQ_UNBOUND 表示work不依赖于如何CPU,可以在任意CPU上运行。*/
       if (flags & WQ_UNBOUND)
              flags |= WQ_HIGHPRI;
       /* max_active 限制任意一个CPU上能同时执行的最大work数量。*/
       max_active = max_active ?: WQ_DFL_ACTIVE;
       max_active =wq_clamp_max_active(max_active, flags, name);
       /* 分配 workqueue_struct,将当前workqueue相对应的信息如name, flags等保存起来,其实我们已经知道,在workthread中,这些信息会被用到。*/
       wq = kzalloc(sizeof(*wq), GFP_KERNEL);
       if (!wq)
              goto err;
       wq->flags = flags;
       wq->saved_max_active = max_active;
       mutex_init(&wq->flush_mutex);
       atomic_set(&wq->nr_cwqs_to_flush,0);
       INIT_LIST_HEAD(&wq->flusher_queue);
       INIT_LIST_HEAD(&wq->flusher_overflow);
       wq->name = name;
       lockdep_init_map(&wq->lockdep_map,lock_name, key, 0);
       INIT_LIST_HEAD(&wq->list);
       if (alloc_cwqs(wq) < 0)
              goto err;
       /* 初始化per cpu上的cpu_workqueue_struct信息。*/
       for_each_cwq_cpu(cpu, wq) {
              struct cpu_workqueue_struct *cwq =get_cwq(cpu, wq);
              struct global_cwq *gcwq =get_gcwq(cpu);
              BUG_ON((unsigned long)cwq &WORK_STRUCT_FLAG_MASK);
              cwq->gcwq = gcwq;
              cwq->wq = wq;
              cwq->flush_color = -1;
              cwq->max_active = max_active;
              INIT_LIST_HEAD(&cwq->delayed_works);
       }
 
       if (flags & WQ_RESCUER) {
              struct worker *rescuer;
              if(!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
                     goto err;
              wq->rescuer = rescuer =alloc_worker();
              if (!rescuer)
                     goto err;
              rescuer->task =kthread_create(rescuer_thread, wq, "%s", name);
              if (IS_ERR(rescuer->task))
                     goto err;
              rescuer->task->flags |=PF_THREAD_BOUND;
              wake_up_process(rescuer->task);
       }
 
       /*
        *workqueue_lock protects global freeze state and workqueues
        *list.  Grab it, set max_activeaccordingly and add the new
        *workqueue to workqueues list.
        */
       spin_lock(&workqueue_lock);
       if (workqueue_freezing &&wq->flags & WQ_FREEZABLE)
              for_each_cwq_cpu(cpu, wq)
                     get_cwq(cpu,wq)->max_active = 0;
       /* 将当前wq添加到workqueues里去。*/
       list_add(&wq->list,&workqueues);
       spin_unlock(&workqueue_lock);
       return wq;
err:
       if (wq) {
              free_cwqs(wq);
              free_mayday_mask(wq->mayday_mask);
              kfree(wq->rescuer);
              kfree(wq);
       }
       return NULL;
}
EXPORT_SYMBOL_GPL(__alloc_workqueue_key);

 

新的机制虽然仍然保留了create_workqueue()和 create_singlethread_workqueue()这两个接口,但他们的实现最终其实调用的都是alloc_workqueue(),只是传的flags不一样。如前面所说,新机制的work queue里只有flags才会影响调度的顺序,work queue已经不重要了。

#definecreate_workqueue(name)                              \
alloc_workqueue((name),WQ_MEM_RECLAIM, 1)
#definecreate_freezable_workqueue(name)                \
alloc_workqueue((name),WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 1)
#definecreate_singlethread_workqueue(name)                  \
alloc_workqueue((name),WQ_UNBOUND | WQ_MEM_RECLAIM, 1)


挂起work:

再看work queue如何触发work:

queue_work –> queue_work_on-> __queue_work

static void__queue_work(unsigned int cpu, struct workqueue_struct *wq,
                      struct work_struct *work)
{
       struct global_cwq *gcwq;
       struct cpu_workqueue_struct *cwq;
       struct list_head *worklist;
       unsigned int work_flags;
       unsigned long flags;
 
       debug_work_activate(work);
 
       /* if dying, only works from the sameworkqueue are allowed */
       if (unlikely(wq->flags & WQ_DYING)&&
          WARN_ON_ONCE(!is_chained_work(wq)))
              return;
 
       /* determine gcwq to use */
       /* 根据flags获取相应gcwq*/
       if (!(wq->flags & WQ_UNBOUND)) {
              struct global_cwq *last_gcwq;
 
              if (unlikely(cpu ==WORK_CPU_UNBOUND))
                     cpu =raw_smp_processor_id();
 
              /*
               * It's multi cpu.  If @wq is non-reentrant and @work
               * was previously on a different cpu, it mightstill
               * be running there, in which case the workneeds to
               * be queued on that cpu to guaranteenon-reentrance.
               */
              gcwq = get_gcwq(cpu);
              if (wq->flags &WQ_NON_REENTRANT &&
                 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
                     struct worker *worker;
 
                     spin_lock_irqsave(&last_gcwq->lock,flags);
 
                     worker =find_worker_executing_work(last_gcwq, work);
 
                     if (worker &&worker->current_cwq->wq == wq)
                            gcwq = last_gcwq;
                     else {
                            /* meh... notrunning there, queue here */
                            spin_unlock_irqrestore(&last_gcwq->lock,flags);
                            spin_lock_irqsave(&gcwq->lock,flags);
                     }
              } else
                     spin_lock_irqsave(&gcwq->lock,flags);
       } else {
              gcwq = get_gcwq(WORK_CPU_UNBOUND);
              spin_lock_irqsave(&gcwq->lock,flags);
       }
 
       /* gcwq determined, get cwq and queue */
       cwq = get_cwq(gcwq->cpu, wq);
       trace_workqueue_queue_work(cpu, cwq,work);
 
       BUG_ON(!list_empty(&work->entry));
 
       cwq->nr_in_flight[cwq->work_color]++;
       work_flags =work_color_to_flags(cwq->work_color);
 
       if (likely(cwq->nr_active <cwq->max_active)) {
              trace_workqueue_activate_work(work);
              cwq->nr_active++;
              worklist =gcwq_determine_ins_pos(gcwq, cwq);
       } else {
              work_flags |= WORK_STRUCT_DELAYED;
              worklist =&cwq->delayed_works;
       }
       /* 将当前work放到队列上等待执行。*/
       insert_work(cwq, work, worklist,work_flags);
 
       spin_unlock_irqrestore(&gcwq->lock,flags);
}
 
static voidinsert_work(struct cpu_workqueue_struct *cwq,
                     struct work_struct *work,struct list_head *head,
                     unsigned int extra_flags)
{
       struct global_cwq *gcwq = cwq->gcwq;
 
       /* we own @work, set data and link */
       set_work_cwq(work, cwq, extra_flags);
 
       /*
        *Ensure that we get the right work->data if we see the
        *result of list_add() below, see try_to_grab_pending().
        */
       smp_wmb();
 
       list_add_tail(&work->entry, head);
 
       /*
        *Ensure either worker_sched_deactivated() sees the above
        *list_add_tail() or we see zero nr_running to avoid workers
        *lying around lazily while there are works to be processed.
        */
       smp_mb();
       /* 如果当前有高优先级的work或者已经没有空闲的workthread了,well,那就再创建一个workthread来处理。*/
       if (__need_more_worker(gcwq))
              wake_up_worker(gcwq);
}

             

至此,对work queue的工作机制都分析完了。可以看出,新的机制相对来说更灵活,而且基本上不会浪费内存资源,导致系统过量负载。

或许,不久的将来,create_workqueue()接口都将不复存在….

                    

Reference:

http://lwn.net/Articles/403891/

http://gqf2008.iteye.com/blog/447060

kernel/documentation/Workqueue.txt

 

2012/08/10

 

你可能感兴趣的:(thread,linux,工作,struct,list,System)