工作队列:
内核中所有的工作队列都是由helper_wq工作队列创建的,那么helper_wq是谁创建的呢?答案是直接执行的。看一下代码:
static __init int helper_init(void)
{
helper_wq = create_singlethread_workqueue("kthread");
BUG_ON(!helper_wq);
return 0;
}
core_initcall(helper_init);
看看create_workqueue函数是怎么实现的:
struct workqueue_struct *__create_workqueue(const char *name, int singlethread)
{
int cpu, destroy = 0;
struct workqueue_struct *wq;
struct task_struct *p;
wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return NULL;
wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
if (!wq->cpu_wq) {
kfree(wq);
return NULL;
}
wq->name = name;
lock_cpu_hotplug();
if (singlethread) {
INIT_LIST_HEAD(&wq->list);
p = create_workqueue_thread(wq, singlethread_cpu);
if (!p)
destroy = 1;
else
wake_up_process(p);
} else {
spin_lock(&workqueue_lock);
list_add(&wq->list, &workqueues);
spin_unlock(&workqueue_lock);
for_each_online_cpu(cpu) {
p = create_workqueue_thread(wq, cpu);
if (p) {
kthread_bind(p, cpu);
wake_up_process(p);
} else
destroy = 1;
}
}
unlock_cpu_hotplug();
if (destroy) {
destroy_workqueue(wq);
wq = NULL;
}
return wq;
}
其中wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct)这一句创建了
struct cpu_workqueue_struct {
spinlock_t lock;
long remove_sequence;
long insert_sequence;
struct list_head worklist;
wait_queue_head_t more_work;
wait_queue_head_t work_done;
struct workqueue_struct *wq;
task_t *thread;
int run_depth;
} ____cacheline_aligned;
以上的这个结构是每个cpu结构,它是内嵌在
struct workqueue_struct {
struct cpu_workqueue_struct *cpu_wq;
const char *name;
struct list_head list; /* Empty if single thread */
};
的cpu_wq数组字段里面的。作为消遣,看看__alloc_percpu函数:
void *__alloc_percpu(size_t size)
{
int i;
struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
if (!pdata)
return NULL;
for_each_possible_cpu(i) {
int node = cpu_to_node(i);
if (node_online(node))
pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
else
pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
if (!pdata->ptrs[i])
goto unwind_oom;
memset(pdata->ptrs[i], 0, size);
}
return (void *)(~(unsigned long)pdata);
unwind_oom:
while (--i >= 0) {
if (!cpu_possible(i))
continue;
kfree(pdata->ptrs[i]);
}
kfree(pdata);
return NULL;
}
下面开始真正的重要过程:
static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
int cpu)
{
struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
struct task_struct *p;
spin_lock_init(&cwq->lock);
cwq->wq = wq;
cwq->thread = NULL;
cwq->insert_sequence = 0;
cwq->remove_sequence = 0;
INIT_LIST_HEAD(&cwq->worklist);
init_waitqueue_head(&cwq->more_work);
init_waitqueue_head(&cwq->work_done);
if (is_single_threaded(wq))
p = kthread_create(worker_thread, cwq, "%s", wq->name);
else
p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
if (IS_ERR(p))
return NULL;
cwq->thread = p;
return p;
}
struct task_struct *kthread_create(int (*threadfn)(void *data),
void *data,
const char namefmt[],
...)
{
struct kthread_create_info create;
DECLARE_WORK(work, keventd_create_kthread, &create);
create.threadfn = threadfn;
create.data = data;
init_completion(&create.started);
init_completion(&create.done);
if (!helper_wq)//注意这一句,如果没有helper_wq,那么就直接执行回调函数,这里就是keventd_create_kthread
work.func(work.data);
else {
queue_work(helper_wq, &work);
wait_for_completion(&create.done);
}
if (!IS_ERR(create.result)) {
va_list args;
va_start(args, namefmt);
vsnprintf(create.result->comm, sizeof(create.result->comm),
namefmt, args);
va_end(args);
}
return create.result;
}
上面的过程确实有点绕,实际上work.func和queue_work调用执行的是同一个过程,就是执行keventd_create_kthread函数,而前面传入的threadfn也就是worker_thread则是封装进了kthread_create_info结构里面了,要想知道往下该做什么了,就要看keventd_create_kthread函数:
static void keventd_create_kthread(void *_create)
{
struct kthread_create_info *create = _create;
int pid;
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
create->result = ERR_PTR(pid);
} else {
wait_for_completion(&create->started);
read_lock(&tasklist_lock);
create->result = find_task_by_pid(pid);
read_unlock(&tasklist_lock);
}
complete(&create->done);
}
到这里才真正启动一个内核线程kthread,然后这个内核线程函数以create为参数,到这儿,几乎可以猜出接下来要做什么了
static int kthread(void *_create)
{
struct kthread_create_info *create = _create;
int (*threadfn)(void *data);
void *data;
sigset_t blocked;
int ret = -EINTR;
kthread_exit_files();
threadfn = create->threadfn;
data = create->data;
sigfillset(&blocked);
sigprocmask(SIG_BLOCK, &blocked, NULL);
flush_signals(current);
set_cpus_allowed(current, CPU_MASK_ALL);
__set_current_state(TASK_INTERRUPTIBLE);
complete(&create->started);
schedule();
if (!kthread_should_stop())
ret = threadfn(data);
if (kthread_should_stop()) {
kthread_stop_info.err = ret;
complete(&kthread_stop_info.done);
}
return 0;
}
唉,最终threadfn(data)搞定了一切,这个threadfn(data)实际上就是
static int worker_thread(void *__cwq)
{
struct cpu_workqueue_struct *cwq = __cwq;
DECLARE_WAITQUEUE(wait, current);
struct k_sigaction sa;
sigset_t blocked;
current->flags |= PF_NOFREEZE;
set_user_nice(current, -5);
sigfillset(&blocked);
sigprocmask(SIG_BLOCK, &blocked, NULL);
flush_signals(current);
sa.sa.sa_handler = SIG_IGN;
sa.sa.sa_flags = 0;
siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
add_wait_queue(&cwq->more_work, &wait);
if (list_empty(&cwq->worklist))
schedule();
else
__set_current_state(TASK_RUNNING);
remove_wait_queue(&cwq->more_work, &wait);
if (!list_empty(&cwq->worklist))
run_workqueue(cwq);
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
return 0;
}
再来一层调用就完事了:run_workqueue(cwq)
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
unsigned long flags;
spin_lock_irqsave(&cwq->lock, flags);
cwq->run_depth++;
if (cwq->run_depth > 3) {
printk("%s: recursion depth exceeded: %d/n",
__FUNCTION__, cwq->run_depth);
dump_stack();
}
while (!list_empty(&cwq->worklist)) {
struct work_struct *work = list_entry(cwq->worklist.next,
struct work_struct, entry);
void (*f) (void *) = work->func;
void *data = work->data;
list_del_init(cwq->worklist.next);
spin_unlock_irqrestore(&cwq->lock, flags);
BUG_ON(work->wq_data != cwq);
clear_bit(0, &work->pending);
f(data);
spin_lock_irqsave(&cwq->lock, flags);
cwq->remove_sequence++;
wake_up(&cwq->work_done);
}
cwq->run_depth--;
spin_unlock_irqrestore(&cwq->lock, flags);
}
以上就是工作队列的全过程,这是一个完美的层次嵌套结构,很完美!如果任何时间想用工作队列解决一些问题那么你只需要做两件事情,一是建立一个工作队列,二是往里面排入任务。
看完了上述代码,下面看一个应用,就是linux的异步io。
static int __init aio_setup(void)
{
kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
aio_wq = create_workqueue("aio");
pr_debug("aio_setup: sizeof(struct page) = %d/n", (int)sizeof(struct page));
return 0;
}
在新内核里,睡眠唤醒和2.4的不同了:为了更加灵活,用回调函数来实现。
#define wake_up(x) __wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL)
void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, nr_exclusive, 0, key);
spin_unlock_irqrestore(&q->lock, flags);
}
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int sync, void *key)
{
struct list_head *tmp, *next;
list_for_each_safe(tmp, next, &q->task_list) {
wait_queue_t *curr;
unsigned flags;
curr = list_entry(tmp, wait_queue_t, task_list);
flags = curr->flags;
if (curr->func(curr, mode, sync, key) &&
(flags & WQ_FLAG_EXCLUSIVE) &&!--nr_exclusive)
break;
}
}
看看curr->func(curr, mode, sync, key) 这一句,wait_queue_t结构体里面加入了一个函数指针。对于一般的唤醒,就是default_wake_function函数,而对于异步io,就是aio_wake_function:
static int aio_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait);
list_del_init(&wait->task_list);
kick_iocb(iocb);
return 1;
}
kiocb结构是与异步io相关的结构,暂且现在不用深究。接着看kick_iocb
void fastcall kick_iocb(struct kiocb *iocb)
{
if (is_sync_kiocb(iocb)) {
kiocbSetKicked(iocb);
wake_up_process(iocb->ki_obj.tsk);
return;
}
try_queue_kicked_iocb(iocb);
}
重点就是try_queue_kicked_iocb:
static void try_queue_kicked_iocb(struct kiocb *iocb)
{
struct kioctx *ctx = iocb->ki_ctx;
unsigned long flags;
int run = 0;
BUG_ON((!list_empty(&iocb->ki_wait.task_list)));
spin_lock_irqsave(&ctx->ctx_lock, flags);
if (!kiocbTryKick(iocb))
run = __queue_kicked_iocb(iocb);
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
if (run)
aio_queue_work(ctx);
}
__queue_kicked_iocb只是和队列相关的操作,而重点是aio_queue_work函数:
static void aio_queue_work(struct kioctx * ctx)
{
unsigned long timeout;
smp_mb();
if (waitqueue_active(&ctx->wait))
timeout = 1;
else
timeout = HZ/10;
queue_delayed_work(aio_wq, &ctx->wq, timeout);
}
queue_delayed_work应该很熟悉了,就是往前面系统初始化的时候建立的工作队列aio_wq里排入具体的任务ctx->wq那么,真正调度到该工作队列的时候该怎么办呢?答案是调用aio_kick_handler,这需要在某个地方初始化这个工作队列元素:
INIT_WORK(&ctx->wq, aio_kick_handler, ctx);
下面看一下这个函数:
static void aio_kick_handler(void *data)
{
struct kioctx *ctx = data;
mm_segment_t oldfs = get_fs();
int requeue;
set_fs(USER_DS);
use_mm(ctx->mm); //切换到提交aio进程的地址空间,因为O_DIRECT的io不需要内核缓存
spin_lock_irq(&ctx->ctx_lock);
requeue =__aio_run_iocbs(ctx);
unuse_mm(ctx->mm);
spin_unlock_irq(&ctx->ctx_lock);
set_fs(oldfs);
if (requeue)
queue_work(aio_wq, &ctx->wq);
}
__aio_run_iocbs(ctx)是真正完成任务的操作
static int __aio_run_iocbs(struct kioctx *ctx)
{
struct kiocb *iocb;
LIST_HEAD(run_list);
assert_spin_locked(&ctx->ctx_lock);
list_splice_init(&ctx->run_list, &run_list);
while (!list_empty(&run_list)) {
iocb = list_entry(run_list.next, struct kiocb,
ki_run_list);
list_del(&iocb->ki_run_list);
iocb->ki_users++; /* grab extra reference */
aio_run_iocb(iocb);
if (__aio_put_req(ctx, iocb)) /* drop extra ref */
put_ioctx(ctx);
}
if (!list_empty(&ctx->run_list))
return 1;
return 0;
}
进一步 aio_run_iocb:
static ssize_t aio_run_iocb(struct kiocb *iocb)
{
struct kioctx *ctx = iocb->ki_ctx;
ssize_t (*retry)(struct kiocb *);
ssize_t ret;
if (iocb->ki_retried++ > 1024*1024) {
…
}
if (!(iocb->ki_retried & 0xff))
…
if (!(retry = iocb->ki_retry))
…
kiocbClearKicked(iocb);
iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
spin_unlock_irq(&ctx->ctx_lock);
if (kiocbIsCancelled(iocb)) {
ret = -EINTR;
aio_complete(iocb, ret, 0);
goto out;
}
BUG_ON(current->io_wait != NULL);
current->io_wait = &iocb->ki_wait;
ret = retry(iocb);
current->io_wait = NULL;
if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
BUG_ON(!list_empty(&iocb->ki_wait.task_list));
aio_complete(iocb, ret, 0);
}
out:
spin_lock_irq(&ctx->ctx_lock);
if (-EIOCBRETRY == ret) {
INIT_LIST_HEAD(&iocb->ki_run_list);
if (kiocbIsKicked(iocb)) {
__queue_kicked_iocb(iocb);
aio_queue_work(ctx);
}
}
return ret;
}
这个函数本质上就是执行retry回调函数,完成没有完成的工作。异步io和普通io的区别就是在于发生组赛的时候并不睡眠或发生调度,而是直接返回一个错误代码,(在AIO下,进程并不会sleep,而是简单地将一个含有特殊func的wait_queue_t挂入等待队列就返回了)。这里应用工作队列的方式令人赞叹。 那么还是看看普通的唤醒函数比较心安:
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
void *key)
{
task_t *p = curr->private;
return try_to_wake_up(p, mode, sync);
}
就是这么简单,简简单单唤醒进程。在信号的传递结束以后,调用
int fastcall wake_up_process(task_t *p)
{
return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
}
唤醒接受信号的进程。下面看一下关于等待队列的事情:
static inline void init_waitqueue_head(wait_queue_head_t *q)
{
spin_lock_init(&q->lock);
INIT_LIST_HEAD(&q->task_list);
}
static inline void INIT_LIST_HEAD(struct list_head *list)
{
list->next = list;
list->prev = list;
}
********************
void fastcall __sched sleep_on(wait_queue_head_t *q)
{
SLEEP_ON_VAR //初始化一个wait_queue_t
current->state = TASK_UNINTERRUPTIBLE;
SLEEP_ON_HEAD //将这个wait_queue_t加入睡眠队列
schedule();
SLEEP_ON_TAIL //睡眠完毕,从睡眠队列清除
}
就是这么简单。