Understanding and Using the Linux Workqueue


Contents

  • Understanding and Using the Linux Workqueue
    • Preface
    • Basic Concepts and Data Structures
      • work and worker
      • workqueue, worker_pool, pool_workqueue
      • kworker threads in top
    • Code Workflow Analysis
      • Initialization
      • Usage
      • Overall Flow
    • Related APIs
    • Example
    • Summary

Preface

The kernel source used in this article is Linux 4.15.2.

Basic Concepts and Data Structures

work and worker

The smallest schedulable unit in the workqueue mechanism is the work item, represented by struct work_struct:

/include/linux/workqueue.h

struct work_struct {
	/*
	 * The low bits hold the work's flag bits; the remaining bits store
	 * either the ID of the worker_pool this work last ran on or a
	 * pointer to its pool_workqueue.
	 */
	atomic_long_t data; 
	struct list_head entry;
	work_func_t func;
#ifdef CONFIG_LOCKDEP
	struct lockdep_map lockdep_map;
#endif
};

A variant of work_struct is delayed_work:

struct delayed_work {
	struct work_struct work;
	struct timer_list timer;

	/* target workqueue and CPU ->timer uses to queue ->work */
	struct workqueue_struct *wq;
	int cpu;
};
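
To illustrate (all names here are hypothetical): a work item is initialized either statically with DECLARE_WORK, or at runtime with INIT_WORK / INIT_DELAYED_WORK, binding a handler of type work_func_t:

#include <linux/workqueue.h>

static void my_func(struct work_struct *work)  /* hypothetical handler */
{
	/* executed later in a kworker thread, in process context */
}

/* static initialization */
static DECLARE_WORK(my_work, my_func);
static DECLARE_DELAYED_WORK(my_dwork, my_func);

static struct work_struct w;

static void my_setup(void)
{
	/* dynamic initialization of an embedded/existing work_struct */
	INIT_WORK(&w, my_func);
}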

A work runs in a kernel thread (shown as kworker in top); in the code this kernel thread is called a worker. A worker is like a worker on an assembly line, and a work is a job handed to that worker.

struct worker is defined in kernel/workqueue_internal.h:

/*
 * The poor guys doing the actual heavy lifting.  All on-duty workers are
 * either serving the manager role, on idle list or on busy hash.  For
 * details on the locking annotation (L, I, X...), refer to workqueue.c.
 *
 * Only to be used in workqueue and async.
 */
struct worker {
	/* on idle list while idle, on busy hash table while busy */
	union {
		struct list_head	entry;	/* L: while idle */
		struct hlist_node	hentry;	/* L: while busy */
	};

	struct work_struct	*current_work;	/* L: work being processed */
	work_func_t		current_func;	/* L: current_work's fn */
	struct pool_workqueue	*current_pwq; /* L: current_work's pwq */
	bool			desc_valid;	/* ->desc is valid */
	struct list_head	scheduled;	/* L: scheduled works */

	/* 64 bytes boundary on 64bit, 32 on 32bit */

	struct task_struct	*task;		/* I: worker task */
	struct worker_pool	*pool;		/* I: the associated pool */
						/* L: for rescuers */
	struct list_head	node;		/* A: anchored at pool->workers */
						/* A: runs through worker->node */

	unsigned long		last_active;	/* L: last active timestamp */
	unsigned int		flags;		/* X: flags */
	int			id;		/* I: worker id */

	/*
	 * Opaque string set with work_set_desc().  Printed out with task
	 * dump for debugging - WARN, BUG, panic or sysrq.
	 */
	char			desc[WORKER_DESC_LEN];

	/* used only by rescuers to point to the target workqueue */
	struct workqueue_struct	*rescue_wq;	/* I: the workqueue to rescue */
};

workqueue, worker_pool, pool_workqueue

The workqueue_struct data structure is the externally visible workqueue; it relays issued work items to the appropriate worker_pool through its pool_workqueues:

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
	struct list_head	pwqs;		/* WR: all pwqs of this wq */
	struct list_head	list;		/* PR: list of all workqueues */

	struct mutex		mutex;		/* protects this wq */
	int			work_color;	/* WQ: current work color */
	int			flush_color;	/* WQ: current flush color */
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
	struct list_head	flusher_queue;	/* WQ: flush waiters */
	struct list_head	flusher_overflow; /* WQ: flush overflow list */

	struct list_head	maydays;	/* MD: pwqs requesting rescue */
	struct worker		*rescuer;	/* I: rescue worker */

	int			nr_drainers;	/* WQ: drain in progress */
	int			saved_max_active; /* WQ: saved pwq max_active */

	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
	struct pool_workqueue	*dfl_pwq;	/* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
	struct wq_device	*wq_dev;	/* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
	struct lockdep_map	lockdep_map;
#endif
	char			name[WQ_NAME_LEN]; /* I: workqueue name */

	/*
	 * Destruction of workqueue_struct is sched-RCU protected to allow
	 * walking the workqueues list without grabbing wq_pool_mutex.
	 * This is used to dump all workqueues from sysrq.
	 */
	struct rcu_head		rcu;

	/* hot fields used during command issue, aligned to cacheline */
	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};

Workqueues were introduced in kernel 2.5, and the original design had the following problems:

  • On large systems with many CPUs, the number of kernel threads grows very large and consumes a significant share of the system's PIDs.
  • With that many kernel threads, concurrency suffers and scheduling is constant; worker threads are also bound one-to-one to CPUs, so CPU resources are not used well.

CMWQ (Concurrency Managed Workqueues), merged during the 2.6 series (in 2.6.36), introduced the concept of worker-thread pools.
In kernel/workqueue.c:

struct worker_pool {
	spinlock_t		lock;		/* the pool lock */
	/* for a BOUND pool, the ID of the associated CPU; -1 if UNBOUND */
	int			cpu;		/* I: the associated cpu */
	/* for UNBOUND pools, the ID of the associated memory (NUMA) node */
	int			node;		/* I: the associated node ID */
	int			id;		/* I: pool ID */
	unsigned int		flags;		/* X: flags */

	unsigned long		watchdog_ts;	/* L: watchdog timestamp */

	struct list_head	worklist;	/* L: list of pending works */
	int			nr_workers;	/* L: total number of workers */

	/* nr_idle includes the ones off idle_list for rebinding */
	int			nr_idle;	/* L: currently idle ones */

	struct list_head	idle_list;	/* X: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

	/* a workers is either on busy_hash or idle_list, or the manager */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */

	/* see manage_workers() for details on the two manager mutexes */
	struct worker		*manager;	/* L: purely informational */
	struct mutex		attach_mutex;	/* attach/detach exclusion */
	struct list_head	workers;	/* A: attached workers */
	struct completion	*detach_completion; /* all workers detached */

	struct ida		worker_ida;	/* worker IDs for task name */

	struct workqueue_attrs	*attrs;		/* I: worker attributes */
	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
	int			refcnt;		/* PL: refcnt for unbound pools */

	/*
	 * The current concurrency level.  As it's likely to be accessed
	 * from other CPUs during try_to_wake_up(), put it in a separate
	 * cacheline.
	 */
	atomic_t		nr_running ____cacheline_aligned_in_smp;

	/*
	 * Destruction of pool is sched-RCU protected to allow dereferences
	 * from get_work_pool().
	 */
	struct rcu_head		rcu;
} ____cacheline_aligned_in_smp;


Each CPU has two worker_pools (one normal priority, one high priority).
CMWQ defines the pool_workqueue data structure to connect a workqueue with a worker_pool:

struct pool_workqueue {
	struct worker_pool	*pool;		/* I: the associated pool */
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	.......
} __aligned(1 << WORK_STRUCT_FLAG_BITS);

kworker threads in top

This is how kworker threads appear in top:

    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND

      1 root      20   0  168172  12120   8324 S   0.0   0.1   0:07.35 systemd
      2 root      20   0       0      0      0 S   0.0   0.0   0:00.10 kthreadd
      6 root       0 -20       0      0      0 I   0.0   0.0   0:00.00 netns
      8 root       0 -20       0      0      0 I   0.0   0.0   0:00.00 kworker/0:0H-kblockd
     12 root      20   0       0      0      0 S   0.0   0.0   0:00.00 rcu_tasks_trace

The kworker creation code is shown below (kthread_create_on_node); once created, the worker is attached to the worker_pool via worker_attach_to_pool(worker, pool):

/**
 * create_worker - create a new workqueue worker
 * @pool: pool the new worker will belong to
 *
 * Create and start a new worker which is attached to @pool.
 *
 * CONTEXT:
 * Might sleep.  Does GFP_KERNEL allocations.
 *
 * Return:
 * Pointer to the newly created worker.
 */
static struct worker *create_worker(struct worker_pool *pool)
{
	struct worker *worker = NULL;
	int id = -1;
	char id_buf[16];

	/* ID is needed to determine kthread name */
	id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto fail;

	worker = alloc_worker(pool->node);
	if (!worker)
		goto fail;

	worker->pool = pool;
	worker->id = id;

	if (pool->cpu >= 0)
		snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
			 pool->attrs->nice < 0  ? "H" : "");
	else
		snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);

	worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
					      "kworker/%s", id_buf);
	if (IS_ERR(worker->task))
		goto fail;

	set_user_nice(worker->task, pool->attrs->nice);
	kthread_bind_mask(worker->task, pool->attrs->cpumask);

	/* successful, attach the worker to the pool */
	worker_attach_to_pool(worker, pool);

	/* start the newly created worker */
	spin_lock_irq(&pool->lock);
	worker->pool->nr_workers++;
	worker_enter_idle(worker);
	wake_up_process(worker->task);
	spin_unlock_irq(&pool->lock);

	return worker;

fail:
	if (id >= 0)
		ida_simple_remove(&pool->worker_ida, id);
	kfree(worker);
	return NULL;
}

Code Workflow Analysis

Initialization

Each CPU defines two worker_pools (NR_STD_WORKER_POOLS = 2):

static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
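
The two pools of a CPU are walked with the for_each_cpu_worker_pool() helper; its definition in workqueue.c is essentially the following (quoted from the 4.15 source, verify against your tree):

#define for_each_cpu_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)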

create_worker was shown above; this section covers when create_worker is called and how the worker_pools are set up, to trace the overall workqueue workflow.

/**
 * workqueue_init - bring workqueue subsystem fully online
 *
 * This is the latter half of two-staged workqueue subsystem initialization
 * and invoked as soon as kthreads can be created and scheduled.
 * Workqueues have been created and work items queued on them, but there
 * are no kworkers executing the work items yet.  Populate the worker pools
 * with the initial workers and enable future kworker creations.
 */
int __init workqueue_init(void)
{
	struct workqueue_struct *wq;
	struct worker_pool *pool;
	int cpu, bkt;

	/*
	 * It'd be simpler to initialize NUMA in workqueue_init_early() but
	 * CPU to node mapping may not be available that early on some
	 * archs such as power and arm64.  As per-cpu pools created
	 * previously could be missing node hint and unbound pools NUMA
	 * affinity, fix them up.
	 */
	wq_numa_init();

	mutex_lock(&wq_pool_mutex);

	for_each_possible_cpu(cpu) {
		for_each_cpu_worker_pool(pool, cpu) {
			pool->node = cpu_to_node(cpu);
		}
	}

	list_for_each_entry(wq, &workqueues, list)
		wq_update_unbound_numa(wq, smp_processor_id(), true);

	mutex_unlock(&wq_pool_mutex);

	/* create the initial workers */
	for_each_online_cpu(cpu) {
		for_each_cpu_worker_pool(pool, cpu) {
			pool->flags &= ~POOL_DISASSOCIATED;
			BUG_ON(!create_worker(pool));
		}
	}

	hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
		BUG_ON(!create_worker(pool));

	wq_online = true;
	wq_watchdog_init();

	return 0;
}

The key step is the loop at the end, which creates the initial workers for every online CPU:

/* create the initial workers */
for_each_online_cpu(cpu) {
	for_each_cpu_worker_pool(pool, cpu) {
		pool->flags &= ~POOL_DISASSOCIATED;
		BUG_ON(!create_worker(pool));
	}
}

workqueue_init creates the kworker kernel threads according to the number of CPUs and the number of worker_pools per CPU (two).
The kernel thread function is as follows:

/**
 * worker_thread - the worker thread function
 * @__worker: self
 *
 * The worker thread function.  All workers belong to a worker_pool -
 * either a per-cpu one or dynamic unbound one.  These workers process all
 * work items regardless of their specific target workqueue.  The only
 * exception is work items which belong to workqueues with a rescuer which
 * will be explained in rescuer_thread().
 *
 * Return: 0
 */
static int worker_thread(void *__worker)
{
	struct worker *worker = __worker;
	struct worker_pool *pool = worker->pool;

	/* tell the scheduler that this is a workqueue worker */
	worker->task->flags |= PF_WQ_WORKER;
woke_up:
	spin_lock_irq(&pool->lock);

	/* am I supposed to die? */
	if (unlikely(worker->flags & WORKER_DIE)) {
		spin_unlock_irq(&pool->lock);
		WARN_ON_ONCE(!list_empty(&worker->entry));
		worker->task->flags &= ~PF_WQ_WORKER;

		set_task_comm(worker->task, "kworker/dying");
		ida_simple_remove(&pool->worker_ida, worker->id);
		worker_detach_from_pool(worker, pool);
		kfree(worker);
		return 0;
	}

	worker_leave_idle(worker);
recheck:
	/* no more worker necessary? */
	if (!need_more_worker(pool))
		goto sleep;

	/* do we need to manage? */
	if (unlikely(!may_start_working(pool)) && manage_workers(worker))
		goto recheck;

	/*
	 * ->scheduled list can only be filled while a worker is
	 * preparing to process a work or actually processing it.
	 * Make sure nobody diddled with it while I was sleeping.
	 */
	WARN_ON_ONCE(!list_empty(&worker->scheduled));

	/*
	 * Finish PREP stage.  We're guaranteed to have at least one idle
	 * worker or that someone else has already assumed the manager
	 * role.  This is where @worker starts participating in concurrency
	 * management if applicable and concurrency management is restored
	 * after being rebound.  See rebind_workers() for details.
	 */
	worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

	do {
		struct work_struct *work =
			list_first_entry(&pool->worklist,
					 struct work_struct, entry);

		pool->watchdog_ts = jiffies;

		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
			/* optimization path, not strictly necessary */
			process_one_work(worker, work);
			if (unlikely(!list_empty(&worker->scheduled)))
				process_scheduled_works(worker);
		} else {
			move_linked_works(work, &worker->scheduled, NULL);
			process_scheduled_works(worker);
		}
	} while (keep_working(pool));

	worker_set_flags(worker, WORKER_PREP);
sleep:
	/*
	 * pool->lock is held and there's no work to process and no need to
	 * manage, sleep.  Workers are woken up only while holding
	 * pool->lock or from local cpu, so setting the current state
	 * before releasing pool->lock is enough to prevent losing any
	 * event.
	 */
	worker_enter_idle(worker);
	__set_current_state(TASK_IDLE);
	spin_unlock_irq(&pool->lock);
	schedule();
	goto woke_up;
}

static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
......
	lockdep_invariant_state(true);
	trace_workqueue_execute_start(work);
	worker->current_func(work); // the work item's handler is called here
......
}

So the initialization flow is:

kernel_init ->
	kernel_init_freeable ->
		workqueue_init->
			create_worker->
				worker_thread->
					process_one_work->
						worker->current_func(work);

Put into words: at kernel boot, workqueue_init runs; it sets up the two worker_pools per CPU (one normal priority, one high priority) and creates a worker thread for each (running the worker_thread function). The thread executes process_one_work, which finally calls worker->current_func(work).

For user-side initialization, call alloc_workqueue; the code of __alloc_workqueue_key is not listed here:

#define alloc_workqueue(fmt, flags, max_active, args...)		\
	__alloc_workqueue_key((fmt), (flags), (max_active),		\
			      NULL, NULL, ##args)

Usage

Here is a usage scenario, from mm/memcontrol.c:

struct workqueue_struct *memcg_kmem_cache_wq;
static int __init mem_cgroup_init(void)
{
	int cpu, node;

#ifndef CONFIG_SLOB
	/*
	 * Kmem cache creation is mostly done with the slab_mutex held,
	 * so use a workqueue with limited concurrency to avoid stalling
	 * all worker threads in case lots of cgroups are created and
	 * destroyed simultaneously.
	 */
	memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); // note: create the workqueue
	BUG_ON(!memcg_kmem_cache_wq);
#endif
........
}

static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
					       struct kmem_cache *cachep)
{
	struct memcg_kmem_cache_create_work *cw;

	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
	if (!cw)
		return;

	css_get(&memcg->css);

	cw->memcg = memcg;
	cw->cachep = cachep;
	INIT_WORK(&cw->work, memcg_kmem_cache_create_func); // note: bind the handler to the work item

	queue_work(memcg_kmem_cache_wq, &cw->work); // note: queue the work
}
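
The matching handler, memcg_kmem_cache_create_func, shows the standard pattern of recovering the containing structure with container_of; this is an abridged sketch of the 4.15 function, not the complete code:

static void memcg_kmem_cache_create_func(struct work_struct *w)
{
	struct memcg_kmem_cache_create_work *cw =
		container_of(w, struct memcg_kmem_cache_create_work, work);

	/* ... create the kmem cache for cw->memcg / cw->cachep ... */

	css_put(&cw->memcg->css);
	kfree(cw); /* the work_struct is embedded in cw and freed by the handler */
}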

Overall Flow

As analyzed above, the system creates workqueues at runtime (a driver calls the alloc_workqueue macro, which creates a workqueue and adds it to the global workqueues list).
When a work is queued on a workqueue, its callback is ultimately processed by a worker thread (worker_thread) in a worker_pool. The worker_pools are shared (a fixed set is created per CPU), so the workqueue has to find a suitable worker_pool, which then dispatches a suitable worker thread. Finding the worker_pool is where pool_workqueue comes in (as mentioned earlier: the externally visible workqueue relays issued work items to the appropriate worker_pool through its pool_workqueues).

For the details of how pool_workqueue is used, see __queue_work in kernel/workqueue.c:

static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work)
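
A simplified sketch of how __queue_work picks a pool_workqueue (paraphrased from the 4.15 source, not the complete function):

	/* per-cpu workqueue: use this CPU's pwq, which links to one of the
	   CPU's two worker_pools (normal or high priority) */
	if (!(wq->flags & WQ_UNBOUND))
		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
	else
		/* unbound workqueue: look the pwq up by NUMA node */
		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));

	/* ... the work is then added to pwq->pool->worklist and an idle
	   worker in that pool is woken to process it ... */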

Related APIs

// creates a workqueue served by one worker thread per CPU
struct workqueue_struct *create_workqueue(const char *name)
// creates a workqueue served by a single worker thread system-wide
struct workqueue_struct *create_singlethread_workqueue(const char *name)

Both of these ultimately call alloc_workqueue -> __alloc_workqueue_key.

Note: create_workqueue creates one worker thread per CPU.

bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay)

bool cancel_work(struct work_struct *work);
bool cancel_work_sync(struct work_struct *work)
bool cancel_delayed_work(struct delayed_work *dwork)
bool cancel_delayed_work_sync(struct delayed_work *dwork)

void flush_workqueue(struct workqueue_struct *queue); // waits for all work queued on the workqueue to complete
void destroy_workqueue(struct workqueue_struct *queue);

Note: all the _sync functions may sleep, so they can only be used in process context.
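
A typical teardown sequence in a driver's exit path might look like this (a sketch; my_wq and my_work are hypothetical names):

static struct workqueue_struct *my_wq;  /* hypothetical private workqueue */
static struct work_struct my_work;      /* hypothetical work item */

static void my_driver_exit(void)
{
	/* wait for a pending/running instance to finish; may sleep,
	   so process context only */
	cancel_work_sync(&my_work);

	/* drain anything still queued, then free the workqueue */
	flush_workqueue(my_wq);
	destroy_workqueue(my_wq);
}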

In most cases you do not need a dedicated workqueue of your own, and since create_workqueue creates one worker thread per CPU, a private queue is often a poor trade (it ties up too many resources). You can use the kernel's shared workqueue instead. This shared workqueue is created at boot by workqueue_init_early(); the global queue is the so-called system_wq. (In pre-CMWQ kernels it was served by a dedicated events/n thread per CPU, n being the CPU number; with CMWQ the queue is still named "events" but is served by the shared kworker pools.) The following APIs place work on the kernel's shared workqueue:

// schedule on the current CPU
bool schedule_work(struct work_struct *work);
bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
// the _on variants target a specific CPU
bool schedule_work_on(int cpu, struct work_struct *work);
bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
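
For instance (hypothetical names), a statically declared work item can be handed to the shared queue like this:

#include <linux/interrupt.h>
#include <linux/workqueue.h>

static void my_handler(struct work_struct *work)  /* hypothetical handler */
{
	pr_info("running on the shared system_wq\n");
}
static DECLARE_WORK(my_shared_work, my_handler);

static irqreturn_t my_irq_handler(int irq, void *dev)
{
	/* defer the heavy lifting out of interrupt context */
	schedule_work(&my_shared_work);
	return IRQ_HANDLED;
}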

The source of schedule_work is below: a thin wrapper around queue_work that runs the work on system_wq.

static inline bool schedule_work(struct work_struct *work)
{
	return queue_work(system_wq, work);
}

system_wq is created in workqueue_init_early():

system_wq = alloc_workqueue("events", 0, 0);

// flushes the shared system workqueue
void flush_scheduled_work(void);

Note that the APIs below belong to the legacy workqueue API and may be removed in newer kernels; the new replacements are recommended. So which are the new ones?

create_workqueue()
create_singlethread_workqueue()
create_freezable_workqueue()

The new APIs are:

#define alloc_workqueue(fmt, flags, max_active, args...)
// creates a workqueue that processes work items one at a time, in queued (FIFO) order
#define alloc_ordered_workqueue(fmt, flags, args...)

How the old APIs map onto the new ones, and with which parameters, can be seen in the source; one mapping is listed here, the others can be found in workqueue.h:

#define create_workqueue(name)						\
	alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))

Comparing the old and new sources, both ultimately rely on the same function: __alloc_workqueue_key.
alloc_workqueue takes max_active and flags parameters. To illustrate max_active:
if max_active is 5, at most 5 work items from this workqueue may execute concurrently on each CPU.
The commonly used flags (see Documentation/core-api/workqueue.rst) are:

  • WQ_UNBOUND: work items are not bound to a specific CPU and are served by the unbound worker pools
  • WQ_FREEZABLE: the workqueue is frozen during system suspend
  • WQ_MEM_RECLAIM: the workqueue is guaranteed a rescuer thread so it can make forward progress under memory pressure
  • WQ_HIGHPRI: work items go to the high-priority worker_pool of the target CPU
  • WQ_CPU_INTENSIVE: work items do not participate in concurrency management
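
For example (hypothetical name), a driver might create an unbound, reclaim-safe queue allowing two concurrent items:

my_wq = alloc_workqueue("my_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 2);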

Example
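
Below is a minimal, self-contained sketch of a module exercising the APIs above. All names (workqueue_demo, demo_wq, demo_work, ...) are made up for illustration: it creates a private workqueue, queues one normal and one delayed work item, and tears everything down on exit.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;  /* private workqueue */
static struct work_struct demo_work;      /* normal work item */
static struct delayed_work demo_dwork;    /* delayed work item */

/* runs in a kworker thread, process context, may sleep */
static void demo_work_func(struct work_struct *work)
{
	pr_info("workqueue_demo: work running on cpu %d\n",
		raw_smp_processor_id());
}

static void demo_dwork_func(struct work_struct *work)
{
	/* recover the containing delayed_work if it is needed */
	struct delayed_work *dwork = to_delayed_work(work);

	pr_info("workqueue_demo: delayed work fired (dwork=%p)\n", dwork);
}

static int __init demo_init(void)
{
	/* max_active = 1: one work item of this queue at a time per CPU */
	demo_wq = alloc_workqueue("demo_wq", 0, 1);
	if (!demo_wq)
		return -ENOMEM;

	INIT_WORK(&demo_work, demo_work_func);
	INIT_DELAYED_WORK(&demo_dwork, demo_dwork_func);

	queue_work(demo_wq, &demo_work);                  /* run as soon as possible */
	queue_delayed_work(demo_wq, &demo_dwork, 2 * HZ); /* run ~2 s later */
	return 0;
}

static void __exit demo_exit(void)
{
	/* the _sync variants may sleep: process context only */
	cancel_work_sync(&demo_work);
	cancel_delayed_work_sync(&demo_dwork);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

After insmod, dmesg should show the normal work message almost immediately and the delayed message about two seconds later; both handlers run in kworker threads, which is why they are allowed to sleep.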

Summary
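
To recap: a work item (work_struct) describes a deferred job; it is executed by a worker (a kworker kernel thread); workers belong to worker_pools (two per CPU, normal and high priority, plus the unbound pools); a workqueue (workqueue_struct) is the front end that drivers queue work onto; and pool_workqueue glues a workqueue to the worker_pools that serve it. For most drivers, schedule_work() on the shared system_wq is enough; create a private queue with alloc_workqueue() only when you need your own flushing or ordering behavior, or flags such as WQ_MEM_RECLAIM, and remember that the _sync cancel and flush APIs may sleep.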
