阻塞与非阻塞是设备访问的两种不同的模式。什么是阻塞操作呢?其是指在执行设备操作的时候,如果不能获得资源,则挂起进程直到满足可操作的条件后再进行操作。而非阻塞操作则是在进程不能进行设备操作时,并不挂起到等待队列,而是放弃或者不断地查询,直到能够进行操作。
应用程序以阻塞的方式进行read操作的时候,会调用一个system call,将系统的控制权交给kernel后就进入等待状态,等kernel将这个system call执行完成后向应用程序返回响应,应用程序得到响应后,就退出阻塞状态,并进行后面的工作。
应用程序以非阻塞的方式进行write操作的时候,通过设置文件描述符的属性O_NONBLOCK使其进入非阻塞的访问状态,这时进程也会调用相应的system call,但是system call会立即从kernel中返回。
从表面上看,阻塞状态貌似没有非阻塞的访问方式效率高,事实上却不是这样,非阻塞的访问方式虽然不用等待,会立即返回,可是他不一定就完成了相应的工作,比如上面的例子里面,虽然立即返回,但是数据可能还没有真正的写入文件中,所以说效率的高低并不是表面看上去的那样。
在linux驱动中,可以使用等待队列来实现阻塞进程的唤醒。wait queue以队列为基础数据结构,与进程调度机制紧密结合,能够用于实现内核中的异步事件通知机制等。下面就先看下一些关于等待队列的基本的操作。
定义一个等待队列头并初始化:
wait_queue_head_t my_queue; init_waitqueue_head(&my_queue);或者
DECLARE_WAIT_QUEUE_HEAD(my_queue);下面来看下wait_queue_head_t这个结构体,其中后缀_t表示这是一个通过typedef定义的类型名,是linux中的一种命名规则。
struct __wait_queue_head { spinlock_t lock; struct list_head task_list; }; typedef struct __wait_queue_head wait_queue_head_t;首先是定义了一个lock的自旋锁,后面定义了一个链表。其中看下init_waitqueue_head函数,通过wait_queue_head_t结构体成员,就不难想象里面大概的函数实现:
extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); #define init_waitqueue_head(q) \ do { \ static struct lock_class_key __key; \ \ __init_waitqueue_head((q), &__key); \ } while (0)而__init_waitqueue_head()函数是:
void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) { spin_lock_init(&q->lock); lockdep_set_class(&q->lock, key); INIT_LIST_HEAD(&q->task_list); }其大概也就是初始化自旋锁以及链表等单元。而DECLARE_WAIT_QUEUE_HEAD的函数原型是:
#define DECLARE_WAIT_QUEUE_HEAD(name) \ wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .task_list = { &(name).task_list, &(name).task_list } }
定义等待队列项用DECLARE_WAITQUEUE宏来实现:
DECLARE_WAITQUEUE(name,tsk);其定义了一个名为name的等待队列
#define DECLARE_WAITQUEUE(name, tsk) \ wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)其中
typedef struct __wait_queue wait_queue_t; struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 void *private; wait_queue_func_t func; struct list_head task_list; };
#define __WAITQUEUE_INITIALIZER(name, tsk) { \ .private = tsk, \ .func = default_wake_function, \ .task_list = { NULL, NULL } }
flags:它的值为WQ_FLAG_EXCLUSIVE或者0,它说明等待的进程是否是互斥的,当为WQ_FLAG_EXCLUSIVE时表示互斥;
private:一般用来指向等待进程的task_struct实例;
func:其唤醒等待进程;
task_list:用于链接等待队列中的进程
下面看下添加和移除等待队列的API函数:
void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; wait->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); __add_wait_queue(q, wait); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(add_wait_queue);其意思就是将wait等待队列加入到q的等待队列头中。再看其中的__add_wait_queue函数:
static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) { list_add(&new->task_list, &head->task_list); }这样很容易看出,wait是如何挂到q上面去的。同样的:
void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); __remove_wait_queue(q, wait); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(remove_wait_queue);而__remove_wait_queue函数
static inline void __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) { list_del(&old->task_list); }这样看就很简单了。
下面介绍的是等待事件函数,其就是依据condition条件是否满足来选择是否返回或者阻塞等待。
wait_event(wq, condition) wait_event_timeout(wq, condition, timeout) wait_event_interruptible(wq, condition) wait_event_interruptible_timeout(wq, condition, timeout)下面依次来看上面函数的实现过程:
/** * wait_event - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event(wq, condition) \ do { \ if (condition) \ break; \ __wait_event(wq, condition); \ } while (0)其不难看出,当condition为真时,函数立即返回,否则等待条件为真。
#define __wait_event(wq, condition) \ do { \ DEFINE_WAIT(__wait); \ \ for (;;) { \ prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ if (condition) \ break; \ schedule(); \ } \ finish_wait(&wq, &__wait); \ } while (0)
这里首先是定义了一个等待队列项__wait:
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
#define DEFINE_WAIT_FUNC(name, function) \ wait_queue_t name = { \ .private = current, \ .func = function, \ .task_list = LIST_HEAD_INIT((name).task_list), \ }其中,.private = current表示等待队列项指向当前的进程;.func = function 就是该等待队列项的唤醒函数。
下面就进入循环,开始是prepare_to_wait函数,这个函数的作用是将等待队列项__wait插入到等待队列头wq中,然后将进程状态设置为TASK_UNINTERRUPTIBLE,即该阻塞状态不能被信号打断,而TASK_INTERRUPTIBLE状态可以被信号打断唤醒。然后再检查一次condition,当condition刚好为真时函数立即返回,否则调用schedule()函数使得进程睡眠,执行schedule()进行了进程的切换以后,直到进程被唤醒才会调度该进程。for循环是等进程被唤醒后再一次检查condition条件是否满足,防止同时唤醒的进程已经抢先占据了资源。最后finish_wait将进程状态属性改为TASK_RUNNING,并且将进程从等待队列中删除。下面看下实现过程:
/*
 * prepare_to_wait - queue a wait entry (if not already queued) and set the
 * current task's state.
 * @q:     wait queue head to sleep on
 * @wait:  wait queue entry describing the current task
 * @state: task state to switch to (e.g. TASK_INTERRUPTIBLE)
 *
 * The entry is always marked non-exclusive.  Both the list insertion and
 * the state change happen while q->lock is held with interrupts disabled.
 * The function returns nothing, so the return type is spelled out as
 * void (an omitted return type is not valid C99).
 */
void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
	unsigned long flags;

	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
	spin_lock_irqsave(&q->lock, flags);
	/* Only link the entry if it is not already on a list. */
	if (list_empty(&wait->task_list))
		__add_wait_queue(q, wait);
	set_current_state(state);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait);
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; __set_current_state(TASK_RUNNING); /* * We can check for list emptiness outside the lock * IFF: * - we use the "careful" check that verifies both * the next and prev pointers, so that there cannot * be any half-pending updates in progress on other * CPU's that we haven't seen yet (and that might * still change the stack area. * and * - all other users take the lock (ie we can only * have _one_ other CPU that looks at or modifies * the list). */ if (!list_empty_careful(&wait->task_list)) { spin_lock_irqsave(&q->lock, flags); list_del_init(&wait->task_list); spin_unlock_irqrestore(&q->lock, flags); } } EXPORT_SYMBOL(finish_wait);下面看一下wait_event_timeout()函数的实现,timeout就是阻塞等待的超时时间,单位是jiffy,当timeout达到以后,不论condition是否满足函数都会返回。
#define wait_event_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ if (!(condition)) \ __wait_event_timeout(wq, condition, __ret); \ __ret; \ })
#define __wait_event_timeout(wq, condition, ret) \ do { \ DEFINE_WAIT(__wait); \ \ for (;;) { \ prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ if (condition) \ break; \ ret = schedule_timeout(ret); \ if (!ret) \ break; \ } \ finish_wait(&wq, &__wait); \ } while (0)其和前面的区别就在于多了一个timeout条件,schedule_timeout()函数设置了一个ret的时钟值,他首先调用schedule()函数,进程睡眠,但是每次时钟中断的时候它都会检测时钟值是否到期,当时钟到期后则返回,正常的返回值是0。
剩余的两个wait()函数过程都一样,这里列出实现过程:
#define wait_event_interruptible(wq, condition) \ ({ \ int __ret = 0; \ if (!(condition)) \ __wait_event_interruptible(wq, condition, __ret); \ __ret; \ })
#define __wait_event_interruptible(wq, condition, ret) \ do { \ DEFINE_WAIT(__wait); \ \ for (;;) { \ prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ if (condition) \ break; \ if (!signal_pending(current)) { \ schedule(); \ continue; \ } \ ret = -ERESTARTSYS; \ break; \ } \ finish_wait(&wq, &__wait); \ } while (0)其中wait_event_interruptible()函数是将进程属性设置为TASK_INTERRUPTIBLE,可以被信号唤醒,signal_pending(current)函数是判断是否是信号唤醒的。是的话直接返回-ERESTARTSYS。
#define wait_event_interruptible_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ if (!(condition)) \ __wait_event_interruptible_timeout(wq, condition, __ret); \ __ret; \ })
#define __wait_event_interruptible_timeout(wq, condition, ret) \ do { \ wait_queue_t __wait; \ init_waitqueue_entry(&__wait, current); \ add_wait_queue(&wq, &__wait); \ for (;;) { \ set_current_state(TASK_INTERRUPTIBLE); \ if (condition) \ break; \ if (!signal_pending(current)) { \ ret = schedule_timeout(ret); \ if (!ret) \ break; \ continue; \ } \ ret = -ERESTARTSYS; \ break; \ } \ current->state = TASK_RUNNING; \ remove_wait_queue(&wq, &__wait); \ } while (0)
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) { q->flags = 0; q->private = p; q->func = default_wake_function; }default_wake_function是内核中的一个默认的唤醒函数。
下面来看一下唤醒函数,常用的有:
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)唤醒函数主要是唤醒属于指定等待队列头的所有等待队列中等待进程。可以看出,其实质都是调用__wake_up()函数,只是传递的参数不同而已。
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); __wake_up_common(q, mode, nr_exclusive, 0, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(__wake_up);其中
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int wake_flags, void *key) { wait_queue_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; if (curr->func(curr, mode, wake_flags, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } }list_for_each_entry_safe将遍历整个等待队列链表,在if语句中,func是该等待队列项的唤醒函数(默认为default_wake_function),是将curr进程通过mode方式唤醒,唤醒成功后再比较该项是否是互斥形式,如果是的话再将需要唤醒的互斥进程的数量减1(nr_exclusive是需唤醒的互斥进程的数量),减到0时跳出循环。if语句中&&的短路顺序保证了只有被成功唤醒的互斥项才会被计数。在遍历的过程中通常会先唤醒非互斥的,然后才会唤醒互斥进程,这是由链表中的顺序决定的:非互斥的等待队列项由add_wait_queue()插入到链表头部(见前面的list_add),而互斥的等待队列项一般是被插入到链表尾部的。
通过上面的分析,对于等待队列的阻塞以及唤醒已经很清楚了,下面还有一套sleep()函数,其目的是使进程在等待队列上睡眠,如:
sleep_on(wait_queue_head_t *q) interruptible_sleep_on(wait_queue_head_t *q)sleep_on函数是将进程的状态设置为TASK_UNINTERRUPTIBLE,并且将它附属到等待队列头q上,直到获得资源,q引导的等待队列被唤醒。
而interruptible_sleep_on函数是将进程设置为TASK_INTERRUPTIBLE。
sleep_on与wake_up、interruptible_sleep_on与wake_up_interruptible都是成对出现使用的。
void __sched sleep_on(wait_queue_head_t *q) { sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(sleep_on);
void __sched interruptible_sleep_on(wait_queue_head_t *q) { sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(interruptible_sleep_on);其核心都是sleep_on_common()函数,只是传递的参数不同。
static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { unsigned long flags; wait_queue_t wait; init_waitqueue_entry(&wait, current); __set_current_state(state); spin_lock_irqsave(&q->lock, flags); __add_wait_queue(q, &wait); spin_unlock(&q->lock); timeout = schedule_timeout(timeout); spin_lock_irq(&q->lock); __remove_wait_queue(q, &wait); spin_unlock_irqrestore(&q->lock, flags); return timeout; }实现思想与前面所说的都差不多,代码也比较简单,这里就不详细分析了。
在许多的设备驱动中,并不调用sleep_on()或interruptible_sleep_on(),而是亲自进行进程的状态改变和切换,这样代码的效率比较高,下面我们根据前面的globalmem虚拟字符设备驱动的例子来进行改进,增加队列等待机制,可以对照之前的代码来看加入阻塞访问前后的区别。
首先定义设备结构体,添加了r_wait和w_wait两个读写的等待队列头:
struct globalmem_dev{ struct cdev cdev; unsigned int current_len; unsigned char mem[GLOBALMEM_SIZE]; struct semaphore sem; wait_queue_head_t r_wait; wait_queue_head_t w_wait; };自然还需要在初始化模块里面进行初始化:
int globalmem_init(void) { int result; dev_t devno=MKDEV(globalmem_major,0); if(globalmem_major) result=register_chrdev_region(devno,1,"globalmem"); else{ result=alloc_chrdev_region(&devno,0,1,"globalmem"); globalmem_major=MAJOR(devno); } if(result<0) return result; globalmem_devp = kmalloc(sizeof(struct globalmem_dev),GFP_KERNEL); if(!globalmem_devp){ result = -ENOMEM; goto fail_malloc; } memset(&globalmem_devp,0,sizeof(struct globalmem_dev)); globalmem_setup_cdev(&globalmem_devp,0); init_MUTEX(&globalmem_devp->sem); init_waitqueue_head(&globalfifo_devp->r_wait); //初始化读等待队列 init_waitqueue_head(&globalfifo_devp->w_wait); //初始化写等待队列 return 0; fail_malloc: unregister_chrdev_region(devno,1); return result; }下面在继续修改读写模块:
/*
 * globalmem_read - blocking/non-blocking read from the device buffer.
 * @filp:  open file; private_data holds the struct globalmem_dev
 * @buf:   user-space destination buffer
 * @count: maximum number of bytes requested
 * @ppos:  file position (not used by this implementation)
 *
 * While the buffer is empty the caller either gets -EAGAIN (O_NONBLOCK)
 * or sleeps in TASK_INTERRUPTIBLE on r_wait.  After data has been copied
 * out, the remaining bytes are moved to the start of the buffer and
 * writers sleeping on w_wait are woken.
 *
 * Returns the number of bytes read, -EAGAIN, -ERESTARTSYS if a signal is
 * pending after waking up, or -EFAULT if the copy to user space fails.
 */
static ssize_t globalmem_read(struct file *filp, char __user *buf,
			      size_t count, loff_t *ppos)
{
	int ret = 0;
	struct globalmem_dev *dev = filp->private_data;
	DECLARE_WAITQUEUE(wait, current);

	down(&dev->sem);
	add_wait_queue(&dev->r_wait, &wait);

	/* Wait until there is something to read. */
	while (dev->current_len == 0) {
		if (filp->f_flags & O_NONBLOCK) {
			ret = -EAGAIN;
			goto out;
		}
		__set_current_state(TASK_INTERRUPTIBLE);
		/* Release the semaphore before sleeping. */
		up(&dev->sem);

		schedule();
		if (signal_pending(current)) {
			/* The semaphore is not held here, so skip the up(). */
			ret = -ERESTARTSYS;
			goto out2;
		}

		down(&dev->sem);
	}

	if (count > dev->current_len)
		count = dev->current_len;

	if (copy_to_user(buf, dev->mem, count)) {
		ret = -EFAULT;
		goto out;
	}

	/* Source and destination overlap, so memmove() is required. */
	memmove(dev->mem, dev->mem + count, dev->current_len - count);
	dev->current_len -= count;
	printk(KERN_INFO "read %zu bytes(s),current_len:%u\n", count,
	       dev->current_len);

	/* Space has been freed: wake up sleeping writers. */
	wake_up_interruptible(&dev->w_wait);
	ret = count;

out:
	up(&dev->sem);
out2:
	remove_wait_queue(&dev->r_wait, &wait);
	set_current_state(TASK_RUNNING);
	return ret;
}
/*
 * globalmem_write - blocking/non-blocking write into the device buffer.
 * @filp:  open file; private_data holds the struct globalmem_dev
 * @buf:   user-space source buffer
 * @count: number of bytes the caller wants to write
 * @ppos:  file position (not used by this implementation)
 *
 * While the buffer is full the caller either gets -EAGAIN (O_NONBLOCK)
 * or sleeps in TASK_INTERRUPTIBLE on w_wait.  The request is truncated to
 * the free space left in dev->mem, whose capacity is GLOBALMEM_SIZE.
 * After data has been stored, readers sleeping on r_wait are woken.
 *
 * Returns the number of bytes written, -EAGAIN, -ERESTARTSYS if a signal
 * is pending after waking up, or -EFAULT if the copy from user space
 * fails.
 */
static ssize_t globalmem_write(struct file *filp, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	int ret = 0;
	struct globalmem_dev *dev = filp->private_data;
	DECLARE_WAITQUEUE(wait, current);

	down(&dev->sem);
	add_wait_queue(&dev->w_wait, &wait);

	/* Wait until there is room to write. */
	while (dev->current_len == GLOBALMEM_SIZE) {
		if (filp->f_flags & O_NONBLOCK) {
			ret = -EAGAIN;
			goto out;
		}
		__set_current_state(TASK_INTERRUPTIBLE);
		/* Release the semaphore before sleeping. */
		up(&dev->sem);

		schedule();
		if (signal_pending(current)) {
			/* The semaphore is not held here, so skip the up(). */
			ret = -ERESTARTSYS;
			goto out2;
		}

		down(&dev->sem);
	}

	if (count > GLOBALMEM_SIZE - dev->current_len)
		count = GLOBALMEM_SIZE - dev->current_len;

	if (copy_from_user(dev->mem + dev->current_len, buf, count)) {
		ret = -EFAULT;
		goto out;
	}

	dev->current_len += count;
	printk(KERN_INFO "written %zu bytes(s),current_len:%u\n", count,
	       dev->current_len);

	/* Data is available: wake up sleeping readers. */
	wake_up_interruptible(&dev->r_wait);
	ret = count;

out:
	up(&dev->sem);
out2:
	remove_wait_queue(&dev->w_wait, &wait);
	set_current_state(TASK_RUNNING);
	return ret;
}
其并没有调用sleep_on()等函数,选择自己设置状态以及进程的切换等动作。将上述的过程用wait_event_interruptible()函数替换的话,可能会出现死锁的状态,可以自己思考一下这个过程。上面读缓冲区的数据需要在写函数中唤醒r_wait,才可以进行读的操作,而进行写的过程需要在读函数中唤醒w_wait才可以写入。