Linux2.4-net源码学习笔记 IO多路复用之select()

【前言】

int select(int maxfdp,fd_set *readfds,fd_set *writefds,fd_set *errorfds,struct timeval *timeout);

函数描述:

select用于多路IO,所谓多路就是同时监听fd_set *readfds、fd_set *writefds、fd_set *errorfds这三个集合中的fd。函数的返回值int是满足条件的fd个数。另外,select返回后,参数readfds、writefds、errorfds就是select的结果集合:此时select只在其中保留了之前监听的fd中满足条件的那些,不满足条件的fd已被清除。所以可以得到这样一个结论:select的返回值 ret = count_set(readfds) + count_set(writefds) + count_set(errorfds),其中count_set表示集合中被置为1的fd的个数。值得注意的是,同一个fd可能同时出现在三个结果集合(readfds、writefds、errorfds)中,此时它在返回值中会被计数多次。

特殊说明:

A. timeout是select的超时时间,它可以使select处于三种状态,

第一,若将NULL以形参传入,就是将select置于阻塞状态,一定等到监视文件描述符集合中某个文件描述符发生变化为止;

第二,若将时间值设为0秒0微秒(tv_sec和tv_usec均为0),就变成一个纯粹的非阻塞函数,不管文件描述符是否有变化,都立刻返回继续执行,文件无变化返回0,有变化返回一个正值;

第三,timeout的值大于0,这就是等待的超时时间,即select在timeout时间内阻塞,超时时间之内有事件到来就返回,否则在超时后不管怎样一定返回,返回值同上述。注意:select()返回后,timeout中的值为select过程中未使用的剩余时间(这是Linux的行为),所以每次调用select前都需要重新设置timeout。

B.  每个fd_set的最大容量为FD_SETSIZE(1024),也就是说,select只能监听0~1023范围内的fd,超出这个范围的fd无法放入fd_set。


一个实例:

#include<stdio.h>
#include<sys/time.h>
#include<sys/types.h>
#include<unistd.h>
#include <stdlib.h>

/*
 * Demo: watch stdin (fd 0) with select() using a 5 s + 5 us timeout.
 *
 * The first six times stdin is reported readable the data is deliberately
 * left unread, so select() keeps reporting it; from the seventh time on the
 * pending data is read and printed.  After every select() call the remaining
 * time left in tv is printed and tv is re-armed.
 *
 * Returns 0 when stdin reaches end-of-file, EXIT_FAILURE if read() fails.
 */
int main()
{
    fd_set rfds;
    struct timeval tv;
    int retval;
    char buff[255];
    /* Read at most sizeof(buff) - 1 bytes so the terminating NUL always fits. */
    int bsize = sizeof(buff) - 1;
    int n = 0;
    int nr = 0;

    tv.tv_sec = 5;
    tv.tv_usec = 5;

    while(1){
        /* select() overwrites the set, so it has to be rebuilt every round. */
        FD_ZERO(&rfds);
        FD_SET(0,&rfds);
        retval = select(1,&rfds,NULL,NULL,&tv);
        if(retval == 0)
        {
            printf(" NO Data is available now.\n");
        }
        else if(retval == 1)
        {
            printf("Data is available now.\n");
            n++;
            if(n>6){
                nr = read(0,buff,bsize);
                if(nr < 0){
                    /* Never index buff with a negative count. */
                    perror("read");
                    return EXIT_FAILURE;
                }
                if(nr == 0){
                    /* EOF: stdin would stay "readable" forever, so stop. */
                    break;
                }
                buff[nr] = 0;
                printf("Data : %s \n",buff);
            }
            sleep(1);
        }
        else {
            printf("Error\n");
        }
        /* tv_sec / tv_usec are not int; cast and print them as long. */
        printf(" remained time : %ld %ld \n", (long)tv.tv_sec, (long)tv.tv_usec );
        /* Re-arm the timeout: select() has replaced it with the time left. */
        tv.tv_sec = 5;
        tv.tv_usec = 5;
    }
    return 0;
}

【源码分析】

1 sys_select

(fs/select.c)  
1.将struct timeval转换成时钟周期数(jiffies)
2.调用core_sys_select()
3.返回后计算剩余时间,并把它写回用户空间的timeval

/*
 * select() system-call entry point.
 *
 * Converts the optional user-supplied timeval into a timeout expressed in
 * clock ticks, hands the real work to core_sys_select(), and afterwards
 * writes the unused time back to *tvp.
 *
 * @n:              forwarded to core_sys_select() as the fd upper bound
 * @inp/@outp/@exp: user-space read / write / exception fd sets
 * @tvp:            user-space timeout, or NULL (timeout then stays -1)
 *
 * Returns -EFAULT / -EINVAL for an unreadable or negative timeval, otherwise
 * the result of core_sys_select(), with -ERESTARTNOHAND turned into -EINTR
 * on the "sticky" path at the end.
 */
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,  
						   fd_set __user *exp, struct timeval __user *tvp)  
{  
	s64 timeout = -1;  
	struct timeval tv;  
	int ret;  
	
	if (tvp) {/* a timeout was supplied */ 
		if (copy_from_user(&tv, tvp, sizeof(tv)))  
			return -EFAULT;  
		
		if (tv.tv_sec < 0 || tv.tv_usec < 0)/* negative time is invalid */ 
			return -EINVAL;  
		
		/* Cast to u64 to make GCC stop complaining */ 
		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)  
			timeout = -1; /* too large to represent: wait indefinitely */ 
		else {  
			timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);  
			timeout += tv.tv_sec * HZ;/* relative timeout, in clock ticks */ 
		}  
	}  
	
	/* The bulk of the work happens in core_sys_select(). */ 
	/* Arguments: fd upper bound, read set, write set, exception set, */
	/* and the timeout in ticks (in/out). */
	/* (step 2 of the walkthrough) */
	ret = core_sys_select(n, inp, outp, exp, &timeout);  
	
	if (tvp) {/* a timeout was supplied: report the time that is left */ 
		struct timeval rtv;  
		
		if (current->personality & STICKY_TIMEOUTS)/* skip the write-back entirely (original note: a bug-emulation mechanism) */ 
			goto sticky;  
		/* rtv receives the remaining time.  Presumably do_div() leaves the
		 * quotient (whole seconds) in timeout and yields the remainder in
		 * ticks -- confirm against do_div()'s definition. */ 
		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));  
		rtv.tv_sec = timeout;  
		if (timeval_compare(&rtv, &tv) >= 0)/* never report more time left than was requested */ 
			rtv = tv;  
		/* Copy the updated time back to user space. */ 
		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {  
sticky:  
		/* Reached when the copy-out failed or via the sticky jump above:
		 * the timeval was not updated, so -ERESTARTNOHAND (an interrupted
		 * system call) is reported as -EINTR. */
		if (ret == -ERESTARTNOHAND)
			ret = -EINTR;  
		}  
	}  
	return ret;  
}  

2. core_sys_select

1. 将三个查询位图(fd_set)和三个结果位图整合成一张位图 fd_set_bits :[fdset in][fdset out][fdset ex ][fdset res_in][fdset res_out][fdset res_ex]
2. 调用do_select,
3. 将返回的结果集返回到用户空间 

/*
 * Middle layer of select(): build one working area holding six bitmaps
 * ([in][out][ex][res_in][res_out][res_ex]), copy the three request sets in
 * from user space, run do_select(), and copy the three result sets back.
 *
 * @n:              fd upper bound from the caller; clamped to the current
 *                  fd table's max_fds
 * @inp/@outp/@exp: user-space read / write / exception fd sets
 * @timeout:        timeout in ticks, passed straight to do_select()
 *
 * Returns do_select()'s result, or -EINVAL (n < 0), -ENOMEM (no memory for
 * the bitmaps), -ERESTARTNOHAND (nothing ready and a signal is pending),
 * -EFAULT (a result set could not be copied out), or the error of a failed
 * get_fd_set().
 */
static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, s64 *timeout)
{
	/* Small requests are served from the stack to save memory and time
	 * (original note: SELECT_STACK_ALLOC is defined as 256). */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
	void *bits = stack_fds;
	struct fdtable *fdt;
	fd_set_bits fds;
	unsigned int size;
	int max_fds;
	int ret;

	if (n < 0)
		return -EINVAL;

	/* max_fds can increase, so sample it once to avoid a race. */
	rcu_read_lock();
	fdt = files_fdtable(current->files);	/* fd table of the current process */
	max_fds = fdt->max_fds;
	rcu_read_unlock();

	/* Clamp the caller's first argument to what the fd table can hold. */
	if (n > max_fds)
		n = max_fds;

	/*
	 * Six bitmaps are needed (in/out/ex, incoming and outgoing).  If they
	 * do not all fit in the on-stack array, fall back to kmalloc().
	 */
	size = FDS_BYTES(n);
	if (size > sizeof(stack_fds) / 6) {
		bits = kmalloc(6 * size, GFP_KERNEL);
		if (!bits)
			return -ENOMEM;
	}

	/* Carve the buffer into six equal slices. */
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	/* Bring in the request sets; stop at the first failure.
	 * (Original note: get_fd_set() merely wraps copy_from_user().) */
	ret = get_fd_set(n, inp, fds.in);
	if (ret)
		goto out;
	ret = get_fd_set(n, outp, fds.out);
	if (ret)
		goto out;
	ret = get_fd_set(n, exp, fds.ex);
	if (ret)
		goto out;

	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	/* Hand over to do_select(): fd bound, the six bitmaps, the timeout. */
	ret = do_select(n, &fds, timeout);
	if (ret < 0)
		goto out;

	/*
	 * Nothing became ready.  If a signal is pending, report
	 * -ERESTARTNOHAND (which sys_select may turn into -EINTR) and skip
	 * the copy-out; otherwise carry on with ret == 0.
	 */
	if (ret == 0 && signal_pending(current)) {
		ret = -ERESTARTNOHAND;
		goto out;
	}

	/* Copy the result sets back to user space. */
	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	/* Release the buffer only if it came from kmalloc() above. */
	if (bits != stack_fds)
		kfree(bits);
	return ret;
}

3. do_select

1. 重新检查fd集合,并更新最大fd到n
2. for(;;)见图

Linux2.4-net源码学习笔记 IO多路复用之select()_第1张图片


/*
 * Core of select(): poll every requested fd, over and over, until at least
 * one is ready, the timeout is used up, a signal is pending, or the wait
 * table reports an error.
 *
 * @n:       fd upper bound; replaced by the value max_select_fd() returns
 * @fds:     the six bitmaps (in/out/ex requests, res_in/res_out/res_ex results)
 * @timeout: in/out, in clock ticks.  0 means "do not block", a negative
 *           value means "wait indefinitely".  Holds the unused time on return.
 *
 * Returns the number of ready hits summed over the three sets (an fd that is
 * ready in several sets is counted once per set), or a negative error taken
 * from max_select_fd() or table.error.
 */
int do_select(int n, fd_set_bits *fds, s64 *timeout)  
{  
	struct poll_wqueues table;  
	poll_table *wait;  
	int retval, i;  
	
	rcu_read_lock();  
	/* Original note: validates the requested fds against the bitmap of
	 * open fds (every requested fd must be open) and returns the highest fd. */ 
	retval = max_select_fd(n, fds);  
	rcu_read_unlock();  
	
	if (retval < 0)  
		return retval;  
	n = retval;	/* use the recomputed bound from here on */
	
	
	/* Set up the wait table; its poll_table is what gets handed to the
	 * ->poll() methods below. */ 
	poll_initwait(&table);  
	wait = &table.pt;  
	
	/* timeout == 0: a non-blocking query, so no wait table is passed. */
	if (!*timeout)  
		wait = NULL;  
	retval = 0;  
	
	for (;;) /* left when: 1. some fd is ready (r/w/e); 2. timeout; 3. signal or error */
	{
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;  
		long __timeout;  
		
		/* Note: interruptible sleep state. */ 
		set_current_state(TASK_INTERRUPTIBLE);
		
		/* request bitmaps */
		inp = fds->in; outp = fds->out; exp = fds->ex;	
		/* result bitmaps */
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;  
		
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) /* one bitmap word per iteration; i is the fd */ 
		{
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;  
			unsigned long res_in = 0, res_out = 0, res_ex = 0;  
			const struct file_operations *f_op = NULL;  
			struct file *file = NULL;  
			
			/* The bitmaps are walked one unsigned long at a time; fds are
			 * then searched inside that word. */
			in = *inp++; out = *outp++; ex = *exp++; /* fetch the word and advance */
			
			all_bits = in | out | ex;  /* union of the three request sets for this word */
			if (all_bits == 0) {	   /* nothing requested in this word */
				/*  
				Original note: __NFDBITS is (8 * sizeof(unsigned long)),
				i.e. the number of bits in a long, so skipping one word
				advances i by __NFDBITS.
				*/ 
				i += __NFDBITS;			/* skip to the next word */
				continue;  
			}  
			/* bit starts at 1 and is shifted left each step, visiting every
			 * bit of the current word. */
			/* i is not advanced by the outer for; it is advanced here and
			 * always names the fd currently examined. */
			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) /* j: 0 .. __NFDBITS-1 */
			{  
				int fput_needed;  
				if (i >= n)  
					break;  
				
				/* Skip bits that no set asked for. */ 
				if (!(bit & all_bits))  
					continue;  
				
				/* Look up the struct file for fd i (original note: this takes
				 * a reference, f_count).  The bit index i is the fd number itself. */
				file = fget_light(i, &fput_needed);  
				if (file) {  
					f_op = file->f_op;  
					mask = DEFAULT_POLLMASK;  
					
					/* Original note: for a socket fd, f_op->poll is sock_poll.
					 * The second argument is the wait table through which the
					 * process is woken later. (step 4 of the walkthrough) */ 
					if (f_op && f_op->poll)  
						mask = (*f_op->poll)(file, retval ? NULL : wait);
					/* retval ? NULL : wait -- once anything has been found ready
					 * (retval != 0) no wait table is passed any more. */
					/* mask carries the readiness bits reported by the poll method. */
					
					/* Drop the file obtained by fget_light() above. */ 
					fput_light(file, fput_needed);  
					
					/* Record the hits.  retval counts hits across all three
					 * sets, so one fd can add up to three. */ 
					if ((mask & POLLIN_SET) && (in & bit)) { /* readable hit */
						res_in |= bit;  
						retval++;  
					}  
					if ((mask & POLLOUT_SET) && (out & bit)) { /* writable hit */
						res_out |= bit;  
						retval++;  
					}  
					if ((mask & POLLEX_SET) && (ex & bit)) {  /* exception hit */
						res_ex |= bit;  
						retval++;  
					}  
				}  
				
				/*  
				Original author's note: this call only adds a preemption point
				and is a no-op on kernels built with CONFIG_PREEMPT.
				NOTE(review): the interaction with the TASK_INTERRUPTIBLE state
				set above is not visible here -- confirm against cond_resched().
				*/   
				cond_resched();  
			}/* end of per-bit loop */
			
			/* Store this word's hits into the result bitmaps. */ 
			if (res_in)  
				*rinp = res_in;  
			if (res_out)  
				*routp = res_out;  
			if (res_ex)  
				*rexp = res_ex;  
		}/* end of per-word loop */
		
		/* After the first full pass no wait table is passed to ->poll() again. */
		wait = NULL;
		/* Stop when: 1. something is ready; 2. non-blocking or timeout used up
		 * (*timeout == 0); 3. a signal is pending. */
		if (retval || !*timeout || signal_pending(current))
			break;  
		if(table.error) {  
			retval = table.error;  
			break;  
		}  
		
		/*
		Otherwise sleep in schedule_timeout(); __timeout chooses how long.
		*/
		if (*timeout < 0) {  
			/* wait indefinitely */ 
			__timeout = MAX_SCHEDULE_TIMEOUT;  
		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {  
			/* Longer than one schedule_timeout() call allows: sleep in
			 * chunks, reducing *timeout on each trip round the loop. */ 
			__timeout = MAX_SCHEDULE_TIMEOUT - 1;  
			*timeout -= __timeout;  
		} else {  
			/* the remainder fits in a single sleep */ 
			__timeout = *timeout;  
			*timeout = 0;  
		}  
		
		/* Original note: in TASK_INTERRUPTIBLE a signal also ends the sleep;
		 * schedule_timeout() returns the ticks that were not slept, which are
		 * credited back to *timeout. (step 5 of the walkthrough) */ 
		__timeout = schedule_timeout(__timeout);  
		if (*timeout >= 0)  
			*timeout += __timeout;  
	}/* end of for(;;) */
 
	/* back to the running state */ 
	 __set_current_state(TASK_RUNNING);  
	 /* tear down the wait table */ 
	poll_freewait(&table);  
 
	return retval;  
 }  


4. 驱动层代码

不同文件系统的fd对应的底层poll操作自然是不同的。以sockfs为例,fd是一个socket的文件号,那么do_select()中的mask = (*f_op->poll)(file, retval ? NULL : wait);将对应到sock_poll()函数,源码如下:

/*
 * ->poll() file operation for sockets: forward the request to the poll
 * method of the socket's protocol operations.
 *
 * @file: the socket's file; its private_data field is taken to be the
 *        struct socket pointer
 * @wait: wait table, passed through unchanged
 *
 * Returns the mask produced by sock->ops->poll().
 */
static unsigned int sock_poll(struct file *file, poll_table *wait)
{
	/* By convention a socket file keeps its struct socket in private_data. */
	struct socket *sock = file->private_data;

	/* Original note: this resolves to tcp_poll, udp_poll or datagram_poll;
	 * udp_poll is little more than a call to datagram_poll. */
	return sock->ops->poll(file, sock, wait);
}

不同的运输层协议,将sock->ops->poll(file, sock, wait);对应到不同的poll函数,以datagram_poll()为例

/*
 * Generic poll method for datagram sockets.
 *
 * Registers the caller on the socket's wait queue via poll_wait() and
 * builds a mask of POLL* bits from the socket's current state.
 *
 * @file: file the socket is attached to
 * @sock: the socket being polled
 * @wait: wait table handed to poll_wait(); may be NULL
 *
 * Returns the POLL* event mask.
 */
unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;
	/* Hook the current process onto the wait queue sk->sleep points to
	 * (poll_wait() does nothing when wait is NULL). */
	poll_wait(file, sk->sleep, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->err || !skb_queue_empty(&sk->error_queue))
		mask |= POLLERR;
	if (sk->shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? A non-empty receive queue means a datagram can be read;
	 * a receive-side shutdown is reported as readable too. */
	if (!skb_queue_empty(&sk->receive_queue) || (sk->shutdown&RCV_SHUTDOWN))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (connection_based(sk)) {
		if (sk->state==TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? Checks whether the socket's send buffer still has free
	 * space; if not, SOCK_ASYNC_NOSPACE is set on the socket. */
	if (sock_writeable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);

	return mask;
}


sock_writeable()

/*
 * Write policy exposed to user space through poll/select/SIGIO: the socket
 * counts as writable once its free send space reaches SOCK_MIN_WRITE_SPACE.
 * (Original note: the kernel itself does not use this threshold internally.)
 *
 * Returns non-zero when writable, 0 otherwise.
 */
static inline int sock_writeable(struct sock *sk)
{
	const unsigned long free_space = sock_wspace(sk);

	return free_space >= SOCK_MIN_WRITE_SPACE;
}
/*
 * Free space in the socket's send buffer: sndbuf minus the write memory
 * currently allocated, never below zero.  Always 0 once the send side has
 * been shut down.
 */
static inline unsigned long sock_wspace(struct sock *sk)
{
	int space;

	/* No more sending after a send-side shutdown. */
	if (sk->shutdown & SEND_SHUTDOWN)
		return 0;

	space = sk->sndbuf - atomic_read(&sk->wmem_alloc);
	return space < 0 ? 0 : space;
}


/*
 * Register on a wait queue on behalf of a poll method.  Does nothing unless
 * both a poll table and a wait queue head are supplied; otherwise defers to
 * __pollwait().
 */
extern inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (!p || !wait_address)
		return;

	__pollwait(filp, wait_address, p);
}

/*
 * Add the current task to wait_address and remember that in the poll table.
 *
 * Entries live in page-sized poll_table_page blocks chained through ->next.
 * When there is no block yet, or the newest one is full, a fresh page is
 * obtained first; if that fails, p->error is set to -ENOMEM, the task state
 * is reset to TASK_RUNNING and nothing is queued.
 *
 * @filp:         file being polled; get_file() is called on it for the entry
 * @wait_address: wait queue head to join
 * @p:            poll table that owns the entries
 */
void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	struct poll_table_page *tbl = p->table;
	struct poll_table_entry *slot;

	if (!tbl || POLL_TABLE_FULL(tbl)) {
		struct poll_table_page *fresh;

		fresh = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!fresh) {
			p->error = -ENOMEM;
			__set_current_state(TASK_RUNNING);
			return;
		}
		/* Push the empty page onto the front of the chain. */
		fresh->entry = fresh->entries;
		fresh->next = tbl;
		p->table = fresh;
		tbl = fresh;
	}

	/* Claim the next free slot and fill it in. */
	slot = tbl->entry;
	tbl->entry = slot + 1;
	get_file(filp);
	slot->filp = filp;
	slot->wait_address = wait_address;
	init_waitqueue_entry(&slot->wait, current);
	add_wait_queue(wait_address, &slot->wait);
}


【参考】

 www.ibm.com/developerworks

Linux TCP IP 协议栈分析.pdf(已加密) 、Linux2.6协议栈源码分析 

你可能感兴趣的:(linux,select,多路io,sock_poll)