select系统调用

您查询的关键词是: core_sys_select 。如果打开速度慢,可以尝试 快速版;如果想保存快照,可以 添加到搜藏;如果想更新或删除快照,可以 投诉快照。
(百度和网页 http://blog.163.com/yuanbor/blog/static/566746202011438630998/的作者无关,不对其内容负责。百度快照谨为网络故障时之索引,不代表被搜索网站的即时页面。)
博客  
发现
小组
风格
 
 
群博客召集令
创建博客 登录  
 关注

SmallOfficeHomeOffic

Operating System and Architecture

  • 首页
  • 日志
  • 相册
  • 音乐
  • 收藏
  • 博友
  • 关于我
 
 

日志

 
 

Linux-2.6.25 select系统调用源码分析  

2011-05-03 20:06:30|  分类: Linux内核精华|字号 订阅

Linux 2.6.25中的select系统调用主要有4个函数: 
sys_select:处理时间参数,调用core_sys_select。 
core_sys_select:处理三个fd_set参数,调用do_select。 
do_select:做select/poll的工作。在合适的时机把自己挂起等待,调用sock_poll。 
sock_poll:用函数指针分派到具体的协议层函数tcp_poll、udp_poll、datagram_poll。 
层层分工明确,我也要多学习这种方式啊。 
C代码

  1. /* 
  2. sys_select (fs/select.c) 
  3. Handles the timeout (if any): converts the struct timeval into clock ticks, calls core_sys_select, then works out the remaining time and writes it back to the caller 
  4. */ 
  5. asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,  
  6.                            fd_set __user *exp, struct timeval __user *tvp)  
  7. {  
  8.     s64 timeout = -1;  
  9.     struct timeval tv;  
  10.     int ret;  
  11.     if (tvp) {/* a timeout was supplied */ 
  12.         if (copy_from_user(&tv, tvp, sizeof(tv)))  
  13.             return -EFAULT;  
  14.         if (tv.tv_sec < 0 || tv.tv_usec < 0)/* invalid time */ 
  15.             return -EINVAL;  
  16.         /* Cast to u64 to make GCC stop complaining */ 
  17.         if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)  
  18.             timeout = -1;   /* wait indefinitely */ 
  19.         else {  
  20.             timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);  
  21.             timeout += tv.tv_sec * HZ;/* relative timeout, expressed in clock ticks */ 
  22.         }  
  23.     }  
  24.     /* core_sys_select does the main work */ 
  25.     ret = core_sys_select(n, inp, outp, exp, &timeout);  
  26.     if (tvp) {/* a timeout was supplied */ 
  27.         struct timeval rtv;  
  28.         if (current->personality & STICKY_TIMEOUTS)/* described by the author as a bug-emulation mechanism; skips the timeval update */ 
  29.             goto sticky;  
  30.         /* rtv receives the remaining time: do_div leaves the seconds in timeout and returns the leftover ticks */ 
  31.         rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));  
  32.         rtv.tv_sec = timeout;  
  33.         if (timeval_compare(&rtv, &tv) >= 0)/* never report more remaining time than was originally requested */ 
  34.             rtv = tv;  
  35.         /* copy the updated time back to user space */ 
  36.         if (copy_to_user(tvp, &rtv, sizeof(rtv))) {  
  37. sticky:  
  38.             /* 
  39.             * If an application puts its timeval in read-only 
  40.             * memory, we don't want the Linux-specific update to 
  41.             * the timeval to cause a fault after the select has 
  42.             * completed successfully. However, because we're not 
  43.             * updating the timeval, we can't restart the system 
  44.             * call. 
  45.             */ 
  46.             if (ret == -ERESTARTNOHAND)/* ERESTARTNOHAND marks an interrupted system call */ 
  47.                 ret = -EINTR;  
  48.         }  
  49.     }  
  50.     return ret;  
  51. }  
  52. /*core_sys_select 
  53. Prepares the six bitmaps for do_select, calls do_select, then copies the result sets back to user space 
  54. */ 
  55. static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,  
  56.                            fd_set __user *exp, s64 *timeout)  
  57. {  
  58.     fd_set_bits fds;  
  59.     void *bits;  
  60.     int ret, max_fds;  
  61.     unsigned int size;  
  62.     struct fdtable *fdt;  
  63.     /* Allocate small arguments on the stack to save memory and be faster */ 
  64.     /* per the author, SELECT_STACK_ALLOC is defined as 256 */ 
  65.     long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];  
  66.     ret = -EINVAL;  
  67.     if (n < 0)  
  68.         goto out_nofds;  
  69.     /* max_fds can increase, so grab it once to avoid race */ 
  70.     rcu_read_lock();  
  71.     fdt = files_fdtable(current->files);/* file descriptor table of the current process */ 
  72.     max_fds = fdt->max_fds;  
  73.     rcu_read_unlock();  
  74.     if (n > max_fds)/* clamp the user-supplied first argument (descriptor limit of the fd_sets) to max_fds */ 
  75.         n = max_fds;  
  76.     /* 
  77.     * We need 6 bitmaps (in/out/ex for both incoming and outgoing), 
  78.     * since we used fdset we need to allocate memory in units of 
  79.     * long-words.  
  80.     */ 
  81.     /* 
  82.     If the on-stack stack_fds array cannot hold all the bitmaps, a larger buffer is obtained with kmalloc. 
  83.     The buffer is then split into 6 equal parts and the fds structure is initialised to point at them 
  84.     */ 
  85.     size = FDS_BYTES(n);  
  86.     bits = stack_fds;  
  87.     if (size > sizeof(stack_fds) / 6) {  
  88.         /* Not enough space in on-stack array; must use kmalloc */ 
  89.         ret = -ENOMEM;  
  90.         bits = kmalloc(6 * size, GFP_KERNEL);  
  91.         if (!bits)  
  92.             goto out_nofds;  
  93.     }  
  94.     fds.in      = bits;  
  95.     fds.out     = bits +   size;  
  96.     fds.ex      = bits + 2*size;  
  97.     fds.res_in  = bits + 3*size;  
  98.     fds.res_out = bits + 4*size;  
  99.     fds.res_ex  = bits + 5*size;  
  100.     /* per the author, get_fd_set simply uses copy_from_user to bring each fd_set in from user space */ 
  101.     if ((ret = get_fd_set(n, inp, fds.in)) ||  
  102.         (ret = get_fd_set(n, outp, fds.out)) ||  
  103.         (ret = get_fd_set(n, exp, fds.ex)))  
  104.         goto out;  
  105.     zero_fd_set(n, fds.res_in);  
  106.     zero_fd_set(n, fds.res_out);  
  107.     zero_fd_set(n, fds.res_ex);  
  108.     /* 
  109.     Hand over to do_select 
  110.     */ 
  111.     ret = do_select(n, &fds, timeout);  
  112.     if (ret < 0)  
  113.         goto out;  
  114.     /* do_select returned 0: no descriptor was reported ready */ 
  115.     if (!ret) {  
  116.         /* sys_select may turn ERESTARTNOHAND into EINTR before returning; EINTR means the system call was interrupted */ 
  117.         ret = -ERESTARTNOHAND;  
  118.         if (signal_pending(current))/* true when the current process has a signal to handle, which matches EINTR semantics */ 
  119.             goto out;  
  120.         ret = 0;  
  121.     }  
  122.     /* copy the result sets back to user space */ 
  123.     if (set_fd_set(n, inp, fds.res_in) ||  
  124.         set_fd_set(n, outp, fds.res_out) ||  
  125.         set_fd_set(n, exp, fds.res_ex))  
  126.         ret = -EFAULT;  
  127. out:  
  128.     if (bits != stack_fds)  
  129.         kfree(bits);/* pairs with the kmalloc above */ 
  130. out_nofds:  
  131.     return ret;  
  132. }  
  133. /*do_select 
  134. The real select work happens here: walks every requested fd and calls the file's poll function 
  135. */ 
  136. int do_select(int n, fd_set_bits *fds, s64 *timeout)  
  137. {  
  138.     struct poll_wqueues table;  
  139.     poll_table *wait;  
  140.     int retval, i;  
  141.     rcu_read_lock();  
  142.     /* per the author, max_select_fd checks the requested fds against the open-fd bitmap (each must be open) and returns the highest fd */ 
  143.     retval = max_select_fd(n, fds);  
  144.     rcu_read_unlock();  
  145.     if (retval < 0)  
  146.         return retval;  
  147.     n = retval;  
  148.     /* initialise this call's poll wait-queue table and point wait at its poll_table */ 
  149.     poll_initwait(&table);  
  150.     wait = &table.pt;  
  151.     if (!*timeout)  
  152.         wait = NULL;  
  153.     retval = 0;  
  154.     for (;;) {/* loops until one of the breaks below */ 
  155.         unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;  
  156.         long __timeout;  
  157.         /* note: interruptible sleep state */ 
  158.         set_current_state(TASK_INTERRUPTIBLE);  
  159.         inp = fds->in; outp = fds->out; exp = fds->ex;  
  160.         rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;  
  161.         for (i = 0; i < n; ++rinp, ++routp, ++rexp) {/* walk all fds, one bitmap word per iteration */ 
  162.             unsigned long in, out, ex, all_bits, bit = 1, mask, j;  
  163.             unsigned long res_in = 0, res_out = 0, res_ex = 0;  
  164.             const struct file_operations *f_op = NULL;  
  165.             struct file *file = NULL;  
  166.             in = *inp++; out = *outp++; ex = *exp++;  
  167.             all_bits = in | out | ex;  
  168.             if (all_bits == 0) {  
  169.                 /* 
  170.                 Per the author, __NFDBITS is defined as (8 * sizeof(unsigned long)), i.e. the number of bits in a long. 
  171.                 One long covers __NFDBITS descriptors, so skipping to the next bitmap word advances i by __NFDBITS 
  172.                 */ 
  173.                 i += __NFDBITS;  
  174.                 continue;  
  175.             }  
  176.             for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {  
  177.                 int fput_needed;  
  178.                 if (i >= n)  
  179.                     break;  
  180.                 /* test each bit */ 
  181.                 if (!(bit & all_bits))  
  182.                     continue;  
  183.                 /* look up the struct file for fd i; fput_needed is handed back to fput_light below */ 
  184.                 file = fget_light(i, &fput_needed);  
  185.                 if (file) {  
  186.                     f_op = file->f_op;  
  187.                     mask = DEFAULT_POLLMASK;  
  188.                     /* for socket descriptors f_op->poll is sock_poll. 
  189.                     Note the second argument is the wait table (NULL once something is already ready); per the author it is what later wakes this process */ 
  190.                     if (f_op && f_op->poll)  
  191.                         mask = (*f_op->poll)(file, retval ? NULL : wait);  
  192.                     /* release the struct file pointer; per the author this drops the f_count reference */ 
  193.                     fput_light(file, fput_needed);  
  194.                     /* record the poll result; the count of ready entries is what gets returned, hence retval++. 
  195.                     Note: retval is the total across the in, out and ex sets */ 
  196.                     if ((mask & POLLIN_SET) && (in & bit)) {  
  197.                         res_in |= bit;  
  198.                         retval++;  
  199.                     }  
  200.                     if ((mask & POLLOUT_SET) && (out & bit)) {  
  201.                         res_out |= bit;  
  202.                         retval++;  
  203.                     }  
  204.                     if ((mask & POLLEX_SET) && (ex & bit)) {  
  205.                         res_ex |= bit;  
  206.                         retval++;  
  207.                     }  
  208.                 }  
  209.                 /* 
  210.                 Note the earlier set_current_state(TASK_INTERRUPTIBLE); 
  211.                 author's note: because the task is already TASK_INTERRUPTIBLE, cond_resched will let other processes run, 
  212.                 and its only purpose here is to add a preemption point; after being preempted the task is woken through the wait-queue mechanism. 
  213.                 Per the author, on preemptible kernels (CONFIG_PREEMPT defined) cond_resched is a no-op -- TODO confirm 
  214.                 */   
  215.                 cond_resched();  
  216.             }  
  217.             /* write the poll results for this word into the output bitmaps */ 
  218.             if (res_in)  
  219.                 *rinp = res_in;  
  220.             if (res_out)  
  221.                 *routp = res_out;  
  222.             if (res_ex)  
  223.                 *rexp = res_ex;  
  224.         }  
  225.         wait = NULL;  
  226.         if (retval || !*timeout || signal_pending(current))/* stop on ready fds, a zero timeout, or a pending signal */ 
  227.             break;  
  228.         if(table.error) {  
  229.             retval = table.error;  
  230.             break;  
  231.         }  
  232.         if (*timeout < 0) {  
  233.             /* wait indefinitely */ 
  234.             __timeout = MAX_SCHEDULE_TIMEOUT;  
  235.         } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {  
  236.             /* timeout exceeds what schedule_timeout accepts (MAX_SCHEDULE_TIMEOUT): sleep in chunks, reducing *timeout on each pass of the loop */ 
  237.             __timeout = MAX_SCHEDULE_TIMEOUT - 1;  
  238.             *timeout -= __timeout;  
  239.         } else {  
  240.             /* sleep for the whole remaining timeout */ 
  241.             __timeout = *timeout;  
  242.             *timeout = 0;  
  243.         }  
  244.         /* in TASK_INTERRUPTIBLE state a task sleeping in schedule_timeout gets scheduled again when a signal arrives, 
  245.         i.e. schedule_timeout returns, and its return value is the number of ticks left 
  246.         */ 
  247.         __timeout = schedule_timeout(__timeout);  
  248.         if (*timeout >= 0)  
  249.             *timeout += __timeout;  
  250.     }  
  251.     /* back to the running state */ 
  252.     __set_current_state(TASK_RUNNING);  
  253.     /* tear down the wait queues */ 
  254.     poll_freewait(&table);  
  255.     return retval;  
  256. }  
  257. static unsigned int sock_poll(struct file *file, poll_table *wait)  
  258. {  
  259.     struct socket *sock;  
  260.     /*约定socket的file->private_data字段放着对应的socket结构指针*/ 
  261.     sock = file->private_data;  
  262.     /*对应了三个协议的函数tcp_poll,udp_poll,datagram_poll,其中udp_poll几乎直接调用了datagram_poll 
  263.     累了,先休息一下,这三个函数以后分析*/ 
  264.     return sock->ops->poll(file, sock, wait);  
  265. /*
  266. sys_select (fs/select.c)
  267. Handles the timeout (if any): converts the struct timeval into clock ticks, calls core_sys_select, then works out the remaining time and writes it back to the caller
  268. */
  269. asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
  270.                                                    fd_set __user *exp, struct timeval __user *tvp)
  271. {
  272.         s64 timeout = -1;
  273.         struct timeval tv;
  274.         int ret;
  275.         if (tvp) {/* a timeout was supplied */
  276.                 if (copy_from_user(&tv, tvp, sizeof(tv)))
  277.                         return -EFAULT;
  278.                 if (tv.tv_sec < 0 || tv.tv_usec < 0)/* invalid time */
  279.                         return -EINVAL;
  280.                 /* Cast to u64 to make GCC stop complaining */
  281.                 if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
  282.                         timeout = -1;        /* wait indefinitely */
  283.                 else {
  284.                         timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
  285.                         timeout += tv.tv_sec * HZ;/* relative timeout, expressed in clock ticks */
  286.                 }
  287.         }
  288.         /* core_sys_select does the main work */
  289.         ret = core_sys_select(n, inp, outp, exp, &timeout);
  290.         if (tvp) {/* a timeout was supplied */
  291.                 struct timeval rtv;
  292.                 if (current->personality & STICKY_TIMEOUTS)/* described by the author as a bug-emulation mechanism; skips the timeval update */
  293.                         goto sticky;
  294.                 /* rtv receives the remaining time: do_div leaves the seconds in timeout and returns the leftover ticks */
  295.                 rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
  296.                 rtv.tv_sec = timeout;
  297.                 if (timeval_compare(&rtv, &tv) >= 0)/* never report more remaining time than was originally requested */
  298.                         rtv = tv;
  299.                 /* copy the updated time back to user space */
  300.                 if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
  301. sticky:
  302.                         /*
  303.                         * If an application puts its timeval in read-only
  304.                         * memory, we don't want the Linux-specific update to
  305.                         * the timeval to cause a fault after the select has
  306.                         * completed successfully. However, because we're not
  307.                         * updating the timeval, we can't restart the system
  308.                         * call.
  309.                         */
  310.                         if (ret == -ERESTARTNOHAND)/* ERESTARTNOHAND marks an interrupted system call */
  311.                                 ret = -EINTR;
  312.                 }
  313.         }
  314.         return ret;
  315. }
  316. /*core_sys_select
  317. Prepares the six bitmaps for do_select, calls do_select, then copies the result sets back to user space
  318. */
  319. static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
  320.                                                    fd_set __user *exp, s64 *timeout)
  321. {
  322.         fd_set_bits fds;
  323.         void *bits;
  324.         int ret, max_fds;
  325.         unsigned int size;
  326.         struct fdtable *fdt;
  327.         /* Allocate small arguments on the stack to save memory and be faster */
  328.         /* per the author, SELECT_STACK_ALLOC is defined as 256 */
  329.         long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
  330.         ret = -EINVAL;
  331.         if (n < 0)
  332.                 goto out_nofds;
  333.         /* max_fds can increase, so grab it once to avoid race */
  334.         rcu_read_lock();
  335.         fdt = files_fdtable(current->files);/* file descriptor table of the current process */
  336.         max_fds = fdt->max_fds;
  337.         rcu_read_unlock();
  338.         if (n > max_fds)/* clamp the user-supplied first argument (descriptor limit of the fd_sets) to max_fds */
  339.                 n = max_fds;
  340.         /*
  341.         * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
  342.         * since we used fdset we need to allocate memory in units of
  343.         * long-words.
  344.         */
  345.         /*
  346.         If the on-stack stack_fds array cannot hold all the bitmaps, a larger buffer is obtained with kmalloc.
  347.         The buffer is then split into 6 equal parts and the fds structure is initialised to point at them
  348.         */
  349.         size = FDS_BYTES(n);
  350.         bits = stack_fds;
  351.         if (size > sizeof(stack_fds) / 6) {
  352.                 /* Not enough space in on-stack array; must use kmalloc */
  353.                 ret = -ENOMEM;
  354.                 bits = kmalloc(6 * size, GFP_KERNEL);
  355.                 if (!bits)
  356.                         goto out_nofds;
  357.         }
  358.         fds.in      = bits;
  359.         fds.out     = bits +   size;
  360.         fds.ex      = bits + 2*size;
  361.         fds.res_in  = bits + 3*size;
  362.         fds.res_out = bits + 4*size;
  363.         fds.res_ex  = bits + 5*size;
  364.         /* per the author, get_fd_set simply uses copy_from_user to bring each fd_set in from user space */
  365.         if ((ret = get_fd_set(n, inp, fds.in)) ||
  366.                 (ret = get_fd_set(n, outp, fds.out)) ||
  367.                 (ret = get_fd_set(n, exp, fds.ex)))
  368.                 goto out;
  369.         zero_fd_set(n, fds.res_in);
  370.         zero_fd_set(n, fds.res_out);
  371.         zero_fd_set(n, fds.res_ex);
  372.         /*
  373.         Hand over to do_select
  374.         */
  375.         ret = do_select(n, &fds, timeout);
  376.         if (ret < 0)
  377.                 goto out;
  378.         /* do_select returned 0: no descriptor was reported ready */
  379.         if (!ret) {
  380.                 /* sys_select may turn ERESTARTNOHAND into EINTR before returning; EINTR means the system call was interrupted */
  381.                 ret = -ERESTARTNOHAND;
  382.                 if (signal_pending(current))/* true when the current process has a signal to handle, which matches EINTR semantics */
  383.                         goto out;
  384.                 ret = 0;
  385.         }
  386.         /* copy the result sets back to user space */
  387.         if (set_fd_set(n, inp, fds.res_in) ||
  388.                 set_fd_set(n, outp, fds.res_out) ||
  389.                 set_fd_set(n, exp, fds.res_ex))
  390.                 ret = -EFAULT;
  391. out:
  392.         if (bits != stack_fds)
  393.                 kfree(bits);/* pairs with the kmalloc above */
  394. out_nofds:
  395.         return ret;
  396. }
  397. /*do_select
  398. The real select work happens here: walks every requested fd and calls the file's poll function
  399. */
  400. int do_select(int n, fd_set_bits *fds, s64 *timeout)
  401. {
  402.         struct poll_wqueues table;
  403.         poll_table *wait;
  404.         int retval, i;
  405.         rcu_read_lock();
  406.         /* per the author, max_select_fd checks the requested fds against the open-fd bitmap (each must be open) and returns the highest fd */
  407.         retval = max_select_fd(n, fds);
  408.         rcu_read_unlock();
  409.         if (retval < 0)
  410.                 return retval;
  411.         n = retval;
  412.         /* initialise this call's poll wait-queue table and point wait at its poll_table */
  413.         poll_initwait(&table);
  414.         wait = &table.pt;
  415.         if (!*timeout)
  416.                 wait = NULL;
  417.         retval = 0;
  418.         for (;;) {/* loops until one of the breaks below */
  419.                 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
  420.                 long __timeout;
  421.                 /* note: interruptible sleep state */
  422.                 set_current_state(TASK_INTERRUPTIBLE);
  423.                 inp = fds->in; outp = fds->out; exp = fds->ex;
  424.                 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
  425.                 for (i = 0; i < n; ++rinp, ++routp, ++rexp) {/* walk all fds, one bitmap word per iteration */
  426.                         unsigned long in, out, ex, all_bits, bit = 1, mask, j;
  427.                         unsigned long res_in = 0, res_out = 0, res_ex = 0;
  428.                         const struct file_operations *f_op = NULL;
  429.                         struct file *file = NULL;
  430.                         in = *inp++; out = *outp++; ex = *exp++;
  431.                         all_bits = in | out | ex;
  432.                         if (all_bits == 0) {
  433.                                 /*
  434.                                 Per the author, __NFDBITS is defined as (8 * sizeof(unsigned long)), i.e. the number of bits in a long.
  435.                                 One long covers __NFDBITS descriptors, so skipping to the next bitmap word advances i by __NFDBITS
  436.                                 */
  437.                                 i += __NFDBITS;
  438.                                 continue;
  439.                         }
  440.                         for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
  441.                                 int fput_needed;
  442.                                 if (i >= n)
  443.                                         break;
  444.                                 /* test each bit */
  445.                                 if (!(bit & all_bits))
  446.                                         continue;
  447.                                 /* look up the struct file for fd i; fput_needed is handed back to fput_light below */
  448.                                 file = fget_light(i, &fput_needed);
  449.                                 if (file) {
  450.                                         f_op = file->f_op;
  451.                                         mask = DEFAULT_POLLMASK;
  452.                                         /* for socket descriptors f_op->poll is sock_poll.
  453.                                         Note the second argument is the wait table (NULL once something is already ready); per the author it is what later wakes this process */
  454.                                         if (f_op && f_op->poll)
  455.                                                 mask = (*f_op->poll)(file, retval ? NULL : wait);
  456.                                         /* release the struct file pointer; per the author this drops the f_count reference */
  457.                                         fput_light(file, fput_needed);
  458.                                         /* record the poll result; the count of ready entries is what gets returned, hence retval++.
  459.                                         Note: retval is the total across the in, out and ex sets */
  460.                                         if ((mask & POLLIN_SET) && (in & bit)) {
  461.                                                 res_in |= bit;
  462.                                                 retval++;
  463.                                         }
  464.                                         if ((mask & POLLOUT_SET) && (out & bit)) {
  465.                                                 res_out |= bit;
  466.                                                 retval++;
  467.                                         }
  468.                                         if ((mask & POLLEX_SET) && (ex & bit)) {
  469.                                                 res_ex |= bit;
  470.                                                 retval++;
  471.                                         }
  472.                                 }
  473.                                 /*
  474.                                 Note the earlier set_current_state(TASK_INTERRUPTIBLE);
  475.                                 author's note: because the task is already TASK_INTERRUPTIBLE, cond_resched will let other processes run,
  476.                                 and its only purpose here is to add a preemption point; after being preempted the task is woken through the wait-queue mechanism.
  477.                                 Per the author, on preemptible kernels (CONFIG_PREEMPT defined) cond_resched is a no-op -- TODO confirm
  478.                                 */
  479.                                 cond_resched();
  480.                         }
  481.                         /* write the poll results for this word into the output bitmaps */
  482.                         if (res_in)
  483.                                 *rinp = res_in;
  484.                         if (res_out)
  485.                                 *routp = res_out;
  486.                         if (res_ex)
  487.                                 *rexp = res_ex;
  488.                 }
  489.                 wait = NULL;
  490.                 if (retval || !*timeout || signal_pending(current))/* stop on ready fds, a zero timeout, or a pending signal */
  491.                         break;
  492.                 if(table.error) {
  493.                         retval = table.error;
  494.                         break;
  495.                 }
  496.                 if (*timeout < 0) {
  497.                         /* wait indefinitely */
  498.                         __timeout = MAX_SCHEDULE_TIMEOUT;
  499.                 } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
  500.                         /* timeout exceeds what schedule_timeout accepts (MAX_SCHEDULE_TIMEOUT): sleep in chunks, reducing *timeout on each pass of the loop */
  501.                         __timeout = MAX_SCHEDULE_TIMEOUT - 1;
  502.                         *timeout -= __timeout;
  503.                 } else {
  504.                         /* sleep for the whole remaining timeout */
  505.                         __timeout = *timeout;
  506.                         *timeout = 0;
  507.                 }
  508.                 /* in TASK_INTERRUPTIBLE state a task sleeping in schedule_timeout gets scheduled again when a signal arrives,
  509.                 i.e. schedule_timeout returns, and its return value is the number of ticks left
  510.                 */
  511.                 __timeout = schedule_timeout(__timeout);
  512.                 if (*timeout >= 0)
  513.                         *timeout += __timeout;
  514.         }
  515.         /* back to the running state */
  516.         __set_current_state(TASK_RUNNING);
  517.         /* tear down the wait queues */
  518.         poll_freewait(&table);
  519.         return retval;
  520. }
  521. static unsigned int sock_poll(struct file *file, poll_table *wait) /* poll entry for socket files: forwards to the poll function in the socket's ops table */
  522. {
  523.         struct socket *sock;
  524.         /* by convention a socket file's private_data field holds the pointer to its struct socket */
  525.         sock = file->private_data;
  526.         /* per the author this dispatches to one of the protocol functions tcp_poll, udp_poll or datagram_poll (udp_poll calls datagram_poll almost directly);
  527.         those three functions are left for a later analysis */
  528.         return sock->ops->poll(file, sock, wait);
  529. }

复制代码

其他重要函数一览 
static int max_select_fd(unsigned long n, fd_set_bits *fds) 
返回fd_set中已经打开、并且小于用户指定最大值的最大的fd 
static inline int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) 
从用户空间拷贝fd_set到内核 
static inline void zero_fd_set(unsigned long nr, unsigned long *fdset) 
把fd_set清零 
static inline unsigned long __must_check set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) 
把fd_set拷贝回用户空间 
static inline int signal_pending(struct task_struct *p) 
判断进程p当前是否有信号需要处理,有则返回真 
struct file *fget_light(unsigned int fd, int *fput_needed) 
由fd得到其对应的file结构指针,并增加其引用计数 
static inline void fput_light(struct file *file, int fput_needed) 
释放由fget_light得到的file结构指针,减少其引用计数 
set_current_state 
设置当前进程的状态 
static inline int cond_resched(void) 
判断是否有进程需要抢占当前进程,如果是将立即发生调度。就是额外增加一个抢占点。 
signed long __sched schedule_timeout(signed long timeout) 
当前进程睡眠timeout个jiffies 
rcu_read_lock 
rcu_read_unlock 
Linux 2.6新加入的rcu锁。读锁的加锁、解锁函数 
参考http://www.ibm.com/developerworks/cn/linux/l-rcu 
poll_freewait 
poll_initwait 
poll_wait 
... 
和文件IO,poll机制有关的几个函数,参考《Linux设备驱动(第三版)》6.3 
tcp_poll 
udp_poll 
datagram_poll 
协议层的poll函数

分享到:          
阅读(224) |  评论(0) |  引用  (0)  | 举报

最近读者

评论

 
 
公司简介  -  联系方法  -  招聘信息  -  客户服务  -  隐私政策  -  博客风格  -  手机博客  -  订阅此博客

网易公司版权所有 ©1997-2011

您查询的关键词是: core_sys_select 。如果打开速度慢,可以尝试 快速版;如果想保存快照,可以 添加到搜藏;如果想更新或删除快照,可以 投诉快照。
(百度和网页 http://blog.163.com/yuanbor/blog/static/566746202011438630998/的作者无关,不对其内容负责。百度快照谨为网络故障时之索引,不代表被搜索网站的即时页面。)
博客  
发现
小组
风格
 
 
群博客召集令
创建博客 登录  
 关注

SmallOfficeHomeOffic

Operating System and Architecture

  • 首页
  • 日志
  • 相册
  • 音乐
  • 收藏
  • 博友
  • 关于我
 
 

日志

 
 

Linux-2.6.25 select系统调用源码分析  

2011-05-03 20:06:30|  分类: Linux内核精华|字号 订阅

Linux 2.6.25中的select系统调用主要有4个函数: 
sys_select:处理时间参数,调用core_sys_select。 
core_sys_select:处理三个fd_set参数,调用do_select。 
do_select:做select/poll的工作。在合适的时机把自己挂起等待,调用sock_poll。 
sock_poll:用函数指针分派到具体的协议层函数tcp_poll、udp_poll、datagram_poll。 
层层分工明确,我也要多学习这种方式啊。 
C代码

  1. /* 
  2. sys_select (fs/select.c) 
  3. Handles the timeout (if any): converts the struct timeval into clock ticks, calls core_sys_select, then works out the remaining time and writes it back to the caller 
  4. */ 
  5. asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,  
  6.                            fd_set __user *exp, struct timeval __user *tvp)  
  7. {  
  8.     s64 timeout = -1;  
  9.     struct timeval tv;  
  10.     int ret;  
  11.     if (tvp) {/* a timeout was supplied */ 
  12.         if (copy_from_user(&tv, tvp, sizeof(tv)))  
  13.             return -EFAULT;  
  14.         if (tv.tv_sec < 0 || tv.tv_usec < 0)/* invalid time */ 
  15.             return -EINVAL;  
  16.         /* Cast to u64 to make GCC stop complaining */ 
  17.         if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)  
  18.             timeout = -1;   /* wait indefinitely */ 
  19.         else {  
  20.             timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);  
  21.             timeout += tv.tv_sec * HZ;/* relative timeout, expressed in clock ticks */ 
  22.         }  
  23.     }  
  24.     /* core_sys_select does the main work */ 
  25.     ret = core_sys_select(n, inp, outp, exp, &timeout);  
  26.     if (tvp) {/* a timeout was supplied */ 
  27.         struct timeval rtv;  
  28.         if (current->personality & STICKY_TIMEOUTS)/* described by the author as a bug-emulation mechanism; skips the timeval update */ 
  29.             goto sticky;  
  30.         /* rtv receives the remaining time: do_div leaves the seconds in timeout and returns the leftover ticks */ 
  31.         rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));  
  32.         rtv.tv_sec = timeout;  
  33.         if (timeval_compare(&rtv, &tv) >= 0)/* never report more remaining time than was originally requested */ 
  34.             rtv = tv;  
  35.         /* copy the updated time back to user space */ 
  36.         if (copy_to_user(tvp, &rtv, sizeof(rtv))) {  
  37. sticky:  
  38.             /* 
  39.             * If an application puts its timeval in read-only 
  40.             * memory, we don't want the Linux-specific update to 
  41.             * the timeval to cause a fault after the select has 
  42.             * completed successfully. However, because we're not 
  43.             * updating the timeval, we can't restart the system 
  44.             * call. 
  45.             */ 
  46.             if (ret == -ERESTARTNOHAND)/* ERESTARTNOHAND marks an interrupted system call */ 
  47.                 ret = -EINTR;  
  48.         }  
  49.     }  
  50.     return ret;  
  51. }  
  52. /*core_sys_select 
  53. Prepares the six bitmaps for do_select, calls do_select, then copies the result sets back to user space 
  54. */ 
  55. static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,  
  56.                            fd_set __user *exp, s64 *timeout)  
  57. {  
  58.     fd_set_bits fds;  
  59.     void *bits;  
  60.     int ret, max_fds;  
  61.     unsigned int size;  
  62.     struct fdtable *fdt;  
  63.     /* Allocate small arguments on the stack to save memory and be faster */ 
  64.     /* per the author, SELECT_STACK_ALLOC is defined as 256 */ 
  65.     long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];  
  66.     ret = -EINVAL;  
  67.     if (n < 0)  
  68.         goto out_nofds;  
  69.     /* max_fds can increase, so grab it once to avoid race */ 
  70.     rcu_read_lock();  
  71.     fdt = files_fdtable(current->files);/* file descriptor table of the current process */ 
  72.     max_fds = fdt->max_fds;  
  73.     rcu_read_unlock();  
  74.     if (n > max_fds)/* clamp the user-supplied first argument (descriptor limit of the fd_sets) to max_fds */ 
  75.         n = max_fds;  
  76.     /* 
  77.     * We need 6 bitmaps (in/out/ex for both incoming and outgoing), 
  78.     * since we used fdset we need to allocate memory in units of 
  79.     * long-words.  
  80.     */ 
  81.     /* 
  82.     If the on-stack stack_fds array cannot hold all the bitmaps, a larger buffer is obtained with kmalloc. 
  83.     The buffer is then split into 6 equal parts and the fds structure is initialised to point at them 
  84.     */ 
  85.     size = FDS_BYTES(n);  
  86.     bits = stack_fds;  
  87.     if (size > sizeof(stack_fds) / 6) {  
  88.         /* Not enough space in on-stack array; must use kmalloc */ 
  89.         ret = -ENOMEM;  
  90.         bits = kmalloc(6 * size, GFP_KERNEL);  
  91.         if (!bits)  
  92.             goto out_nofds;  
  93.     }  
  94.     fds.in      = bits;  
  95.     fds.out     = bits +   size;  
  96.     fds.ex      = bits + 2*size;  
  97.     fds.res_in  = bits + 3*size;  
  98.     fds.res_out = bits + 4*size;  
  99.     fds.res_ex  = bits + 5*size;  
  100.     /* per the author, get_fd_set simply uses copy_from_user to bring each fd_set in from user space */ 
  101.     if ((ret = get_fd_set(n, inp, fds.in)) ||  
  102.         (ret = get_fd_set(n, outp, fds.out)) ||  
  103.         (ret = get_fd_set(n, exp, fds.ex)))  
  104.         goto out;  
  105.     zero_fd_set(n, fds.res_in);  
  106.     zero_fd_set(n, fds.res_out);  
  107.     zero_fd_set(n, fds.res_ex);  
  108.     /* 
  109.     Hand over to do_select 
  110.     */ 
  111.     ret = do_select(n, &fds, timeout);  
  112.     if (ret < 0)  
  113.         goto out;  
  114.     /* do_select returned 0: no descriptor was reported ready */ 
  115.     if (!ret) {  
  116.         /* sys_select may turn ERESTARTNOHAND into EINTR before returning; EINTR means the system call was interrupted */ 
  117.         ret = -ERESTARTNOHAND;  
  118.         if (signal_pending(current))/* true when the current process has a signal to handle, which matches EINTR semantics */ 
  119.             goto out;  
  120.         ret = 0;  
  121.     }  
  122.     /* copy the result sets back to user space */ 
  123.     if (set_fd_set(n, inp, fds.res_in) ||  
  124.         set_fd_set(n, outp, fds.res_out) ||  
  125.         set_fd_set(n, exp, fds.res_ex))  
  126.         ret = -EFAULT;  
  127. out:  
  128.     if (bits != stack_fds)  
  129.         kfree(bits);/* pairs with the kmalloc above */ 
  130. out_nofds:  
  131.     return ret;  
  132. }  
  133. /* do_select: the core of select(). Validates the requested sets with max_select_fd(), then repeatedly scans every requested fd, 
  134. calling its f_op->poll and recording ready bits in fds->res_*; between scans it sleeps in schedule_timeout(). Returns the number of ready bits counted over the in/out/ex sets (0 on timeout or pending signal), a negative value from max_select_fd(), or table.error. 
  135. */ 
  136. int do_select(int n, fd_set_bits *fds, s64 *timeout)  
  137. {  
  138.     struct poll_wqueues table;  
  139.     poll_table *wait;  
  140.     int retval, i;  
  141.     rcu_read_lock();  
  142.     /* max_select_fd() checks the requested fds against the open-fd bitmap; per the author every requested fd must be open and the highest fd is returned (a negative result is returned to the caller below) */ 
  143.     retval = max_select_fd(n, fds);  
  144.     rcu_read_unlock();  
  145.     if (retval < 0)  
  146.         return retval;  
  147.     n = retval;  
  148.     /* set up the poll wait-queue bookkeeping in table; wait points at its poll_table and is handed to each poll method (NULL when *timeout is 0) */ 
  149.     poll_initwait(&table);  
  150.     wait = &table.pt;  
  151.     if (!*timeout)  
  152.         wait = NULL;  
  153.     retval = 0;  
  154.     for (;;) {/* loops until one of the break statements below */ 
  155.         unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;  
  156.         long __timeout;  
  157.         /* note: interruptible sleep state, set before the scan starts */ 
  158.         set_current_state(TASK_INTERRUPTIBLE);  
  159.         inp = fds->in; outp = fds->out; exp = fds->ex;  
  160.         rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;  
  161.         for (i = 0; i < n; ++rinp, ++routp, ++rexp) {/* walk the sets one unsigned long at a time; i is advanced inside the body */ 
  162.             unsigned long in, out, ex, all_bits, bit = 1, mask, j;  
  163.             unsigned long res_in = 0, res_out = 0, res_ex = 0;  
  164.             const struct file_operations *f_op = NULL;  
  165.             struct file *file = NULL;  
  166.             in = *inp++; out = *outp++; ex = *exp++;  
  167.             all_bits = in | out | ex;  
  168.             if (all_bits == 0) {  
  169.                 /* 
  170.                 __NFDBITS is defined as (8 * sizeof(unsigned long)), i.e. the number of bits in a long (per the author; definition not shown here). 
  171.                 One long covers __NFDBITS descriptors, so when no bit of this word is requested i skips ahead by __NFDBITS. 
  172.                 */ 
  173.                 i += __NFDBITS;  
  174.                 continue;  
  175.             }  
  176.             for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {  
  177.                 int fput_needed;  
  178.                 if (i >= n)  
  179.                     break;  
  180.                 /* skip bits that are not requested in any of the three sets */ 
  181.                 if (!(bit & all_bits))  
  182.                     continue;  
  183.                 /* get the struct file for fd i (per the author this takes a reference, f_count) */ 
  184.                 file = fget_light(i, &fput_needed);  
  185.                 if (file) {  
  186.                     f_op = file->f_op;  
  187.                     mask = DEFAULT_POLLMASK;  
  188.                     /* for socket descriptors f_op->poll is sock_poll (per the author). 
  189.                     The second argument is the poll_table (wait-queue hook); NULL is passed once retval is non-zero, i.e. something is already ready */ 
  190.                     if (f_op && f_op->poll)  
  191.                         mask = (*f_op->poll)(file, retval ? NULL : wait);  
  192.                     /* release the file obtained above (per the author this drops the f_count reference) */ 
  193.                     fput_light(file, fput_needed);  
  194.                     /* record ready bits according to the poll mask; retval counts them. 
  195.                     Note: retval is the total over all three sets (in, out, ex) */ 
  196.                     if ((mask & POLLIN_SET) && (in & bit)) {  
  197.                         res_in |= bit;  
  198.                         retval++;  
  199.                     }  
  200.                     if ((mask & POLLOUT_SET) && (out & bit)) {  
  201.                         res_out |= bit;  
  202.                         retval++;  
  203.                     }  
  204.                     if ((mask & POLLEX_SET) && (ex & bit)) {  
  205.                         res_ex |= bit;  
  206.                         retval++;  
  207.                     }  
  208.                 }  
  209.                 /* 
  210.                 set_current_state(TASK_INTERRUPTIBLE) was already called above. 
  211.                 cond_resched() adds a voluntary rescheduling point after each examined fd. 
  212.                 NOTE(review): the author states that a task scheduled away here is woken again through the wait-queue mechanism, 
  213.                 and that cond_resched is a no-op on CONFIG_PREEMPT kernels - neither can be confirmed from this listing. 
  214.                 */   
  215.                 cond_resched();  
  216.             }  
  217.             /* store this word's results into the output bitmaps (only when non-zero) */ 
  218.             if (res_in)  
  219.                 *rinp = res_in;  
  220.             if (res_out)  
  221.                 *routp = res_out;  
  222.             if (res_ex)  
  223.                 *rexp = res_ex;  
  224.         }  
  225.         wait = NULL;  
  226.         if (retval || !*timeout || signal_pending(current))/* stop when something is ready, the timeout is zero/used up, or a signal is pending */ 
  227.             break;  
  228.         if(table.error) {  
  229.             retval = table.error;  
  230.             break;  
  231.         }  
  232.         if (*timeout < 0) {  
  233.             /* negative timeout: wait indefinitely */ 
  234.             __timeout = MAX_SCHEDULE_TIMEOUT;  
  235.         } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {  
  236.             /* timeout too large for a single schedule_timeout() call: sleep MAX_SCHEDULE_TIMEOUT - 1 now and keep the rest in *timeout for later iterations */ 
  237.             __timeout = MAX_SCHEDULE_TIMEOUT - 1;  
  238.             *timeout -= __timeout;  
  239.         } else {  
  240.             /* the remaining timeout fits in a single sleep */ 
  241.             __timeout = *timeout;  
  242.             *timeout = 0;  
  243.         }  
  244.         /* in TASK_INTERRUPTIBLE state a task sleeping in schedule_timeout gets to run again when a signal arrives (per the author); 
  245.         schedule_timeout returns the remaining ticks, which are added back to *timeout below unless it is negative 
  246.         */ 
  247.         __timeout = schedule_timeout(__timeout);  
  248.         if (*timeout >= 0)  
  249.             *timeout += __timeout;  
  250.     }  
  251.     /* back to the running state */ 
  252.     __set_current_state(TASK_RUNNING);  
  253.     /* tear down the wait-queue bookkeeping set up by poll_initwait */ 
  254.     poll_freewait(&table);  
  255.     return retval;  
  256. }  
  257. static unsigned int sock_poll(struct file *file, poll_table *wait)  
  258. {  
  259.     struct socket *sock;  
  260.     /* by convention a socket's file->private_data holds the pointer to its struct socket */ 
  261.     sock = file->private_data;  
  262.     /* poll method for socket files: forwards to the protocol's poll operation; per the author these are tcp_poll, udp_poll and datagram_poll (udp_poll being a thin wrapper around datagram_poll). 
  263.     Those three functions are not analysed here. */ 
  264.     return sock->ops->poll(file, sock, wait);  /* NOTE(review): the closing brace of this function is missing from this copy of the listing */
  265. /*
  266. sys_select (fs/select.c): entry point of the select system call.
  267. Converts the optional struct timeval timeout into clock ticks, calls core_sys_select, then writes the remaining time back to user space. Returns core_sys_select's result, -EFAULT/-EINVAL for an unreadable or negative timeval, or -EINTR in place of -ERESTARTNOHAND when the timeval could not be updated.
  268. */
  269. asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
  270.                                                    fd_set __user *exp, struct timeval __user *tvp)
  271. {
  272.         s64 timeout = -1;
  273.         struct timeval tv;
  274.         int ret;
  275.         if (tvp) {/* a timeout was supplied */
  276.                 if (copy_from_user(&tv, tvp, sizeof(tv)))
  277.                         return -EFAULT;
  278.                 if (tv.tv_sec < 0 || tv.tv_usec < 0)/* negative fields are invalid */
  279.                         return -EINVAL;
  280.                 /* Cast to u64 to make GCC stop complaining */
  281.                 if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
  282.                         timeout = -1;        /* wait indefinitely */
  283.                 else {
  284.                         timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
  285.                         timeout += tv.tv_sec * HZ;/* relative timeout, in clock ticks */
  286.                 }
  287.         }
  288.         /* the real work is done in core_sys_select */
  289.         ret = core_sys_select(n, inp, outp, exp, &timeout);
  290.         if (tvp) {/* a timeout was supplied */
  291.                 struct timeval rtv;
  292.                 if (current->personality & STICKY_TIMEOUTS)/* STICKY_TIMEOUTS personality: skip the timeval update (per the author, a bug-emulation mechanism) */
  293.                         goto sticky;
  294.                 /* rtv receives the remaining time; presumably do_div leaves the quotient (seconds) in timeout and returns the remainder in ticks - confirm against do_div */
  295.                 rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
  296.                 rtv.tv_sec = timeout;
  297.                 if (timeval_compare(&rtv, &tv) >= 0)/* never report more remaining time than was originally requested */
  298.                         rtv = tv;
  299.                 /* copy the updated time back to user space */
  300.                 if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
  301. sticky:
  302.                         /*
  303.                         * If an application puts its timeval in read-only
  304.                         * memory, we don't want the Linux-specific update to
  305.                         * the timeval to cause a fault after the select has
  306.                         * completed successfully. However, because we're not
  307.                         * updating the timeval, we can't restart the system
  308.                         * call.
  309.                         */
  310.                         if (ret == -ERESTARTNOHAND)/* ERESTARTNOHAND marks an interrupted system call; it is reported as -EINTR here */
  311.                                 ret = -EINTR;
  312.                 }
  313.         }
  314.         return ret;
  315. }
  316. /* core_sys_select
  317. Prepares the six bitmaps for do_select, calls do_select, and copies the result sets back to user space. Returns do_select's non-negative result, or a negative error: -EINVAL (n < 0), -ENOMEM, -EFAULT, -ERESTARTNOHAND (signal pending with nothing ready), or whatever get_fd_set/do_select report.
  318. */
  319. static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
  320.                                                    fd_set __user *exp, s64 *timeout)
  321. {
  322.         fd_set_bits fds;
  323.         void *bits;
  324.         int ret, max_fds;
  325.         unsigned int size;
  326.         struct fdtable *fdt;
  327.         /* Allocate small arguments on the stack to save memory and be faster */
  328.         /* SELECT_STACK_ALLOC is defined as 256 (per the author; definition not shown here) */
  329.         long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
  330.         ret = -EINVAL;
  331.         if (n < 0)
  332.                 goto out_nofds;
  333.         /* max_fds can increase, so grab it once to avoid race */
  334.         rcu_read_lock();
  335.         fdt = files_fdtable(current->files);/* file descriptor table of the current process */
  336.         max_fds = fdt->max_fds;
  337.         rcu_read_unlock();
  338.         if (n > max_fds)/* clamp the caller-supplied n to the size of the fd table */
  339.                 n = max_fds;
  340.         /*
  341.         * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
  342.         * since we used fdset we need to allocate memory in units of
  343.         * long-words.
  344.         */
  345.         /*
  346.         If stack_fds is too small to hold all six bitmaps, a larger buffer is allocated with kmalloc.
  347.         The buffer is then split into six equal parts of size bytes each to initialise fds.
  348.         */
  349.         size = FDS_BYTES(n);
  350.         bits = stack_fds;
  351.         if (size > sizeof(stack_fds) / 6) {
  352.                 /* Not enough space in on-stack array; must use kmalloc */
  353.                 ret = -ENOMEM;
  354.                 bits = kmalloc(6 * size, GFP_KERNEL);
  355.                 if (!bits)
  356.                         goto out_nofds;
  357.         }
  358.         fds.in      = bits;
  359.         fds.out     = bits +   size;
  360.         fds.ex      = bits + 2*size;
  361.         fds.res_in  = bits + 3*size;
  362.         fds.res_out = bits + 4*size;
  363.         fds.res_ex  = bits + 5*size;
  364.         /* get_fd_set copies each fd_set in from user space (per the author it merely calls copy_from_user); a non-zero result aborts */
  365.         if ((ret = get_fd_set(n, inp, fds.in)) ||
  366.                 (ret = get_fd_set(n, outp, fds.out)) ||
  367.                 (ret = get_fd_set(n, exp, fds.ex)))
  368.                 goto out;
  369.         zero_fd_set(n, fds.res_in);
  370.         zero_fd_set(n, fds.res_out);
  371.         zero_fd_set(n, fds.res_ex);
  372.         /*
  373.         Hand over to do_select.
  374.         */
  375.         ret = do_select(n, &fds, timeout);
  376.         if (ret < 0)
  377.                 goto out;
  378.         /* ret == 0: do_select returned with nothing ready */
  379.         if (!ret) {
  380.                 /* sys_select turns -ERESTARTNOHAND into -EINTR (interrupted system call) when it cannot update the timeval */
  381.                 ret = -ERESTARTNOHAND;
  382.                 if (signal_pending(current))/* true when the current task has a signal to handle, which matches EINTR semantics */
  383.                         goto out;
  384.                 ret = 0;
  385.         }
  386.         /* copy the result sets back to user space */
  387.         if (set_fd_set(n, inp, fds.res_in) ||
  388.                 set_fd_set(n, outp, fds.res_out) ||
  389.                 set_fd_set(n, exp, fds.res_ex))
  390.                 ret = -EFAULT;
  391. out:
  392.         if (bits != stack_fds)
  393.                 kfree(bits);/* pairs with the kmalloc above */
  394. out_nofds:
  395.         return ret;
  396. }
  397. /* do_select: the core of select(). Validates the requested sets with max_select_fd(), then repeatedly scans every requested fd,
  398. calling its f_op->poll and recording ready bits in fds->res_*; between scans it sleeps in schedule_timeout(). Returns the number of ready bits counted over the in/out/ex sets (0 on timeout or pending signal), a negative value from max_select_fd(), or table.error.
  399. */
  400. int do_select(int n, fd_set_bits *fds, s64 *timeout)
  401. {
  402.         struct poll_wqueues table;
  403.         poll_table *wait;
  404.         int retval, i;
  405.         rcu_read_lock();
  406.         /* max_select_fd() checks the requested fds against the open-fd bitmap; per the author every requested fd must be open and the highest fd is returned (a negative result is returned to the caller below) */
  407.         retval = max_select_fd(n, fds);
  408.         rcu_read_unlock();
  409.         if (retval < 0)
  410.                 return retval;
  411.         n = retval;
  412.         /* set up the poll wait-queue bookkeeping in table; wait points at its poll_table and is handed to each poll method (NULL when *timeout is 0) */
  413.         poll_initwait(&table);
  414.         wait = &table.pt;
  415.         if (!*timeout)
  416.                 wait = NULL;
  417.         retval = 0;
  418.         for (;;) {/* loops until one of the break statements below */
  419.                 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
  420.                 long __timeout;
  421.                 /* note: interruptible sleep state, set before the scan starts */
  422.                 set_current_state(TASK_INTERRUPTIBLE);
  423.                 inp = fds->in; outp = fds->out; exp = fds->ex;
  424.                 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
  425.                 for (i = 0; i < n; ++rinp, ++routp, ++rexp) {/* walk the sets one unsigned long at a time; i is advanced inside the body */
  426.                         unsigned long in, out, ex, all_bits, bit = 1, mask, j;
  427.                         unsigned long res_in = 0, res_out = 0, res_ex = 0;
  428.                         const struct file_operations *f_op = NULL;
  429.                         struct file *file = NULL;
  430.                         in = *inp++; out = *outp++; ex = *exp++;
  431.                         all_bits = in | out | ex;
  432.                         if (all_bits == 0) {
  433.                                 /*
  434.                                 __NFDBITS is defined as (8 * sizeof(unsigned long)), i.e. the number of bits in a long (per the author; definition not shown here).
  435.                                 One long covers __NFDBITS descriptors, so when no bit of this word is requested i skips ahead by __NFDBITS.
  436.                                 */
  437.                                 i += __NFDBITS;
  438.                                 continue;
  439.                         }
  440.                         for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
  441.                                 int fput_needed;
  442.                                 if (i >= n)
  443.                                         break;
  444.                                 /* skip bits that are not requested in any of the three sets */
  445.                                 if (!(bit & all_bits))
  446.                                         continue;
  447.                                 /* get the struct file for fd i (per the author this takes a reference, f_count) */
  448.                                 file = fget_light(i, &fput_needed);
  449.                                 if (file) {
  450.                                         f_op = file->f_op;
  451.                                         mask = DEFAULT_POLLMASK;
  452.                                         /* for socket descriptors f_op->poll is sock_poll (per the author).
  453.                                         The second argument is the poll_table (wait-queue hook); NULL is passed once retval is non-zero, i.e. something is already ready */
  454.                                         if (f_op && f_op->poll)
  455.                                                 mask = (*f_op->poll)(file, retval ? NULL : wait);
  456.                                         /* release the file obtained above (per the author this drops the f_count reference) */
  457.                                         fput_light(file, fput_needed);
  458.                                         /* record ready bits according to the poll mask; retval counts them.
  459.                                         Note: retval is the total over all three sets (in, out, ex) */
  460.                                         if ((mask & POLLIN_SET) && (in & bit)) {
  461.                                                 res_in |= bit;
  462.                                                 retval++;
  463.                                         }
  464.                                         if ((mask & POLLOUT_SET) && (out & bit)) {
  465.                                                 res_out |= bit;
  466.                                                 retval++;
  467.                                         }
  468.                                         if ((mask & POLLEX_SET) && (ex & bit)) {
  469.                                                 res_ex |= bit;
  470.                                                 retval++;
  471.                                         }
  472.                                 }
  473.                                 /*
  474.                                 set_current_state(TASK_INTERRUPTIBLE) was already called above.
  475.                                 cond_resched() adds a voluntary rescheduling point after each examined fd.
  476.                                 NOTE(review): the author states that a task scheduled away here is woken again through the wait-queue mechanism,
  477.                                 and that cond_resched is a no-op on CONFIG_PREEMPT kernels - neither can be confirmed from this listing.
  478.                                 */
  479.                                 cond_resched();
  480.                         }
  481.                         /* store this word's results into the output bitmaps (only when non-zero) */
  482.                         if (res_in)
  483.                                 *rinp = res_in;
  484.                         if (res_out)
  485.                                 *routp = res_out;
  486.                         if (res_ex)
  487.                                 *rexp = res_ex;
  488.                 }
  489.                 wait = NULL;
  490.                 if (retval || !*timeout || signal_pending(current))/* stop when something is ready, the timeout is zero/used up, or a signal is pending */
  491.                         break;
  492.                 if(table.error) {
  493.                         retval = table.error;
  494.                         break;
  495.                 }
  496.                 if (*timeout < 0) {
  497.                         /* negative timeout: wait indefinitely */
  498.                         __timeout = MAX_SCHEDULE_TIMEOUT;
  499.                 } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
  500.                         /* timeout too large for a single schedule_timeout() call: sleep MAX_SCHEDULE_TIMEOUT - 1 now and keep the rest in *timeout for later iterations */
  501.                         __timeout = MAX_SCHEDULE_TIMEOUT - 1;
  502.                         *timeout -= __timeout;
  503.                 } else {
  504.                         /* the remaining timeout fits in a single sleep */
  505.                         __timeout = *timeout;
  506.                         *timeout = 0;
  507.                 }
  508.                 /* in TASK_INTERRUPTIBLE state a task sleeping in schedule_timeout gets to run again when a signal arrives (per the author);
  509.                 schedule_timeout returns the remaining ticks, which are added back to *timeout below unless it is negative
  510.                 */
  511.                 __timeout = schedule_timeout(__timeout);
  512.                 if (*timeout >= 0)
  513.                         *timeout += __timeout;
  514.         }
  515.         /* back to the running state */
  516.         __set_current_state(TASK_RUNNING);
  517.         /* tear down the wait-queue bookkeeping set up by poll_initwait */
  518.         poll_freewait(&table);
  519.         return retval;
  520. }
  521. static unsigned int sock_poll(struct file *file, poll_table *wait)
  522. {
  523.         struct socket *sock;
  524.         /* by convention a socket's file->private_data holds the pointer to its struct socket */
  525.         sock = file->private_data;
  526.         /* poll method for socket files: forwards to the protocol's poll operation; per the author these are tcp_poll, udp_poll and datagram_poll (udp_poll being a thin wrapper around datagram_poll).
  527.         Those three functions are not analysed here. */
  528.         return sock->ops->poll(file, sock, wait);
  529. }

复制代码

其他重要函数一览 
static int max_select_fd(unsigned long n, fd_set_bits *fds) 
返回fd_set中已经打开、并且小于用户指定最大值的fd 
static inline int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) 
从用户空间拷贝fd_set到内核 
static inline void zero_fd_set(unsigned long nr, unsigned long *fdset) 
把fd_set清零 
static inline unsigned long __must_check set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) 
把fd_set拷贝回用户空间 
static inline int signal_pending(struct task_struct *p) 
判断进程p是否有信号需要处理 
struct file *fget_light(unsigned int fd, int *fput_needed) 
由fd得到其对应的file结构指针,并增加其引用计数 
static inline void fput_light(struct file *file, int fput_needed) 
释放由fget_light得到的file结构指针,减少其引用计数 
set_current_state 
设置当前进程的状态 
static inline int cond_resched(void) 
判断是否有进程需要抢占当前进程,如果是,将立即发生调度。其作用就是额外增加一个抢占点。 
signed long __sched schedule_timeout(signed long timeout) 
当前进程睡眠timeout个jiffies 
rcu_read_lock 
rcu_read_unlock 
Linux 2.6新加入的rcu锁。读锁的加锁、解锁函数 
参考http://www.ibm.com/developerworks/cn/linux/l-rcu 
poll_freewait 
poll_initwait 
poll_wait 
... 
和文件IO,poll机制有关的几个函数,参考《Linux设备驱动(第三版)》6.3 
tcp_poll 
udp_poll 
datagram_poll 
协议层的poll函数

分享到:          
阅读(224) |  评论(0) |  引用  (0)  | 举报

最近读者

评论

 
 
公司简介  -  联系方法  -  招聘信息  -  客户服务  -  隐私政策  -  博客风格  -  手机博客  -  订阅此博客

网易公司版权所有 ©1997-2011

你可能感兴趣的:(select系统调用)