5.5 收包系统调用

  当数据到达TCP接收缓存后,TCP会唤醒应用进程。应用进程在被内核唤醒后,就可以使用read、readv、recv、recvfrom、recvmsg系统调用读取TCP数据。现以read和recv系统调用为例分析应用进程从TCP收取数据的方法。

  read系统调用原型:

ssize_t read(int fd, void *buf, size_t count);
  fd:socket文件描述符;buf:用于存放数据的缓存的首地址;count:buf的长度(字节)

  成功时返回从TCP中读取到的字节数,如果返回0则意味着TCP收到了FIN,应用进程可以选择关闭连接。

  read系统调用对应的内核函数为:

 472 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 473 {
 474     struct fd f = fdget(fd);
 475     ssize_t ret = -EBADF;
 476 
 477     if (f.file) {
 478         loff_t pos = file_pos_read(f.file);
 479         ret = vfs_read(f.file, buf, count, &pos);
 480         file_pos_write(f.file, pos);   
 481         fdput(f);
 482     }
 483     return ret;
 484 }
  vfs_read:
353 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 354 {
 355     ssize_t ret;
 356     
 357     if (!(file->f_mode & FMODE_READ))
 358         return -EBADF;
 359     if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
 360         return -EINVAL;
 361     if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 362         return -EFAULT;
 363     
 364     ret = rw_verify_area(READ, file, pos, count);
 365     if (ret >= 0) {
 366         count = ret;
 367         if (file->f_op->read) //对于socket类型的文件系统这个判断为假
 368             ret = file->f_op->read(file, buf, count, pos);
 369         else
 370             ret = do_sync_read(file, buf, count, pos);
 371         if (ret > 0) {
 372             fsnotify_access(file);
 373             add_rchar(current, ret);
 374         }
 375         inc_syscr(current);
 376     }
 377 
 378     return ret;
 379 }
  socket类型的文件系统会使用do_sync_read函数:
 333 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 334 {
 335     struct iovec iov = { .iov_base = buf, .iov_len = len };
 336     struct kiocb kiocb;
 337     ssize_t ret;
 338 
 339     init_sync_kiocb(&kiocb, filp);
 340     kiocb.ki_pos = *ppos;
 341     kiocb.ki_left = len;
 342     kiocb.ki_nbytes = len;
 343     
 344     ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); //指向sock_aio_read
 345     if (-EIOCBQUEUED == ret)
 346         ret = wait_on_sync_kiocb(&kiocb);
 347     *ppos = kiocb.ki_pos;
 348     return ret;
 349 }   
  sock_aio_read函数:

 898 static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
 899         struct file *file, const struct iovec *iov,
 900         unsigned long nr_segs)
 901 {
 902     struct socket *sock = file->private_data;
 903     size_t size = 0;
 904     int i;
 905 
 906     for (i = 0; i < nr_segs; i++)  
 907         size += iov[i].iov_len;        
 908 
 909     msg->msg_name = NULL;
 910     msg->msg_namelen = 0;
 911     msg->msg_control = NULL;
 912     msg->msg_controllen = 0;
 913     msg->msg_iov = (struct iovec *)iov;
 914     msg->msg_iovlen = nr_segs;
 915     msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 916 
 917     return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
 918 }
 919 
 920 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 921                 unsigned long nr_segs, loff_t pos)
 922 {
 923     struct sock_iocb siocb, *x;    
 924 
 925     if (pos != 0)
 926         return -ESPIPE;
 927 
 928     if (iocb->ki_left == 0) /* Match SYS5 behaviour */
 929         return 0;
 930 
 931 
 932     x = alloc_sock_iocb(iocb, &siocb);
 933     if (!x)
 934         return -ENOMEM;
 935     return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
936 }
  可见read系统调用在内核中最终会调用__sock_recvmsg函数。再来看recv系统调用:
 ssize_t recv(int sockfd, void *buf, size_t len, int flags);
   sockfd:socket文件描述符;buf:用于存放数据的缓存的首地址;len:buf的长度(字节);flags:用于设置功能。

  成功时返回从TCP中读取到的字节数,如果返回0则意味着TCP收到了FIN,应用进程可以选择关闭连接。

  recv函数对应的内核函数为sys_recv:

1812 SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
1813         unsigned int, flags, struct sockaddr __user *, addr,
1814         int __user *, addr_len)
1815 {
1816     struct socket *sock;
1817     struct iovec iov;
1818     struct msghdr msg;
1819     struct sockaddr_storage address;
1820     int err, err2;
1821     int fput_needed;
1822 
1823     if (size > INT_MAX)
1824         size = INT_MAX;
1825     sock = sockfd_lookup_light(fd, &err, &fput_needed);
1826     if (!sock)
1827         goto out;
1828 
1829     msg.msg_control = NULL;
1830     msg.msg_controllen = 0;
1831     msg.msg_iovlen = 1;
1832     msg.msg_iov = &iov;
1833     iov.iov_len = size;
1834     iov.iov_base = ubuf;
1835     msg.msg_name = (struct sockaddr *)&address;
1836     msg.msg_namelen = sizeof(address);
1837     if (sock->file->f_flags & O_NONBLOCK)
1838         flags |= MSG_DONTWAIT;
1839     err = sock_recvmsg(sock, &msg, size, flags);
1840 
1841     if (err >= 0 && addr != NULL) {
1842         err2 = move_addr_to_user(&address,  //将数据发送端的地址信息返回给应用进程
1843                      msg.msg_namelen, addr, addr_len);
1844         if (err2 < 0)
1845             err = err2;
1846     }
1847 
1848     fput_light(sock->file, fput_needed);
1849 out:
1850     return err;
1851 }
1852 
1853 /*
1854  *  Receive a datagram from a socket.
1855  */
1856 
1857 asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size,
1858              unsigned int flags)
1859 {
1860     return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
1861 }
  sock_recvmsg是收包的核心函数:
 787 int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 788          size_t size, int flags)
 789 {
 790     struct kiocb iocb;
 791     struct sock_iocb siocb;
 792     int ret;
 793 
 794     init_sync_kiocb(&iocb, NULL);
 795     iocb.private = &siocb;
 796     ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
 797     if (-EIOCBQUEUED == ret)
 798         ret = wait_on_sync_kiocb(&iocb);
 799     return ret;
 800 }
  可见recv系统调用也会调用__sock_recvmsg函数来完成收数据的功能。

  __sock_recvmsg函数:

765 static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
 766                        struct msghdr *msg, size_t size, int flags)
 767 {
 768     struct sock_iocb *si = kiocb_to_siocb(iocb);
 769
 770     si->sock = sock;     
 771     si->scm = NULL;      
 772     si->msg = msg;
 773     si->size = size;     
 774     si->flags = flags;   
 775
 776     return sock->ops->recvmsg(iocb, sock, msg, size, flags);//指向inet_recvmsg函数
 777 }
 778
 779 static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 780                  struct msghdr *msg, size_t size, int flags)
 781 {
 782     int err = security_socket_recvmsg(sock, msg, size, flags);
 783
 784     return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
 785 }
  inet_recvmsg 函数:
 794 int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 795          size_t size, int flags)
 796 {   
 797     struct sock *sk = sock->sk;
 798     int addr_len = 0;
 799     int err;       
 800     
 801     sock_rps_record_flow(sk);
 802     
 803     err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
 804                    flags & ~MSG_DONTWAIT, &addr_len);//指向tcp_recvmsg函数
 805     if (err >= 0)  
 806         msg->msg_namelen = addr_len;    
 807     return err;    
 808 }   
  tcp_recvmsg 函数:
1545 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1546         size_t len, int nonblock, int flags, int *addr_len)
1547 {   
1548     struct tcp_sock *tp = tcp_sk(sk);
1549     int copied = 0;
1550     u32 peek_seq;   
1551     u32 *seq;       
1552     unsigned long used;
1553     int err;        
1554     int target;     /* Read at least this many bytes */
1555     long timeo;     
1556     struct task_struct *user_recv = NULL;
1557     bool copied_early = false;
1558     struct sk_buff *skb;
1559     u32 urg_hole = 0;
1560     
1561     lock_sock(sk);      
1562     
1563     err = -ENOTCONN;    
1564     if (sk->sk_state == TCP_LISTEN)//listening socket不允许读
1565         goto out;   
1566     
1567     timeo = sock_rcvtimeo(sk, nonblock); //获取超时参数
1568
1569     /* Urgent data needs to be handled specially. */
1570     if (flags & MSG_OOB)//以带外方式读取紧急数据
1571         goto recv_urg;
...
1588     seq = &tp->copied_seq;
1589     if (flags & MSG_PEEK) { //用MSG_PEEK收取数据的话数据不会从TCP缓存中删除,下次还能读取到
1590         peek_seq = tp->copied_seq;
1591         seq = &peek_seq; //改变临时变量,而非tp->copied_seq,使得下次读数据时tp->copied_seq不变
1592     }
1593
1594     target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); //得到本次要copy到应用缓存的字节数
...
1618     do {
1619         u32 offset;
1620
1621         /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1622         if (tp->urg_data && tp->urg_seq == *seq) {//有紧急指针且下一个要copy的字节就是紧急数据
1623             if (copied)//已经copy了至少1字节数据
1624                 break; //已经读了一些数据,马上就要读到紧急数据了,立马停住,以免将紧急数据混在普通数据中
1625             if (signal_pending(current)) { //有信号等待处理,可能有是在紧急数据到达慢速处理路径时TCP发送给进程的SIGURG信号
1626                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; //设置返回值
1627                 break; //跳出循环,系统调用返回去处理SIGURG信号
1628             }
1629         }
1630
1631         /* Next get a buffer. */
1632
1633         skb_queue_walk(&sk->sk_receive_queue, skb) {//遍历接收队列
1634             /* Now that we have two receive queues this
1635              * shouldn't happen.
1636              */
1637             if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), //出现数据空洞,BUG!
1638                  "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1639                  *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1640                  flags))
1641                 break;
1642
1643             offset = *seq - TCP_SKB_CB(skb)->seq; //offset是skb中已经被读过的长度
1644             if (tcp_hdr(skb)->syn)
1645                 offset--;
1646             if (offset < skb->len) //包中有未读的新数据
1647                 goto found_ok_skb; //处理skb中的数据
1648             if (tcp_hdr(skb)->fin)
1649                 goto found_fin_ok; //处理FIN标记
1650             WARN(!(flags & MSG_PEEK), //skb中的数据全部被读过,但还没有被删除,一定是开启了PEEK模式
1651                  "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1652                  *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1653         }
1654
1655         /* Well, if we have backlog, try to process it now yet. */
1656
1657         if (copied >= target && !sk->sk_backlog.tail) //完成读取任务且没有skb在backlog队列中
1658             break; //功成身退
1659
1660         if (copied) {
1661             if (sk->sk_err ||
1662                 sk->sk_state == TCP_CLOSE ||
1663                 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1664                 !timeo ||
1665                 signal_pending(current))
1666                 break;
1667         } else { //没有copy任何数据
1668             if (sock_flag(sk, SOCK_DONE))
1669                 break;
1670
1671             if (sk->sk_err) {
1672                 copied = sock_error(sk);
1673                 break;
1674             }
1675
1676             if (sk->sk_shutdown & RCV_SHUTDOWN)
1677                 break;
1678
1679             if (sk->sk_state == TCP_CLOSE) {
1680                 if (!sock_flag(sk, SOCK_DONE)) {
1681                     /* This occurs when user tries to read
1682                      * from never connected socket.
1683                      */
1684                     copied = -ENOTCONN;
1685                     break;
1686                 }
1687                 break;
1688             }
1689
1690             if (!timeo) {
1691                 copied = -EAGAIN;
1692                 break;
1693             }
1694
1695             if (signal_pending(current)) {
1696                 copied = sock_intr_errno(timeo);
1697                 break;
1698             }
1699         }
1700
1701         tcp_cleanup_rbuf(sk, copied); //被读取到应用进程的数据无需保留,需要从接收缓存中清除,并相机发送ACK
...
1758         if (copied >= target) {//copy数据的字节数达到目标
1759             /* Do not sleep, just process backlog. */
1760             release_sock(sk);
1761             lock_sock(sk);
1762         } else//copy数据的字节数未达到目标
1763             sk_wait_data(sk, &timeo);//睡眠,等待新数据到来
...
1793         if ((flags & MSG_PEEK) && //PEEK模式
1794             (peek_seq - copied - urg_hole != tp->copied_seq)) {
1795             net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1796                         current->comm,
1797                         task_pid_nr(current));
1798             peek_seq = tp->copied_seq; //出现BUG,校正peek_seq
1799         }
1800         continue;
1801
1802     found_ok_skb:
1803         /* Ok so how much can we use? */
1804         used = skb->len - offset;//计算skb中新数据的长度
1805         if (len < used)//用户缓存空间不够
1806             used = len;
1807
1808         /* Do we have urgent data here? */
1809         if (tp->urg_data) {
1810             u32 urg_offset = tp->urg_seq - *seq;
1811             if (urg_offset < used) {//紧急数据在能copy的数据中
1812                 if (!urg_offset) {//下一个要copy的字节就是紧急数据
1813                     if (!sock_flag(sk, SOCK_URGINLINE)) {//没有设置以inline方式读取紧急数据
1814                         ++*seq;//跳过紧急数据
1815                         urg_hole++;
1816                         offset++;
1817                         used--;
1818                         if (!used)
1819                             goto skip_copy;
1820                     }
1821                 } else
1822                     used = urg_offset;//copy到紧急数据为止
1823             }
1824         }
1825
1826         if (!(flags & MSG_TRUNC)) { //没有设置MSG_TRUNC标记
...
1855             {
1856                 err = skb_copy_datagram_iovec(skb, offset,
1857                         msg->msg_iov, used);//copy数据到用户缓存
1858                 if (err) {
1859                     /* Exception. Bailout! */
1860                     if (!copied)
1861                         copied = -EFAULT;
1862                     break;
1863                 }
1864             }
1865         }
1866
1867         *seq += used;
1868         copied += used;
1869         len -= used;
1870
1871         tcp_rcv_space_adjust(sk);//调整接收缓存空间
1872
1873 skip_copy:
1874         if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {//紧急指针已经处理完毕
1875             tp->urg_data = 0;
1876             tcp_fast_path_check(sk); //重启快速处理路径
1877         }
1878         if (used + offset < skb->len)//skb中还有数据没有copy完毕
1879             continue;
1880
1881         if (tcp_hdr(skb)->fin)
1882             goto found_fin_ok;
1883         if (!(flags & MSG_PEEK)) {//不以PEEK方式读包则需要释放数据已经全部copy完毕的skb
1884             sk_eat_skb(sk, skb, copied_early); //释放skb
1885             copied_early = false;
1886         }
1887         continue;//copy下一个skb中的数据
1888
1889     found_fin_ok:
1890         /* Process the FIN. */
1891         ++*seq;
1892         if (!(flags & MSG_PEEK)) {//PEEK方式读包不释放skb
1893             sk_eat_skb(sk, skb, copied_early);
1894             copied_early = false;
1895         }
1896         break;
1897     } while (len > 0);
...
1932     /* Clean up data we have read: This will do ACK frames. */
1933     tcp_cleanup_rbuf(sk, copied); //被读取到应用进程的数据无需保留,需要从接收缓存中清除,并相机发送ACK
1934
1935     release_sock(sk); //释放socket,允许其它进程或内核软中断访问socket
1936     return copied; //返回已copy的字节数
1937
1938 out:
1939     release_sock(sk);
1940     return err;
1941
1942 recv_urg:
1943     err = tcp_recv_urg(sk, msg, len, flags);//以带外方式读取紧急数据
1944     goto out;
...
1949 }

  1594:如果flags设置了MSG_WAITALL则target = len,否则为sk->sk_rcvlowat和len中较小的一个,但至少为1;而sk->sk_rcvlowat可以通过SO_RCVLOWAT socket选项来设置

  1657:tcp_recvmsg函数在访问tcp_sock之前会调用lock_sock将其锁定,在进程调用release_sock释放锁之前内核收包软中断无法访问sock,只能将收到的包放入backlog队列中,在release_sock函数中会处理这些积压的包

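  1826:MSG_TRUNC标记位作为收包系统调用返回的msg_flags时,表明数据的结尾被截短,因为接收缓冲区太小不足以接收全部的数据。但从代码上来看,如果应用进程在flags参数中设置了这个标记位,则TCP会跳过向用户缓存copy数据的步骤,而*seq和copied仍会照常增加(1867-1868行);也就是说这部分数据被直接丢弃,应用进程将无法从TCP收取到数据内容,系统调用的返回值是被丢弃的字节数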

  在应用进程通过收包系统调用读取完毕数据后,TCP对数据的可靠交付才算真正完成。以上讨论的是普通的收包方式,下面几节探讨一下特殊的收包方式(与这些方式有关的代码在tcp_recvmsg中已经隐去了)。

你可能感兴趣的:(tcp,linux内核)