当数据到达TCP接收缓存后,TCP会唤醒应用进程。应用进程在被内核唤醒后,就可以使用read、readv、recv、recvfrom、recvmsg系统调用读取TCP数据。现以read和recv系统调用为例分析应用进程从TCP收取数据的方法。
read系统调用原型:
ssize_t read(int fd, void *buf, size_t count);fd:socket文件描述符;buf:用于存放数据的缓存的首地址;count:buf的长度(字节)
成功时返回从TCP中读取到的字节数,如果返回0则意味着TCP收到了FIN,应用进程可以选择关闭连接。
read系统调用对应的内核函数为:
472 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) 473 { 474 struct fd f = fdget(fd); 475 ssize_t ret = -EBADF; 476 477 if (f.file) { 478 loff_t pos = file_pos_read(f.file); 479 ret = vfs_read(f.file, buf, count, &pos); 480 file_pos_write(f.file, pos); 481 fdput(f); 482 } 483 return ret; 484 }vfs_read:
353 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) 354 { 355 ssize_t ret; 356 357 if (!(file->f_mode & FMODE_READ)) 358 return -EBADF; 359 if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) 360 return -EINVAL; 361 if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) 362 return -EFAULT; 363 364 ret = rw_verify_area(READ, file, pos, count); 365 if (ret >= 0) { 366 count = ret; 367 if (file->f_op->read) //对于socket类型的文件系统这个判断为假 368 ret = file->f_op->read(file, buf, count, pos); 369 else 370 ret = do_sync_read(file, buf, count, pos); 371 if (ret > 0) { 372 fsnotify_access(file); 373 add_rchar(current, ret); 374 } 375 inc_syscr(current); 376 } 377 378 return ret; 379 }socket类型的文件系统会使用do_sync_read函数:
333 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) 334 { 335 struct iovec iov = { .iov_base = buf, .iov_len = len }; 336 struct kiocb kiocb; 337 ssize_t ret; 338 339 init_sync_kiocb(&kiocb, filp); 340 kiocb.ki_pos = *ppos; 341 kiocb.ki_left = len; 342 kiocb.ki_nbytes = len; 343 344 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); //指向sock_aio_read 345 if (-EIOCBQUEUED == ret) 346 ret = wait_on_sync_kiocb(&kiocb); 347 *ppos = kiocb.ki_pos; 348 return ret; 349 }sock_aio_read函数:
898 static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, 899 struct file *file, const struct iovec *iov, 900 unsigned long nr_segs) 901 { 902 struct socket *sock = file->private_data; 903 size_t size = 0; 904 int i; 905 906 for (i = 0; i < nr_segs; i++) 907 size += iov[i].iov_len; 908 909 msg->msg_name = NULL; 910 msg->msg_namelen = 0; 911 msg->msg_control = NULL; 912 msg->msg_controllen = 0; 913 msg->msg_iov = (struct iovec *)iov; 914 msg->msg_iovlen = nr_segs; 915 msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; 916 917 return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); 918 } 919 920 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, 921 unsigned long nr_segs, loff_t pos) 922 { 923 struct sock_iocb siocb, *x; 924 925 if (pos != 0) 926 return -ESPIPE; 927 928 if (iocb->ki_left == 0) /* Match SYS5 behaviour */ 929 return 0; 930 931 932 x = alloc_sock_iocb(iocb, &siocb); 933 if (!x) 934 return -ENOMEM; 935 return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs); 936 }可见read系统调用在内核都会调用__sock_recvmsg函数。再来看recv系统调用:
ssize_t recv(int sockfd, void *buf, size_t len, int flags);sockfd:socket文件描述符;buf:用于存放数据的缓存的首地址;len:buf的长度(字节);flags:用于设置功能。
成功时返回从TCP中读取到的字节数,如果返回0则意味着TCP收到了FIN,应用进程可以选择关闭连接。
recv函数对应的内核函数为sys_recv:
1812 SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, 1813 unsigned int, flags, struct sockaddr __user *, addr, 1814 int __user *, addr_len) 1815 { 1816 struct socket *sock; 1817 struct iovec iov; 1818 struct msghdr msg; 1819 struct sockaddr_storage address; 1820 int err, err2; 1821 int fput_needed; 1822 1823 if (size > INT_MAX) 1824 size = INT_MAX; 1825 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1826 if (!sock) 1827 goto out; 1828 1829 msg.msg_control = NULL; 1830 msg.msg_controllen = 0; 1831 msg.msg_iovlen = 1; 1832 msg.msg_iov = &iov; 1833 iov.iov_len = size; 1834 iov.iov_base = ubuf; 1835 msg.msg_name = (struct sockaddr *)&address; 1836 msg.msg_namelen = sizeof(address); 1837 if (sock->file->f_flags & O_NONBLOCK) 1838 flags |= MSG_DONTWAIT; 1839 err = sock_recvmsg(sock, &msg, size, flags); 1840 1841 if (err >= 0 && addr != NULL) { 1842 err2 = move_addr_to_user(&address, //将数据发送端的地址信息返回给应用进程 1843 msg.msg_namelen, addr, addr_len); 1844 if (err2 < 0) 1845 err = err2; 1846 } 1847 1848 fput_light(sock->file, fput_needed); 1849 out: 1850 return err; 1851 } 1852 1853 /* 1854 * Receive a datagram from a socket. 1855 */ 1856 1857 asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size, 1858 unsigned int flags) 1859 { 1860 return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); 1861 }sock_recvmsg是收包的核心函数:
787 int sock_recvmsg(struct socket *sock, struct msghdr *msg, 788 size_t size, int flags) 789 { 790 struct kiocb iocb; 791 struct sock_iocb siocb; 792 int ret; 793 794 init_sync_kiocb(&iocb, NULL); 795 iocb.private = &siocb; 796 ret = __sock_recvmsg(&iocb, sock, msg, size, flags); 797 if (-EIOCBQUEUED == ret) 798 ret = wait_on_sync_kiocb(&iocb); 799 return ret; 800 }可见recv系统调用也会调用__sock_recvmsg函数来完成收数据的功能。
__sock_recvmsg函数:
765 static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock, 766 struct msghdr *msg, size_t size, int flags) 767 { 768 struct sock_iocb *si = kiocb_to_siocb(iocb); 769 770 si->sock = sock; 771 si->scm = NULL; 772 si->msg = msg; 773 si->size = size; 774 si->flags = flags; 775 776 return sock->ops->recvmsg(iocb, sock, msg, size, flags);//指向inet_recvmsg函数 777 } 778 779 static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, 780 struct msghdr *msg, size_t size, int flags) 781 { 782 int err = security_socket_recvmsg(sock, msg, size, flags); 783 784 return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags); 785 }inet_recvmsg 函数:
794 int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, 795 size_t size, int flags) 796 { 797 struct sock *sk = sock->sk; 798 int addr_len = 0; 799 int err; 800 801 sock_rps_record_flow(sk); 802 803 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, 804 flags & ~MSG_DONTWAIT, &addr_len);//指向tcp_recvmsg函数 805 if (err >= 0) 806 msg->msg_namelen = addr_len; 807 return err; 808 }tcp_recvmsg 函数:
1545 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 1546 size_t len, int nonblock, int flags, int *addr_len) 1547 { 1548 struct tcp_sock *tp = tcp_sk(sk); 1549 int copied = 0; 1550 u32 peek_seq; 1551 u32 *seq; 1552 unsigned long used; 1553 int err; 1554 int target; /* Read at least this many bytes */ 1555 long timeo; 1556 struct task_struct *user_recv = NULL; 1557 bool copied_early = false; 1558 struct sk_buff *skb; 1559 u32 urg_hole = 0; 1560 1561 lock_sock(sk); 1562 1563 err = -ENOTCONN; 1564 if (sk->sk_state == TCP_LISTEN)//listening socket不允许读 1565 goto out; 1566 1567 timeo = sock_rcvtimeo(sk, nonblock); //获取超时参数 1568 1569 /* Urgent data needs to be handled specially. */ 1570 if (flags & MSG_OOB)//以带外方式读取紧急数据 1571 goto recv_urg; ... 1588 seq = &tp->copied_seq; 1589 if (flags & MSG_PEEK) { //用MSG_PEEK收取数据的话数据不会从TCP缓存中删除,下次还能读取到 1590 peek_seq = tp->copied_seq; 1591 seq = &peek_seq; //改变临时变量,而非tp->copied_seq,使得下次读数据时tp->copied_seq不变 1592 } 1593 1594 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); //得到本次要copy到应用缓存的字节数 ... 1618 do { 1619 u32 offset; 1620 1621 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ 1622 if (tp->urg_data && tp->urg_seq == *seq) {//有紧急指针且下一个要copy的字节就是紧急数据 1623 if (copied)//已经copy了至少1字节数据 1624 break; //已经读了一些数据,马上就要读到紧急数据了,立马停住,以免将紧急数据混在普通数据中 1625 if (signal_pending(current)) { //有信号等待处理,可能有是在紧急数据到达慢速处理路径时TCP发送给进程的SIGURG信号 1626 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; //设置返回值 1627 break; //跳出循环,系统调用返回去处理SIGURG信号 1628 } 1629 } 1630 1631 /* Next get a buffer. */ 1632 1633 skb_queue_walk(&sk->sk_receive_queue, skb) {//遍历接收队列 1634 /* Now that we have two receive queues this 1635 * shouldn't happen. 1636 */ 1637 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), //出现数据空洞,BUG! 
1638 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n", 1639 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, 1640 flags)) 1641 break; 1642 1643 offset = *seq - TCP_SKB_CB(skb)->seq; //offset是skb中已经被读过的长度 1644 if (tcp_hdr(skb)->syn) 1645 offset--; 1646 if (offset < skb->len) //包中有未读的新数据 1647 goto found_ok_skb; //处理skb中的数据 1648 if (tcp_hdr(skb)->fin) 1649 goto found_fin_ok; //处理FIN标记 1650 WARN(!(flags & MSG_PEEK), //skb中的数据全部被读过,但还没有被删除,一定是开启了PEEK模式 1651 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", 1652 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); 1653 } 1654 1655 /* Well, if we have backlog, try to process it now yet. */ 1656 1657 if (copied >= target && !sk->sk_backlog.tail) //完成读取任务且没有skb在backlog队列中 1658 break; //功成身退 1659 1660 if (copied) { 1661 if (sk->sk_err || 1662 sk->sk_state == TCP_CLOSE || 1663 (sk->sk_shutdown & RCV_SHUTDOWN) || 1664 !timeo || 1665 signal_pending(current)) 1666 break; 1667 } else { //没有copy任何数据 1668 if (sock_flag(sk, SOCK_DONE)) 1669 break; 1670 1671 if (sk->sk_err) { 1672 copied = sock_error(sk); 1673 break; 1674 } 1675 1676 if (sk->sk_shutdown & RCV_SHUTDOWN) 1677 break; 1678 1679 if (sk->sk_state == TCP_CLOSE) { 1680 if (!sock_flag(sk, SOCK_DONE)) { 1681 /* This occurs when user tries to read 1682 * from never connected socket. 1683 */ 1684 copied = -ENOTCONN; 1685 break; 1686 } 1687 break; 1688 } 1689 1690 if (!timeo) { 1691 copied = -EAGAIN; 1692 break; 1693 } 1694 1695 if (signal_pending(current)) { 1696 copied = sock_intr_errno(timeo); 1697 break; 1698 } 1699 } 1700 1701 tcp_cleanup_rbuf(sk, copied); //被读取到应用进程的数据无需保留,需要从接收缓存中清除,并相机发送ACK ... 1758 if (copied >= target) {//copy数据的字节数达到目标 1759 /* Do not sleep, just process backlog. */ 1760 release_sock(sk); 1761 lock_sock(sk); 1762 } else//copy数据的字节数未达到目标 1763 sk_wait_data(sk, &timeo);//睡眠,等待新数据到来 ... 
1793 if ((flags & MSG_PEEK) && //PEEK模式 1794 (peek_seq - copied - urg_hole != tp->copied_seq)) { 1795 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", 1796 current->comm, 1797 task_pid_nr(current)); 1798 peek_seq = tp->copied_seq; //出现BUG,校正peek_seq 1799 } 1800 continue; 1801 1802 found_ok_skb: 1803 /* Ok so how much can we use? */ 1804 used = skb->len - offset;//计算skb中新数据的长度 1805 if (len < used)//用户缓存空间不够 1806 used = len; 1807 1808 /* Do we have urgent data here? */ 1809 if (tp->urg_data) { 1810 u32 urg_offset = tp->urg_seq - *seq; 1811 if (urg_offset < used) {//紧急数据在能copy的数据中 1812 if (!urg_offset) {//下一个要copy的字节就是紧急数据 1813 if (!sock_flag(sk, SOCK_URGINLINE)) {//没有设置以inline方式读取紧急数据 1814 ++*seq;//跳过紧急数据 1815 urg_hole++; 1816 offset++; 1817 used--; 1818 if (!used) 1819 goto skip_copy; 1820 } 1821 } else 1822 used = urg_offset;//copy到紧急数据为止 1823 } 1824 } 1825 1826 if (!(flags & MSG_TRUNC)) { //没有设置MSG_TRUNC标记 ... 1855 { 1856 err = skb_copy_datagram_iovec(skb, offset, 1857 msg->msg_iov, used);//copy数据到用户缓存 1858 if (err) { 1859 /* Exception. Bailout! */ 1860 if (!copied) 1861 copied = -EFAULT; 1862 break; 1863 } 1864 } 1865 } 1866 1867 *seq += used; 1868 copied += used; 1869 len -= used; 1870 1871 tcp_rcv_space_adjust(sk);//调整接收缓存空间 1872 1873 skip_copy: 1874 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {//紧急指针已经处理完毕 1875 tp->urg_data = 0; 1876 tcp_fast_path_check(sk); //重启快速处理路径 1877 } 1878 if (used + offset < skb->len)//skb中还有数据没有copy完毕 1879 continue; 1880 1881 if (tcp_hdr(skb)->fin) 1882 goto found_fin_ok; 1883 if (!(flags & MSG_PEEK)) {//不以PEEK方式读包则需要释放数据已经全部copy完毕的skb 1884 sk_eat_skb(sk, skb, copied_early); //释放skb 1885 copied_early = false; 1886 } 1887 continue;//copy下一个skb中的数据 1888 1889 found_fin_ok: 1890 /* Process the FIN. */ 1891 ++*seq; 1892 if (!(flags & MSG_PEEK)) {//PEEK方式读包不释放skb 1893 sk_eat_skb(sk, skb, copied_early); 1894 copied_early = false; 1895 } 1896 break; 1897 } while (len > 0); ... 
1932 /* Clean up data we have read: This will do ACK frames. */ 1933 tcp_cleanup_rbuf(sk, copied); //被读取到应用进程的数据无需保留,需要从接收缓存中清除,并相机发送ACK 1934 1935 release_sock(sk); //释放socket,允许其它进程或内核软中断访问socket 1936 return copied; //返回已copy的字节数 1937 1938 out: 1939 release_sock(sk); 1940 return err; 1941 1942 recv_urg: 1943 err = tcp_recv_urg(sk, msg, len, flags);//以带外方式读取紧急数据 1944 goto out; ... 1949 }
1594:如果flags设置了MSG_WAITALL则target = len,否则为sk->sk_rcvlowat和len中较小的一个,但至少为1;而sk->sk_rcvlowat可以通过SO_RCVLOWAT socket选项来设置
1657:tcp_recvmsg函数在访问tcp_sock之前会调用lock_sock将其锁定,在进程调用release_sock释放锁之前内核收包软中断无法访问sock,只能将收到的包放入backlog队列中,在release_sock函数中会处理这些积压的包
1826:MSG_TRUNC标记位作为返回标志时表明数据的结尾被截短,因为接收缓冲区太小不足以接收全部的数据。但从代码上来看,如果应用进程在flags中设置了这个标记位,则TCP会跳过向用户缓存copy数据的步骤,而*seq和copied仍照常增加,即这些数据被直接丢弃,应用进程无法得到数据内容(返回值仍是被丢弃的字节数)
在应用进程通过收包系统调用读取完毕数据后,TCP对数据的可靠交付才算真正完成。以上讨论的是普通的收包方式,下面几节探讨一下特殊的收包方式(与这些方式有关的代码在tcp_recvmsg中已经隐去了)。