4.1 发包系统调用

  连接建立完成后,应用进程可以使用send、sendto、sendmsg、write、writev系统调用来发送TCP数据,其中sendmsg和writev可以发送位于多个不连续内存中的数据。使用这些函数发送数据时需指定socket文件描述符、要发送的数据所在的缓冲区首地址及数据长度等信息。上述发包系统调用对应的内核函数都会调用__sock_sendmsg_nosec函数。现在以send系统调用和write系统调用为例来证明这个结论。

  send系统调用原型:

ssize_t send(int sockfd, const void *buf, size_t len, int flags);
  参数:sockfd是要发送数据的socket的文件描述符;buf是数据所在缓存的首地址;len是要发送的数据的长度;flags用于设置功能。成功时此函数的返回值是已经成功发送的字节数。

  send函数对应的内核代码为:

1800 SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
1801         unsigned int, flags)
1802 {
1803     return sys_sendto(fd, buff, len, flags, NULL, 0);
1804 }
   可见send直接调用了sys_sendto,而sys_sendto正是sendto系统调用对应的内核函数:

1753 SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
1754         unsigned int, flags, struct sockaddr __user *, addr,
1755         int, addr_len)
1756 {
1757     struct socket *sock;
1758     struct sockaddr_storage address;
1759     int err;
1760     struct msghdr msg;
1761     struct iovec iov;
1762     int fput_needed;
1763 
1764     if (len > INT_MAX)
1765         len = INT_MAX;
1766     sock = sockfd_lookup_light(fd, &err, &fput_needed);
1767     if (!sock)
1768         goto out;
1769 
1770     iov.iov_base = buff;
1771     iov.iov_len = len;
1772     msg.msg_name = NULL;
1773     msg.msg_iov = &iov;
1774     msg.msg_iovlen = 1;
1775     msg.msg_control = NULL;
1776     msg.msg_controllen = 0;
1777     msg.msg_namelen = 0;
1778     if (addr) {
1779         err = move_addr_to_kernel(addr, addr_len, &address);
1780         if (err < 0)
1781             goto out_put;
1782         msg.msg_name = (struct sockaddr *)&address;
1783         msg.msg_namelen = addr_len;
1784     }
1785     if (sock->file->f_flags & O_NONBLOCK)
1786         flags |= MSG_DONTWAIT;
1787     msg.msg_flags = flags;
1788     err = sock_sendmsg(sock, &msg, len);  //核心处理函数
1789 
1790 out_put:
1791     fput_light(sock->file, fput_needed);
1792 out:
1793     return err;
1794 }
  1770-1777:struct msghdr可以支持一次输入多个不连续的缓存,但对于send和sendto系统调用而言一次只输入一段缓存,故这段代码就是用入参初始化msghdr结构。

  sock_sendmsg函数的定义:

 636 int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 637 {
 638     struct kiocb iocb;
 639     struct sock_iocb siocb;
 640     int ret;
 641 
 642     init_sync_kiocb(&iocb, NULL);
 643     iocb.private = &siocb;
 644     ret = __sock_sendmsg(&iocb, sock, msg, size);
 645     if (-EIOCBQUEUED == ret)
 646         ret = wait_on_sync_kiocb(&iocb);
 647     return ret;
 648 }
   sock_sendmsg函数是通过调用__sock_sendmsg函数来完成发送数据的功能的。先暂时停下,我们来看看write系统调用的原型:

ssize_t write(int fd, const void *buf, size_t count);
   参数:fd是要发送数据的socket的文件描述符;buf是数据所在缓存的首地址;count是要发送的数据的长度。成功时此函数的返回值是已经成功发送的字节数。
  write系统调用对应的内核函数为:

 486 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 487         size_t, count)
 488 {
 489     struct fd f = fdget(fd);
 490     ssize_t ret = -EBADF;
 491 
 492     if (f.file) {
 493         loff_t pos = file_pos_read(f.file);
 494         ret = vfs_write(f.file, buf, count, &pos);
 495         file_pos_write(f.file, pos);
 496         fdput(f);
 497     }
 498 
 499     return ret;
 500 }
  vfs_write函数:
 430 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 431 {       
 432     ssize_t ret;
 433     
 434     if (!(file->f_mode & FMODE_WRITE))
 435         return -EBADF;
 436     if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
 437         return -EINVAL;
 438     if (unlikely(!access_ok(VERIFY_READ, buf, count))) 
 439         return -EFAULT;
 440 
 441     ret = rw_verify_area(WRITE, file, pos, count);
 442     if (ret >= 0) {
 443         count = ret;
 444         file_start_write(file);
 445         if (file->f_op->write)
 446             ret = file->f_op->write(file, buf, count, pos);
 447         else
 448             ret = do_sync_write(file, buf, count, pos);
 449         if (ret > 0) {
 450             fsnotify_modify(file);
 451             add_wchar(current, ret);
 452         }
 453         inc_syscw(current);
 454         file_end_write(file);
 455     }
 456 
 457     return ret;
 458 }
  445-448:对于socket类型的文件,file->f_op->write一定是NULL,故会调用do_sync_write函数:

 383 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 384 {   
 385     struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 386     struct kiocb kiocb;
 387     ssize_t ret;
 388         
 389     init_sync_kiocb(&kiocb, filp);
 390     kiocb.ki_pos = *ppos;
 391     kiocb.ki_left = len;
 392     kiocb.ki_nbytes = len;
 393 
 394     ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 395     if (-EIOCBQUEUED == ret)
 396         ret = wait_on_sync_kiocb(&kiocb);
 397     *ppos = kiocb.ki_pos;
 398     return ret;
 399 }
  394:参见2.1章,socket文件系统中filp->f_op->aio_write指向的函数是sock_aio_write,它会调用do_sock_write:


 938 static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
 939             struct file *file, const struct iovec *iov,
 940             unsigned long nr_segs)
 941 {
 942     struct socket *sock = file->private_data;
 943     size_t size = 0;
 944     int i;
 945 
 946     for (i = 0; i < nr_segs; i++)
 947         size += iov[i].iov_len;
 948 
 949     msg->msg_name = NULL;
 950     msg->msg_namelen = 0;
 951     msg->msg_control = NULL;
 952     msg->msg_controllen = 0;
 953     msg->msg_iov = (struct iovec *)iov;
 954     msg->msg_iovlen = nr_segs;
 955     msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 956     if (sock->type == SOCK_SEQPACKET)
 957         msg->msg_flags |= MSG_EOR;
 958 
 959     return __sock_sendmsg(iocb, sock, msg, size);
 960 }
 961         
 962 static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
 963               unsigned long nr_segs, loff_t pos)
 964 {   
 965     struct sock_iocb siocb, *x;
 966     
 967     if (pos != 0)
 968         return -ESPIPE;
 969     
 970     x = alloc_sock_iocb(iocb, &siocb);
 971     if (!x)
 972         return -ENOMEM;
 973 
 974     return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
 975 }
  可见,与send、sendto系统调用一样,write系统调用在内核中最终调用的也是__sock_sendmsg。下面来分析__sock_sendmsg函数的代码:

615 static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock,
 616                        struct msghdr *msg, size_t size)
 617 {
 618     struct sock_iocb *si = kiocb_to_siocb(iocb);
 619 
 620     si->sock = sock;
 621     si->scm = NULL;
 622     si->msg = msg;
 623     si->size = size;
 624 
 625     return sock->ops->sendmsg(iocb, sock, msg, size);
 626 }
 627 
 628 static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 629                  struct msghdr *msg, size_t size)
 630 {
 631     int err = security_socket_sendmsg(sock, msg, size);
 632 
 633     return err ?: __sock_sendmsg_nosec(iocb, sock, msg, size);
 634 }
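  625:对于TCP socket,这个函数指针(sock->ops->sendmsg)指向的是inet_sendmsg函数,后者再通过sk->sk_prot->sendmsg调用tcp_sendmsg函数(注意tcp_sendmsg的第二个参数是struct sock *而不是struct socket *):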
1016 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1017         size_t size)     
1018 {
1019     struct iovec *iov;   
1020     struct tcp_sock *tp = tcp_sk(sk);
1021     struct sk_buff *skb;
1022     int iovlen, flags, err, copied = 0;
1023     int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
1024     bool sg;
1025     long timeo;
1026
1027     lock_sock(sk);
1028
1029     flags = msg->msg_flags;
1030     if (flags & MSG_FASTOPEN) {     //使用TFO功能  
1031         err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);//发送TFO数据
1032         if (err == -EINPROGRESS && copied_syn > 0)
1033             goto out;    
1034         else if (err)    
1035             goto out_err;
1036         offset = copied_syn;
1037     }
1038
1039     timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1040
1041     /* Wait for a connection to finish. One exception is TCP Fast Open
1042      * (passive side) where data is allowed to be sent before a connection
1043      * is fully established.
1044      */
1045     if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1046         !tcp_passive_fastopen(sk)) {   
1047         if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1048             goto do_error;
1049     }
1050
1051     if (unlikely(tp->repair)) {        
1052         if (tp->repair_queue == TCP_RECV_QUEUE) {
1053             copied = tcp_send_rcvq(sk, msg, size);
1054             goto out;    
1055         }
1056
1057         err = -EINVAL;   
1058         if (tp->repair_queue == TCP_NO_QUEUE)
1059             goto out_err;
1060
1061         /* 'common' sending to sendq */
1062     }
1063
1064     /* This should be in poll */
1065     clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1066
1067     mss_now = tcp_send_mss(sk, &size_goal, flags);//获取当前最大报文段的值
1068
1069     /* Ok commence sending. */
1070     iovlen = msg->msg_iovlen;//数据块总个数
1071     iov = msg->msg_iov;//起始数据块
1072     copied = 0;
1073
1074     err = -EPIPE;
1075     if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))//如果socket出错或不允许再发送数据
1076         goto out_err;
1077
1078     sg = !!(sk->sk_route_caps & NETIF_F_SG);//如果网卡支持分散-聚集IO(Scatter/Gather IO),则sg为1
1079
1080     while (--iovlen >= 0) {//如果有数据块没有发送
1081         size_t seglen = iov->iov_len;
1082         unsigned char __user *from = iov->iov_base;
1083
1084         iov++;
1085         if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */
1086             if (offset >= seglen) {//当前块的内容都已经在SYN中发送过了
1087                 offset -= seglen;
1088                 continue;
1089             }
1090             seglen -= offset;
1091             from += offset;
1092             offset = 0;
1093         }
1094
1095         while (seglen > 0) {//块中有数据没有发送
1096             int copy = 0;
1097             int max = size_goal;
1098
1099             skb = tcp_write_queue_tail(sk);
1100             if (tcp_send_head(sk)) {//如果还有包没有发送
1101                 if (skb->ip_summed == CHECKSUM_NONE)
1102                     max = mss_now;
1103                 copy = max - skb->len;//计算包中的剩余空间大小
1104             }
1105
1106             if (copy <= 0) {//没有剩余空间
1107 new_segment:
1108                 /* Allocate new segment. If the interface is SG,
1109                  * allocate skb fitting to single page.
1110                  */
1111                 if (!sk_stream_memory_free(sk))//检查发送队列中SKB占用的内存是否超出了socket的限制
1112                     goto wait_for_sndbuf;
1113
1114                 skb = sk_stream_alloc_skb(sk,
1115                               select_size(sk, sg),
1116                               sk->sk_allocation);//申请新的SKB,其数据部分空间大小是一个最大报文段的值
1117                 if (!skb)
1118                     goto wait_for_memory;
1119
1120                 /*
1121                  * All packets are restored as if they have
1122                  * already been sent.
1123                  */
1124                 if (tp->repair)
1125                     TCP_SKB_CB(skb)->when = tcp_time_stamp;
1126
1127                 /*
1128                  * Check whether we can use HW checksum.
1129                  */
1130                 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
1131                     skb->ip_summed = CHECKSUM_PARTIAL;
1132
1133                 skb_entail(sk, skb);//将SKB放入发送队列末尾
1134                 copy = size_goal;
1135                 max = size_goal;
1136             }
1137
1138             /* Try to append data to the end of skb. */
1139             if (copy > seglen)
1140                 copy = seglen;
1141
1142             /* Where to copy to? */
1143             if (skb_availroom(skb) > 0) {//线性区还有空间
1144                 /* We have some space in skb head. Superb! */
1145                 copy = min_t(int, copy, skb_availroom(skb));
1146                 err = skb_add_data_nocache(sk, skb, from, copy);//将用户态内存中的数据copy到SKB中
1147                 if (err)
1148                     goto do_fault;
1149             } else {
1150                 bool merge = true;
1151                 int i = skb_shinfo(skb)->nr_frags;//已经分配非连续页的数量
1152                 struct page_frag *pfrag = sk_page_frag(sk);
1153
1154                 if (!sk_page_frag_refill(sk, pfrag))//判断当前页是否有空间可写;如果没有则申请新页,申请不到则需要等待有内存可用
1155                     goto wait_for_memory;
1156
1157                 if (!skb_can_coalesce(skb, i, pfrag->page,
1158                               pfrag->offset)) {//判断当前页是否需要加入到skb_shinfo(skb)->frags数组中
1159                     if (i == MAX_SKB_FRAGS || !sg) {
1160                         tcp_mark_push(tp, skb);
1161                         goto new_segment;
1162                     }
1163                     merge = false;//无法与frags数组的最后一个成员合并,需要将当前页作为新成员加入skb_shinfo(skb)->frags数组
1164                 }
1165
1166                 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1167
1168                 if (!sk_wmem_schedule(sk, copy))//查看socket的内存限制
1169                     goto wait_for_memory;
1170
1171                 err = skb_copy_to_page_nocache(sk, from, skb,
1172                                    pfrag->page,
1173                                    pfrag->offset,
1174                                    copy);//将用户态空间中的数据copy到页中
1175                 if (err)
1176                     goto do_error;
1177
1178                 /* Update the skb. */
1179                 if (merge) {//没有申请新页
1180                     skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);//更新页大小
1181                 } else {
1182                     skb_fill_page_desc(skb, i, pfrag->page,
1183                                pfrag->offset, copy);//将新页加入到skb_shinfo(skb)->frags数组中
1184                     get_page(pfrag->page);
1185                 }
1186                 pfrag->offset += copy;
1187             }
1188
1189             if (!copied)
1190                 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1191
1192             tp->write_seq += copy;
1193             TCP_SKB_CB(skb)->end_seq += copy;
1194             skb_shinfo(skb)->gso_segs = 0;
1195
1196             from += copy;
1197             copied += copy;
1198             if ((seglen -= copy) == 0 && iovlen == 0)//数据全部copy到内核空间
1199                 goto out;
1200
1201             if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1202                 continue;//skb尚未填满、发送的是紧急数据或处于repair模式时,先不发送,继续循环copy剩余数据
1203
1204             if (forced_push(tp)) {//如果积累了一定长度的数据没有push,在数据全部copy进内核前也可以先发送一次
1205                 tcp_mark_push(tp, skb);//在SKB中增加PSH标记,更新pushed_seq标记
1206                 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);//发送数据
1207             } else if (skb == tcp_send_head(sk)) //如果当前包是发送队列中第一个包,先发送之
1208                 tcp_push_one(sk, mss_now);//只发送一个包
1209             continue;
1210
1211 wait_for_sndbuf:
1212             set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1213 wait_for_memory:
1214             if (copied)//如果已经有copy进内核的数据,在睡眠等待内存前先发送一次
1215                 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1216
1217             if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)//睡眠直到有空余内存或超时
1218                 goto do_error;
1219
1220             mss_now = tcp_send_mss(sk, &size_goal, flags);
1221         }
1222     }
1223
1224 out:
1225     if (copied)//如果有数据copy到内核
1226         tcp_push(sk, flags, mss_now, tp->nonagle);//发送数据
1227     release_sock(sk);//解除软中断不能访问socket的标记,并处理backlog中积累的数据包
1228     return copied + copied_syn;
1229
1230 do_fault:
1231     if (!skb->len) {
1232         tcp_unlink_write_queue(skb, sk);
1233         /* It is the one place in all of TCP, except connection
1234          * reset, where we can be unlinking the send_head.
1235          */
1236         tcp_check_send_head(sk, skb);
1237         sk_wmem_free_skb(sk, skb);
1238     }
1239
1240 do_error:
1241     if (copied + copied_syn)
1242         goto out;
1243 out_err:
1244     err = sk_stream_error(sk, flags, err);
1245     release_sock(sk);
1246     return err;
1247 }
  1027:标识socket正在被系统调用访问,软中断收到这个socket的包后只能挂到backlog队列中然后返回,待系统调用结束时调用release_sock(sk)处理

  1150-1163:线性区没有空间,但还允许向这个skb中写入数据,原因是mss变大了或者网卡支持分散-聚集IO,只能将数据保存在非连续空间中;如果网卡不支持分散-聚集IO,则系统会在将数据发送到驱动前将非线性区中的数据线性化

  1201:skb->len < max为真意味着skb还可以申请一些空间保存数据,故先不发送,要尽量填满skb以提高发送效率

  tcp_sendmsg函数的主要功能是:

1、如果发送队列尾部的skb尚未发送而且还有剩余空间,则将用户缓存中的数据copy进去;如果没有这样的空间则申请一个数据空间固定大小的skb,再copy数据;

2、一个skb的空间如果不够就再申请一个固定大小的skb,再copy数据,直到数据全部copy完毕,或skb的缓存空间无法申请,或发送缓存达到限制为止;若是后两种情况,如果socket是非阻塞的则立即返回,否则会等待能够得到空间或超时

3、将申请的skb放入发送队列尾部,再调用tcp_push、__tcp_push_pending_frames或tcp_push_one函数发送队列中的skb

  tcp_push函数:

619 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
 620                 int nonagle)
 621 {
 622     if (tcp_send_head(sk)) { //有数据未发送
 623         struct tcp_sock *tp = tcp_sk(sk);
 624
 625         if (!(flags & MSG_MORE) || forced_push(tp))
 626             tcp_mark_push(tp, tcp_write_queue_tail(sk));
 627
 628         tcp_mark_urg(tp, flags);     //如果客户设置要发送OOB数据,则记录紧急数据的下一个字节的序列号,其实紧急数据就是当前数据包的最后一个字节数据
 629         __tcp_push_pending_frames(sk, mss_now,
 630                       (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
 631     }
 632 }
   __tcp_push_pending_frames、tcp_push_one都会调用tcp_write_xmit函数发送数据:
1811 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1812                int push_one, gfp_t gfp)
1813 {
1814     struct tcp_sock *tp = tcp_sk(sk);
1815     struct sk_buff *skb;
1816     unsigned int tso_segs, sent_pkts;
1817     int cwnd_quota;      
1818     int result;
1819
1820     sent_pkts = 0;
1821
1822     if (!push_one) {
1823         /* Do MTU probing. */
1824         result = tcp_mtu_probe(sk);    
1825         if (!result) {   
1826             return false;
1827         } else if (result > 0) {           
1828             sent_pkts = 1;
1829         }
1830     }
1831
1832     while ((skb = tcp_send_head(sk))) {//获取没有发送的最老的skb
1833         unsigned int limit;
1834
1835
1836         tso_segs = tcp_init_tso_segs(sk, skb, mss_now);//获取网卡将这个skb分割成的段的个数
1837         BUG_ON(!tso_segs);
1838
1839         if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
1840             goto repair; /* Skip network transmission */
1841
1842         cwnd_quota = tcp_cwnd_test(tp, skb);//计算拥塞窗口还允许发送的段(包)的个数
1843         if (!cwnd_quota) {
1844             if (push_one == 2)//如果要发送TCP探测包
1845                 /* Force out a loss probe pkt. */
1846                 cwnd_quota = 1;//允许发送一个段
1847             else
1848                 break;
1849         }
1850
1851         if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) //检查发送窗口是否允许发送数据
1852             break;
1853
1854         if (tso_segs == 1) {//网卡会将此skb按一个包发送
1855             if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
1856                              (tcp_skb_is_last(sk, skb) ?
1857                               nonagle : TCP_NAGLE_PUSH))))//查看nagle算法是否允许发送当前包
1858                 break;
1859         } else {//网卡会将此skb分割成多个包发送
1860             if (!push_one && tcp_tso_should_defer(sk, skb))//判断在希望发送多个包时TSO是否会延迟发送该包
1861                 break;
1862         }
1863
1864         /* TSQ : sk_wmem_alloc accounts skb truesize,
1865          * including skb overhead. But thats OK.
1866          */
1867         if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
1868             set_bit(TSQ_THROTTLED, &tp->tsq_flags);
1869             break;
1870         }
1871         limit = mss_now;
1872         if (tso_segs > 1 && !tcp_urg_mode(tp))//网卡会将此skb分割成多个包发送并且没有紧急数据
1873             limit = tcp_mss_split_point(sk, skb, mss_now,
1874                             min_t(unsigned int,
1875                               cwnd_quota,
1876                               sk->sk_gso_max_segs));//计算网卡能一次发送的字节数
1877
1878         if (skb->len > limit &&    //如果包过大
1879             unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))//就得拆成两个包,当前包的大小会减小为与limit一致
1880             break;
1881
1882         TCP_SKB_CB(skb)->when = tcp_time_stamp;//记录发送时间
1883
1884         if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))//发送数据的副本
1885             break;
1886
1887 repair:
1888         /* Advance the send_head.  This one is sent out.
1889          * This call will increment packets_out.
1890          */
1891         tcp_event_new_data_sent(sk, skb);//send_head指向下一个要发送的包
1892
1893         tcp_minshall_update(tp, mss_now, skb);
1894         sent_pkts += tcp_skb_pcount(skb);//计算已经发送的包的个数
1895
1896         if (push_one)
1897             break;
1898     }
1899
1900     if (likely(sent_pkts)) {//至少发送了一个包
1901         if (tcp_in_cwnd_reduction(sk))
1902             tp->prr_out += sent_pkts;
1903
1904         /* Send one loss probe per tail loss episode. */
1905         if (push_one != 2)
1906             tcp_schedule_loss_probe(sk);
1907         tcp_cwnd_validate(sk);
1908         return false;
1909     }
1910     return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
1911 }
   tcp_write_xmit函数会根据拥塞窗口、发送窗口、nagle算法等条件判断是否发送数据以及发送多少,即新放入发送队列中的数据不一定会立即发送。而且发送包时是发送skb的副本,原来的skb会一直留在发送队列中,如果发生了数据丢失则TCP会将发送队列中的skb再发送一次,直到数据被确认时才能删除。

  可以发送时tcp_write_xmit函数会调用tcp_transmit_skb函数发送skb:

828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 829                 gfp_t gfp_mask)
 830 {                     
 831     const struct inet_connection_sock *icsk = inet_csk(sk);
 832     struct inet_sock *inet;
 833     struct tcp_sock *tp;
 834     struct tcp_skb_cb *tcb;
 835     struct tcp_out_options opts;
 836     unsigned int tcp_options_size, tcp_header_size;
 837     struct tcp_md5sig_key *md5;
 838     struct tcphdr *th;
 839     int err;
 840     
 841     BUG_ON(!skb || !tcp_skb_pcount(skb));
 842     
 843     /* If congestion control is doing timestamping, we must
 844      * take such a timestamp before we potentially clone/copy.
 845      */
 846     if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
 847         __net_timestamp(skb);
 848
 849     if (likely(clone_it)) {//clone一个副本发送出去,原本留在队列中等待ACK确认后再删除
 850         const struct sk_buff *fclone = skb + 1;
 851
 852         if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
 853                  fclone->fclone == SKB_FCLONE_CLONE))
 854             NET_INC_STATS_BH(sock_net(sk),
 855                      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
 856
 857         if (unlikely(skb_cloned(skb)))
 858             skb = pskb_copy(skb, gfp_mask);
 859         else
 860             skb = skb_clone(skb, gfp_mask);
 861         if (unlikely(!skb))
 862             return -ENOBUFS;
 863     }
 864
 865     inet = inet_sk(sk);
 866     tp = tcp_sk(sk);
 867     tcb = TCP_SKB_CB(skb);
 868     memset(&opts, 0, sizeof(opts));
 869
 870     if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
 871         tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);//构建SYN包的选项信息
 872     else
 873         tcp_options_size = tcp_established_options(sk, skb, &opts,//构建非SYN包的选项信息
 874                                &md5);
 875     tcp_header_size = tcp_options_size + sizeof(struct tcphdr);//计算TCP头长度
 876
 877     if (tcp_packets_in_flight(tp) == 0)//没有停留在网络中的包(即发送后未被确认的包,包括重传包)
 878         tcp_ca_event(sk, CA_EVENT_TX_START);
 879
 880     /* if no packet is in qdisc/device queue, then allow XPS to select
 881      * another queue.
 882      */
 883     skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
 884
 885     skb_push(skb, tcp_header_size);//skb->data指向TCP头
 886     skb_reset_transport_header(skb);
 887
 888     skb_orphan(skb);
 889     skb->sk = sk;
 890     skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
 891               tcp_wfree : sock_wfree;
 892     atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 893
 894     /* Build TCP header and checksum it. */
 895     th = tcp_hdr(skb);
 896     th->source      = inet->inet_sport;//设置源端口
 897     th->dest        = inet->inet_dport;//设置目的端口
 898     th->seq         = htonl(tcb->seq);//设置序列号
 899     th->ack_seq     = htonl(tp->rcv_nxt);//设置确认号
 900     *(((__be16 *)th) + 6)   = htons(((tcp_header_size >> 2) << 12) |
 901                     tcb->tcp_flags);//设置控制标记位
 902
 903     if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
 904         /* RFC1323: The window in SYN & SYN/ACK segments
 905          * is never scaled.
 906          */
 907         th->window  = htons(min(tp->rcv_wnd, 65535U));
 908     } else {
 909         th->window  = htons(tcp_select_window(sk));
 910     }
 911     th->check       = 0;
 912     th->urg_ptr     = 0;
 913
 914     /* The urg_mode check is necessary during a below snd_una win probe */
 915     if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {//有紧急数据且当前包的序列号小于紧急数据的下一字节的序列号
 916         if (before(tp->snd_up, tcb->seq + 0x10000)) {//紧急数据下一字节的序列号与当前包的序列号的差值小于等于紧急指针的最大值(65535)
 917             th->urg_ptr = htons(tp->snd_up - tcb->seq);//设置紧急指针指向紧急数据的下一字节
 918             th->urg = 1;//标识紧急指针有效,即当前包的数据是紧急数据
 919         } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
 920             th->urg_ptr = htons(0xFFFF);//设置紧急指针为最大值,以示紧急数据并不在当前包内
 921             th->urg = 1;
 922         }
 923     }
 924
 925     tcp_options_write((__be32 *)(th + 1), tp, &opts);//将之前构建的选项写入SKB中
 926     if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
 927         TCP_ECN_send(sk, skb, tcp_header_size);
 928
 929 #ifdef CONFIG_TCP_MD5SIG
 930     /* Calculate the MD5 hash, as we have all we need now */
 931     if (md5) {
 932         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 933         tp->af_specific->calc_md5_hash(opts.hash_location,
 934                            md5, sk, NULL, skb);
 935     }
 936 #endif
 937
 938     icsk->icsk_af_ops->send_check(sk, skb);//调用tcp_v4_send_check或tcp_v6_send_check计算TCP检验和
 939
 940     if (likely(tcb->tcp_flags & TCPHDR_ACK))//包中有ACK标记
 941         tcp_event_ack_sent(sk, tcp_skb_pcount(skb));//可以取消延迟ACK定时器,因为即将发送的包中已经携带ACK标记
 942
 943     if (skb->len != tcp_header_size)
 944         tcp_event_data_sent(tp, sk);
 945
 946     if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
 947         TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
 948                   tcp_skb_pcount(skb));
 949
 950     err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);//调用ip_queue_xmit或inet6_csk_xmit构建IP头并将包发送出去
 951     if (likely(err <= 0))
 952         return err;
 953
 954     tcp_enter_cwr(sk, 1);
 955
 956     return net_xmit_eval(err);
 957 }                                                        
  919-921:紧急数据下一字节的序列号与当前包的序列号的差值大于紧急指针的最大值65535,且snd_nxt(下一个要发送的新数据的序列号)与当前包的序列号的差值小于65535。这意味着紧急数据距当前包太远,16位的紧急指针无法准确指向它,故只能将紧急指针设为最大值0xFFFF并设置URG标记,以告知对端有紧急数据但并不在当前包内

  923:所有序列号小于紧急数据下一字节的序列号,且与最新已发送数据的序列号差值小于等于紧急指针的最大值的数据(包括重传)在发送时都会被设置紧急指针,故TCP没法标明紧急数据从何处开始,也不需要,因为紧急数据只有1字节而已,只要能通过紧急指针找到这字节数据即可

  关于紧急指针的功能后续章节再讨论。

  综上,TCP发送数据的系统调用会将用户指定缓冲区的内容复制到内核中,内核存储数据的结构是skb。每个skb存储数据的空间大小是固定的,即,用户进程指定的连续缓存中的数据可能会被TCP拆成多个固定长度的skb来存储并发送,这被称为“分段”。发送时一个skb会被封成一个TCP报文,进而封装成一个IP报文。skb包在放入队列时是一个一个添加到队列尾部顺序放入,发送时是从队列头开始一个一个按序发送,从而保证了发送数据的顺序与用户缓存中的顺序是一致的。发数据的系统调用返回值是“成功发送的字节数”,但实际上只是“成功放入TCP发送队列中的字节数”,放入队列的数据不一定会立即发送,但TCP会负责将这些数据可靠地发送给对端,在成功之前不会删除。也就是说,用户进程可以认为数据已经发送成功了。

  TCP也可以copy大段数据(超过一个MSS)到一个skb中,再交由网卡硬件完成分段,从而减少内核中的分段开销,这种技术名为“TSO (TCP Segmentation Offload)”。

你可能感兴趣的:(tcp,linux内核)