5.4 ACK发送与接收

  TCP在发送数据的时候会携带ACK标记,但这里我们要研究的是TCP在收到数据时发送不带数据的ACK报文的情况,这时ACK的发送是通过tcp_send_ack函数完成的:

3027 void tcp_send_ack(struct sock *sk)                                                                                                         
3028 {
3029     struct sk_buff *buff;
3030 
3031     /* If we have been reset, we may not send again. */
3032     if (sk->sk_state == TCP_CLOSE)     
3033         return;          
3034 
3035     /* We are not putting this on the write queue, so
3036      * tcp_transmit_skb() will set the ownership to this                                                                                   
3037      * sock.
3038      */
3039     buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));    //申请一个只能容纳TCP首部的skb                                                                   
3040     if (buff == NULL) {   //申请失败
3041         inet_csk_schedule_ack(sk);     
3042         inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; //记录用来计算延时确认的估值
3043         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3044                       TCP_DELACK_MAX, TCP_RTO_MAX); //使用延迟ACK定时器发送ACK                                                                                        
3045         return;
3046     }
3047 
3048     /* Reserve space for headers and prepare control bits. */
3049     skb_reserve(buff, MAX_TCP_HEADER);
3050     tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);            //设置ACK标记                                                            
3051 
3052     /* Send it off, this clears delayed acks for us. */
3053     TCP_SKB_CB(buff)->when = tcp_time_stamp;  //记录发送时间,用于计算RTT
3054     tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));        //发送无数据的ACK                                                                  
3055 }

  tcp_transmit_skb函数会将ACK标记写入TCP报头,并设置ack_seq:

 828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 829                 gfp_t gfp_mask)                
 830 {
 831     const struct inet_connection_sock *icsk = inet_csk(sk);
...
 899     th->ack_seq     = htonl(tp->rcv_nxt);
 900     *(((__be16 *)th) + 6)   = htons(((tcp_header_size >> 2) << 12) |
 901                     tcb->tcp_flags);
...
  899:tp->rcv_nxt是“下次希望接收的数据的序列号”,tp->rcv_nxt - 1则是“已经按序接收到的最后一个字节的序列号”。

  发送ACK的时机有:

(1)进程将TCP收到的数据读取(通过tcp_recvmsg、tcp_splice、DMA等等)完毕后,会调用tcp_cleanup_rbuf函数发送ACK来通知数据发送端更新窗口:

1323 void tcp_cleanup_rbuf(struct sock *sk, int copied)
1324 {
1325     struct tcp_sock *tp = tcp_sk(sk);
1326     bool time_to_ack = false;
1327 
1328     struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1329 
1330     WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1331          "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1332          tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1333 
1334     if (inet_csk_ack_scheduled(sk)) { //需要发送ACK
1335         const struct inet_connection_sock *icsk = inet_csk(sk);
1336            /* Delayed ACKs frequently hit locked sockets during bulk
1337             * receive. */
1338         if (icsk->icsk_ack.blocked || //延迟ACK定时器超时时进程锁定了socket,导致ACK无法发送
1339             /* Once-per-two-segments ACK was not sent by tcp_input.c */
1340             tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || //自上次通告窗口以来收到了超过1个MSS的数据但还未发送ACK
1341             /*
1342              * If this read emptied read buffer, we send ACK, if
1343              * connection is not bidirectional, user drained
1344              * receive buffer and there was a small segment
1345              * in queue.
1346              */
1347             (copied > 0 && //进程至少copy了1字节数据
1348              ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1349               ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1350                !icsk->icsk_ack.pingpong)) && //socket不处于交互模式(交互模式允许延迟发送ACK)
1351               !atomic_read(&sk->sk_rmem_alloc)))  //接收缓存为空
1352             time_to_ack = true;
1353     }
1354 
1355     /* We send an ACK if we can now advertise a non-zero window
1356      * which has been raised "significantly".
1357      *
1358      * Even if window raised up to infinity, do not send window open ACK
1359      * in states, where we will not receive more. It is useless.
1360      */
1361     if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1362         __u32 rcv_window_now = tcp_receive_window(tp); //得到当前接收窗口的值,即对端计算的发送窗口
1363 
1364         /* Optimize, __tcp_select_window() is not cheap. */
1365         if (2*rcv_window_now <= tp->window_clamp) { //当前接收窗口小于最大接收窗口的一半
1366             __u32 new_window = __tcp_select_window(sk); //根据当前缓存情况得出真正的通告窗口
1367 
1368             /* Send ACK now, if this read freed lots of space
1369              * in our buffer. Certainly, new_window is new window.
1370              * We can advertise it now, if it is not less than current one.
1371              * "Lots" means "at least twice" here.
1372              */
1373             if (new_window && new_window >= 2 * rcv_window_now) //真正的通告窗口是现在对端计算的发送窗口的2倍以上
1374                 time_to_ack = true;
1375         }
1376     }
1377     if (time_to_ack)  //决定发送ACK
1378         tcp_send_ack(sk);
1379 }
  1348-1349:看一下icsk->icsk_ack.pending中标记位的含义:

  ICSK_ACK_SCHED:有ACK需要发送,是立即发送还是延时发送,还需要看其他标志,也是能否发送确认的前提

  ICSK_ACK_TIMER:延时发送ACK定时器已经启动

  ICSK_ACK_PUSHED:只要有ACK需要发送,并且pingpong为0时,ACK可以立即发送

  ICSK_ACK_PUSHED2:只要有ACK需要发送,都可以立即发送,不管是否处于交互(pingpong)模式

  如果经过上面流程决定无需发送ACK,则还要进行下面的检查:

  1361-1374:如果有数据copy到进程中,且socket的接收方向没有关闭(未设置RCV_SHUTDOWN),则通过计算新的窗口值来决定是否发送ACK

  综上,进程读取数据后发送ACK的基本条件是:进程成功地copy了数据并且接收缓存为空,或当前可以通告给对端的窗口是对端计算的发送窗口的2倍以上。

(2)报文的序列号或确认号非法导致调用tcp_send_challenge_ack发送挑战ACK

3254 static void tcp_send_challenge_ack(struct sock *sk)
3255 {
3256     /* unprotected vars, we dont care of overwrites */
3257     static u32 challenge_timestamp;
3258     static unsigned int challenge_count;
3259     u32 now = jiffies / HZ;
3260 
3261     if (now != challenge_timestamp) {
3262         challenge_timestamp = now;
3263         challenge_count = 0;
3264     }
3265     if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
3266         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
3267         tcp_send_ack(sk);
3268     }
3269 }
(3)使用tcp_send_dupack发送重复ACK时:

3895 static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
3896 {
3897     struct tcp_sock *tp = tcp_sk(sk);
3898 
3899     if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && //包中有数据或SYN或FIN标记位
3900         before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { //包中有旧数据
3901         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
3902         tcp_enter_quickack_mode(sk); //进入快速ACK模式
3903 
3904         if (tcp_is_sack(tp) && sysctl_tcp_dsack) { //处理SACK相关信息
3905             u32 end_seq = TCP_SKB_CB(skb)->end_seq;
3906 
3907             if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
3908                 end_seq = tp->rcv_nxt;
3909             tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
3910         }
3911     }
3912 
3913     tcp_send_ack(sk); //发送ACK
3914 }
  3902:在包中有部分(也许是全部)旧数据的情况下TCP会调用tcp_enter_quickack_mode进入快速ACK模式。现简要讨论一下快速ACK模式,这个模式下几乎每收到一个包都会发送ACK。相关代码如下:

 173 static void tcp_incr_quickack(struct sock *sk)
 174 {
 175     struct inet_connection_sock *icsk = inet_csk(sk);
 176     unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); //按照每两个MSS一个ACK来计算ACK的数量
 177 
 178     if (quickacks == 0)
 179         quickacks = 2;  //至少发送2个ACK
 180     if (quickacks > icsk->icsk_ack.quick)
 181         icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); //允许快速发送的ACK的最大数量是TCP_MAX_QUICKACKS(16)
 182 }
 183 
 184 static void tcp_enter_quickack_mode(struct sock *sk)
 185 {
 186     struct inet_connection_sock *icsk = inet_csk(sk);
 187     tcp_incr_quickack(sk); //增加可以快速发送ACK段的数量
 188     icsk->icsk_ack.pingpong = 0; //禁用乒乓模式(即交互模式)
 189     icsk->icsk_ack.ato = TCP_ATO_MIN;
 190 }
 191 
 192 /* Send ACKs quickly, if "quick" count is not exhausted
 193  * and the session is not interactive.
 194  */
 195 
 196 static inline bool tcp_in_quickack_mode(const struct sock *sk) //这个函数用于判断是否处于快速ACK模式
 197 {
 198     const struct inet_connection_sock *icsk = inet_csk(sk);
 199 
 200     return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
 201 }
  __tcp_ack_snd_check会调用tcp_in_quickack_mode函数判断是否处于快速ACK模式,如果是则立即发送ACK,否则延迟发送。

  调用tcp_send_dupack有两种情况:PAWS检查失败和序列号非法,这时需要丢包并发送ACK:

4985 static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
4986                   const struct tcphdr *th, int syn_inerr)
4987 {
4988     struct tcp_sock *tp = tcp_sk(sk);
4989 
4990     /* RFC1323: H1. Apply PAWS check first. */
4991     if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4992         tcp_paws_discard(sk, skb)) {
4993         if (!th->rst) {
4994             NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
4995             tcp_send_dupack(sk, skb); //PAWS检查失败,skb是旧包
4996             goto discard;
4997         }
4998         /* Reset is accepted even if it did not pass PAWS. */
4999     }
5000 
5001     /* Step 1: check sequence number */
5002     if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
5003         /* RFC793, page 37: "In all states except SYN-SENT, all reset
5004          * (RST) segments are validated by checking their SEQ-fields."
5005          * And page 69: "If an incoming segment is not acceptable,
5006          * an acknowledgment should be sent in reply (unless the RST
5007          * bit is set, if so drop the segment and return)".
5008          */
5009         if (!th->rst) {
5010             if (th->syn)
5011                 goto syn_challenge;
5012             tcp_send_dupack(sk, skb); //序列号非法
5013         }
5014         goto discard;
...
(4)__tcp_ack_snd_check函数:

4758 static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
4759 {
4760     struct tcp_sock *tp = tcp_sk(sk);
4761 
4762         /* More than one full frame received... */
4763     if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
4764          /* ... and right edge of window advances far enough.
4765           * (tcp_recvmsg() will send ACK otherwise). Or...
4766           */
4767          __tcp_select_window(sk) >= tp->rcv_wnd) ||  //收到超过一个MSS的数据且新的通告窗口没有缩小
4768         /* We ACK each frame or... */
4769         tcp_in_quickack_mode(sk) || //在快速ACK模式下
4770         /* We have out of order data. */
4771         (ofo_possible && skb_peek(&tp->out_of_order_queue))) { //有乱序数据
4772         /* Then ack it now */
4773         tcp_send_ack(sk);
4774     } else {
4775         /* Else, send delayed ack. */
4776         tcp_send_delayed_ack(sk);
4777     }
4778 }
  快速路径处理模式下每收到一个包就会调用一次__tcp_ack_snd_check,而在快速ACK模式下就会对每个收到的包都发送ACK:

5076 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5077             const struct tcphdr *th, unsigned int len)
5078 {
5079     struct tcp_sock *tp = tcp_sk(sk);
...
5236             if (!copied_early || tp->rcv_nxt != tp->rcv_wup) //如果没有数据通过DMA直接copy给进程,或接收了新的数据
5237                 __tcp_ack_snd_check(sk, 0);
...

  tcp_send_delayed_ack是使用延迟ACK定时器延迟发送ACK:

2974 void tcp_send_delayed_ack(struct sock *sk)
2975 {
2976     struct inet_connection_sock *icsk = inet_csk(sk);
...
3021     icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
3022     icsk->icsk_ack.timeout = timeout;
3023     sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
3024 }

(5)tcp_ack_snd_check函数:

4780 static inline void tcp_ack_snd_check(struct sock *sk)
4781 {
4782     if (!inet_csk_ack_scheduled(sk)) {
4783         /* We sent a data segment already. */
4784         return;
4785     }
4786     __tcp_ack_snd_check(sk, 1);
4787 }   

  只有当inet_csk_ack_scheduled为真时tcp_ack_snd_check函数才会发送ACK:

166 static inline void inet_csk_schedule_ack(struct sock *sk)
167 {
168     inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED;
169 }
170 
171 static inline int inet_csk_ack_scheduled(const struct sock *sk)
172 {
173     return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED;
174 }
  这时意味着有发送ACK的必要,除了tcp_send_delayed_ack设置延迟ACK定时器之外,调用inet_csk_schedule_ack也会使inet_csk_ack_scheduled为真,而调用inet_csk_schedule_ack的条件有:

  1)tcp_event_data_recv函数

 584 static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 585 {
 586     struct tcp_sock *tp = tcp_sk(sk);
 587     struct inet_connection_sock *icsk = inet_csk(sk);
 588     u32 now;
 589 
 590     inet_csk_schedule_ack(sk);
...
  而tcp_event_data_recv被调用的条件是收到了按序到达的新数据(tcp_rcv_established的快速路径以及tcp_data_queue在把数据放入接收队列后都会调用它)。

  2)收到乱序数据:

4121 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4122 {
...
4134 
4135     /* Disable header prediction. */
4136     tp->pred_flags = 0;
4137     inet_csk_schedule_ack(sk);
...
  3)收到重传的旧数据或窗口之外的数据:

4300 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4301 {
...
4380     if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4381         /* A retransmit, 2nd most common case.  Force an immediate ack. */
4382         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4383         tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4384 
4385 out_of_window:
4386         tcp_enter_quickack_mode(sk);
4387         inet_csk_schedule_ack(sk);
4388 drop:
4389         __kfree_skb(skb);
4390         return;
4391     }
...
(6)延迟ACK定时器会在超时时发送ACK;使用延迟ACK机制可以一次累计确认多个报文段,这样就减少了ACK报文段的发送,从而减轻了网络拥塞。除了tcp_send_delayed_ack会设置延迟ACK定时器外,TCP在处理prequeue时也会设置:

1919 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1920 {
...
1931     __skb_queue_tail(&tp->ucopy.prequeue, skb);
1932     tp->ucopy.memory += skb->truesize;
1933     if (tp->ucopy.memory > sk->sk_rcvbuf) {
1934         struct sk_buff *skb1;
1935 
1936         BUG_ON(sock_owned_by_user(sk));
1937 
1938         while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1939             sk_backlog_rcv(sk, skb1);
1940             NET_INC_STATS_BH(sock_net(sk),
1941                      LINUX_MIB_TCPPREQUEUEDROPPED);
1942         }
1943 
1944         tp->ucopy.memory = 0;
1945     } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { //队列中仅有一个skb
1946         wake_up_interruptible_sync_poll(sk_sleep(sk),
1947                        POLLIN | POLLRDNORM | POLLRDBAND);
1948         if (!inet_csk_ack_scheduled(sk))
1949             inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1950                           (3 * tcp_rto_min(sk)) / 4,
1951                           TCP_RTO_MAX);
1952     }
1953     return true;
1954 }
  关于prequeue后续章节会详细讨论

(7)TCP在将承载进程传入内核数据的skb放入发送队列时会设置ACK标记:

 596 static inline void skb_entail(struct sock *sk, struct sk_buff *skb) //由tcp_sendmsg和do_tcp_sendpages调用
 597 {
 598     struct tcp_sock *tp = tcp_sk(sk);
 599     struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 600 
 601     skb->csum    = 0;
 602     tcb->seq     = tcb->end_seq = tp->write_seq;
 603     tcb->tcp_flags = TCPHDR_ACK; //设置ACK标记
 604     tcb->sacked  = 0;
 605     skb_header_release(skb);
 606     tcp_add_write_queue_tail(sk, skb);
 607     sk->sk_wmem_queued += skb->truesize;
 608     sk_mem_charge(sk, skb->truesize);
 609     if (tp->nonagle & TCP_NAGLE_PUSH)
 610         tp->nonagle &= ~TCP_NAGLE_PUSH;
 611 }
  调用tcp_transmit_skb发送数据时这个标记会被设置到TCP报头上:

 828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 829                 gfp_t gfp_mask)
 830 {
...
 900     *(((__be16 *)th) + 6)   = htons(((tcp_header_size >> 2) << 12) |
 901                     tcb->tcp_flags);
...
 940     if (likely(tcb->tcp_flags & TCPHDR_ACK))
 941         tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
...
  这样TCP发送的数据就会携带ACK标记,tcp_event_ack_sent函数会卸载延迟ACK定时器,并可能会取消快速ACK模式,从而减少ACK报文段的发送。这种方法被称为“捎带确认”:

 178 static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 179 {
 180     tcp_dec_quickack_mode(sk, pkts); //减少quickack计数,减小到0则快速ACK模式会被禁用
 181     inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);  //卸载延迟ACK定时器
 182 }   

(8)另外,TCP收到SYN|ACK、FIN也会发送ACK,此处不做讨论

  总之,TCP发送ACK的情况分为以下几类:

(1)收到非法报文时

(2)进程从TCP接收缓存读取数据后;这时通告窗口可能会变大

(3)收到数据时;这时通告窗口可能会变小,发送ACK会告知对端更新窗口,并让对端释放其发送缓存中已被确认的数据

(4)发送数据段时

(5)收到带SYN、FIN标记的包时

  TCP在接收ACK时,ACK标记的处理由tcp_ack函数完成

3325 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3326 {       
3327     struct inet_connection_sock *icsk = inet_csk(sk);
3328     struct tcp_sock *tp = tcp_sk(sk);
3329     u32 prior_snd_una = tp->snd_una;
3330     u32 ack_seq = TCP_SKB_CB(skb)->seq;
3331     u32 ack = TCP_SKB_CB(skb)->ack_seq;
3332     bool is_dupack = false;
3333     u32 prior_in_flight;
3334     u32 prior_fackets;
3335     int prior_packets = tp->packets_out;
3336     int prior_sacked = tp->sacked_out;
3337     int pkts_acked = 0;
3338     int previous_packets_out = 0;
3339             
3340     /* If the ack is older than previous acks
3341      * then we can probably ignore it.
3342      */         
3343     if (before(ack, prior_snd_una)) { //ack_seq是旧的
3344         /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
3345         if (before(ack, prior_snd_una - tp->max_window)) {//如果ACK序列号太旧了
3346             tcp_send_challenge_ack(sk);//发送挑战ACK给对端,如果连接已经结束则对端会发送RST
3347             return -1;
3348         }
3349         goto old_ack;
3350     }
3351
3352     /* If the ack includes data we haven't sent yet, discard
3353      * this segment (RFC793 Section 3.9).
3354      */
3355     if (after(ack, tp->snd_nxt)) //ack_seq太超前了
3356         goto invalid_ack;
3357
3358     if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || //设置了早期重传定时器
3359         icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) //设置了丢失探测定时器
3360         tcp_rearm_rto(sk); //取消或重新设置重传定时器
3361
3362     if (after(ack, prior_snd_una))//至少会确认1字节数据
3363         flag |= FLAG_SND_UNA_ADVANCED;
3364
3365     prior_fackets = tp->fackets_out;
3366     prior_in_flight = tcp_packets_in_flight(tp); //在“网络中”的报文的数量,即“已发送的 - 已确认的 - 丢失的 + 重传的”
3367
3368     /* ts_recent update must be made after we are sure that the packet
3369      * is in window.
3370      */
3371     if (flag & FLAG_UPDATE_TS_RECENT)
3372         tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); //更新时间戳
3373
3374     if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {//不是慢速路径且至少会确认1字节数据
3375         /* Window is constant, pure forward advance.
3376          * No more checks are required.
3377          * Note, we use the fact that SND.UNA>=SND.WL2.
3378          */
3379         tcp_update_wl(tp, ack_seq); //记录更新窗口的ACK的序列号
3380         tp->snd_una = ack; //更新“已发送但未确认”的序列号,tp->snd_una - 1即已经被确认的数据的序列号
3381         flag |= FLAG_WIN_UPDATE;
3382
3383         tcp_ca_event(sk, CA_EVENT_FAST_ACK); //进行拥塞控制
3384
3385         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
3386     } else {
3387         if (ack_seq != TCP_SKB_CB(skb)->end_seq)//ACK包中有数据
3388             flag |= FLAG_DATA;
3389         else
3390             NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
3391
3392         flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);//更新发送窗口
3393
3394         if (TCP_SKB_CB(skb)->sacked)
3395             flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); //更新发送队列中的SACK信息
3396
3397         if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))//收到显式拥塞通告
3398             flag |= FLAG_ECE;
3399
3400         tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
3401     }
3402
3403     /* We passed data and got it acked, remove any soft error
3404      * log. Something worked...
3405      */
3406     sk->sk_err_soft = 0;
3407     icsk->icsk_probes_out = 0;
3408     tp->rcv_tstamp = tcp_time_stamp;
3409     if (!prior_packets) //没有未被对端收到的数据
3410         goto no_queue;
3411
3412     /* See if we can take anything off of the retransmit queue. */
3413     previous_packets_out = tp->packets_out;
3414     flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);//将已经确认的数据移出发送缓冲
3415
3416     pkts_acked = previous_packets_out - tp->packets_out;//计算被确认的包的个数
3417
3418     if (tcp_ack_is_dubious(sk, flag)) {//如果ACK是重复的,或要进入拥塞状态,或已经在拥塞状态
3419         /* Advance CWND, if state allows this. */
3420         if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) //确认了数据且可以增大拥塞窗口
3421             tcp_cong_avoid(sk, ack, prior_in_flight); //执行拥塞避免算法
3422         is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3423         tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3424                       prior_packets, is_dupack, flag); //更新拥塞控制信息,并决定是否执行快速重传
3425     } else {
3426         if (flag & FLAG_DATA_ACKED) //确认了数据
3427             tcp_cong_avoid(sk, ack, prior_in_flight);
3428     }
3429
3430     if (tp->tlp_high_seq)  //Tail Loss Probe (TLP)算法相关
3431         tcp_process_tlp_ack(sk, ack, flag);
3432
3433     if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
3434         struct dst_entry *dst = __sk_dst_get(sk);
3435         if (dst)
3436             dst_confirm(dst);
3437     }
3438
3439     if (icsk->icsk_pending == ICSK_TIME_RETRANS) //如果设置了重传定时器
3440         tcp_schedule_loss_probe(sk); //将重传定时器转换为丢失探测定时器
3441     return 1;
3442
3443 no_queue:
3444     /* If data was DSACKed, see if we can undo a cwnd reduction. */
3445     if (flag & FLAG_DSACKING_ACK)
3446         tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3447                       prior_packets, is_dupack, flag);
3448     /* If this ack opens up a zero window, clear backoff.  It was
3449      * being used to time the probes, and is probably far higher than
3450      * it needs to be for normal retransmission.
3451      */
3452     if (tcp_send_head(sk)) //在数据已经全部被对方接收且发送队列中有数据未发送,这通常意味着发送窗口太小
3453         tcp_ack_probe(sk); //重新设置窗口探测定时器(坚持定时器)
3454
3455     if (tp->tlp_high_seq)
3456         tcp_process_tlp_ack(sk, ack, flag);
3457     return 1;
3458 
3459 invalid_ack:
3460     SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3461     return -1;
3462 
3463 old_ack:
3464     /* If data was SACKed, tag it and see if we should send more data.
3465      * If data was DSACKed, see if we can undo a cwnd reduction.
3466      */
3467     if (TCP_SKB_CB(skb)->sacked) {
3468         flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
3469         tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3470                       prior_packets, is_dupack, flag);
3471     }
3472 
3473     SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3474     return 0;
3475 }
   tcp_clean_rtx_queue函数用于清除发送队列中已经被确认的skb:
3001 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3002                    u32 prior_snd_una)
3003 {
3004     struct tcp_sock *tp = tcp_sk(sk);
3005     const struct inet_connection_sock *icsk = inet_csk(sk);
3006     struct sk_buff *skb;
3007     u32 now = tcp_time_stamp;
3008     int fully_acked = true;
3009     int flag = 0;
3010     u32 pkts_acked = 0;  
3011     u32 reord = tp->packets_out;   
3012     u32 prior_sacked = tp->sacked_out;
3013     s32 seq_rtt = -1;    
3014     s32 ca_seq_rtt = -1;
3015     ktime_t last_ackt = net_invalid_timestamp();
3016
3017     while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {//获取发送缓存中最旧的已发送数据
3018         struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3019         u32 acked_pcount;
3020         u8 sacked = scb->sacked;
3021
3022         /* Determine how many packets and what bytes were acked, tso and else */
3023         if (after(scb->end_seq, tp->snd_una)) { //skb有一部分数据未确认
3024             if (tcp_skb_pcount(skb) == 1 ||//skb能够被TSO作为一个包发送
3025                 !after(tp->snd_una, scb->seq))//当前数据未被确认
3026                 break;//当前包不能删除
3027
3028             acked_pcount = tcp_tso_acked(sk, skb);//计算有多少个TSO分割后的包能被确认
3029             if (!acked_pcount)
3030                 break;//不能删除当前包的任何一部分
3031
3032             fully_acked = false;//确认了当前包中的一部分
3033         } else {//当前数据全部被确认
3034             acked_pcount = tcp_skb_pcount(skb);//被确认的包的个数是当前包经TSO分割后的包的数量
3035         }
...
3070         if (!(scb->tcp_flags & TCPHDR_SYN)) {
3071             flag |= FLAG_DATA_ACKED;//有数据被确认
3072         } else {
3073             flag |= FLAG_SYN_ACKED;//SYN包不算被确认的数据
3074             tp->retrans_stamp = 0;
3075         }
3076
3077         if (!fully_acked) //如果skb未被完全确认,则不能删除
3078             break;
3079
3080         tcp_unlink_write_queue(skb, sk);//将skb移出发送缓存
3081         sk_wmem_free_skb(sk, skb);//释放skb
3082         tp->scoreboard_skb_hint = NULL;
3083         if (skb == tp->retransmit_skb_hint)
3084             tp->retransmit_skb_hint = NULL;
3085         if (skb == tp->lost_skb_hint)
3086             tp->lost_skb_hint = NULL;
3087     }
3088
3089     if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))//可能确认了紧急数据
3090         tp->snd_up = tp->snd_una;//更新紧急指针;tp->snd_una == tp->snd_up意味着没有发送紧急数据
...
3165     return flag;
3166 }

  代码省略的部分主要是与拥塞控制有关。

  当TCP把数据从发送缓冲中清除时,这些数据的传输才真正完成。使用TCP发送数据的应用进程不需要关心这些过程,它们把数据成功写入TCP发送缓存之后就可以不保存数据的副本了,数据的可靠交付由TCP协议完成。

  从ACK的发送和接收的流程中可以看出,ACK的主要功能为:

(1)通知数据发送端清除发送缓存中已被接收端收到的数据

(2)更新通告窗口

(3)参与拥塞控制(快速重传、拥塞避免等等)

  TCP数据接收端发送ACK通知发送端可以删除发送缓存中的数据时,这些数据可能只是放入了接收端的接收队列中,并没有交付应用进程。应用进程收取数据是通过收包系统调用完成的,详见下节。

你可能感兴趣的:(tcp,linux内核)