TCP在发送SYN、FIN以及数据包时,为了保证可靠传输,会先将它们放入发送队列,再把副本发送到网络中;一旦发现数据丢失(比如连续收到多个ack_seq相同的ACK),就重传发送队列中的skb。但如果丢失检测机制本身失效了呢(比如ACK也丢失了)?这时就需要由重传定时器在超时后重传数据,否则数据传输就可能会阻塞。
设置重传定时器的时机有:
(1)调用tcp_check_sack_reneging处理虚假SACK事件时:
1906 static bool tcp_check_sack_reneging(struct sock *sk, int flag) 1907 { 1908 if (flag & FLAG_SACK_RENEGING) { 1909 struct inet_connection_sock *icsk = inet_csk(sk); 1910 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); 1911 1912 tcp_enter_loss(sk, 1); 1913 icsk->icsk_retransmits++; 1914 tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); 1915 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 1916 icsk->icsk_rto, TCP_RTO_MAX); 1917 return true; 1918 } 1919 return false; 1920 }虚假SACK是指:最新收到的ACK的ack_seq指向已记录的SACK块,这说明记录的SACK并没有反映接收方的真实状态,也就是说接收方现在已经处于严重拥塞的状态或者在处理上有bug,从而删除了乱序队列中的数据(这些数据之前是在SACK选项中发送过来的)。因为按照正常的逻辑流程,接收的ACK不应该指向已记录的SACK,而应该指向SACK后面未接收的地方(因为被SACK的报文是已经放入接收方的乱序队列中,如果收到缺失的段正常情况下会与乱序报文一起交付接收队列,从而使ack_seq指向被SACK的报文的后面)。所以接下来就按照超时重传的方式去处理。
(2)调用tcp_rearm_rto更新RTO时:
2926 void tcp_rearm_rto(struct sock *sk) 2927 { 2928 const struct inet_connection_sock *icsk = inet_csk(sk); 2929 struct tcp_sock *tp = tcp_sk(sk); 2930 2931 /* If the retrans timer is currently being used by Fast Open 2932 * for SYN-ACK retrans purpose, stay put. 2933 */ 2934 if (tp->fastopen_rsk) 2935 return; 2936 2937 if (!tp->packets_out) { //网络中没有已发送的数据 2938 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 2939 } else { 2940 u32 rto = inet_csk(sk)->icsk_rto; 2941 /* Offset the time elapsed after installing regular RTO */ 2942 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 2943 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2944 struct sk_buff *skb = tcp_write_queue_head(sk); 2945 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; 2946 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); 2947 /* delta may not be positive if the socket is locked 2948 * when the retrans timer fires and is rescheduled. 2949 */ 2950 if (delta > 0) 2951 rto = delta; 2952 } 2953 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, 2954 TCP_RTO_MAX); 2955 } 2956 }而调用tcp_rearm_rto并安装重传定时器的常见条件有:
1)收到ACK并确认掉部分数据,且仍然有未确认的数据时:
3001 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, 3002 u32 prior_snd_una) 3003 { ... 3095 if (flag & FLAG_ACKED) { ... 3105 tcp_rearm_rto(sk);2)收到合法ACK并安装了ER定时器或丢失探测定时器时:
3325 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 3326 { ... 3358 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 3359 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) 3360 tcp_rearm_rto(sk); ...3)发送了新数据并调用tcp_event_new_data_sent函数时:
72 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) 73 { 74 struct inet_connection_sock *icsk = inet_csk(sk); 75 struct tcp_sock *tp = tcp_sk(sk); 76 unsigned int prior_packets = tp->packets_out; 77 78 tcp_advance_send_head(sk, skb); 79 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; 80 81 tp->packets_out += tcp_skb_pcount(skb); 82 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 83 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 84 tcp_rearm_rto(sk); 85 } 86 }
(3)发送SYN后:
2925 int tcp_connect(struct sock *sk) 2926 { ... 2947 tcp_connect_queue_skb(sk, buff); ... 2963 /* Timer for repeating the SYN until an answer. */ 2964 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2965 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);(4)开启TFO功能的情况下,收到SYN并发送SYN|ACK后,需要设置重传定时器以便重传SYN|ACK:
1369 static int tcp_v4_conn_req_fastopen(struct sock *sk, 1370 struct sk_buff *skb, 1371 struct sk_buff *skb_synack, 1372 struct request_sock *req) 1373 { ... 1421 /* Activate the retrans timer so that SYNACK can be retransmitted. 1422 * The request socket is not added to the SYN table of the parent 1423 * because it's been added to the accept queue directly. 1424 */ 1425 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, 1426 TCP_TIMEOUT_INIT, TCP_RTO_MAX); ...(5)开启TFO功能的情况下,收到ICMP目的不可达报文时:
326 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) 327 { ... 399 case ICMP_DEST_UNREACH: ... 440 skb = tcp_write_queue_head(sk); 441 BUG_ON(!skb); 442 443 remaining = icsk->icsk_rto - min(icsk->icsk_rto, 444 tcp_time_stamp - TCP_SKB_CB(skb)->when); 445 446 if (remaining) { //从skb发送出去到现在经历的时间比RTO短 447 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 448 remaining, TCP_RTO_MAX);(6)定时器超时的时候可能会设置重传定时器;这种情况暂不分析
清除重传定时器的时机为:
(1)调用tcp_rearm_rto且所有已发送的数据都已经被确认(即tp->packets_out为0)时;
调用tcp_rearm_rto并清除重传定时器的常见情况有:
1)开启TFO的情况下,发送SYN|ACK后收到ACK时:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, 5601 const struct tcphdr *th, unsigned int len) 5602 { ... 5682 case TCP_SYN_RECV: 5683 if (acceptable) { ... 5735 tcp_rearm_rto(sk); ...2)开启TFO的情况下,在TCP_FIN_WAIT1状态下收到ACK但TFO socket仍然存在时:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, 5601 const struct tcphdr *th, unsigned int len) 5602 { ... 5751 case TCP_FIN_WAIT1: ... 5757 if (req != NULL) { ... 5767 reqsk_fastopen_remove(sk, req, false); 5768 tcp_rearm_rto(sk);3)收到ACK并确认掉全部数据时
4)收到合法ACK并安装了ER定时器或丢失探测定时器时
(2)安装丢失探测定时器、ER定时器、坚持定时器时;由于这3个定时器与重传定时器使用同一个数据结构,安装一个就等于拆除了其它类型的定时器。
重传定时器的超时时间是RTO(Retransmission TimeOut),即从发出数据包到第一次重传之间的时间;它是根据测得的RTT动态计算出来的,因此会不断变化。
重传定时器所使用的icsk->icsk_retransmit_timer安装的超时函数是tcp_write_timer:
478 void tcp_write_timer_handler(struct sock *sk) 479 { 480 struct inet_connection_sock *icsk = inet_csk(sk); 481 int event; 482 483 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) //TCP状态是CLOSE或未安装定时器 484 goto out; 485 486 if (time_after(icsk->icsk_timeout, jiffies)) { //尚未超时 487 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); 488 goto out; 489 } 490 491 event = icsk->icsk_pending; 492 493 switch (event) { 494 case ICSK_TIME_EARLY_RETRANS: //ER 495 tcp_resume_early_retransmit(sk); 496 break; 497 case ICSK_TIME_LOSS_PROBE://正常重传&探测报文重传 498 tcp_send_loss_probe(sk); 499 break; 500 case ICSK_TIME_RETRANS://正常重传 501 icsk->icsk_pending = 0; 502 tcp_retransmit_timer(sk); 503 break; 504 case ICSK_TIME_PROBE0: //坚持定时器超时 505 icsk->icsk_pending = 0; 506 tcp_probe_timer(sk); 507 break; 508 } 509 510 out: 511 sk_mem_reclaim(sk); //回收缓存 512 } 513 514 static void tcp_write_timer(unsigned long data) 515 { 516 struct sock *sk = (struct sock *)data; 517 518 bh_lock_sock(sk); 519 if (!sock_owned_by_user(sk)) { 520 tcp_write_timer_handler(sk); 521 } else { 522 /* deleguate our work to tcp_release_cb() */ 523 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) 524 sock_hold(sk); 525 } 526 bh_unlock_sock(sk); 527 sock_put(sk); 528 }
523:如果icsk->icsk_retransmit_timer超时时socket被应用进程锁定,则设置TCP_WRITE_TIMER_DEFERRED标记,这样在应用进程释放socket时会调用tcp_release_cb函数:
741 void tcp_release_cb(struct sock *sk) 742 { ... 757 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { 758 tcp_write_timer_handler(sk); 759 __sock_put(sk); 760 } ...
这样看来重传定时器真正的超时函数是tcp_retransmit_timer:
340 void tcp_retransmit_timer(struct sock *sk) 341 { 342 struct tcp_sock *tp = tcp_sk(sk); 343 struct inet_connection_sock *icsk = inet_csk(sk); 344 345 if (tp->fastopen_rsk) { //开启了TFO功能 346 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && 347 sk->sk_state != TCP_FIN_WAIT1); 348 tcp_fastopen_synack_timer(sk); //重传SYN|ACK 349 /* Before we receive ACK to our SYN-ACK don't retransmit 350 * anything else (e.g., data or FIN segments). 351 */ 352 return; 353 } 354 if (!tp->packets_out) //包已经被全部确认 355 goto out; 356 357 WARN_ON(tcp_write_queue_empty(sk)); 358 359 tp->tlp_high_seq = 0; 360 361 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && //发送窗口关闭且socket并非orphan socket 362 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { //并非连接建立状态 363 /* Receiver dastardly shrinks window. Our retransmits 364 * become zero probes, but we should not timeout this 365 * connection. If the socket is an orphan, time it out, 366 * we cannot allow such beasts to hang infinitely. 367 */ 368 struct inet_sock *inet = inet_sk(sk); 369 if (sk->sk_family == AF_INET) { 370 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), 371 &inet->inet_daddr, 372 ntohs(inet->inet_dport), inet->inet_num, 373 tp->snd_una, tp->snd_nxt); 374 } 375 #if IS_ENABLED(CONFIG_IPV6) 376 else if (sk->sk_family == AF_INET6) { 377 struct ipv6_pinfo *np = inet6_sk(sk); 378 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), 379 &np->daddr, 380 ntohs(inet->inet_dport), inet->inet_num, 381 tp->snd_una, tp->snd_nxt); 382 } 383 #endif 384 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { //超过TCP_RTO_MAX的时间没有收到对端的确认 385 tcp_write_err(sk); //报告错误并关闭连接 386 goto out; 387 } 388 tcp_enter_loss(sk, 0); //进入拥塞控制的LOSS状态 389 tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); //重传发送队列中的首包 390 __sk_dst_reset(sk); 391 goto out_reset_timer; 392 } 393 //发送窗口非0 394 if (tcp_write_timeout(sk)) //重传等待时间过长或orphan socket消耗资源过多 395 goto out; 396 397 if 
(icsk->icsk_retransmits == 0) { //第一次重传 ... //更新MIB数据库信息,用于网络管理 417 } 418 419 tcp_enter_loss(sk, 0); //进入拥塞控制的LOSS状态 420 421 if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {//重传发送队列中的首包失败 422 /* Retransmission failed because of local congestion, 423 * do not backoff. 424 */ 425 if (!icsk->icsk_retransmits) 426 icsk->icsk_retransmits = 1; 427 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 428 min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), 429 TCP_RTO_MAX);//重设重传定时器 430 goto out; 431 } ... 448 icsk->icsk_backoff++; 449 icsk->icsk_retransmits++; 450 451 out_reset_timer: 452 /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is 453 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this 454 * might be increased if the stream oscillates between thin and thick, 455 * thus the old value might already be too high compared to the value 456 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without 457 * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating 458 * exponential backoff behaviour to avoid continue hammering 459 * linear-timeout retransmissions into a black hole 460 */ 461 if (sk->sk_state == TCP_ESTABLISHED && 462 (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && 463 tcp_stream_is_thin(tp) && 464 icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { 465 icsk->icsk_backoff = 0; 466 icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); 467 } else { 468 /* Use normal (exponential) backoff */ 469 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); //增加超时时间 470 } 471 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); //重设重传定时器 472 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) 473 __sk_dst_reset(sk); 474 475 out:; 476 }tcp_write_timeout函数判断是否应该超时:
156 static int tcp_write_timeout(struct sock *sk) 157 { 158 struct inet_connection_sock *icsk = inet_csk(sk); 159 int retry_until; 160 bool do_reset, syn_set = false; 161 162 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 163 if (icsk->icsk_retransmits) 164 dst_negative_advice(sk); 165 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 166 syn_set = true; 167 } else { 168 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { //这里超时可能是因为MTU过大 169 /* Black hole detection */ 170 tcp_mtu_probing(icsk, sk); //执行路径MTU探测 171 172 dst_negative_advice(sk); 173 } 174 175 retry_until = sysctl_tcp_retries2; 176 if (sock_flag(sk, SOCK_DEAD)) { //当前socket是orphan socket 177 const int alive = (icsk->icsk_rto < TCP_RTO_MAX); 178 179 retry_until = tcp_orphan_retries(sk, alive); 180 do_reset = alive || 181 !retransmits_timed_out(sk, retry_until, 0, 0); 182 183 if (tcp_out_of_resources(sk, do_reset))//当前orphan socket数量太多 184 return 1; 185 } 186 } 187 188 if (retransmits_timed_out(sk, retry_until, 189 syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { //如果超时 190 /* Has it gone just too far? */ 191 tcp_write_err(sk); //关闭连接 192 return 1; 193 } 194 return 0; 195 }retransmits_timed_out函数判断是否超时:
127 static bool retransmits_timed_out(struct sock *sk, 128 unsigned int boundary, 129 unsigned int timeout, 130 bool syn_set) 131 { 132 unsigned int linear_backoff_thresh, start_ts; 133 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; 134 135 if (!inet_csk(sk)->icsk_retransmits) //没有重传 136 return false; 137 138 if (unlikely(!tcp_sk(sk)->retrans_stamp)) 139 start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; //使用skb发送时间作为起始时间 140 else 141 start_ts = tcp_sk(sk)->retrans_stamp; //使用重传时间作为起始时间 142 143 if (likely(timeout == 0)) { 144 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); 145 146 if (boundary <= linear_backoff_thresh) 147 timeout = ((2 << boundary) - 1) * rto_base; 148 else 149 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + 150 (boundary - linear_backoff_thresh) * TCP_RTO_MAX; 151 } 152 return (tcp_time_stamp - start_ts) >= timeout; 153 }判断应该超时时使用tcp_write_err函数关闭本端连接并向应用进程报告错误:
35 static void tcp_write_err(struct sock *sk) 36 { 37 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; 38 sk->sk_error_report(sk); 39 40 tcp_done(sk); 41 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); 42 }tcp_retransmit_skb函数用于重传skb:
2374 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) 2375 { 2376 struct tcp_sock *tp = tcp_sk(sk); 2377 int err = __tcp_retransmit_skb(sk, skb); 2378 2379 if (err == 0) { 2380 /* Update global TCP statistics. */ 2381 TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); 2382 2383 tp->total_retrans++; 2384 2385 #if FASTRETRANS_DEBUG > 0 2386 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { 2387 net_dbg_ratelimited("retrans_out leaked\n"); 2388 } 2389 #endif 2390 if (!tp->retrans_out) 2391 tp->lost_retrans_low = tp->snd_nxt; 2392 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; //记录skb被重传过 2393 tp->retrans_out += tcp_skb_pcount(skb); 2394 2395 /* Save stamp of the first retransmit. */ 2396 if (!tp->retrans_stamp) 2397 tp->retrans_stamp = TCP_SKB_CB(skb)->when; //记录重传时间 2398 2399 tp->undo_retrans += tcp_skb_pcount(skb); 2400 2401 /* snd_nxt is stored to detect loss of retransmitted segment, 2402 * see tcp_input.c tcp_sacktag_write_queue(). 2403 */ 2404 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; 2405 } 2406 return err; 2407 }
综上,TCP重传定时器的基本功能是:如果有TFO socket则直接重传SYN|ACK,然后返回;如果没有,检查重传是否经过了太长的时间,若是则关闭连接并报告错误;否则重传发送队列中的首包,并将重传定时器设置为更长的超时时间。