4.5 路径MTU发现

  如果两台主机之间的通信要通过多个网络,那么每个网络的链路层就可能有不同的MTU。两台通信主机路径中的最小MTU被称作路径MTU(PMTU)。两台主机之间的路径MTU不一定是个常数,它取决于当时所选择的路由。而选路不一定是对称的(从A到B的路由可能与从B到A的路由不同),因此路径MTU在两个方向上不一定是一致的。

  本文研究路径MTU发现主要是要弄明白以下几个问题:

1、路径MTU发现有什么用处?

2、TCP什么时候执行路径MTU发现?

3、TCP路径MTU发现的原理是什么?

4、TCP路径MTU探测的结果是如何维护的?

5、TCP如何使用路径MTU探测的结果?

  下面回答问题1:TCP报文需要封装成IP报文才会发送,报文在网络中按照一定路径传输后会抵达目的地。最理想的情况是IP报文的大小正好是这条路径所能容纳的最大尺寸,因为报文小了则数据传输效率不高,大了则会引起分片。分片会使得路由器的负担加重,增加延迟,而且会增加报文丢失的概率。而IP报文的传输路径是事先不知道的,而且在传输过程中也可能发生变化,所以TCP需要动态探测路径MTU的大小,这就是TCP的路径MTU发现。

     接下来我们来寻找问题2的答案:PMTU探测包的发送是在tcp_write_xmit函数中进行:

1811 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1812                int push_one, gfp_t gfp)
1813 {               
1814     struct tcp_sock *tp = tcp_sk(sk);
1815     struct sk_buff *skb;
1816     unsigned int tso_segs, sent_pkts;
1817     int cwnd_quota;
1818     int result;
1819             
1820     sent_pkts = 0;
1821         
1822     if (!push_one) {
1823         /* Do MTU probing. */
1824         result = tcp_mtu_probe(sk);
1825         if (!result) {
1826             return false;
...

  可见只有tcp_write_xmit函数的参数push_one为0时TCP才会开启PMTU探测。直接调用tcp_write_xmit且push_one为0的函数有两个:tcp_tsq_handler和__tcp_push_pending_frames。前者是使用TSQ tasklet发送数据时调用的函数,而直接或间接调用后者的函数有:tcp_push_pending_frames、tcp_push、tcp_data_snd_check。下面一一列举开启PMTU探测的条件:

(1)TSQ tasklet发送数据时:

 684 static void tcp_tsq_handler(struct sock *sk)
 685 {
 686     if ((1 << sk->sk_state) &
 687         (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
 688          TCPF_CLOSE_WAIT  | TCPF_LAST_ACK))
 689         tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
 690 }
(2)通过发包系统调用或使用TCP Splice功能调用do_tcp_sendpages发送数据时:

1016 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1017         size_t size)
1018 {
...
1204             if (forced_push(tp)) {
1205                 tcp_mark_push(tp, skb);
1206                 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1207             } else if (skb == tcp_send_head(sk))
1208                 tcp_push_one(sk, mss_now);
1209             continue;
1210 
1211 wait_for_sndbuf:
1212             set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1213 wait_for_memory:
1214             if (copied)
1215                 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1216 
1217             if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1218                 goto do_error;
1219 
1220             mss_now = tcp_send_mss(sk, &size_goal, flags);
1221         }
1222     }
1223 
1224 out:
1225     if (copied)
1226         tcp_push(sk, flags, mss_now, tp->nonagle);
1227     release_sock(sk);
1228     return copied + copied_syn;
   TCP Splice
 827 static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 828                 size_t size, int flags)
 829 {
...
 914         if (forced_push(tp)) {
 915             tcp_mark_push(tp, skb);
 916             __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
 917         } else if (skb == tcp_send_head(sk))
 918             tcp_push_one(sk, mss_now);
 919         continue;
 920 
 921 wait_for_sndbuf:
 922         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 923 wait_for_memory:
 924         tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 925 
 926         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 927             goto do_error;
 928 
 929         mss_now = tcp_send_mss(sk, &size_goal, flags);
 930     }
 931 
 932 out:
 933     if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
 934         tcp_push(sk, flags, mss_now, tp->nonagle);
 935     return copied;
...
  这里只有一种情况是不开启PMTU探测的:当前已写出的字节数不大于对端通告的最大窗口的一半且发送队列中只有一个skb。其它情况下发送skb都会开启PMTU探测功能。
(3)发现数据丢失并且使用了Forward RTO-Recovery (F-RTO)算法时:

2685 static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
2686 {   
2687     struct inet_connection_sock *icsk = inet_csk(sk);
2688     struct tcp_sock *tp = tcp_sk(sk);
2689     bool recovered = !before(tp->snd_una, tp->high_seq);
2690     
2691     if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
2692         if (flag & FLAG_ORIG_SACK_ACKED) {
2693             /* Step 3.b. A timeout is spurious if not all data are
2694              * lost, i.e., never-retransmitted data are (s)acked.
2695              */
2696             tcp_try_undo_loss(sk, true);
2697             return;
2698         }
2699         if (after(tp->snd_nxt, tp->high_seq) &&
2700             (flag & FLAG_DATA_SACKED || is_dupack)) {
2701             tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
2702         } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2703             tp->high_seq = tp->snd_nxt;
2704             __tcp_push_pending_frames(sk, tcp_current_mss(sk),
2705                           TCP_NAGLE_OFF);
...
(4)发送FIN关闭连接时:

2545 void tcp_send_fin(struct sock *sk)
2546 {
...
2578     __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
2579 }
(5)使用setsockopt设置TCP_NODELAY功能时:
2371 static int do_tcp_setsockopt(struct sock *sk, int level,
2372         int optname, char __user *optval, unsigned int optlen)
2373 {
...
2423     case TCP_NODELAY:
2424         if (val) {
2425             /* TCP_NODELAY is weaker than TCP_CORK, so that
2426              * this option on corked socket is remembered, but
2427              * it is not activated until cork is cleared.
2428              *
2429              * However, when TCP_NODELAY is set we make
2430              * an explicit push, which overrides even TCP_CORK
2431              * for currently queued segments.
2432              */
2433             tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2434             tcp_push_pending_frames(sk);
2435         } else {
2436             tp->nonagle &= ~TCP_NAGLE_OFF;
2437         }
2438         break;
...
(6) 使用setsockopt取消TCP_CORK功能时:
2371 static int do_tcp_setsockopt(struct sock *sk, int level,
2372         int optname, char __user *optval, unsigned int optlen)
2373 {
...
2503     case TCP_CORK:
...
2515         if (val) {
2516             tp->nonagle |= TCP_NAGLE_CORK;
2517         } else {
2518             tp->nonagle &= ~TCP_NAGLE_CORK;
2519             if (tp->nonagle&TCP_NAGLE_OFF)
2520                 tp->nonagle |= TCP_NAGLE_PUSH;
2521             tcp_push_pending_frames(sk);
2522         }
2523         break;
...
(7)收到对端发过来的ACK或数据包时:

5076 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5077             const struct tcphdr *th, unsigned int len)
5078 {
...
5109     if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
5110         TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
5111         !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
5112         int tcp_header_len = tp->tcp_header_len;
...
5136         if (len <= tcp_header_len) {
5137             /* Bulk data transfer: sender */
5138             if (len == tcp_header_len) {
5139                 /* Predicted packet is in window by definition.
5140                  * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5141                  * Hence, check seq<=rcv_wup reduces to:
5142                  */
5143                 if (tcp_header_len ==
5144                     (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
5145                     tp->rcv_nxt == tp->rcv_wup)
5146                     tcp_store_ts_recent(tp);
5147 
5148                 /* We know that such packets are checksummed
5149                  * on entry.
5150                  */
5151                 tcp_ack(sk, skb, 0);
5152                 __kfree_skb(skb);
5153                 tcp_data_snd_check(sk);
5154                 return 0;
...
5159         } else {
...
5228             if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
5229                 /* Well, only one small jumplet in fast path... */
5230                 tcp_ack(sk, skb, FLAG_DATA);
5231                 tcp_data_snd_check(sk);
...
5274     /* step 7: process the segment text */
5275     tcp_data_queue(sk, skb);
5276 
5277     tcp_data_snd_check(sk);
5278     tcp_ack_snd_check(sk);
5279     return 0;
...
(8)非TCP_LISTEN和TCP_CLOSE状态下收到合法的包时:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601               const struct tcphdr *th, unsigned int len)
5602 {
...
5649     case TCP_SYN_SENT:
5650         queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
5651         if (queued >= 0)
5652             return queued;
5653 
5654         /* Do step6 onward by hand. */
5655         tcp_urg(sk, skb, th);
5656         __kfree_skb(skb);
5657         tcp_data_snd_check(sk); //发送TFO的数据
5658         return 0;
5659     }
...
5861     /* tcp_data could move socket to TIME-WAIT */
5862     if (sk->sk_state != TCP_CLOSE) {
5863         tcp_data_snd_check(sk);
5864         tcp_ack_snd_check(sk);
5865     }
...
  这样看来,TCP在发送数据的时候大多会执行路径MTU发现。

  对于问题3,TCP完成PMTU探测任务的基本方法是使用tcp_mtu_probe函数发送PMTU探测包:

1675 static int tcp_mtu_probe(struct sock *sk)
1676 {
1677     struct tcp_sock *tp = tcp_sk(sk);
1678     struct inet_connection_sock *icsk = inet_csk(sk);
1679     struct sk_buff *skb, *nskb, *next;
1680     int len;
1681     int probe_size;      
1682     int size_needed;     
1683     int copy;
1684     int mss_now;
1685
1686     /* Not currently probing/verifying,
1687      * not in recovery,
1688      * have enough cwnd, and
1689      * not SACKing (the variable headers throw things off) */
1690     if (!icsk->icsk_mtup.enabled ||    
1691         icsk->icsk_mtup.probe_size ||   //正在进行PMTU探测
1692         inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1693         tp->snd_cwnd < 11 ||
1694         tp->rx_opt.num_sacks || tp->rx_opt.dsack)
1695         return -1;
1696
1697     /* Very simple search strategy: just double the MSS. */
1698     mss_now = tcp_current_mss(sk);
1699     probe_size = 2 * tp->mss_cache; //设置探测包大小为当前MSS的两倍
1700     size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
1701     if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
1702         /* TODO: set timer for probe_converge_event */
1703         return -1;
1704     }
1705
1706     /* Have enough data in the send queue to probe? */
1707     if (tp->write_seq - tp->snd_nxt < size_needed)
1708         return -1;
1709
1710     if (tp->snd_wnd < size_needed) //发送窗口太小
1711         return -1;
1712     if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
1713         return 0;
1714
1715     /* Do we need to wait to drain cwnd? With none in flight, don't stall */
1716     if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
1717         if (!tcp_packets_in_flight(tp))
1718             return -1;
1719         else
1720             return 0;
1721     }
1722
1723     /* We're allowed to probe.  Build it now. */
1724     if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
1725         return -1;
1726     sk->sk_wmem_queued += nskb->truesize;
1727     sk_mem_charge(sk, nskb->truesize);
1728
1729     skb = tcp_send_head(sk);
1730
1731     TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1732     TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1733     TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
1734     TCP_SKB_CB(nskb)->sacked = 0;
1735     nskb->csum = 0;
1736     nskb->ip_summed = skb->ip_summed;
1737
1738     tcp_insert_write_queue_before(nskb, skb, sk);
1739
1740     len = 0;
1741     tcp_for_write_queue_from_safe(skb, next, sk) { //将发送队列中的数据合并到大的探测包中
1742         copy = min_t(int, skb->len, probe_size - len);
1743         if (nskb->ip_summed)
1744             skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
1745         else
1746             nskb->csum = skb_copy_and_csum_bits(skb, 0,
1747                                 skb_put(nskb, copy),
1748                                 copy, nskb->csum);
1749
1750         if (skb->len <= copy) {
1751             /* We've eaten all the data from this skb.
1752              * Throw it away. */
1753             TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1754             tcp_unlink_write_queue(skb, sk);
1755             sk_wmem_free_skb(sk, skb);
1756         } else {
1757             TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
1758                            ~(TCPHDR_FIN|TCPHDR_PSH);
1759             if (!skb_shinfo(skb)->nr_frags) {
1760                 skb_pull(skb, copy);
1761                 if (skb->ip_summed != CHECKSUM_PARTIAL)
1762                     skb->csum = csum_partial(skb->data,
1763                                  skb->len, 0);
1764             } else {
1765                 __pskb_trim_head(skb, copy);
1766                 tcp_set_skb_tso_segs(sk, skb, mss_now);
1767             }
1768             TCP_SKB_CB(skb)->seq += copy;
1769         }
1770
1771         len += copy;
1772
1773         if (len >= probe_size)
1774             break;
1775     }
1776     tcp_init_tso_segs(sk, nskb, nskb->len);
1777
1778     /* We're ready to send.  If this fails, the probe will
1779      * be resegmented into mss-sized pieces by tcp_write_xmit(). */
1780     TCP_SKB_CB(nskb)->when = tcp_time_stamp;
1781     if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { //发送探测包
1782         /* Decrement cwnd here because we are sending
1783          * effectively two packets. */
1784         tp->snd_cwnd--;
1785         tcp_event_new_data_sent(sk, nskb);
1786
1787         icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); //记录此次探测的PMTU值
1788         tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
1789         tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
1790
1791         return 1;
1792     }
1793
1794     return -1;
1795 }

  默认情况下TCP数据包的IP部分都会设置不分片位。在tcp_mtu_probe函数发送大的探测包后需要等待三种结果:(1)收到ACK确认了探测包;这意味着PMTU大于或等于当前探测包的MTU;(2)收到ICMP“需要分片”的报文;这时需要根据报文中通告的MTU来调整PMTU值;(3)数据包丢失导致重传。下面分别讨论这3种情况。

  (1)收到ACK确认了探测包。TCP在收到ACK后会调用tcp_clean_rtx_queue函数来清理发送缓存中的skb:

3001 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3002                    u32 prior_snd_una)
3003 {
...
3099         if (unlikely(icsk->icsk_mtup.probe_size &&  //正在PMTU探测中
3100                  !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { //探测报文全部到达对端
3101             tcp_mtup_probe_success(sk); //探测成功
3102         }
...
  tcp_mtup_probe_success函数:
2588 static void tcp_mtup_probe_success(struct sock *sk)
2589 {
2590     struct tcp_sock *tp = tcp_sk(sk);
2591     struct inet_connection_sock *icsk = inet_csk(sk);
2592 
2593     /* FIXME: breaks with very large cwnd */
2594     tp->prior_ssthresh = tcp_current_ssthresh(sk);
2595     tp->snd_cwnd = tp->snd_cwnd *
2596                tcp_mss_to_mtu(sk, tp->mss_cache) /
2597                icsk->icsk_mtup.probe_size;
2598     tp->snd_cwnd_cnt = 0;
2599     tp->snd_cwnd_stamp = tcp_time_stamp;
2600     tp->snd_ssthresh = tcp_current_ssthresh(sk);
2601 
2602     icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; //记录最小PMTU的值
2603     icsk->icsk_mtup.probe_size = 0; //本次探测结束
2604     tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); //保存探测结果
2605 }
  tcp_sync_mss函数用于保存PMTU的值:
1296 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1297 {
1298     struct tcp_sock *tp = tcp_sk(sk);
1299     struct inet_connection_sock *icsk = inet_csk(sk);
1300     int mss_now;
1301 
1302     if (icsk->icsk_mtup.search_high > pmtu)
1303         icsk->icsk_mtup.search_high = pmtu; //记录PMTU最大值
1304     
1305     mss_now = tcp_mtu_to_mss(sk, pmtu); //将PMTU的值转换为MSS
1306     mss_now = tcp_bound_to_half_wnd(tp, mss_now);
1307 
1308     /* And store cached results */
1309     icsk->icsk_pmtu_cookie = pmtu; //记录当前PMTU
1310     if (icsk->icsk_mtup.enabled)
1311         mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1312     tp->mss_cache = mss_now; //记录MSS,即TCP报文最大数据长度
1313 
1314     return mss_now;
1315 }  
  在对端收到探测包的情况下TCP会把探测包的PMTU记录下来,当PMTU探测再次启动时发送的探测包的PMTU会更大,最终TCP会得到结果(2)或(3)。先看结果(2),TCPv4中处理ICMP报文的函数是tcp_v4_err:

 326 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 327 {
...
 389     switch (type) {
...
 399     case ICMP_DEST_UNREACH: //路由器丢弃探测包后发送的ICMP报文会由这个分支处理
...
 403         if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 404             /* We are not interested in TCP_LISTEN and open_requests
 405              * (SYN-ACKs send out by Linux are always <576bytes so
 406              * they should go through unfragmented).
 407              */
 408             if (sk->sk_state == TCP_LISTEN)
 409                 goto out;
 410
 411             tp->mtu_info = info; //记录ICMP报文返回的MTU值
 412             if (!sock_owned_by_user(sk)) { //进程没有锁定socket
 413                 tcp_v4_mtu_reduced(sk); //修改MSS的值
 414             } else {
 415                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) //推迟到进程解除锁定socket时调用tcp_v4_mtu_reduced
 416                     sock_hold(sk);
 417             }
 418             goto out;
...
  tcp_v4_mtu_reduced函数:
 271 static void tcp_v4_mtu_reduced(struct sock *sk)
 272 {
 273     struct dst_entry *dst;
 274     struct inet_sock *inet = inet_sk(sk);
 275     u32 mtu = tcp_sk(sk)->mtu_info;
 276
 277     dst = inet_csk_update_pmtu(sk, mtu);  //更新路由表中mtu的信息
 278     if (!dst)
 279         return;
 280
 281     /* Something is about to be wrong... Remember soft error
 282      * for the case, if this connection will not able to recover.
 283      */
 284     if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 285         sk->sk_err_soft = EMSGSIZE;    
 286
 287     mtu = dst_mtu(dst);   //得到路由表中记录的MTU
 288
 289     if (inet->pmtudisc != IP_PMTUDISC_DONT && //确实能够发送不分片报文
 290         inet_csk(sk)->icsk_pmtu_cookie > mtu) { //socket中记录的PMTU大于路由表中的PMTU
 291         tcp_sync_mss(sk, mtu); //更新socket中记录的MTU的值
 292
 293         /* Resend the TCP packet because it's
 294          * clear that the old packet has been
 295          * dropped. This is the new "fast" path mtu
 296          * discovery.    
 297          */
 298         tcp_simple_retransmit(sk);     //重传数据,因为有数据丢失
 299     } /* else let the usual retransmit timer handle it */
 300  

  在探测包过大被路由器丢弃并收到ICMP报文的情况下,TCP会把ICMP中通告的PMTU作为结果保存下来。但出于安全等考虑,并不是所有的路由器在丢弃分片过大的报文时都会发送ICMP消息。如果探测包被这样的路由器丢弃,TCP不会收到任何响应,就好像探测包进入了“黑洞”一样,这就是TCP PMTU发现中的Black Hole Detection问题。即结果(3)。探测包丢失后TCP有两种方式处理:快速重传和超时重传。先来看快速重传,执行这个功能的是tcp_fastretrans_alert函数:

2745 static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2746                   int prior_sacked, int prior_packets,
2747                   bool is_dupack, int flag)
2748 {
...
2833         /* MTU probe failure: don't reduce cwnd */
2834         if (icsk->icsk_ca_state < TCP_CA_CWR &&
2835             icsk->icsk_mtup.probe_size &&  //开启PMTU探测
2836             tp->snd_una == tp->mtu_probe.probe_seq_start) { //探测包完全未收到
2837             tcp_mtup_probe_failed(sk);
2838             /* Restores the reduction we did in tcp_mtup_probe() */
2839             tp->snd_cwnd++;
2840             tcp_simple_retransmit(sk);
2841             return;
2842         }
...
  tcp_mtup_probe_failed函数处理探测失败的情况:
2580 static void tcp_mtup_probe_failed(struct sock *sk)
2581 {
2582     struct inet_connection_sock *icsk = inet_csk(sk);
2583 
2584     icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
2585     icsk->icsk_mtup.probe_size = 0; //结束本次探测
2586 }
  在快速重传的情况下,TCP会更新一下PMTU探测的上限。超时重传时呢?重传定时器会调用tcp_write_timeout函数:
156 static int tcp_write_timeout(struct sock *sk)
157 {
158     struct inet_connection_sock *icsk = inet_csk(sk);
159     int retry_until;
160     bool do_reset, syn_set = false;
161 
162     if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
163         if (icsk->icsk_retransmits)    
164             dst_negative_advice(sk);       
165         retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
166         syn_set = true;
167     } else {
168         if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
169             /* Black hole detection */     
170             tcp_mtu_probing(icsk, sk);     
...
  确定是超时时,tcp_write_timeout函数会调用tcp_mtu_probing函数处理PMTU:
102 static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
103 {
104     /* Black hole detection */
105     if (sysctl_tcp_mtu_probing) {
106         if (!icsk->icsk_mtup.enabled) { //如果未开启PMTU发现机制
107             icsk->icsk_mtup.enabled = 1; //开启之
108             tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); //初始化PMTU和MSS
109         } else {
110             struct tcp_sock *tp = tcp_sk(sk);
111             int mss;
112 
113             mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; //缩小MSS
114             mss = min(sysctl_tcp_base_mss, mss);
115             mss = max(mss, 68 - tp->tcp_header_len);
116             icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
117             tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); //保存缩小后的MSS
118         }
119     }
120 }
  这样看来,超时的情况下TCP会减小MSS,如此就使TCP在路由器不支持PMTU发现机制的情况下实现了PMTU的探测。

  下面来总结一下问题3的答案:TCP在发送数据时会将小段数据合并到大的探测包再发送,TCP发送的包的IP头会设置不分片的DF位。如果包顺利抵达目的地,则用这个包的MTU作为PMTU;如果包过大被丢弃,若路由器会发送“需要分片,但设置了DF位”的ICMP报文,则使用ICMP报文中的MTU值作为PMTU;若路由器不发送ICMP,则在超时重传时TCP会减小PMTU。在得到PMTU后,TCP会将其保存在socket中,并更新MSS信息,接下来用新的MSS继续发送探测包和普通数据包。

  根据对问题3的代码分析来回答第4个问题:TCP路径MTU探测得到的PMTU保存在inet_connection_sock的icsk_pmtu_cookie中,MSS保存在tcp_sock的mss_cache变量中。如果收到了ICMP报文,则用报文中的值更新路由表中保存的MTU信息。

  问题5:TCP会同时使用保存在路由表和socket中的MTU:

  client端连接建立时:

2752 void tcp_connect_init(struct sock *sk)
2753 {
...
2773     tcp_mtup_init(sk);
2774     tcp_sync_mss(sk, dst_mtu(dst));
...
  server端创建socket时:
1642 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1643                   struct request_sock *req,
1644                   struct dst_entry *dst)
1645 {
...
1691     tcp_mtup_init(newsk);
1692     tcp_sync_mss(newsk, dst_mtu(dst));
...
  发送数据时:
1016 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1017         size_t size)
1018 {
...
1067     mss_now = tcp_send_mss(sk, &size_goal, flags);
...
  tcp_send_mss函数会调用tcp_current_mss函数用于获取当前MSS的值:

1321 unsigned int tcp_current_mss(struct sock *sk)
1322 {
1323     const struct tcp_sock *tp = tcp_sk(sk);
1324     const struct dst_entry *dst = __sk_dst_get(sk);
1325     u32 mss_now;
1326     unsigned int header_len;
1327     struct tcp_out_options opts;
1328     struct tcp_md5sig_key *md5;
1329 
1330     mss_now = tp->mss_cache;
1331 
1332     if (dst) {
1333         u32 mtu = dst_mtu(dst);
1334         if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1335             mss_now = tcp_sync_mss(sk, mtu);
1336     }
1337 
1338     header_len = tcp_established_options(sk, NULL, &opts, &md5) +
1339              sizeof(struct tcphdr);
1340     /* The mss_cache is sized based on tp->tcp_header_len, which assumes
1341      * some common options. If this is an odd packet (because we have SACK
1342      * blocks etc) then our calculated header_len will be different, and
1343      * we have to adjust mss_now correspondingly */
1344     if (header_len != tp->tcp_header_len) {
1345         int delta = (int) header_len - tp->tcp_header_len;
1346         mss_now -= delta; 
1347     }
1348 
1349     return mss_now;
1350 }
  至此,关于TCP PMTU发现的问题全部回答完毕。在开启PMTU发现功能时,PMTU探测会不断地进行。网络中路径的情况在不停地变化,TCP也会不时地得到新的探测结果,并利用这些结果去影响所发送的报文段的大小。PMTU发现机制使得TCP能尽快获得数据传输路径的MTU大小,从而尽可能使用“不会因为报文过大而被路由器丢弃”的最大长度去发送每一个报文段,进而力求使得数据发送的效率最大化。

你可能感兴趣的:(tcp,linux内核)