如果两台主机之间的通信要通过多个网络,那么每个网络的链路层就可能有不同的MTU。两台通信主机之间路径上的最小MTU被称作路径MTU(PMTU)。两台主机之间的路径MTU不一定是个常数,它取决于当时所选择的路由。而选路不一定是对称的(从A到B的路由可能与从B到A的路由不同),因此路径MTU在两个方向上不一定是一致的。
本文研究路径MTU发现主要是要弄明白以下几个问题:
1、路径MTU发现有什么用处?
2、TCP什么时候执行路径MTU发现?
3、TCP路径MTU发现的原理是什么?
4、TCP路径MTU探测的结果是如何维护的?
5、TCP如何使用路径MTU探测的结果?
下面回答问题1:TCP报文需要封装成IP报文才会发送,报文在网络中按照一定路径传输后会抵达目的地。最理想的情况是IP报文的大小正好是这条路径所能容纳的最大尺寸,因为报文小了则数据传输效率不高,大了则会引起分片。分片会使得路由器的负担加重,增加延迟,而且会增加报文丢失的概率。而IP报文的传输路径是事先不知道的,而且在传输过程中也可能发生变化,所以TCP需要动态探测路径MTU的大小,这就是TCP的路径MTU发现。
接下来我们来寻找问题2的答案:PMTU探测包的发送是在tcp_write_xmit函数中进行:
1811 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1812 int push_one, gfp_t gfp)
1813 {
1814 struct tcp_sock *tp = tcp_sk(sk);
1815 struct sk_buff *skb;
1816 unsigned int tso_segs, sent_pkts;
1817 int cwnd_quota;
1818 int result;
1819
1820 sent_pkts = 0;
1821
1822 if (!push_one) {
1823 /* Do MTU probing. */
1824 result = tcp_mtu_probe(sk);
1825 if (!result) {
1826 return false;
...
可见只有tcp_write_xmit函数的参数push_one为0时TCP才会开启PMTU探测。直接调用tcp_write_xmit且push_one为0的函数有两个:tcp_tsq_handler和__tcp_push_pending_frames。前者是使用TSQ tasklet发送数据时调用的函数,而直接或间接调用后者的函数有:tcp_push_pending_frames、tcp_push、tcp_data_snd_check。下面一一列举开启PMTU探测的条件:
(1)TSQ tasklet发送数据时:
684 static void tcp_tsq_handler(struct sock *sk)
685 {
686 if ((1 << sk->sk_state) &
687 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
688 TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
689 tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
690 }
(2)通过发包系统调用或使用TCP Splice功能调用do_tcp_sendpages发送数据时:
1016 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1017 size_t size)
1018 {
...
1204 if (forced_push(tp)) {
1205 tcp_mark_push(tp, skb);
1206 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1207 } else if (skb == tcp_send_head(sk))
1208 tcp_push_one(sk, mss_now);
1209 continue;
1210
1211 wait_for_sndbuf:
1212 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1213 wait_for_memory:
1214 if (copied)
1215 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1216
1217 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1218 goto do_error;
1219
1220 mss_now = tcp_send_mss(sk, &size_goal, flags);
1221 }
1222 }
1223
1224 out:
1225 if (copied)
1226 tcp_push(sk, flags, mss_now, tp->nonagle);
1227 release_sock(sk);
1228 return copied + copied_syn;
TCP Splice:
827 static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
828 size_t size, int flags)
829 {
...
914 if (forced_push(tp)) {
915 tcp_mark_push(tp, skb);
916 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
917 } else if (skb == tcp_send_head(sk))
918 tcp_push_one(sk, mss_now);
919 continue;
920
921 wait_for_sndbuf:
922 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
923 wait_for_memory:
924 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
925
926 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
927 goto do_error;
928
929 mss_now = tcp_send_mss(sk, &size_goal, flags);
930 }
931
932 out:
933 if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
934 tcp_push(sk, flags, mss_now, tp->nonagle);
935 return copied;
...
这里只有一种情况是不开启PMTU探测的:当前已写出的字节数不大于对端通告的最大窗口的一半且发送队列中只有一个skb。其它情况下发送skb都会开启PMTU探测功能。
(3)F-RTO处理过程中收到使snd_una前进的ACK、需要继续发送新数据时:
2685 static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
2686 {
2687 struct inet_connection_sock *icsk = inet_csk(sk);
2688 struct tcp_sock *tp = tcp_sk(sk);
2689 bool recovered = !before(tp->snd_una, tp->high_seq);
2690
2691 if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
2692 if (flag & FLAG_ORIG_SACK_ACKED) {
2693 /* Step 3.b. A timeout is spurious if not all data are
2694 * lost, i.e., never-retransmitted data are (s)acked.
2695 */
2696 tcp_try_undo_loss(sk, true);
2697 return;
2698 }
2699 if (after(tp->snd_nxt, tp->high_seq) &&
2700 (flag & FLAG_DATA_SACKED || is_dupack)) {
2701 tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
2702 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2703 tp->high_seq = tp->snd_nxt;
2704 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
2705 TCP_NAGLE_OFF);
...
(4)发送FIN关闭连接时:
2545 void tcp_send_fin(struct sock *sk)
2546 {
...
2578 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
2579 }
(5)使用setsockopt设置TCP_NODELAY功能时:
2371 static int do_tcp_setsockopt(struct sock *sk, int level,
2372 int optname, char __user *optval, unsigned int optlen)
2373 {
...
2423 case TCP_NODELAY:
2424 if (val) {
2425 /* TCP_NODELAY is weaker than TCP_CORK, so that
2426 * this option on corked socket is remembered, but
2427 * it is not activated until cork is cleared.
2428 *
2429 * However, when TCP_NODELAY is set we make
2430 * an explicit push, which overrides even TCP_CORK
2431 * for currently queued segments.
2432 */
2433 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2434 tcp_push_pending_frames(sk);
2435 } else {
2436 tp->nonagle &= ~TCP_NAGLE_OFF;
2437 }
2438 break;
...
(6) 使用setsockopt取消TCP_CORK功能时:
2371 static int do_tcp_setsockopt(struct sock *sk, int level,
2372 int optname, char __user *optval, unsigned int optlen)
2373 {
...
2503 case TCP_CORK:
...
2515 if (val) {
2516 tp->nonagle |= TCP_NAGLE_CORK;
2517 } else {
2518 tp->nonagle &= ~TCP_NAGLE_CORK;
2519 if (tp->nonagle&TCP_NAGLE_OFF)
2520 tp->nonagle |= TCP_NAGLE_PUSH;
2521 tcp_push_pending_frames(sk);
2522 }
2523 break;
...
(7)收到对端发过来的ACK或数据包时:
5076 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5077 const struct tcphdr *th, unsigned int len)
5078 {
...
5109 if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
5110 TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
5111 !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
5112 int tcp_header_len = tp->tcp_header_len;
...
5136 if (len <= tcp_header_len) {
5137 /* Bulk data transfer: sender */
5138 if (len == tcp_header_len) {
5139 /* Predicted packet is in window by definition.
5140 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5141 * Hence, check seq<=rcv_wup reduces to:
5142 */
5143 if (tcp_header_len ==
5144 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
5145 tp->rcv_nxt == tp->rcv_wup)
5146 tcp_store_ts_recent(tp);
5147
5148 /* We know that such packets are checksummed
5149 * on entry.
5150 */
5151 tcp_ack(sk, skb, 0);
5152 __kfree_skb(skb);
5153 tcp_data_snd_check(sk);
5154 return 0;
...
5159 } else {
...
5228 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
5229 /* Well, only one small jumplet in fast path... */
5230 tcp_ack(sk, skb, FLAG_DATA);
5231 tcp_data_snd_check(sk);
...
5274 /* step 7: process the segment text */
5275 tcp_data_queue(sk, skb);
5276
5277 tcp_data_snd_check(sk);
5278 tcp_ack_snd_check(sk);
5279 return 0;
...
(8)非TCP_LISTEN和TCP_CLOSE状态下收到合法的包时:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
...
5649 case TCP_SYN_SENT:
5650 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
5651 if (queued >= 0)
5652 return queued;
5653
5654 /* Do step6 onward by hand. */
5655 tcp_urg(sk, skb, th);
5656 __kfree_skb(skb);
5657 tcp_data_snd_check(sk); //发送TFO的数据
5658 return 0;
5659 }
...
5861 /* tcp_data could move socket to TIME-WAIT */
5862 if (sk->sk_state != TCP_CLOSE) {
5863 tcp_data_snd_check(sk);
5864 tcp_ack_snd_check(sk);
5865 }
...
这样看来,TCP在发送数据的时候大多会执行路径MTU发现。
对于问题3,TCP完成PMTU探测任务的基本方法是使用tcp_mtu_probe函数发送PMTU探测包:
1675 static int tcp_mtu_probe(struct sock *sk)
1676 {
1677 struct tcp_sock *tp = tcp_sk(sk);
1678 struct inet_connection_sock *icsk = inet_csk(sk);
1679 struct sk_buff *skb, *nskb, *next;
1680 int len;
1681 int probe_size;
1682 int size_needed;
1683 int copy;
1684 int mss_now;
1685
1686 /* Not currently probing/verifying,
1687 * not in recovery,
1688 * have enough cwnd, and
1689 * not SACKing (the variable headers throw things off) */
1690 if (!icsk->icsk_mtup.enabled ||
1691 icsk->icsk_mtup.probe_size || //正在进行PMTU探测
1692 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1693 tp->snd_cwnd < 11 ||
1694 tp->rx_opt.num_sacks || tp->rx_opt.dsack)
1695 return -1;
1696
1697 /* Very simple search strategy: just double the MSS. */
1698 mss_now = tcp_current_mss(sk);
1699 probe_size = 2 * tp->mss_cache; //设置探测包大小为当前MSS的两倍
1700 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
1701 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
1702 /* TODO: set timer for probe_converge_event */
1703 return -1;
1704 }
1705
1706 /* Have enough data in the send queue to probe? */
1707 if (tp->write_seq - tp->snd_nxt < size_needed)
1708 return -1;
1709
1710 if (tp->snd_wnd < size_needed) //发送窗口太小
1711 return -1;
1712 if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
1713 return 0;
1714
1715 /* Do we need to wait to drain cwnd? With none in flight, don't stall */
1716 if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
1717 if (!tcp_packets_in_flight(tp))
1718 return -1;
1719 else
1720 return 0;
1721 }
1722
1723 /* We're allowed to probe. Build it now. */
1724 if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
1725 return -1;
1726 sk->sk_wmem_queued += nskb->truesize;
1727 sk_mem_charge(sk, nskb->truesize);
1728
1729 skb = tcp_send_head(sk);
1730
1731 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1732 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1733 TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
1734 TCP_SKB_CB(nskb)->sacked = 0;
1735 nskb->csum = 0;
1736 nskb->ip_summed = skb->ip_summed;
1737
1738 tcp_insert_write_queue_before(nskb, skb, sk);
1739
1740 len = 0;
1741 tcp_for_write_queue_from_safe(skb, next, sk) { //将发送队列中的数据合并到大的探测包中
1742 copy = min_t(int, skb->len, probe_size - len);
1743 if (nskb->ip_summed)
1744 skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
1745 else
1746 nskb->csum = skb_copy_and_csum_bits(skb, 0,
1747 skb_put(nskb, copy),
1748 copy, nskb->csum);
1749
1750 if (skb->len <= copy) {
1751 /* We've eaten all the data from this skb.
1752 * Throw it away. */
1753 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1754 tcp_unlink_write_queue(skb, sk);
1755 sk_wmem_free_skb(sk, skb);
1756 } else {
1757 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
1758 ~(TCPHDR_FIN|TCPHDR_PSH);
1759 if (!skb_shinfo(skb)->nr_frags) {
1760 skb_pull(skb, copy);
1761 if (skb->ip_summed != CHECKSUM_PARTIAL)
1762 skb->csum = csum_partial(skb->data,
1763 skb->len, 0);
1764 } else {
1765 __pskb_trim_head(skb, copy);
1766 tcp_set_skb_tso_segs(sk, skb, mss_now);
1767 }
1768 TCP_SKB_CB(skb)->seq += copy;
1769 }
1770
1771 len += copy;
1772
1773 if (len >= probe_size)
1774 break;
1775 }
1776 tcp_init_tso_segs(sk, nskb, nskb->len);
1777
1778 /* We're ready to send. If this fails, the probe will
1779 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
1780 TCP_SKB_CB(nskb)->when = tcp_time_stamp;
1781 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { //发送探测包
1782 /* Decrement cwnd here because we are sending
1783 * effectively two packets. */
1784 tp->snd_cwnd--;
1785 tcp_event_new_data_sent(sk, nskb);
1786
1787 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); //记录此次探测的PMTU值
1788 tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
1789 tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
1790
1791 return 1;
1792 }
1793
1794 return -1;
1795 }
默认情况下TCP数据包的IP部分都会设置不分片位。在tcp_mtu_probe函数发送大的探测包后需要等待三种结果:(1)收到ACK确认了探测包;这意味着PMTU大于或等于当前探测包的MTU;(2)收到ICMP“需要分片”的报文;这时需要根据报文中通告的MTU来调整PMTU值;(3)数据包丢失导致重传。下面分别讨论这3种情况。
(1)收到ACK确认了探测包。TCP在收到ACK后会调用tcp_clean_rtx_queue函数来清理发送缓存中的skb:
3001 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3002 u32 prior_snd_una)
3003 {
...
3099 if (unlikely(icsk->icsk_mtup.probe_size && //正在PMTU探测中
3100 !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { //探测报文全部到达对端
3101 tcp_mtup_probe_success(sk); //探测成功
3102 }
...
tcp_mtup_probe_success函数:
2588 static void tcp_mtup_probe_success(struct sock *sk)
2589 {
2590 struct tcp_sock *tp = tcp_sk(sk);
2591 struct inet_connection_sock *icsk = inet_csk(sk);
2592
2593 /* FIXME: breaks with very large cwnd */
2594 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2595 tp->snd_cwnd = tp->snd_cwnd *
2596 tcp_mss_to_mtu(sk, tp->mss_cache) /
2597 icsk->icsk_mtup.probe_size;
2598 tp->snd_cwnd_cnt = 0;
2599 tp->snd_cwnd_stamp = tcp_time_stamp;
2600 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2601
2602 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; //记录最小PMTU的值
2603 icsk->icsk_mtup.probe_size = 0; //本次探测结束
2604 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); //保存探测结果
2605 }
tcp_sync_mss函数用于保存PMTU的值:
1296 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1297 {
1298 struct tcp_sock *tp = tcp_sk(sk);
1299 struct inet_connection_sock *icsk = inet_csk(sk);
1300 int mss_now;
1301
1302 if (icsk->icsk_mtup.search_high > pmtu)
1303 icsk->icsk_mtup.search_high = pmtu; //记录PMTU最大值
1304
1305 mss_now = tcp_mtu_to_mss(sk, pmtu); //将PMTU的值转换为MSS
1306 mss_now = tcp_bound_to_half_wnd(tp, mss_now);
1307
1308 /* And store cached results */
1309 icsk->icsk_pmtu_cookie = pmtu; //记录当前PMTU
1310 if (icsk->icsk_mtup.enabled)
1311 mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1312 tp->mss_cache = mss_now; //记录MSS,即TCP报文最大数据长度
1313
1314 return mss_now;
1315 }
在对端收到探测包的情况下TCP会把探测包的PMTU记录下来,当PMTU探测再次启动时发送的探测包的PMTU会更大,最终TCP会得到结果(2)或(3)。先看结果(2), TCPv4中处理ICMP报文的函数是 tcp_v4_err:
326 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
327 {
...
389 switch (type) {
...
399 case ICMP_DEST_UNREACH: //路由器丢弃探测包后发送的ICMP报文会由这个分支处理
...
403 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
404 /* We are not interested in TCP_LISTEN and open_requests
405 * (SYN-ACKs send out by Linux are always <576bytes so
406 * they should go through unfragmented).
407 */
408 if (sk->sk_state == TCP_LISTEN)
409 goto out;
410
411 tp->mtu_info = info; //记录ICMP报文返回的MTU值
412 if (!sock_owned_by_user(sk)) { //进程没有锁定socket
413 tcp_v4_mtu_reduced(sk); //修改MSS的值
414 } else {
415 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) //推迟到进程解除锁定socket时调用tcp_v4_mtu_reduced
416 sock_hold(sk);
417 }
418 goto out;
...
tcp_v4_mtu_reduced函数:
271 static void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 struct dst_entry *dst;
274 struct inet_sock *inet = inet_sk(sk);
275 u32 mtu = tcp_sk(sk)->mtu_info;
276
277 dst = inet_csk_update_pmtu(sk, mtu); //更新路由表中mtu的信息
278 if (!dst)
279 return;
280
281 /* Something is about to be wrong... Remember soft error
282 * for the case, if this connection will not able to recover.
283 */
284 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 sk->sk_err_soft = EMSGSIZE;
286
287 mtu = dst_mtu(dst); //得到路由表中记录的MTU
288
289 if (inet->pmtudisc != IP_PMTUDISC_DONT && //确实能够发送不分片报文
290 inet_csk(sk)->icsk_pmtu_cookie > mtu) { //socket中记录的PMTU大于路由表中的PMTU
291 tcp_sync_mss(sk, mtu); //更新socket中记录的MTU的值
292
293 /* Resend the TCP packet because it's
294 * clear that the old packet has been
295 * dropped. This is the new "fast" path mtu
296 * discovery.
297 */
298 tcp_simple_retransmit(sk); //重传数据,因为有数据丢失
299 } /* else let the usual retransmit timer handle it */
300
在探测包过大被路由器丢弃并收到ICMP报文的情况下,TCP会把ICMP中通告的MTU作为PMTU保存下来。但出于安全等考虑,并不是所有的路由器在丢弃过大的报文时都会发送ICMP消息。如果探测包被这样的路由器丢弃,TCP不会收到任何响应,就好像探测包进入了“黑洞”一样,这就是TCP PMTU发现中的Black Hole Detection问题,即结果(3)。探测包丢失后TCP有两种方式处理:快速重传和超时重传。先来看快速重传,执行这个功能的是tcp_fastretrans_alert函数:
2745 static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2746 int prior_sacked, int prior_packets,
2747 bool is_dupack, int flag)
2748 {
...
2833 /* MTU probe failure: don't reduce cwnd */
2834 if (icsk->icsk_ca_state < TCP_CA_CWR &&
2835 icsk->icsk_mtup.probe_size && //开启PMTU探测
2836 tp->snd_una == tp->mtu_probe.probe_seq_start) { //探测包完全未收到
2837 tcp_mtup_probe_failed(sk);
2838 /* Restores the reduction we did in tcp_mtup_probe() */
2839 tp->snd_cwnd++;
2840 tcp_simple_retransmit(sk);
2841 return;
2842 }
...
tcp_mtup_probe_failed函数处理探测失败的情况:
2580 static void tcp_mtup_probe_failed(struct sock *sk)
2581 {
2582 struct inet_connection_sock *icsk = inet_csk(sk);
2583
2584 icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
2585 icsk->icsk_mtup.probe_size = 0; //结束本次探测
2586 }
在快速重传的情况下,TCP会更新一下PMTU探测的上限。超时重传时呢?重传定时器会调用tcp_write_timeout函数:
156 static int tcp_write_timeout(struct sock *sk)
157 {
158 struct inet_connection_sock *icsk = inet_csk(sk);
159 int retry_until;
160 bool do_reset, syn_set = false;
161
162 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
163 if (icsk->icsk_retransmits)
164 dst_negative_advice(sk);
165 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
166 syn_set = true;
167 } else {
168 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
169 /* Black hole detection */
170 tcp_mtu_probing(icsk, sk);
...
确定是超时时,tcp_write_timeout函数会调用tcp_mtu_probing函数处理PMTU:
102 static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
103 {
104 /* Black hole detection */
105 if (sysctl_tcp_mtu_probing) {
106 if (!icsk->icsk_mtup.enabled) { //如果未开启PMTU发现机制
107 icsk->icsk_mtup.enabled = 1; //开启之
108 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); //初始化PMTU和MSS
109 } else {
110 struct tcp_sock *tp = tcp_sk(sk);
111 int mss;
112
113 mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; //缩小MSS
114 mss = min(sysctl_tcp_base_mss, mss);
115 mss = max(mss, 68 - tp->tcp_header_len);
116 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
117 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); //保存缩小后的MSS
118 }
119 }
120 }
这样看来,超时的情况下TCP会减小MSS,如此就使TCP在路由器不支持PMTU发现机制的情况下实现了PMTU的探测。
下面来总结一下问题3的答案:TCP在发送数据时会将小段数据合并到大的探测包再发送,TCP发送的包的IP头会设置不分片的DF位。如果包顺利抵达目的地,则用这个包的MTU作为PMTU;如果包过大被丢弃,若路由器会发送“需要分片,但设置了DF位”的ICMP报文,则使用ICMP报文中的MTU值作为PMTU;若路由器不发送ICMP,则在超时重传时TCP会减小PMTU。在得到PMTU后,TCP会将其保存在socket中,并更新MSS信息,接下来用新的MSS继续发送探测包和普通数据包。
根据对问题3的代码分析来回答第4个问题:TCP路径MTU探测得到的PMTU保存在inet_connection_sock的icsk_pmtu_cookie中,MSS保存在tcp_sock的mss_cache变量中。如果收到了ICMP报文,则用报文中的值更新路由表中保存的MTU信息。
问题5:TCP会同时使用保存在路由表和socket中的MTU:
client端连接建立时:
2752 void tcp_connect_init(struct sock *sk)
2753 {
...
2773 tcp_mtup_init(sk);
2774 tcp_sync_mss(sk, dst_mtu(dst));
...
server端创建socket时:
1642 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1643 struct request_sock *req,
1644 struct dst_entry *dst)
1645 {
...
1691 tcp_mtup_init(newsk);
1692 tcp_sync_mss(newsk, dst_mtu(dst));
...
发送数据时:
1016 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1017 size_t size)
1018 {
...
1067 mss_now = tcp_send_mss(sk, &size_goal, flags);
...
tcp_send_mss函数会调用tcp_current_mss函数来获取当前MSS的值:
1321 unsigned int tcp_current_mss(struct sock *sk)
1322 {
1323 const struct tcp_sock *tp = tcp_sk(sk);
1324 const struct dst_entry *dst = __sk_dst_get(sk);
1325 u32 mss_now;
1326 unsigned int header_len;
1327 struct tcp_out_options opts;
1328 struct tcp_md5sig_key *md5;
1329
1330 mss_now = tp->mss_cache;
1331
1332 if (dst) {
1333 u32 mtu = dst_mtu(dst);
1334 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1335 mss_now = tcp_sync_mss(sk, mtu);
1336 }
1337
1338 header_len = tcp_established_options(sk, NULL, &opts, &md5) +
1339 sizeof(struct tcphdr);
1340 /* The mss_cache is sized based on tp->tcp_header_len, which assumes
1341 * some common options. If this is an odd packet (because we have SACK
1342 * blocks etc) then our calculated header_len will be different, and
1343 * we have to adjust mss_now correspondingly */
1344 if (header_len != tp->tcp_header_len) {
1345 int delta = (int) header_len - tp->tcp_header_len;
1346 mss_now -= delta;
1347 }
1348
1349 return mss_now;
1350 }
至此,关于TCP PMTU发现的问题全部回答完毕。在开启PMTU发现功能时,PMTU探测会不断地进行。网络中路径的情况在不停地变化,TCP也会不时地得到新的探测结果,并利用这些结果去影响所发送的报文段的大小。PMTU发现机制使得TCP能尽快获得数据传输路径的MTU大小,从而尽可能使用“不会因为报文过大而被路由器丢弃”的最大长度去发送每一个报文段,进而力求使得数据发送的效率最大化。
补充:关于DF设置的说明
一、首先我要说明一点:我认为所有TCP报文的IP报头都会设置DF位,不只是TCP MTU探测报文会设置。
代码:
if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
只要满足了“ip_dont_fragment(sk, &rt->dst) == 1”且“skb->local_df == 0”,则一定会设置DF位。
二、我们先来看看skb->local_df。
TCP中发送数据包时申请SKB(包括MTU探测报文)的函数是sk_stream_alloc_skb,在这个函数中skb->local_df是0。skb->local_df只有在__ip_make_skb函数中才可能被设置为1:
/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
* to fragment the frame generated here. No matter, what transforms
* how transforms change size of the packet, it will comeout.
*/
if (inet->pmtudisc < IP_PMTUDISC_DO)
skb->local_df = 1;
而__ip_make_skb只有UDP协议会调用,故对所有TCP报文满足“skb->local_df == 0”。
三、ip_dont_fragment函数:
static inline
int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
{
return inet_sk(sk)->pmtudisc == IP_PMTUDISC_DO ||
(inet_sk(sk)->pmtudisc == IP_PMTUDISC_WANT &&
!(dst_metric_locked(dst, RTAX_MTU)));
}
在inet_create函数中inet_sk(sk)->pmtudisc会被设置为 IP_PMTUDISC_WANT:
if (ipv4_config.no_pmtu_disc) //ipv4_config.no_pmtu_disc默认是0
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
dst_metric_locked(dst,RTAX_MTU)是查看dst_metric的MTU对应的位是否被锁定了。metric是用来保存与对端通信时的参数,与MTU对应的参数应该只有在修改时才会被锁定。故在通常情况下dst_metric_locked(dst, RTAX_MTU)的值应该是0。
四、综上,对于所有TCP报文,其IP报头的DF位都会被设置。