TCP拥塞状态共有5个:
135 enum tcp_ca_state { 136 TCP_CA_Open = 0, 137 #define TCPF_CA_Open (1<<TCP_CA_Open) 138 TCP_CA_Disorder = 1, 139 #define TCPF_CA_Disorder (1<<TCP_CA_Disorder) 140 TCP_CA_CWR = 2, 141 #define TCPF_CA_CWR (1<<TCP_CA_CWR) 142 TCP_CA_Recovery = 3, 143 #define TCPF_CA_Recovery (1<<TCP_CA_Recovery) 144 TCP_CA_Loss = 4 145 #define TCPF_CA_Loss (1<<TCP_CA_Loss) 146 };
Open:正常状态,未检测到任何可疑事件(如重复确认、SACK块或ECN标记);此状态下TCP按慢启动或拥塞避免算法正常调整拥塞窗口。
Disorder:当第一次由于SACK块或重复确认而检测到拥塞迹象时进入此状态;此状态下拥塞窗口保持不变,TCP需要保持网络中的报文数量不变;TCP在进入Recovery状态之前要先进入本状态。
CWR(Congestion Window Reduced):此状态下TCP会减小拥塞窗口,但不会重传已发送的数据;在发生本地拥塞或收到显式拥塞通知(ECN)后进入此状态。
Recovery:此状态下TCP会减小拥塞窗口,直至到达ssthresh,期间不能增大拥塞窗口;同时会重传数据。
Loss:所有已发送数据都会被标记为丢失,拥塞窗口减小到一个报文段,然后数据发送端使用慢启动算法增大拥塞窗口。这个状态下不能使用快速重传算法
TCP在调用tcp_write_xmit函数发送数据时会检查拥塞窗口:
1811 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 1812 int push_one, gfp_t gfp) 1813 { ... 1842 cwnd_quota = tcp_cwnd_test(tp, skb); 1843 if (!cwnd_quota) { //拥塞窗口不允许发送数据 1844 if (push_one == 2) //发送丢失探测报文是允许的 1845 /* Force out a loss probe pkt. */ 1846 cwnd_quota = 1; 1847 else //其它的报文不允许 1848 break; 1849 } ...tcp_cwnd_test函数来检查拥塞窗口是否允许发送数据:
1407 static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, 1408 const struct sk_buff *skb) 1409 { 1410 u32 in_flight, cwnd; 1411 1412 /* Don't be strict about the congestion window for the final FIN. */ 1413 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && 1414 tcp_skb_pcount(skb) == 1) 1415 return 1; //不需要分段的带FIN标记位的报文是不受拥塞窗口限制的 1416 1417 in_flight = tcp_packets_in_flight(tp); //得到在网络中的报文数量 1418 cwnd = tp->snd_cwnd; 1419 if (in_flight < cwnd) 1420 return (cwnd - in_flight); //当前拥塞窗口即允许发送的报文数量,减去在网络中的报文数量就是现在允许发送的数量 1421 1422 return 0; 1423 }可见拥塞窗口的值保存在tp->snd_cwnd中,这个值由拥塞控制算法来计算。
TCP的拥塞控制是从ACK的处理开始的:
3325 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 3326 { 3327 struct inet_connection_sock *icsk = inet_csk(sk); 3328 struct tcp_sock *tp = tcp_sk(sk); 3329 u32 prior_snd_una = tp->snd_una; 3330 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3331 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3332 bool is_dupack = false; 3333 u32 prior_in_flight; 3334 u32 prior_fackets; 3335 int prior_packets = tp->packets_out; 3336 int prior_sacked = tp->sacked_out; 3337 int pkts_acked = 0; 3338 int previous_packets_out = 0; 3339 3340 /* If the ack is older than previous acks 3341 * then we can probably ignore it. 3342 */ 3343 if (before(ack, prior_snd_una)) { 3344 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ 3345 if (before(ack, prior_snd_una - tp->max_window)) { 3346 tcp_send_challenge_ack(sk); 3347 return -1; 3348 } 3349 goto old_ack; 3350 } 3351 3352 /* If the ack includes data we haven't sent yet, discard 3353 * this segment (RFC793 Section 3.9). 3354 */ 3355 if (after(ack, tp->snd_nxt)) 3356 goto invalid_ack; ... 3374 if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { //处于快速处理路径并且有新被确认的数据 3375 /* Window is constant, pure forward advance. 3376 * No more checks are required. 3377 * Note, we use the fact that SND.UNA>=SND.WL2. 
3378 */ 3379 tcp_update_wl(tp, ack_seq); 3380 tp->snd_una = ack; 3381 flag |= FLAG_WIN_UPDATE; 3382 3383 tcp_ca_event(sk, CA_EVENT_FAST_ACK); //处理快速ACK拥塞事件 3384 3385 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); 3386 } else { //处于慢速处理路径或ack_seq号与之前重复 3387 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 3388 flag |= FLAG_DATA; //包中有数据 3389 else 3390 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS); 3391 3392 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); 3393 3394 if (TCP_SKB_CB(skb)->sacked) 3395 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3396 3397 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) //TCP开启了ECN功能且在ACK中发现了ecn标记 3398 flag |= FLAG_ECE; 3399 3400 tcp_ca_event(sk, CA_EVENT_SLOW_ACK); //处理慢速ACK拥塞事件 3401 } ... 3409 if (!prior_packets) 3410 goto no_queue; ... 3413 previous_packets_out = tp->packets_out; 3414 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); 3415 3416 pkts_acked = previous_packets_out - tp->packets_out; 3417 3418 if (tcp_ack_is_dubious(sk, flag)) { 3419 /* Advance CWND, if state allows this. */ 3420 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) 3421 tcp_cong_avoid(sk, ack, prior_in_flight); 3422 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3423 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3424 prior_packets, is_dupack, flag); 3425 } else { 3426 if (flag & FLAG_DATA_ACKED) 3427 tcp_cong_avoid(sk, ack, prior_in_flight); 3428 } 3429 3430 if (tp->tlp_high_seq) 3431 tcp_process_tlp_ack(sk, ack, flag); 3432 3433 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { 3434 struct dst_entry *dst = __sk_dst_get(sk); 3435 if (dst) 3436 dst_confirm(dst); 3437 } 3438 3439 if (icsk->icsk_pending == ICSK_TIME_RETRANS) 3440 tcp_schedule_loss_probe(sk); 3441 return 1; 3442 3443 no_queue: 3444 /* If data was DSACKed, see if we can undo a cwnd reduction. 
*/ 3445 if (flag & FLAG_DSACKING_ACK) 3446 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3447 prior_packets, is_dupack, flag); 3448 /* If this ack opens up a zero window, clear backoff. It was 3449 * being used to time the probes, and is probably far higher than 3450 * it needs to be for normal retransmission. 3451 */ 3452 if (tcp_send_head(sk)) 3453 tcp_ack_probe(sk); 3454 3455 if (tp->tlp_high_seq) 3456 tcp_process_tlp_ack(sk, ack, flag); 3457 return 1; 3458 3459 invalid_ack: 3460 SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt); 3461 return -1; 3462 3463 old_ack: 3464 /* If data was SACKed, tag it and see if we should send more data. 3465 * If data was DSACKed, see if we can undo a cwnd reduction. 3466 */ 3467 if (TCP_SKB_CB(skb)->sacked) { 3468 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3469 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3470 prior_packets, is_dupack, flag); 3471 } 3472 3473 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); 3474 return 0;
To be continued...