tcp cwnd

snd_cwnd counts segments, not bytes.

tcp_window_allows() is called from tcp_write_xmit(). It converts the congestion window into bytes (mss_now * cwnd), computes how many bytes the receiver's advertised window still allows starting from this skb, and returns the smaller of the two, i.e. how many bytes may actually be sent right now (a small worked example follows the function).
static unsigned int tcp_window_allows (struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd )
{
    u32 window, cwnd_len;
 
    window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
    cwnd_len = mss_now * cwnd ;
    return min(window, cwnd_len);
}
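A small worked example (a userspace sketch with invented numbers, not kernel code): with snd_una = 1000, snd_wnd = 64000, an skb starting at sequence 21000, mss_now = 1460 and cwnd = 10, the receiver window still permits 44000 bytes from this skb while the congestion window permits 14600 bytes, so 14600 is returned.

#include <stdio.h>
#include <stdint.h>

/* Userspace model of tcp_window_allows(); all values are hypothetical. */
static uint32_t window_allows(uint32_t snd_una, uint32_t snd_wnd,
                              uint32_t skb_seq, uint32_t mss_now, uint32_t cwnd)
{
    uint32_t window   = snd_una + snd_wnd - skb_seq; /* bytes the receive window still allows */
    uint32_t cwnd_len = mss_now * cwnd;              /* congestion window converted to bytes */
    return window < cwnd_len ? window : cwnd_len;
}

int main(void)
{
    printf("%u\n", window_allows(1000, 64000, 21000, 1460, 10)); /* prints 14600 */
    return 0;
}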
 
/* Can at least one segment of SKB be sent right now, according to the
* congestion window rules?  If so, return how many segments are allowed.
*/
tcp_cwnd_test() compares the congestion window with the number of packets currently in flight. If the congestion window is larger, it returns the remaining room in segments (cwnd minus in-flight); if the in-flight count has already reached cwnd, it returns 0 and nothing more is sent.

static inline unsigned int tcp_cwnd_test (struct tcp_sock *tp, struct sk_buff *skb)
{
    u32 in_flight, cwnd;
 
    /* Don't be strict about the congestion window for the final FIN:
     * a lone FIN segment is let through without the cwnd check. */
    if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
        tcp_skb_pcount(skb) == 1)
        return 1;
 
    in_flight = tcp_packets_in_flight(tp); /* packets currently on the wire */
    cwnd = tp->snd_cwnd;
    if (in_flight < cwnd)
        return (cwnd - in_flight);
 
    return 0;
}
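For reference, tcp_packets_in_flight() in this kernel era boils down to packets_out - (sacked_out + lost_out) + retrans_out: segments sent but neither SACKed nor presumed lost, plus retransmissions still outstanding. A minimal userspace sketch of the test above, with hypothetical counters:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical per-connection counters, mirroring a few struct tcp_sock fields. */
struct conn {
    uint32_t packets_out;  /* segments sent and not yet cumulatively ACKed */
    uint32_t sacked_out;   /* segments SACKed by the receiver */
    uint32_t lost_out;     /* segments presumed lost */
    uint32_t retrans_out;  /* retransmitted segments still outstanding */
    uint32_t snd_cwnd;     /* congestion window, in segments */
};

/* Model of tcp_packets_in_flight(): out - left_out + retransmissions. */
static uint32_t packets_in_flight(const struct conn *c)
{
    return c->packets_out - (c->sacked_out + c->lost_out) + c->retrans_out;
}

/* Model of tcp_cwnd_test(): how many more segments may go out right now. */
static uint32_t cwnd_test(const struct conn *c)
{
    uint32_t in_flight = packets_in_flight(c);
    return in_flight < c->snd_cwnd ? c->snd_cwnd - in_flight : 0;
}

int main(void)
{
    struct conn c = { .packets_out = 8, .sacked_out = 2, .lost_out = 1,
                      .retrans_out = 1, .snd_cwnd = 10 };
    printf("in_flight=%u, may send %u more segments\n",
           packets_in_flight(&c), cwnd_test(&c)); /* in_flight=6, 4 more */
    return 0;
}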
 

/* Try to defer sending, if possible, in order to minimize the amount
* of TSO splitting we do.  View it as a kind of TSO Nagle test.
*
* This algorithm is from John Heffner.
*/
tcp_tso_should_defer() is called from tcp_write_xmit(). It computes the room left in the congestion window (cwnd minus packets in flight, in bytes), takes the tighter of that and the send window, and sends immediately if at least 64 KB (a full-sized TSO frame) fits; otherwise it may defer the segment in the hope of building a larger TSO burst.

static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);
    const struct inet_connection_sock *icsk = inet_csk(sk);
    u32 send_win, cong_win, limit, in_flight;
 
    if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
        goto send_now;
 
    if (icsk->icsk_ca_state != TCP_CA_Open)
        goto send_now;
 
    /* Defer for less than two clock ticks. */
    if (!tp->tso_deferred && ((jiffies<<1)>>1) - (tp->tso_deferred>>1) > 1)
        goto send_now;
 
    in_flight = tcp_packets_in_flight(tp); /* packets currently on the wire */
 
    BUG_ON(tcp_skb_pcount(skb) <= 1 ||
        (tp->snd_cwnd <= in_flight));
 
    send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
 
    /* From in_flight test above, we know that cwnd > in_flight.  */
    cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; /* room left in cwnd, in bytes */
 
    limit = min(send_win, cong_win); /* the tighter of send window and cwnd */
 
    /* If a full-sized TSO skb can be sent, do it. */
    if (limit >= 65536) /* at least 64 KB of room: send right away */
        goto send_now;
 
    if (sysctl_tcp_tso_win_divisor) {
        u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
 
        /* If at least some fraction of a window is available,
        * just use it.
        */
        chunk /= sysctl_tcp_tso_win_divisor;
        if (limit >= chunk )
            goto send_now;
    } else {
        /* Different approach, try not to defer past a single
        * ACK.  Receiver should ACK every other full sized
        * frame, so if we have space for more than 3 frames
        * then send now.
        */
        if (limit > tcp_max_burst(tp) * tp->mss_cache)
            goto send_now;
    }
 
    /* Ok, it looks like it is advisable to defer.  */
    tp->tso_deferred = 1 | (jiffies<<1);
 
    return 1;
 
send_now:
    tp->tso_deferred = 0;
    return 0;
}
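To make the deferral arithmetic concrete, here is a userspace sketch of just the window math (all numbers invented): with cwnd = 20, 12 segments in flight, mss_cache = 1460 and a 64 KB window, cong_win is 8 * 1460 = 11680 bytes. That is below 65536, but with tcp_tso_win_divisor = 3 the threshold is min(65536, 29200) / 3 = 9733, and 11680 >= 9733, so the segment is sent now rather than deferred.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical model of the window arithmetic in tcp_tso_should_defer().
 * Returns 1 to defer, 0 to send now. */
static int should_defer(uint32_t send_win, uint32_t snd_wnd, uint32_t snd_cwnd,
                        uint32_t in_flight, uint32_t mss, uint32_t win_divisor,
                        uint32_t max_burst)
{
    uint32_t cong_win = (snd_cwnd - in_flight) * mss;  /* free cwnd space, in bytes */
    uint32_t limit = send_win < cong_win ? send_win : cong_win;

    if (limit >= 65536)                 /* room for a full-sized TSO frame: send now */
        return 0;

    if (win_divisor) {
        /* a fraction of the whole window is considered enough */
        uint32_t chunk = snd_wnd < snd_cwnd * mss ? snd_wnd : snd_cwnd * mss;
        chunk /= win_divisor;
        if (limit >= chunk)
            return 0;
    } else if (limit > max_burst * mss) {
        return 0;                       /* more than ~3 full frames fit: send now */
    }
    return 1;                           /* defer, hoping to build a larger TSO burst */
}

int main(void)
{
    /* send_win = snd_wnd = 64 KB, cwnd = 20, 12 in flight, mss = 1460, divisor = 3 */
    printf("defer=%d\n", should_defer(65536, 65536, 20, 12, 1460, 3, 3)); /* defer=0 */
    return 0;
}

With fewer free segments (say 17 of 20 in flight, leaving only 3 * 1460 = 4380 bytes of cwnd), the same call would return 1 and the skb would be held back.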
 
 
/* This gets called after a retransmit timeout, and the initially
* retransmitted data is acknowledged.  It tries to continue
* resending the rest of the retransmit queue, until either
* we've sent it all or the congestion window limit is reached.
* If doing SACK, the first ACK which comes back for a timeout
* based retransmit packet might feed us FACK information again.
* If so, we use it to avoid unnecessarily retransmissions.
*/
tcp_xmit_retransmit_queue(): if the number of packets in flight already fills the congestion window, retransmission stops for now and resumes as further ACKs open the window.
void tcp_xmit_retransmit_queue (struct sock *sk)
{
    const struct inet_connection_sock *icsk = inet_csk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
    int packet_cnt;
 
    if (tp->retransmit_skb_hint) {
        skb = tp->retransmit_skb_hint;
        packet_cnt = tp->retransmit_cnt_hint;
    } else {
        skb = tcp_write_queue_head(sk);
        packet_cnt = 0;
    }
 
    /* First pass: retransmit lost packets. */
    if (tp->lost_out) {
        tcp_for_write_queue_from(skb, sk) {
            __u8 sacked = TCP_SKB_CB(skb)->sacked;
 
            if (skb == tcp_send_head(sk))
                break;
            /* we could do better than to assign each time */
            tp->retransmit_skb_hint = skb;
            tp->retransmit_cnt_hint = packet_cnt;
 
            /* Assume this retransmit will generate
            * only one packet for congestion window
            * calculation purposes.  This works because
            * tcp_retransmit_skb() will chop up the
            * packet to be MSS sized and all the
            * packet counting works out.
            */
            /* If in-flight already fills the congestion window, stop
             * retransmitting for now. */
            if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                return;
 
            if (sacked & TCPCB_LOST) {
                if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
                    if (tcp_retransmit_skb(sk, skb)) {
                        tp->retransmit_skb_hint = NULL;
                        return;
                    }
                    if (icsk->icsk_ca_state != TCP_CA_Loss)
                        NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
                    else
                        NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
 
                    if (skb == tcp_write_queue_head(sk))
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                        inet_csk(sk)->icsk_rto,
                        TCP_RTO_MAX);
                }
 
                packet_cnt += tcp_skb_pcount(skb);
                if (packet_cnt >= tp->lost_out )
                    break;
            }
        }
    }
 
    /* OK, demanded retransmission is finished. */
 
    /* Forward retransmissions are possible only during Recovery. */
    if (icsk->icsk_ca_state != TCP_CA_Recovery)
        return;
 
    /* No forward retransmissions in Reno are possible. */
    if (tcp_is_reno(tp))
        return;
 
    /* Yeah, we have to make difficult choice between forward transmission
    * and retransmission... Both ways have their merits...
    *
    * For now we do not retransmit anything, while we have some new
    * segments to send. In the other cases, follow rule 3 for
    * NextSeg() specified in RFC3517.
    */
 
    if (tcp_may_send_now(sk))
        return;
 
    /* If nothing is SACKed, highest_sack in the loop won't be valid */
    if (!tp->sacked_out)
        return;
 
    if (tp->forward_skb_hint)
        skb = tp->forward_skb_hint;
    else
        skb = tcp_write_queue_head(sk);
 
    tcp_for_write_queue_from(skb, sk) {
        if (skb == tcp_send_head(sk))
            break;
        tp->forward_skb_hint = skb;
 
        if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack))
            break;
 
        if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
            break;
 
        if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
            continue;
 
        /* Ok, retransmit it. */
        if (tcp_retransmit_skb(sk, skb)) {
            tp->forward_skb_hint = NULL;
            break;
        }
 
        if (skb == tcp_write_queue_head(sk))
            inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
            inet_csk(sk)->icsk_rto,
            TCP_RTO_MAX);
 
        NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
    }
}
 
/* Numbers are taken from RFC3390.
 *
 * John Heffner states:
 *
 * The RFC specifies a window of no more than 4380 bytes
 * unless 2*MSS > 4380.  Reading the pseudocode in the RFC
 * is a bit misleading because they use a clamp at 4380 bytes
 * rather than use a multiplier in the relevant range.
 */
__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
{
    __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

    if (!cwnd) {
        if (tp->mss_cache > 1460)
            cwnd = 2;
        else
            cwnd = (tp->mss_cache > 1095) ? 3 : 4;
    }
    return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
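Assuming no route metric overrides the value and snd_cwnd_clamp does not bite, the clause above gives 4 segments for MSS <= 1095, 3 segments for MSS of 1096-1460, and 2 segments for anything larger, which tracks RFC 3390's min(4*MSS, max(2*MSS, 4380 bytes)). A standalone sketch:

#include <stdio.h>
#include <stdint.h>

/* Userspace copy of the RFC3390 clause in tcp_init_cwnd(), ignoring the
 * route metric and snd_cwnd_clamp (assumed absent for this illustration). */
static uint32_t init_cwnd(uint32_t mss)
{
    if (mss > 1460)
        return 2;
    return mss > 1095 ? 3 : 4;
}

int main(void)
{
    uint32_t mss_samples[] = { 536, 1095, 1460, 8960 };
    for (unsigned i = 0; i < sizeof(mss_samples) / sizeof(mss_samples[0]); i++)
        printf("mss=%u -> initial cwnd = %u segments (%u bytes)\n",
               mss_samples[i], init_cwnd(mss_samples[i]),
               init_cwnd(mss_samples[i]) * mss_samples[i]);
    return 0;
}

This prints 4, 4, 3 and 2 segments (2144, 4380, 4380 and 17920 bytes) respectively.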
 
 
Entering the CWR state (congestion window reduced):
/* Set slow start threshold and cwnd not falling to slow start */
void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
{
    struct tcp_sock *tp = tcp_sk(sk);
    const struct inet_connection_sock *icsk = inet_csk(sk);

    tp->prior_ssthresh = 0;
    tp->bytes_acked = 0;
    if (icsk->icsk_ca_state < TCP_CA_CWR) {
        tp->undo_marker = 0;
        if (set_ssthresh)
            tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
        tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U);
        tp->snd_cwnd_cnt = 0;
        tp->high_seq = tp->snd_nxt;
        tp->snd_cwnd_stamp = tcp_time_stamp;
        TCP_ECN_queue_cwr(tp);

        tcp_set_ca_state(sk, TCP_CA_CWR);
    }
}
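For the stock NewReno congestion control of this era, the ssthresh hook invoked above is essentially tcp_reno_ssthresh(), which halves the current window with a floor of two segments. A hypothetical userspace sketch of the net effect of tcp_enter_cwr() on ssthresh and cwnd:

#include <stdio.h>
#include <stdint.h>

/* Userspace model of NewReno's ssthresh hook (tcp_reno_ssthresh):
 * half the current cwnd, never less than two segments. */
static uint32_t reno_ssthresh(uint32_t snd_cwnd)
{
    uint32_t half = snd_cwnd >> 1;
    return half > 2 ? half : 2;
}

int main(void)
{
    /* Entering CWR with cwnd = 20 and 15 packets in flight (hypothetical): */
    uint32_t cwnd = 20, in_flight = 15;
    uint32_t ssthresh = reno_ssthresh(cwnd);

    if (cwnd > in_flight + 1)           /* min(snd_cwnd, in_flight + 1) */
        cwnd = in_flight + 1;
    printf("ssthresh=%u cwnd=%u\n", ssthresh, cwnd); /* ssthresh=10 cwnd=16 */
    return 0;
}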
 
 
 
/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
* recovery a bit and use heuristics in tcp_process_frto() to detect if
* the RTO was spurious. Only clear SACKED_RETRANS of the head here to
* keep retrans_out counting accurate (with SACK F-RTO, other than head
* may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
* bits are handled if the Loss state is really to be entered (in
* tcp_enter_frto_loss).
*
* Do like tcp_enter_loss() would; when RTO expires the second time it
* does:
*  "Reduce ssthresh if it has not yet been made inside this window."
*/
void tcp_enter_frto (struct sock *sk)
{
    const struct inet_connection_sock *icsk = inet_csk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
 
    if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
        tp->snd_una == tp->high_seq ||
        ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
        !icsk->icsk_retransmits)) {
            tp->prior_ssthresh = tcp_current_ssthresh(sk);
            /* Our state is too optimistic in ssthresh() call because cwnd
            * is not reduced until tcp_enter_frto_loss() when previous F-RTO
            * recovery has not yet completed. Pattern would be this: RTO,
            * Cumulative ACK, RTO (2xRTO for the same segment does not end
            * up here twice).
            * RFC4138 should be more specific on what to do, even though
            * RTO is quite unlikely to occur after the first Cumulative ACK
            * due to back-off and complexity of triggering events ...
            */
            if (tp->frto_counter) {
                u32 stored_cwnd;
                stored_cwnd = tp->snd_cwnd;
                tp->snd_cwnd = 2;
                tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
                tp->snd_cwnd = stored_cwnd;
            } else {
                tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
            }
            /* ... in theory, cong.control module could do "any tricks" in
            * ssthresh(), which means that ca_state, lost bits and lost_out
            * counter would have to be faked before the call occurs. We
            * consider that too expensive, unlikely and hacky, so modules
            * using these in ssthresh() must deal these incompatibility
            * issues if they receives CA_EVENT_FRTO and frto_counter != 0
            */
            tcp_ca_event(sk, CA_EVENT_FRTO);
    }
 
    tp->undo_marker = tp->snd_una;
    tp->undo_retrans = 0;
 
    skb = tcp_write_queue_head(sk);
    if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
        tp->undo_marker = 0;
    if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
        TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
        tp->retrans_out -= tcp_skb_pcount(skb);
    }
    tcp_verify_left_out(tp);
 
    /* Too bad if TCP was application limited */
    tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
 
    /* Earlier loss recovery underway (see RFC4138; Appendix B).
    * The last condition is necessary at least in tp->frto_counter case.
    */
    if (IsSackFrto() && (tp->frto_counter ||
        ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
        after(tp->high_seq, tp->snd_una)) {
            tp->frto_highmark = tp->high_seq;
    } else {
        tp->frto_highmark = tp->snd_nxt;
    }
    tcp_set_ca_state(sk, TCP_CA_Disorder);
    tp->high_seq = tp->snd_nxt;
    tp->frto_counter = 1;
}
 
/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
* which indicates that we should follow the traditional RTO recovery,
* i.e. mark everything lost and do go-back-N retransmission.
*/
static void tcp_enter_frto_loss (struct sock *sk, int allowed_segments, int flag)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
 
    tp->lost_out = 0;
    tp->retrans_out = 0;
    if (tcp_is_reno(tp))
        tcp_reset_reno_sack(tp);
 
    tcp_for_write_queue(skb, sk) {
        if (skb == tcp_send_head(sk))
            break;
 
        TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
        /*
        * Count the retransmission made on RTO correctly (only when
        * waiting for the first ACK and did not get it)...
        */
        if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) {
            /* For some reason this R-bit might get cleared? */
            if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
                tp->retrans_out += tcp_skb_pcount(skb);
            /* ...enter this if branch just for the first segment */
            flag |= FLAG_DATA_ACKED;
        } else {
            if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
                tp->undo_marker = 0;
            TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
        }
 
        /* Don't lost mark skbs that were fwd transmitted after RTO */
        if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) &&
            !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) {
                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                tp->lost_out += tcp_skb_pcount(skb);
        }
    }
    tcp_verify_left_out(tp);
 
    tp->snd_cwnd = tcp_packets_in_flight (tp) + allowed_segments;
    tp->snd_cwnd_cnt = 0;
    tp->snd_cwnd_stamp = tcp_time_stamp;
    tp->frto_counter = 0;
    tp->bytes_acked = 0;
 
    tp->reordering = min_t(unsigned int, tp->reordering,
        sysctl_tcp_reordering);
    tcp_set_ca_state(sk, TCP_CA_Loss);
    tp->high_seq = tp->frto_highmark;
    TCP_ECN_queue_cwr(tp);
 
    tcp_clear_retrans_hints_partial(tp);
}
 
Entering the Loss state:
/* Enter Loss state. If "how" is not zero, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
void tcp_enter_loss (struct sock *sk, int how)
{
    const struct inet_connection_sock *icsk = inet_csk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
 
    /* Reduce ssthresh if it has not yet been made inside this window. */
    if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
        (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
            tp->prior_ssthresh = tcp_current_ssthresh(sk);
            tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
            tcp_ca_event(sk, CA_EVENT_LOSS);
    }
    tp->snd_cwnd     = 1;
    tp->snd_cwnd_cnt    = 0;
    tp->snd_cwnd_stamp = tcp_time_stamp;
 
    tp->bytes_acked = 0;
    tcp_clear_retrans_partial(tp);
 
    if (tcp_is_reno(tp))
        tcp_reset_reno_sack(tp);
 
    if (!how) {
        /* Push undo marker, if it was plain RTO and nothing
        * was retransmitted. */
        tp->undo_marker = tp->snd_una;
        tcp_clear_retrans_hints_partial(tp);
    } else {
        tp->sacked_out = 0;
        tp->fackets_out = 0;
        tcp_clear_all_retrans_hints(tp);
    }
 
    tcp_for_write_queue(skb, sk) {
        if (skb == tcp_send_head(sk))
            break;
 
        if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
            tp->undo_marker = 0;
        TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
        if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
            TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
            TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
            tp->lost_out += tcp_skb_pcount(skb);
        }
    }
    tcp_verify_left_out(tp);
 
    tp->reordering = min_t(unsigned int, tp->reordering,
        sysctl_tcp_reordering);
    tcp_set_ca_state(sk, TCP_CA_Loss);
    tp->high_seq = tp->snd_nxt;
    TCP_ECN_queue_cwr(tp);
    /* Abort F-RTO algorithm if one is in progress */
    tp->frto_counter = 0;
}
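The net effect with how = 0 and the default Reno hooks (a hypothetical illustration): cwnd collapses to 1 segment, ssthresh drops to roughly half of the old cwnd, and every outstanding segment not yet SACKed is tagged TCPCB_LOST, so recovery proceeds go-back-N style from snd_una. A toy model of the tagging loop:

#include <stdio.h>
#include <stdint.h>

#define TCPCB_SACKED_ACKED 0x01   /* flag values as in the kernel headers of this era */
#define TCPCB_LOST         0x04

int main(void)
{
    /* Hypothetical write queue of 8 outstanding segments, two of them SACKed. */
    uint8_t sacked[8] = { 0, 0, TCPCB_SACKED_ACKED, 0, 0, TCPCB_SACKED_ACKED, 0, 0 };
    uint32_t lost_out = 0;

    for (int i = 0; i < 8; i++) {
        if (!(sacked[i] & TCPCB_SACKED_ACKED)) {   /* not SACKed -> presumed lost */
            sacked[i] |= TCPCB_LOST;
            lost_out++;
        }
    }
    printf("lost_out = %u of 8 outstanding segments\n", lost_out); /* prints 6 */
    return 0;
}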
 
 
/* CWND moderation, preventing bursts due to too big ACKs
 * in dubious situations.
 */
static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
{
    tp->snd_cwnd = min(tp->snd_cwnd,
                       tcp_packets_in_flight(tp) + tcp_max_burst(tp));
    tp->snd_cwnd_stamp = tcp_time_stamp;
}
 
/* Decrease cwnd each second ack. */
static void tcp_cwnd_down(struct sock *sk, int flag)
{
    struct tcp_sock *tp = tcp_sk(sk);
    int decr = tp->snd_cwnd_cnt + 1;

    if ((flag&(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) ||
        (tcp_is_reno(tp) && !(flag&FLAG_NOT_DUP))) {
        tp->snd_cwnd_cnt = decr&1;
        decr >>= 1;

        if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
            tp->snd_cwnd -= decr;

        tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
        tp->snd_cwnd_stamp = tcp_time_stamp;
    }
}
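The rate-halving above removes one segment from cwnd for every two qualifying ACKs, so over roughly one window's worth of ACKs cwnd drops to about half. A userspace sketch, assuming every ACK passes the flag checks and the pipe stays full (so the min(cwnd, in_flight + 1) clamp never bites):

#include <stdio.h>
#include <stdint.h>

/* Userspace model of the counting in tcp_cwnd_down(): one segment comes off
 * cwnd for every two ACKs processed while in CWR/Recovery. */
static void cwnd_down(uint32_t *snd_cwnd, uint32_t *snd_cwnd_cnt,
                      uint32_t in_flight, uint32_t cwnd_min)
{
    uint32_t decr = *snd_cwnd_cnt + 1;

    *snd_cwnd_cnt = decr & 1;          /* keep the leftover half-step */
    decr >>= 1;

    if (decr && *snd_cwnd > cwnd_min)
        *snd_cwnd -= decr;

    if (*snd_cwnd > in_flight + 1)     /* never exceed in_flight + 1 */
        *snd_cwnd = in_flight + 1;
}

int main(void)
{
    uint32_t cwnd = 20, cnt = 0;

    /* Assume the sender keeps the pipe full, so in_flight tracks cwnd. */
    for (int ack = 1; ack <= 20; ack++) {
        cwnd_down(&cwnd, &cnt, cwnd, 2);
        printf("after ACK %2d: cwnd=%u\n", ack, cwnd);
    }
    return 0;
}

Running it shows cwnd stepping from 20 down to 10 over twenty ACKs, i.e. the halving that classic Reno would apply in a single step is spread across one round trip.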
 
 
static void tcp_undo_cwr (struct sock *sk, const int undo)
{
    struct tcp_sock *tp = tcp_sk(sk);
 
    if (tp->prior_ssthresh) {
        const struct inet_connection_sock *icsk = inet_csk(sk);
 
        if (icsk->icsk_ca_ops->undo_cwnd)
            tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
        else
            tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
 
        if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
            tp->snd_ssthresh = tp->prior_ssthresh;
            TCP_ECN_withdraw_cwr(tp);
        }
    } else {
        tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
    }
    tcp_moderate_cwnd(tp);
    tp->snd_cwnd_stamp = tcp_time_stamp;
 
    /* There is something screwy going on with the retrans hints after
    an undo */
    tcp_clear_all_retrans_hints(tp);
}
 
 
static inline void tcp_complete_cwr(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
    tp->snd_cwnd_stamp = tcp_time_stamp;
    tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
 