A brief look at the default initial receive window of TCP

The kernel version referenced here is 3.10.


In the Linux TCP/IP implementation, when a TCP socket is established, its initial receive window size is controlled by sysctl_tcp_default_init_rwnd; the value is the number of MSS-sized segments the initial window should be able to hold.

Even if you have not come across this before, you can see it in the code below.

/kernel/net/ipv4/tcp_input.c

/*
 * Make sure the socket's receive buffer can back an initial receive
 * window of sysctl_tcp_default_init_rwnd segments.  The buffer is only
 * ever raised here, and never above sysctl_tcp_rmem[2].
 */
static void tcp_fixup_rcvbuf(struct sock *sk)
{
    u32 adv_mss = tcp_sk(sk)->advmss;
    u32 nr_segs = sysctl_tcp_default_init_rwnd;
    int seg_mem;
    int wanted;

    /* Use the configured segment count as is while mss <= 1460.
     * For a larger mss scale it down by 1460/mss, but never go
     * below two segments.
     */
    if (adv_mss > 1460) {
        nr_segs = (1460 * nr_segs) / adv_mss;
        if (nr_segs < 2)
            nr_segs = 2;
    }

    /* Start from the true size of one full segment and grow it in
     * 128-byte steps until the window space it yields covers one mss.
     */
    for (seg_mem = SKB_TRUESIZE(adv_mss + MAX_TCP_HEADER);
         tcp_win_from_space(seg_mem) < adv_mss;
         seg_mem += 128)
        ;

    wanted = seg_mem * nr_segs;

    /* Nothing to do if the buffer is already big enough. */
    if (sk->sk_rcvbuf >= wanted)
        return;

    sk->sk_rcvbuf = min(wanted, sysctl_tcp_rmem[2]);
}
Here the code assumes an MSS of about 1460 bytes. For the most common case, Ethernet with an MTU of 1500 bytes, that is reasonable (1500 bytes minus 20 bytes of IP header and 20 bytes of TCP header).

For a larger MSS (more than 1460 bytes), the segment count is scaled down in proportion (1460 * count / mss), but it never drops below 2 segments.

Then it makes sure that the memory reserved for one segment yields at least one MSS of window space: the size is increased step by step until tcp_win_from_space() of it is no smaller than the MSS. I am not sure, though, why the step is 128 bytes rather than some other value.

After multiplying this per-segment memory size by the number of segments in the window, the socket's receive buffer size (sk_rcvbuf) is raised if it is smaller than the result.

The new size is never larger than sysctl_tcp_rmem[2], the maximum value of the TCP receive memory (tcp_rmem) setting.

/kernel/include/net/tcp.h

/*
 * Convert an amount of buffer space into the window size it can back.
 *
 * With sysctl_tcp_adv_win_scale > 0 the result is space minus
 * space / 2^scale; with a value <= 0 it is space / 2^(-scale).
 */
static inline int tcp_win_from_space(int space)
{
    if (sysctl_tcp_adv_win_scale > 0)
        return space - (space >> sysctl_tcp_adv_win_scale);

    return space >> (-sysctl_tcp_adv_win_scale);
}
With the function above, we can see that for a given connection and a given TCP memory configuration, the larger sysctl_tcp_default_init_rwnd is, the larger the receive window of the socket.

Now let's have a look at where it is defined, how it is used, and how this value can be configured.

In the header file you will find that it is only declared (extern); the definition lives elsewhere.

/kernel/include/net/tcp.h

/* 
 * Never offer a window over 32767 without using window scaling. Some
 * poor stacks do signed 16bit maths! 
 */
#define MAX_TCP_WINDOW      32767U

/* Offer an initial receive window of 20 mss. */
#define TCP_DEFAULT_INIT_RCVWND 20

/* Minimal accepted MSS. It is (60+60+8) - (20+20). */
#define TCP_MIN_MSS     88U

/* The least MTU to use for probing */
#define TCP_BASE_MSS        512

extern int sysctl_tcp_default_init_rwnd;
/kernel/net/ipv4/tcp_input.c

int sysctl_tcp_default_init_rwnd __read_mostly = TCP_DEFAULT_INIT_RCVWND;
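So without any customization the window is sized for 20 segments (TCP_DEFAULT_INIT_RCVWND). From the information above, the smallest segment count the code can produce is 2 and the smallest accepted MSS is 88 bytes (TCP_MIN_MSS); note, however, that the 2-segment floor only applies when the MSS is larger than 1460 bytes, so the two minimums do not combine into a window of 88*2 bytes.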

/kernel/net/ipv4/tcp_input.c

void tcp_init_buffer_space(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    int maxwin;

    if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
        tcp_fixup_rcvbuf(sk);
    if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
        tcp_fixup_sndbuf(sk);
... 
tcp_fixup_rcvbuf() is called by tcp_init_buffer_space(), provided that the user has not locked the receive buffer size (SOCK_RCVBUF_LOCK; the user-configuration part is shown later).

tcp_init_buffer_space() in turn is called from two places: one is when an ACK is received in the SYN_SENT state, the other is when an acceptable ACK is received in the SYN_RECV state:

void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);

    tcp_set_state(sk, TCP_ESTABLISHED);
...    /* Prevent spurious tcp_cwnd_restart() on first data
     * packet.      
     */
    tp->lsndtime = tcp_time_stamp;

    tcp_init_buffer_space(sk);
...

static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                     const struct tcphdr *th, unsigned int len)
{
...
    if (th->ack) {
...
        tcp_mtup_init(sk);
        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
        tcp_initialize_rcv_mss(sk);

        /* Remember, tcp_poll() does not lock socket!
         * Change state from SYN-SENT only after copied_seq
         * is initialized. */
        tp->copied_seq = tp->rcv_nxt;

        smp_mb();

        tcp_finish_connect(sk, skb);
...
discard:
            __kfree_skb(skb);
            return 0;
        } else {
            tcp_send_ack(sk);
        }
        return -1;
    }


int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
              const struct tcphdr *th, unsigned int len)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct request_sock *req;
    int queued = 0;

    tp->rx_opt.saw_tstamp = 0;

    switch (sk->sk_state) {
...
    case TCP_SYN_SENT:
        queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
        if (queued >= 0)
            return queued;

        /* Do step6 onward by hand. */
        tcp_urg(sk, skb, th);
        __kfree_skb(skb);
        tcp_data_snd_check(sk);
        return 0;
    }

    req = tp->fastopen_rsk;

...
    /* step 5: check the ACK field */
    if (true) {
        int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
                          FLAG_UPDATE_TS_RECENT) > 0;

        switch (sk->sk_state) {
        case TCP_SYN_RECV:
            if (acceptable) {
                /* Once we leave TCP_SYN_RECV, we no longer
                 * need req so release it.
                 */
                if (req) {
                    tcp_synack_rtt_meas(sk, req);
                    tp->total_retrans = req->num_retrans;

                    reqsk_fastopen_remove(sk, req, false);
                } else {
                    /* Make sure socket is routed, for
                     * correct metrics.
                     */
                    icsk->icsk_af_ops->rebuild_header(sk);
                    tcp_init_congestion_control(sk);

                    tcp_mtup_init(sk);
                    tcp_init_buffer_space(sk);
                    tp->copied_seq = tp->rcv_nxt;
                }
                smp_mb();
                tcp_set_state(sk, TCP_ESTABLISHED);

 From the code above you can see that the receive buffer is set up right around the transition to ESTABLISHED: just before tcp_set_state() in the SYN_RECV path, and just after it in tcp_finish_connect().

So how can a user change this setting? We have not seen any user-facing interface yet.

/net/ipv4/sysctl_net_ipv4.c

static struct ctl_table ipv4_table[] = {
...
    {
        .procname       = "tcp_default_init_rwnd",
        .data           = &sysctl_tcp_default_init_rwnd,
        .maxlen         = sizeof(int),
        .mode           = 0644,
        .proc_handler   = proc_tcp_default_init_rwnd
    },   

/*
 * Init-time registration of the IPv4 sysctl table (excerpt; the lines
 * marked "..." are elided).
 *
 * Returns 0 on success, or -ENOMEM when register_net_sysctl() hands
 * back NULL.
 */
static __init int sysctl_ipv4_init(void)
{
    struct ctl_table_header *hdr;
...
    /* Publish ipv4_table under the "net/ipv4" sysctl path of init_net. */
    hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
    if (hdr == NULL)
        return -ENOMEM;
...
    return 0;
}

__initcall(sysctl_ipv4_init);
In this way a file node, "/proc/sys/net/ipv4/tcp_default_init_rwnd", is registered under the proc file system, and it can be modified by any user with sufficient permission (the mode is 0644).

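You will notice that this init function is registered with "__initcall", while inet_init(), the function that calls tcp_v4_init(), is registered with "fs_initcall". As the macros from init.h below show, __initcall() is an alias for device_initcall() (level 6) and fs_initcall() is level 5, which makes sure sysctl_ipv4_init() runs after inet_init().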

/*
 * Excerpt of inet_init(); the lines marked "..." are elided.  The
 * visible part calls ip_init(), tcp_v4_init() and tcp_init(), in that
 * order.
 */
static int __init inet_init(void)
{
...
    ip_init();

    tcp_v4_init();

    /* Setup TCP slab cache for open requests. */
    tcp_init();
...
}
fs_initcall(inet_init);
/kernel/include/linux/init.h

/* initcalls are now grouped by functionality into separate 
 * subsections. Ordering inside the subsections is determined
 * by link order. 
 * For backwards compatibility, initcall() puts the call in 
 * the device init subsection.
 *
 * The `id' arg to __define_initcall() is needed so that multiple initcalls
 * can point at the same handler without causing duplicate-symbol build errors.
 */

#define __define_initcall(fn, id) \
    static initcall_t __initcall_##fn##id __used \
    __attribute__((__section__(".initcall" #id ".init"))) = fn
...
#define pure_initcall(fn)       __define_initcall(fn, 0)

#define core_initcall(fn)       __define_initcall(fn, 1)
#define core_initcall_sync(fn)      __define_initcall(fn, 1s)
#define postcore_initcall(fn)       __define_initcall(fn, 2)
#define postcore_initcall_sync(fn)  __define_initcall(fn, 2s)
#define arch_initcall(fn)       __define_initcall(fn, 3)
#define arch_initcall_sync(fn)      __define_initcall(fn, 3s)
#define subsys_initcall(fn)     __define_initcall(fn, 4)
#define subsys_initcall_sync(fn)    __define_initcall(fn, 4s)
#define fs_initcall(fn)         __define_initcall(fn, 5)
#define fs_initcall_sync(fn)        __define_initcall(fn, 5s)
#define rootfs_initcall(fn)     __define_initcall(fn, rootfs)
#define device_initcall(fn)     __define_initcall(fn, 6)
#define device_initcall_sync(fn)    __define_initcall(fn, 6s)
#define late_initcall(fn)       __define_initcall(fn, 7)
#define late_initcall_sync(fn)      __define_initcall(fn, 7s)

#define __initcall(fn) device_initcall(fn) 

And now let's come back to "SOCK_RCVBUF_LOCK", which we left aside earlier; it indicates that the user has configured the receive buffer.

int sock_setsockopt(struct socket *sock, int level, int optname,
            char __user *optval, unsigned int optlen)
{
    struct sock *sk = sock->sk;
    int val;
...
    if (get_user(val, (int __user *)optval))
        return -EFAULT;

    valbool = val ? 1 : 0;

    lock_sock(sk);

    switch (optname) {
...
    case SO_RCVBUF:
        val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf: 
        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
        /*
         * We double it on the way in to account for
         * "struct sk_buff" etc. overhead.   Applications
         * assume that the SO_RCVBUF setting they make will
         * allow that much actual data to be received on that
         * socket.
         *
         * Applications are unaware that "struct sk_buff" and
         * other overheads allocate from the receive buffer
         * during socket buffer allocation.
         *
         * And after considering the possible alternatives,
         * returning the value we actually used in getsockopt
         * is the most desirable behavior.
         */
        sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
        break;

    case SO_RCVBUFFORCE:
        if (!capable(CAP_NET_ADMIN)) {
            ret = -EPERM;
            break;
        }
        goto set_rcvbuf;

 
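Here the user-supplied value is still limited by two other settings: sysctl_rmem_max and SOCK_MIN_RCVBUF. For SO_RCVBUF the value is first clamped to sysctl_rmem_max, then doubled, and the result is never smaller than SOCK_MIN_RCVBUF; SO_RCVBUFFORCE requires CAP_NET_ADMIN and jumps to the set_rcvbuf label, past the sysctl_rmem_max clamp. The macro SOCK_MIN_RCVBUF is defined as (2048 + sizeof(struct sk_buff)) in include/net/sock.h, and sysctl_rmem_max is a sysctl-configurable value, just like sysctl_tcp_default_init_rwnd.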

你可能感兴趣的:(to tell something in brief about default init receive window of TCP)