深入理解TCP协议及其源代码

1、TCP协议栈从上到下提供的接口

深入理解TCP协议及其源代码_第1张图片

2、三次握手

  结构体变量struct proto tcp_prot指定了TCP协议栈的访问接口函数

 1 struct proto tcp_prot = {
 2     .name                     = "TCP",
 3     .owner                    = THIS_MODULE,
 4     .close                    = tcp_close,
 5     .pre_connect              = tcp_v4_pre_connect,
 6     .connect                  = tcp_v4_connect,
 7     .disconnect               = tcp_disconnect,
 8     .accept                   = inet_csk_accept,
 9     .ioctl                    = tcp_ioctl,
10     .init                     = tcp_v4_init_sock,
11     .destroy                  = tcp_v4_destroy_sock,
12     .shutdown                 = tcp_shutdown,
13     .setsockopt               = tcp_setsockopt,
14     .getsockopt               = tcp_getsockopt,
15     .keepalive                = tcp_set_keepalive,
16     .recvmsg                  = tcp_recvmsg,
17     .sendmsg                  = tcp_sendmsg,
18     .sendpage                 = tcp_sendpage,
19     .backlog_rcv              = tcp_v4_do_rcv,
20     .release_cb               = tcp_release_cb,
21     .hash                     = inet_hash,
22     .unhash                   = inet_unhash,
23     .get_port                 = inet_csk_get_port,
24     .enter_memory_pressure    = tcp_enter_memory_pressure,
25     .leave_memory_pressure    = tcp_leave_memory_pressure,
26     .stream_memory_free       = tcp_stream_memory_free,
27     .sockets_allocated        = &tcp_sockets_allocated,
28     .orphan_count             = &tcp_orphan_count,
29     .memory_allocated         = &tcp_memory_allocated,
30     .memory_pressure          = &tcp_memory_pressure,
31     .sysctl_mem               = sysctl_tcp_mem,
32     .sysctl_wmem_offset       = offsetof(struct net, ipv4.sysctl_tcp_wmem),
33     .sysctl_rmem_offset       = offsetof(struct net, ipv4.sysctl_tcp_rmem),
34     .max_header               = MAX_TCP_HEADER,
35     .obj_size                 = sizeof(struct tcp_sock),
36     .slab_flags               = SLAB_TYPESAFE_BY_RCU,
37     .twsk_prot                = &tcp_timewait_sock_ops,
38     .rsk_prot                 = &tcp_request_sock_ops,
39     .h.hashinfo               = &tcp_hashinfo,
40     .no_autobind              = true,
41     ...
42 };

2.1 首先客户端发送SYN报文

  tcp_v4_connect函数

深入理解TCP协议及其源代码_第2张图片

  1 /* This will initiate an outgoing connection. */
  2 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
  3 {
  4     struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
  5     struct inet_sock *inet = inet_sk(sk);
  6     struct tcp_sock *tp = tcp_sk(sk);
  7     __be16 orig_sport, orig_dport;
  8     __be32 daddr, nexthop;
  9     struct flowi4 *fl4;
 10     struct rtable *rt;
 11     int err;
 12     struct ip_options_rcu *inet_opt;
 13     struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 14 
 15     if (addr_len < sizeof(struct sockaddr_in))
 16         return -EINVAL;
 17 
 18     if (usin->sin_family != AF_INET)
 19         return -EAFNOSUPPORT;
 20 
 21     nexthop = daddr = usin->sin_addr.s_addr;
 22     inet_opt = rcu_dereference_protected(inet->inet_opt,
 23                          lockdep_sock_is_held(sk));
 24     if (inet_opt && inet_opt->opt.srr) {
 25         if (!daddr)
 26             return -EINVAL;
 27         nexthop = inet_opt->opt.faddr;
 28     }
 29 
 30     orig_sport = inet->inet_sport;
 31     orig_dport = usin->sin_port;
 32     fl4 = &inet->cork.fl.u.ip4;
 33     rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 34                   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 35                   IPPROTO_TCP,
 36                   orig_sport, orig_dport, sk);
 37     if (IS_ERR(rt)) {
 38         err = PTR_ERR(rt);
 39         if (err == -ENETUNREACH)
 40             IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 41         return err;
 42     }
 43 
 44     if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 45         ip_rt_put(rt);
 46         return -ENETUNREACH;
 47     }
 48 
 49     if (!inet_opt || !inet_opt->opt.srr)
 50         daddr = fl4->daddr;
 51 
 52     if (!inet->inet_saddr)
 53         inet->inet_saddr = fl4->saddr;
 54     sk_rcv_saddr_set(sk, inet->inet_saddr);
 55 
 56     if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 57         /* Reset inherited state */
 58         tp->rx_opt.ts_recent       = 0;
 59         tp->rx_opt.ts_recent_stamp = 0;
 60         if (likely(!tp->repair))
 61             tp->write_seq       = 0;
 62     }
 63 
 64     inet->inet_dport = usin->sin_port;
 65     sk_daddr_set(sk, daddr);
 66 
 67     inet_csk(sk)->icsk_ext_hdr_len = 0;
 68     if (inet_opt)
 69         inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 70 
 71     tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 72 
 73     /* Socket identity is still unknown (sport may be zero).
 74      * However we set state to SYN-SENT and not releasing socket
 75      * lock select source port, enter ourselves into the hash tables and
 76      * complete initialization after this.
 77      */
 78     tcp_set_state(sk, TCP_SYN_SENT);
 79     err = inet_hash_connect(tcp_death_row, sk);
 80     if (err)
 81         goto failure;
 82 
 83     sk_set_txhash(sk);
 84 
 85     rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 86                    inet->inet_sport, inet->inet_dport, sk);
 87     if (IS_ERR(rt)) {
 88         err = PTR_ERR(rt);
 89         rt = NULL;
 90         goto failure;
 91     }
 92     /* OK, now commit destination to socket.  */
 93     sk->sk_gso_type = SKB_GSO_TCPV4;
 94     sk_setup_caps(sk, &rt->dst);
 95     rt = NULL;
 96 
 97     if (likely(!tp->repair)) {
 98         if (!tp->write_seq)
 99             tp->write_seq = secure_tcp_seq(inet->inet_saddr,
100                                inet->inet_daddr,
101                                inet->inet_sport,
102                                usin->sin_port);
103         tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
104                          inet->inet_saddr,
105                          inet->inet_daddr);
106     }
107 
108     inet->inet_id = tp->write_seq ^ jiffies;
109 
110     if (tcp_fastopen_defer_connect(sk, &err))
111         return err;
112     if (err)
113         goto failure;
114 
115     err = tcp_connect(sk);
116 
117     if (err)
118         goto failure;
119 
120     return 0;
121 
122 failure:
123     /*
124      * This unhashes the socket and releases the local port,
125      * if necessary.
126      */
127     tcp_set_state(sk, TCP_CLOSE);
128     ip_rt_put(rt);
129     sk->sk_route_caps = 0;
130     inet->inet_dport = 0;
131     return err;
132 }

2.2 另一头服务端accept等待连接请求

  inet_csk_accept函数

深入理解TCP协议及其源代码_第3张图片

 1 /*
 2  * This will accept the next outstanding connection.
 3  */
 4 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
 5 {
 6     struct inet_connection_sock *icsk = inet_csk(sk);
 7     struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 8     struct request_sock *req;
 9     struct sock *newsk;
10     int error;
11 
12     lock_sock(sk);
13 
14     /* We need to make sure that this socket is listening,
15      * and that it has something pending.
16      */
17     error = -EINVAL;
18     if (sk->sk_state != TCP_LISTEN)
19         goto out_err;
20 
21     /* Find already established connection */
22     if (reqsk_queue_empty(queue)) {
23         long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
24 
25         /* If this is a non blocking socket don't sleep */
26         error = -EAGAIN;
27         if (!timeo)
28             goto out_err;
29 
30         error = inet_csk_wait_for_connect(sk, timeo);
31         if (error)
32             goto out_err;
33     }
34     req = reqsk_queue_remove(queue, sk);
35     newsk = req->sk;
36 
37     if (sk->sk_protocol == IPPROTO_TCP &&
38         tcp_rsk(req)->tfo_listener) {
39         spin_lock_bh(&queue->fastopenq.lock);
40         if (tcp_rsk(req)->tfo_listener) {
41             /* We are still waiting for the final ACK from 3WHS
42              * so can't free req now. Instead, we set req->sk to
43              * NULL to signify that the child socket is taken
44              * so reqsk_fastopen_remove() will free the req
45              * when 3WHS finishes (or is aborted).
46              */
47             req->sk = NULL;
48             req = NULL;
49         }
50         spin_unlock_bh(&queue->fastopenq.lock);
51     }
52 out:
53     release_sock(sk);
54     if (req)
55         reqsk_put(req);
56     return newsk;
57 out_err:
58     newsk = NULL;
59     req = NULL;
60     *err = error;
61     goto out;
62 }

  inet_csk_wait_for_connect函数

 1 /*
 2  * Wait for an incoming connection, avoid race conditions. This must be called
 3  * with the socket locked.
 4  */
 5 static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 6 {
 7     struct inet_connection_sock *icsk = inet_csk(sk);
 8     DEFINE_WAIT(wait);
 9     int err;
10 
11     /*
12      * True wake-one mechanism for incoming connections: only
13      * one process gets woken up, not the 'whole herd'.
14      * Since we do not 'race & poll' for established sockets
15      * anymore, the common case will execute the loop only once.
16      *
17      * Subtle issue: "add_wait_queue_exclusive()" will be added
18      * after any current non-exclusive waiters, and we know that
19      * it will always _stay_ after any new non-exclusive waiters
20      * because all non-exclusive waiters are added at the
21      * beginning of the wait-queue. As such, it's ok to "drop"
22      * our exclusiveness temporarily when we get woken up without
23      * having to remove and re-insert us on the wait queue.
24      */
25     for (;;) {
26         prepare_to_wait_exclusive(sk_sleep(sk), &wait,
27                       TASK_INTERRUPTIBLE);
28         release_sock(sk);
29         if (reqsk_queue_empty(&icsk->icsk_accept_queue))
30             timeo = schedule_timeout(timeo);
31         sched_annotate_sleep();
32         lock_sock(sk);
33         err = 0;
34         if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
35             break;
36         err = -EINVAL;
37         if (sk->sk_state != TCP_LISTEN)
38             break;
39         err = sock_intr_errno(timeo);
40         if (signal_pending(current))
41             break;
42         err = -EAGAIN;
43         if (!timeo)
44             break;
45     }
46     finish_wait(sk_sleep(sk), &wait);
47     return err;
48 }

2.3 三次握手中携带SYN/ACK的TCP头数据的发送和接收

  TCP/IP协议栈初始化

  inet_init_net函数

 1 static __net_init int inet_init_net(struct net *net)
 2 {
 3     /*
 4      * Set defaults for local port range
 5      */
 6     seqlock_init(&net->ipv4.ip_local_ports.lock);
 7     net->ipv4.ip_local_ports.range[0] =  32768;
 8     net->ipv4.ip_local_ports.range[1] =  60999;
 9 
10     seqlock_init(&net->ipv4.ping_group_range.lock);
11     /*
12      * Sane defaults - nobody may create ping sockets.
13      * Boot scripts should set this to distro-specific group.
14      */
15     net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
16     net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
17 
18     /* Default values for sysctl-controlled parameters.
19      * We set them here, in case sysctl is not compiled.
20      */
21     net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
22     net->ipv4.sysctl_ip_fwd_update_priority = 1;
23     net->ipv4.sysctl_ip_dynaddr = 0;
24     net->ipv4.sysctl_ip_early_demux = 1;
25     net->ipv4.sysctl_udp_early_demux = 1;
26     net->ipv4.sysctl_tcp_early_demux = 1;
27 #ifdef CONFIG_SYSCTL
28     net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
29 #endif
30 
31     /* Some igmp sysctl, whose values are always used */
32     net->ipv4.sysctl_igmp_max_memberships = 20;
33     net->ipv4.sysctl_igmp_max_msf = 10;
34     /* IGMP reports for link-local multicast groups are enabled by default */
35     net->ipv4.sysctl_igmp_llm_reports = 1;
36     net->ipv4.sysctl_igmp_qrv = 2;
37 
38     return 0;
39 }

2.4 服务端接收客户端发来的SYN,发送SYN+ACK

  tcp_v4_do_rcv函数

 1 /* The socket must have it's spinlock held when we get
 2  * here, unless it is a TCP_LISTEN socket.
 3  *
 4  * We have a potential double-lock case here, so even when
 5  * doing backlog processing we use the BH locking scheme.
 6  * This is because we cannot sleep with the original spinlock
 7  * held.
 8  */
 9 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
10 {
11     struct sock *rsk;
12 
13     if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
14         struct dst_entry *dst = sk->sk_rx_dst;
15 
16         sock_rps_save_rxhash(sk, skb);
17         sk_mark_napi_id(sk, skb);
18         if (dst) {
19             if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
20                 !dst->ops->check(dst, 0)) {
21                 dst_release(dst);
22                 sk->sk_rx_dst = NULL;
23             }
24         }
25         tcp_rcv_established(sk, skb);
26         return 0;
27     }
28 
29     if (tcp_checksum_complete(skb))
30         goto csum_err;
31 
32     if (sk->sk_state == TCP_LISTEN) {
33         struct sock *nsk = tcp_v4_cookie_check(sk, skb);
34 
35         if (!nsk)
36             goto discard;
37         if (nsk != sk) {
38             if (tcp_child_process(sk, nsk, skb)) {
39                 rsk = nsk;
40                 goto reset;
41             }
42             return 0;
43         }
44     } else
45         sock_rps_save_rxhash(sk, skb);
46 
47     if (tcp_rcv_state_process(sk, skb)) {
48         rsk = sk;
49         goto reset;
50     }
51     return 0;
52 
53 reset:
54     tcp_v4_send_reset(rsk, skb);
55 discard:
56     kfree_skb(skb);
57     /* Be careful here. If this function gets more complicated and
58      * gcc suffers from register pressure on the x86, sk (in %ebx)
59      * might be destroyed here. This current version compiles correctly,
60      * but you have been warned.
61      */
62     return 0;
63 
64 csum_err:
65     TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
66     TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
67     goto discard;
68 }

2.5 客户端收到服务端的SYN+ACK,发送ACK

  tcp_rcv_synsent_state_process函数

1 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
2                                          const struct tcphdr *th, unsigned int len)
3 {    
4 ...
5            tcp_send_ack(sk);
6 ...
7 }

  到这里,我们已经从Linux网络核心的角度、在架构层面整体理解了三次握手,即携带SYN/ACK标志的报文的收发过程。

3、gdb调试过程

  深入理解TCP协议及其源代码_第4张图片

  深入理解TCP协议及其源代码_第5张图片

(gdb) c
Continuing.

Breakpoint 1, __sys_socket (family=2, type=1, protocol=0) at net/socket.c:1346
1346        retval = sock_create(family, type, protocol, &sock);
(gdb) c
Continuing.

Breakpoint 2, __sys_accept4 (fd=4, upeer_sockaddr=0xffbb869c, 
    upeer_addrlen=0xffbb867c, flags=0) at net/socket.c:1542
1542    {
(gdb) c
Continuing.

Breakpoint 1, __sys_socket (family=2, type=1, protocol=0) at net/socket.c:1346
1346        retval = sock_create(family, type, protocol, &sock);
(gdb) c
Continuing.

Breakpoint 3, tcp_v4_connect (sk=0xffff888006498880, uaddr=0xffffc90000043e20, 
    addr_len=16) at net/ipv4/tcp_ipv4.c:203
203    {
(gdb) c
Continuing.

Breakpoint 4, tcp_v4_rcv (skb=0xffff8880068ed4e0) at net/ipv4/tcp_ipv4.c:1782
1782    {
(gdb) c
Continuing.

Breakpoint 4, tcp_v4_rcv (skb=0xffff888007584000) at net/ipv4/tcp_ipv4.c:1782
1782    {
(gdb) c
Continuing.

Breakpoint 4, tcp_v4_rcv (skb=0xffff888007584100) at net/ipv4/tcp_ipv4.c:1782
1782    {
(gdb) c
Continuing.

Breakpoint 4, tcp_v4_rcv (skb=0xffff8880068ed4e0) at net/ipv4/tcp_ipv4.c:1782
1782    {
(gdb) c
Continuing.

Breakpoint 4, tcp_v4_rcv (skb=0xffff888007584100) at net/ipv4/tcp_ipv4.c:1782
1782    {
(gdb) c
Continuing.

Breakpoint 2, __sys_accept4 (fd=4, upeer_sockaddr=0xffbb869c, 
    upeer_addrlen=0xffbb867c, flags=0) at net/socket.c:1542
1542    {
(gdb) 

你可能感兴趣的:(深入理解TCP协议及其源代码)