1、TCP协议栈从上到下提供的接口
2、 三次握手
结构体变量struct proto tcp_prot指定了TCP协议栈的访问接口函数
1 struct proto tcp_prot = { 2 .name = "TCP", 3 .owner = THIS_MODULE, 4 .close = tcp_close, 5 .pre_connect = tcp_v4_pre_connect, 6 .connect = tcp_v4_connect, 7 .disconnect = tcp_disconnect, 8 .accept = inet_csk_accept, 9 .ioctl = tcp_ioctl, 10 .init = tcp_v4_init_sock, 11 .destroy = tcp_v4_destroy_sock, 12 .shutdown = tcp_shutdown, 13 .setsockopt = tcp_setsockopt, 14 .getsockopt = tcp_getsockopt, 15 .keepalive = tcp_set_keepalive, 16 .recvmsg = tcp_recvmsg, 17 .sendmsg = tcp_sendmsg, 18 .sendpage = tcp_sendpage, 19 .backlog_rcv = tcp_v4_do_rcv, 20 .release_cb = tcp_release_cb, 21 .hash = inet_hash, 22 .unhash = inet_unhash, 23 .get_port = inet_csk_get_port, 24 .enter_memory_pressure = tcp_enter_memory_pressure, 25 .leave_memory_pressure = tcp_leave_memory_pressure, 26 .stream_memory_free = tcp_stream_memory_free, 27 .sockets_allocated = &tcp_sockets_allocated, 28 .orphan_count = &tcp_orphan_count, 29 .memory_allocated = &tcp_memory_allocated, 30 .memory_pressure = &tcp_memory_pressure, 31 .sysctl_mem = sysctl_tcp_mem, 32 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 33 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 34 .max_header = MAX_TCP_HEADER, 35 .obj_size = sizeof(struct tcp_sock), 36 .slab_flags = SLAB_TYPESAFE_BY_RCU, 37 .twsk_prot = &tcp_timewait_sock_ops, 38 .rsk_prot = &tcp_request_sock_ops, 39 .h.hashinfo = &tcp_hashinfo, 40 .no_autobind = true,
2.1 首先客户端发送SYN报文
tcp_v4_connect函数
1 /* This will initiate an outgoing connection. */ 2 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 3 { 4 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 5 struct inet_sock *inet = inet_sk(sk); 6 struct tcp_sock *tp = tcp_sk(sk); 7 __be16 orig_sport, orig_dport; 8 __be32 daddr, nexthop; 9 struct flowi4 *fl4; 10 struct rtable *rt; 11 int err; 12 struct ip_options_rcu *inet_opt; 13 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 14 15 if (addr_len < sizeof(struct sockaddr_in)) 16 return -EINVAL; 17 18 if (usin->sin_family != AF_INET) 19 return -EAFNOSUPPORT; 20 21 nexthop = daddr = usin->sin_addr.s_addr; 22 inet_opt = rcu_dereference_protected(inet->inet_opt, 23 lockdep_sock_is_held(sk)); 24 if (inet_opt && inet_opt->opt.srr) { 25 if (!daddr) 26 return -EINVAL; 27 nexthop = inet_opt->opt.faddr; 28 } 29 30 orig_sport = inet->inet_sport; 31 orig_dport = usin->sin_port; 32 fl4 = &inet->cork.fl.u.ip4; 33 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 34 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 35 IPPROTO_TCP, 36 orig_sport, orig_dport, sk); 37 if (IS_ERR(rt)) { 38 err = PTR_ERR(rt); 39 if (err == -ENETUNREACH) 40 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 41 return err; 42 } 43 44 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 45 ip_rt_put(rt); 46 return -ENETUNREACH; 47 } 48 49 if (!inet_opt || !inet_opt->opt.srr) 50 daddr = fl4->daddr; 51 52 if (!inet->inet_saddr) 53 inet->inet_saddr = fl4->saddr; 54 sk_rcv_saddr_set(sk, inet->inet_saddr); 55 56 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 57 /* Reset inherited state */ 58 tp->rx_opt.ts_recent = 0; 59 tp->rx_opt.ts_recent_stamp = 0; 60 if (likely(!tp->repair)) 61 tp->write_seq = 0; 62 } 63 64 inet->inet_dport = usin->sin_port; 65 sk_daddr_set(sk, daddr); 66 67 inet_csk(sk)->icsk_ext_hdr_len = 0; 68 if (inet_opt) 69 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 70 71 tp->rx_opt.mss_clamp = 
TCP_MSS_DEFAULT; 72 73 /* Socket identity is still unknown (sport may be zero). 74 * However we set state to SYN-SENT and not releasing socket 75 * lock select source port, enter ourselves into the hash tables and 76 * complete initialization after this. 77 */ 78 tcp_set_state(sk, TCP_SYN_SENT); 79 err = inet_hash_connect(tcp_death_row, sk); 80 if (err) 81 goto failure; 82 83 sk_set_txhash(sk); 84 85 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 86 inet->inet_sport, inet->inet_dport, sk); 87 if (IS_ERR(rt)) { 88 err = PTR_ERR(rt); 89 rt = NULL; 90 goto failure; 91 } 92 /* OK, now commit destination to socket. */ 93 sk->sk_gso_type = SKB_GSO_TCPV4; 94 sk_setup_caps(sk, &rt->dst); 95 rt = NULL; 96 97 if (likely(!tp->repair)) { 98 if (!tp->write_seq) 99 tp->write_seq = secure_tcp_seq(inet->inet_saddr, 100 inet->inet_daddr, 101 inet->inet_sport, 102 usin->sin_port); 103 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 104 inet->inet_saddr, 105 inet->inet_daddr); 106 } 107 108 inet->inet_id = tp->write_seq ^ jiffies; 109 110 if (tcp_fastopen_defer_connect(sk, &err)) 111 return err; 112 if (err) 113 goto failure; 114 115 err = tcp_connect(sk); 116 117 if (err) 118 goto failure; 119 120 return 0; 121 122 failure: 123 /* 124 * This unhashes the socket and releases the local port, 125 * if necessary. 126 */ 127 tcp_set_state(sk, TCP_CLOSE); 128 ip_rt_put(rt); 129 sk->sk_route_caps = 0; 130 inet->inet_dport = 0; 131 return err; 132 }
2.2 另一头服务端accept等待连接请求
inet_csk_accept函数
1 /* 2 * This will accept the next outstanding connection. 3 */ 4 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) 5 { 6 struct inet_connection_sock *icsk = inet_csk(sk); 7 struct request_sock_queue *queue = &icsk->icsk_accept_queue; 8 struct request_sock *req; 9 struct sock *newsk; 10 int error; 11 12 lock_sock(sk); 13 14 /* We need to make sure that this socket is listening, 15 * and that it has something pending. 16 */ 17 error = -EINVAL; 18 if (sk->sk_state != TCP_LISTEN) 19 goto out_err; 20 21 /* Find already established connection */ 22 if (reqsk_queue_empty(queue)) { 23 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); 24 25 /* If this is a non blocking socket don't sleep */ 26 error = -EAGAIN; 27 if (!timeo) 28 goto out_err; 29 30 error = inet_csk_wait_for_connect(sk, timeo); 31 if (error) 32 goto out_err; 33 } 34 req = reqsk_queue_remove(queue, sk); 35 newsk = req->sk; 36 37 if (sk->sk_protocol == IPPROTO_TCP && 38 tcp_rsk(req)->tfo_listener) { 39 spin_lock_bh(&queue->fastopenq.lock); 40 if (tcp_rsk(req)->tfo_listener) { 41 /* We are still waiting for the final ACK from 3WHS 42 * so can't free req now. Instead, we set req->sk to 43 * NULL to signify that the child socket is taken 44 * so reqsk_fastopen_remove() will free the req 45 * when 3WHS finishes (or is aborted). 46 */ 47 req->sk = NULL; 48 req = NULL; 49 } 50 spin_unlock_bh(&queue->fastopenq.lock); 51 } 52 out: 53 release_sock(sk); 54 if (req) 55 reqsk_put(req); 56 return newsk; 57 out_err: 58 newsk = NULL; 59 req = NULL;
60 *err = error; 61 goto out; 62 }
inet_csk_wait_for_connect函数
1 /* 2 * Wait for an incoming connection, avoid race conditions. This must be called 3 * with the socket locked. 4 */ 5 static int inet_csk_wait_for_connect(struct sock *sk, long timeo) 6 { 7 struct inet_connection_sock *icsk = inet_csk(sk); 8 DEFINE_WAIT(wait); 9 int err; 10 11 /* 12 * True wake-one mechanism for incoming connections: only 13 * one process gets woken up, not the 'whole herd'. 14 * Since we do not 'race & poll' for established sockets 15 * anymore, the common case will execute the loop only once. 16 * 17 * Subtle issue: "add_wait_queue_exclusive()" will be added 18 * after any current non-exclusive waiters, and we know that 19 * it will always _stay_ after any new non-exclusive waiters 20 * because all non-exclusive waiters are added at the 21 * beginning of the wait-queue. As such, it's ok to "drop" 22 * our exclusiveness temporarily when we get woken up without 23 * having to remove and re-insert us on the wait queue. 24 */ 25 for (;;) { 26 prepare_to_wait_exclusive(sk_sleep(sk), &wait, 27 TASK_INTERRUPTIBLE); 28 release_sock(sk); 29 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) 30 timeo = schedule_timeout(timeo); 31 sched_annotate_sleep(); 32 lock_sock(sk); 33 err = 0; 34 if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) 35 break; 36 err = -EINVAL; 37 if (sk->sk_state != TCP_LISTEN) 38 break; 39 err = sock_intr_errno(timeo); 40 if (signal_pending(current)) 41 break; 42 err = -EAGAIN; 43 if (!timeo) 44 break; 45 } 46 finish_wait(sk_sleep(sk), &wait); 47 return err; 48 }
2.3 三次握手中携带SYN/ACK的TCP头数据的发送和接收
TCP/IP协议栈初始化
inet_init_net函数
1 static __net_init int inet_init_net(struct net *net) 2 { 3 /* 4 * Set defaults for local port range 5 */ 6 seqlock_init(&net->ipv4.ip_local_ports.lock); 7 net->ipv4.ip_local_ports.range[0] = 32768; 8 net->ipv4.ip_local_ports.range[1] = 60999; 9 10 seqlock_init(&net->ipv4.ping_group_range.lock); 11 /* 12 * Sane defaults - nobody may create ping sockets. 13 * Boot scripts should set this to distro-specific group. 14 */ 15 net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); 16 net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); 17 18 /* Default values for sysctl-controlled parameters. 19 * We set them here, in case sysctl is not compiled. 20 */ 21 net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; 22 net->ipv4.sysctl_ip_fwd_update_priority = 1; 23 net->ipv4.sysctl_ip_dynaddr = 0; 24 net->ipv4.sysctl_ip_early_demux = 1; 25 net->ipv4.sysctl_udp_early_demux = 1; 26 net->ipv4.sysctl_tcp_early_demux = 1; 27 #ifdef CONFIG_SYSCTL 28 net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; 29 #endif 30 31 /* Some igmp sysctl, whose values are always used */ 32 net->ipv4.sysctl_igmp_max_memberships = 20; 33 net->ipv4.sysctl_igmp_max_msf = 10; 34 /* IGMP reports for link-local multicast groups are enabled by default */ 35 net->ipv4.sysctl_igmp_llm_reports = 1; 36 net->ipv4.sysctl_igmp_qrv = 2; 37 38 return 0; 39 }
2.4 服务端接收客户端发来的SYN,发送SYN+ACK
tcp_v4_do_rcv函数
1 * The socket must have it's spinlock held when we get 2 * here, unless it is a TCP_LISTEN socket. 3 * 4 * We have a potential double-lock case here, so even when 5 * doing backlog processing we use the BH locking scheme. 6 * This is because we cannot sleep with the original spinlock 7 * held. 8 */ 9 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 10 { 11 struct sock *rsk; 12 13 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 14 struct dst_entry *dst = sk->sk_rx_dst; 15 16 sock_rps_save_rxhash(sk, skb); 17 sk_mark_napi_id(sk, skb); 18 if (dst) { 19 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 20 !dst->ops->check(dst, 0)) { 21 dst_release(dst); 22 sk->sk_rx_dst = NULL; 23 } 24 } 25 tcp_rcv_established(sk, skb); 26 return 0; 27 } 28 29 if (tcp_checksum_complete(skb)) 30 goto csum_err; 31 32 if (sk->sk_state == TCP_LISTEN) { 33 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 34 35 if (!nsk) 36 goto discard; 37 if (nsk != sk) { 38 if (tcp_child_process(sk, nsk, skb)) { 39 rsk = nsk; 40 goto reset; 41 } 42 return 0; 43 } 44 } else 45 sock_rps_save_rxhash(sk, skb); 46 47 if (tcp_rcv_state_process(sk, skb)) { 48 rsk = sk; 49 goto reset; 50 } 51 return 0; 52 53 reset: 54 tcp_v4_send_reset(rsk, skb); 55 discard: 56 kfree_skb(skb); 57 /* Be careful here. If this function gets more complicated and 58 * gcc suffers from register pressure on the x86, sk (in %ebx) 59 * might be destroyed here. This current version compiles correctly, 60 * but you have been warned. 61 */ 62 return 0; 63 64 csum_err: 65 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 66 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 67 goto discard; 68 }
2.5 客户端收到服务端的SYN+ACK,发送ACK
tcp_rcv_synsent_state_process函数
/*
 * Abridged excerpt: only the call that sends the ACK is shown. Everything
 * else in the function body, including its return statements, is omitted.
 */
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
					 const struct tcphdr *th, unsigned int len)
{
	/* ... code before this point omitted in the excerpt ... */
	tcp_send_ack(sk);
	/* ... remainder of the function omitted in the excerpt ... */
}
到这里,我们已经从Linux内核网络协议栈的角度,在架构上整体理解了三次握手,即携带SYN/ACK标志的报文的收发过程。
3、gdb调试过程
(gdb) c
Continuing.

Breakpoint 1, __sys_socket (family=2, type=1, protocol=0) at net/socket.c:1346
1346	retval = sock_create(family, type, protocol, &sock);
(gdb) c
Continuing.

Breakpoint 2, __sys_accept4 (fd=4, upeer_sockaddr=0xffbb869c, upeer_addrlen=0xffbb867c, flags=0) at net/socket.c:1542
1542	{
(gdb) c
Continuing.

Breakpoint 1, __sys_socket (family=2, type=1, protocol=0) at net/socket.c:1346
1346	retval = sock_create(family, type, protocol, &sock);
(gdb) c
Continuing.

Breakpoint 3, tcp_v4_connect (sk=0xffff888006498880, uaddr=0xffffc90000043e20, addr_len=16) at net/ipv4/tcp_ipv4.c:203
203	{
(gdb) c
Continuing.

Breakpoint 4, tcp_v4_rcv (skb=0xffff8880068ed4e0) at net/ipv4/tcp_ipv4.c:1782
1782	{
(gdb) c
Continuing.

Breakpoint 4, tcp_v4_rcv (skb=0xffff888007584000) at net/ipv4/tcp_ipv4.c:1782
1782	{
(gdb) c
Continuing.

Breakpoint 4, tcp_v4_rcv (skb=0xffff888007584100) at net/ipv4/tcp_ipv4.c:1782
1782	{
(gdb) c
Continuing.

Breakpoint 4, tcp_v4_rcv (skb=0xffff8880068ed4e0) at net/ipv4/tcp_ipv4.c:1782
1782	{
(gdb) c
Continuing.

Breakpoint 4, tcp_v4_rcv (skb=0xffff888007584100) at net/ipv4/tcp_ipv4.c:1782
1782	{
(gdb) c
Continuing.

Breakpoint 2, __sys_accept4 (fd=4, upeer_sockaddr=0xffbb869c, upeer_addrlen=0xffbb867c, flags=0) at net/socket.c:1542
1542	{
(gdb)