服务器发送的SYN|ACK抵达客户端的网卡、经过链路层、网络层的协议处理后,如果网络层协议为IPv4,则会进入到TCPv4的入口函数tcp_v4_rcv:
1961 int tcp_v4_rcv(struct sk_buff *skb)
1962 {
1963 const struct iphdr *iph;
1964 const struct tcphdr *th;
1965 struct sock *sk;
1966 int ret;
1967 struct net *net = dev_net(skb->dev);
...
2002 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);//SYN|ACK包会在ESTABLISHED hash table中找到socket
2003 if (!sk)
2004 goto no_tcp_socket;
...
2036 if (!tcp_prequeue(sk, skb))
2037 ret = tcp_v4_do_rcv(sk, skb);//进入到主处理函数
2038 }
看tcp_v4_do_rcv函数对SYN|ACK的处理:
1800 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1801 {
1802 struct sock *rsk;
...
1814 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
...
1835 if (sk->sk_state == TCP_LISTEN) {
...
1851 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1852 rsk = sk;
1853 goto reset;
1854 }
1855 return 0;
...
执行connect系统调用后sk->sk_state的值为TCP_SYN_SENT,故1814行和1835行的判断都为假,SYN|ACK会由1851行的tcp_rcv_state_process函数进行处理:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
5603 struct tcp_sock *tp = tcp_sk(sk);
5604 struct inet_connection_sock *icsk = inet_csk(sk);
5605 struct request_sock *req;
5606 int queued = 0;
...
5649 case TCP_SYN_SENT:
5650 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);//处理SYN|ACK包
5651 if (queued >= 0)
5652 return queued;
5653
5654 /* Do step6 onward by hand. */
5655 tcp_urg(sk, skb, th);//处理紧急数据
5656 __kfree_skb(skb);
5657 tcp_data_snd_check(sk);//发送队列中缓存的数据
5658 return 0;
5659 }
...
tcp_rcv_synsent_state_process函数:
5373 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5374 const struct tcphdr *th, unsigned int len)
5375 {
5376 struct inet_connection_sock *icsk = inet_csk(sk);
5377 struct tcp_sock *tp = tcp_sk(sk);
5378 struct tcp_fastopen_cookie foc = { .len = -1 };
5379 int saved_clamp = tp->rx_opt.mss_clamp;
5380
5381 tcp_parse_options(skb, &tp->rx_opt, 0, &foc);//解析选项
5382 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) //如果有时间戳选项,则根据用户的设置校正回显时间戳
5383 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
5384
5385 if (th->ack) {//处理带ACK标记的包
...
5394 if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || //如果确认号小于等于已发送但未确认的序列号
5395 after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) //或大于下次发送数据的序列号
5396 goto reset_and_undo; //则确认号非法,丢弃之
5397
5398 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
5399 !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
5400 tcp_time_stamp)) {//如果回显的时间戳小于等于当前时间,但大于等于SYN的构建时间,则合法;否则丢弃
5401 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
5402 goto reset_and_undo;
5403 }
...
5413 if (th->rst) { //包中不能携带RST标记
5414 tcp_reset(sk);
5415 goto discard;
5416 }
...
5425 if (!th->syn) //包中必须携带SYN标记
5426 goto discard_and_undo;
...
5480 tcp_finish_connect(sk, skb); //设置socket,完成连接的建立的相关工作
...
5486 if (sk->sk_write_pending ||
5487 icsk->icsk_accept_queue.rskq_defer_accept ||
5488 icsk->icsk_ack.pingpong) {
...
5496 inet_csk_schedule_ack(sk);//标识此socket正在等待发送ACK,如果以后有数据要发送的话会尽快发送,以便将携带的ACK尽快发送到对端
5497 icsk->icsk_ack.lrcvtime = tcp_time_stamp;//记住最后一次接收到数据包的时间
5498 tcp_enter_quickack_mode(sk);//进入快速ACK回复模式,下次收到SYN|ACK时必须立即回复ACK,不能再延迟了
5499 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5500 TCP_DELACK_MAX, TCP_RTO_MAX);//设置延迟ACK定时器,如果超时则立即发送ACK报文
5501
5502 discard:
5503 __kfree_skb(skb);
5504 return 0;
5505 } else {
5506 tcp_send_ack(sk); //立即发送ACK给对端
5507 }
5508 return -1;
5509 }
...
5394:tp->snd_una为之前发送的SYN的seq
5395:tp->snd_nxt = seq + 1,而正常情况下ack_seq == seq + 1,这是合法的
来看看tcp_finish_connect函数的功能:
5291 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5292 {
5293 struct tcp_sock *tp = tcp_sk(sk);
5294 struct inet_connection_sock *icsk = inet_csk(sk);
5295
5296 tcp_set_state(sk, TCP_ESTABLISHED);//状态切换到TCP_ESTABLISHED
5297
5298 if (skb != NULL) {
5299 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
5300 security_inet_conn_established(sk, skb);
5301 }
5302
5303 /* Make sure socket is routed, for correct metrics. */
5304 icsk->icsk_af_ops->rebuild_header(sk);//调用inet_sk_rebuild_header或inet6_sk_rebuild_header,根据SYN|ACK包的信息重新计算路由
5305
5306 tcp_init_metrics(sk);//初始化TCP metrics,用于保存与TCP相关的路由等信息
5307
5308 tcp_init_congestion_control(sk);//初始化拥塞控制算法
5309
5310 /* Prevent spurious tcp_cwnd_restart() on first data
5311 * packet.
5312 */
5313 tp->lsndtime = tcp_time_stamp; //记录发送最后一个数据包的时间
5314
5315 tcp_init_buffer_space(sk);//初始化接收缓存和发送缓存的空间
5316
5317 if (sock_flag(sk, SOCK_KEEPOPEN)) //如果用户设置开启KEEP_ALIVE功能
5318 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));//设置KEEP ALIVE定时器
5319
5320 if (!tp->rx_opt.snd_wscale)//如果对端没有开启窗口扩大选项
5321 __tcp_fast_path_on(tp, tp->snd_wnd);//开启首部预测
5322 else
5323 tp->pred_flags = 0;
5324
5325 if (!sock_flag(sk, SOCK_DEAD)) {
5326 sk->sk_state_change(sk);//调用sock_def_wakeup函数唤醒等待connect成功的进程
5327 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5328 }
5329 }
tcp_finish_connect函数的功能是:
1、TCP状态机跳转到TCP_ESTABLISHED
2、根据SYN|ACK包的内容更新路由信息
3、初始化TCP连接的拥塞控制算法、接收缓存和发送缓存空间等信息
4、唤醒调用connect系统调用并等待结果的进程
回到tcp_rcv_synsent_state_process函数,5486-5488行的判断中的3个条件解析如下:
1、sk->sk_write_pending非零,即有数据等待发送
2、icsk->icsk_accept_queue.rskq_defer_accept非零,即用户设置了延迟对端的accept行为,即先不发送ACK,等待有数据发送给对端时再触发其accept行为
3、icsk->icsk_ack.pingpong非零,即允许延迟发送ACK,这个值可以通过setsockopt函数的TCP_QUICKACK选项进行设置
如果上述3个条件中的任意一个为真,则不会立即发送ACK,而是等待客户端有数据发送时,利用数据中设置的ACK标记触发对端最终完成三次握手。这样会节约一个ACK报文。
至此,对于客户端而言,连接已经建立完成。而对于服务器端,必须收到客户端发送的ACK才能完成连接的建立。服务器端在连接建立阶段对ACK的处理流程如下:
1800 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1801 {
1802 struct sock *rsk;
...
1835 if (sk->sk_state == TCP_LISTEN) {//此次匹配到的socket仍然是listen中的socket
1836 struct sock *nsk = tcp_v4_hnd_req(sk, skb);//找到接收SYN时保存的request sock,并生成socket
1837 if (!nsk)
1838 goto discard;
1839
1840 if (nsk != sk) {//这次判断条件为真
1841 sock_rps_save_rxhash(nsk, skb);
1842 if (tcp_child_process(sk, nsk, skb)) {//使用子socket处理ACK包,唤醒父socket,即监听socket
1843 rsk = nsk;
1844 goto reset;
1845 }
1846 return 0;
1847 }
...
tcp_v4_hnd_req函数代码解析:
1739 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1740 {
1741 struct tcphdr *th = tcp_hdr(skb);
1742 const struct iphdr *iph = ip_hdr(skb);
1743 struct sock *nsk;
1744 struct request_sock **prev;
1745 /* Find possible connection requests. */
1746 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1747 iph->saddr, iph->daddr); //在SYN TABLE中查找request sock
1748 if (req) //如果没有超时则会找到
1749 return tcp_check_req(sk, skb, req, prev, false);//使用request sock建立一个socket
1750
...
tcp_check_req函数代码解析:
503 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
504 struct request_sock *req,
505 struct request_sock **prev,
506 bool fastopen)
507 {
508 struct tcp_options_received tmp_opt;
509 struct sock *child;
510 const struct tcphdr *th = tcp_hdr(skb);
511 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
512 bool paws_reject = false;
513
514 BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
515
516 tmp_opt.saw_tstamp = 0;
517 if (th->doff > (sizeof(struct tcphdr)>>2)) { //有选项
518 tcp_parse_options(skb, &tmp_opt, 0, NULL); //解析选项
519
520 if (tmp_opt.saw_tstamp) {//如果有时间戳选项
521 tmp_opt.ts_recent = req->ts_recent;//记录收到SYN时包中的时间戳
...
526 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);//估算存储req->ts_recent的时间(秒)
527 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);//查看序列号是否出现回绕
528 }
529 }
530
531 /* Check for pure retransmitted SYN. */
532 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
533 flg == TCP_FLAG_SYN &&
534 !paws_reject) {//重传SYN且序列号没有回绕
...
558 if (!inet_rtx_syn_ack(sk, req))//调用tcp_v4_rtx_synack或tcp_v6_rtx_synack重发SYN|ACK
559 req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
560 TCP_RTO_MAX) + jiffies;//重新设置request sock的超时时间
561 return NULL;
562 }
563
564 /* Further reproduces section "SEGMENT ARRIVES"
565 for state SYN-RECEIVED of RFC793.
566 It is broken, however, it does not work only
567 when SYNs are crossed.
568
569 You would think that SYN crossing is impossible here, since
570 we should have a SYN_SENT socket (from connect()) on our end,
571 but this is not true if the crossed SYNs were sent to both
572 ends by a malicious third party. We must defend against this,
573 and to do that we first verify the ACK (as per RFC793, page
574 36) and reset if it is invalid. Is this a true full defense?
575 To convince ourselves, let us consider a way in which the ACK
576 test can still pass in this 'malicious crossed SYNs' case.
577 Malicious sender sends identical SYNs (and thus identical sequence
578 numbers) to both A and B:
579
580 A: gets SYN, seq=7
581 B: gets SYN, seq=7
582
583 By our good fortune, both A and B select the same initial
584 send sequence number of seven :-)
585
586 A: sends SYN|ACK, seq=7, ack_seq=8
587 B: sends SYN|ACK, seq=7, ack_seq=8
588
589 So we are now A eating this SYN|ACK, ACK test passes. So
590 does sequence test, SYN is truncated, and thus we consider
591 it a bare ACK.
592
593 If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
594 bare ACK. Otherwise, we create an established connection. Both
595 ends (listening sockets) accept the new incoming connection and try
596 to talk to each other. 8-)
597
598 Note: This case is both harmless, and rare. Possibility is about the
599 same as us discovering intelligent life on another plant tomorrow.
600
601 But generally, we should (RFC lies!) to accept ACK
602 from SYNACK both here and in tcp_rcv_state_process().
603 tcp_rcv_state_process() does not, hence, we do not too.
604
605 Note that the case is absolutely generic:
606 we cannot optimize anything here without
607 violating protocol. All the checks must be made
608 before attempt to create socket.
609 */
610
611 /* RFC793 page 36: "If the connection is in any non-synchronized state ...
612 * and the incoming segment acknowledges something not yet
613 * sent (the segment carries an unacceptable ACK) ...
614 * a reset is sent."
615 *
616 * Invalid ACK: reset will be sent by listening socket.
617 * Note that the ACK validity check for a Fast Open socket is done
618 * elsewhere and is checked directly against the child socket rather
619 * than req because user data may have been sent out.
620 */
621 if ((flg & TCP_FLAG_ACK) && !fastopen &&
622 (TCP_SKB_CB(skb)->ack_seq !=
623 tcp_rsk(req)->snt_isn + 1))
624 return sk;//如果确认号不对,则返回listening socket,在tcp_v4_do_rcv函数中会发送Reset
633 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
634 tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
635 /* Out of window: send ACK and drop. */
636 if (!(flg & TCP_FLAG_RST)) //没有RST标记位
637 req->rsk_ops->send_ack(sk, skb, req); //调用tcp_v4_reqsk_send_ack发送ACK,这个函数可以在仅有request_sock的时候发送ACK
638 if (paws_reject)
639 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
640 return NULL;
641 }
642
643 /* In sequence, PAWS is OK. */
644
645 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))//如果开启了时间戳选项并且此包的序列号小于等于期望接收的序列号
646 req->ts_recent = tmp_opt.rcv_tsval;//记录对端的时间戳
647
648 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {//当前包的序列号与第一个SYN包的一致
649 /* Truncate SYN, it is out of window starting //这是个部分超出窗口的包;它一定携带了数据,否则应该在633-641行的处理流程中就会被丢弃
650 at tcp_rsk(req)->rcv_isn + 1. */
651 flg &= ~TCP_FLAG_SYN;//清除SYN标记
652 }
...
657 if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
658 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
659 goto embryonic_reset;
660 }
...
668 if (!(flg & TCP_FLAG_ACK))//不接受非ACK包
669 return NULL;
670
671 /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
672 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
673 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
674 else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
675 tcp_rsk(req)->snt_synack = 0;
676
677 /* For Fast Open no more processing is needed (sk is the
678 * child socket).
679 */
680 if (fastopen)
681 return sk;
682
683 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
684 if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
685 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {//如果开启了TCP_DEFER_ACCEPT功能,则要尽可能推迟建立子socket的时间
686 inet_rsk(req)->acked = 1;
687 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
688 return NULL;//丢弃当前不带数据的ACK,等待带数据的ACK到来再建立子socket
689 }
... //调用tcp_v4_syn_recv_sock或tcp_v6_syn_recv_sock建立子socket,将socket的状态设置为TCP_SYN_RECV,并将其加入到established hash表中
697 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
698 if (child == NULL)
699 goto listen_overflow;
700
701 inet_csk_reqsk_queue_unlink(sk, req, prev); //将request sock从listening socket的SYN table中摘出来
702 inet_csk_reqsk_queue_removed(sk, req);
703
704 inet_csk_reqsk_queue_add(sk, req, child);//加入到listening socket的accept_queue中
705 return child;
这段代码的569-609行描述了一种很有意思的场景:如果一个恶意的攻击者给A和B都发送了一个SYN,其源|目的IP及端口与A和B都是匹配的。这样当A收到SYN时,就会认为是B发来的,B也会认为是A发送的SYN,此时在A和B看来:
A:收到从B发来的SYN,seq=7
B:收到从A发来的SYN,seq=7
A和B分别回复SYN|ACK,很巧合的是,它们选择的起始序列号都是7(这个概率极低,与发现外星智慧生命的概率相似):
A:发送SYN|ACK,seq=7,ack_seq=8
B:发送SYN|ACK,seq=7,ack_seq=8
然后A和B会分别收到对方发送来的SYN|ACK,并会当作ACK包处理。
A:收到从B发来的ACK,seq=7,ack_seq=8
B:收到从A发来的ACK,seq=7,ack_seq=8
这时这个包中的ack_seq正确,但seq号错误(应该为8),在633-637的处理流程中会因序列号检查失败而发送ACK:
A:发送ACK,seq=8,ack_seq=8
B:发送ACK,seq=8,ack_seq=8
这次当A和B分别收到对方发送的ACK时,就会建立连接,准备与对方通信。但由于这个连接并不是双方中的任何一方想发起的,故不会有任何数据交互。这个连接会一直存在直到应用层进程将其关闭。
这种场景出现的概率很低,而且危害不大(多占用了两个服务器的各一个socket),更重要的是无法在不修改协议的情况下解决这个问题(SYN|ACK包必须当作ACK处理,且收到序列号不合法的包必须发送ACK),故Linux TCP无法解决这个问题。
704:inet_csk_reqsk_queue_add函数在将request_sock加入到accept_queue中的时候将697行创建的socket结构与request_sock关联起来
tcp_v4_syn_recv_sock函数会根据request的信息创建一个sock:
1642 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1643 struct request_sock *req,
1644 struct dst_entry *dst)
1645 {
1646 struct inet_request_sock *ireq;
1647 struct inet_sock *newinet;
1648 struct tcp_sock *newtp;
1649 struct sock *newsk;
...
1655 if (sk_acceptq_is_full(sk)) //有过多等待accept系统调用的socket
1656 goto exit_overflow;
1657
1658 newsk = tcp_create_openreq_child(sk, req, skb); //创建并初始化一个sock
1659 if (!newsk)
1660 goto exit_nonewsk;
1661
1662 newsk->sk_gso_type = SKB_GSO_TCPV4;
1663 inet_sk_rx_dst_set(newsk, skb);
1664
1665 newtp = tcp_sk(newsk);
1666 newinet = inet_sk(newsk);
1667 ireq = inet_rsk(req);
1668 newinet->inet_daddr = ireq->rmt_addr;
1669 newinet->inet_rcv_saddr = ireq->loc_addr;
1670 newinet->inet_saddr = ireq->loc_addr;
1671 inet_opt = ireq->opt; //复制选项信息
1672 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1673 ireq->opt = NULL;
1674 newinet->mc_index = inet_iif(skb);
1675 newinet->mc_ttl = ip_hdr(skb)->ttl;
1676 newinet->rcv_tos = ip_hdr(skb)->tos;
1677 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1678 if (inet_opt)
1679 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1680 newinet->inet_id = newtp->write_seq ^ jiffies;
1681
1682 if (!dst) {
1683 dst = inet_csk_route_child_sock(sk, newsk, req);
1684 if (!dst)
1685 goto put_and_exit;
1686 } else {
1687 /* syncookie case : see end of cookie_v4_check() */
1688 }
1689 sk_setup_caps(newsk, dst);
1690
1691 tcp_mtup_init(newsk);
1692 tcp_sync_mss(newsk, dst_mtu(dst)); //更新MSS
1693 newtp->advmss = dst_metric_advmss(dst);
1694 if (tcp_sk(sk)->rx_opt.user_mss &&
1695 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1696 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1697
1698 tcp_initialize_rcv_mss(newsk);
1699 tcp_synack_rtt_meas(newsk, req);
1700 newtp->total_retrans = req->num_retrans;
...
1719 if (__inet_inherit_port(sk, newsk) < 0) //继承listen sock的本地端口,即监听的端口
1720 goto put_and_exit;
1721 __inet_hash_nolisten(newsk, NULL); //将新创建的sock加入到ESTABLISHED连接表中,后续的数据处理会由新sock完成
1722
1723 return newsk;
1724
1725 exit_overflow:
1726 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1727 exit_nonewsk:
1728 dst_release(dst);
1729 exit:
1730 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1731 return NULL;
1732 put_and_exit:
1733 inet_csk_prepare_forced_close(newsk);
1734 tcp_done(newsk);
1735 goto exit;
1736 }
1719行的__inet_inherit_port函数会将新sock加入到listen sock监听端口的绑定队列中,这个动作的后果是如果newsk没有释放(比如处于TIME_WAIT状态)则不允许重复bind监听的地址和端口(除非设置了端口重用)。下面来看__inet_inherit_port的代码:
106 int __inet_inherit_port(struct sock *sk, struct sock *child)
107 {
108 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
109 unsigned short port = inet_sk(child)->inet_num; //这个port就是本地port,赋值是在inet_csk_clone_lock函数中
110 const int bhash = inet_bhashfn(sock_net(sk), port,
111 table->bhash_size);
112 struct inet_bind_hashbucket *head = &table->bhash[bhash];
113 struct inet_bind_bucket *tb;
114
115 spin_lock(&head->lock);
116 tb = inet_csk(sk)->icsk_bind_hash;
117 if (tb->port != port) {
118 /* NOTE: using tproxy and redirecting skbs to a proxy
119 * on a different listener port breaks the assumption
120 * that the listener socket's icsk_bind_hash is the same
121 * as that of the child socket. We have to look up or
122 * create a new bind bucket for the child here. */
123 inet_bind_bucket_for_each(tb, &head->chain) {
124 if (net_eq(ib_net(tb), sock_net(sk)) &&
125 tb->port == port)
126 break;
127 }
128 if (!tb) {
129 tb = inet_bind_bucket_create(table->bind_bucket_cachep,
130 sock_net(sk), head, port);
131 if (!tb) {
132 spin_unlock(&head->lock);
133 return -ENOMEM;
134 }
135 }
136 }
137 inet_bind_hash(child, tb, port); //通常child的port与parent的一致,这样就加入到相同的bind队列中
138 spin_unlock(&head->lock);
139
140 return 0;
141 }
TCP为什么要将新创建的子sock与监听的地址和端口绑定?这样会使得当前的bind有效,但如果取消当前bind(比如关闭socket)再快速重新bind则会失败。为什么要做成这个样子呢?Bind对于TCP的意义究竟有哪些?我暂时没有答案,以后慢慢领悟。
tcp_check_req函数返回后,tcp_v4_do_rcv会调用tcp_child_process继续进行处理:
745 int tcp_child_process(struct sock *parent, struct sock *child,
746 struct sk_buff *skb)
747 {
748 int ret = 0;
749 int state = child->sk_state;
750
751 if (!sock_owned_by_user(child)) {//新的socket没有被进行系统调用的进程锁定
752 ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
753 skb->len);//进行状态处理
754 /* Wakeup parent, send SIGIO */
755 if (state == TCP_SYN_RECV && child->sk_state != state)//状态处理结束后socket的状态发生了变化
756 parent->sk_data_ready(parent, 0);//调用sock_def_readable函数发送可读事件通告给listening socket,告知其可以进行accept系统调用
757 } else {//新的socket被进行系统调用的进程锁定;因为这是新的socket,所以在tcp_v4_rcv加的锁不会起到保护新socket的作用
758 /* Alas, it is possible again, because we do lookup
759 * in main socket hash table and lock on listening
760 * socket does not protect us more.
761 */
762 __sk_add_backlog(child, skb);//加入到backlog队列,等待进程系统调用结束时处理
763 }
764
765 bh_unlock_sock(child);
766 sock_put(child);
767 return ret;
768 }
tcp_rcv_state_process函数代码解析:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
5603 struct tcp_sock *tp = tcp_sk(sk);
5604 struct inet_connection_sock *icsk = inet_csk(sk);
5605 struct request_sock *req;
5606 int queued = 0;
...
5661 req = tp->fastopen_rsk;
...
5670 if (!th->ack && !th->rst)//不处理不带ACK标记也不带RST标记的包
5671 goto discard;
5672
5673 if (!tcp_validate_incoming(sk, skb, th, 0))//检查包的合法性(主要是检查序列号)
5674 return 0;
5675
5676 /* step 5: check the ACK field */
5677 if (true) {
5678 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
5679 FLAG_UPDATE_TS_RECENT) > 0;//检查ACK的合法性;确认发送队列中的数据包
5680
5681 switch (sk->sk_state) {
5682 case TCP_SYN_RECV: //三次握手的ACK会走这里
5683 if (acceptable) {//如果ack_seq合法,acceptable就会是真
5684 /* Once we leave TCP_SYN_RECV, we no longer
5685 * need req so release it.
5686 */
5687 if (req) { //TFO功能相关
5688 tcp_synack_rtt_meas(sk, req);
5689 tp->total_retrans = req->num_retrans;
5690
5691 reqsk_fastopen_remove(sk, req, false);
5692 } else {
5693 /* Make sure socket is routed, for
5694 * correct metrics.
5695 */
5696 icsk->icsk_af_ops->rebuild_header(sk);//调用inet_sk_rebuild_header或inet6_sk_rebuild_header,根据ACK包的信息重新计算路由
5697 tcp_init_congestion_control(sk);//初始化拥塞控制算法
5698
5699 tcp_mtup_init(sk);//初始化MTU探测功能
5700 tcp_init_buffer_space(sk);//初始化接收缓存和发送缓存的空间
5701 tp->copied_seq = tp->rcv_nxt;
5702 }
5703 smp_mb();
5704 tcp_set_state(sk, TCP_ESTABLISHED); //socket状态切换为TCP_ESTABLISHED
5705 sk->sk_state_change(sk);//调用sock_def_wakeup唤醒等待的进程
...
5712 if (sk->sk_socket)
5713 sk_wake_async(sk,
5714 SOCK_WAKE_IO, POLL_OUT);
5715
5716 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
5717 tp->snd_wnd = ntohs(th->window) <<
5718 tp->rx_opt.snd_wscale;
5719 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5720
5721 if (tp->rx_opt.tstamp_ok)
5722 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
5723
5724 if (req) {
...
5735 tcp_rearm_rto(sk);
5736 } else
5737 tcp_init_metrics(sk);
5738
5739 /* Prevent spurious tcp_cwnd_restart() on
5740 * first data packet.
5741 */
5742 tp->lsndtime = tcp_time_stamp;
5743
5744 tcp_initialize_rcv_mss(sk);
5745 tcp_fast_path_on(tp);
5746 } else {
5747 return 1;
5748 }
5749 break;
...
5855 case TCP_ESTABLISHED:
5856 tcp_data_queue(sk, skb);//如果ACK报文有数据则会在这个函数中进行处理
5857 queued = 1;
5858 break;
5859 }
...
5705:对于服务器而言,此时这个新的socket尚未被accept,故不会有进程等待这个socket的事件
再看看tcp_validate_incoming函数:
4985 static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
4986 const struct tcphdr *th, int syn_inerr)
4987 {
4988 struct tcp_sock *tp = tcp_sk(sk);
4989
4990 /* RFC1323: H1. Apply PAWS check first. */
4991 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4992 tcp_paws_discard(sk, skb)) { //如果包的序列号回绕
4993 if (!th->rst) { //如果不是RST包,则发送ACK,并丢弃此包
4994 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
4995 tcp_send_dupack(sk, skb);
4996 goto discard;
4997 }
4998 /* Reset is accepted even if it did not pass PAWS. */
4999 }
5000
5001 /* Step 1: check sequence number */
5002 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {//检查序列号是否在窗口之内
5003 /* RFC793, page 37: "In all states except SYN-SENT, all reset
5004 * (RST) segments are validated by checking their SEQ-fields."
5005 * And page 69: "If an incoming segment is not acceptable,
5006 * an acknowledgment should be sent in reply (unless the RST
5007 * bit is set, if so drop the segment and return)".
5008 */
5009 if (!th->rst) {
5010 if (th->syn)
5011 goto syn_challenge;
5012 tcp_send_dupack(sk, skb); //不是reset包,发送ACK,告诉对端发送正确的报文
5013 }
5014 goto discard;//非法包,丢弃
5015 }
5016
5017 /* Step 2: check RST bit */
5018 if (th->rst) {
5019 /* RFC 5961 3.2 :
5020 * If sequence number exactly matches RCV.NXT, then
5021 * RESET the connection
5022 * else
5023 * Send a challenge ACK
5024 */
5025 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
5026 tcp_reset(sk);
5027 else
5028 tcp_send_challenge_ack(sk);
5029 goto discard;
5030 }
5031
5032 /* step 3: check security and precedence [ignored] */
5033
5034 /* step 4: Check for a SYN
5035 * RFC 5691 4.2 : Send a challenge ack
5036 */
5037 if (th->syn) {
5038 syn_challenge:
5039 if (syn_inerr)
5040 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5041 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
5042 tcp_send_challenge_ack(sk);
5043 goto discard;
5044 }
5045
5046 return true;
5047
5048 discard:
5049 __kfree_skb(skb);
5050 return false;
5051 }
5028:Reset包的序列号有问题,怀疑是伪造包攻击(spoofed packet injection attacks),发送ACK探测一下对端的反应,以确定是否是攻击
5038:SYN包不应该出现在当前流程中,怀疑是伪造包攻击(spoofed packet injection attacks),发送ACK探测一下对端的反应,以确定是否是攻击
三次握手的ACK报文是TCP连接开始时发送的,排除恶意攻击,则不会出现回绕,也不会出现序列号在窗口之外的情况。
下面总结一下server端收到三次握手的ACK报文时完成的工作:
1、找到在接收SYN并发送SYN|ACK时建立的request_sock,创建一个TCP sock结构,用request_sock中的内容去初始化sock;
2、将request_sock从listen socket的syn_table中移除,再加入到listen socket的accept_queue中,并将第1步中创建的sock与request_sock关联起来;
3、检查ACK报文的合法性,如果合法则将报文中的信息保存到tcp_sock中;
4、TCP状态机跳转到TCP_ESTABLISHED,将sock加入到ESTABLISHED hash表中,这样在应用调用accept系统调用之前内核就可以与client进行TCP数据交互了;
5、唤醒睡眠在accept系统调用中的进程
至此,客户端进程与服务器进程的三次握手全部完成,TCP连接建立成功。
以上讨论的是标准的C/S模式下的TCP连接建立过程,下面我们讨论一种非C/S模式的TCP连接建立过程:同时打开。