3.3 连接建立完成

  服务器发送的SYN|ACK抵达客户端的网卡、经过链路层、网络层的协议处理后,如果网络层协议为IPv4,则会进入到TCPv4的入口函数tcp_v4_rcv:

1961 int tcp_v4_rcv(struct sk_buff *skb)
1962 {
1963     const struct iphdr *iph;
1964     const struct tcphdr *th;
1965     struct sock *sk;
1966     int ret;
1967     struct net *net = dev_net(skb->dev);
...
2002     sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);//SYN|ACK包会在ESTABLISHED hash table中找到socket
2003     if (!sk)
2004         goto no_tcp_socket;
...
2036             if (!tcp_prequeue(sk, skb)) 
2037                 ret = tcp_v4_do_rcv(sk, skb);//进入到主处理函数
2038         }

  看tcp_v4_do_rcv函数对SYN|ACK的处理:

1800 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 
1801 {
1802     struct sock *rsk;
...
1814     if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
...
1835     if (sk->sk_state == TCP_LISTEN) { 
...
1851     if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1852         rsk = sk;
1853         goto reset;
1854     }
1855     return 0;
...

  执行connect系统调用后sk->sk_state的值为TCP_SYN_SENT,故1814行和1835行的判断都为假,SYN|ACK会由1851行的tcp_rcv_state_process函数进行处理:

 5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601               const struct tcphdr *th, unsigned int len)
5602 {
5603     struct tcp_sock *tp = tcp_sk(sk);
5604     struct inet_connection_sock *icsk = inet_csk(sk);
5605     struct request_sock *req;
5606     int queued = 0;
...
5649     case TCP_SYN_SENT:
5650         queued = tcp_rcv_synsent_state_process(sk, skb, th, len);//处理SYN|ACK包
5651         if (queued >= 0)
5652             return queued;
5653        
5654         /* Do step6 onward by hand. */
5655         tcp_urg(sk, skb, th);//处理紧急数据
5656         __kfree_skb(skb);
5657         tcp_data_snd_check(sk);//发送队列中缓存的数据
5658         return 0;
5659     }
...

  tcp_rcv_synsent_state_process函数:

5373 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5374                      const struct tcphdr *th, unsigned int len)
5375 {
5376     struct inet_connection_sock *icsk = inet_csk(sk);
5377     struct tcp_sock *tp = tcp_sk(sk);
5378     struct tcp_fastopen_cookie foc = { .len = -1 };
5379     int saved_clamp = tp->rx_opt.mss_clamp;
5380
5381     tcp_parse_options(skb, &tp->rx_opt, 0, &foc);//解析选项
5382     if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)    //如果有时间戳选项,则根据用户的设置校正回显时间戳
5383         tp->rx_opt.rcv_tsecr -= tp->tsoffset;
5384
5385     if (th->ack) {//处理带ACK标记的包
...
5394         if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||    //如果确认号小于等于已发送但未确认的序列号
5395             after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))    //或大于下次发送数据的序列号
5396             goto reset_and_undo;                             //则确认号非法,丢弃之
5397
5398         if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
5399             !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
5400                  tcp_time_stamp)) {//如果回显的时间戳小于等于当前时间,但大于等于SYN的构建时间,则合法;否则丢弃
5401             NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
5402             goto reset_and_undo;
5403         }
...
5413         if (th->rst) {    //包中不能携带RST标记
5414             tcp_reset(sk);
5415             goto discard;
5416         }
...
5425         if (!th->syn)    //包中必须携带SYN标记
5426             goto discard_and_undo;
...
5480         tcp_finish_connect(sk, skb); //设置socket,完成连接的建立的相关工作
...
5486         if (sk->sk_write_pending || 
5487             icsk->icsk_accept_queue.rskq_defer_accept ||
5488             icsk->icsk_ack.pingpong) {
...
5496             inet_csk_schedule_ack(sk);//标识此socket正在等待发送ACK,如果以后有数据要发送的话会尽快发送,以便将携带的ACK尽快发送到对端
5497             icsk->icsk_ack.lrcvtime = tcp_time_stamp;//记住最后一次接收到数据包的时间
5498             tcp_enter_quickack_mode(sk);//进入快速ACK回复模式,下次收到SYN|ACK时必须立即回复ACK,不能再延迟了
5499             inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5500                           TCP_DELACK_MAX, TCP_RTO_MAX);//设置延迟ACK定时器,如果超时则立即发送ACK报文
5501
5502 discard:
5503             __kfree_skb(skb);
5504             return 0;
5505         } else {
5506             tcp_send_ack(sk);    //立即发送ACK给对端
5507         }
5508         return -1;
5509     }
...

  5394:tp->snd_una为之前发送的SYN的seq

  5395:tp->snd_nxt = seq + 1,而正常情况下ack_seq == seq + 1,这是合法的

  来看看tcp_finish_connect函数的功能:

5291 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5292 {
5293     struct tcp_sock *tp = tcp_sk(sk);
5294     struct inet_connection_sock *icsk = inet_csk(sk);
5295
5296     tcp_set_state(sk, TCP_ESTABLISHED);//状态切换到TCP_ESTABLISHED
5297
5298     if (skb != NULL) {   
5299         icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
5300         security_inet_conn_established(sk, skb);
5301     }
5302
5303     /* Make sure socket is routed, for correct metrics.  */
5304     icsk->icsk_af_ops->rebuild_header(sk);//调用inet_sk_rebuild_header或inet6_sk_rebuild_header,根据SYN|ACK包的信息重新计算路由
5305
5306     tcp_init_metrics(sk);//初始化TCP metrics,用于保存与TCP相关的路由等信息
5307
5308     tcp_init_congestion_control(sk);//初始化拥塞控制算法
5309
5310     /* Prevent spurious tcp_cwnd_restart() on first data
5311      * packet.
5312      */
5313     tp->lsndtime = tcp_time_stamp; //记录发送最后一个数据包的时间
5314
5315     tcp_init_buffer_space(sk);//初始化接收缓存和发送缓存的空间
5316
5317     if (sock_flag(sk, SOCK_KEEPOPEN)) //如果用户设置开启KEEP_ALIVE功能
5318         inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));//设置KEEP ALIVE定时器
5319
5320     if (!tp->rx_opt.snd_wscale)//如果对端没有开启窗口扩大选项
5321         __tcp_fast_path_on(tp, tp->snd_wnd);//开启首部预测
5322     else
5323         tp->pred_flags = 0;
5324     
5325     if (!sock_flag(sk, SOCK_DEAD)) {
5326         sk->sk_state_change(sk);//调用sock_def_wakeup函数唤醒等待connect成功的进程
5327         sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5328     }
5329 }            

  tcp_finish_connect函数的功能是:

1、TCP状态机跳转到TCP_ESTABLISHED

2、根据SYN|ACK包的内容更新路由信息

3、初始化TCP连接的拥塞控制算法、接收缓存和发送缓存空间等信息

4、唤醒调用connect系统调用并等待结果的进程

  回到tcp_rcv_synsent_state_process函数,5486-5488行的判断中的3个条件解析如下:

1、sk->sk_write_pending非零,即有数据等待发送

2、icsk->icsk_accept_queue.rskq_defer_accept非零,即用户设置了延迟对端的accept行为,即先不发送ACK,等待有数据发送给对端时再触发其accept行为

3、icsk->icsk_ack.pingpong非零,即允许延迟发送ACK,这个值可以通过setsockopt函数的TCP_QUICKACK选项进行设置

  如果上述3个条件中的任意一个为真,则不会立即发送ACK,而是等待客户端有数据发送时,利用数据中设置的ACK标记触发对端最终完成三次握手(如果一直没有数据发送,则在5499-5500行设置的延迟ACK定时器超时后再单独发送ACK)。这样在有数据要发送的情况下会节约一个ACK报文。

  至此,对于客户端而言,连接已经建立完成。而对于服务器端,必须收到客户端发送的ACK才能完成连接的建立。服务器端在连接建立阶段对ACK的处理流程如下:

1800 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1801 {
1802     struct sock *rsk;
...
1835     if (sk->sk_state == TCP_LISTEN) {//此次匹配到的socket仍然是listen中的socket
1836         struct sock *nsk = tcp_v4_hnd_req(sk, skb);//找到接收SYN时保存的request sock,并生成socket
1837         if (!nsk)
1838             goto discard;
1839
1840         if (nsk != sk) {//这次判断条件为真
1841             sock_rps_save_rxhash(nsk, skb);
1842             if (tcp_child_process(sk, nsk, skb)) {//使用子socket处理ACK包,唤醒父socket,即监听socket
1843                 rsk = nsk;
1844                 goto reset;
1845             }
1846             return 0;
1847         }
...

  tcp_v4_hnd_req函数代码解析:

1739 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1740 {
1741     struct tcphdr *th = tcp_hdr(skb);
1742     const struct iphdr *iph = ip_hdr(skb);
1743     struct sock *nsk;    
1744     struct request_sock **prev;    
1745     /* Find possible connection requests. */
1746     struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1747                                iph->saddr, iph->daddr);   //在SYN TABLE中查找request sock   
1748     if (req) //如果没有超时则会找到
1749         return tcp_check_req(sk, skb, req, prev, false);//使用request sock建立一个socket
1750
...

  tcp_check_req函数代码解析:

 503 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
504                struct request_sock *req,
505                struct request_sock **prev,
506                bool fastopen)
507 {
508     struct tcp_options_received tmp_opt;
509     struct sock *child;
510     const struct tcphdr *th = tcp_hdr(skb);
511     __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
512     bool paws_reject = false;
513
514     BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
515
516     tmp_opt.saw_tstamp = 0;
517     if (th->doff > (sizeof(struct tcphdr)>>2)) { //有选项
518         tcp_parse_options(skb, &tmp_opt, 0, NULL); //解析选项
519
520         if (tmp_opt.saw_tstamp) {//如果有时间戳选项
521             tmp_opt.ts_recent = req->ts_recent;//记录收到SYN时包中的时间戳
...
526             tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);//估算存储req->ts_recent的时间(秒)
527             paws_reject = tcp_paws_reject(&tmp_opt, th->rst);//查看序列号是否出现回绕
528         }
529     }
530
531     /* Check for pure retransmitted SYN. */
532     if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
533         flg == TCP_FLAG_SYN &&
534         !paws_reject) {//重传SYN且序列号没有回绕
...
558         if (!inet_rtx_syn_ack(sk, req))//调用tcp_v4_rtx_synack或tcp_v6_rtx_synack重发SYN|ACK
559             req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
560                        TCP_RTO_MAX) + jiffies;//重新设置request sock的超时时间
561         return NULL;
562     }
563
564     /* Further reproduces section "SEGMENT ARRIVES"
565        for state SYN-RECEIVED of RFC793.
566        It is broken, however, it does not work only
567        when SYNs are crossed.
568
569        You would think that SYN crossing is impossible here, since
570        we should have a SYN_SENT socket (from connect()) on our end,
571        but this is not true if the crossed SYNs were sent to both
572        ends by a malicious third party.  We must defend against this,
573        and to do that we first verify the ACK (as per RFC793, page
574        36) and reset if it is invalid.  Is this a true full defense?
575        To convince ourselves, let us consider a way in which the ACK
576        test can still pass in this 'malicious crossed SYNs' case.
577        Malicious sender sends identical SYNs (and thus identical sequence
578        numbers) to both A and B:
579
580         A: gets SYN, seq=7
581         B: gets SYN, seq=7
582
583        By our good fortune, both A and B select the same initial
584        send sequence number of seven :-)
585
586         A: sends SYN|ACK, seq=7, ack_seq=8
587         B: sends SYN|ACK, seq=7, ack_seq=8
588
589        So we are now A eating this SYN|ACK, ACK test passes.  So
590        does sequence test, SYN is truncated, and thus we consider
591        it a bare ACK.
592
593        If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
594        bare ACK.  Otherwise, we create an established connection.  Both
595        ends (listening sockets) accept the new incoming connection and try
596        to talk to each other. 8-)
597
598        Note: This case is both harmless, and rare.  Possibility is about the
599        same as us discovering intelligent life on another plant tomorrow.
600
601        But generally, we should (RFC lies!) to accept ACK
602        from SYNACK both here and in tcp_rcv_state_process().
603        tcp_rcv_state_process() does not, hence, we do not too.
604
605        Note that the case is absolutely generic:
606        we cannot optimize anything here without
607        violating protocol. All the checks must be made
608        before attempt to create socket.
609      */
610
611     /* RFC793 page 36: "If the connection is in any non-synchronized state ...
612      *                  and the incoming segment acknowledges something not yet
613      *                  sent (the segment carries an unacceptable ACK) ...
614      *                  a reset is sent."
615      *
616      * Invalid ACK: reset will be sent by listening socket.
617      * Note that the ACK validity check for a Fast Open socket is done
618      * elsewhere and is checked directly against the child socket rather
619      * than req because user data may have been sent out.
620      */
621     if ((flg & TCP_FLAG_ACK) && !fastopen &&
622         (TCP_SKB_CB(skb)->ack_seq !=
623          tcp_rsk(req)->snt_isn + 1))
624         return sk;//如果确认号不对,则返回listening socket,在tcp_v4_do_rcv函数中会发送Reset
633     if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
634                       tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
635         /* Out of window: send ACK and drop. */
636         if (!(flg & TCP_FLAG_RST)) //没有RST标记位
637             req->rsk_ops->send_ack(sk, skb, req); //调用tcp_v4_reqsk_send_ack发送ACK,这个函数可以在仅有request_sock的时候发送ACK
638         if (paws_reject)
639             NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
640         return NULL;
641     }
642
643     /* In sequence, PAWS is OK. */
644
645     if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))//如果开启了时间戳选项并且此包的序列号小于等于期望接收的序列号
646         req->ts_recent = tmp_opt.rcv_tsval;//记录对端的时间戳
647
648     if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {//当前包的序列号与第一个SYN包的一致
649         /* Truncate SYN, it is out of window starting    //这是个部分超出窗口的包;它一定携带了数据,否则应该在633-641行的处理流程中就会被丢弃
650            at tcp_rsk(req)->rcv_isn + 1. */
651         flg &= ~TCP_FLAG_SYN;//清除SYN标记
652     }
...
657     if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
658         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
659         goto embryonic_reset;
660     }
...
668     if (!(flg & TCP_FLAG_ACK))//不接受非ACK包
669         return NULL;
670
671     /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
672     if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
673         tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
674     else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
675         tcp_rsk(req)->snt_synack = 0;
676
677     /* For Fast Open no more processing is needed (sk is the
678      * child socket).
679      */
680     if (fastopen)
681         return sk;
682
683     /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
684     if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
685         TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {//如果开启了TCP_DEFER_ACCEPT功能,则要尽可能推迟建立子socket的时间
686         inet_rsk(req)->acked = 1;
687         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
688         return NULL;//丢弃当前不带数据的ACK,等待带数据的ACK到来再建立子socket
689     }
...  //调用tcp_v4_syn_recv_sock或tcp_v6_syn_recv_sock建立子socket,将socket的状态设置为TCP_SYN_RECV,并将其加入到established hash表中
697     child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
698     if (child == NULL)
699         goto listen_overflow;
700
701     inet_csk_reqsk_queue_unlink(sk, req, prev); //将request sock从listening socket的SYN table中摘出来
702     inet_csk_reqsk_queue_removed(sk, req);
703
704     inet_csk_reqsk_queue_add(sk, req, child);//加入到listening socket的accept_queue中
705     return child;

  这段代码的569-609行描述了一种很有意思的场景:如果一个恶意的攻击者给A和B都发送了一个SYN,其源|目的IP及端口与A和B都是匹配的。这样当A收到SYN时,就会认为是B发来的,B也会认为是A发送的SYN,此时在A和B看来:
     A:收到从B发来的SYN,seq=7
     B:收到从A发来的SYN,seq=7
     A和B分别回复SYN|ACK,很巧合的是,它们选择的起始序列号都是7(这个概率极低,与发现外星智慧生命的概率相似):
      A:发送SYN|ACK,seq=7,ack_seq=8
      B:发送SYN|ACK,seq=7,ack_seq=8
     然后A和B会分别收到对方发送来的SYN|ACK,并会当作ACK包处理。
     A:收到从B发来的ACK,seq=7,ack_seq=8
     B:收到从A发来的ACK,seq=7,ack_seq=8
     这时这个包中的ack_seq正确,但seq号错误(应该为8),在633-637的处理流程中会因序列号检查失败而发送ACK:
     A:发送ACK,seq=8,ack_seq=8
     B:发送ACK,seq=8,ack_seq=8
     这次当A和B分别收到对方发送的ACK时,就会建立连接,准备与对方通信。但由于这个连接并不是双方中的任何一方想发起的,故不会有任何数据交互。这个连接会一直存在直到应用层进程将其关闭。
     这种场景出现的概率很低,而且危害不大(多占用了两个服务器的各一个socket),更重要的是无法在不修改协议的情况下解决这个问题(SYN|ACK包必须当作ACK处理,且收到序列号不合法的包必须发送ACK),故Linux TCP无法解决这个问题。

  704:inet_csk_reqsk_queue_add函数在将request_sock加入到accept_queue中的时候将697行创建的socket结构与request_sock关联起来

  tcp_v4_syn_recv_sock函数会根据request的信息创建一个sock:

1642 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1643                   struct request_sock *req,
1644                   struct dst_entry *dst)
1645 {
1646     struct inet_request_sock *ireq;
1647     struct inet_sock *newinet;
1648     struct tcp_sock *newtp;
1649     struct sock *newsk;
...
1655     if (sk_acceptq_is_full(sk))   //有过多等待accept系统调用的socket
1656         goto exit_overflow;
1657 
1658     newsk = tcp_create_openreq_child(sk, req, skb); //创建并初始化一个sock
1659     if (!newsk)
1660         goto exit_nonewsk;
1661 
1662     newsk->sk_gso_type = SKB_GSO_TCPV4;
1663     inet_sk_rx_dst_set(newsk, skb);
1664 
1665     newtp             = tcp_sk(newsk);
1666     newinet           = inet_sk(newsk);
1667     ireq              = inet_rsk(req);
1668     newinet->inet_daddr   = ireq->rmt_addr;
1669     newinet->inet_rcv_saddr = ireq->loc_addr;
1670     newinet->inet_saddr       = ireq->loc_addr;
1671     inet_opt          = ireq->opt; //复制选项信息
1672     rcu_assign_pointer(newinet->inet_opt, inet_opt);
1673     ireq->opt         = NULL;
1674     newinet->mc_index     = inet_iif(skb);
1675     newinet->mc_ttl       = ip_hdr(skb)->ttl;
1676     newinet->rcv_tos      = ip_hdr(skb)->tos;
1677     inet_csk(newsk)->icsk_ext_hdr_len = 0;
1678     if (inet_opt)
1679         inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1680     newinet->inet_id = newtp->write_seq ^ jiffies;
1681 
1682     if (!dst) {
1683         dst = inet_csk_route_child_sock(sk, newsk, req);
1684         if (!dst)
1685             goto put_and_exit;
1686     } else {
1687         /* syncookie case : see end of cookie_v4_check() */
1688     }
1689     sk_setup_caps(newsk, dst);
1690 
1691     tcp_mtup_init(newsk);
1692     tcp_sync_mss(newsk, dst_mtu(dst)); //更新MSS
1693     newtp->advmss = dst_metric_advmss(dst);
1694     if (tcp_sk(sk)->rx_opt.user_mss &&
1695         tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1696         newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1697 
1698     tcp_initialize_rcv_mss(newsk);
1699     tcp_synack_rtt_meas(newsk, req);
1700     newtp->total_retrans = req->num_retrans;
...
1719     if (__inet_inherit_port(sk, newsk) < 0) //继承listen sock的本地端口,即监听的端口
1720         goto put_and_exit;
1721     __inet_hash_nolisten(newsk, NULL); //将新创建的sock加入到ESTABLISHED连接表中,后续的数据处理会由新sock完成
1722 
1723     return newsk;
1724 
1725 exit_overflow:
1726     NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1727 exit_nonewsk:
1728     dst_release(dst);
1729 exit:
1730     NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1731     return NULL;
1732 put_and_exit:
1733     inet_csk_prepare_forced_close(newsk);
1734     tcp_done(newsk);
1735     goto exit;
1736 }

        1719行的__inet_inherit_port函数会将新sock加入到listen sock监听端口的绑定队列中,这个动作的后果是如果newsk没有释放(比如处于TIME_WAIT状态)则不允许重复bind监听的地址和端口(除非设置了端口重用)。下面来看__inet_inherit_port的代码:

106 int __inet_inherit_port(struct sock *sk, struct sock *child)                                                                                       
107 {
108     struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
109     unsigned short port = inet_sk(child)->inet_num; //这个port就是本地port,赋值是在inet_csk_clone_lock函数中
110     const int bhash = inet_bhashfn(sock_net(sk), port,
111             table->bhash_size);    
112     struct inet_bind_hashbucket *head = &table->bhash[bhash];
113     struct inet_bind_bucket *tb;                                                                                                                   
114 
115     spin_lock(&head->lock);
116     tb = inet_csk(sk)->icsk_bind_hash;
117     if (tb->port != port) {
118         /* NOTE: using tproxy and redirecting skbs to a proxy
119          * on a different listener port breaks the assumption
120          * that the listener socket's icsk_bind_hash is the same
121          * as that of the child socket. We have to look up or
122          * create a new bind bucket for the child here. */
123         inet_bind_bucket_for_each(tb, &head->chain) {
124             if (net_eq(ib_net(tb), sock_net(sk)) &&
125                 tb->port == port)              
126                 break;
127         }
128         if (!tb) {
129             tb = inet_bind_bucket_create(table->bind_bucket_cachep,
130                              sock_net(sk), head, port);                                                                                            
131             if (!tb) {
132                 spin_unlock(&head->lock);      
133                 return -ENOMEM;            
134             }
135         }
136     }
137     inet_bind_hash(child, tb, port); //通常child的port与parent的一致,这样就加入到相同的bind队列中
138     spin_unlock(&head->lock);
139 
140     return 0;
141 }


        TCP为什么要将新创建的子sock与监听的地址和端口绑定?这样会使得当前的bind有效,但如果取消当前bind(比如关闭socket)再快速重新bind则会失败。为什么要做成这个样子呢?Bind对于TCP的意义究竟有哪些?我暂时没有答案,以后慢慢领悟。

  tcp_check_req函数返回后,tcp_v4_do_rcv会调用tcp_child_process继续进行处理:

745 int tcp_child_process(struct sock *parent, struct sock *child,
746               struct sk_buff *skb)
747 {
748     int ret = 0;
749     int state = child->sk_state;
750
751     if (!sock_owned_by_user(child)) {//新的socket没有被进行系统调用的进程锁定
752         ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
753                         skb->len);//进行状态处理
754         /* Wakeup parent, send SIGIO */
755         if (state == TCP_SYN_RECV && child->sk_state != state)//状态处理结束后socket的状态发生了变化
756             parent->sk_data_ready(parent, 0);//调用sock_def_readable函数发送可读事件通告给listening socket,告知其可以进行accept系统调用
757     } else {//新的socket被进行系统调用的进程锁定;因为这是新的socket,所以在tcp_v4_rcv加的锁不会起到保护新socket的作用
758         /* Alas, it is possible again, because we do lookup
759          * in main socket hash table and lock on listening
760          * socket does not protect us more.
761          */
762         __sk_add_backlog(child, skb);//加入到backlog队列,等待进程系统调用结束时处理
763     }
764
765     bh_unlock_sock(child);
766     sock_put(child);
767     return ret;
768 }

  tcp_rcv_state_process函数代码解析:

 5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601               const struct tcphdr *th, unsigned int len)
5602 {
5603     struct tcp_sock *tp = tcp_sk(sk);
5604     struct inet_connection_sock *icsk = inet_csk(sk);
5605     struct request_sock *req;
5606     int queued = 0;
...
5661     req = tp->fastopen_rsk;
...
5670     if (!th->ack && !th->rst)//不处理不带ACK标记也不带RST标记的包
5671         goto discard;
5672
5673     if (!tcp_validate_incoming(sk, skb, th, 0))//检查包的合法性(主要是检查序列号)
5674         return 0;
5675
5676     /* step 5: check the ACK field */
5677     if (true) {
5678         int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
5679                           FLAG_UPDATE_TS_RECENT) > 0;//检查ACK的合法性;确认发送队列中的数据包
5680
5681         switch (sk->sk_state) {
5682         case TCP_SYN_RECV:  //三次握手的ACK会走这里
5683             if (acceptable) {//如果ack_seq合法,acceptable就会是真
5684                 /* Once we leave TCP_SYN_RECV, we no longer
5685                  * need req so release it.
5686                  */
5687                 if (req) {  //TFO功能相关
5688                     tcp_synack_rtt_meas(sk, req);
5689                     tp->total_retrans = req->num_retrans;
5690
5691                     reqsk_fastopen_remove(sk, req, false);
5692                 } else {
5693                     /* Make sure socket is routed, for
5694                      * correct metrics.
5695                      */
5696                     icsk->icsk_af_ops->rebuild_header(sk);//调用inet_sk_rebuild_header或inet6_sk_rebuild_header,根据ACK包的信息重新计算路由
5697                     tcp_init_congestion_control(sk);//初始化拥塞控制算法
5698
5699                     tcp_mtup_init(sk);//初始化MTU探测功能
5700                     tcp_init_buffer_space(sk);//初始化接收缓存和发送缓存的空间
5701                     tp->copied_seq = tp->rcv_nxt;
5702                 }
5703                 smp_mb();
5704                 tcp_set_state(sk, TCP_ESTABLISHED); //socket状态切换为TCP_ESTABLISHED
5705                 sk->sk_state_change(sk);//调用sock_def_wakeup唤醒等待的进程
...
5712                 if (sk->sk_socket)
5713                     sk_wake_async(sk,
5714                               SOCK_WAKE_IO, POLL_OUT);
5715
5716                 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
5717                 tp->snd_wnd = ntohs(th->window) <<
5718                           tp->rx_opt.snd_wscale;
5719                 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5720
5721                 if (tp->rx_opt.tstamp_ok)
5722                     tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
5723
5724                 if (req) {
...
5735                     tcp_rearm_rto(sk);
5736                 } else
5737                     tcp_init_metrics(sk);
5738
5739                 /* Prevent spurious tcp_cwnd_restart() on
5740                  * first data packet.
5741                  */
5742                 tp->lsndtime = tcp_time_stamp;
5743
5744                 tcp_initialize_rcv_mss(sk);
5745                 tcp_fast_path_on(tp);
5746             } else {
5747                 return 1;
5748             }
5749             break;
...
5855     case TCP_ESTABLISHED:
5856         tcp_data_queue(sk, skb);//如果ACK报文有数据则会在这个函数中进行处理
5857         queued = 1;
5858         break;
5859     }
...

  5705:对于服务器而言,此时这个新的socket尚未被accept,故不会有进程等待这个socket的事件

  再看看tcp_validate_incoming函数:

4985 static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
4986                   const struct tcphdr *th, int syn_inerr)
4987 {
4988     struct tcp_sock *tp = tcp_sk(sk);
4989
4990     /* RFC1323: H1. Apply PAWS check first. */
4991     if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4992         tcp_paws_discard(sk, skb)) {   //如果包的序列号回绕
4993         if (!th->rst) {  //如果不是RST包,则发送ACK,并丢弃此包
4994             NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
4995             tcp_send_dupack(sk, skb);      
4996             goto discard;
4997         }
4998         /* Reset is accepted even if it did not pass PAWS. */
4999     }
5000
5001     /* Step 1: check sequence number */
5002     if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {//检查序列号是否在窗口之内
5003         /* RFC793, page 37: "In all states except SYN-SENT, all reset
5004          * (RST) segments are validated by checking their SEQ-fields."
5005          * And page 69: "If an incoming segment is not acceptable,
5006          * an acknowledgment should be sent in reply (unless the RST
5007          * bit is set, if so drop the segment and return)".
5008          */
5009         if (!th->rst) {
5010             if (th->syn)
5011                 goto syn_challenge;           
5012             tcp_send_dupack(sk, skb);   //不是reset包,发送ACK,告诉对端发送正确的报文  
5013         }
5014         goto discard;//非法包,丢弃
5015     }
5016
5017     /* Step 2: check RST bit */    
5018     if (th->rst) {
5019         /* RFC 5961 3.2 :
5020          * If sequence number exactly matches RCV.NXT, then
5021          *     RESET the connection
5022          * else
5023          *     Send a challenge ACK
5024          */
5025         if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
5026             tcp_reset(sk);
5027         else
5028             tcp_send_challenge_ack(sk);
5029         goto discard;
5030     }
5031
5032     /* step 3: check security and precedence [ignored] */
5033
5034     /* step 4: Check for a SYN
5035      * RFC 5691 4.2 : Send a challenge ack
5036      */
5037     if (th->syn) {
5038 syn_challenge:    
5039         if (syn_inerr)
5040             TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5041         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
5042         tcp_send_challenge_ack(sk);
5043         goto discard;
5044     }
5045
5046     return true;
5047
5048 discard:
5049     __kfree_skb(skb);
5050     return false;
5051 }

  5028:Reset包的序列号有问题,怀疑是伪造包攻击(spoofed packet injection attacks),发送ACK探测一下对端的反应,以确定是否是攻击

  5038:SYN包不应该出现在当前流程中,怀疑是伪造包攻击(spoofed packet injection attacks),发送ACK探测一下对端的反应,以确定是否是攻击
  三次握手的ACK报文是TCP连接开始时发送的,排除恶意攻击,则不会出现回绕,也不会出现序列号在窗口之外的情况。

  下面总结一下server端收到三次握手的ACK报文时完成的工作:

1、找到在接收SYN并发送SYN|ACK时建立的request_sock,创建一个TCP sock结构,用request_sock中的内容去初始化sock;

2、将request_sock从listen socket的syn_table中移除,再加入到listen socket的accept_queue中,并将第1步中创建的sock与request_sock关联起来;

3、检查ACK报文的合法性,如果合法则将报文中的信息保存到tcp_sock中;

4、TCP状态机跳转到TCP_ESTABLISHED,将sock加入到ESTABLISHED hash表中,这样在应用调用accept系统调用之前内核就可以与client进行TCP数据交互了;

5、唤醒睡眠在accept系统调用中的进程(tcp_child_process通过parent->sk_data_ready向listening socket发送可读事件,告知其可以进行accept)

  至此,客户端进程与服务器进程的三次握手全部完成,TCP连接建立成功。

  以上讨论的是标准的C/S模式下的TCP连接建立过程,下面我们讨论一种非C/S模式的TCP连接建立过程:同时打开。

你可能感兴趣的:(TCP协议,TCP协议详解,linux内核,tcp)