After the client's SYN request reaches the server's NIC, it enters the network protocol stack of the server's operating system; after link-layer and network-layer processing it arrives at TCP's entry function. The entry function for TCPv4 is tcp_v4_rcv, and for TCPv6 it is tcp_v6_rcv.
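How does a segment get here in the first place? During boot, TCP registers tcp_v4_rcv as the handler for IP protocol number IPPROTO_TCP. Roughly as in net/ipv4/af_inet.c of the same kernel generation (abridged; fields vary slightly between versions):

/* Abridged: the IP layer dispatches to this handler for every
 * datagram whose protocol field is IPPROTO_TCP. */
static const struct net_protocol tcp_protocol = {
    .handler     = tcp_v4_rcv,
    .err_handler = tcp_v4_err,
    .no_policy   = 1,
    .netns_ok    = 1,
};

/* Registered once in inet_init(): */
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
    pr_crit("%s: Cannot add TCP protocol\n", __func__);

Let us now analyze tcp_v4_rcv: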
1961 int tcp_v4_rcv(struct sk_buff *skb)
1962 {
1963     const struct iphdr *iph;
1964     const struct tcphdr *th;
1965     struct sock *sk;
1966     int ret;
...
1975     if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1976         goto discard_it;
1977
1978     th = tcp_hdr(skb);
1979
1980     if (th->doff < sizeof(struct tcphdr) / 4)  //check that the TCP header is not shorter than the minimum (doff counts 32-bit words)
1981         goto bad_packet;
1982     if (!pskb_may_pull(skb, th->doff * 4))
1983         goto discard_it;
1984
1985     /* An explanation is required here, I think.
1986      * Packet length and doff are validated by header prediction,
1987      * provided case of th->doff==0 is eliminated.
1988      * So, we defer the checks. */
1989     if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))  //verify the checksum
1990         goto csum_error;
1991
1992     th = tcp_hdr(skb);
1993     iph = ip_hdr(skb);
1994     TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1995     TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1996                                 skb->len - th->doff * 4);
1997     TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1998     TCP_SKB_CB(skb)->when = 0;
1999     TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2000     TCP_SKB_CB(skb)->sacked = 0;
2001
2002     sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2003     if (!sk)
2004         goto no_tcp_socket;
...
2023
2024     bh_lock_sock_nested(sk);  //take the spinlock to prevent other softirqs from touching this socket concurrently
2025     ret = 0;
2026     if (!sock_owned_by_user(sk)) {  //no process has the socket locked; true for a SYN
...
2035         {
2036             if (!tcp_prequeue(sk, skb))  //try to put the skb on the prequeue; if no process is waiting on the prequeue, fall through to tcp_v4_do_rcv
2037                 ret = tcp_v4_do_rcv(sk, skb);  //enter the main receive function
2038         }
...
2043         goto discard_and_relse;
2044     }
2045     bh_unlock_sock(sk);
2046
2047     sock_put(sk);
2048
2049     return ret;

Code analysis:
1975: pskb_may_pull checks whether the skb's linear buffer holds at least len bytes; if not, it reallocates the skb and pulls data from the frags into the contiguous area. A SYN packet carries no data in frags.
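As an illustration, header parsing in the kernel typically follows this idiom (a minimal sketch; struct my_hdr and parse_my_hdr are hypothetical names, not kernel code):

#include <linux/errno.h>
#include <linux/skbuff.h>

/* Hypothetical header, for illustration only. */
struct my_hdr {
    __be16 field_a;
    __be16 len;
};

static int parse_my_hdr(struct sk_buff *skb)
{
    struct my_hdr *mh;

    /* Ensure at least sizeof(struct my_hdr) bytes are in the linear
     * area; on success, data may have been copied in from the frags
     * and skb->head may have been reallocated. */
    if (!pskb_may_pull(skb, sizeof(struct my_hdr)))
        return -EINVAL;    /* too short: caller should drop it */

    /* Only after a successful pull is it safe to dereference. */
    mh = (struct my_hdr *)skb->data;
    return ntohs(mh->len);
}

Because a successful pull can reallocate the linear buffer, cached header pointers must be refreshed afterwards; this is why tcp_v4_rcv re-reads th = tcp_hdr(skb) at line 1992 after the pull at line 1982.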
1994-2000: record key fields of the segment in the skb's cb area for later use.
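The cb field is a scratch area inside struct sk_buff that each protocol layer may use privately; TCP overlays its own control block on it. Approximately, from kernels of this generation (abridged; the exact layout varies across versions):

/* TCP's private per-packet control block, overlaid on skb->cb. */
struct tcp_skb_cb {
    union {                    /* leading space shared with the IP layer */
        struct inet_skb_parm  h4;
#if IS_ENABLED(CONFIG_IPV6)
        struct inet6_skb_parm h6;
#endif
    } header;
    __u32 seq;         /* starting sequence number */
    __u32 end_seq;     /* seq + SYN + FIN + number of data bytes */
    __u32 when;        /* used for RTT sampling */
    __u8  tcp_flags;   /* TCP header flags */
    __u8  sacked;      /* SACK/retransmit state flags */
    __u8  ip_dsfield;  /* IPv4 tos or IPv6 dsfield */
    __u32 ack_seq;     /* acknowledgment number */
};

#define TCP_SKB_CB(__skb)  ((struct tcp_skb_cb *)&((__skb)->cb[0]))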
2002: __inet_lookup_skb looks up a connection in tcp_hashinfo keyed by the segment's source/destination IP addresses and ports. It first searches ehash, the table of established connections, which fails for a SYN request; it then searches listening_hash, where it finds the socket that the server added to the listening hash during the listen system call.
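Under the hood, __inet_lookup_skb boils down to this two-phase helper (abridged from include/net/inet_hashtables.h of the same kernel generation):

/* First try the established hash, then fall back to the listening
 * hash; a first SYN only matches the second lookup. */
static inline struct sock *__inet_lookup(struct net *net,
                                         struct inet_hashinfo *hashinfo,
                                         const __be32 saddr, const __be16 sport,
                                         const __be32 daddr, const __be16 dport,
                                         const int dif)
{
    u16 hnum = ntohs(dport);
    struct sock *sk = __inet_lookup_established(net, hashinfo,
                saddr, sport, daddr, hnum, dif);

    return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
                                         daddr, hnum, dif);
}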
Next, the main receive function, tcp_v4_do_rcv:
1800 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1801 {
1802     struct sock *rsk;
...
1835     if (sk->sk_state == TCP_LISTEN) {
1836         struct sock *nsk = tcp_v4_hnd_req(sk, skb);  //look up a request sock
1837         if (!nsk)
1838             goto discard;
1839
1840         if (nsk != sk) {
1841             sock_rps_save_rxhash(nsk, skb);
1842             if (tcp_child_process(sk, nsk, skb)) {
1843                 rsk = nsk;
1844                 goto reset;
1845             }
1846             return 0;
1847         }
1848     } else
1849         sock_rps_save_rxhash(sk, skb);
1850
1851     if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1852         rsk = sk;
1853         goto reset;
1854     }
1855     return 0;
...

The server-side socket must be in the TCP_LISTEN state, so the test at line 1835 is true. tcp_v4_hnd_req looks up a connection in the "half-established" state (SYN|ACK sent, waiting for the ACK):
1739 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1740 {
1741     struct tcphdr *th = tcp_hdr(skb);
1742     const struct iphdr *iph = ip_hdr(skb);
1743     struct sock *nsk;
1744     struct request_sock **prev;
1745     /* Find possible connection requests. */
1746     struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1747                                                    iph->saddr, iph->daddr);
1748     if (req)  //for the first SYN, req must be NULL
1749         return tcp_check_req(sk, skb, req, prev, false);
1750
1751     nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1752                                   th->source, iph->daddr, th->dest, inet_iif(skb));
1753
1754     if (nsk) {  //for the first SYN, nsk must be NULL
1755         if (nsk->sk_state != TCP_TIME_WAIT) {
1756             bh_lock_sock(nsk);
1757             return nsk;
1758         }
1759         inet_twsk_put(inet_twsk(nsk));
1760         return NULL;
1761     }
...
1767     return sk;
1768 }

Note that lines 1751-1752 perform another lookup in the established table. Didn't __inet_lookup_skb at line 2002 of tcp_v4_rcv already do that? Is the lookup in tcp_v4_hnd_req redundant? No, it is essential. Linux runs on multiprocessor systems; consider the following scenario. The server machine has two CPUs, and the client sends two SYNs in quick succession. The first SYN arrives and is handled by CPU0, which calls __inet_lookup_skb and can only find the socket in the listening table. Before CPU0 manages to lock that socket, CPU1 receives the second SYN, races through the entire three-way handshake, and establishes the connection; only then does CPU0 lock the socket and call tcp_v4_hnd_req, where inet_lookup_established finds the socket created by CPU1. Without lines 1751-1752, the server's TCP would in this situation wrongly send another SYN|ACK and set up a duplicate connection. The probability of this scenario is low, but it is not zero; hence the necessity of this "second chance" lookup.
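The pattern at work is the classic "look up again after taking the lock". A self-contained toy version using a plain mutex (all names here are hypothetical; the kernel uses its socket spinlocks instead):

#include <pthread.h>
#include <stddef.h>

/* Toy connection table; t->lock must be initialized by the owner
 * with pthread_mutex_init(). */
struct conn { struct conn *next; unsigned key; };

struct conn_table {
    pthread_mutex_t lock;
    struct conn *head;
};

static struct conn *lookup(struct conn_table *t, unsigned key)
{
    struct conn *c;
    for (c = t->head; c; c = c->next)
        if (c->key == key)
            return c;
    return NULL;
}

/* The decision to create a new entry is made only after re-checking
 * under the lock, mirroring lines 1751-1752: another CPU may have
 * created the connection between our first lookup and the lock. */
struct conn *find_or_create(struct conn_table *t, unsigned key,
                            struct conn *(*create)(unsigned))
{
    struct conn *c = lookup(t, key);    /* fast path; may be stale */
    if (c)
        return c;
    pthread_mutex_lock(&t->lock);
    c = lookup(t, key);                 /* authoritative re-check */
    if (!c) {
        c = create(key);
        if (c) {
            c->next = t->head;
            t->head = c;
        }
    }
    pthread_mutex_unlock(&t->lock);
    return c;
}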
Normally, for the first SYN request tcp_v4_hnd_req returns at line 1767, i.e. it returns the pointer to the listen socket. Back in tcp_v4_do_rcv, the test at line 1840 is therefore false, and control jumps straight to tcp_rcv_state_process at line 1851:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601                            const struct tcphdr *th, unsigned int len)
5602 {
...
5604     struct inet_connection_sock *icsk = inet_csk(sk);
...
5610     switch (sk->sk_state) {
...
5614     case TCP_LISTEN:
...
5621         if (th->syn) {
5622             if (th->fin)
5623                 goto discard;  //a SYN must not carry the FIN flag
5624             if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
5625                 return 1;
...
5644             kfree_skb(skb);
5645             return 0;
5646         }
...

For an IPv4 socket, icsk->icsk_af_ops->conn_request points to tcp_v4_conn_request:
1465 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1466 {
1467     struct tcp_options_received tmp_opt;
1468     struct request_sock *req;
1469     struct inet_request_sock *ireq;
1470     struct tcp_sock *tp = tcp_sk(sk);
1471     struct dst_entry *dst = NULL;
1472     __be32 saddr = ip_hdr(skb)->saddr;
1473     __be32 daddr = ip_hdr(skb)->daddr;
1474     __u32 isn = TCP_SKB_CB(skb)->when;
1475     bool want_cookie = false;
1476     struct flowi4 fl4;
1477     struct tcp_fastopen_cookie foc = { .len = -1 };
1478     struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1479     struct sk_buff *skb_synack;
1480     int do_fastopen;
...
1486     /* TW buckets are converted to open requests without
1487      * limitations, they conserve resources and peer is
1488      * evidently real one.
1489      */
1490     if (inet_csk_reqsk_queue_is_full(sk) && !isn) {  //the request queue is full and the SYN did not hit a TIME_WAIT socket
1491         want_cookie = tcp_syn_flood_action(sk, skb, "TCP");  //decide whether to respond with a SYN cookie
1492         if (!want_cookie)  //if no cookie should be sent, drop the SYN
1493             goto drop;
1494     }
1495
1496     /* Accept backlog is full. If we have already queued enough
1497      * of warm entries in syn queue, drop request. It is better than
1498      * clogging syn queue with openreqs with exponentially increasing
1499      * timeout.
1500      */
1501     if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1502         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1503         goto drop;  //the accept backlog is full and more than one request sock has not yet timed out: drop this request
1504     }
1505
1506     req = inet_reqsk_alloc(&tcp_request_sock_ops);  //allocate a request sock structure
1507     if (!req)
1508         goto drop;
1509
...
1514     tcp_clear_options(&tmp_opt);
1515     tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1516     tmp_opt.user_mss = tp->rx_opt.user_mss;
1517     tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1518
1519     if (want_cookie && !tmp_opt.saw_tstamp)
1520         tcp_clear_options(&tmp_opt);
1521
1522     tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1523     tcp_openreq_init(req, &tmp_opt, skb);
1524
1525     ireq = inet_rsk(req);
1526     ireq->loc_addr = daddr;
1527     ireq->rmt_addr = saddr;
1528     ireq->no_srccheck = inet_sk(sk)->transparent;
1529     ireq->opt = tcp_v4_save_options(skb);
...
1536
1537     if (want_cookie) {
1538         isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1539         req->cookie_ts = tmp_opt.tstamp_ok;
1540     } else if (!isn) {
1541         /* VJ's idea. We save last timestamp seen
1542          * from the destination in peer table, when entering
1543          * state TIME-WAIT, and check against it before
1544          * accepting new connection request.
1545          *
1546          * If "isn" is not zero, this request hit alive
1547          * timewait bucket, so that all the necessary checks
1548          * are made in the function processing timewait state.
1549          */
1550         if (tmp_opt.saw_tstamp &&  //the SYN carries a timestamp option
1551             tcp_death_row.sysctl_tw_recycle &&  //recycling of TIME_WAIT sockets is enabled
1552             (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&  //a route to the destination exists
1553             fl4.daddr == saddr) {  //the routed destination IP matches the SYN's source IP
1554             if (!tcp_peer_is_proven(req, dst, true)) {  //the time_wait bucket has expired; use timestamps to check the new request against the old connection
1555                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1556                 goto drop_and_release;
1557             }
1558         }
1559         /* Kill the following clause, if you dislike this way. */
1560         else if (!sysctl_tcp_syncookies &&  //SYN cookies are disabled
1561                  (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1562                   (sysctl_max_syn_backlog >> 2)) &&  //this single socket's request socks occupy more than 3/4 of the SYN backlog
1563                  !tcp_peer_is_proven(req, dst, false)) {  //the peer cannot be proven to be alive
...
1573             goto drop_and_release;  //this is very likely a SYN flood attack: drop
1574         }
1575
1576         isn = tcp_v4_init_sequence(skb);  //generate a random initial sequence number
1577     }
1578     tcp_rsk(req)->snt_isn = isn;
...
1598     skb_synack = tcp_make_synack(sk, dst, req,
1599             fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);  //build a SYN|ACK packet
1600
1601     if (skb_synack) {
1602         __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);  //compute the checksum
1603         skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1604     } else
1605         goto drop_and_free;
1606
1607     if (likely(!do_fastopen)) {
1608         int err;
1609         err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1610                 ireq->rmt_addr, ireq->opt);
1611         err = net_xmit_eval(err);  //drop the connection unless the result is "success" or "congestion notification"
1612         if (err || want_cookie)
1613             goto drop_and_free;
1614
1615         tcp_rsk(req)->snt_synack = tcp_time_stamp;
1616         tcp_rsk(req)->listener = NULL;
1617         /* Add the request_sock to the SYN table */
1618         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
...
1633     return 0;
1634 }
Code analysis:
1490 and 1501: limit the number of connection requests still in progress, so that connections that never complete cannot consume too much memory.
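For reference, both checks are small inline helpers; roughly as in this kernel generation (abridged):

/* Is the SYN queue full? qlen is compared against a power-of-two
 * limit derived from the listen() backlog. */
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
    return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}

/* Is the accept backlog (established but not yet accept()ed) full? */
static inline bool sk_acceptq_is_full(const struct sock *sk)
{
    return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
}

inet_csk_reqsk_queue_young counts the "young" request socks, i.e. those whose SYN|ACK has not yet needed a retransmission; a queue dominated by old entries suggests the peers are not answering.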
1506: allocate a request_sock structure to record the connection's information; a request_sock is used instead of a full socket because it is far smaller, which greatly reduces memory consumption.
1523: tcp_openreq_init saves the information carried in the SYN request:
1075 static inline void tcp_openreq_init(struct request_sock *req,
1076                                     struct tcp_options_received *rx_opt,
1077                                     struct sk_buff *skb)
1078 {
1079     struct inet_request_sock *ireq = inet_rsk(req);
1080
1081     req->rcv_wnd = 0;        /* So that tcp_send_synack() knows! */
1082     req->cookie_ts = 0;
1083     tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
1084     tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
1085     tcp_rsk(req)->snt_synack = 0;
1086     req->mss = rx_opt->mss_clamp;
1087     req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
1088     ireq->tstamp_ok = rx_opt->tstamp_ok;
1089     ireq->sack_ok = rx_opt->sack_ok;
1090     ireq->snd_wscale = rx_opt->snd_wscale;
1091     ireq->wscale_ok = rx_opt->wscale_ok;
1092     ireq->acked = 0;
1093     ireq->ecn_ok = 0;
1094     ireq->rmt_port = tcp_hdr(skb)->source;
1095     ireq->loc_port = tcp_hdr(skb)->dest;
1096 }

1084: rcv_nxt is the starting sequence number expected in the next incoming segment, which here must be the SYN's seq + 1; when an ACK segment is sent, rcv_nxt is used as the acknowledgment number.
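To make the numbering concrete, here is a toy user-space illustration (the ISNs are arbitrary; this is not kernel code):

/* Toy illustration of handshake sequence numbering. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t client_isn = 1000;          /* seq carried by the client's SYN */
    uint32_t rcv_nxt = client_isn + 1;   /* the SYN flag consumes one sequence number */
    uint32_t server_isn = 7000;          /* the server's own initial sequence number */

    printf("SYN:      seq=%u\n", client_isn);
    printf("SYN|ACK:  seq=%u ack_seq=%u\n", server_isn, rcv_nxt);
    printf("ACK:      seq=%u ack_seq=%u\n", client_isn + 1, server_isn + 1);
    return 0;
}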
1526-1529: save the addresses, options, and related fields for use when the established connection is created later.
1609-1610: fill in the IP header of the SYN|ACK and send it; the SYN|ACK is deliberately not kept on any queue, to avoid wasting memory during a SYN flood attack.
1618: insert the request_sock into the SYN hash table and arm a timer; on expiry the SYN|ACK is retransmitted, or the request_sock is eventually removed.
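For reference, the hashing step looks roughly like this (abridged from net/ipv4/inet_connection_sock.c of the same kernel generation):

/* Hash the request into the listener's private SYN table and account
 * for it; the timeout drives SYN|ACK retransmission. */
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
                                   unsigned long timeout)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
    const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr,
                                 inet_rsk(req)->rmt_port,
                                 lopt->hash_rnd, lopt->nr_table_entries);

    reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
    inet_csk_reqsk_queue_added(sk, timeout);
}

Note that this SYN table (listen_opt) is private to the listening socket, unlike the global established hash searched earlier.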
Now let us look at how the SYN|ACK is built and sent. It is built by tcp_make_synack:
2654 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2655                                 struct request_sock *req,
2656                                 struct tcp_fastopen_cookie *foc)
2657 {
2658     struct tcp_out_options opts;
2659     struct inet_request_sock *ireq = inet_rsk(req);
2660     struct tcp_sock *tp = tcp_sk(sk);
2661     struct tcphdr *th;
2662     struct sk_buff *skb;
2663     struct tcp_md5sig_key *md5;
2664     int tcp_header_size;
2665     int mss;
2666
2667     skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);  //allocate the skb
2668     if (unlikely(!skb)) {
2669         dst_release(dst);
2670         return NULL;
2671     }
2672     /* Reserve space for headers. */
2673     skb_reserve(skb, MAX_TCP_HEADER);
2674
2675     skb_dst_set(skb, dst);
...
2682     if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2683         __u8 rcv_wscale;
2684         /* Set this up on the first call only */
2685         req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2686
2687         /* limit the window selection if the user enforce a smaller rx buffer */
2688         if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2689             (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2690             req->window_clamp = tcp_full_space(sk);
2691
2692         /* tcp_full_space because it is guaranteed to be the first packet */
2693         tcp_select_initial_window(tcp_full_space(sk),
2694             mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2695             &req->rcv_wnd,
2696             &req->window_clamp,
2697             ireq->wscale_ok,
2698             &rcv_wscale,
2699             dst_metric(dst, RTAX_INITRWND));
2700         ireq->rcv_wscale = rcv_wscale;
2701     }
...
2710     tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
2711                                          foc) + sizeof(*th);
2712
2713     skb_push(skb, tcp_header_size);
2714     skb_reset_transport_header(skb);
2715
2716     th = tcp_hdr(skb);
2717     memset(th, 0, sizeof(struct tcphdr));
2718     th->syn = 1;
2719     th->ack = 1;
2720     TCP_ECN_make_synack(req, th);
2721     th->source = ireq->loc_port;
2722     th->dest = ireq->rmt_port;
...
2726     tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2727                          TCPHDR_SYN | TCPHDR_ACK);
2728
2729     th->seq = htonl(TCP_SKB_CB(skb)->seq);
2730     /* XXX data is queued and acked as is. No buffer/window check */
2731     th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2732
2733     /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2734     th->window = htons(min(req->rcv_wnd, 65535U));
2735     tcp_options_write((__be32 *)(th + 1), tp, &opts);
2736     th->doff = (tcp_header_size >> 2);
...
2747     return skb;
2748 }

Code analysis:
2693-2700: initialize the advertised window size (see the sketch after this list);
2710-2711: build the TCP option fields;
2716-2736: fill in the fields of the TCP header. Note line 2731: the SYN|ACK's ack_seq is set from the rcv_nxt saved by tcp_openreq_init, i.e. the SYN segment's seq + 1.
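The details of tcp_select_initial_window are beyond this section; the following is only a simplified sketch of the idea (my own illustration, not the kernel's exact algorithm): pick a window scale large enough that the whole receive buffer can be advertised once scaling takes effect, while the SYN|ACK itself advertises an unscaled window.

/* Simplified sketch; NOT the kernel's tcp_select_initial_window(). */
static void pick_initial_window(unsigned int space, int wscale_ok,
                                unsigned int *rcv_wnd,
                                unsigned char *rcv_wscale)
{
    unsigned char wscale = 0;

    /* RFC 1323: grow the scale factor (capped at 14) until the whole
     * receive buffer fits into the 16-bit window field when scaled. */
    if (wscale_ok)
        while (wscale < 14 && (space >> wscale) > 65535)
            wscale++;

    /* The window field in the SYN|ACK itself is never scaled
     * (cf. line 2734), so the initial advertisement is capped. */
    *rcv_wnd = space > 65535 ? 65535 : space;
    *rcv_wscale = wscale;
}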
That completes the SYN|ACK itself; the rest of the packet is built and sent by ip_build_and_send_pkt:
129 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
130                           __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
131 {
132     struct inet_sock *inet = inet_sk(sk);
133     struct rtable *rt = skb_rtable(skb);
134     struct iphdr *iph;
135
136     /* Build the IP header. */
137     skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
138     skb_reset_network_header(skb);
139     iph = ip_hdr(skb);
140     iph->version = 4;      //version
141     iph->ihl = 5;          //header length
142     iph->tos = inet->tos;  //TOS: type of service
143     if (ip_dont_fragment(sk, &rt->dst))
144         iph->frag_off = htons(IP_DF);  //don't fragment
145     else
146         iph->frag_off = 0;
147     iph->ttl = ip_select_ttl(inet, &rt->dst);  //TTL: time to live
148     iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
149     iph->saddr = saddr;
150     iph->protocol = sk->sk_protocol;  //transport-layer protocol number
151     ip_select_ident(iph, &rt->dst, sk);
152
153     if (opt && opt->opt.optlen) {
154         iph->ihl += opt->opt.optlen>>2;
155         ip_options_build(skb, &opt->opt, daddr, rt, 0);  //fill in the option fields
156     }
157
158     skb->priority = sk->sk_priority;
159     skb->mark = sk->sk_mark;
160
161     /* Send it out. */
162     return ip_local_out(skb);
163 }

To summarize, upon receiving a SYN request the server-side TCP does the following:
1. create a relatively small data structure, a request_sock, and record the connection's information in it;
2. insert the request_sock into the SYN table so that the connection's information can be found when the ACK arrives;
3. build and send a SYN|ACK packet, and arm a retransmission timer in case the SYN|ACK is lost.
After sending the SYN|ACK, the server waits for the ACK that the client will send once it has processed the SYN|ACK. How does the client handle the SYN|ACK? And how does the server then handle the ACK to complete the three-way handshake?