TCP状态切换流程

2019独角兽企业重金招聘Python工程师标准>>> hot3.png

TCP状态切换流程

/*
 * TCP connection states.  Numbering starts at 1 (TCP_ESTABLISHED) and
 * TCP_MAX_STATES is a sentinel that must stay last.
 */
enum {
	/*
	 * Description of States:
	 *
	 * TCP_SYN_SENT		sent a connection request, waiting for ack
	 *
	 * TCP_SYN_RECV		received a connection request, sent ack,
	 *			waiting for final ack in three-way handshake.
	 *
	 * TCP_ESTABLISHED	connection established
	 *
	 * TCP_FIN_WAIT1	our side has shutdown, waiting to complete
	 *			transmission of remaining buffered data
	 *
	 * TCP_FIN_WAIT2	all buffered data sent, waiting for remote
	 *			to shutdown
	 *
	 * TCP_CLOSING		both sides have shutdown but we still have
	 *			data we have to finish sending
	 *
	 * TCP_TIME_WAIT	timeout to catch resent junk before entering
	 *			closed, can only be entered from FIN_WAIT2
	 *			or CLOSING.  Required because the other end
	 *			may not have gotten our last ACK causing it
	 *			to retransmit the data packet (which we ignore)
	 *
	 * TCP_CLOSE_WAIT	remote side has shutdown and is waiting for
	 *			us to finish writing our data and to shutdown
	 *			(we have to close() to move on to LAST_ACK)
	 *
	 * TCP_LAST_ACK		our side has shutdown after remote has
	 *			shutdown.  There may still be data in our
	 *			buffer that we have to finish sending
	 *
	 * TCP_CLOSE		socket is finished
	 */

	/* Connection established. */
	TCP_ESTABLISHED = 1,
	/* SYN sent. */
	TCP_SYN_SENT,
	/* SYN received. */
	TCP_SYN_RECV,
	/* Active close: FIN sent. */
	TCP_FIN_WAIT1,
	/* Active close: the peer's ACK for our FIN has been received. */
	TCP_FIN_WAIT2,
	/* Active close: the peer's FIN has been received and ACKed. */
	TCP_TIME_WAIT,
	/* Initial state of a connection. */
	TCP_CLOSE,
	/* Passive close: the peer's FIN has been received and ACKed. */
	TCP_CLOSE_WAIT,
	/* Passive close: our own FIN sent after receiving the peer's FIN. */
	TCP_LAST_ACK,
	/* Listening. */
	TCP_LISTEN,
	/* Simultaneous close: the peer's FIN arrived after we sent our FIN. */
	TCP_CLOSING,	/* Now a valid state */

	TCP_MAX_STATES	/* Leave at the end! */
};

一、主动打开

1、TCP_CLOSE ---->TCP_SYN_SENT 复制代码

141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 142 { 143 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 144 struct inet_sock *inet = inet_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk); 146 __be16 orig_sport, orig_dport; 147 __be32 daddr, nexthop; 148 struct flowi4 *fl4; 149 struct rtable *rt; 150 int err; 151 struct ip_options_rcu inet_opt; 152 153 if (addr_len < sizeof(struct sockaddr_in)) 154 return -EINVAL; 155 156 if (usin->sin_family != AF_INET) 157 return -EAFNOSUPPORT; 158 159 nexthop = daddr = usin->sin_addr.s_addr; 160 inet_opt = rcu_dereference_protected(inet->inet_opt, 161 sock_owned_by_user(sk)); 162 if (inet_opt && inet_opt->opt.srr) { 163 if (!daddr) 164 return -EINVAL; 165 nexthop = inet_opt->opt.faddr; 166 } 167 168 orig_sport = inet->inet_sport; 169 orig_dport = usin->sin_port; 170 fl4 = &inet->cork.fl.u.ip4; 171 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 172 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 173 IPPROTO_TCP, 174 orig_sport, orig_dport, sk); 175 if (IS_ERR(rt)) { 176 err = PTR_ERR(rt); 177 if (err == -ENETUNREACH) 178 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 179 return err; 180 } 181 182 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 183 ip_rt_put(rt); 184 return -ENETUNREACH; 185 } 186 187 if (!inet_opt || !inet_opt->opt.srr) 188 daddr = fl4->daddr; 189 190 if (!inet->inet_saddr) 191 inet->inet_saddr = fl4->saddr; 192 sk_rcv_saddr_set(sk, inet->inet_saddr); 193 194 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 195 / Reset inherited state / 196 tp->rx_opt.ts_recent = 0; 197 tp->rx_opt.ts_recent_stamp = 0; 198 if (likely(!tp->repair)) 199 tp->write_seq = 0; 200 } 201 202 if (tcp_death_row.sysctl_tw_recycle && 203 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) 204 tcp_fetch_timewait_stamp(sk, &rt->dst); 205 206 inet->inet_dport = usin->sin_port; 207 sk_daddr_set(sk, daddr); 208 209 inet_csk(sk)->icsk_ext_hdr_len = 0; 210 if (inet_opt) 211 
inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 212 213 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 214 215 / Socket identity is still unknown (sport may be zero). 216 * However we set state to SYN-SENT and not releasing socket 217 * lock select source port, enter ourselves into the hash tables and 218 * complete initialization after this. 219 / 220 tcp_set_state(sk, TCP_SYN_SENT); 221 err = inet_hash_connect(&tcp_death_row, sk); 222 if (err) 223 goto failure; 224 225 inet_set_txhash(sk); 226 227 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 228 inet->inet_sport, inet->inet_dport, sk); 229 if (IS_ERR(rt)) { 230 err = PTR_ERR(rt); 231 rt = NULL; 232 goto failure; 233 } 234 / OK, now commit destination to socket. / 235 sk->sk_gso_type = SKB_GSO_TCPV4; 236 sk_setup_caps(sk, &rt->dst); 237 238 if (!tp->write_seq && likely(!tp->repair)) 239 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, 240 inet->inet_daddr, 241 inet->inet_sport, 242 usin->sin_port); 243 244 inet->inet_id = tp->write_seq ^ jiffies; 245 246 err = tcp_connect(sk); 247 248 rt = NULL; 249 if (err) 250 goto failure; 251 252 return 0; 253 254 failure: 255 / 256 * This unhashes the socket and releases the local port, 257 * if necessary. 258 */ 259 tcp_set_state(sk, TCP_CLOSE); 260 ip_rt_put(rt); 261 sk->sk_route_caps = 0; 262 inet->inet_dport = 0; 263 return err; 264 } 265 EXPORT_SYMBOL(tcp_v4_connect);

复制代码

2、TCP_SYN_SENT---->TCP_ESTABLISHED

复制代码

5434 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5435 struct tcphdr *th, unsigned len) 5436 { 5437 u8 *hash_location; 5438 struct inet_connection_sock *icsk = inet_csk(sk); 5439 struct tcp_sock *tp = tcp_sk(sk); 5440 struct tcp_cookie_values cvp = tp->cookie_values; 5441 int saved_clamp = tp->rx_opt.mss_clamp; 5442 5443 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); 5444 5445 if (th->ack) { 5446 / rfc793: 5447 * "If the state is SYN-SENT then 5448 * first check the ACK bit 5449 * If the ACK bit is set 5450 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send 5451 * a reset (unless the RST bit is set, if so drop 5452 * the segment and return)" 5453 * 5454 * We do not send data with SYN, so that RFC-correct 5455 * test reduces to: 5456 / 5457 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) 5458 goto reset_and_undo; 5459 5460 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 5461 !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, 5462 tcp_time_stamp)) { 5463 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); 5464 goto reset_and_undo; 5465 } 5466 5467 / Now ACK is acceptable. 5468 * 5469 * "If the RST bit is set 5470 * If the ACK was acceptable then signal the user "error: 5471 * connection reset", drop the segment, enter CLOSED state, 5472 * delete TCB, and return." 5473 / 5474 5475 if (th->rst) { 5476 tcp_reset(sk); 5477 goto discard; 5478 } 5479 5480 / rfc793: 5481 * "fifth, if neither of the SYN or RST bits is set then 5482 * drop the segment and return." 5483 * 5484 * See note below! 5485 * --ANK(990513) 5486 / 5487 if (!th->syn) 5488 goto discard_and_undo; 5489 5490 / rfc793: 5491 * "If the SYN bit is on ... 5492 * are acceptable then ... 5493 * (our SYN has been ACKed), change the connection 5494 * state to ESTABLISHED..." 5495 / 5496 5497 TCP_ECN_rcv_synack(tp, th); 5498 5499 tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 5500 tcp_ack(sk, skb, FLAG_SLOWPATH); 5501 5502 / Ok.. it's good. 
Set up sequence numbers and 5503 * move to established. 5504 / 5505 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; 5506 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; 5507 5508 / RFC1323: The window in SYN & SYN/ACK segments is 5509 * never scaled. 5510 / 5511 tp->snd_wnd = ntohs(th->window); 5512 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5513 5514 if (!tp->rx_opt.wscale_ok) { 5515 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; 5516 tp->window_clamp = min(tp->window_clamp, 65535U); 5517 } 5518 5519 if (tp->rx_opt.saw_tstamp) { 5520 tp->rx_opt.tstamp_ok = 1; 5521 tp->tcp_header_len = 5522 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 5523 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5524 tcp_store_ts_recent(tp); 5525 } else { 5526 tp->tcp_header_len = sizeof(struct tcphdr); 5527 } 5528 5529 if (tcp_is_sack(tp) && sysctl_tcp_fack) 5530 tcp_enable_fack(tp); 5531 5532 tcp_mtup_init(sk); 5533 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 5534 tcp_initialize_rcv_mss(sk); 5535 5536 / Remember, tcp_poll() does not lock socket! 5537 * Change state from SYN-SENT only after copied_seq 5538 * is initialized. / 5539 tp->copied_seq = tp->rcv_nxt; 5540 5541 if (cvp != NULL && 5542 cvp->cookie_pair_size > 0 && 5543 tp->rx_opt.cookie_plus > 0) { 5544 int cookie_size = tp->rx_opt.cookie_plus 5545 - TCPOLEN_COOKIE_BASE; 5546 int cookie_pair_size = cookie_size 5547 + cvp->cookie_desired; 5548 5549 / A cookie extension option was sent and returned. 5550 * Note that each incoming SYNACK replaces the 5551 * Responder cookie. The initial exchange is most 5552 * fragile, as protection against spoofing relies 5553 * entirely upon the sequence and timestamp (above). 5554 * This replacement strategy allows the correct pair to 5555 * pass through, while any others will be filtered via 5556 * Responder verification later. 
5557 / 5558 if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { 5559 memcpy(&cvp->cookie_pair[cvp->cookie_desired], 5560 hash_location, cookie_size); 5561 cvp->cookie_pair_size = cookie_pair_size; 5562 } 5563 } 5564 5565 smp_mb(); 5566 tcp_set_state(sk, TCP_ESTABLISHED); 5567 5568 security_inet_conn_established(sk, skb); 5569 5570 / Make sure socket is routed, for correct metrics. / 5571 icsk->icsk_af_ops->rebuild_header(sk); 5572 5573 tcp_init_metrics(sk); 5574 5575 tcp_init_congestion_control(sk); 5576 5577 / Prevent spurious tcp_cwnd_restart() on first data 5578 * packet. 5579 / 5580 tp->lsndtime = tcp_time_stamp; 5581 5582 tcp_init_buffer_space(sk); 5583 5584 if (sock_flag(sk, SOCK_KEEPOPEN)) 5585 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); 5586 5587 if (!tp->rx_opt.snd_wscale) 5588 __tcp_fast_path_on(tp, tp->snd_wnd); 5589 else 5590 tp->pred_flags = 0; 5591 5592 if (!sock_flag(sk, SOCK_DEAD)) { 5593 sk->sk_state_change(sk); 5594 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); 5595 } 5596 5597 if (sk->sk_write_pending || 5598 icsk->icsk_accept_queue.rskq_defer_accept || 5599 icsk->icsk_ack.pingpong) { 5600 / Save one ACK. Data will be ready after 5601 * several ticks, if write_pending is set. 5602 * 5603 * It may be deleted, but with this feature tcpdumps 5604 * look so wonderfully clever, that I was not able 5605 * to stand against the temptation 8) --ANK 5606 / 5607 inet_csk_schedule_ack(sk); 5608 icsk->icsk_ack.lrcvtime = tcp_time_stamp; 5609 icsk->icsk_ack.ato = TCP_ATO_MIN; 5610 tcp_incr_quickack(sk); 5611 tcp_enter_quickack_mode(sk); 5612 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5613 TCP_DELACK_MAX, TCP_RTO_MAX); 5614 5615 discard: 5616 __kfree_skb(skb); 5617 return 0; 5618 } else { 5619 tcp_send_ack(sk); 5620 } 5621 return -1; 5622 } 5623 5624 / No ACK in the segment / 5625 5626 if (th->rst) { 5627 / rfc793: 5628 * "If the RST bit is set 5629 * 5630 * Otherwise (no ACK) drop the segment and return." 
5631 / 5632 5633 goto discard_and_undo; 5634 } 5635 5636 / PAWS check. / 5637 if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && 5638 tcp_paws_reject(&tp->rx_opt, 0)) 5639 goto discard_and_undo; 5640 5641 if (th->syn) { 5642 / We see SYN without ACK. It is attempt of 5643 * simultaneous connect with crossed SYNs. 5644 * Particularly, it can be connect to self. 5645 / 5646 tcp_set_state(sk, TCP_SYN_RECV); 5647 5648 if (tp->rx_opt.saw_tstamp) { 5649 tp->rx_opt.tstamp_ok = 1; 5650 tcp_store_ts_recent(tp); 5651 tp->tcp_header_len = 5652 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 5653 } else { 5654 tp->tcp_header_len = sizeof(struct tcphdr); 5655 } 5656 5657 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; 5658 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; 5659 5660 / RFC1323: The window in SYN & SYN/ACK segments is 5661 * never scaled. 5662 / 5663 tp->snd_wnd = ntohs(th->window); 5664 tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 5665 tp->max_window = tp->snd_wnd; 5666 5667 TCP_ECN_rcv_syn(tp, th); 5668 5669 tcp_mtup_init(sk); 5670 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 5671 tcp_initialize_rcv_mss(sk); 5672 5673 tcp_send_synack(sk); 5674 #if 0 5675 / Note, we could accept data and URG from this segment. 5676 * There are no obstacles to make this. 5677 * 5678 * However, if we ignore data in ACKless segments sometimes, 5679 * we have no reasons to accept it sometimes. 5680 * Also, seems the code doing it in step6 of tcp_rcv_state_process 5681 * is not flawless. So, discard packet for sanity. 5682 * Uncomment this return to process the data. 5683 / 5684 return -1; 5685 #else 5686 goto discard; 5687 #endif 5688 } 5689 / "fifth, if neither of the SYN or RST bits is set then 5690 * drop the segment and return." 5691 */ 5692 5693 discard_and_undo: 5694 tcp_clear_options(&tp->rx_opt); 5695 tp->rx_opt.mss_clamp = saved_clamp; 5696 goto discard; 5697 5698 reset_and_undo: 5699 tcp_clear_options(&tp->rx_opt); 5700 tp->rx_opt.mss_clamp = saved_clamp; 5701 return 1; 5702 }

复制代码

二、被动打开

1、TCP_CLOSE ----> TCP_LISTEN 复制代码

794 int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) 795 { 796 struct inet_sock *inet = inet_sk(sk); 797 struct inet_connection_sock icsk = inet_csk(sk); 798 int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); 799 800 if (rc != 0) 801 return rc; 802 803 sk->sk_max_ack_backlog = 0; 804 sk->sk_ack_backlog = 0; 805 inet_csk_delack_init(sk); 806 807 / There is race window here: we announce ourselves listening, 808 * but this transition is still not validated by get_port(). 809 * It is OK, because this socket enters to hash table only 810 * after validation is complete. 811 */ 812 sk->sk_state = TCP_LISTEN; 813 if (!sk->sk_prot->get_port(sk, inet->inet_num)) { 814 inet->inet_sport = htons(inet->inet_num); 815 816 sk_dst_reset(sk); 817 sk->sk_prot->hash(sk); 818 819 return 0; 820 } 821 822 sk->sk_state = TCP_CLOSE; 823 __reqsk_queue_destroy(&icsk->icsk_accept_queue); 824 return -EADDRINUSE; 825 }

复制代码

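2、TCP_LISTEN ----> TCP_SYN_RECV 复制代码

(注:下面贴出的仍是 tcp_rcv_synsent_state_process,其中出现的 TCP_SYN_RECV 切换来自 TCP_SYN_SENT 状态(双方同时打开、SYN 交叉的情形),并不是从 TCP_LISTEN 出发的代码路径。)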

5434 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5435 struct tcphdr *th, unsigned len) 5436 { 5437 u8 *hash_location; 5438 struct inet_connection_sock *icsk = inet_csk(sk); 5439 struct tcp_sock *tp = tcp_sk(sk); 5440 struct tcp_cookie_values cvp = tp->cookie_values; 5441 int saved_clamp = tp->rx_opt.mss_clamp; 5442 5443 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); 5444 5445 if (th->ack) { 5446 / rfc793: 5447 * "If the state is SYN-SENT then 5448 * first check the ACK bit 5449 * If the ACK bit is set 5450 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send 5451 * a reset (unless the RST bit is set, if so drop 5452 * the segment and return)" 5453 * 5454 * We do not send data with SYN, so that RFC-correct 5455 * test reduces to: 5456 / 5457 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) 5458 goto reset_and_undo; 5459 5460 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 5461 !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, 5462 tcp_time_stamp)) { 5463 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); 5464 goto reset_and_undo; 5465 } 5466 5467 / Now ACK is acceptable. 5468 * 5469 * "If the RST bit is set 5470 * If the ACK was acceptable then signal the user "error: 5471 * connection reset", drop the segment, enter CLOSED state, 5472 * delete TCB, and return." 5473 / 5474 5475 if (th->rst) { 5476 tcp_reset(sk); 5477 goto discard; 5478 } 5479 5480 / rfc793: 5481 * "fifth, if neither of the SYN or RST bits is set then 5482 * drop the segment and return." 5483 * 5484 * See note below! 5485 * --ANK(990513) 5486 / 5487 if (!th->syn) 5488 goto discard_and_undo; 5489 5490 / rfc793: 5491 * "If the SYN bit is on ... 5492 * are acceptable then ... 5493 * (our SYN has been ACKed), change the connection 5494 * state to ESTABLISHED..." 5495 / 5496 5497 TCP_ECN_rcv_synack(tp, th); 5498 5499 tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 5500 tcp_ack(sk, skb, FLAG_SLOWPATH); 5501 5502 / Ok.. it's good. 
Set up sequence numbers and 5503 * move to established. 5504 / 5505 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; 5506 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; 5507 5508 / RFC1323: The window in SYN & SYN/ACK segments is 5509 * never scaled. 5510 / 5511 tp->snd_wnd = ntohs(th->window); 5512 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5513 5514 if (!tp->rx_opt.wscale_ok) { 5515 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; 5516 tp->window_clamp = min(tp->window_clamp, 65535U); 5517 } 5518 5519 if (tp->rx_opt.saw_tstamp) { 5520 tp->rx_opt.tstamp_ok = 1; 5521 tp->tcp_header_len = 5522 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 5523 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5524 tcp_store_ts_recent(tp); 5525 } else { 5526 tp->tcp_header_len = sizeof(struct tcphdr); 5527 } 5528 5529 if (tcp_is_sack(tp) && sysctl_tcp_fack) 5530 tcp_enable_fack(tp); 5531 5532 tcp_mtup_init(sk); 5533 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 5534 tcp_initialize_rcv_mss(sk); 5535 5536 / Remember, tcp_poll() does not lock socket! 5537 * Change state from SYN-SENT only after copied_seq 5538 * is initialized. / 5539 tp->copied_seq = tp->rcv_nxt; 5540 5541 if (cvp != NULL && 5542 cvp->cookie_pair_size > 0 && 5543 tp->rx_opt.cookie_plus > 0) { 5544 int cookie_size = tp->rx_opt.cookie_plus 5545 - TCPOLEN_COOKIE_BASE; 5546 int cookie_pair_size = cookie_size 5547 + cvp->cookie_desired; 5548 5549 / A cookie extension option was sent and returned. 5550 * Note that each incoming SYNACK replaces the 5551 * Responder cookie. The initial exchange is most 5552 * fragile, as protection against spoofing relies 5553 * entirely upon the sequence and timestamp (above). 5554 * This replacement strategy allows the correct pair to 5555 * pass through, while any others will be filtered via 5556 * Responder verification later. 
5557 / 5558 if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { 5559 memcpy(&cvp->cookie_pair[cvp->cookie_desired], 5560 hash_location, cookie_size); 5561 cvp->cookie_pair_size = cookie_pair_size; 5562 } 5563 } 5564 5565 smp_mb(); 5566 tcp_set_state(sk, TCP_ESTABLISHED); 5567 5568 security_inet_conn_established(sk, skb); 5569 5570 / Make sure socket is routed, for correct metrics. / 5571 icsk->icsk_af_ops->rebuild_header(sk); 5572 5573 tcp_init_metrics(sk); 5574 5575 tcp_init_congestion_control(sk); 5576 5577 / Prevent spurious tcp_cwnd_restart() on first data 5578 * packet. 5579 / 5580 tp->lsndtime = tcp_time_stamp; 5581 5582 tcp_init_buffer_space(sk); 5583 5584 if (sock_flag(sk, SOCK_KEEPOPEN)) 5585 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); 5586 5587 if (!tp->rx_opt.snd_wscale) 5588 __tcp_fast_path_on(tp, tp->snd_wnd); 5589 else 5590 tp->pred_flags = 0; 5591 5592 if (!sock_flag(sk, SOCK_DEAD)) { 5593 sk->sk_state_change(sk); 5594 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); 5595 } 5596 5597 if (sk->sk_write_pending || 5598 icsk->icsk_accept_queue.rskq_defer_accept || 5599 icsk->icsk_ack.pingpong) { 5600 / Save one ACK. Data will be ready after 5601 * several ticks, if write_pending is set. 5602 * 5603 * It may be deleted, but with this feature tcpdumps 5604 * look so wonderfully clever, that I was not able 5605 * to stand against the temptation 8) --ANK 5606 / 5607 inet_csk_schedule_ack(sk); 5608 icsk->icsk_ack.lrcvtime = tcp_time_stamp; 5609 icsk->icsk_ack.ato = TCP_ATO_MIN; 5610 tcp_incr_quickack(sk); 5611 tcp_enter_quickack_mode(sk); 5612 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5613 TCP_DELACK_MAX, TCP_RTO_MAX); 5614 5615 discard: 5616 __kfree_skb(skb); 5617 return 0; 5618 } else { 5619 tcp_send_ack(sk); 5620 } 5621 return -1; 5622 } 5623 5624 / No ACK in the segment / 5625 5626 if (th->rst) { 5627 / rfc793: 5628 * "If the RST bit is set 5629 * 5630 * Otherwise (no ACK) drop the segment and return." 
5631 / 5632 5633 goto discard_and_undo; 5634 } 5635 5636 / PAWS check. / 5637 if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && 5638 tcp_paws_reject(&tp->rx_opt, 0)) 5639 goto discard_and_undo; 5640 5641 if (th->syn) { 5642 / We see SYN without ACK. It is attempt of 5643 * simultaneous connect with crossed SYNs. 5644 * Particularly, it can be connect to self. 5645 / 5646 tcp_set_state(sk, TCP_SYN_RECV); 5647 5648 if (tp->rx_opt.saw_tstamp) { 5649 tp->rx_opt.tstamp_ok = 1; 5650 tcp_store_ts_recent(tp); 5651 tp->tcp_header_len = 5652 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 5653 } else { 5654 tp->tcp_header_len = sizeof(struct tcphdr); 5655 } 5656 5657 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; 5658 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; 5659 5660 / RFC1323: The window in SYN & SYN/ACK segments is 5661 * never scaled. 5662 / 5663 tp->snd_wnd = ntohs(th->window); 5664 tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 5665 tp->max_window = tp->snd_wnd; 5666 5667 TCP_ECN_rcv_syn(tp, th); 5668 5669 tcp_mtup_init(sk); 5670 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 5671 tcp_initialize_rcv_mss(sk); 5672 5673 tcp_send_synack(sk); 5674 #if 0 5675 / Note, we could accept data and URG from this segment. 5676 * There are no obstacles to make this. 5677 * 5678 * However, if we ignore data in ACKless segments sometimes, 5679 * we have no reasons to accept it sometimes. 5680 * Also, seems the code doing it in step6 of tcp_rcv_state_process 5681 * is not flawless. So, discard packet for sanity. 5682 * Uncomment this return to process the data. 5683 / 5684 return -1; 5685 #else 5686 goto discard; 5687 #endif 5688 } 5689 / "fifth, if neither of the SYN or RST bits is set then 5690 * drop the segment and return." 5691 */ 5692 5693 discard_and_undo: 5694 tcp_clear_options(&tp->rx_opt); 5695 tp->rx_opt.mss_clamp = saved_clamp; 5696 goto discard; 5697 5698 reset_and_undo: 5699 tcp_clear_options(&tp->rx_opt); 5700 tp->rx_opt.mss_clamp = saved_clamp; 5701 return 1; 5702 }

复制代码

3、TCP_SYN_RECV ----> TCP_ESTABLISHED 复制代码

5711 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, 5712 struct tcphdr *th, unsigned len) 5713 { 5714 struct tcp_sock *tp = tcp_sk(sk); 5715 struct inet_connection_sock icsk = inet_csk(sk); 5716 int queued = 0; 5717 int res; 5718 5719 tp->rx_opt.saw_tstamp = 0; 5720 5721 switch (sk->sk_state) { 5722 case TCP_CLOSE: 5723 goto discard; 5724 5725 case TCP_LISTEN: 5726 if (th->ack) 5727 return 1; 5728 5729 if (th->rst) 5730 goto discard; 5731 5732 if (th->syn) { 5733 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) 5734 return 1; 5735 5736 / Now we have several options: In theory there is 5737 * nothing else in the frame. KA9Q has an option to 5738 * send data with the syn, BSD accepts data with the 5739 * syn up to the [to be] advertised window and 5740 * Solaris 2.1 gives you a protocol error. For now 5741 * we just ignore it, that fits the spec precisely 5742 * and avoids incompatibilities. It would be nice in 5743 * future to drop through and process the data. 5744 * 5745 * Now that TTCP is starting to be used we ought to 5746 * queue this data. 5747 * But, this leaves one open to an easy denial of 5748 * service attack, and SYN cookies can't defend 5749 * against this problem. So, we drop the data 5750 * in the interest of security over speed unless 5751 * it's still in use. 5752 / 5753 kfree_skb(skb); 5754 return 0; 5755 } 5756 goto discard; 5757 5758 case TCP_SYN_SENT: 5759 queued = tcp_rcv_synsent_state_process(sk, skb, th, len); 5760 if (queued >= 0) 5761 return queued; 5762 5763 / Do step6 onward by hand. 
/ 5764 tcp_urg(sk, skb, th); 5765 __kfree_skb(skb); 5766 tcp_data_snd_check(sk); 5767 return 0; 5768 } 5769 5770 res = tcp_validate_incoming(sk, skb, th, 0); 5771 if (res <= 0) 5772 return -res; 5773 5774 / step 5: check the ACK field / 5775 if (th->ack) { 5776 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; 5777 5778 switch (sk->sk_state) { 5779 case TCP_SYN_RECV: 5780 if (acceptable) { 5781 tp->copied_seq = tp->rcv_nxt; 5782 smp_mb(); 5783 tcp_set_state(sk, TCP_ESTABLISHED); 5784 sk->sk_state_change(sk); 5785 5786 / Note, that this wakeup is only for marginal 5787 * crossed SYN case. Passively open sockets 5788 * are not waked up, because sk->sk_sleep == 5789 * NULL and sk->sk_socket == NULL. 5790 / 5791 if (sk->sk_socket) 5792 sk_wake_async(sk, 5793 SOCK_WAKE_IO, POLL_OUT); 5794 5795 tp->snd_una = TCP_SKB_CB(skb)->ack_seq; 5796 tp->snd_wnd = ntohs(th->window) << 5797 tp->rx_opt.snd_wscale; 5798 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5799 5800 / tcp_ack considers this ACK as duplicate 5801 * and does not calculate rtt. 5802 * Force it here. 5803 / 5804 tcp_ack_update_rtt(sk, 0, 0); 5805 5806 if (tp->rx_opt.tstamp_ok) 5807 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5808 5809 / Make sure socket is routed, for 5810 * correct metrics. 5811 / 5812 icsk->icsk_af_ops->rebuild_header(sk); 5813 5814 tcp_init_metrics(sk); 5815 5816 tcp_init_congestion_control(sk); 5817 5818 / Prevent spurious tcp_cwnd_restart() on 5819 * first data packet. 
5820 / 5821 tp->lsndtime = tcp_time_stamp; 5822 5823 tcp_mtup_init(sk); 5824 tcp_initialize_rcv_mss(sk); 5825 tcp_init_buffer_space(sk); 5826 tcp_fast_path_on(tp); 5827 } else { 5828 return 1; 5829 } 5830 break; 5831 5832 case TCP_FIN_WAIT1: 5833 if (tp->snd_una == tp->write_seq) { 5834 tcp_set_state(sk, TCP_FIN_WAIT2); 5835 sk->sk_shutdown |= SEND_SHUTDOWN; 5836 dst_confirm(sk->sk_dst_cache); 5837 5838 if (!sock_flag(sk, SOCK_DEAD)) 5839 / Wake up lingering close() / 5840 sk->sk_state_change(sk); 5841 else { 5842 int tmo; 5843 5844 if (tp->linger2 < 0 || 5845 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && 5846 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { 5847 tcp_done(sk); 5848 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 5849 return 1; 5850 } 5851 5852 tmo = tcp_fin_time(sk); 5853 if (tmo > TCP_TIMEWAIT_LEN) { 5854 inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); 5855 } else if (th->fin || sock_owned_by_user(sk)) { 5856 / Bad case. We could lose such FIN otherwise. 5857 * It is not a big problem, but it looks confusing 5858 * and not so rare event. We still can lose it now, 5859 * if it spins in bh_lock_sock(), but it is really 5860 * marginal case. 
5861 / 5862 inet_csk_reset_keepalive_timer(sk, tmo); 5863 } else { 5864 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); 5865 goto discard; 5866 } 5867 } 5868 } 5869 break; 5870 5871 case TCP_CLOSING: 5872 if (tp->snd_una == tp->write_seq) { 5873 tcp_time_wait(sk, TCP_TIME_WAIT, 0); 5874 goto discard; 5875 } 5876 break; 5877 5878 case TCP_LAST_ACK: 5879 if (tp->snd_una == tp->write_seq) { 5880 tcp_update_metrics(sk); 5881 tcp_done(sk); 5882 goto discard; 5883 } 5884 break; 5885 } 5886 } else 5887 goto discard; 5888 5889 / step 6: check the URG bit / 5890 tcp_urg(sk, skb, th); 5891 5892 / step 7: process the segment text / 5893 switch (sk->sk_state) { 5894 case TCP_CLOSE_WAIT: 5895 case TCP_CLOSING: 5896 case TCP_LAST_ACK: 5897 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 5898 break; 5899 case TCP_FIN_WAIT1: 5900 case TCP_FIN_WAIT2: 5901 / RFC 793 says to queue data in these states, 5902 * RFC 1122 says we MUST send a reset. 5903 * BSD 4.4 also does reset. 5904 / 5905 if (sk->sk_shutdown & RCV_SHUTDOWN) { 5906 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && 5907 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { 5908 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 5909 tcp_reset(sk); 5910 return 1; 5911 } 5912 } 5913 / Fall through / 5914 case TCP_ESTABLISHED: 5915 tcp_data_queue(sk, skb); 5916 queued = 1; 5917 break; 5918 } 5919 5920 / tcp_data could move socket to TIME-WAIT */ 5921 if (sk->sk_state != TCP_CLOSE) { 5922 tcp_data_snd_check(sk); 5923 tcp_ack_snd_check(sk); 5924 } 5925 5926 if (!queued) { 5927 discard: 5928 __kfree_skb(skb); 5929 } 5930 return 0; 5931 }

复制代码

三、主动关闭

1、TCP_ESTABLISHED ----> TCP_FIN_WAIT1 复制代码

2008 void tcp_close(struct sock sk, long timeout) 2009 { 2010 struct sk_buff skb; 2011 int data_was_unread = 0; 2012 int state; 2013 2014 lock_sock(sk); 2015 sk->sk_shutdown = SHUTDOWN_MASK; 2016 2017 if (sk->sk_state == TCP_LISTEN) { 2018 tcp_set_state(sk, TCP_CLOSE); 2019 2020 / Special case. / 2021 inet_csk_listen_stop(sk); 2022 2023 goto adjudge_to_death; 2024 } 2025 2026 / We need to flush the recv. buffs. We do this only on the 2027 * descriptor close, not protocol-sourced closes, because the 2028 * reader process may not have drained the data yet! 2029 / 2030 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { 2031 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; 2032 2033 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 2034 len--; 2035 data_was_unread += len; 2036 __kfree_skb(skb); 2037 } 2038 2039 sk_mem_reclaim(sk); 2040 2041 / If socket has been already reset (e.g. in tcp_reset()) - kill it. / 2042 if (sk->sk_state == TCP_CLOSE) 2043 goto adjudge_to_death; 2044 2045 / As outlined in RFC 2525, section 2.17, we send a RST here because 2046 * data was lost. To witness the awful effects of the old behavior of 2047 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk 2048 * GET in an FTP client, suspend the process, wait for the client to 2049 * advertise a zero window, then kill -9 the FTP client, wheee... 2050 * Note: timeout is always zero in such a case. 2051 / 2052 if (unlikely(tcp_sk(sk)->repair)) { 2053 sk->sk_prot->disconnect(sk, 0); 2054 } else if (data_was_unread) { 2055 / Unread data was tossed, zap the connection. / 2056 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); 2057 tcp_set_state(sk, TCP_CLOSE); 2058 tcp_send_active_reset(sk, sk->sk_allocation); 2059 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { 2060 / Check zero linger after checking for unread data. 
/ 2061 sk->sk_prot->disconnect(sk, 0); 2062 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 2063 } else if (tcp_close_state(sk)) { 2064 / We FIN if the application ate all the data before 2065 * zapping the connection. 2066 / 2067 2068 / RED-PEN. Formally speaking, we have broken TCP state 2069 * machine. State transitions: 2070 * 2071 * TCP_ESTABLISHED -> TCP_FIN_WAIT1 2072 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) 2073 * TCP_CLOSE_WAIT -> TCP_LAST_ACK 2074 * 2075 * are legal only when FIN has been sent (i.e. in window), 2076 * rather than queued out of window. Purists blame. 2077 * 2078 * F.e. "RFC state" is ESTABLISHED, 2079 * if Linux state is FIN-WAIT-1, but FIN is still not sent. 2080 * 2081 * The visible declinations are that sometimes 2082 * we enter time-wait state, when it is not required really 2083 * (harmless), do not send active resets, when they are 2084 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when 2085 * they look as CLOSING or LAST_ACK for Linux) 2086 * Probably, I missed some more holelets. 2087 * --ANK 2088 * XXX (TFO) - To start off we don't support SYN+ACK+FIN 2089 * in a single packet! (May consider it later but will 2090 * probably need API support or TCP_CORK SYN-ACK until 2091 * data is written and socket is closed.) 2092 / 2093 tcp_send_fin(sk); 2094 } 2095 2096 sk_stream_wait_close(sk, timeout); 2097 2098 adjudge_to_death: 2099 state = sk->sk_state; 2100 sock_hold(sk); 2101 sock_orphan(sk); 2102 2103 / It is the last release_sock in its life. It will remove backlog. / 2104 release_sock(sk); 2105 2106 2107 / Now socket is owned by kernel and we acquire BH lock 2108 to finish close. No need to check for user refs. 2109 / 2110 local_bh_disable(); 2111 bh_lock_sock(sk); 2112 WARN_ON(sock_owned_by_user(sk)); 2113 2114 percpu_counter_inc(sk->sk_prot->orphan_count); 2115 2116 / Have we already been destroyed by a softirq or backlog? 
/ 2117 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) 2118 goto out; 2119 2120 / This is a (useful) BSD violating of the RFC. There is a 2121 * problem with TCP as specified in that the other end could 2122 * keep a socket open forever with no application left this end. 2123 * We use a 1 minute timeout (about the same as BSD) then kill 2124 * our end. If they send after that then tough - BUT: long enough 2125 * that we won't make the old 4rto = almost no time - whoops 2126 * reset mistake. 2127 * 2128 * Nope, it was not mistake. It is really desired behaviour 2129 * f.e. on http servers, when such sockets are useless, but 2130 * consume significant resources. Let's do it with special 2131 * linger2 option. --ANK 2132 */ 2133 2134 if (sk->sk_state == TCP_FIN_WAIT2) { 2135 struct tcp_sock *tp = tcp_sk(sk); 2136 if (tp->linger2 < 0) { 2137 tcp_set_state(sk, TCP_CLOSE); 2138 tcp_send_active_reset(sk, GFP_ATOMIC); 2139 NET_INC_STATS_BH(sock_net(sk), 2140 LINUX_MIB_TCPABORTONLINGER); 2141 } else { 2142 const int tmo = tcp_fin_time(sk); 2143 2144 if (tmo > TCP_TIMEWAIT_LEN) { 2145 inet_csk_reset_keepalive_timer(sk, 2146 tmo - TCP_TIMEWAIT_LEN); 2147 } else { 2148 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); 2149 goto out; 2150 } 2151 } 2152 } 2153 if (sk->sk_state != TCP_CLOSE) { 2154 sk_mem_reclaim(sk); 2155 if (tcp_check_oom(sk, 0)) { 2156 tcp_set_state(sk, TCP_CLOSE); 2157 tcp_send_active_reset(sk, GFP_ATOMIC); 2158 NET_INC_STATS_BH(sock_net(sk), 2159 LINUX_MIB_TCPABORTONMEMORY); 2160 } 2161 } 2162 2163 if (sk->sk_state == TCP_CLOSE) { 2164 struct request_sock req = tcp_sk(sk)->fastopen_rsk; 2165 / We could get here with a non-NULL req if the socket is 2166 * aborted (e.g., closed with unread data) before 3WHS 2167 * finishes. 2168 / 2169 if (req) 2170 reqsk_fastopen_remove(sk, req, false); 2171 inet_csk_destroy_sock(sk); 2172 } 2173 / Otherwise, socket is reprieved until protocol close. 
*/ 2174 2175 out: 2176 bh_unlock_sock(sk); 2177 local_bh_enable(); 2178 sock_put(sk); 2179 } 2180 EXPORT_SYMBOL(tcp_close);

1959 static int tcp_close_state(struct sock *sk) 1960 { 1961 int next = (int)new_state[sk->sk_state]; 1962 int ns = next & TCP_STATE_MASK; 1963 1964 tcp_set_state(sk, ns); 1965 1966 return next & TCP_ACTION_FIN; 1967 }

复制代码

复制代码

1974 void tcp_shutdown(struct sock sk, int how) 1975 { 1976 / We need to grab some memory, and put together a FIN, 1977 * and then put it into the queue to be sent. 1978 * Tim MacKenzie([email protected]) 4 Dec '92. 1979 / 1980 if (!(how & SEND_SHUTDOWN)) 1981 return; 1982 1983 / If we've already sent a FIN, or it's a closed state, skip this. / 1984 if ((1 << sk->sk_state) & 1985 (TCPF_ESTABLISHED | TCPF_SYN_SENT | 1986 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { 1987 / Clear out any half completed packets. FIN if needed. */ 1988 if (tcp_close_state(sk)) 1989 tcp_send_fin(sk); 1990 } 1991 }

1959 static int tcp_close_state(struct sock *sk) 1960 { 1961 int next = (int)new_state[sk->sk_state]; 1962 int ns = next & TCP_STATE_MASK; 1963 1964 tcp_set_state(sk, ns); 1965 1966 return next & TCP_ACTION_FIN; 1967 }

复制代码

2、TCP_FIN_WAIT1---->TCP_FIN_WAIT2

复制代码

5711 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, 5712 struct tcphdr *th, unsigned len) 5713 { 5714 struct tcp_sock *tp = tcp_sk(sk); 5715 struct inet_connection_sock icsk = inet_csk(sk); 5716 int queued = 0; 5717 int res; 5718 5719 tp->rx_opt.saw_tstamp = 0; 5720 5721 switch (sk->sk_state) { 5722 case TCP_CLOSE: 5723 goto discard; 5724 5725 case TCP_LISTEN: 5726 if (th->ack) 5727 return 1; 5728 5729 if (th->rst) 5730 goto discard; 5731 5732 if (th->syn) { 5733 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) 5734 return 1; 5735 5736 / Now we have several options: In theory there is 5737 * nothing else in the frame. KA9Q has an option to 5738 * send data with the syn, BSD accepts data with the 5739 * syn up to the [to be] advertised window and 5740 * Solaris 2.1 gives you a protocol error. For now 5741 * we just ignore it, that fits the spec precisely 5742 * and avoids incompatibilities. It would be nice in 5743 * future to drop through and process the data. 5744 * 5745 * Now that TTCP is starting to be used we ought to 5746 * queue this data. 5747 * But, this leaves one open to an easy denial of 5748 * service attack, and SYN cookies can't defend 5749 * against this problem. So, we drop the data 5750 * in the interest of security over speed unless 5751 * it's still in use. 5752 / 5753 kfree_skb(skb); 5754 return 0; 5755 } 5756 goto discard; 5757 5758 case TCP_SYN_SENT: 5759 queued = tcp_rcv_synsent_state_process(sk, skb, th, len); 5760 if (queued >= 0) 5761 return queued; 5762 5763 / Do step6 onward by hand. 
/ 5764 tcp_urg(sk, skb, th); 5765 __kfree_skb(skb); 5766 tcp_data_snd_check(sk); 5767 return 0; 5768 } 5769 5770 res = tcp_validate_incoming(sk, skb, th, 0); 5771 if (res <= 0) 5772 return -res; 5773 5774 / step 5: check the ACK field / 5775 if (th->ack) { 5776 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; 5777 5778 switch (sk->sk_state) { 5779 case TCP_SYN_RECV: 5780 if (acceptable) { 5781 tp->copied_seq = tp->rcv_nxt; 5782 smp_mb(); 5783 tcp_set_state(sk, TCP_ESTABLISHED); 5784 sk->sk_state_change(sk); 5785 5786 / Note, that this wakeup is only for marginal 5787 * crossed SYN case. Passively open sockets 5788 * are not waked up, because sk->sk_sleep == 5789 * NULL and sk->sk_socket == NULL. 5790 / 5791 if (sk->sk_socket) 5792 sk_wake_async(sk, 5793 SOCK_WAKE_IO, POLL_OUT); 5794 5795 tp->snd_una = TCP_SKB_CB(skb)->ack_seq; 5796 tp->snd_wnd = ntohs(th->window) << 5797 tp->rx_opt.snd_wscale; 5798 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5799 5800 / tcp_ack considers this ACK as duplicate 5801 * and does not calculate rtt. 5802 * Force it here. 5803 / 5804 tcp_ack_update_rtt(sk, 0, 0); 5805 5806 if (tp->rx_opt.tstamp_ok) 5807 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5808 5809 / Make sure socket is routed, for 5810 * correct metrics. 5811 / 5812 icsk->icsk_af_ops->rebuild_header(sk); 5813 5814 tcp_init_metrics(sk); 5815 5816 tcp_init_congestion_control(sk); 5817 5818 / Prevent spurious tcp_cwnd_restart() on 5819 * first data packet. 
5820 / 5821 tp->lsndtime = tcp_time_stamp; 5822 5823 tcp_mtup_init(sk); 5824 tcp_initialize_rcv_mss(sk); 5825 tcp_init_buffer_space(sk); 5826 tcp_fast_path_on(tp); 5827 } else { 5828 return 1; 5829 } 5830 break; 5831 5832 case TCP_FIN_WAIT1: 5833 if (tp->snd_una == tp->write_seq) { 5834 tcp_set_state(sk, TCP_FIN_WAIT2); 5835 sk->sk_shutdown |= SEND_SHUTDOWN; 5836 dst_confirm(sk->sk_dst_cache); 5837 5838 if (!sock_flag(sk, SOCK_DEAD)) 5839 / Wake up lingering close() / 5840 sk->sk_state_change(sk); 5841 else { 5842 int tmo; 5843 5844 if (tp->linger2 < 0 || 5845 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && 5846 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { 5847 tcp_done(sk); 5848 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 5849 return 1; 5850 } 5851 5852 tmo = tcp_fin_time(sk); 5853 if (tmo > TCP_TIMEWAIT_LEN) { 5854 inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); 5855 } else if (th->fin || sock_owned_by_user(sk)) { 5856 / Bad case. We could lose such FIN otherwise. 5857 * It is not a big problem, but it looks confusing 5858 * and not so rare event. We still can lose it now, 5859 * if it spins in bh_lock_sock(), but it is really 5860 * marginal case. 
5861 / 5862 inet_csk_reset_keepalive_timer(sk, tmo); 5863 } else { 5864 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); 5865 goto discard; 5866 } 5867 } 5868 } 5869 break; 5870 5871 case TCP_CLOSING: 5872 if (tp->snd_una == tp->write_seq) { 5873 tcp_time_wait(sk, TCP_TIME_WAIT, 0); 5874 goto discard; 5875 } 5876 break; 5877 5878 case TCP_LAST_ACK: 5879 if (tp->snd_una == tp->write_seq) { 5880 tcp_update_metrics(sk); 5881 tcp_done(sk); 5882 goto discard; 5883 } 5884 break; 5885 } 5886 } else 5887 goto discard; 5888 5889 / step 6: check the URG bit / 5890 tcp_urg(sk, skb, th); 5891 5892 / step 7: process the segment text / 5893 switch (sk->sk_state) { 5894 case TCP_CLOSE_WAIT: 5895 case TCP_CLOSING: 5896 case TCP_LAST_ACK: 5897 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 5898 break; 5899 case TCP_FIN_WAIT1: 5900 case TCP_FIN_WAIT2: 5901 / RFC 793 says to queue data in these states, 5902 * RFC 1122 says we MUST send a reset. 5903 * BSD 4.4 also does reset. 5904 / 5905 if (sk->sk_shutdown & RCV_SHUTDOWN) { 5906 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && 5907 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { 5908 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 5909 tcp_reset(sk); 5910 return 1; 5911 } 5912 } 5913 / Fall through / 5914 case TCP_ESTABLISHED: 5915 tcp_data_queue(sk, skb); 5916 queued = 1; 5917 break; 5918 } 5919 5920 / tcp_data could move socket to TIME-WAIT */ 5921 if (sk->sk_state != TCP_CLOSE) { 5922 tcp_data_snd_check(sk); 5923 tcp_ack_snd_check(sk); 5924 } 5925 5926 if (!queued) { 5927 discard: 5928 __kfree_skb(skb); 5929 } 5930 return 0; 5931 } 5932

复制代码

3、TCP_FIN_WAIT2 ----> TCP_TIME_WAIT

4064 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) 4065 { 4066 struct tcp_sock tp = tcp_sk(sk); 4067 4068 inet_csk_schedule_ack(sk); 4069 4070 sk->sk_shutdown |= RCV_SHUTDOWN; 4071 sock_set_flag(sk, SOCK_DONE); 4072 4073 switch (sk->sk_state) { 4074 case TCP_SYN_RECV: 4075 case TCP_ESTABLISHED: 4076 / Move to CLOSE_WAIT / 4077 tcp_set_state(sk, TCP_CLOSE_WAIT); 4078 inet_csk(sk)->icsk_ack.pingpong = 1; 4079 break; 4080 4081 case TCP_CLOSE_WAIT: 4082 case TCP_CLOSING: 4083 / Received a retransmission of the FIN, do 4084 * nothing. 4085 / 4086 break; 4087 case TCP_LAST_ACK: 4088 / RFC793: Remain in the LAST-ACK state. / 4089 break; 4090 4091 case TCP_FIN_WAIT1: 4092 / This case occurs when a simultaneous close 4093 * happens, we must ack the received FIN and 4094 * enter the CLOSING state. 4095 / 4096 tcp_send_ack(sk); 4097 tcp_set_state(sk, TCP_CLOSING); 4098 break; 4099 case TCP_FIN_WAIT2: 4100 / Received a FIN -- send ACK and enter TIME_WAIT. / 4101 tcp_send_ack(sk); 4102 tcp_time_wait(sk, TCP_TIME_WAIT, 0); 4103 break; 4104 default: 4105 / Only TCP_LISTEN and TCP_CLOSE are left, in these 4106 * cases we should never reach this piece of code. 4107 / 4108 printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", 4109 func, sk->sk_state); 4110 break; 4111 } 4112 4113 / It is possible, that we have something out-of-order after FIN. 4114 * Probably, we should reset in this case. For now drop them. 4115 / 4116 __skb_queue_purge(&tp->out_of_order_queue); 4117 if (tcp_is_sack(tp)) 4118 tcp_sack_reset(&tp->rx_opt); 4119 sk_mem_reclaim(sk); 4120 4121 if (!sock_flag(sk, SOCK_DEAD)) { 4122 sk->sk_state_change(sk); 4123 4124 / Do not send POLL_HUP for half duplex close. */ 4125 if (sk->sk_shutdown == SHUTDOWN_MASK || 4126 sk->sk_state == TCP_CLOSE) 4127 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); 4128 else 4129 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 4130 } 4131 } 4132

复制代码

4、TCP_TIME_WAIT ---->TCP_CLOSE

四、被动关闭

1、TCP_ESTABLISHED ----> TCP_CLOSE_WAIT

4064 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) 4065 { 4066 struct tcp_sock tp = tcp_sk(sk); 4067 4068 inet_csk_schedule_ack(sk); 4069 4070 sk->sk_shutdown |= RCV_SHUTDOWN; 4071 sock_set_flag(sk, SOCK_DONE); 4072 4073 switch (sk->sk_state) { 4074 case TCP_SYN_RECV: 4075 case TCP_ESTABLISHED: 4076 / Move to CLOSE_WAIT / 4077 tcp_set_state(sk, TCP_CLOSE_WAIT); 4078 inet_csk(sk)->icsk_ack.pingpong = 1; 4079 break; 4080 4081 case TCP_CLOSE_WAIT: 4082 case TCP_CLOSING: 4083 / Received a retransmission of the FIN, do 4084 * nothing. 4085 / 4086 break; 4087 case TCP_LAST_ACK: 4088 / RFC793: Remain in the LAST-ACK state. / 4089 break; 4090 4091 case TCP_FIN_WAIT1: 4092 / This case occurs when a simultaneous close 4093 * happens, we must ack the received FIN and 4094 * enter the CLOSING state. 4095 / 4096 tcp_send_ack(sk); 4097 tcp_set_state(sk, TCP_CLOSING); 4098 break; 4099 case TCP_FIN_WAIT2: 4100 / Received a FIN -- send ACK and enter TIME_WAIT. / 4101 tcp_send_ack(sk); 4102 tcp_time_wait(sk, TCP_TIME_WAIT, 0); 4103 break; 4104 default: 4105 / Only TCP_LISTEN and TCP_CLOSE are left, in these 4106 * cases we should never reach this piece of code. 4107 / 4108 printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", 4109 func, sk->sk_state); 4110 break; 4111 } 4112 4113 / It is possible, that we have something out-of-order after FIN. 4114 * Probably, we should reset in this case. For now drop them. 4115 / 4116 __skb_queue_purge(&tp->out_of_order_queue); 4117 if (tcp_is_sack(tp)) 4118 tcp_sack_reset(&tp->rx_opt); 4119 sk_mem_reclaim(sk); 4120 4121 if (!sock_flag(sk, SOCK_DEAD)) { 4122 sk->sk_state_change(sk); 4123 4124 / Do not send POLL_HUP for half duplex close. */ 4125 if (sk->sk_shutdown == SHUTDOWN_MASK || 4126 sk->sk_state == TCP_CLOSE) 4127 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); 4128 else 4129 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 4130 } 4131 } 4132

复制代码

2、TCP_CLOSE_WAIT ----> TCP_LAST_ACK

1974 void tcp_shutdown(struct sock sk, int how) 1975 { 1976 / We need to grab some memory, and put together a FIN, 1977 * and then put it into the queue to be sent. 1978 * Tim MacKenzie([email protected]) 4 Dec '92. 1979 / 1980 if (!(how & SEND_SHUTDOWN)) 1981 return; 1982 1983 / If we've already sent a FIN, or it's a closed state, skip this. / 1984 if ((1 << sk->sk_state) & 1985 (TCPF_ESTABLISHED | TCPF_SYN_SENT | 1986 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { 1987 / Clear out any half completed packets. FIN if needed. */ 1988 if (tcp_close_state(sk)) 1989 tcp_send_fin(sk); 1990 } 1991 }

1959 static int tcp_close_state(struct sock *sk) 1960 { 1961 int next = (int)new_state[sk->sk_state]; 1962 int ns = next & TCP_STATE_MASK; 1963 1964 tcp_set_state(sk, ns); 1965 1966 return next & TCP_ACTION_FIN; 1967 }

复制代码

3、TCP_LAST_ACK ----> TCP_CLOSE

5711 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, 5712 struct tcphdr *th, unsigned len) 5713 { 5714 struct tcp_sock *tp = tcp_sk(sk); 5715 struct inet_connection_sock icsk = inet_csk(sk); 5716 int queued = 0; 5717 int res; 5718 5719 tp->rx_opt.saw_tstamp = 0; 5720 5721 switch (sk->sk_state) { 5722 case TCP_CLOSE: 5723 goto discard; 5724 5725 case TCP_LISTEN: 5726 if (th->ack) 5727 return 1; 5728 5729 if (th->rst) 5730 goto discard; 5731 5732 if (th->syn) { 5733 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) 5734 return 1; 5735 5736 / Now we have several options: In theory there is 5737 * nothing else in the frame. KA9Q has an option to 5738 * send data with the syn, BSD accepts data with the 5739 * syn up to the [to be] advertised window and 5740 * Solaris 2.1 gives you a protocol error. For now 5741 * we just ignore it, that fits the spec precisely 5742 * and avoids incompatibilities. It would be nice in 5743 * future to drop through and process the data. 5744 * 5745 * Now that TTCP is starting to be used we ought to 5746 * queue this data. 5747 * But, this leaves one open to an easy denial of 5748 * service attack, and SYN cookies can't defend 5749 * against this problem. So, we drop the data 5750 * in the interest of security over speed unless 5751 * it's still in use. 5752 / 5753 kfree_skb(skb); 5754 return 0; 5755 } 5756 goto discard; 5757 5758 case TCP_SYN_SENT: 5759 queued = tcp_rcv_synsent_state_process(sk, skb, th, len); 5760 if (queued >= 0) 5761 return queued; 5762 5763 / Do step6 onward by hand. 
/ 5764 tcp_urg(sk, skb, th); 5765 __kfree_skb(skb); 5766 tcp_data_snd_check(sk); 5767 return 0; 5768 } 5769 5770 res = tcp_validate_incoming(sk, skb, th, 0); 5771 if (res <= 0) 5772 return -res; 5773 5774 / step 5: check the ACK field / 5775 if (th->ack) { 5776 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; 5777 5778 switch (sk->sk_state) { 5779 case TCP_SYN_RECV: 5780 if (acceptable) { 5781 tp->copied_seq = tp->rcv_nxt; 5782 smp_mb(); 5783 tcp_set_state(sk, TCP_ESTABLISHED); 5784 sk->sk_state_change(sk); 5785 5786 / Note, that this wakeup is only for marginal 5787 * crossed SYN case. Passively open sockets 5788 * are not waked up, because sk->sk_sleep == 5789 * NULL and sk->sk_socket == NULL. 5790 / 5791 if (sk->sk_socket) 5792 sk_wake_async(sk, 5793 SOCK_WAKE_IO, POLL_OUT); 5794 5795 tp->snd_una = TCP_SKB_CB(skb)->ack_seq; 5796 tp->snd_wnd = ntohs(th->window) << 5797 tp->rx_opt.snd_wscale; 5798 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5799 5800 / tcp_ack considers this ACK as duplicate 5801 * and does not calculate rtt. 5802 * Force it here. 5803 / 5804 tcp_ack_update_rtt(sk, 0, 0); 5805 5806 if (tp->rx_opt.tstamp_ok) 5807 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5808 5809 / Make sure socket is routed, for 5810 * correct metrics. 5811 / 5812 icsk->icsk_af_ops->rebuild_header(sk); 5813 5814 tcp_init_metrics(sk); 5815 5816 tcp_init_congestion_control(sk); 5817 5818 / Prevent spurious tcp_cwnd_restart() on 5819 * first data packet. 
5820 / 5821 tp->lsndtime = tcp_time_stamp; 5822 5823 tcp_mtup_init(sk); 5824 tcp_initialize_rcv_mss(sk); 5825 tcp_init_buffer_space(sk); 5826 tcp_fast_path_on(tp); 5827 } else { 5828 return 1; 5829 } 5830 break; 5831 5832 case TCP_FIN_WAIT1: 5833 if (tp->snd_una == tp->write_seq) { 5834 tcp_set_state(sk, TCP_FIN_WAIT2); 5835 sk->sk_shutdown |= SEND_SHUTDOWN; 5836 dst_confirm(sk->sk_dst_cache); 5837 5838 if (!sock_flag(sk, SOCK_DEAD)) 5839 / Wake up lingering close() / 5840 sk->sk_state_change(sk); 5841 else { 5842 int tmo; 5843 5844 if (tp->linger2 < 0 || 5845 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && 5846 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { 5847 tcp_done(sk); 5848 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 5849 return 1; 5850 } 5851 5852 tmo = tcp_fin_time(sk); 5853 if (tmo > TCP_TIMEWAIT_LEN) { 5854 inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); 5855 } else if (th->fin || sock_owned_by_user(sk)) { 5856 / Bad case. We could lose such FIN otherwise. 5857 * It is not a big problem, but it looks confusing 5858 * and not so rare event. We still can lose it now, 5859 * if it spins in bh_lock_sock(), but it is really 5860 * marginal case. 
5861 / 5862 inet_csk_reset_keepalive_timer(sk, tmo); 5863 } else { 5864 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); 5865 goto discard; 5866 } 5867 } 5868 } 5869 break; 5870 5871 case TCP_CLOSING: 5872 if (tp->snd_una == tp->write_seq) { 5873 tcp_time_wait(sk, TCP_TIME_WAIT, 0); 5874 goto discard; 5875 } 5876 break; 5877 5878 case TCP_LAST_ACK: 5879 if (tp->snd_una == tp->write_seq) { 5880 tcp_update_metrics(sk); 5881 tcp_done(sk); 5882 goto discard; 5883 } 5884 break; 5885 } 5886 } else 5887 goto discard; 5888 5889 / step 6: check the URG bit / 5890 tcp_urg(sk, skb, th); 5891 5892 / step 7: process the segment text / 5893 switch (sk->sk_state) { 5894 case TCP_CLOSE_WAIT: 5895 case TCP_CLOSING: 5896 case TCP_LAST_ACK: 5897 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 5898 break; 5899 case TCP_FIN_WAIT1: 5900 case TCP_FIN_WAIT2: 5901 / RFC 793 says to queue data in these states, 5902 * RFC 1122 says we MUST send a reset. 5903 * BSD 4.4 also does reset. 5904 / 5905 if (sk->sk_shutdown & RCV_SHUTDOWN) { 5906 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && 5907 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { 5908 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 5909 tcp_reset(sk); 5910 return 1; 5911 } 5912 } 5913 / Fall through / 5914 case TCP_ESTABLISHED: 5915 tcp_data_queue(sk, skb); 5916 queued = 1; 5917 break; 5918 } 5919 5920 / tcp_data could move socket to TIME-WAIT */ 5921 if (sk->sk_state != TCP_CLOSE) { 5922 tcp_data_snd_check(sk); 5923 tcp_ack_snd_check(sk); 5924 } 5925 5926 if (!queued) { 5927 discard: 5928 __kfree_skb(skb); 5929 } 5930 return 0; 5931 } 5932

复制代码

转载于:https://my.oschina.net/innovation/blog/808641

你可能感兴趣的:(TCP状态切换流程)