TCP协议连接初始化后的状态管理和切换由tcp_rcv_state_process函数完成。tcp_v4_rcv函数收到数据包后查看TCP协议头,区分数据包是只含纯负载数据,还是包含SYN、FIN、RST、ACK等控制信息。除ESTABLISHED和TIME_WAIT这两个状态外,各种状态下的数据包处理大部分在tcp_rcv_state_process函数中完成。数据包到达时如果套接字处于CLOSED状态,就直接扔掉。
1、从LISTEN到SYN_RECV
处于LISTEN状态表明套接字是一个服务器,正在等待连接请求,这时TCP协议对收到的各种标志数据包处理如下:
ACK:发送连接复位。
RST:连接由客户端复位,扔掉数据包。
SYN:客户端发送一个连接请求,调用icsk_af_ops->conn_request(实际指向函数tcp_v4_conn_request),初始化序列号,向客户端发送带SYN和ACK标志的数据包,并将TCP状态设置为TCP_SYN_RECV。
其他数据包:这时连接还没建立,就扔掉。
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
int res;
tp->rx_opt.saw_tstamp = 0;
switch (sk->sk_state) {
//状态是CLOSE直接扔掉数据包
case TCP_CLOSE:
goto discard;
//表名套接字是一个服务器
case TCP_LISTEN:
//收到ACK包返回1,发送连接复位
if (th->ack)
return 1;
//收到RST包直接扔掉
if (th->rst)
goto discard;
if (th->syn) {
//处理连接请求,实际调用tcp_v4_conn_request
//回复对端ACK SYN包
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
kfree_skb(skb);
return 0;
}
goto discard;
...
}
2、SYN_SENT到ESTABLISHED
套接字状态是SYN_SENT,表明套接字是一个客户端,它已经发送了SYN包,正在等待服务器的SYN和ACK包,以便将状态转换到ESTABLISHED。
...
//表明是个客户端
case TCP_SYN_SENT:
//处理成功从SYN_SET切换到ESTABLISHED
queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
if (queued >= 0)
return queued;
/* Do step6 onward by hand. */
tcp_urg(sk, skb, th);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return 0;
...
(1)收到ACK
tcp_rcv_synsent_state_process函数会对数据包和TCP协议头进行检验,如果数据包合法而且设置了正确的ACK标志,就把套接字状态切换到ESTABLISHED。
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
...
//收到ACK且数据包合法,切换到ESTABLISHED状态
tcp_set_state(sk, TCP_ESTABLISHED);
....
}
(2)收到连接复位
如果收到连接复位请求,则复位连接并扔掉数据包。
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
...
//收到连接复位,复位连接,并扔掉数据包
if (th->rst) {
tcp_reset(sk);
goto discard;
}
...
}
(3)没有SYN标志
如果数据包没有SYN标志,就扔掉。
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
...
//没有SYN标志扔掉数据包
if (!th->syn)
goto discard_and_undo;
...
}
(4)tcp_rcv_synsent_state_process返回值
tcp_rcv_synsent_state_process返回一个负值,表明数据段中还有数据等待处理,此时除查看URG标志外不做任何处理。
...
//处理成功从SYN_SET切换到ESTABLISHED
queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
if (queued >= 0)
return queued;
/* Do step6 onward by hand. */
tcp_urg(sk, skb, th);
__kfree_skb(skb);
tcp_data_snd_check(sk);
...
到目前为止处理了SYN_SENT、LISTEN、CLOSE这三个状态的套接字,其他状态在接下来处理。
3、数据包有效性检查
数据包的有效性检查由函数tcp_validate_incoming完成,按照RFC793规范进行。
(1)、序列号检查
tcp_sequence函数检查序列号是否在窗口范围内,如果超出了当前窗口就扔掉数据包。
static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, int syn_inerr)
{
...
/* Step 1: check sequence number */
//第一步序列号检查
if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
/* RFC793, page 37: "In all states except SYN-SENT, all reset
* (RST) segments are validated by checking their SEQ-fields."
* And page 69: "If an incoming segment is not acceptable,
* an acknowledgment should be sent in reply (unless the RST
* bit is set, if so drop the segment and return)".
*/
//函数复位标志RST直接扔掉数据包并返回
if (!th->rst)
tcp_send_dupack(sk, skb);
goto discard;
}
...
}
(2)复位连接标志
如果数据包有复位连接标志RST,就复位连接,并扔掉数据包。
static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, int syn_inerr)
{
...
//如果有复位标志RST则复位连接,扔掉数据包
/* Step 2: check RST bit */
if (th->rst) {
tcp_reset(sk);
goto discard;
}
...
}
(3)检查SYN是否在窗口范围内
static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, int syn_inerr)
{
...
//如果是SYN,查看序列号是否在当前窗口范围内
/* step 4: Check for a SYN in window. */
if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
if (syn_inerr)
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
tcp_reset(sk);
return -1;
}
...
}
4、数据包有效:有ACK标志
(1)TCP_SYN_RECV
如果收到带ACK标志的数据包,而且套接字状态处于SYN_RECV,这时最大可能是处于被动打开状态,应切换到ESTABLISHED状态,并计算RTT。如果接收到的ACK数据包中有时间戳选项,RTT基于时间戳计算,RTT的值保存在struct tcp_sock数据结构的srtt数据域中。最后重新构建协议头。
...
/* step 5: check the ACK field */
if (th->ack) {
int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
switch (sk->sk_state) {
//收到ACK,连接状态处于SYN_RECV,这时处于被迫打开的状态
//状态切换到ESTABLISHE
case TCP_SYN_RECV:
if (acceptable) {
tp->copied_seq = tp->rcv_nxt;
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk);
/* Note, that this wakeup is only for marginal
* crossed SYN case. Passively open sockets
* are not waked up, because sk->sk_sleep ==
* NULL and sk->sk_socket == NULL.
*/
//唤醒套接字
if (sk->sk_socket)
sk_wake_async(sk,
SOCK_WAKE_IO, POLL_OUT);
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) <<
tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
/* tcp_ack considers this ACK as duplicate
* and does not calculate rtt.
* Force it here.
*/
//计算RTT
tcp_ack_update_rtt(sk, 0, 0);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
/* Make sure socket is routed, for
* correct metrics.
*/
//重新构建TCP头部
icsk->icsk_af_ops->rebuild_header(sk);
//初始化套接字某些字段
tcp_init_metrics(sk);
tcp_init_congestion_control(sk);
/* Prevent spurious tcp_cwnd_restart() on
* first data packet.
*/
tp->lsndtime = tcp_time_stamp;
tcp_mtup_init(sk);
//套接字上的预留缓冲区基于收到的MSS值来确定
//初始化窗口大小一个猜测值
tcp_initialize_rcv_mss(sk);
//为套接字预留缓冲区空间
tcp_init_buffer_space(sk);
//计算struc tcp_sock数据结构上的pred_flags数据域
//该数据域决定是否应交给Fast Path处理
tcp_fast_path_on(tp);
} else {
return 1;
}
break;
...
(2)FIN_WAIT_1
如果套接字状态处于FIN_WAIT_1收到一个ACK包,套接字状态就会切换到FIN_WAIT_2,同时设置套接字的shutdown数据域的值为SEND_SHUTDOWN,指明随后套接字切换成CLOSED状态时应向站点发送包含RST的数据包shutdown。
处理TCP选项:
TCP_LINGER2选项:决定套接字进入CLOSED状态之前,需要在FIN_WAIT_2状态上等待多长时间,它的值保存在struct tcp_sock的linger2数据域(tp->linger2)中。如果linger2为负值,则套接字立即切换到CLOSED状态,不经过FIN_WAIT_2和TIME_WAIT状态。
keepalive选项:keepalive时钟超时的情况要被复位。
如果收到ACK数据包是最后一个回答FIN,或套接字被其他进程锁定,则复位keepalive时钟,如果不这么做就会丢失FIN。
....
case TCP_FIN_WAIT1:
//FIN_WAIT1状态收到ACK切换到FIN_WAIT2
if (tp->snd_una == tp->write_seq) {
tcp_set_state(sk, TCP_FIN_WAIT2);
//设置shutdown数据域为SEND_SHUTDOWN
//指明套接字切换成CLOSED要发送RST数据包
sk->sk_shutdown |= SEND_SHUTDOWN;
dst_confirm(__sk_dst_get(sk));
if (!sock_flag(sk, SOCK_DEAD))
/* Wake up lingering close() */
sk->sk_state_change(sk);
else {
//不是一个死套接字
int tmo;
//决定套接字进入CLOSED状态前要在FIN_WAIT2状态等待多长时间
//这个时间保存在tp->linger2,如果linger2小于0,则套接字立即从
//FIN_WAIT1切换到CLISED,不经过FIN_WAIT2
if (tp->linger2 < 0 ||
(TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
tcp_done(sk);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
//keepalive时钟超时情况下被复位
inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
} else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
* and not so rare event. We still can lose it now,
* if it spins in bh_lock_sock(), but it is really
* marginal case.
*/
//收到的ACK是最后一个回答FIN
//或者套接字被其他进程锁定就要复位时钟
inet_csk_reset_keepalive_timer(sk, tmo);
} else {
//复制进入FIN_WAIT2
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto discard;
}
}
}
break;
...
(3)CLOSING
收到ACK后套接字直接进入TIME_WAIT状态,说明对端没有数据向外发送了。
...
case TCP_CLOSING:
//TCP_CLOSING收到ACK直接进入TIME_WAIT,
//表名对端没有向外发送数据了
if (tp->snd_una == tp->write_seq) {
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
goto discard;
}
break;
...
(4)LAST_ACK
套接字被动关闭,并已响应应用程序的close;收到ACK就关闭套接字,所以调用tcp_done。
...
case TCP_LAST_ACK:
//套接字被迫关闭,这个状态收到ACK就可以关闭套接字
//所以调用tcp_done关闭套接字
if (tp->snd_una == tp->write_seq) {
tcp_update_metrics(sk);
tcp_done(sk);
goto discard;
}
break;
...
5、处理段中的数据内容
以下5种状态都要将数据段放入队列中:TCP_CLOSE_WAIT、TCP_CLOSING、TCP_LAST_ACK、TCP_FIN_WAIT1、TCP_FIN_WAIT2;如果接收方向已关闭(RCV_SHUTDOWN)却又收到新数据,则复位连接。
...
//以下几种状态是可以接受数据的
//处理数据包
/* step 7: process the segment text */
switch (sk->sk_state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
* RFC 1122 says we MUST send a reset.
* BSD 4.4 also does reset.
*/
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk);
return 1;
}
}
/* Fall through */
...
6、ESTABLISHED状态处理
套接字处于ESTABLISHED状态时可以收到常规数据段,它调用tcp_data_queue函数将数据段放入套接字输入缓冲区队列中。
tcp_rcv_state_process完整代码:
/*
 * tcp_rcv_state_process - process an incoming TCP segment according to the
 * current state of the socket.
 *
 * The function first dispatches on sk->sk_state for CLOSE, LISTEN and
 * SYN_SENT, each of which returns (or jumps to discard) on its own.  Every
 * other state falls out of the first switch and goes through:
 *   - tcp_validate_incoming()  (segment validation),
 *   - "step 5": ACK handling, which may move the socket between states,
 *   - "step 6": tcp_urg(),
 *   - "step 7": queuing of the segment payload via tcp_data_queue().
 *
 * @sk:  socket the segment belongs to
 * @skb: buffer holding the received segment
 * @th:  TCP header of the segment
 * @len: segment length; only forwarded to tcp_rcv_synsent_state_process()
 *
 * Returns 0 when the segment was consumed, queued or discarded here.
 * Returns 1 on the rejection paths below (ACK in LISTEN, failed
 * conn_request, unacceptable ACK in SYN_RECV, data arriving after
 * shutdown).  NOTE(review): presumably the caller answers a return of 1
 * with a reset -- confirm against the caller.
 * When tcp_validate_incoming() yields res <= 0, -res is returned.
 */
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
/* Set to 1 once the skb has been handed to tcp_data_queue(), so it is not freed at the end. */
int queued = 0;
int res;
tp->rx_opt.saw_tstamp = 0;
switch (sk->sk_state) {
// Socket is in CLOSE: drop the segment.
case TCP_CLOSE:
goto discard;
// LISTEN: the socket is a server waiting for connection requests.
case TCP_LISTEN:
// An ACK in LISTEN is rejected with return value 1.
if (th->ack)
return 1;
// An RST in LISTEN is simply dropped.
if (th->rst)
goto discard;
if (th->syn) {
// Handle the connection request through the address-family hook
// (presumably tcp_v4_conn_request for IPv4 -- confirm); a negative
// result is rejected with return value 1.
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
/* Now we have several options: In theory there is
* nothing else in the frame. KA9Q has an option to
* send data with the syn, BSD accepts data with the
* syn up to the [to be] advertised window and
* Solaris 2.1 gives you a protocol error. For now
* we just ignore it, that fits the spec precisely
* and avoids incompatibilities. It would be nice in
* future to drop through and process the data.
*
* Now that TTCP is starting to be used we ought to
* queue this data.
* But, this leaves one open to an easy denial of
* service attack, and SYN cookies can't defend
* against this problem. So, we drop the data
* in the interest of security over speed unless
* it's still in use.
*/
kfree_skb(skb);
return 0;
}
// Anything else received in LISTEN is dropped.
goto discard;
// SYN_SENT: the socket is a client that has sent its SYN.
case TCP_SYN_SENT:
// Delegate to the SYN_SENT handler; a non-negative result is returned as-is.
queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
if (queued >= 0)
return queued;
/* Do step6 onward by hand. */
tcp_urg(sk, skb, th);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return 0;
}
// Validate the incoming segment; a result <= 0 ends processing here
// and its negation is returned.
res = tcp_validate_incoming(sk, skb, th, 0);
if (res <= 0)
return -res;
/* step 5: check the ACK field */
if (th->ack) {
// The ACK is acceptable when tcp_ack() returns a positive value.
int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
switch (sk->sk_state) {
// SYN_RECV with an acceptable ACK: the connection becomes ESTABLISHED.
// An unacceptable ACK is rejected with return value 1.
case TCP_SYN_RECV:
if (acceptable) {
tp->copied_seq = tp->rcv_nxt;
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk);
/* Note, that this wakeup is only for marginal
* crossed SYN case. Passively open sockets
* are not waked up, because sk->sk_sleep ==
* NULL and sk->sk_socket == NULL.
*/
// Wake up the socket, if one is attached.
if (sk->sk_socket)
sk_wake_async(sk,
SOCK_WAKE_IO, POLL_OUT);
// Record the acknowledged sequence number and the scaled send window.
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) <<
tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
/* tcp_ack considers this ACK as duplicate
* and does not calculate rtt.
* Force it here.
*/
// Force the RTT update.
tcp_ack_update_rtt(sk, 0, 0);
// With timestamps negotiated, reduce advmss by the aligned option length.
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
/* Make sure socket is routed, for
* correct metrics.
*/
// Rebuild the header through the address-family hook.
icsk->icsk_af_ops->rebuild_header(sk);
// Initialise metrics and congestion control for the socket.
tcp_init_metrics(sk);
tcp_init_congestion_control(sk);
/* Prevent spurious tcp_cwnd_restart() on
* first data packet.
*/
tp->lsndtime = tcp_time_stamp;
tcp_mtup_init(sk);
// Initialise the receive-MSS estimate (per the accompanying text, an
// initial guess used when sizing buffers), then the buffer space, then
// enable the fast path.
tcp_initialize_rcv_mss(sk);
tcp_init_buffer_space(sk);
tcp_fast_path_on(tp);
} else {
return 1;
}
break;
case TCP_FIN_WAIT1:
// FIN_WAIT1: once everything written has been acknowledged
// (snd_una == write_seq), move to FIN_WAIT2.
if (tp->snd_una == tp->write_seq) {
tcp_set_state(sk, TCP_FIN_WAIT2);
// Set SEND_SHUTDOWN in the socket's shutdown mask.
sk->sk_shutdown |= SEND_SHUTDOWN;
dst_confirm(__sk_dst_get(sk));
if (!sock_flag(sk, SOCK_DEAD))
/* Wake up lingering close() */
sk->sk_state_change(sk);
else {
// SOCK_DEAD is set: decide here how long FIN_WAIT2 may last.
int tmo;
// A negative tp->linger2, or a segment carrying data beyond rcv_nxt,
// finishes the socket immediately via tcp_done() (skipping the
// FIN_WAIT2 wait) and is rejected with return value 1.
if (tp->linger2 < 0 ||
(TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
tcp_done(sk);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
// Timeout is longer than TCP_TIMEWAIT_LEN: re-arm the keepalive
// timer for the excess.
inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
} else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
* and not so rare event. We still can lose it now,
* if it spins in bh_lock_sock(), but it is really
* marginal case.
*/
// The segment also carries a FIN, or the socket is owned by a user
// context: re-arm the keepalive timer with the full timeout.
inet_csk_reset_keepalive_timer(sk, tmo);
} else {
// Otherwise hand the socket to tcp_time_wait() with TCP_FIN_WAIT2
// and drop the segment.
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto discard;
}
}
}
break;
case TCP_CLOSING:
// CLOSING: once everything written has been acknowledged, enter
// TIME_WAIT and drop the segment.
if (tp->snd_una == tp->write_seq) {
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
goto discard;
}
break;
case TCP_LAST_ACK:
// LAST_ACK: once everything written has been acknowledged, update
// metrics, finish the socket with tcp_done() and drop the segment.
if (tp->snd_una == tp->write_seq) {
tcp_update_metrics(sk);
tcp_done(sk);
goto discard;
}
break;
}
} else
// Segments without ACK are dropped at this point.
goto discard;
/* step 6: check the URG bit */
tcp_urg(sk, skb, th);
// The states below may still have segment data to handle.
/* step 7: process the segment text */
switch (sk->sk_state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
// Segments starting at or after rcv_nxt are not queued in these states;
// older ones fall through to the checks below.
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
* RFC 1122 says we MUST send a reset.
* BSD 4.4 also does reset.
*/
// With RCV_SHUTDOWN set, a segment carrying data beyond rcv_nxt resets
// the connection and is rejected with return value 1.
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk);
return 1;
}
}
/* Fall through */
case TCP_ESTABLISHED:
// Queue the segment; the skb must not be freed below.
tcp_data_queue(sk, skb);
queued = 1;
break;
}
/* tcp_data could move socket to TIME-WAIT */
if (sk->sk_state != TCP_CLOSE) {
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
}
// Free the skb unless it was queued; "discard" is also the jump target
// for every drop path above.
if (!queued) {
discard:
__kfree_skb(skb);
}
return 0;
}