本文从内核的角度看timewait是如何解决的。贴代码,和网上看到的挺多冲突的!
timewait在tcp结束后主动关闭一方的等待时候的行为。图片中的服务和客户端描述不是非常准确,这里客户端是主动关闭一方。(在web服务器模型下,web服务器也可主动关闭客户端,这个时候web服务器就变成了四次握手的客户端)。
这里的客户端,不是四次握手的客户端,而是指发起tcp请求的一方。发起一方需要绑定本地端口,本地端口的绑定方式非常暴力:直接是从配置的偶数值开始遍历查找可用端口:(偶数端口和奇数端口内核的一个patch)
如果发起大量的客户端请求,并且不能回收,系统调用connect时长增加,甚至直接因端口耗尽直接调用失败。
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
// 本地端口绑定初始化
err = inet_hash_connect(tcp_death_row, sk);
}
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
return __inet_hash_connect(death_row, sk, port_offset,
__inet_check_established);
}
// inet_hash_connect调用下面函数
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u32 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
struct sock *, __u16, struct inet_timewait_sock **))
{
inet_get_local_port_range(net, &low, &high);
other_parity_scan:
port = low + offset;
for (i = 0; i < remaining; i += 2, port += 2) {
inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port) {
if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0)
goto next_port;
WARN_ON(hlist_empty(&tb->owners));
// 这里主要是调用tcp_twsk_unique,保证开启了timestamp就可以连接
if (!check_established(death_row, sk,
port, &tw))
goto ok;
goto next_port;
}
}
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port, l3mdev);
if (!tb) {
spin_unlock_bh(&head->lock);
return -ENOMEM;
}
tb->fastreuse = -1;
tb->fastreuseport = -1;
goto ok;
next_port:
spin_unlock_bh(&head->lock);
cond_resched();
}
}
端口重用的逻辑从__inet_check_established->tcp_twsk_unique(源码),总结下逻辑:
timewait满足的条件:
所以需要解决timewait的客户端问题有三个办法:
timewait端口重用的逻辑:
static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk, __u16 lport,
struct inet_timewait_sock **twp)
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_sock *inet = inet_sk(sk);
__be32 daddr = inet->inet_rcv_saddr;
__be32 saddr = inet->inet_daddr;
int dif = sk->sk_bound_dev_if;
struct net *net = sock_net(sk);
int sdif = l3mdev_master_ifindex_by_index(net, dif);
INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
unsigned int hash = inet_ehashfn(net, daddr, lport,
saddr, inet->inet_dport);
struct inet_timewait_sock *tw = NULL;
sk_nulls_for_each(sk2, node, &head->chain) {
if (sk2->sk_hash != hash)
continue;
if (likely(INET_MATCH(sk2, net, acookie,
saddr, daddr, ports, dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
if (twsk_unique(sk, sk2, twp))
break;
}
goto not_unique;
}
}
return 0;
not_unique:
spin_unlock(lock);
return -EADDRNOTAVAIL;
}
static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
if (sk->sk_prot->twsk_prot->twsk_unique != NULL)
return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp);
return 0;
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
if (tcptw->tw_ts_recent_stamp &&
(!twp || (reuse && time_after32(ktime_get_seconds(),
tcptw->tw_ts_recent_stamp)))) {
if (likely(!tp->repair)) {
u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
if (!seq)
seq = 1;
WRITE_ONCE(tp->write_seq, seq);
tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
}
sock_hold(sktw);
return 1;
}
}
限制timewait代码:
void tcp_fin(struct sock *sk)
{
case TCP_FIN_WAIT2:
/* Received a FIN -- send ACK and enter TIME_WAIT. */
tcp_send_ack(sk);
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
break;
}
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
struct inet_timewait_death_row *dr,
const int state)
{
struct inet_timewait_sock *tw;
if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
return NULL;
}
服务器(非tcp四次握手的服务器,指如web服务器)的timewait主动端开,端口都是同一个不影响端口,但是占用机器资源。此类影响,可以从内存方面分析。