After a socket enters the TIME_WAIT state, the TIME_WAIT timer is started. Until the timer expires, a tw sock stands in for the closed socket and handles stray packets from the old connection, preventing them from harming a new one. When the timer fires, the tw sock is deleted and the port it occupied is released.
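This is easy to observe from userspace: whoever closes first (the active closer) ends up in TIME_WAIT, visible with ss or netstat. A hedged sketch; the peer 127.0.0.1:8080 is a placeholder and assumes some listener is running there:

/* Minimal active-close demo: after close(), this connection's
 * four-tuple stays in TIME_WAIT for TCP_TIMEWAIT_LEN (60 s). */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in peer;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&peer, 0, sizeof(peer));
	peer.sin_family = AF_INET;
	peer.sin_port = htons(8080);
	inet_pton(AF_INET, "127.0.0.1", &peer.sin_addr);

	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
		perror("connect");
		return 1;
	}
	close(fd);	/* active close: we send FIN first, so we end in TIME_WAIT */
	return 0;
}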
The TIME_WAIT timer is installed by the tcp_time_wait function, which is called in the following situations:
(1) The socket is closed in the TCP_FIN_WAIT2 state, tp->linger2 has not been set negative via the TCP_LINGER2 option, and tcp_fin_time is less than or equal to TCP_TIMEWAIT_LEN:
2059 void tcp_close(struct sock *sk, long timeout)
2060 {
...
2183 	if (sk->sk_state == TCP_FIN_WAIT2) {
2184 		struct tcp_sock *tp = tcp_sk(sk);
2185 		if (tp->linger2 < 0) {
...
2190 		} else {
2191 			const int tmo = tcp_fin_time(sk);
2192 
2193 			if (tmo > TCP_TIMEWAIT_LEN) {
2194 				inet_csk_reset_keepalive_timer(sk,
2195 						tmo - TCP_TIMEWAIT_LEN);
2196 			} else {
2197 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2198 				goto out;
2199 			}
2200 		}
...
(2) A FIN is received in the TCP_FIN_WAIT2 state and an ACK is sent:
3783 static void tcp_fin(struct sock *sk)
3784 {
...
3818 	case TCP_FIN_WAIT2:
3819 		/* Received a FIN -- send ACK and enter TIME_WAIT. */
3820 		tcp_send_ack(sk);
3821 		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
(3) An orphan socket receives an ACK in the TCP_FIN_WAIT1 state and all of the following hold:
1) tp->linger2 has not been set negative via the TCP_LINGER2 option;
2) tcp_fin_time is less than or equal to TCP_TIMEWAIT_LEN;
3) the ACK carries no data, or only old data;
4) the ACK carries no FIN flag and the socket is not locked by the application:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 			  const struct tcphdr *th, unsigned int len)
5602 {
...
5751 	case TCP_FIN_WAIT1:
...
5780 		if (!sock_flag(sk, SOCK_DEAD))
5781 			/* Wake up lingering close() */
5782 			sk->sk_state_change(sk);
5783 		else {
5784 			int tmo;
5785 
5786 			if (tp->linger2 < 0 ||
5787 			    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
5788 			     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
5789 				tcp_done(sk);
5790 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
5791 				return 1;
5792 			}
5793 
5794 			tmo = tcp_fin_time(sk);
5795 			if (tmo > TCP_TIMEWAIT_LEN) {
5796 				inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
5797 			} else if (th->fin || sock_owned_by_user(sk)) {
5798 				/* Bad case. We could lose such FIN otherwise.
5799 				 * It is not a big problem, but it looks confusing
5800 				 * and not so rare event. We still can lose it now,
5801 				 * if it spins in bh_lock_sock(), but it is really
5802 				 * marginal case.
5803 				 */
5804 				inet_csk_reset_keepalive_timer(sk, tmo);
5805 			} else {
5806 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
5807 				goto discard;
5808 			}
...
(4) An ACK is received in the TCP_CLOSING state:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 			  const struct tcphdr *th, unsigned int len)
5602 {
...
5813 	case TCP_CLOSING:
5814 		if (tp->snd_una == tp->write_seq) {
5815 			tcp_time_wait(sk, TCP_TIME_WAIT, 0);
5816 			goto discard;
5817 		}
...
(5) The FIN_WAIT2 timer expires, tp->linger2 has not been set negative via the TCP_LINGER2 option, and tcp_fin_time is greater than TCP_TIMEWAIT_LEN:
558 static void tcp_keepalive_timer (unsigned long data)
559 {
...
578 	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
579 		if (tp->linger2 >= 0) {
580 			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
581 
582 			if (tmo > 0) {
583 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
584 				goto out;
585 			}
...
In all of these cases, tcp_time_wait calls inet_twsk_schedule to install the TIME_WAIT timer:
266 void tcp_time_wait(struct sock *sk, int state, int timeo)
267 {
...
327 		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo); //put the tw sock into the ESTABLISHED hash table and the bind hash table, and remove sk from the ESTABLISHED hash table
328 
329 		/* Get the TIME_WAIT timeout firing. */
330 		if (timeo < rto)
331 			timeo = rto;
332 
333 		if (recycle_ok) {
334 			tw->tw_timeout = rto;
335 		} else {
336 			tw->tw_timeout = TCP_TIMEWAIT_LEN;
337 			if (state == TCP_TIME_WAIT)
338 				timeo = TCP_TIMEWAIT_LEN;
339 		}
340 
341 		inet_twsk_schedule(tw, &tcp_death_row, timeo,
342 				   TCP_TIMEWAIT_LEN);
343 		inet_twsk_put(tw);
...
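Cases (1), (3), and (5) above all hinge on tp->linger2 and tcp_fin_time(), which userspace drives through the TCP_LINGER2 socket option (together with the net.ipv4.tcp_fin_timeout sysctl). A minimal sketch, assuming Linux's <netinet/tcp.h>; the helper name is ours:

/* Sketch: drive tp->linger2 via TCP_LINGER2. A positive value bounds
 * the FIN_WAIT2 lifetime in seconds; a negative value makes
 * tp->linger2 < 0, so a close() in FIN_WAIT2 aborts the connection
 * instead of going through TIME_WAIT (see case (1) above). */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int set_linger2(int fd, int seconds)
{
	return setsockopt(fd, IPPROTO_TCP, TCP_LINGER2,
			  &seconds, sizeof(seconds));
}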
The __inet_twsk_hashdance function adds the tw_sock to the bind hash table and the ESTABLISHED hash table, so that until the tw_sock is deleted, the corresponding IP|port pair can neither be bound nor used to establish a new connection:
126 void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
127 			   struct inet_hashinfo *hashinfo)
128 {
129 	const struct inet_sock *inet = inet_sk(sk);
130 	const struct inet_connection_sock *icsk = inet_csk(sk);
131 	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
132 	spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
133 	struct inet_bind_hashbucket *bhead;
134 	/* Step 1: Put TW into bind hash. Original socket stays there too.
135 	   Note, that any socket with inet->num != 0 MUST be bound in
136 	   binding cache, even if it is closed.
137 	 */
138 	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
139 			hashinfo->bhash_size)];
140 	spin_lock(&bhead->lock);
141 	tw->tw_tb = icsk->icsk_bind_hash;
142 	WARN_ON(!icsk->icsk_bind_hash);
143 	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); //add to the bind hash table
144 	spin_unlock(&bhead->lock);
145 
146 	spin_lock(lock);
...
153 	inet_twsk_add_node_rcu(tw, &ehead->twchain); //add to the ESTABLISHED hash table
154 
155 	/* Step 3: Remove SK from established hash. */
156 	if (__sk_nulls_del_node_init_rcu(sk))
157 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
...
167 	atomic_add(1 + 1 + 1, &tw->tw_refcnt);
168 
169 	spin_unlock(lock);
170 }
As a result, when an application uses the bind system call to bind the same IP|port pair as a tw_sock, the kernel's inet_csk_bind_conflict function matches the tw_sock in the bind hash table, reports a conflict, and the bind fails (see 2.2, the bind system call). When establishing a connection, inet_hash_connect calls __inet_check_established to check whether the connection about to be created conflicts with an established one:
311 static int __inet_check_established(struct inet_timewait_death_row *death_row,
312 				    struct sock *sk, __u16 lport,
313 				    struct inet_timewait_sock **twp)
314 {
...
335 	sk_nulls_for_each(sk2, node, &head->twchain) {
336 		if (sk2->sk_hash != hash)
337 			continue;
338 
339 		if (likely(INET_TW_MATCH(sk2, net, acookie,
340 					 saddr, daddr, ports, dif))) { //address|port match
341 			tw = inet_twsk(sk2);
342 			if (twsk_unique(sk, sk2, twp)) //call tcp_twsk_unique to judge whether they conflict
343 				goto unique;     //no conflict
344 			else
345 				goto not_unique; //conflict
346 		}
347 	}
348 	tw = NULL;
...
359 unique:
...
376 	if (twp) {
377 		*twp = tw;	//hand the tw_sock to the caller
378 	} else if (tw) {
379 		/* Silly. Should hash-dance instead... */
380 		inet_twsk_deschedule(tw, death_row);
381 
382 		inet_twsk_put(tw);
383 	}
384 	return 0;
385 
386 not_unique:
387 	spin_unlock(lock);
388 	return -EADDRNOTAVAIL;
389 }
The tcp_twsk_unique function:
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 	struct tcp_sock *tp = tcp_sk(sk);
...
125 	if (tcptw->tw_ts_recent_stamp && //the timestamp option is on and a packet was received in TIME_WAIT
126 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
127 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 		if (tp->write_seq == 0)
130 			tp->write_seq = 1;
131 		tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
132 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133 		sock_hold(sktw);
134 		return 1;
135 	}
136 
137 	return 0;
138 }
It follows that when:
(1) the caller of __inet_check_established does not need the tw_sock returned (i.e., twp == NULL is true), or
(2) the net.ipv4.tcp_tw_reuse sysctl is enabled, allowing tw_socks to be reused, and more than one second has passed since the tw_sock last saw a timestamp,
tcp_twsk_unique returns 1, meaning no conflict (in both cases the timestamp option must be on and the tw_sock must have received a packet while in TIME_WAIT). When there is no conflict, under case (1) __inet_check_established releases the tw_sock itself; otherwise it hands the tw_sock back to its caller, inet_hash_connect. What exactly does case (1) mean, and what does inet_hash_connect do with the tw_sock when case (1) does not happen? Look at the code:
589 int inet_hash_connect(struct inet_timewait_death_row *death_row,
590 		      struct sock *sk)
591 {
592 	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
593 			__inet_check_established, __inet_hash_nolisten);
594 }
So the user of __inet_check_established is __inet_hash_connect:
477 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
478 			struct sock *sk, u32 port_offset,
479 			int (*check_established)(struct inet_timewait_death_row *,
480 				struct sock *, __u16, struct inet_timewait_sock **),
481 			int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
482 {
...
491 	if (!snum) {
...
520 			if (!check_established(death_row, sk,
521 					       port, &tw))
522 				goto ok;
...
544 ok:
545 		hint += i;
546 
547 		/* Head lock still held and bh's disabled */
548 		inet_bind_hash(sk, tb, port);
549 		if (sk_unhashed(sk)) {
550 			inet_sk(sk)->inet_sport = htons(port);
551 			twrefcnt += hash(sk, tw); //add sk to the ESTABLISHED hash table and take the tw_sock out of it
552 		}
553 		if (tw)
554 			twrefcnt += inet_twsk_bind_unhash(tw, hinfo); //take the tw_sock out of the bind hash table
555 		spin_unlock(&head->lock);
556 
557 		if (tw) {
558 			inet_twsk_deschedule(tw, death_row); //release the tw_sock
559 			while (twrefcnt) {
560 				twrefcnt--;
561 				inet_twsk_put(tw);
562 			}
563 		}
564 
565 		ret = 0;
566 		goto out;
567 	}
568 
569 	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
570 	tb = inet_csk(sk)->icsk_bind_hash;
571 	spin_lock_bh(&head->lock);
572 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { //this socket is the only one bound to this IP|port pair
573 		hash(sk, NULL);
574 		spin_unlock_bh(&head->lock);
575 		return 0;
576 	} else {
577 		spin_unlock(&head->lock);
578 		/* No definite answer... Walk to established hash table */
579 		ret = check_established(death_row, sk, snum, NULL);
580 out:
581 		local_bh_enable();
582 		return ret;
583 	}
584 }
Case (1) occurs only when the port to bind is nonzero, which means the application already called bind successfully before calling connect; since there was no conflict at bind time, the tw_sock can simply be released at connect time. And when case (1) does not happen, the tw_sock is likewise released and removed from the hash tables, by __inet_hash_connect itself (lines 551-558).
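For illustration, case (1) is the bind-before-connect pattern; a hedged userspace sketch (the local port 32768 and peer 127.0.0.1:8080 are arbitrary placeholders):

/* bind-before-connect: snum != 0 in __inet_hash_connect, so
 * check_established() is called with twp == NULL, which is case (1). */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in local, peer;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&local, 0, sizeof(local));
	local.sin_family = AF_INET;
	local.sin_port = htons(32768);		/* explicit source port */
	local.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
		perror("bind");			/* e.g. conflicts with a tw_sock */
		return 1;
	}

	memset(&peer, 0, sizeof(peer));
	peer.sin_family = AF_INET;
	peer.sin_port = htons(8080);
	inet_pton(AF_INET, "127.0.0.1", &peer.sin_addr);
	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
		perror("connect");
	close(fd);
	return 0;
}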
tcp_death_row is defined as:
35 struct inet_timewait_death_row tcp_death_row = {
36 	.sysctl_max_tw_buckets = NR_FILE * 2,
37 	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
38 	.death_lock	= __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
39 	.hashinfo	= &tcp_hashinfo,
40 	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
41 					    (unsigned long)&tcp_death_row),
42 	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
43 					     inet_twdr_twkill_work),
44 /* Short-time timewait calendar */
45 
46 	.twcal_hand	= -1,
47 	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
48 					    (unsigned long)&tcp_death_row),
49 };
The inet_twsk_schedule function:
340 void inet_twsk_schedule(struct inet_timewait_sock *tw,
341 			struct inet_timewait_death_row *twdr,
342 			const int timeo, const int timewait_len)
343 {
344 	struct hlist_head *list;
345 	int slot;
346 
... //compute the tw sock's position in the TIME_WAIT timer lists; a larger slot means a longer timeout
371 	slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
372 
373 	spin_lock(&twdr->death_lock);
374 
375 	/* Unlink it, if it was scheduled */
376 	if (inet_twsk_del_dead_node(tw)) //already on a TIME_WAIT timer list: unlink it
377 		twdr->tw_count--;
378 	else
379 		atomic_inc(&tw->tw_refcnt);
380 
381 	if (slot >= INET_TWDR_RECYCLE_SLOTS) { //timeout too long: use the slow timer
382 		/* Schedule to slow timer */
383 		if (timeo >= timewait_len) {
384 			slot = INET_TWDR_TWKILL_SLOTS - 1;
385 		} else {
386 			slot = DIV_ROUND_UP(timeo, twdr->period);
387 			if (slot >= INET_TWDR_TWKILL_SLOTS)
388 				slot = INET_TWDR_TWKILL_SLOTS - 1;
389 		}
390 		tw->tw_ttd = jiffies + timeo;
391 		slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
392 		list = &twdr->cells[slot]; //add the tw_sock to twdr->cells
393 	} else { //short timeouts all go to the recycle timer
394 		tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
395 
396 		if (twdr->twcal_hand < 0) { //the recycle timer is unset or has expired
397 			twdr->twcal_hand = 0;
398 			twdr->twcal_jiffie = jiffies; //record when the timer was first armed
399 			twdr->twcal_timer.expires = twdr->twcal_jiffie +
400 					(slot << INET_TWDR_RECYCLE_TICK);
401 			add_timer(&twdr->twcal_timer); //arm the recycle timer
402 		} else {
403 			if (time_after(twdr->twcal_timer.expires,
404 				       jiffies + (slot << INET_TWDR_RECYCLE_TICK))) //the recycle timer has not expired yet
405 				mod_timer(&twdr->twcal_timer,
406 					  jiffies + (slot << INET_TWDR_RECYCLE_TICK)); //re-arm the recycle timer
407 			slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
408 		}
409 		list = &twdr->twcal_row[slot]; //add the tw_sock to twdr->twcal_row
410 	}
411 
412 	hlist_add_head(&tw->tw_death_node, list); //link into the TIME_WAIT timer list
413 
414 	if (twdr->tw_count++ == 0) //the TIME_WAIT timer lists were empty before this insertion
415 		mod_timer(&twdr->tw_timer, jiffies + twdr->period); //arm the slow timer
416 	spin_unlock(&twdr->death_lock);
417 }
Line 371: slots are assigned by timeout length: 0 jiffies falls into slot 0, 1 to 2^INET_TWDR_RECYCLE_TICK jiffies into slot 1, 2^INET_TWDR_RECYCLE_TICK + 1 to 2^(INET_TWDR_RECYCLE_TICK + 1) jiffies into slot 2, and so on. Each slot covers 2^INET_TWDR_RECYCLE_TICK jiffies.
Line 386: slots are likewise assigned by timeout length, but here each slot covers twdr->period jiffies.
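The slot arithmetic of lines 371 and 386 can be checked numerically with a standalone sketch; it assumes HZ = 1000 (so INET_TWDR_RECYCLE_TICK = 7, one jiffy = 1 ms) and mirrors, rather than reuses, the kernel's expressions:

/* Sketch: reproduce the slot choice of inet_twsk_schedule() for a few
 * timeouts. Standalone illustration, not kernel code. */
#include <stdio.h>

#define INET_TWDR_RECYCLE_TICK  7
#define INET_TWDR_RECYCLE_SLOTS 32
#define INET_TWDR_TWKILL_SLOTS  8
#define TCP_TIMEWAIT_LEN        (60 * 1000)		/* 60 s in jiffies */
#define PERIOD (TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS)	/* 7500 */

int main(void)
{
	int timeo;

	for (timeo = 1; timeo <= TCP_TIMEWAIT_LEN; timeo *= 10) {
		/* line 371: round timeo up to a recycle-timer slot */
		int slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1)
				>> INET_TWDR_RECYCLE_TICK;

		if (slot >= INET_TWDR_RECYCLE_SLOTS) {
			/* line 386: too long for the recycle timer */
			slot = (timeo + PERIOD - 1) / PERIOD;
			if (slot >= INET_TWDR_TWKILL_SLOTS)
				slot = INET_TWDR_TWKILL_SLOTS - 1;
			printf("timeo=%6d ms -> slow timer, slot offset %d\n",
			       timeo, slot);
		} else {
			printf("timeo=%6d ms -> recycle timer, slot %d\n",
			       timeo, slot);
		}
	}
	return 0;
}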
So the TIME_WAIT timer actually consists of two timer structures: twcal_timer and tw_timer. twcal_timer handles the shorter timeouts and is referred to as the "recycle timer".
tw_timer's period is TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS (i.e., 7.5 s). It is deleted under the following conditions:
(1) when an application's connect system call matches a tw_sock on the IP|port pair but no conflict is found, so the tw_sock is released (see the analysis following __inet_twsk_hashdance above);
(2) when inet_twsk_deschedule deletes a tw_sock and the tw queue becomes empty, tw_timer is disabled:
326 void inet_twsk_deschedule(struct inet_timewait_sock *tw,
327 			  struct inet_timewait_death_row *twdr)
328 {
329 	spin_lock(&twdr->death_lock);
330 	if (inet_twsk_del_dead_node(tw)) {
331 		inet_twsk_put(tw);
332 		if (--twdr->tw_count == 0) //the tw queue is empty
333 			del_timer(&twdr->tw_timer); //delete tw_timer
334 	}
335 	spin_unlock(&twdr->death_lock);
336 	__inet_twsk_kill(tw, twdr->hashinfo);
337 }
__inet_twsk_kill removes the tw_sock from the bind hash table and the ESTABLISHED hash table:
70 static void __inet_twsk_kill(struct inet_timewait_sock *tw,
71 			     struct inet_hashinfo *hashinfo)
72 {
73 	struct inet_bind_hashbucket *bhead;
74 	int refcnt;
75 	/* Unlink from established hashes. */
76 	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
77 
78 	spin_lock(lock);
79 	refcnt = inet_twsk_unhash(tw); //remove from the ESTABLISHED hash table
80 	spin_unlock(lock);
81 
82 	/* Disassociate with bind bucket. */
83 	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
84 			hashinfo->bhash_size)];
85 
86 	spin_lock(&bhead->lock);
87 	refcnt += inet_twsk_bind_unhash(tw, hashinfo); //remove from the bind hash table
88 	spin_unlock(&bhead->lock);
89 
90 #ifdef SOCK_REFCNT_DEBUG
91 	if (atomic_read(&tw->tw_refcnt) != 1) {
92 		pr_debug("%s timewait_sock %p refcnt=%d\n",
93 			 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
94 	}
95 #endif
96 	while (refcnt) {
97 		inet_twsk_put(tw);
98 		refcnt--;
99 	}
100 }
(3) when twcal_timer expires and inet_twdr_twcal_tick deletes tw_socks; if the tw queue becomes empty, tw_timer is disabled.
The recycle timer itself is never deleted; its timeout is slot * 2^INET_TWDR_RECYCLE_TICK jiffies. INET_TWDR_RECYCLE_TICK is defined as follows:
41 #if HZ <= 16 || HZ > 4096
42 # error Unsupported: HZ <= 16 or HZ > 4096
43 #elif HZ <= 32
44 # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
45 #elif HZ <= 64
46 # define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
47 #elif HZ <= 128
48 # define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
49 #elif HZ <= 256
50 # define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
51 #elif HZ <= 512
52 # define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
53 #elif HZ <= 1024
54 # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
55 #elif HZ <= 2048
56 # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
57 #else
58 # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
59 #endif
If jiffies advances every 1 ms (HZ = 1000), INET_TWDR_RECYCLE_TICK works out to 7 (with INET_TWDR_RECYCLE_SLOTS_LOG = 5). If timeo is 60 s (usually the maximum), slot works out to 469; since that is at least INET_TWDR_RECYCLE_SLOTS (32), such long timeouts are handed to the slow timer, so the recycle timer only ever serves timeouts of at most 31 slots of 128 ms, roughly 4 s. If 1 ms <= timeo <= 128 ms, then slot = 1, and the recycle timer's minimum timeout is 128 ms.
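These numbers are straightforward to verify; a sketch that mirrors the #elif ladder for a runtime hz value, assuming INET_TWDR_RECYCLE_SLOTS_LOG = 5:

#include <stdio.h>

#define INET_TWDR_RECYCLE_SLOTS_LOG 5

/* Mirror of the #elif ladder above for a runtime hz value; an
 * illustration, not the kernel macro. */
static int recycle_tick(int hz)
{
	int bound, shift;

	for (bound = 32, shift = 5; bound <= 2048; bound <<= 1, shift++)
		if (hz <= bound)
			return shift + 2 - INET_TWDR_RECYCLE_SLOTS_LOG;
	return 12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG;	/* 2048 < HZ <= 4096 */
}

int main(void)
{
	int tick = recycle_tick(1000);			/* -> 7 */
	int slot = (60000 + (1 << tick) - 1) >> tick;	/* -> 469 */

	printf("INET_TWDR_RECYCLE_TICK=%d, slot for 60 s = %d\n", tick, slot);
	return 0;
}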
The expiry handler of twcal_timer is inet_twdr_twcal_tick:
420 void inet_twdr_twcal_tick(unsigned long data)
421 {
422 	struct inet_timewait_death_row *twdr;
423 	int n, slot;
424 	unsigned long j;
425 	unsigned long now = jiffies;
426 	int killed = 0;
427 	int adv = 0;
428 
429 	twdr = (struct inet_timewait_death_row *)data;
430 
431 	spin_lock(&twdr->death_lock);
432 	if (twdr->twcal_hand < 0) //the recycle timer is unset or has expired
433 		goto out;
434 
435 	slot = twdr->twcal_hand;
436 	j = twdr->twcal_jiffie; //fetch the time the timer was first armed
437 
438 	for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { //walk all the slots
439 		if (time_before_eq(j, now)) { //this slot has expired
440 			struct hlist_node *safe;
441 			struct inet_timewait_sock *tw;
442 
443 			inet_twsk_for_each_inmate_safe(tw, safe,
444 					&twdr->twcal_row[slot]) { //walk all the nodes in one slot
445 				__inet_twsk_del_dead_node(tw); //remove the timer node
446 				__inet_twsk_kill(tw, twdr->hashinfo); //take the tw sock out of the TCP ESTABLISHED hash table
...
450 				inet_twsk_put(tw);
451 				killed++; //count the deleted nodes
452 			}
453 		} else { //not expired yet
454 			if (!adv) {
455 				adv = 1;
456 				twdr->twcal_jiffie = j; //update the time base of the unexpired slots
457 				twdr->twcal_hand = slot; //update the starting slot of the unexpired slots
458 			}
459 
460 			if (!hlist_empty(&twdr->twcal_row[slot])) {
461 				mod_timer(&twdr->twcal_timer, j);
462 				goto out;
463 			}
464 		}
465 		j += 1 << INET_TWDR_RECYCLE_TICK;
466 		slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); //advance to the next slot
467 	}
468 	twdr->twcal_hand = -1; //mark the recycle timer as expired
469 
470 out:
471 	if ((twdr->tw_count -= killed) == 0)
472 		del_timer(&twdr->tw_timer);
473 #ifndef CONFIG_NET_NS
474 	NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
475 #endif
476 	spin_unlock(&twdr->death_lock);
477 }
Lines 439-451: the recycle timer treats every node that falls into the same slot identically. Its basic behavior: if a slot has expired, delete its nodes; otherwise, re-arm the recycle timer for that slot.
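This traversal is the classic timer-wheel pattern. A toy model, not kernel code, assuming HZ = 1000 so each bucket spans 128 ms:

/* Toy model of the recycle wheel in inet_twdr_twcal_tick(): expired
 * buckets are purged wholesale, and the timer is re-armed at the first
 * future bucket. (The real code also skips empty future buckets.) */
#include <stdio.h>

#define SLOTS   32
#define TICK_MS 128

static int wheel[SLOTS];	/* tw socks pending per bucket */

static void twcal_tick(long *base, int *hand, long now)
{
	int n;

	for (n = 0; n < SLOTS; n++) {
		if (*base > now) {	/* first future bucket: re-arm here */
			printf("re-arm twcal_timer at t=%ld ms (slot %d)\n",
			       *base, *hand);
			return;
		}
		if (wheel[*hand]) {	/* expired bucket: purge it whole */
			printf("slot %d: kill %d tw socks\n",
			       *hand, wheel[*hand]);
			wheel[*hand] = 0;
		}
		*base += TICK_MS;
		*hand = (*hand + 1) & (SLOTS - 1);
	}
}

int main(void)
{
	long base = 0;		/* plays the role of twcal_jiffie */
	int hand = 0;		/* plays the role of twcal_hand */

	wheel[1] = 3;		/* three tw socks due at 128 ms */
	wheel[4] = 1;		/* one tw sock due at 512 ms */
	twcal_tick(&base, &hand, 200);	/* handler fires at t = 200 ms */
	return 0;
}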
The expiry handler of the slow timer tw_timer is inet_twdr_hangman:
262 void inet_twdr_hangman(unsigned long data)
263 {
264 	struct inet_timewait_death_row *twdr;
265 	unsigned int need_timer;
266 
267 	twdr = (struct inet_timewait_death_row *)data;
268 	spin_lock(&twdr->death_lock);
269 
270 	if (twdr->tw_count == 0) //no tw_socks
271 		goto out;
272 
273 	need_timer = 0;
274 	if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { //delete the nodes on this slot's slow-timer list along with their tw_socks
275 		twdr->thread_slots |= (1 << twdr->slot); //record the current slot
276 		schedule_work(&twdr->twkill_work); //too many tw_socks were killed: hand the unfinished work to the workqueue
277 		need_timer = 1;
278 	} else { //not too many tw_socks were killed
279 		/* We purged the entire slot, anything left? */
280 		if (twdr->tw_count) //tw_socks remain
281 			need_timer = 1; //keep tw_timer armed
282 		twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); //advance to the next slot
283 	}
284 	if (need_timer)
285 		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
286 out:
287 	spin_unlock(&twdr->death_lock);
288 }
Each expiry of inet_twdr_hangman handles only one slot, then re-arms tw_timer to fire after twdr->period and handle the next slot. Since the deadlines of adjacent slots differ by exactly one twdr->period, every slot gets handled in time.
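The pacing can be pictured with a toy model (an illustration only, assuming HZ = 1000): each expiry purges one slot and re-arms one period out, so the eight slots cover a full 60 s TIME_WAIT cycle.

/* Toy model of tw_timer pacing: one slot per expiry, re-armed every
 * twdr->period (7.5 s); a tw_sock placed DIV_ROUND_UP(timeo, period)
 * slots ahead is therefore purged at or shortly after its deadline. */
#include <stdio.h>

#define SLOTS  8
#define PERIOD 7500	/* ms */

int main(void)
{
	int slot = 0;
	long t;

	for (t = PERIOD; t <= SLOTS * PERIOD; t += PERIOD) {
		printf("t=%5ld ms: inet_twdr_hangman purges slot %d\n",
		       t, slot);
		slot = (slot + 1) & (SLOTS - 1);
	}
	return 0;
}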
The inet_twdr_do_twkill_work function deletes the nodes on a slow-timer list along with their tw_socks:
215 static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
216 				    const int slot)
217 {
218 	struct inet_timewait_sock *tw;
219 	unsigned int killed;
220 	int ret;
221 
222 	/* NOTE: compare this to previous version where lock
223 	 * was released after detaching chain. It was racy,
224 	 * because tw buckets are scheduled in not serialized context
225 	 * in 2.3 (with netfilter), and with softnet it is common, because
226 	 * soft irqs are not sequenced.
227 	 */
228 	killed = 0;
229 	ret = 0;
230 rescan:
231 	inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) { //walk the slow-timer queue
232 		__inet_twsk_del_dead_node(tw);
233 		spin_unlock(&twdr->death_lock);
234 		__inet_twsk_kill(tw, twdr->hashinfo);
235 #ifdef CONFIG_NET_NS
236 		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
237 #endif
238 		inet_twsk_put(tw);
239 		killed++;
240 		spin_lock(&twdr->death_lock);
241 		if (killed > INET_TWDR_TWKILL_QUOTA) { //killed too many
242 			ret = 1;
243 			break;
244 		}
245 
246 		/* While we dropped twdr->death_lock, another cpu may have
247 		 * killed off the next TW bucket in the list, therefore
248 		 * do a fresh re-read of the hlist head node with the
249 		 * lock reacquired. We still use the hlist traversal
250 		 * macro in order to get the prefetches.
251 		 */
252 		goto rescan;
253 	}
254 
255 	twdr->tw_count -= killed;
256 #ifndef CONFIG_NET_NS
257 	NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
258 #endif
259 	return ret;
260 }
inet_twdr_twkill_work is the handler behind twdr->twkill_work; the worker thread uses it to finish the killing that inet_twdr_do_twkill_work left undone:
291 void inet_twdr_twkill_work(struct work_struct *work)
292 {
293 	struct inet_timewait_death_row *twdr =
294 		container_of(work, struct inet_timewait_death_row, twkill_work);
295 	int i;
296 
297 	BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
298 			(sizeof(twdr->thread_slots) * 8));
299 
300 	while (twdr->thread_slots) {
301 		spin_lock_bh(&twdr->death_lock);
302 		for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
303 			if (!(twdr->thread_slots & (1 << i))) //slot i needs no processing
304 				continue;
305 
306 			while (inet_twdr_do_twkill_work(twdr, i) != 0) { //loop until the slot is emptied
307 				if (need_resched()) {
308 					spin_unlock_bh(&twdr->death_lock);
309 					schedule();
310 					spin_lock_bh(&twdr->death_lock);
311 				}
312 			}
313 
314 			twdr->thread_slots &= ~(1 << i); //this slot has been emptied
315 		}
316 		spin_unlock_bh(&twdr->death_lock);
317 	}
318 }
Question: when the slow timer expires and the number of released tw_socks exceeds the quota, why is the remaining work moved to a worker thread?
Answer (my own understanding): Linux timers run in softirq context, and running there too long would starve other work on the current CPU, which is unfair. A worker thread runs at lower priority, so it does not matter if it runs a little longer.