9.10 TIME_WAIT Timer

9.10.1 Why

        When a socket enters the TIME_WAIT state, the TIME_WAIT timer is started. Until it expires, a tw sock standing in for the socket handles packets belonging to the old connection, preventing them from harming a new connection. When the timer expires, the tw sock is deleted and the port number it occupied is released.
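
        The effect is easy to observe from userspace. The sketch below is my own illustration (not kernel code; error handling omitted): it builds a loopback connection, lets the client close first, and the client's 4-tuple then lingers as TIME-WAIT in the output of ss until the timer fires.

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    struct sockaddr_in addr;
    socklen_t len = sizeof(addr);
    int lfd = socket(AF_INET, SOCK_STREAM, 0);
    int cfd = socket(AF_INET, SOCK_STREAM, 0);

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port = 0;                      /* let the kernel pick a port */

    bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
    listen(lfd, 1);
    getsockname(lfd, (struct sockaddr *)&addr, &len);

    connect(cfd, (struct sockaddr *)&addr, sizeof(addr));
    int afd = accept(lfd, NULL, NULL);

    close(cfd);   /* active close: cfd's side will end up in TIME_WAIT */
    close(afd);   /* the server's FIN lets the client leave FIN_WAIT2 */
    printf("check: ss -tan | grep %d\n", ntohs(addr.sin_port));
    return 0;
}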

9.10.2 When

        The TIME_WAIT timer is installed by the tcp_time_wait function, which is called at the following points:

(1) The socket is closed in the TCP_FIN_WAIT2 state, tp->linger2 has not been set to a negative value with the TCP_LINGER2 option, and tcp_fin_time is no greater than TCP_TIMEWAIT_LEN:

2059 void tcp_close(struct sock *sk, long timeout)
2060 {
...
2183     if (sk->sk_state == TCP_FIN_WAIT2) {
2184         struct tcp_sock *tp = tcp_sk(sk);
2185         if (tp->linger2 < 0) {
...
2190         } else {
2191             const int tmo = tcp_fin_time(sk);
2192 
2193             if (tmo > TCP_TIMEWAIT_LEN) {
2194                 inet_csk_reset_keepalive_timer(sk,
2195                         tmo - TCP_TIMEWAIT_LEN);
2196             } else {
2197                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2198                 goto out;
2199             }
2200         }
...
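
        tp->linger2 here is controlled from userspace through the TCP_LINGER2 socket option (in seconds; when it is 0, tcp_fin_time falls back to the net.ipv4.tcp_fin_timeout sysctl). A hedged sketch of how an application steers the branch above:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Sketch only: a negative value makes the tp->linger2 < 0 branch taken on
 * close() (no FIN_WAIT2 -> TIME_WAIT transition); a small positive value
 * keeps tcp_fin_time() at or below TCP_TIMEWAIT_LEN, so tcp_time_wait()
 * is called directly. */
static int set_fin_wait2_linger(int fd, int seconds)
{
    return setsockopt(fd, IPPROTO_TCP, TCP_LINGER2,
                      &seconds, sizeof(seconds));
}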
 
(2) A FIN is received in the TCP_FIN_WAIT2 state and an ACK is sent:

3783 static void tcp_fin(struct sock *sk)
3784 {
...
3818     case TCP_FIN_WAIT2:
3819         /* Received a FIN -- send ACK and enter TIME_WAIT. */
3820         tcp_send_ack(sk);
3821         tcp_time_wait(sk, TCP_TIME_WAIT, 0);

(3) An orphan socket receives an ACK in the TCP_FIN_WAIT1 state, and all of the following hold:

1) tp->linger2 has not been set to a negative value with the TCP_LINGER2 option;

2) tcp_fin_time is no greater than TCP_TIMEWAIT_LEN;

3) the ACK carries no data, or only old data;

4) the ACK carries no FIN flag and the socket is not locked by the application process:

5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601               const struct tcphdr *th, unsigned int len)
5602 {
...
5751         case TCP_FIN_WAIT1:
...
5780                 if (!sock_flag(sk, SOCK_DEAD))
5781                     /* Wake up lingering close() */
5782                     sk->sk_state_change(sk);
5783                 else {
5784                     int tmo;
5785 
5786                     if (tp->linger2 < 0 ||
5787                         (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
5788                          after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
5789                         tcp_done(sk);
5790                         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
5791                         return 1;
5792                     }
5793 
5794                     tmo = tcp_fin_time(sk);
5795                     if (tmo > TCP_TIMEWAIT_LEN) {
5796                         inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
5797                     } else if (th->fin || sock_owned_by_user(sk)) {
5798                         /* Bad case. We could lose such FIN otherwise.
5799                          * It is not a big problem, but it looks confusing
5800                          * and not so rare event. We still can lose it now,
5801                          * if it spins in bh_lock_sock(), but it is really
5802                          * marginal case.
5803                          */
5804                         inet_csk_reset_keepalive_timer(sk, tmo);
5805                     } else {
5806                         tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
5807                         goto discard;
5808                     }
...

(4) An ACK is received in the TCP_CLOSING state and all sent data has been acknowledged (snd_una == write_seq):
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601               const struct tcphdr *th, unsigned int len)
5602 {
...
5813         case TCP_CLOSING:
5814             if (tp->snd_una == tp->write_seq) {
5815                 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
5816                 goto discard;
5817             }
...

(5) The FIN_WAIT2 timer expires, tp->linger2 has not been set to a negative value with the TCP_LINGER2 option, and tcp_fin_time is greater than TCP_TIMEWAIT_LEN:
558 static void tcp_keepalive_timer (unsigned long data)
559 {
...
578     if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
579         if (tp->linger2 >= 0) {
580             const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
581 
582             if (tmo > 0) {
583                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
584                 goto out;
585             }
...
        tcp_time_wait calls inet_twsk_schedule to install the TIME_WAIT timer:
266 void tcp_time_wait(struct sock *sk, int state, int timeo)
267 {
...
327         __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); // put the tw sock into the ESTABLISHED hash table and the bind hash table, and remove sk from the ESTABLISHED hash table
328 
329         /* Get the TIME_WAIT timeout firing. */
330         if (timeo < rto)
331             timeo = rto;
332 
333         if (recycle_ok) {
334             tw->tw_timeout = rto;
335         } else {
336             tw->tw_timeout = TCP_TIMEWAIT_LEN;
337             if (state == TCP_TIME_WAIT)
338                 timeo = TCP_TIMEWAIT_LEN;
339         }
340 
341         inet_twsk_schedule(tw, &tcp_death_row, timeo,
342                    TCP_TIMEWAIT_LEN);
343         inet_twsk_put(tw);
...
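
        Lines 329-338 select the timeout that will actually be scheduled: never shorter than one RTO, the full TCP_TIMEWAIT_LEN for a genuine TIME_WAIT when recycling is off, and only about one RTO when tcp_tw_recycle accepted the peer's timestamps (recycle_ok). Restated as plain C for clarity (a paraphrase, not kernel code; the constant assumes HZ = 1000):

#define TCP_TIMEWAIT_LEN_J (60 * 1000)      /* 60s in jiffies at HZ = 1000 */

/* Paraphrase of the timeout selection in tcp_time_wait() above. */
static int tw_schedule_timeout(int state_is_time_wait, int recycle_ok,
                               int timeo, int rto)
{
    if (timeo < rto)
        timeo = rto;                /* never shorter than one RTO */
    if (!recycle_ok && state_is_time_wait)
        timeo = TCP_TIMEWAIT_LEN_J; /* full 60s for a real TIME_WAIT */
    return timeo;                   /* the value handed to inet_twsk_schedule() */
}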

        __inet_twsk_hashdance adds the tw_sock to the bind hash table and the ESTABLISHED hash table, so that until the tw_sock is deleted, the corresponding IP|port pair can neither be bound nor used to establish a new connection:

126 void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
127                struct inet_hashinfo *hashinfo)
128 {
129     const struct inet_sock *inet = inet_sk(sk); 
130     const struct inet_connection_sock *icsk = inet_csk(sk);
131     struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
132     spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
133     struct inet_bind_hashbucket *bhead;
134     /* Step 1: Put TW into bind hash. Original socket stays there too.
135        Note, that any socket with inet->num != 0 MUST be bound in
136        binding cache, even if it is closed.
137      */
138     bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
139             hashinfo->bhash_size)];        
140     spin_lock(&bhead->lock);
141     tw->tw_tb = icsk->icsk_bind_hash;
142     WARN_ON(!icsk->icsk_bind_hash);
143     inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);    // add to the bind hash table
144     spin_unlock(&bhead->lock);
145 
146     spin_lock(lock);
...
153     inet_twsk_add_node_rcu(tw, &ehead->twchain);  // add to the ESTABLISHED hash table
154 
155     /* Step 3: Remove SK from established hash. */
156     if (__sk_nulls_del_node_init_rcu(sk))
157         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
...
167     atomic_add(1 + 1 + 1, &tw->tw_refcnt);
168 
169     spin_unlock(lock);
170 } 
        As a result, when an application process uses the bind system call to bind the same IP|port pair as a tw_sock, the kernel runs inet_csk_bind_conflict, which matches the tw_sock in the bind hash table, reports a conflict, and the bind fails (see section 2.2, the bind system call). When establishing a connection, inet_hash_connect calls __inet_check_established to check whether the connection about to be established conflicts with an existing one:
311 static int __inet_check_established(struct inet_timewait_death_row *death_row,
312                     struct sock *sk, __u16 lport,
313                     struct inet_timewait_sock **twp)
314 {
...
335     sk_nulls_for_each(sk2, node, &head->twchain) {
336         if (sk2->sk_hash != hash)
337             continue;
338 
339         if (likely(INET_TW_MATCH(sk2, net, acookie,
340                      saddr, daddr, ports, dif))) {    // address|port match
341             tw = inet_twsk(sk2);
342             if (twsk_unique(sk, sk2, twp))    // tcp_twsk_unique decides whether there is a conflict
343                 goto unique;    // no conflict
344             else
345                 goto not_unique; // conflict
346         }
347     }
348     tw = NULL;
...
359 unique:
...
376     if (twp) {
377         *twp = tw;    // hand it to the caller
378     } else if (tw) {
379         /* Silly. Should hash-dance instead... */
380         inet_twsk_deschedule(tw, death_row);
381 
382         inet_twsk_put(tw);
383     }
384     return 0;
385 
386 not_unique:
387     spin_unlock(lock);
388     return -EADDRNOTAVAIL;
389 } 
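
        The userspace consequences, as a hedged illustration (assuming 127.0.0.1:port is currently held by a tw_sock and SO_REUSEADDR is not set): bind() fails with EADDRINUSE through the bind hash match, and a connect() that would reuse the identical 4-tuple fails with EADDRNOTAVAIL through the not_unique path above:

#include <arpa/inet.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Try to bind 127.0.0.1:port while a tw_sock still owns that port. */
static void try_bind(unsigned short port)
{
    struct sockaddr_in a;
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    memset(&a, 0, sizeof(a));
    a.sin_family = AF_INET;
    a.sin_port = htons(port);
    a.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    if (bind(fd, (struct sockaddr *)&a, sizeof(a)) < 0)
        printf("bind(%u): %s\n", port, strerror(errno)); /* EADDRINUSE expected */
    close(fd);
}

        With net.ipv4.tcp_tw_reuse enabled (and timestamps on), connect() can instead take the unique branch and recycle the tw_sock; tcp_twsk_unique, shown next, makes that decision.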
        The tcp_twsk_unique function:
 109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110 {   
 111     const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 112     struct tcp_sock *tp = tcp_sk(sk);
...
 125     if (tcptw->tw_ts_recent_stamp &&    // timestamps enabled and a packet was received in the TIME_WAIT state
 126         (twp == NULL || (sysctl_tcp_tw_reuse &&
 127                  get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 128         tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 129         if (tp->write_seq == 0)
 130             tp->write_seq = 1;
 131         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 132         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 133         sock_hold(sktw);
 134         return 1;
 135     }
 136 
 137     return 0;
 138 }
        So tcp_twsk_unique returns 1 (no conflict) when tcptw->tw_ts_recent_stamp is set (timestamps enabled and a packet was received during TIME_WAIT), and in addition:

(1) the caller of __inet_check_established does not need the tw_sock back (i.e. twp == NULL is true), or

(2) the net.ipv4.tcp_tw_reuse sysctl permits tw_sock reuse and the last recorded timestamp is over one second old.

        When there is no conflict, in case (1) __inet_check_established frees the tw_sock itself; otherwise the tw_sock is handed back to its caller, inet_hash_connect. But what exactly does case (1) mean, and what does inet_hash_connect do with the tw_sock when case (1) does not occur? Look at the code:

589 int inet_hash_connect(struct inet_timewait_death_row *death_row,
590               struct sock *sk)
591 {
592     return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
593             __inet_check_established, __inet_hash_nolisten);
594 }
        So the user of __inet_check_established is __inet_hash_connect:
477 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
478         struct sock *sk, u32 port_offset,
479         int (*check_established)(struct inet_timewait_death_row *,
480             struct sock *, __u16, struct inet_timewait_sock **),
481         int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
482 {
...
491     if (!snum) {
...
520                     if (!check_established(death_row, sk,
521                                 port, &tw))
522                         goto ok;
...
544 ok:
545         hint += i;
546 
547         /* Head lock still held and bh's disabled */
548         inet_bind_hash(sk, tb, port);
549         if (sk_unhashed(sk)) {
550             inet_sk(sk)->inet_sport = htons(port);
551             twrefcnt += hash(sk, tw);    // add sk to the ESTABLISHED hash table and unlink the tw_sock from it
552         }
553         if (tw)
554             twrefcnt += inet_twsk_bind_unhash(tw, hinfo);  // unlink the tw_sock from the bind hash table
555         spin_unlock(&head->lock);
556 
557         if (tw) {
558             inet_twsk_deschedule(tw, death_row);  // release the tw_sock
559             while (twrefcnt) {
560                 twrefcnt--;
561                 inet_twsk_put(tw);
562             }
563         }
564 
565         ret = 0;
566         goto out;
567     }
568 
569     head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
570     tb  = inet_csk(sk)->icsk_bind_hash;
571     spin_lock_bh(&head->lock);
572     if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {    // this socket is the only one bound to this IP|port pair
573         hash(sk, NULL);
574         spin_unlock_bh(&head->lock);
575         return 0;
576     } else {
577         spin_unlock(&head->lock);
578         /* No definite answer... Walk to established hash table */
579         ret = check_established(death_row, sk, snum, NULL);
580 out:
581         local_bh_enable();
582         return ret;
583     }
584 }                

        Case (1) only occurs when the port being bound is non-zero, meaning the application process had already used the bind system call successfully before calling connect. Since there was no conflict at bind time, the tw_sock can simply be released at connect time. When case (1) does not occur, the tw_sock is likewise released and unlinked from the hash tables.
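
        In userspace terms, case (1) looks like the sketch below (my own illustration): the application binds an explicit source port first, so snum != 0 and __inet_hash_connect calls check_established with twp == NULL:

#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Bind an explicit source port, then connect: the snum != 0 path. */
static int connect_from_port(const struct sockaddr_in *dst,
                             unsigned short sport)
{
    struct sockaddr_in src;
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    memset(&src, 0, sizeof(src));
    src.sin_family = AF_INET;
    src.sin_port = htons(sport);            /* snum != 0 in the kernel */
    src.sin_addr.s_addr = htonl(INADDR_ANY);

    if (bind(fd, (struct sockaddr *)&src, sizeof(src)) < 0 ||
        connect(fd, (const struct sockaddr *)dst, sizeof(*dst)) < 0) {
        close(fd);
        return -1;    /* EADDRNOTAVAIL if the 4-tuple still conflicts */
    }
    return fd;        /* any non-conflicting tw_sock was freed in-kernel */
}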

        tcp_death_row is defined as:

 35 struct inet_timewait_death_row tcp_death_row = {
 36     .sysctl_max_tw_buckets = NR_FILE * 2,
 37     .period     = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
 38     .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
 39     .hashinfo   = &tcp_hashinfo,
 40     .tw_timer   = TIMER_INITIALIZER(inet_twdr_hangman, 0,
 41                         (unsigned long)&tcp_death_row),
 42     .twkill_work    = __WORK_INITIALIZER(tcp_death_row.twkill_work,
 43                          inet_twdr_twkill_work),
 44 /* Short-time timewait calendar */
 45 
 46     .twcal_hand = -1,
 47     .twcal_timer    = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
 48                         (unsigned long)&tcp_death_row),
 49 };
        The inet_twsk_schedule function:
340 void inet_twsk_schedule(struct inet_timewait_sock *tw,
341                struct inet_timewait_death_row *twdr,
342                const int timeo, const int timewait_len)
343 {           
344     struct hlist_head *list;
345     int slot;
346             
...     // compute the tw sock's slot in the TIME_WAIT timer lists; the larger the slot, the longer the timeout
371     slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
372 
373     spin_lock(&twdr->death_lock);
374
375     /* Unlink it, if it was scheduled */
376     if (inet_twsk_del_dead_node(tw))// already on a TIME_WAIT timer list, so unlink it
377         twdr->tw_count--;
378     else
379         atomic_inc(&tw->tw_refcnt);
380
381     if (slot >= INET_TWDR_RECYCLE_SLOTS) {   // timeout too long: use the slow timer
382         /* Schedule to slow timer */
383         if (timeo >= timewait_len) {
384             slot = INET_TWDR_TWKILL_SLOTS - 1;
385         } else {
386             slot = DIV_ROUND_UP(timeo, twdr->period);
387             if (slot >= INET_TWDR_TWKILL_SLOTS)
388                 slot = INET_TWDR_TWKILL_SLOTS - 1;
389         }
390         tw->tw_ttd = jiffies + timeo;
391         slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
392         list = &twdr->cells[slot];  // add the tw_sock to twdr->cells
393     } else {  // short timeouts all go to the recycle timer
394         tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
395
396         if (twdr->twcal_hand < 0) { // the recycle timer is not set or has already expired
397             twdr->twcal_hand = 0;
398             twdr->twcal_jiffie = jiffies;  // record when the timer was first set
399             twdr->twcal_timer.expires = twdr->twcal_jiffie +
400                           (slot << INET_TWDR_RECYCLE_TICK);
401             add_timer(&twdr->twcal_timer);// arm the recycle timer
402         } else {
403             if (time_after(twdr->twcal_timer.expires,
404                        jiffies + (slot << INET_TWDR_RECYCLE_TICK)))  // the recycle timer has not expired yet
405                 mod_timer(&twdr->twcal_timer,
406                       jiffies + (slot << INET_TWDR_RECYCLE_TICK));// re-arm the recycle timer
407             slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
408         }
409         list = &twdr->twcal_row[slot];  // add the tw_sock to twdr->twcal_row
410     }
411
412     hlist_add_head(&tw->tw_death_node, list);// add to the TIME_WAIT timer list
413
414     if (twdr->tw_count++ == 0)// the list was empty before this insertion
415         mod_timer(&twdr->tw_timer, jiffies + twdr->period); // arm the slow timer
416     spin_unlock(&twdr->death_lock);
417 }

        Line 371: slots are divided by timeout length: 0 jiffies falls in slot 0, 1 to 2^INET_TWDR_RECYCLE_TICK jiffies in slot 1, 2^INET_TWDR_RECYCLE_TICK + 1 to 2^(INET_TWDR_RECYCLE_TICK + 1) jiffies in slot 2, and so on; each slot covers 2^INET_TWDR_RECYCLE_TICK jiffies.

        Line 386: slots are again divided by timeout length, but here each slot covers twdr->period.

        So the TIME_WAIT timer is really built from two timer structures, twcal_timer and tw_timer. twcal_timer handles the shorter timeouts and is referred to below as the "recycle timer".

        tw_timer fires every TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS (i.e. 7.5s). It is deleted in the following cases:

(1) during a connect system call the IP|port pair matches a tw_sock but no conflict is found (see the analysis following __inet_twsk_hashdance above);

(2) inet_twsk_deschedule is called to delete a tw_sock and the tw queue is left empty, in which case tw_timer is disabled:

326 void inet_twsk_deschedule(struct inet_timewait_sock *tw,
327               struct inet_timewait_death_row *twdr)
328 {   
329     spin_lock(&twdr->death_lock);
330     if (inet_twsk_del_dead_node(tw)) {
331         inet_twsk_put(tw);
332         if (--twdr->tw_count == 0)     // the tw queue is now empty
333             del_timer(&twdr->tw_timer);   // delete tw_timer
334     }   
335     spin_unlock(&twdr->death_lock);
336     __inet_twsk_kill(tw, twdr->hashinfo);
337 }           

        __inet_twsk_kill removes the tw_sock from both the bind hash table and the ESTABLISHED hash table:

 70 static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 71                  struct inet_hashinfo *hashinfo)
 72 {   
 73     struct inet_bind_hashbucket *bhead;
 74     int refcnt;
 75     /* Unlink from established hashes. */
 76     spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
 77 
 78     spin_lock(lock);
 79     refcnt = inet_twsk_unhash(tw);    // remove from the ESTABLISHED hash table
 80     spin_unlock(lock);
 81     
 82     /* Disassociate with bind bucket. */
 83     bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
 84             hashinfo->bhash_size)];
 85         
 86     spin_lock(&bhead->lock);
 87     refcnt += inet_twsk_bind_unhash(tw, hashinfo);   // remove from the bind hash table
 88     spin_unlock(&bhead->lock);
 89     
 90 #ifdef SOCK_REFCNT_DEBUG
 91     if (atomic_read(&tw->tw_refcnt) != 1) {
 92         pr_debug("%s timewait_sock %p refcnt=%d\n",
 93              tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
 94     }
 95 #endif
 96     while (refcnt) {
 97         inet_twsk_put(tw);
 98         refcnt--;
 99     }
100 }

(3) twcal_timer expires and inet_twdr_twcal_tick deletes tw_socks; if the tw queue is left empty, tw_timer is disabled.

        The recycle timer itself is never deleted; its timeout is slot * 2^INET_TWDR_RECYCLE_TICK jiffies. INET_TWDR_RECYCLE_TICK is defined as follows:

 41 #if HZ <= 16 || HZ > 4096
 42 # error Unsupported: HZ <= 16 or HZ > 4096
 43 #elif HZ <= 32
 44 # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 45 #elif HZ <= 64
 46 # define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 47 #elif HZ <= 128
 48 # define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 49 #elif HZ <= 256
 50 # define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 51 #elif HZ <= 512
 52 # define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 53 #elif HZ <= 1024
 54 # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 55 #elif HZ <= 2048
 56 # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 57 #else
 58 # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 59 #endif

        If jiffies advances every 1ms (HZ = 1000), INET_TWDR_RECYCLE_TICK is 7. If timeo is 60s (usually the maximum), slot comes out as 469; since that is not below INET_TWDR_RECYCLE_SLOTS (32), such a timeout actually goes to the slow timer, so the recycle timer only ever serves timeouts shorter than 32 << 7 jiffies, about 4.1s. If 1ms <= timeo <= 128ms, then slot = 1 and the recycle timer's minimum timeout is 128ms.
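
        The arithmetic can be verified with a few lines of C (assuming HZ = 1000 and the kernel's INET_TWDR_RECYCLE_SLOTS_LOG = 5, i.e. 32 recycle slots):

#include <stdio.h>

#define RECYCLE_SLOTS_LOG 5                             /* INET_TWDR_RECYCLE_SLOTS_LOG */
#define RECYCLE_SLOTS     (1 << RECYCLE_SLOTS_LOG)      /* 32 */
#define RECYCLE_TICK      (10 + 2 - RECYCLE_SLOTS_LOG)  /* 7 when HZ <= 1024 */

int main(void)
{
    /* the rounding-up shift from line 371 */
    int slot = (60 * 1000 + (1 << RECYCLE_TICK) - 1) >> RECYCLE_TICK;
    printf("timeo=60s -> slot=%d (>= %d, so the slow timer is used)\n",
           slot, RECYCLE_SLOTS);                        /* slot = 469 */

    slot = (1 + (1 << RECYCLE_TICK) - 1) >> RECYCLE_TICK;
    printf("timeo=1ms -> slot=%d, recycle timeout=%dms\n",
           slot, slot << RECYCLE_TICK);                 /* slot = 1, 128ms */
    return 0;
}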

9.10.3 What

        The expiry handler of twcal_timer is inet_twdr_twcal_tick:

420 void inet_twdr_twcal_tick(unsigned long data)
421 {
422     struct inet_timewait_death_row *twdr;
423     int n, slot;
424     unsigned long j;
425     unsigned long now = jiffies;
426     int killed = 0;
427     int adv = 0;
428
429     twdr = (struct inet_timewait_death_row *)data;
430
431     spin_lock(&twdr->death_lock);
432     if (twdr->twcal_hand < 0)// the recycle timer is not set or has already expired
433         goto out;
434
435     slot = twdr->twcal_hand;
436     j = twdr->twcal_jiffie;  // the time the timer was first set
437
438     for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {  // iterate over all slots
439         if (time_before_eq(j, now)) { // this slot has already expired
440             struct hlist_node *safe;
441             struct inet_timewait_sock *tw;
442
443             inet_twsk_for_each_inmate_safe(tw, safe,
444                                &twdr->twcal_row[slot]) {   // iterate over every node in this slot
445                 __inet_twsk_del_dead_node(tw); // remove the timer node
446                 __inet_twsk_kill(tw, twdr->hashinfo);// remove the tw sock from the hash tables
...
450                 inet_twsk_put(tw);             
451                 killed++;  // count the deleted nodes
452             }
453         } else {// not expired yet
454             if (!adv) {   
455                 adv = 1;  
456                 twdr->twcal_jiffie = j;     // new time origin for the slots not yet expired
457                 twdr->twcal_hand = slot;    // new starting slot for the slots not yet expired
458             }
459
460             if (!hlist_empty(&twdr->twcal_row[slot])) {
461                 mod_timer(&twdr->twcal_timer, j);
462                 goto out;
463             }
464         }
465         j += 1 << INET_TWDR_RECYCLE_TICK;
466         slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);  // move to the next slot
467     }
468     twdr->twcal_hand = -1;   // mark the recycle timer as expired
469
470 out:
471     if ((twdr->tw_count -= killed) == 0)
472         del_timer(&twdr->tw_timer);
473 #ifndef CONFIG_NET_NS
474     NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
475 #endif
476     spin_unlock(&twdr->death_lock);
477 }
        Lines 439-451: the recycle timer treats all nodes that fall into the same slot identically. Its basic action is: if a slot has expired, delete its nodes; otherwise re-arm the recycle timer.

        The expiry handler of the slow timer tw_timer is inet_twdr_hangman:

262 void inet_twdr_hangman(unsigned long data)
263 {
264     struct inet_timewait_death_row *twdr;
265     unsigned int need_timer;
266
267     twdr = (struct inet_timewait_death_row *)data;
268     spin_lock(&twdr->death_lock);
269
270     if (twdr->tw_count == 0)  // no tw_socks
271         goto out;
272
273     need_timer = 0;
274     if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { // purge this slot of the slow timer list and its tw_socks
275         twdr->thread_slots |= (1 << twdr->slot);  // record the current slot
276         schedule_work(&twdr->twkill_work); // kill quota exceeded: hand the unfinished purge to a worker thread
277         need_timer = 1;
278     } else {  // the kill quota was not exceeded
279         /* We purged the entire slot, anything left?  */
280         if (twdr->tw_count)  // there are still tw_socks left
281             need_timer = 1;  // keep tw_timer armed
282         twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));  // move to the next slot
283     }
284     if (need_timer)
285         mod_timer(&twdr->tw_timer, jiffies + twdr->period);
286 out:
287     spin_unlock(&twdr->death_lock);
288 }
        Each time inet_twdr_hangman fires it processes a single slot, then re-arms tw_timer so that the next slot is handled after another twdr->period. Since adjacent slots' expiry times differ by exactly one twdr->period, every slot is processed on time.
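
        A toy model of that cadence (illustration only, not kernel code): eight slots, one purged per expiry, re-armed every period = TCP_TIMEWAIT_LEN / 8 = 7.5s, so the wheel wraps in exactly 60s:

#include <stdio.h>

#define TWKILL_SLOTS 8
#define PERIOD_MS    (60000 / TWKILL_SLOTS)      /* 7500 ms */

int main(void)
{
    int cur = 0;                                 /* models twdr->slot */
    long now_ms = 0;

    for (int tick = 0; tick < TWKILL_SLOTS; tick++) {
        printf("t=%5ldms: purge slot %d\n", now_ms, cur);
        cur = (cur + 1) & (TWKILL_SLOTS - 1);    /* advance, like line 282 */
        now_ms += PERIOD_MS;                     /* mod_timer(..., +period) */
    }
    return 0;   /* after 8 ticks = 60s every slot has been visited once */
}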

        inet_twdr_do_twkill_work deletes nodes from the slow timer list together with their tw_socks:

215 static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
216                     const int slot)
217 {
218     struct inet_timewait_sock *tw;
219     unsigned int killed;
220     int ret;
221
222     /* NOTE: compare this to previous version where lock
223      * was released after detaching chain. It was racy,
224      * because tw buckets are scheduled in not serialized context
225      * in 2.3 (with netfilter), and with softnet it is common, because
226      * soft irqs are not sequenced.
227      */
228     killed = 0;
229     ret = 0;
230 rescan:
231     inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) { // iterate over this slot of the slow timer queue
232         __inet_twsk_del_dead_node(tw);
233         spin_unlock(&twdr->death_lock);
234         __inet_twsk_kill(tw, twdr->hashinfo);
235 #ifdef CONFIG_NET_NS
236         NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
237 #endif
238         inet_twsk_put(tw);
239         killed++;       
240         spin_lock(&twdr->death_lock);
241         if (killed > INET_TWDR_TWKILL_QUOTA) {  // kill quota exceeded
242             ret = 1;
243             break;
244         }
245     
246         /* While we dropped twdr->death_lock, another cpu may have
247          * killed off the next TW bucket in the list, therefore
248          * do a fresh re-read of the hlist head node with the
249          * lock reacquired.  We still use the hlist traversal
250          * macro in order to get the prefetches.
251          */
252         goto rescan;
253     }
254
255     twdr->tw_count -= killed;
256 #ifndef CONFIG_NET_NS
257     NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
258 #endif
259     return ret;
260 }
        inet_twdr_twkill_work is the work handler behind twdr->twkill_work; it carries the purge that inet_twdr_do_twkill_work left unfinished through to the end:
291 void inet_twdr_twkill_work(struct work_struct *work)
292 {   
293     struct inet_timewait_death_row *twdr =
294         container_of(work, struct inet_timewait_death_row, twkill_work);
295     int i;
296     
297     BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
298             (sizeof(twdr->thread_slots) * 8));
299                          
300     while (twdr->thread_slots) {
301         spin_lock_bh(&twdr->death_lock);
302         for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
303             if (!(twdr->thread_slots & (1 << i))) // slot i needs no processing
304                 continue;
305
306             while (inet_twdr_do_twkill_work(twdr, i) != 0) {  // loop until the slot is completely purged
307                 if (need_resched()) {
308                     spin_unlock_bh(&twdr->death_lock);
309                     schedule();
310                     spin_lock_bh(&twdr->death_lock);
311                 }
312             }
313         
314             twdr->thread_slots &= ~(1 << i);  // this slot is now fully purged
315         }
316         spin_unlock_bh(&twdr->death_lock);
317     }
318 }
        Question: when the slow timer expires and the number of tw_socks to free exceeds the quota, why move the remaining work into a worker thread?

        Answer (my own understanding): Linux timers run in softirq context, and running there for too long would keep other work on the current CPU from executing, which is unfair. A worker thread runs at lower priority, so it does no harm if it takes a little longer.
