dpvs 连接会话老化处理逻辑

前提知识储备,dpdk 定时器:http://blog.csdn.net/linzhaolover/article/details/9410529

rte_get_timer_hz() 返回 DPDK 默认计时源(TSC 或 HPET)的频率,即 1 秒对应多少个 cycle;只有当计时源是 TSC 时,它才近似等于 CPU 主频。


cursors的本意是游标,我用滴答取代吧。


dpvs 代码分析:

1、核线程配置

相关文件:dpvs 配置文件dpvs.conf

    dpvs设计沿用了dpdk对每个cpu核线程的操作:每个cpu核线程作为独立的运行体,每个核线程接收指定网卡、指定队列的数据(当然也可以一个核线程接收所有的,在此不讨论)。核线程相关设置如下所示:

eg、

     worker cpu1 {
        type    slave
        cpu_id  1
        port    dpdk0 {
            rx_queue_ids     0
            tx_queue_ids     0
            ! isol_rx_cpu_ids  9
            ! isol_rxq_ring_sz 1048576
        }
    }

完整配置可以看官网文件里面。


2、dpvs定时器(timer)

相关文件:timer.c timer.h

相关结构

/*
 * Timer scheduler state: a set of LEVEL_DEPTH hashed timer wheels, each with
 * its own cursor, driven by a DPDK rte_timer.
 */
struct timer_scheduler {
    /* wheels and cursors */
    rte_spinlock_t      lock;                   /* held by rte_timer_tick_cb() while the wheels are processed */
    uint32_t            cursors[LEVEL_DEPTH];   /* current slot of each wheel level; advanced by rte_timer_tick_cb(), reset to 0 on reaching LEVEL_SIZE */
    struct list_head    *hashs[LEVEL_DEPTH];    /* per-level slot arrays; hashs[level][slot] is the list of timers parked in that slot */

    /* leverage dpdk rte_timer to drive us */
    struct rte_timer    rte_tim;
};
    这个数据结构里,cursors[level] 是第 level 级时间轮当前所处的滴答位置(游标):初始值为 0,每次推进加 1,取值范围是 0 到 LEVEL_SIZE-1,加到 LEVEL_SIZE 时归零并向上一级进位,如此不断循环。LEVEL_DEPTH 等于 2,根据代码的意思是分成 2 级时间轮(我是这么理解的)。hashs 和 cursors 按级别一一对应,hashs[level][cursors[level]] 就是该级时间轮当前滴答对应的链表。

/*
 * with 1000hz, if LEVEL_SIZE is 2<<18 and LEVEL_DEPTH is 2:
 * it's about 524s for first wheel and 8.7 years for all wheels.
 */
/*
 * Number of slots in each timer wheel level (2 << 18 == 524288).
 * __NOTE__: make sure (LEVEL_SIZE ** LEVEL_DEPTH) > TIMER_MAX_TICKS.
 */
#define LEVEL_SIZE (2 << 18)


每个会话里的struct dpvs_timer保存连接超时相关信息

/*
 * A single timer entry that gets linked into one slot of a scheduler wheel.
 */
struct dpvs_timer {
    struct list_head    list;       /* links the timer into a wheel slot list */

    dpvs_timer_cb_t     handler;    /* callback stored when the timer is scheduled */
    void                *priv;      /* opaque argument stored together with the handler */
    bool                is_period;  /* true for a periodic timer, false for a one-shot timer */

    /*
     * 'delay' for one-shot timer
     * 'interval' for periodic timer.
     */
    dpvs_tick_t         delay;      /* expressed in ticks (converted from a timeval when scheduled) */
};


在核线程启动时会调用timer_init_schedler()函数启动rte_timer_tick_cb周期定时器,之后每个滴答回调一次:
/*
 * Tick callback: advances the wheel cursors by one tick and processes the
 * timers parked in each newly reached slot.
 *
 * it takes exactly one tick between invocations,
 * except system (including time handles) takes more than
 * one tick to get rte_timer_manage() called.
 * we needn't calculate ticks elapsed by ourself.
 *
 * @tim: the rte_timer that fired (only checked by the assert here).
 * @arg: the struct timer_scheduler to drive.
 */
static void rte_timer_tick_cb(struct rte_timer *tim, void *arg)
{
    struct timer_scheduler *sched = arg;
    struct dpvs_timer *timer, *next;
    uint64_t left, hash, off;
    int level, lower;
    uint32_t *cursor;
    bool carry;

    assert(tim && sched);
    /* with CONFIG_TIMER_MEASURE the callback only calls deviation_measure()
     * and returns; the wheels below are never touched. */
#ifdef CONFIG_TIMER_MEASURE
    deviation_measure();
    return;
#endif

    rte_spinlock_lock(&sched->lock);

    /* drive timer to move and handle expired timers. */
    for (level = 0; level < LEVEL_DEPTH; level++) {
        cursor = &sched->cursors[level];

        (*cursor)++;/* advance this level's cursor by one slot */

        /* once the cursor reaches LEVEL_SIZE it wraps back to 0 and the
         * next level is advanced as well (carry). */
        if (*cursor < LEVEL_SIZE) {
            carry = false;
        } else {
            /* reset the cursor and handle next level later. */
            *cursor = 0;
            carry = true;
        }
        /* walk the list in this level's current slot; the _safe iterator is
         * used because entries are unlinked while iterating. */
        list_for_each_entry_safe(timer, next,
                                 &sched->hashs[level][*cursor], list) {

            /* is all lower levels ticks empty ? */
            left = timer->delay % get_level_ticks(level);

            if (!left) {
                /* nothing left for lower levels: hand over to timer_expire() */
                timer_expire(sched, timer);
            } else {
                /* drop to lower level wheel, note it may not drop to
                 * "next" lower level wheel. */
                list_del(&timer->list);

                lower = level;
                while (--lower >= 0) {
                    off = timer->delay / get_level_ticks(lower);
                    if (!off)
                        continue; /* next lower level */

                    /* NOTE(review): the slot is derived from *cursor, i.e. the
                     * cursor of the level being processed, not from
                     * sched->cursors[lower] of the destination wheel --
                     * confirm this is intended. */
                    hash = (*cursor + off) % LEVEL_SIZE;
                    list_add_tail(&timer->list, &sched->hashs[lower][hash]);
                    break;
                }

                /* the timer must have been re-linked into some lower wheel */
                assert(lower >= 0);
            }
        }

        /* higher levels only move when this level wrapped around */
        if (!carry)
            break;
    }

    rte_spinlock_unlock(&sched->lock);
    return;
}

rte_timer_tick_cb函数周期性地扫描当前滴答对应的超时hash链表,处理挂在上面的会话定时器。

3、会话添加

相关文件:ip_vs_conn.c

    struct dp_vs_conn为会话结构体,当有新会话时,执行dp_vs_conn_new()函数添加会话,dp_vs_conn_new()函数调用dpvs_timer_sched()函数添加会话定时器,每个新的会话的new->timeout.tv_sec初始化的秒数是DPVS_CONN_INIT_TIMEOUT_DEF(3秒),后面会根据指定协议指定状态修改超时时间。执行添加timer的函数是__dpvs_timer_sched(),timeval_to_ticks()函数会将初始化的秒数转换成程序的滴答数。

/*
 * Arm 'timer' on the scheduler's timer wheels.
 *
 * call me with lock
 *
 * @sched:   scheduler whose wheels receive the timer.
 * @timer:   timer to arm; its handler/priv/is_period/delay fields are
 *           overwritten before the delay is validated.
 * @delay:   timeout, converted to ticks with timeval_to_ticks().
 * @handler: callback stored in the timer.
 * @arg:     opaque pointer stored as timer->priv.
 * @period:  true for a periodic timer, false for a one-shot timer.
 *
 * Returns EDPVS_OK once the timer is linked into a wheel slot, or
 * EDPVS_INVAL when the delay is zero, is >= TIMER_MAX_TICKS, or no wheel
 * level adopted the timer.
 */
static int __dpvs_timer_sched(struct timer_scheduler *sched,
                              struct dpvs_timer *timer, struct timeval *delay,
                              dpvs_timer_cb_t handler, void *arg, bool period)
{
    uint32_t off, hash;
    int level;

    assert(timer && delay && handler);

    if (timer_pending(timer))
        RTE_LOG(WARNING, DTIMER, "schedule a pending timer ?\n");

    timer->handler = handler;
    timer->priv = arg;
    timer->is_period = period;
    /*
     * Convert the timeval into ticks. A tick count of LEVEL_SIZE or more
     * lands on a higher wheel level below; smaller counts land on level 0.
     */
    timer->delay = timeval_to_ticks(delay);

    if (unlikely(timer->delay >= TIMER_MAX_TICKS)) {
        RTE_LOG(WARNING, DTIMER, "exceed timer range\n");
        return EDPVS_INVAL;
    }

    /*
     * to schedule a 0 delay timer is not make sense.
     * and it will never stopped (periodic) or never triggered (one-shot).
     */
    if (unlikely(!timer->delay)) {
        RTE_LOG(WARNING, DTIMER, "schedule 0 timeout timer.\n");
        return EDPVS_INVAL;
    }

    /* add to corresponding wheel, from higher level to lower. */
    for (level = LEVEL_DEPTH - 1; level >= 0; level--) {
        /* number of whole slots of this level covered by the delay;
         * zero means the delay is too short for this level. */
        off = timer->delay / get_level_ticks(level);
        if (off > 0) {
            /* slot = this level's current cursor advanced by 'off',
             * wrapped to the wheel size. */
            hash = (sched->cursors[level] + off) % LEVEL_SIZE;

            /* Trace only: a successful schedule is not an error, and the
             * specifiers must match the (cast) argument types. */
            RTE_LOG(DEBUG, DTIMER,
                    "hash[%u]  level[%d] timer[%p] hash[%p] delay[%llu]\n",
                    (unsigned int)hash, level, (void *)timer,
                    (void *)&sched->hashs[level][hash],
                    (unsigned long long)timer->delay);
            list_add_tail(&timer->list, &sched->hashs[level][hash]);
            return EDPVS_OK;
        }
    }

    /* not adopted by any wheel (never happened) */
    return EDPVS_INVAL;
}
举例说明:一般level都为0,timer插入的位置由level和 (sched->cursors[level] + off) % LEVEL_SIZE 共同决定;定时器回调每次只检查当前滴答对应的链表(sched->hashs[level][*cursor])。
dpvs 连接会话老化处理逻辑_第1张图片

根据上图举例,此时会话的timer就加到了蓝色箭头指向的hash链表里。定时器(rte_timer_tick_cb)每执行一次,*cursor的值加1;由于第1~5个滴答对应的hash链表为空,rte_timer_tick_cb不做任何操作;当执行到第6个滴答时,发现sched->hashs[level][*cursor]指向的hash链表不为空,就会把里面的会话取出来执行超时操作。

4、会话定时器更新

会话管理对应指定协议,以tcp会话为例:

/*
 * Timeout, in seconds, applied to a connection's timer for each TCP state;
 * indexed by the DPVS_TCP_S_* state constants.
 */
static int tcp_timeouts[DPVS_TCP_S_LAST + 1] = {
    [DPVS_TCP_S_NONE]        = 2,
    [DPVS_TCP_S_ESTABLISHED] = 90,
    [DPVS_TCP_S_SYN_SENT]    = 3,
    [DPVS_TCP_S_SYN_RECV]    = 30,
    [DPVS_TCP_S_FIN_WAIT]    = 7,
    [DPVS_TCP_S_TIME_WAIT]   = 7,
    [DPVS_TCP_S_CLOSE]       = 3,
    [DPVS_TCP_S_CLOSE_WAIT]  = 7,
    [DPVS_TCP_S_LAST_ACK]    = 7,
    [DPVS_TCP_S_LISTEN]      = 120,
    [DPVS_TCP_S_SYNACK]      = 30,
    [DPVS_TCP_S_LAST]        = 2,
};

    上面保存的是tcp在各个状态时,timer的超时时间,当报文驱动tcp状态改变时,timer的超时时间就会被更新,调用dpvs_timer_update()函数,跟踪dpvs的流程可以看dp_vs_in()里的处理。

5、dpvs 会话超时总结

a、我们通常的超时做法:

    假如现在时间now是555秒,设置会话超时时间timeout=5秒,那么到now+timeout=560秒时会话超时。怎么检查超时呢?通常是将会话用链表管理起来,定时扫描整个链表(简单但效率低)。

b、dpvs的做法:

    dpvs使用了dpdk的定时器机制,定时执行扫描,dpvs会话老化的精髓就在会话扫描设计上。它设计了一个大小为LEVEL_SIZE的滴答计数(*cursor),每个滴答数对应一个hash链表,滴答计数在0到LEVEL_SIZE-1之间不断循环,每次定时器触发滴答数加1,每次只扫描此滴答数对应的hash链表(sched->hashs[level][*cursor])。管理会话超时的做法是:将秒数转换成滴答数,并将这个会话的timer添加到由超时滴答数和此时*cursor共同决定的hash链表上。这样,每次dpdk定时器运行时,只有当前滴答数对应的非空hash链表里的会话会被执行超时操作,而很多滴答对应的hash链表上根本就没有添加会话。此设计只有在同一个滴答数的hash链表里挂了很多会话时,删除才会耗费一点时间,但这也不可避免,其他基本没有多余的操作。dpdk定时器的执行都是在每次收报文操作前执行,这也保证会话超时操作尽可能地减小对报文收取的影响。



你可能感兴趣的:(dpvs 连接会话老化处理逻辑)