前提知识储备,dpdk 定时器:http://blog.csdn.net/linzhaolover/article/details/9410529
rte_get_timer_hz() 获得定时器时钟源的频率,即1s多少个cycle(默认时钟源为TSC时,可近似理解为CPU主频)
cursors的本意是游标,我用滴答取代吧。
dpvs 代码分析:
1、核线程配置
相关文件:dpvs 配置文件dpvs.conf
dpvs的设计沿用了dpdk对每个cpu核线程的操作方式:每个cpu核线程作为独立的运行体,接收指定网卡、指定队列的数据(当然也可以由一个核线程接收所有数据,在此不讨论)。核线程相关设置如下:
eg、
! Example worker: bound to CPU 1, serving RX queue 0 and TX queue 0 of port dpdk0.
worker cpu1 {
type slave
cpu_id 1
port dpdk0 {
rx_queue_ids 0
tx_queue_ids 0
! The two options below are commented out in this example.
! isol_rx_cpu_ids 9
! isol_rxq_ring_sz 1048576
}
}
2、dpvs定时器(timer)
相关文件:timer.c timer.h
相关结构
/*
 * State of one timer scheduler: a set of timing wheels, one per level,
 * together with the DPDK timer that advances them.
 */
struct timer_scheduler {
    /* taken around every update of the cursors and wheels below */
    rte_spinlock_t      lock;
    /* current slot of each wheel, indexed by level */
    uint32_t            cursors[LEVEL_DEPTH];
    /* per-level array of list heads, addressed as hashs[level][slot] */
    struct list_head    *hashs[LEVEL_DEPTH];

    /* leverage dpdk rte_timer to drive us */
    struct rte_timer    rte_tim;
};
这个数据结构里cursors[LEVEL_DEPTH]保存每一级时间轮当前的滴答数(游标):初始值为0,定时器每运行一次加1,加到LEVEL_SIZE时归零并向上一级进位,也就是在0到LEVEL_SIZE-1之间不断循环。LEVEL_DEPTH等于2,根据代码的意思是分成2级时间轮(我理解为2个轮回),hashs和cursors按level一一对应。
/*
 * with 1000hz, if LEVEL_SIZE is 2<<18 and LEVEL_DEPTH is 2:
 * it's about 524s for first wheel and 8.7 years for all wheels.
 */
/* __NOTE__: make sure (LEVEL_SIZE ** LEVEL_DEPTH) > TIMER_MAX_TICKS. */
/* Cursor range of every wheel: cursors run from 0 to LEVEL_SIZE - 1 (2<<18 == 524288). */
#define LEVEL_SIZE (2<<18)
每个会话里的struct dpvs_timer保存连接超时相关信息
/*
 * One timer entry; it is linked into a wheel slot through 'list'.
 */
struct dpvs_timer {
    struct list_head    list;       /* linkage inside a wheel slot */
    dpvs_timer_cb_t     handler;    /* callback recorded when the timer is scheduled */
    void                *priv;      /* argument recorded together with the callback */
    bool                is_period;  /* true for a periodic timer */

    /*
     * 'delay' for one-shot timer,
     * 'interval' for periodic timer.
     */
    dpvs_tick_t         delay;
};
/*
 * Tick callback: advance the wheel cursors by one tick and process the
 * timers hashed to the slot each advanced cursor now points at.
 *
 * it takes exactly one tick between invocations,
 * except system (including timer handlers) takes more than
 * one tick to get rte_timer_manage() called.
 * we needn't calculate ticks elapsed by ourself.
 *
 * @tim: rte_timer handle passed to the callback (only checked for non-NULL).
 * @arg: the struct timer_scheduler to drive.
 *
 * When CONFIG_TIMER_MEASURE is defined the callback only calls
 * deviation_measure() and returns without touching the wheels.
 */
static void rte_timer_tick_cb(struct rte_timer *tim, void *arg)
{
    struct timer_scheduler *sched = arg;
    struct dpvs_timer *timer, *next;
    uint64_t left, hash, off;
    int level, lower;
    uint32_t *cursor;
    bool carry;

    assert(tim && sched);

#ifdef CONFIG_TIMER_MEASURE
    deviation_measure();
    return;
#endif

    rte_spinlock_lock(&sched->lock);

    /* drive timer to move and handle expired timers. */
    for (level = 0; level < LEVEL_DEPTH; level++) {
        cursor = &sched->cursors[level];
        (*cursor)++; /* one more tick on this wheel */

        /* once the cursor reaches LEVEL_SIZE it restarts from 0 */
        if (*cursor < LEVEL_SIZE) {
            carry = false;
        } else {
            /* reset the cursor and handle next level later. */
            *cursor = 0;
            carry = true;
        }

        /* walk the timers hashed to the current slot of this wheel */
        list_for_each_entry_safe(timer, next,
                                 &sched->hashs[level][*cursor], list) {
            /* is all lower levels ticks empty ? */
            left = timer->delay % get_level_ticks(level);
            if (!left) {
                timer_expire(sched, timer);
            } else {
                /* drop to lower level wheel, note it may not drop to
                 * "next" lower level wheel. */
                list_del(&timer->list);

                lower = level;
                while (--lower >= 0) {
                    /*
                     * Only 'left' ticks are still outstanding, so the
                     * offset is taken from 'left'; this is also what lets
                     * a level be skipped when the remainder is too small
                     * for it.
                     */
                    off = left / get_level_ticks(lower);
                    if (!off)
                        continue; /* next lower level */

                    /*
                     * The slot is relative to the cursor of the wheel the
                     * timer moves to, the same rule __dpvs_timer_sched()
                     * applies when it first inserts a timer.
                     */
                    hash = (sched->cursors[lower] + off) % LEVEL_SIZE;
                    list_add_tail(&timer->list, &sched->hashs[lower][hash]);
                    break;
                }

                /* a non-zero remainder must have been adopted by some wheel */
                assert(lower >= 0);
            }
        }

        /* higher wheels only move when this one wrapped around */
        if (!carry)
            break;
    }

    rte_spinlock_unlock(&sched->lock);
}
rte_timer_tick_cb函数周期性的扫描会话超时hash表链表里的会话。
3、会话添加
相关文件:ip_vs_conn.c
struct dp_vs_conn为会话结构体,当有新会话时,执行dp_vs_conn_new()函数添加会话,dp_vs_conn_new()函数调用dpvs_timer_sched()函数添加会话定时器,每个新的会话的new->timeout.tv_sec初始化的秒数是DPVS_CONN_INIT_TIMEOUT_DEF(3秒),后面会根据指定协议指定状态修改超时时间。执行添加timer的函数是__dpvs_timer_sched(),timeval_to_ticks()函数会将初始化的秒数转换成程序的滴答数。
/*
 * Fill in @timer and hash it into the wheel that matches its delay.
 *
 * call me with lock
 *
 * @sched:   scheduler whose wheels receive the timer.
 * @timer:   timer to initialise and link into a wheel slot.
 * @delay:   timeout; converted to ticks with timeval_to_ticks().
 * @handler: stored as timer->handler.
 * @arg:     stored as timer->priv.
 * @period:  stored as timer->is_period.
 *
 * Returns EDPVS_OK once the timer is linked into a wheel, or EDPVS_INVAL
 * when the delay is zero ticks, is >= TIMER_MAX_TICKS, or no wheel takes it.
 */
static int __dpvs_timer_sched(struct timer_scheduler *sched,
                              struct dpvs_timer *timer, struct timeval *delay,
                              dpvs_timer_cb_t handler, void *arg, bool period)
{
    uint32_t off, hash;
    int level;

    assert(timer && delay && handler);

    /* only warns: a pending timer is still (re)initialised and added below */
    if (timer_pending(timer))
        RTE_LOG(WARNING, DTIMER, "schedule a pending timer ?\n");

    timer->handler = handler;
    timer->priv = arg;
    timer->is_period = period;
    /* the wheels work in ticks, so convert the timeval first */
    timer->delay = timeval_to_ticks(delay);

    if (unlikely(timer->delay >= TIMER_MAX_TICKS)) {
        RTE_LOG(WARNING, DTIMER, "exceed timer range\n");
        return EDPVS_INVAL;
    }

    /*
     * to schedule a 0 delay timer does not make sense.
     * and it will never stop (periodic) or never trigger (one-shot).
     */
    if (unlikely(!timer->delay)) {
        RTE_LOG(WARNING, DTIMER, "schedule 0 timeout timer.\n");
        return EDPVS_INVAL;
    }

    /* add to corresponding wheel, from higher level to lower. */
    for (level = LEVEL_DEPTH - 1; level >= 0; level--) {
        /*
         * off == 0 means the delay is too short for this level, so the
         * next lower level is tried.
         * NOTE(review): presumably get_level_ticks(level) is the number of
         * ticks one slot of that wheel covers -- confirm in timer.c.
         */
        off = timer->delay / get_level_ticks(level);
        if (off > 0) {
            /* slot = this wheel's current cursor advanced by off, wrapped */
            hash = (sched->cursors[level] + off) % LEVEL_SIZE;
            /* trace only: logged at DEBUG because this runs for every scheduled timer */
            RTE_LOG(DEBUG, DTIMER,
                    "hash[%u] level[%d] timer[%p] hash[%p] delay[%llu]\n",
                    (unsigned int)hash, level, (void *)timer,
                    (void *)&sched->hashs[level][hash],
                    (unsigned long long)timer->delay);
            list_add_tail(&timer->list, &sched->hashs[level][hash]);
            return EDPVS_OK;
        }
    }

    /* not adopted by any wheel (never happened) */
    return EDPVS_INVAL;
}
举例说明,一般level都为0,hash表的位置根据level和*cursor值确定(sched->hashs[level][*cursor])
根据上图举例,此时会话的timer就加到了蓝色箭头指向的hash链表里。定时器(rte_timer_tick_cb)每执行一次,*cursor的值加1,由于1-5的*cursor指定的hash表为空,rte_timer_tick_cb没有操作,当执行到第6时,发现sched->hashs[level][*cursor]指向的hash 链表不为空,就会把里面的会话取出来执行超时操作。
4、会话定时器更新
会话管理对应指定协议,以tcp会话为例:
/*
 * Timeout table for TCP connections, indexed by DPVS_TCP_S_* state.
 * All values are in seconds.
 */
static int tcp_timeouts[DPVS_TCP_S_LAST + 1] = {
    [DPVS_TCP_S_NONE]        = 2,
    [DPVS_TCP_S_ESTABLISHED] = 90,
    [DPVS_TCP_S_SYN_SENT]    = 3,
    [DPVS_TCP_S_SYN_RECV]    = 30,
    [DPVS_TCP_S_FIN_WAIT]    = 7,
    [DPVS_TCP_S_TIME_WAIT]   = 7,
    [DPVS_TCP_S_CLOSE]       = 3,
    [DPVS_TCP_S_CLOSE_WAIT]  = 7,
    [DPVS_TCP_S_LAST_ACK]    = 7,
    [DPVS_TCP_S_LISTEN]      = 120,
    [DPVS_TCP_S_SYNACK]      = 30,
    [DPVS_TCP_S_LAST]        = 2
};
5、dpvs 会话超时总结
a、我们通常的超时做法:
假如现在时间now是555秒,设置的会话超时时间timeout是5秒,那么到now+timeout=560秒时会话超时。怎么检查超时呢?通常是将会话用链表管理起来,定时扫描整个链表(简单,但效率低)。
b、dpvs的做法:
dpvs使用了dpdk的定时器机制,定时执行扫描,dpvs会话老化的精髓就在会话扫描设计上。它设计了一个大小为LEVEL_SIZE的滴答计数(*cursor),每个滴答数对应一个hash链表,滴答计数在0到LEVEL_SIZE-1之间不断循环,每次定时器超时滴答数加1,每次只扫描此滴答数对应的hash链表(sched->hashs[level][*cursor])。管理会话超时的做法是将秒数转换成滴答数,并将这个会话的timer添加到由超时滴答数和此时*cursor共同决定的hash链表上。这样,每次dpdk定时器运行时,只有当前滴答数对应的非空hash链表里的会话才会被执行超时操作,很多滴答对应的hash链表上根本就没有添加会话。此设计只有在同一滴答数对应的hash链表里有很多个会话时,删除才会耗费一些时间,但这也不可避免,其他情况下基本没有多余的操作。dpdk定时器都是在每次收报文操作前执行,这也保证会话超时操作尽可能小地影响报文收取。