I have been looking into high-performance techniques lately, which reminded me of an anti-DDoS project at my previous job. That project was built on DPDK, and one of the pieces it relied on was the lock-free ring, so I went back through the source code and wrote this post.
Lock-free techniques have been around for a while; related topics such as the ABA problem, CAS, and wait-free algorithms are worth exploring on your own.
This post walks through how DPDK implements its lock-free ring queue. A typical scenario: in a single-process, multi-threaded program, a main thread spreads incoming requests evenly across several worker threads, and you want that hand-off to be efficient, with as little synchronization, mutual exclusion, and blocking as possible. This class of problem already has mature, efficient solutions (the muduo library's code, for instance); the DPDK ring works on essentially the same principle as the Linux kernel's kfifo.
Let's start with the description from the source:
73 * The Ring Manager is a fixed-size queue, implemented as a table of
74 * pointers. Head and tail pointers are modified atomically, allowing
75 * concurrent access to it. It has the following features:
76 *
77 * - FIFO (First In First Out)
78 * - Maximum size is fixed; the pointers are stored in a table.
79 * - Lockless implementation.
80 * - Multi- or single-consumer dequeue.
81 * - Multi- or single-producer enqueue.
82 * - Bulk dequeue.
83 * - Bulk enqueue.
The core data structure of the ring:
152 struct rte_ring {
159 int flags; /**< Flags supplied at creation. */
160 const struct rte_memzone *memzone;
161 /**< Memzone, if any, containing the rte_ring */
162
163 /** Ring producer status. */
164 struct prod {
165 uint32_t watermark; /**< Maximum items before EDQUOT. */
166 uint32_t sp_enqueue; /**< True, if single producer. */
167 uint32_t size; /**< Size of ring. */
168 uint32_t mask; /**< Mask (size-1) of ring. */
169 volatile uint32_t head; /**< Producer head. */
170 volatile uint32_t tail; /**< Producer tail. */
171 } prod __rte_cache_aligned;
172
173 /** Ring consumer status. */
174 struct cons {
175 uint32_t sc_dequeue; /**< True, if single consumer. */
176 uint32_t size; /**< Size of the ring. */
177 uint32_t mask; /**< Mask (size-1) of ring. */
178 volatile uint32_t head; /**< Consumer head. */
179 volatile uint32_t tail; /**< Consumer tail. */
180 #ifdef RTE_RING_SPLIT_PROD_CONS
181 } cons __rte_cache_aligned;
182 #else
183 } cons;
184 #endif
190 void * ring[0] __rte_cache_aligned; /**< Memory space of ring starts here.
191 * not volatile so need to be careful
192 * about compiler re-ordering */
193 };
prod and cons hold the producer-side and consumer-side state respectively. Both are cache-line aligned, which keeps CPU accesses fast and, more importantly, puts producer threads and consumer threads on different cache lines so they do not contend for the same one. flags records whether the ring was created as single-producer/single-consumer or not; the multi-producer/multi-consumer paths rely on CAS for correctness, but for now assume the single-producer/single-consumer case. memzone records which memory zone the ring was allocated from, so it can be returned when the ring is freed. The ring array holds the slots for the queued pointers; declaring it as void * ring[0] (a zero-length array) costs no space in the struct itself, keeps the header and the slot storage in one contiguous allocation, and makes freeing simple and fast. The mask field in both prod and cons is the ring size (a power of two) minus one and turns an index into an array offset: index & mask can never go out of bounds and is much cheaper than a modulo, and when the uint32_t index finally overflows past its maximum it simply wraps back around to 0.
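A quick standalone illustration of the index & mask trick, including what happens when the 32-bit index overflows (my own example, not DPDK code):

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    const uint32_t size = 8;           /* must be a power of two */
    const uint32_t mask = size - 1;    /* 0b111 */
    uint32_t idx = UINT32_MAX - 1;     /* just below the 32-bit maximum */

    for (int i = 0; i < 4; i++, idx++)
        printf("idx=%" PRIu32 " -> slot %" PRIu32 "\n", idx, idx & mask);
    /* prints slots 6, 7, 0, 1: the unsigned overflow past UINT32_MAX
     * wraps back to 0, and "idx & mask" always yields a valid slot,
     * so no modulo and no explicit bounds check are needed */
    return 0;
}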
Note that prod and cons each have their own head and tail; their roles are explained below.
Creating and initializing a ring:
160 /* create the ring */
161 struct rte_ring *
162 rte_ring_create(const char *name, unsigned count, int socket_id,
163 unsigned flags)
164 {
166 struct rte_ring *r;
167 struct rte_tailq_entry *te;
168 const struct rte_memzone *mz;
169 ssize_t ring_size;
170 int mz_flags = 0;
171 struct rte_ring_list* ring_list = NULL;
172 int ret;
173
174 ring_list = RTE_TAILQ_CAST(rte_ring_tailq.head, rte_ring_list);
175
176 ring_size = rte_ring_get_memsize(count);
177 if (ring_size < 0) {
178 rte_errno = ring_size;
179 return NULL;
180 }
181 ...more code
189 te = rte_zmalloc("RING_TAILQ_ENTRY", sizeof(*te), 0);
190 if (te == NULL) {
191 RTE_LOG(ERR, RING, "Cannot reserve memory for tailq\n");
192 rte_errno = ENOMEM;
193 return NULL;
194 }
195
196 rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
197
198 /* reserve a memory zone for this ring. If we can't get rte_config or
199 * we are secondary process, the memzone_reserve function will set
200 * rte_errno for us appropriately - hence no check in this this function */
201 mz = rte_memzone_reserve(mz_name, ring_size, socket_id, mz_flags);
202 if (mz != NULL) {
203 r = mz->addr;
204 /* no need to check return value here, we already checked the
205 * arguments above */
206 rte_ring_init(r, name, count, flags);
207
208 te->data = (void *) r;
209 r->memzone = mz;
210
211 TAILQ_INSERT_TAIL(ring_list, te, next);
212 } else {
213 r = NULL;
214 RTE_LOG(ERR, RING, "Cannot reserve memory\n");
215 rte_free(te);
216 }
217 rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
218
219 return r;
220 }
In rte_ring_create, rte_ring_get_memsize first checks that count is a power of two and does not exceed the maximum size, #define RTE_RING_SZ_MASK (unsigned)(0x0fffffff), then rounds the required size up to a cache-line boundary with these macros:
153 #define RTE_ALIGN_CEIL(val, align) \
154 RTE_ALIGN_FLOOR(((val) + ((typeof(val)) (align) - 1)), align)
135 #define RTE_ALIGN_FLOOR(val, align) \
136 (typeof(val))((val) & (~((typeof(val))((align) - 1))))
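Putting the power-of-two check and the cache-line rounding together, the size calculation roughly amounts to the following (my own simplified sketch, not the actual rte_ring_get_memsize source; header_size stands for sizeof(struct rte_ring)):

#include <errno.h>
#include <stddef.h>
#include <sys/types.h>

#define CACHE_LINE_SIZE 64u
#define RING_SZ_MASK    ((unsigned)0x0fffffff)

static ssize_t ring_memsize(size_t header_size, unsigned count)
{
    /* count must be a non-zero power of two within the allowed range */
    if (count == 0 || (count & (count - 1)) != 0 || count > RING_SZ_MASK)
        return -EINVAL;

    ssize_t sz = header_size + count * sizeof(void *);
    /* round up to a cache-line multiple, i.e. RTE_ALIGN_CEIL */
    sz = (sz + CACHE_LINE_SIZE - 1) & ~((ssize_t)CACHE_LINE_SIZE - 1);
    return sz;
}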
That gives the total amount of memory needed. rte_ring_create then allocates a struct rte_tailq_entry to record the ring's address (used later when freeing it), takes the write lock, sets up the ring's members, and releases the lock; this part of the logic is straightforward. It does lean on more involved interfaces such as rte_memzone_reserve, whose job is to reserve a block of memory on (or near) the requested socket-id NUMA node (via rte_memzone_reserve_thread_safe) along with some extra bookkeeping; here all we use is mz->addr, which becomes the ring. I may analyze that code in a later post if time and space allow. After the memzone is reserved, rte_ring_init is called.
In rte_ring_init, the RTE_BUILD_BUG_ON macro is a compile-time check that the structure layout really is cache-line aligned (its mechanics were covered in an earlier post); the rest just initializes the ring's members:
121 int
122 rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
123 unsigned flags)
124 {
125 int ret;
126
127 /* compilation-time checks */
128 RTE_BUILD_BUG_ON((sizeof(struct rte_ring) &
129 RTE_CACHE_LINE_MASK) != 0);
130 #ifdef RTE_RING_SPLIT_PROD_CONS
131 RTE_BUILD_BUG_ON((offsetof(struct rte_ring, cons) &
132 RTE_CACHE_LINE_MASK) != 0);
133 #endif
134 RTE_BUILD_BUG_ON((offsetof(struct rte_ring, prod) &
135 RTE_CACHE_LINE_MASK) != 0);
142
143 /* init the ring structure */
144 memset(r, 0, sizeof(*r));
148 r->flags = flags;
149 r->prod.watermark = count;
150 r->prod.sp_enqueue = !!(flags & RING_F_SP_ENQ);
151 r->cons.sc_dequeue = !!(flags & RING_F_SC_DEQ);
152 r->prod.size = r->cons.size = count;
153 r->prod.mask = r->cons.mask = count-1;
154 r->prod.head = r->cons.head = 0;
155 r->prod.tail = r->cons.tail = 0;
156
157 return 0;
158 }
#define RTE_BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
#define RTE_CACHE_LINE_MASK (RTE_CACHE_LINE_SIZE-1)
#define offsetof(t, m) ((size_t) &((t *)0)->m)
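A quick aside on how that compile-time check trips: when the condition is true, the array type becomes char[-1], which the compiler rejects, so a layout mistake fails the build instead of surfacing at run time. A tiny standalone illustration (my own example, not DPDK code):

#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))

struct example {
    char a;
    long b;            /* padding makes sizeof(struct example) > 3 */
};

void layout_checks(void)
{
    /* condition is false -> sizeof(char[1]), compiles to nothing */
    BUILD_BUG_ON(sizeof(struct example) == 3);

    /* a true condition would expand to sizeof(char[-1]) and stop the
     * build with a "negative array size" error, for example:
     * BUILD_BUG_ON(sizeof(struct example) != 3);
     */
}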
Freeing a ring:
223 void
224 rte_ring_free(struct rte_ring *r)
225 {
226 struct rte_ring_list *ring_list = NULL;
227 struct rte_tailq_entry *te;
228
229 if (r == NULL)
230 return;
231
232 /*
233 * Ring was not created with rte_ring_create,
234 * therefore, there is no memzone to free.
235 */
236 if (r->memzone == NULL) {
238 return;
239 }
240
241 if (rte_memzone_free(r->memzone) != 0) {
243 return;
244 }
245
246 ring_list = RTE_TAILQ_CAST(rte_ring_tailq.head, rte_ring_list);
247 rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
248
249 /* find out tailq entry */
250 TAILQ_FOREACH(te, ring_list, next) {
251 if (te->data == (void *) r)
252 break;
253 }
254
255 if (te == NULL) {
256 rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
257 return;
258 }
259
260 TAILQ_REMOVE(ring_list, te, next);
262 rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
264 rte_free(te);
265 }
The release path needs little analysis: the ring first frees its memzone with rte_memzone_free(r->memzone), then unlinks its entry from the global tailq list and frees that entry.
First, note that enqueue (and dequeue) operations come in two behaviors, rte_ring_queue_behavior:
RTE_RING_QUEUE_FIXED: enqueue/dequeue a fixed number of items, or fail
RTE_RING_QUEUE_VARIABLE: enqueue/dequeue as many items as possible
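Before diving into the internals, here is a rough sketch of how the ring is used from application code. This is my own illustration, assuming rte_eal_init() has already run; what the enqueued pointers actually point to is up to the application:

#include <stdlib.h>
#include <rte_ring.h>
#include <rte_errno.h>
#include <rte_lcore.h>   /* rte_socket_id() */
#include <rte_debug.h>   /* rte_exit() */

static void ring_usage_sketch(void)
{
    /* a single-producer / single-consumer ring with 1024 slots,
     * allocated on the caller's NUMA node */
    struct rte_ring *r = rte_ring_create("msg_ring", 1024, rte_socket_id(),
                                         RING_F_SP_ENQ | RING_F_SC_DEQ);
    if (r == NULL)
        rte_exit(EXIT_FAILURE, "cannot create ring: %s\n",
                 rte_strerror(rte_errno));

    void *msg = NULL;    /* the ring stores void * pointers, not copies */

    if (rte_ring_enqueue(r, msg) != 0) {
        /* usually -ENOBUFS (no room); releases that still have the
         * watermark may also return -EDQUOT (enqueued, but over the mark) */
    }

    void *obj;
    if (rte_ring_dequeue(r, &obj) == 0) {
        /* got one pointer back, in FIFO order */
    }

    rte_ring_free(r);
}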
Let's first look in detail at the single-producer and single-consumer paths, selected by RING_F_SP_ENQ and RING_F_SC_DEQ, and see how they stay lock-free and fast. The producer side:
539 static inline int __attribute__((always_inline))
540 __rte_ring_sp_do_enqueue(struct rte_ring *r, void * const *obj_table,
541 unsigned n, enum rte_ring_queue_behavior behavior)
542 {
543 uint32_t prod_head, cons_tail;
544 uint32_t prod_next, free_entries;
545 unsigned i;
546 uint32_t mask = r->prod.mask;
547 int ret;
548
549 prod_head = r->prod.head;
550 cons_tail = r->cons.tail;
551 /* The subtraction is done between two unsigned 32bits value
552 * (the result is always modulo 32 bits even if we have
553 * prod_head > cons_tail). So 'free_entries' is always between 0
554 * and size(ring)-1. */
555 free_entries = mask + cons_tail - prod_head;
557 /* check that we have enough room in ring */
558 if (unlikely(n > free_entries)) {
559 if (behavior == RTE_RING_QUEUE_FIXED) {
560 __RING_STAT_ADD(r, enq_fail, n);
561 return -ENOBUFS;
562 }
563 else {
564 /* No free entry available */
565 if (unlikely(free_entries == 0)) {
566 __RING_STAT_ADD(r, enq_fail, n);
567 return 0;
568 }
569
570 n = free_entries;
571 }
572 }
573
574 prod_next = prod_head + n;
575 r->prod.head = prod_next;
577 /* write entries in ring */
578 ENQUEUE_PTRS();
579 rte_smp_wmb();
580
581 /* if we exceed the watermark */
582 if (unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
583 ret = (behavior == RTE_RING_QUEUE_FIXED) ? -EDQUOT :
584 (int)(n | RTE_RING_QUOT_EXCEED);
585 __RING_STAT_ADD(r, enq_quota, n);
586 }
587 else {
588 ret = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : n;
589 __RING_STAT_ADD(r, enq_success, n);
590 }
591
592 r->prod.tail = prod_next;
593 return ret;
594 }
Lines 555-572 check whether the number of elements to enqueue exceeds the available space, based on free_entries; this is pure validation. The value free_entries = mask + cons_tail - prod_head always lands in [0, size-1].
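A quick numeric check of that formula across the 2^32 wrap-around (plain C, nothing DPDK-specific, with made-up index values):

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    const uint32_t mask = 7;              /* ring size 8 */

    /* the producer head has wrapped past UINT32_MAX,
     * the consumer tail has not yet */
    uint32_t prod_head = 3;               /* 0xFFFFFFFE + 5, wrapped */
    uint32_t cons_tail = UINT32_MAX - 1;  /* 0xFFFFFFFE */

    uint32_t used = prod_head - cons_tail;                 /* 5 */
    uint32_t free_entries = mask + cons_tail - prod_head;  /* 2 */

    printf("used=%" PRIu32 " free=%" PRIu32 "\n", used, free_entries);
    /* prints "used=5 free=2": unsigned modular arithmetic keeps both
     * values inside [0, size-1] even though prod_head is numerically
     * smaller than cons_tail */
    return 0;
}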
Initially r->prod.head and r->cons.tail are both 0. The producer computes prod_next = prod_head + n, then advances the head, r->prod.head = prod_next (A1), and then writes the elements into the ring with ENQUEUE_PTRS() (A2):
356 #define ENQUEUE_PTRS() do { \
357 const uint32_t size = r->prod.size; \
358 uint32_t idx = prod_head & mask; \
359 if (likely(idx + n < size)) { \
360 for (i = 0; i < (n & ((~(unsigned)0x3))); i+=4, idx+=4) { \
361 r->ring[idx] = obj_table[i]; \
362 r->ring[idx+1] = obj_table[i+1]; \
363 r->ring[idx+2] = obj_table[i+2]; \
364 r->ring[idx+3] = obj_table[i+3]; \
365 } \
366 switch (n & 0x3) { \
367 case 3: r->ring[idx++] = obj_table[i++]; \
368 case 2: r->ring[idx++] = obj_table[i++]; \
369 case 1: r->ring[idx++] = obj_table[i++]; \
370 } \
371 } else { \
372 for (i = 0; idx < size; i++, idx++)\
373 r->ring[idx] = obj_table[i]; \
374 for (idx = 0; i < n; i++, idx++) \
375 r->ring[idx] = obj_table[i]; \
376 } \
377 } while(0)
The likely branch is taken when the elements to enqueue do not wrap past the end of the ring. The copy loop advances four slots per iteration, a manual loop unrolling that cuts down on index arithmetic and branch prediction work; if the data does wrap, the macro first fills [idx, size) and then the remaining elements starting back at index 0.
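Stripped of the unrolling, the macro is equivalent to this plain loop (a simplified rewrite for readability; it takes as parameters the fields the macro picks up from r and the enclosing function):

#include <stdint.h>

/* copy n object pointers into the ring starting at prod_head,
 * wrapping back to slot 0 when the end of the table is reached */
static void enqueue_ptrs_simple(void **ring, uint32_t mask, uint32_t prod_head,
                                void * const *obj_table, unsigned n)
{
    uint32_t idx = prod_head & mask;
    unsigned i;

    for (i = 0; i < n; i++) {
        ring[idx] = obj_table[i];
        idx = (idx + 1) & mask;
    }
}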
After the copy comes rte_smp_wmb() (A3), which on ARM expands to do { asm volatile ("dmb st" : : : "memory"); } while (0): all stores issued before the barrier must complete before any store issued after it. The comment in the source says the same thing:
54 /**
55 * Write memory barrier.
56 *
57 * Guarantees that the STORE operations generated before the barrier
58 * occur before the STORE operations generated after.
59 */
Memory barriers deserve a post of their own, so I won't go into much detail here. The short version: both the compiler and the CPU may reorder our code as an optimization, and to keep the algorithm correct, wmb guarantees that no write before it is moved after it and no write after it is moved before it; rmb does the same for reads, and mb constrains both.
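To make the wmb/rmb division of labor concrete, here is the classic publish pattern the ring relies on, written with the DPDK barrier names (a generic sketch of mine; payload and ready are made-up variables standing in for the ring slots and prod.tail):

#include <rte_atomic.h>   /* rte_smp_wmb() / rte_smp_rmb() */

static volatile int payload;   /* stands in for r->ring[idx] */
static volatile int ready;     /* stands in for r->prod.tail */

/* producer: fill the payload first, then publish the flag; the write
 * barrier keeps the two stores in that order as seen by other cores */
static void publisher(void)
{
    payload = 42;
    rte_smp_wmb();
    ready = 1;
}

/* consumer: observe the flag first, then read the payload; the read
 * barrier keeps the two loads in that order */
static int subscriber(void)
{
    while (ready == 0)
        ;                      /* spin until published */
    rte_smp_rmb();
    return payload;            /* sees 42, never a stale value */
}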
Lines 582-590 check whether the enqueue pushed the ring past the watermark ("Quota exceeded. The objects have been enqueued, but the high water mark is exceeded."). When can this if actually fire? Only if the user has lowered the watermark (via rte_ring_set_water_mark): rte_ring_init sets it to count by default and the ring never holds more than count-1 entries, hence the unlikely hint telling the compiler the branch is rarely taken.
Finally the producer publishes with r->prod.tail = prod_next (A4). Note that (A1) and (A4) sit on either side of the barrier (A3).
Now the consumer side:
722 static inline int __attribute__((always_inline))
723 __rte_ring_sc_do_dequeue(struct rte_ring *r, void **obj_table,
724 unsigned n, enum rte_ring_queue_behavior behavior)
725 {
726 uint32_t cons_head, prod_tail;
727 uint32_t cons_next, entries;
728 unsigned i;
729 uint32_t mask = r->prod.mask;
730
731 cons_head = r->cons.head;
732 prod_tail = r->prod.tail;
733 /* The subtraction is done between two unsigned 32bits value
734 * (the result is always modulo 32 bits even if we have
735 * cons_head > prod_tail). So 'entries' is always between 0
736 * and size(ring)-1. */
737 entries = prod_tail - cons_head;
738
739 if (n > entries) {
740 if (behavior == RTE_RING_QUEUE_FIXED) {
741 __RING_STAT_ADD(r, deq_fail, n);
742 return -ENOENT;
743 }
744 else {
745 if (unlikely(entries == 0)){
746 __RING_STAT_ADD(r, deq_fail, n);
747 return 0;
748 }
749
750 n = entries;
751 }
752 }
754 cons_next = cons_head + n;
755 r->cons.head = cons_next;
756
757 /* copy in table */
758 DEQUEUE_PTRS();
759 rte_smp_rmb();
760
761 __RING_STAT_ADD(r, deq_success, n);
762 r->cons.tail = cons_next;
763 return behavior == RTE_RING_QUEUE_FIXED ? 0 : n;
764 }
731 cons_head = r->cons.head;
732 prod_tail = r->prod.tail;
These two lines take local snapshots of the consumer head and of the producer's published tail, then compute how many entries are available; since the arithmetic is done on uint32_t values, entries always ends up in [0, size-1] and is never negative.
Lines 739-752 use behavior to decide whether n elements can be popped (or how many can). The consumer then computes cons_next, executes r->cons.head = cons_next (B1), and runs DEQUEUE_PTRS, the mirror image of ENQUEUE_PTRS:
382 #define DEQUEUE_PTRS() do { \
383 uint32_t idx = cons_head & mask; \
384 const uint32_t size = r->cons.size; \
385 if (likely(idx + n < size)) { \
386 for (i = 0; i < (n & (~(unsigned)0x3)); i+=4, idx+=4) {\
387 obj_table[i] = r->ring[idx]; \
388 obj_table[i+1] = r->ring[idx+1]; \
389 obj_table[i+2] = r->ring[idx+2]; \
390 obj_table[i+3] = r->ring[idx+3]; \
391 } \
392 switch (n & 0x3) { \
393 case 3: obj_table[i++] = r->ring[idx++]; \
394 case 2: obj_table[i++] = r->ring[idx++]; \
395 case 1: obj_table[i++] = r->ring[idx++]; \
396 } \
397 } else { \
398 for (i = 0; idx < size; i++, idx++) \
399 obj_table[i] = r->ring[idx]; \
400 for (idx = 0; i < n; i++, idx++) \
401 obj_table[i] = r->ring[idx]; \
402 } \
403 } while (0)
Then comes the read memory barrier rte_smp_rmb() (B2), which maps to __sync_synchronize() on some targets, and finally r->cons.tail = cons_next (B3). As on the producer side, (B1) and (B3) sit on either side of the barrier (B2).
Now think through pushes and pops with one producer thread A and one consumer thread B sharing the ring. A few questions come up: could B decide there is data to pop and then read garbage, because A carelessly updated prod.tail before actually writing the data? Could A overwrite entries that B has not finished popping, because B bumped cons.tail too early? And so on. Thanks to the memory barriers (and because each index is written only by its owner), none of this can happen. The single-producer/single-consumer paths above are fairly simple; a ring with multiple readers and writers is more involved.
Let's now analyze that more complex case, multiple threads pushing and popping concurrently. Only the produce and consume paths are covered here; the remaining details are minor and skipped.
Here is the multi-producer implementation; the source describes it as:
408 * This function uses a "compare and set" instruction to move the
409 * producer index atomically.
430 static inline int __attribute__((always_inline))
431 __rte_ring_mp_do_enqueue(struct rte_ring *r, void * const *obj_table,
432 unsigned n, enum rte_ring_queue_behavior behavior)
433 {
434 uint32_t prod_head, prod_next;
435 uint32_t cons_tail, free_entries;
436 const unsigned max = n;
437 int success;
438 unsigned i, rep = 0;
439 uint32_t mask = r->prod.mask;
440 int ret;
442 /* Avoid the unnecessary cmpset operation below, which is also
443 * potentially harmful when n equals 0. */
444 if (n == 0)
445 return 0;
446
447 /* move prod.head atomically */
448 do {
449 /* Reset n to the initial burst count */
450 n = max;
451
452 prod_head = r->prod.head;
453 cons_tail = r->cons.tail;
454 /* The subtraction is done between two unsigned 32bits value
455 * (the result is always modulo 32 bits even if we have
456 * prod_head > cons_tail). So 'free_entries' is always between 0
457 * and size(ring)-1. */
458 free_entries = (mask + cons_tail - prod_head);
459
460 /* check that we have enough room in ring */
461 if (unlikely(n > free_entries)) {
462 if (behavior == RTE_RING_QUEUE_FIXED) {
463 __RING_STAT_ADD(r, enq_fail, n);
464 return -ENOBUFS;
465 }
466 else {
467 /* No free entry available */
468 if (unlikely(free_entries == 0)) {
469 __RING_STAT_ADD(r, enq_fail, n);
470 return 0;
471 }
472
473 n = free_entries;
474 }
475 }
477 prod_next = prod_head + n;
478 success = rte_atomic32_cmpset(&r->prod.head, prod_head,
479 prod_next);
480 } while (unlikely(success == 0));
481
482 /* write entries in ring */
483 ENQUEUE_PTRS();
484 rte_smp_wmb();
485
486 /* if we exceed the watermark */
487 if (unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
488 ret = (behavior == RTE_RING_QUEUE_FIXED) ? -EDQUOT :
489 (int)(n | RTE_RING_QUOT_EXCEED);
490 __RING_STAT_ADD(r, enq_quota, n);
491 }
492 else {
493 ret = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : n;
494 __RING_STAT_ADD(r, enq_success, n);
495 }
497 /*
498 * If there are other enqueues in progress that preceded us,
499 * we need to wait for them to complete
500 */
501 while (unlikely(r->prod.tail != prod_head)) {
502 rte_pause();
503
504 /* Set RTE_RING_PAUSE_REP_COUNT to avoid spin too long waiting
505 * for other thread finish. It gives pre-empted thread a chance
506 * to proceed and finish with ring dequeue operation. */
507 if (RTE_RING_PAUSE_REP_COUNT &&
508 ++rep == RTE_RING_PAUSE_REP_COUNT) {
509 rep = 0;
510 sched_yield();
511 }
512 }
513 r->prod.tail = prod_next;
514 return ret;
515 }
Lines 434-446 declare and initialize local variables; the early return when n == 0 is an optimization (and avoids a harmful cmpset), as the comment explains.
Lines 448-480 are the CAS loop: the thread keeps trying to move prod.head with rte_atomic32_cmpset until it succeeds, or bails out when there is not enough room; the free_entries computation is the same as analyzed above.
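rte_atomic32_cmpset(&r->prod.head, prod_head, prod_next) is an ordinary compare-and-swap: it atomically stores prod_next into prod.head only if prod.head still equals the prod_head we read, and reports whether that succeeded. Conceptually it amounts to something like this (a sketch using the GCC builtin, not the per-architecture DPDK implementation):

#include <stdint.h>

/* returns non-zero when *dst still held exp and was atomically replaced
 * by src; zero when another thread changed it first, in which case the
 * caller re-reads head/tail and retries the reservation */
static inline int cmpset32(volatile uint32_t *dst, uint32_t exp, uint32_t src)
{
    return __sync_bool_compare_and_swap(dst, exp, src);
}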
Then come the copy into the ring and the write barrier, exactly as before:
483 ENQUEUE_PTRS();
484 rte_smp_wmb();
Lines 487-495, the watermark check, are the same as in the single-producer analysis.
Lines 501-512 handle producers that reserved their slots earlier but have not yet published them: before updating prod.tail, the thread waits until r->prod.tail has caught up to its own prod_head, so tails advance strictly in reservation order and nobody publishes entries that an earlier producer is still writing. No CAS is needed for this; the thread spins briefly with rte_pause to soften the busy wait, and a counter rep makes it call sched_yield once it reaches RTE_RING_PAUSE_REP_COUNT (a build-time option), giving a preempted producer a chance to run and finish its ring operation. The wait is also marked unlikely, since contention at this point is expected to be rare. Finally the thread updates prod.tail.
The multi-consumer dequeue is not analyzed in detail here: its CAS reservation loop mirrors the multi-producer one, and the copy out of the ring is the same as in the single-consumer path. In outline (a simplified sketch, not the exact source) it looks like this:
do {
        /* fresh snapshot of our head and the producer's published tail */
        cons_head = r->cons.head;
        prod_tail = r->prod.tail;
        entries = prod_tail - cons_head;
        /* ... bail out or shrink n when n > entries ... */

        cons_next = cons_head + n;
        /* try to reserve [cons_head, cons_next) atomically */
        success = rte_atomic32_cmpset(&r->cons.head, cons_head, cons_next);
} while (unlikely(success == 0));

/* copy the objects out, then order the loads */
DEQUEUE_PTRS();
rte_smp_rmb();

/* wait for consumers that reserved before us to publish their tail */
while (unlikely(r->cons.tail != cons_head))
        rte_pause();

r->cons.tail = cons_next;
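Tying this back to creation: whether these CAS paths or the cheaper single-producer/single-consumer paths run is decided purely by the flags passed at creation time, since rte_ring_init derives sp_enqueue/sc_dequeue from them and the generic enqueue/dequeue wrappers branch on those fields. A minimal sketch (inside some init function, after rte_eal_init()):

/* multi-producer / multi-consumer ring: no SP/SC flags, so
 * sp_enqueue and sc_dequeue stay 0 and the CAS paths are used */
struct rte_ring *mpmc = rte_ring_create("mpmc_ring", 1024, rte_socket_id(), 0);

/* single-producer / single-consumer ring: the simpler paths shown earlier */
struct rte_ring *spsc = rte_ring_create("spsc_ring", 1024, rte_socket_id(),
                                        RING_F_SP_ENQ | RING_F_SC_DEQ);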
That is the overall design of the lock-free ring queue. Some of the lower-level details may not be fully nailed down here, partly because they are hardware-specific and partly because of the limits of my own understanding; either way, I will keep studying this area.
One more note on the CAS instruction, quoting a technical article from the DPDK open-source community: "When two cores execute CAS against the same address at the same time, each of them is really trying to modify the cache line it holds itself. Suppose both cores hold the cache line for that address in the S (shared) state. To modify it, a core must first bring the line to E or M, which means invalidating the other core's copy, so both cores issue an invalidate for that line on the ring bus. The ring bus then arbitrates, according to its protocol, whether core0 or core1 wins; the winner completes its operation, while the loser has to accept the outcome, invalidate its own copy of the cache line, re-read the value the winner wrote, and start over.
At this point we can see that the MESIF protocol greatly reduces read latency without making writes slower, while preserving coherence. For our CAS operation, then, the lock has not disappeared; it has simply been pushed down into the ring bus arbitration protocol. Moreover, many cores issuing CAS against the same address will keep invalidating one another's copy of the cache line, producing a ping-pong effect that likewise hurts performance. The takeaway is that CAS-based operations still must not be abused; avoid them unless there is truly no alternative, and in most cases a data-separation design is the better choice."
References:
https://coolshell.cn/articles/8239.html
https://en.wikipedia.org/wiki/Non-blocking_algorithm
http://in355hz.iteye.com/blog/1797829
https://en.wikipedia.org/wiki/Memory_barrier
https://yq.aliyun.com/articles/95441
http://www.lenky.info/archives/2012/11/2028
https://mp.weixin.qq.com/s?__biz=MzI3NDA4ODY4MA==&mid=2653334228&idx=1&sn=8a106aed154ded89283146ddb6a02cf8&chksm=f0cb5d53c7bcd445704592eb7c06407f1b18f1bf94bed3d33345d2443454b15f9c859b64371c&scene=21#wechat_redirect
http://www.man7.org/linux/man-pages/man2/sched_yield.2.html