dpdk之memcpy优化

最近看某技术论坛,看到同是后台开发的同学,面试腾讯并回忆了一题关于是否知道memcpy优化相关的。

因为工作原因,是好久没有使用到类似这些库函数,并思考着优化方面,然后带着兴趣去研究下。本身优化这件事情,是需要测试评估的,不过早优化,还是先以程序功能正确和稳定,后面有瓶颈再通过一些工具去压测,找出热点,8-2原则,使用时间复杂度更低的数据结构或算法去代替并回归测试,或以空间换时间。

见过网上大部分资料,包括笔试面试题,有关memcpy/memmove等的主要实现是一个字符一个字符拷贝的,然后早些年工作中使用到dpdk,后者中重新实现了memcpy,大概就是根据特定业务场景优化,所以这里是准备分析下相关的代码实现。

memcpy和memmove的接口描述:

SYNOPSIS
     #include 

     void *
     memcpy(void *restrict dst, const void *restrict src, size_t n);

DESCRIPTION
     The memcpy() function copies n bytes from memory area src to memory area dst.  If dst
     and src overlap, behavior is undefined.  Applications in which dst and src might
     overlap should use memmove(3) instead.
SYNOPSIS
     #include 

     void *
     memmove(void *dst, const void *src, size_t len);

DESCRIPTION
     The memmove() function copies len bytes from string src to string dst.  The two
     strings may overlap; the copy is always done in a non-destructive manner.

这里以glibc-2.27中的实现为说明。以上两个实现,前者没有考虑内存重叠的情况,后者更安全些。

memcpy

 26 void *
 27 memcpy (void *dstpp, const void *srcpp, size_t len)
 28 {
 29   unsigned long int dstp = (long int) dstpp;
 30   unsigned long int srcp = (long int) srcpp;
 31 
 32   /* Copy from the beginning to the end.  */
 33 
 34   /* If there not too few bytes to copy, use word copy.  */
 35   if (len >= OP_T_THRES)
 36     {
 37       /* Copy just a few bytes to make DSTP aligned.  */
 38       len -= (-dstp) % OPSIZ;
 39       BYTE_COPY_FWD (dstp, srcp, (-dstp) % OPSIZ);
 40 
 41       /* Copy whole pages from SRCP to DSTP by virtual address manipulation,
 42      as much as possible.  */
 43 
 44       PAGE_COPY_FWD_MAYBE (dstp, srcp, len, len);
 45 
 46       /* Copy from SRCP to DSTP taking advantage of the known alignment of
 47      DSTP.  Number of bytes remaining is put in the third argument,
 48      i.e. in LEN.  This number may vary from machine to machine.  */
 49 
 50       WORD_COPY_FWD (dstp, srcp, len, len);
 51 
 52       /* Fall out and copy the tail.  */
 53     }
 54 
 55   /* There are just a few bytes to copy.  Use byte memory operations.  */
 56   BYTE_COPY_FWD (dstp, srcp, len);
 57 
 58   return dstpp;
 59 }

 75 /* Copy exactly NBYTES bytes from SRC_BP to DST_BP,
 76    without any assumptions about alignment of the pointers.  */
 77 #define BYTE_COPY_FWD(dst_bp, src_bp, nbytes)                     \
 78   do                                          \
 79     {                                         \
 80       size_t __nbytes = (nbytes);                         \
 81       while (__nbytes > 0)                            \
 82     {                                     \
 83       byte __x = ((byte *) src_bp)[0];                    \
 84       src_bp += 1;                                \
 85       __nbytes -= 1;                              \
 86       ((byte *) dst_bp)[0] = __x;                         \
 87       dst_bp += 1;                                \
 88     }                                     \
 89     } while (0)

这里宏OP_T_THRES以16为例,dpdk中也是16。WORD_COPY_FWD和PAGE_COPY_FWD_MAYBE这里不贴代码,感觉太多了,如注释说明,主要还是对要拷贝的地址进行边界对齐,并利用x86的movsl实现多字节拷贝。

memmove

 42 rettype
 43 inhibit_loop_to_libcall
 44 MEMMOVE (a1const void *a1, a2const void *a2, size_t len)
 45 {
 46   unsigned long int dstp = (long int) dest;
 47   unsigned long int srcp = (long int) src;
 48 
 49   /* This test makes the forward copying code be used whenever possible.
 50      Reduces the working set.  */
 51   if (dstp - srcp >= len)   /* *Unsigned* compare!  */
 52     {
 53       /* Copy from the beginning to the end.  */
 54 
 55 #if MEMCPY_OK_FOR_FWD_MEMMOVE
 56       dest = memcpy (dest, src, len);
 57 #else
 58       /* If there not too few bytes to copy, use word copy.  */
 59       if (len >= OP_T_THRES)
 61       /* Copy just a few bytes to make DSTP aligned.  */
 62       len -= (-dstp) % OPSIZ;
 63       BYTE_COPY_FWD (dstp, srcp, (-dstp) % OPSIZ);
 64 
 65       /* Copy whole pages from SRCP to DSTP by virtual address
 66          manipulation, as much as possible.  */
 67 
 68       PAGE_COPY_FWD_MAYBE (dstp, srcp, len, len);
 69 
 70       /* Copy from SRCP to DSTP taking advantage of the known
 71          alignment of DSTP.  Number of bytes remaining is put
 72          in the third argument, i.e. in LEN.  This number may
 73          vary from machine to machine.  */
 74 
 75       WORD_COPY_FWD (dstp, srcp, len, len);
 76 
 77       /* Fall out and copy the tail.  */
 78     }
 79 
 80       /* There are just a few bytes to copy.  Use byte memory operations.  */
 81       BYTE_COPY_FWD (dstp, srcp, len);
 82 #endif /* MEMCPY_OK_FOR_FWD_MEMMOVE */
 83     }
 84   else
 85     {
 86       /* Copy from the end to the beginning.  */
 87       srcp += len;
 88       dstp += len;
 89 
 90       /* If there not too few bytes to copy, use word copy.  */
 91       if (len >= OP_T_THRES)
 92     {
 93       /* Copy just a few bytes to make DSTP aligned.  */
 94       len -= dstp % OPSIZ;
 95       BYTE_COPY_BWD (dstp, srcp, dstp % OPSIZ);
 96 
 97       /* Copy from SRCP to DSTP taking advantage of the known
 98          alignment of DSTP.  Number of bytes remaining is put
 99          in the third argument, i.e. in LEN.  This number may
100          vary from machine to machine.  */
101 
102       WORD_COPY_BWD (dstp, srcp, len, len);
103 
104       /* Fall out and copy the tail.  */
105     }
106 
107       /* There are just a few bytes to copy.  Use byte memory operations.  */
108       BYTE_COPY_BWD (dstp, srcp, len);
109     }
110 
111   RETURN (dest);
112 }

memmove中处理了地址可能重叠的情况,不过这里的实现是可以简化一些。

rte_memcpy

下面贴上dpdk中关于memcpy相关的优化,借用官方的描述:
“不存在一个“最优”的适用于任何场景(硬件+软件+数据)的memcpy实现。这也是DPDK中rte_memcpy存在的原因:不是glibc中的memcpy不够优秀,而是它和DPDK中的核心应用场景之间不合适,有没有觉得这种说法很耳熟?本文将着重探讨如何针对具体应用进行memcpy(或其他任何程序)的性能优化。”

“通常memcpy的性能开销包含:

  1. 数据的Load/Store
  2. 附加计算任务(例如地址对齐处理)
  3. 分支预测

通用的memcpy优化方向:

  1. 最大限度使用memory/cache带宽(Vector指令、指令级并行)
  2. Load/Store地址对齐
  3. 集中顺序访问
  4. 适当使用non-temporal访存执令
  5. 适当使用String指令来加速较大的拷贝

最后,所有的指令都经过CPU的流水线执行,因此对流水线效率的分析至关重要,需要优化指令顺序以避免造成流水线阻塞。”

dpdk之memcpy优化_第1张图片
对比

191 static inline void *
192 rte_memcpy(void *dst, const void *src, size_t n)
193 {
194     uintptr_t dstu = (uintptr_t)dst;
195     uintptr_t srcu = (uintptr_t)src;
196     void *ret = dst;
197     size_t dstofss;
198     size_t bits;
199 
200     /**
201      * Copy less than 16 bytes
202      */
203     if (n < 16) {
204         if (n & 0x01) {
205             *(uint8_t *)dstu = *(const uint8_t *)srcu;
206             srcu = (uintptr_t)((const uint8_t *)srcu + 1);
207             dstu = (uintptr_t)((uint8_t *)dstu + 1);
208         }
209         if (n & 0x02) {
210             *(uint16_t *)dstu = *(const uint16_t *)srcu;
211             srcu = (uintptr_t)((const uint16_t *)srcu + 1);
212             dstu = (uintptr_t)((uint16_t *)dstu + 1);
213         }
214         if (n & 0x04) {
215             *(uint32_t *)dstu = *(const uint32_t *)srcu;
216             srcu = (uintptr_t)((const uint32_t *)srcu + 1);
217             dstu = (uintptr_t)((uint32_t *)dstu + 1);
218         }
219         if (n & 0x08)
220             *(uint64_t *)dstu = *(const uint64_t *)srcu;
221         return ret;
222     }

当要复制的数据长度小于16字节的时候,是直接通过类型转换赋值。

224     /**
225      * Fast way when copy size doesn't exceed 512 bytes
226      */
227     if (n <= 32) {
228         rte_mov16((uint8_t *)dst, (const uint8_t *)src);
229         rte_mov16((uint8_t *)dst - 16 + n,
230                   (const uint8_t *)src - 16 + n);
231         return ret;
232     }
233     if (n <= 64) {
234         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
235         rte_mov32((uint8_t *)dst - 32 + n,
236                   (const uint8_t *)src - 32 + n);
237         return ret;
238     }
239     if (n <= 512) {
240         if (n >= 256) {
241             n -= 256;
242             rte_mov256((uint8_t *)dst, (const uint8_t *)src);
243             src = (const uint8_t *)src + 256;
244             dst = (uint8_t *)dst + 256;
245         }
246         if (n >= 128) {
247             n -= 128;
248             rte_mov128((uint8_t *)dst, (const uint8_t *)src);
249             src = (const uint8_t *)src + 128;
250             dst = (uint8_t *)dst + 128;
251         }
252 COPY_BLOCK_128_BACK63:
253         if (n > 64) {
254             rte_mov64((uint8_t *)dst, (const uint8_t *)src);
255             rte_mov64((uint8_t *)dst - 64 + n,
256                       (const uint8_t *)src - 64 + n);
257             return ret;
258         }
259         if (n > 0)
260             rte_mov64((uint8_t *)dst - 64 + n,
261                       (const uint8_t *)src - 64 + n);
262         return ret;
263     }
 76 /**
 77  * Copy 16 bytes from one location to another,
 78  * locations should not overlap.
 79  */
 80 static inline void
 81 rte_mov16(uint8_t *dst, const uint8_t *src)
 82 {
 83     __m128i xmm0;
 84 
 85     xmm0 = _mm_loadu_si128((const __m128i *)src);
 86     _mm_storeu_si128((__m128i *)dst, xmm0);
 87 }
 88 

 93 static inline void
 94 rte_mov32(uint8_t *dst, const uint8_t *src)
 95 {
 96     __m256i ymm0;
 97 
 98     ymm0 = _mm256_loadu_si256((const __m256i *)src);
 99     _mm256_storeu_si256((__m256i *)dst, ymm0);
100 }

106 static inline void
107 rte_mov64(uint8_t *dst, const uint8_t *src)
108 {
109     __m512i zmm0;
110 
111     zmm0 = _mm512_loadu_si512((const void *)src);
112     _mm512_storeu_si512((void *)dst, zmm0);
113 }

119 static inline void
120 rte_mov128(uint8_t *dst, const uint8_t *src)
121 {
122     rte_mov64(dst + 0 * 64, src + 0 * 64);
123     rte_mov64(dst + 1 * 64, src + 1 * 64);
124 }

当要复制的数据长度小于等于512字节的时候,这里以rte_mov16为例说明SSE2指令的意义。
__m128i表示为128bits的整数;
_mm_loadu_si128表示:Loads 128-bit value;即加载128位值;
_mm_storeu_si128表示:Stores 128-bit value;
Address dst must be 16-byte aligned.

不过这里有个疑问,比如当n=300的时候,先执行rte_mov256复制,然后n=4,然后再执行rte_mov64,此时(uint8_t *)dst - 64 + n语句把dst往前移(64 - n)个字节单位,那么可能造成重复加载(赋值)(64 - n)*8位?

277     /**
278      * Copy 512-byte blocks.
279      * Use copy block function for better instruction order control,
280      * which is important when load is unaligned.
281      */
282     rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
283     bits = n;
284     n = n & 511;
285     bits -= n;
286     src = (const uint8_t *)src + bits;
287     dst = (uint8_t *)dst + bits;
288 
289     /**
290      * Copy 128-byte blocks.
291      * Use copy block function for better instruction order control,
292      * which is important when load is unaligned.
293      */
294     if (n >= 128) {
295         rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
296         bits = n;
297         n = n & 127;
298         bits -= n;
299         src = (const uint8_t *)src + bits;
300         dst = (uint8_t *)dst + bits;
301     }
303     /**
304      * Copy whatever left
305      */
306     goto COPY_BLOCK_128_BACK63;
307 }

当超过512字节时,这里先对齐,即先加载64字节,然后把src/dst往后移dstofss个字节单位,从(uint8_t *)dst + dstofss(uint8_t *)dst+64会在后面重新赋值。然后以512个字节为block进行赋值,接着以128字节为单位等。

139 /**
140  * Copy 128-byte blocks from one location to another,
141  * locations should not overlap.
142  */
143 static inline void
144 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
145 {   
146     __m512i zmm0, zmm1;
147     
148     while (n >= 128) {
149         zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
150         n -= 128;
151         zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
152         src = src + 128;
153         _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
154         _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
155         dst = dst + 128;
156     }   
157 }

163 static inline void
164 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
165 {
166     __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
167 
168     while (n >= 512) {
169         zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
170         n -= 512;
171         zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
172         zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
173         zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
174         zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
175         zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
176         zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
177         zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
178         src = src + 512;
179         _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
180         _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
181         _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
182         _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
183         _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
184         _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
185         _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
186         _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
187         dst = dst + 512;
188     }
189 }

以上是dpdk中针对此业务场景针对性的对memory进行了优化,另外实现中没有处理overlap地址重叠的情况。

参考
大并发服务器内存转换的灵活运用,memcpy的思考
glibc--memcpy源码分析
DPDK中的memcpy性能优化及思考
_mm_loadu_si128

你可能感兴趣的:(dpdk之memcpy优化)