xmemcpy 改进版:利用 movdqu 速度快的特点,并通过内联和长度常量化来提高小块内存 memcpy 的性能
xmemcpy 来自 github 上的 /progs/C/c_progs/memcpy.c,不确定该仓库的作者是否就是原作者;这里在其基础上做了部分改进
------2016-2-28注意1:以下内容的缓冲区由于反复读取,总在L1cache中,类似于栈内存,如果总是在超出cache的内存中,则由于内存速度拖累,改进版与memcpy很难拉开差距,但是仍然有一定的效果
------2016-2-28注意2:DEBUG下速度会很慢,除非关闭/GS或用 #pragma runtime_checks( "s", restore ) (此编译杂注对模板无效)
------2016-3-5 注意3:参看zmemcpy改进版,对debug模式有相当大的提高 http://blog.csdn.net/superzmy/article/details/50810343
预期结果:
All time to memcpy 80 * 100M is 0.248s in 3GHz (xmemcopy)
All time to memcpy 80 * 100M is 0.476s in 3GHz (xmemcpy)
All time to memcpy 80 * 100M is 0.778s in 3GHz (xmemcpy unknownSize)
All time to memcpy 80 * 100M is 0.232s in 3GHz (movdq)
All time to memcpy 80 * 100M is 0.257s in 3GHz (movdq unalign)
All time to memcpy 81 * 100M is 0.298s in 3GHz (movdq)
All time to memcpy 81 * 100M is 0.264s in 3GHz (movdq unalign)
All time to memcpy 400 * 100M is 1.334s in 3GHz (xmemcopy)
All time to memcpy 400 * 100M is 1.236s in 3GHz (xmemcopy unalign)
All time to memcpy 400 * 100M is 1.819s in 3GHz (xmemcpy)
All time to memcpy 400 * 100M is 3.051s in 3GHz (rep movs)
All time to memcpy 400 * 100M is 2.984s in 3GHz (rep movs unalign)
All time to memcpy 400 * 100M is 3.015s in 3GHz (rep movs handwrite asm)
All time to memcpy 401 * 100M is 3.093s in 3GHz (rep movs)
All time to memcpy 401 * 100M is 3.193s in 3GHz (rep movs handwrite asm)
All time to memcpy 80 * 100M is 1.216s in 3GHz (rep movs handwrite asm)
All time to memcpy 4000 * 100M is 15.254s in 3GHz (rep movs handwrite asm)
All time to memcpy 80 * 100M is 1.824s in 3GHz (call _memcpy)
All time to memcpy 81 * 100M is 1.828s in 3GHz (call _memcpy)
All time to memcpy 81 * 100M is 1.779s in 3GHz (call _memcpy unalign)
All time to memcpy 400 * 100M is 2.554s in 3GHz (call _memcpy)
All time to memcpy 401 * 100M is 2.777s in 3GHz (call _memcpy)
All time to memcpy 401 * 100M is 2.725s in 3GHz (call _memcpy unalign)
All time to memcpy 4000 * 100M is 14.379s in 3GHz (call _memcpy)

以上结果由 VS2013 编译的代码在 E3 1230V2 上运行得到。
// ConsoleApplication3.cpp : 定义控制台应用程序的入口点。 // #include "stdafx.h" #include <windows.h> #include <intrin.h> #include <assert.h> char data80[80] = "abcdefghijklmnopqrstuvwxyz0123456789"; char data400[400] = "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "012345678901234567890123456789012345678"; char data4000[4000] = "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" ; char data401[401] = "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" "abcdefghijklmnopqrstuvwxyz0123456789" ; char data81[81] = "abcdefghijklmnopqrstuvwxyz0123456789"; // optimize memcpy less than 120bytes // char a[32], b[32]; a = b; is faster than memcpy(a, b, sizeof(b)); namespace com { const static size_t _MAXSIZE_ = 80; extern void* (*g_base[_MAXSIZE_+1])(void *dest, const void *src); }; inline void *xmemcpy(void *dest, const void *src, size_t len); namespace com { template <size_t size> struct xmemcpy_t { int data[size]; }; template <> struct xmemcpy_t<0> { }; template <size_t size> class xmemcopy { public: inline static void * 
copy(void *dest, const void *src) { if (size > _MAXSIZE_) { size_t i = 0; for (; i + _MAXSIZE_ <= size; i += _MAXSIZE_) xmemcopy<_MAXSIZE_>::copy((char*)dest + i, (const char*)src + i); if (size % _MAXSIZE_) xmemcopy<size % _MAXSIZE_>::copy((char*)dest + i, (const char*)src + i); return dest; } typedef xmemcpy_t<((size - 1) % _MAXSIZE_ + 1) / sizeof(int)> type_t; *((type_t *)dest) = *((type_t *)src); if ((size%sizeof(int)) > 0) { ((char *)dest)[size - 1] = ((char *)src)[size - 1]; } if ((size%sizeof(int)) > 1) { ((char *)dest)[size - 2] = ((char *)src)[size - 2]; } if ((size%sizeof(int)) > 2) { ((char *)dest)[size - 3] = ((char *)src)[size - 3]; } return dest; } }; template <> class xmemcopy<0> { public: static void * copy(void *dest, const void *src) { return dest; } }; void* (*g_base[_MAXSIZE_+1])(void *dest, const void *src); template <size_t len> void init() { g_base[len] = xmemcopy<len>::copy; init<len - 1>(); } template <> void init<0>() { g_base[0] = xmemcopy<0>::copy; } struct xmem_monitor { xmem_monitor() { init<_MAXSIZE_>(); } }; static xmem_monitor g_monitor; } inline void *xmemcpy(void *dest, const void *src, size_t len) { if (len <= com::_MAXSIZE_) { return com::g_base[len](dest, src); } else if (len <= com::_MAXSIZE_ * 10) { size_t i = 0; for (; i + com::_MAXSIZE_ < len; i += com::_MAXSIZE_) com::xmemcopy<com::_MAXSIZE_>::copy((char*)dest + i, (const char*)src + i); com::g_base[len - i]((char*)dest + i, (const char*)src + i); return dest; } return ::memcpy(dest, src, len); } int _tmain(int argc, _TCHAR* argv[]) { SetProcessAffinityMask(GetCurrentProcess(), 2); char buffer[10000] = {}; com::xmemcopy<com::_MAXSIZE_ * 2>::copy(buffer, data400); if (memcmp(buffer, data400, com::_MAXSIZE_ * 2)) __asm int 3; com::xmemcopy<com::_MAXSIZE_ * 2 + 1>::copy(buffer, data400); if (memcmp(buffer, data400, com::_MAXSIZE_ * 2 + 1)) __asm int 3; com::xmemcopy<400>::copy(buffer, data400); if(memcmp(buffer, data400, 400)) __asm int 3; char* volatile pb = buffer; char* 
volatile pb1 = buffer + 1; size_t volatile size40 = sizeof(data80); size_t volatile size41 = sizeof(data81); assert((int)pb % 4 == 0); assert((int)pb1 % 4 == 1); assert((int)data80 % 8 == 0); assert((int)data400 % 8 == 0); assert((int)data4000 % 8 == 0); for (int i = 0; i < 10; ++i) { memcpy(pb, data80, size40); memcpy(pb, data81, size41); memcpy(pb, data400, sizeof(data400)); memcpy(pb, data401, sizeof(data401)); memcpy(pb, data4000, sizeof(data4000)); } printf("\n"); enum { Count = 100000000 }; #if(1) { auto& dest = data80; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) com::xmemcopy<sizeof(dest)>::copy(pb, dest); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data80; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) xmemcpy(pb, dest, sizeof(dest)); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data80; size_t volatile size = sizeof(dest); __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) xmemcpy(pb, dest, size); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy unknownSize)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data80; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb, dest, sizeof(dest)); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data80; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb1, dest, sizeof(dest)); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data81; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb, dest, sizeof(dest)); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz 
(movdq)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data81; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb1, dest, sizeof(dest)); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } ////////////////////////////////////////////////////////////////////////// { auto& dest = data400; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) com::xmemcopy<sizeof(dest)>::copy(pb, dest); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data400; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) com::xmemcopy<sizeof(dest)>::copy(pb1, dest); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } #endif memset(pb, 0, 400); { auto& dest = data400; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) xmemcpy(pb, dest, sizeof(dest)); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data400; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb, dest, sizeof(dest)); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data400; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb1, dest, sizeof(dest)); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data400; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) { __asm { mov edi, dword ptr[pb]; mov ecx, size data400 / 4; mov esi, dest; rep movs dword ptr es : [edi], dword ptr[esi]; } } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz 
(rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data401; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb, dest, sizeof(dest)); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data401; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) { __asm { mov edi, dword ptr[pb]; mov ecx, size data401 / 4; mov esi, dest; rep movs dword ptr es : [edi], dword ptr[esi]; movs byte ptr es : [edi], byte ptr[esi] } } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data80; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) { __asm { mov edi, dword ptr[pb]; mov ecx, size data80 / 4; mov esi, dest; rep movs dword ptr es : [edi], dword ptr[esi]; } } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data4000; __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) { __asm { mov edi, dword ptr[pb]; mov ecx, size data4000 / 4; mov esi, dest; rep movs dword ptr es : [edi], dword ptr[esi]; } } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0); } { auto& dest = data80; size_t volatile size = sizeof(dest); __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb, dest, size); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0); } { auto& dest = data81; size_t volatile size = sizeof(dest); __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb, dest, size); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0); 
} { auto& dest = data81; size_t volatile size = sizeof(dest); __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb1, dest, size); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy unalign)\n", size, Count / 1000000, t / 3000000000.0); } { auto& dest = data400; size_t volatile size = sizeof(dest); __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb, dest, size); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0); } { auto& dest = data401; size_t volatile size = sizeof(dest); __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb, dest, size); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0); } { auto& dest = data401; size_t volatile size = sizeof(dest); __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb1, dest, size); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy unalign)\n", size, Count / 1000000, t / 3000000000.0); } { auto& dest = data4000; size_t volatile size = sizeof(dest); __int64 t = __rdtsc(); for (int i = 0; i < Count; ++i) memcpy(pb, dest, size); t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0); } return 0; }