zmemcpy(仅适用于 MSVC):小内存高速复制。即使在 Debug 模式下复制得也比 memcpy 快,而且与 Release 模式下的速度差距也不大
2016-3-6注意: 由于指令缓存命中、内联深度等方面的原因,此函数性能测试看起来很不错,插入到程序中实际应用时就不一定了,请测试对比后再决定使用
对VC2008及以上均测试通过
Release Mode
All time to memcpy 63 * 100M is 0.042s in 3GHz (zmemcopy template const size)
All time to memcpy 63 * 100M is 0.050s in 3GHz (zmemcopy static const size)
All time to memcpy 63 * 100M is 0.147s in 3GHz (memcpy const size)
All time to memcpy 63 * 100M is 0.048s in 3GHz (zmemcopy const size)
All time to memcpy 63 * 100M is 0.050s in 3GHz (zmemcopy unknown array direct)
All time to memcpy 63 * 100M is 0.051s in 3GHz (zmemcopy unknown small size)
All time to memcpy 63 * 100M is 0.056s in 3GHz (zmemcopy unknown size)
All time to memcpy 63 * 100M is 0.140s in 3GHz (memcpy unknown size)
Debug Mode
All time to memcpy 63 * 100M is 0.056s in 3GHz (zmemcopy template const size)
All time to memcpy 63 * 100M is 0.055s in 3GHz (zmemcopy static const size)
All time to memcpy 63 * 100M is 0.171s in 3GHz (memcpy const size)
All time to memcpy 63 * 100M is 0.093s in 3GHz (zmemcopy const size)
All time to memcpy 63 * 100M is 0.060s in 3GHz (zmemcopy unknown array direct)
All time to memcpy 63 * 100M is 0.086s in 3GHz (zmemcopy unknown small size)
All time to memcpy 63 * 100M is 0.100s in 3GHz (zmemcopy unknown size)
All time to memcpy 63 * 100M is 0.172s in 3GHz (memcpy unknown size)
使用方法:一般用 zmemcpy(dest, src, size);在已知目标长度(且该长度是常量表达式)时,可以用 ZMemoryCopy::copy<size>(dest, src)
// zmemcpy.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include "zmemcpy.h"
//#include <intrin.h>
//#include <nmmintrin.h>
//#include <windows.h>
//#include <utility>

// Returns the value of the MSVC _ReturnAddress() intrinsic as seen from inside
// this never-inlined function. Not referenced anywhere else in this file.
__declspec(noinline) void* GetCurrentAddress()
{
    return _ReturnAddress();
}

// Returns _ReturnAddress() as seen from wherever this body ends up being
// compiled. It is only marked `inline` (not forced), so whether it is really
// inlined is up to the compiler.
inline void* GetRetAddress()
{
    return _ReturnAddress();
}

// Build-mode probe: compares this function's own return address with the one
// reported by GetRetAddress().
// NOTE(review): presumably the two are equal only when the optimizer inlined
// GetRetAddress() into this function (Release), and differ when it stays a real
// call (Debug) - confirm against the compiler settings actually used.
__declspec(noinline) bool IsReleaseMode()
{
    return _ReturnAddress() == GetRetAddress();
}

// Evaluated once during static initialization; read by _tmain to print the mode.
bool g_IsReleaseMode = IsReleaseMode();

// #pragma runtime_checks( "scu", restore )

// Table of the fixed-size copy routines ZMemoryCopy::___copy_0 .. ___copy_128,
// indexed by the number of bytes to copy.
//#define pCopyFunc(x) copy<x>
#define pCopyFunc(x) ZMemoryCopy::___copy_##x
void(*const copys[129])(char* dest, const char* src) =
{
    ZMemoryCopy::___copy_0,
    pCopyFunc(1), pCopyFunc(2), pCopyFunc(3), pCopyFunc(4), pCopyFunc(5), pCopyFunc(6), pCopyFunc(7), pCopyFunc(8),
    pCopyFunc(9), pCopyFunc(10), pCopyFunc(11), pCopyFunc(12), pCopyFunc(13), pCopyFunc(14), pCopyFunc(15), pCopyFunc(16),
    pCopyFunc(17), pCopyFunc(18), pCopyFunc(19), pCopyFunc(20), pCopyFunc(21), pCopyFunc(22), pCopyFunc(23), pCopyFunc(24),
    pCopyFunc(25), pCopyFunc(26), pCopyFunc(27), pCopyFunc(28), pCopyFunc(29), pCopyFunc(30), pCopyFunc(31), pCopyFunc(32),
    pCopyFunc(33), pCopyFunc(34), pCopyFunc(35), pCopyFunc(36), pCopyFunc(37), pCopyFunc(38), pCopyFunc(39), pCopyFunc(40),
    pCopyFunc(41), pCopyFunc(42), pCopyFunc(43), pCopyFunc(44), pCopyFunc(45), pCopyFunc(46), pCopyFunc(47), pCopyFunc(48),
    pCopyFunc(49), pCopyFunc(50), pCopyFunc(51), pCopyFunc(52), pCopyFunc(53), pCopyFunc(54), pCopyFunc(55), pCopyFunc(56),
    pCopyFunc(57), pCopyFunc(58), pCopyFunc(59), pCopyFunc(60), pCopyFunc(61), pCopyFunc(62), pCopyFunc(63), pCopyFunc(64),
    pCopyFunc(65), pCopyFunc(66), pCopyFunc(67), pCopyFunc(68), pCopyFunc(69), pCopyFunc(70), pCopyFunc(71), pCopyFunc(72),
    pCopyFunc(73), pCopyFunc(74), pCopyFunc(75), pCopyFunc(76), pCopyFunc(77), pCopyFunc(78), pCopyFunc(79), pCopyFunc(80),
    pCopyFunc(81), pCopyFunc(82), pCopyFunc(83), pCopyFunc(84), pCopyFunc(85), pCopyFunc(86), pCopyFunc(87), pCopyFunc(88),
    pCopyFunc(89), pCopyFunc(90), pCopyFunc(91), pCopyFunc(92), pCopyFunc(93), pCopyFunc(94), pCopyFunc(95), pCopyFunc(96),
    pCopyFunc(97), pCopyFunc(98), pCopyFunc(99), pCopyFunc(100), pCopyFunc(101), pCopyFunc(102), pCopyFunc(103), pCopyFunc(104),
    pCopyFunc(105), pCopyFunc(106), pCopyFunc(107), pCopyFunc(108), pCopyFunc(109), pCopyFunc(110), pCopyFunc(111), pCopyFunc(112),
    pCopyFunc(113), pCopyFunc(114), pCopyFunc(115), pCopyFunc(116), pCopyFunc(117), pCopyFunc(118), pCopyFunc(119), pCopyFunc(120),
    pCopyFunc(121), pCopyFunc(122), pCopyFunc(123), pCopyFunc(124), pCopyFunc(125), pCopyFunc(126), pCopyFunc(127), pCopyFunc(128),
};
#undef pCopyFunc

// Same layout as `copys`, but entries 1..128 are instantiations of the
// ZMemoryCopy::copy<N> template. _tmain uses this table for its self-check.
#define pCopyFunc(x) ZMemoryCopy::copy<x>
void(*const template_copys[129])(char* dest, const char* src) =
{
    ZMemoryCopy::___copy_0,
    pCopyFunc(1), pCopyFunc(2), pCopyFunc(3), pCopyFunc(4), pCopyFunc(5), pCopyFunc(6), pCopyFunc(7), pCopyFunc(8),
    pCopyFunc(9), pCopyFunc(10), pCopyFunc(11), pCopyFunc(12), pCopyFunc(13), pCopyFunc(14), pCopyFunc(15), pCopyFunc(16),
    pCopyFunc(17), pCopyFunc(18), pCopyFunc(19), pCopyFunc(20), pCopyFunc(21), pCopyFunc(22), pCopyFunc(23), pCopyFunc(24),
    pCopyFunc(25), pCopyFunc(26), pCopyFunc(27), pCopyFunc(28), pCopyFunc(29), pCopyFunc(30), pCopyFunc(31), pCopyFunc(32),
    pCopyFunc(33), pCopyFunc(34), pCopyFunc(35), pCopyFunc(36), pCopyFunc(37), pCopyFunc(38), pCopyFunc(39), pCopyFunc(40),
    pCopyFunc(41), pCopyFunc(42), pCopyFunc(43), pCopyFunc(44), pCopyFunc(45), pCopyFunc(46), pCopyFunc(47), pCopyFunc(48),
    pCopyFunc(49), pCopyFunc(50), pCopyFunc(51), pCopyFunc(52), pCopyFunc(53), pCopyFunc(54), pCopyFunc(55), pCopyFunc(56),
    pCopyFunc(57), pCopyFunc(58), pCopyFunc(59), pCopyFunc(60), pCopyFunc(61), pCopyFunc(62), pCopyFunc(63), pCopyFunc(64),
    pCopyFunc(65), pCopyFunc(66), pCopyFunc(67), pCopyFunc(68), pCopyFunc(69), pCopyFunc(70), pCopyFunc(71), pCopyFunc(72),
    pCopyFunc(73), pCopyFunc(74), pCopyFunc(75), pCopyFunc(76), pCopyFunc(77), pCopyFunc(78), pCopyFunc(79), pCopyFunc(80),
    pCopyFunc(81), pCopyFunc(82), pCopyFunc(83), pCopyFunc(84), pCopyFunc(85), pCopyFunc(86), pCopyFunc(87), pCopyFunc(88),
    pCopyFunc(89), pCopyFunc(90), pCopyFunc(91), pCopyFunc(92), pCopyFunc(93), pCopyFunc(94), pCopyFunc(95), pCopyFunc(96),
    pCopyFunc(97), pCopyFunc(98), pCopyFunc(99), pCopyFunc(100), pCopyFunc(101), pCopyFunc(102), pCopyFunc(103), pCopyFunc(104),
    pCopyFunc(105), pCopyFunc(106), pCopyFunc(107), pCopyFunc(108), pCopyFunc(109), pCopyFunc(110), pCopyFunc(111), pCopyFunc(112),
    pCopyFunc(113), pCopyFunc(114), pCopyFunc(115), pCopyFunc(116), pCopyFunc(117), pCopyFunc(118), pCopyFunc(119), pCopyFunc(120),
    pCopyFunc(121), pCopyFunc(122), pCopyFunc(123), pCopyFunc(124), pCopyFunc(125), pCopyFunc(126), pCopyFunc(127), pCopyFunc(128),
};
#undef pCopyFunc

// Same entries as `copys`, declared `static`. _tmain uses this table for its
// self-check of the ___copy_N routines.
#define pCopyFunc(x) ZMemoryCopy::___copy_##x
static void(*const static_copys[129])(char* dest, const char* src) =
{
    ZMemoryCopy::___copy_0,
    pCopyFunc(1), pCopyFunc(2), pCopyFunc(3), pCopyFunc(4), pCopyFunc(5), pCopyFunc(6), pCopyFunc(7), pCopyFunc(8),
    pCopyFunc(9), pCopyFunc(10), pCopyFunc(11), pCopyFunc(12), pCopyFunc(13), pCopyFunc(14), pCopyFunc(15), pCopyFunc(16),
    pCopyFunc(17), pCopyFunc(18), pCopyFunc(19), pCopyFunc(20), pCopyFunc(21), pCopyFunc(22), pCopyFunc(23), pCopyFunc(24),
    pCopyFunc(25), pCopyFunc(26), pCopyFunc(27), pCopyFunc(28), pCopyFunc(29), pCopyFunc(30), pCopyFunc(31), pCopyFunc(32),
    pCopyFunc(33), pCopyFunc(34), pCopyFunc(35), pCopyFunc(36), pCopyFunc(37), pCopyFunc(38), pCopyFunc(39), pCopyFunc(40),
    pCopyFunc(41), pCopyFunc(42), pCopyFunc(43), pCopyFunc(44), pCopyFunc(45), pCopyFunc(46), pCopyFunc(47), pCopyFunc(48),
    pCopyFunc(49), pCopyFunc(50), pCopyFunc(51), pCopyFunc(52), pCopyFunc(53), pCopyFunc(54), pCopyFunc(55), pCopyFunc(56),
    pCopyFunc(57), pCopyFunc(58), pCopyFunc(59), pCopyFunc(60), pCopyFunc(61), pCopyFunc(62), pCopyFunc(63), pCopyFunc(64),
    pCopyFunc(65), pCopyFunc(66), pCopyFunc(67), pCopyFunc(68), pCopyFunc(69), pCopyFunc(70), pCopyFunc(71), pCopyFunc(72),
    pCopyFunc(73), pCopyFunc(74), pCopyFunc(75), pCopyFunc(76), pCopyFunc(77), pCopyFunc(78), pCopyFunc(79), pCopyFunc(80),
    pCopyFunc(81), pCopyFunc(82), pCopyFunc(83), pCopyFunc(84), pCopyFunc(85), pCopyFunc(86), pCopyFunc(87), pCopyFunc(88),
    pCopyFunc(89), pCopyFunc(90), pCopyFunc(91), pCopyFunc(92), pCopyFunc(93), pCopyFunc(94), pCopyFunc(95), pCopyFunc(96),
    pCopyFunc(97), pCopyFunc(98), pCopyFunc(99), pCopyFunc(100), pCopyFunc(101), pCopyFunc(102), pCopyFunc(103), pCopyFunc(104),
    pCopyFunc(105), pCopyFunc(106), pCopyFunc(107), pCopyFunc(108), pCopyFunc(109), pCopyFunc(110), pCopyFunc(111), pCopyFunc(112),
    pCopyFunc(113), pCopyFunc(114), pCopyFunc(115), pCopyFunc(116), pCopyFunc(117), pCopyFunc(118), pCopyFunc(119), pCopyFunc(120),
    pCopyFunc(121), pCopyFunc(122), pCopyFunc(123), pCopyFunc(124), pCopyFunc(125), pCopyFunc(126), pCopyFunc(127), pCopyFunc(128),
};
#undef pCopyFunc

// Benchmark buffers: two 32,000,000-byte destinations and one const source of
// the same size. Only the first 4100 bytes of the source are spelled out; the
// remainder of the array is zero-initialized.
char dest[32000000];
char dest2[32000000];
const char pSource_[32000000] =
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "123456789012345678901234567-901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "123456789012345678901234567-901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "123456789012345678901234567-901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "123456789012345678901234567-901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "123456789012345678901234567-901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "123456789012345678901234567-901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "123456789012345678901234567-901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "123456789012345678901234567-901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "123456789012345678901234567-901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "123456789012345678901234567-901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
    "abcde67890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890";

// The pointer itself is volatile, so every use re-reads it from memory.
// NOTE(review): presumably this keeps the optimizer from treating the source
// address (and therefore the copied data) as a compile-time constant - confirm.
const char * volatile pSource = pSource_;

// Disabled experiment (kept for reference): patches the caller's machine code
// to report whether a value was a compile-time constant.
// __declspec(noinline) void __fastcall donothing(int v)
// {
//     __asm nop;
// }
// int _______reserved = (donothing((const volatile int&)(const int&)1), donothing((const volatile int&)(const int&)2), 1);
// __declspec(noinline) void __fastcall donothing_(int v)
// {
//     donothing(v);
// }
//
// __declspec(noinline) bool testConstFunction()
// {
//     unsigned char* ptr;
//     if (*(unsigned short*)((ptr = (unsigned char*)_ReturnAddress())- 15) == *(unsigned short*)"\xC7\x05")
//     {
//         DWORD p;
//         VirtualProtect(ptr - 15, 15, PAGE_EXECUTE_READWRITE, &p);
//         p = *(int*)(ptr - 15 + 2 + 4);
//         memcpy(ptr - 15, "\xB8\x90\x90\x90\x90\xB8\x01\x00\x00\x00\x90\x90\x90\x90\x90", 15); //mov eax, 1
//         *(int*)(ptr - 14) = p;
//
return true; // } // else // { // DWORD p; // VirtualProtect(ptr - 10, 10, PAGE_EXECUTE_READWRITE, &p); // memcpy(ptr - 10, "\xB8\x00\x00\x00\x00\xB8\x00\x00\x00\x00", 10); //mov eax, 0 // return false; // } // } // // static int g_somewhere; // #define testConst(v) (_mm_pause(), g_somewhere = v, testConstFunction()) void test(); #define constsize 63 //每次复制的内存大小 #if 0 //1 开启无缓存测试:通过不断改变地址使cache失效 #define nocache + (i & 0xFFF) * 4096 #else //0 开启有缓存测试:源数据和目标数据总在cache中 #define nocache #endif #if constsize > 128 #define copysize constsize namespace ZMemoryCopy { #include "zmemcpyinc.h" } #endif #pragma runtime_checks( "s", restore ) int _tmain(int argc, _TCHAR* argv[]) { if (g_IsReleaseMode) puts("Release Mode"); else puts("Debug Mode"); char test[2000]; for (int i = 1; i <= 128; ++i) { memset(test, 0, 200); template_copys[i](test, pSource); if(memcmp(test, pSource, i) != 0) __debugbreak(); if (test[i] != (char)0) __debugbreak(); } for (int i = 1; i <= 499; ++i) { memset(test, 0, 500); zmemcpy(test, pSource, i); if (memcmp(test, pSource, i) != 0) __debugbreak(); if (test[i] != (char)0) __debugbreak(); } for (int i = 1; i <= 128; ++i) { memset(test, 0, 200); static_copys[i](test, pSource); if (memcmp(test, pSource, i) != 0) __debugbreak(); if (test[i] != (char)0) __debugbreak(); } memset(dest, 0, sizeof(dest)); memset(dest2, 0, sizeof(dest2)); memcpy(dest2, pSource, sizeof(dest2)); int volatile unknownSize = constsize; for (int j = 0; j < 4; ++j) { { __int64 t = __rdtsc(); for (int i = 0; i < 10000000; ++i) { ZMemoryCopy::copy<constsize>(dest nocache/*+ 1*/, pSource nocache /*+ 1*/); } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (zmemcopy template const size)\n", constsize, 100000000 / 1000000, t / 3000000000.0); } { __int64 t = __rdtsc(); for (int i = 0; i < 10000000; ++i) { _COMBINE(ZMemoryCopy::___copy_, constsize)(dest nocache/*+ 1*/, pSource nocache /*+ 1*/); } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz 
(zmemcopy static const size)\n", constsize, 100000000 / 1000000, t / 3000000000.0); } { __int64 t = __rdtsc(); for (int i = 0; i < 10000000; ++i) { memcpy(dest nocache/*+ 1*/, pSource nocache/*+ 1*/, constsize); } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (memcpy const size)\n", constsize, 100000000 / 1000000, t / 3000000000.0); } { __int64 t = __rdtsc(); for (int i = 0; i < 10000000; ++i) { zmemcpy(dest nocache/*+ 1*/, pSource nocache /*+ 1*/, constsize); } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (zmemcopy const size)\n", constsize, 100000000 / 1000000, t / 3000000000.0); } if (unknownSize < 128) { __int64 t = __rdtsc(); for (int i = 0; i < 10000000; ++i) { copys[unknownSize](dest nocache/*+ 1*/, pSource nocache /*+ 1*/); } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (zmemcopy unknown array direct)\n", constsize, 100000000 / 1000000, t / 3000000000.0); } if (unknownSize < 128) { __int64 t = __rdtsc(); for (int i = 0; i < 10000000; ++i) { zmemcpy_max128(dest nocache/*+ 1*/, pSource nocache /*+ 1*/, unknownSize); } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (zmemcopy unknown small size)\n", constsize, 100000000 / 1000000, t / 3000000000.0); } { __int64 t = __rdtsc(); for (int i = 0; i < 10000000; ++i) { zmemcpy(dest nocache/*+ 1*/, pSource nocache /*+ 1*/, unknownSize); } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (zmemcopy unknown size)\n", constsize, 100000000 / 1000000, t / 3000000000.0); } { __int64 t = __rdtsc(); for (int i = 0; i < 10000000; ++i) { memcpy(dest nocache/*+ 1*/, pSource nocache /*+ 1*/, unknownSize); } t = __rdtsc() - t; printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (memcpy unknown size)\n", constsize, 100000000 / 1000000, t / 3000000000.0); } puts(""); } return 0; } #pragma runtime_checks( "s", restore )
zmemcpy.h:
#pragma once
#include <windows.h>
#include <intrin.h>

// _SAFEBUFFERS expands to __declspec(safebuffers) when _MSC_VER >= 1600 and to
// nothing on older compilers.
#ifndef _SAFEBUFFERS
#if _MSC_VER >= 1600
#define _SAFEBUFFERS __declspec(safebuffers)
#else
#define _SAFEBUFFERS
#endif
#endif

namespace z
{
#ifndef _Z_IF_DEFINED
#define _Z_IF_DEFINED
    // Compile-time switch for use with MSVC __if_exists: If<true> has only the
    // member `True`, If<false> has only the member `False`.
    template<bool v> struct If { enum{ True = 1 }; };
    template<> struct If < false > { enum{ False = 1 }; };
    // Force z::If<false> and z::If<true> to be used,
    // so that __if_exists works on them.
#if _MSC_VER >= 1600
    static_assert(z::If<false>::False, "");
    static_assert(z::If<true>::True, "");
#else
    enum{ ___unknown = z::If<true>::True + z::If<false>::False };
#endif
#endif //_Z_IF_DEFINED

#pragma runtime_checks( "s", off)
    // #pragma runtime_checks only takes effect for templates when it is turned
    // off at the end of the .cpp, so the stack-check code is instead removed
    // here by patching the machine code by hand at run time.
    // This speeds up function execution in Debug mode.
    //
    // Patches the CALLER's code, located through _ReturnAddress():
    //  - if the 4 bytes at (return address - 17 + 5) are B8 CC CC CC, the 17
    //    bytes that end at the return address are overwritten with
    //    E9 0C 00 00 00 (x86 `jmp rel32`, +12) followed by twelve 0x90 (nop);
    //  - otherwise, if the byte 5 bytes before the return address is 0xE8
    //    (x86 `call rel32`, i.e. the call that got us here), those 5 bytes are
    //    overwritten with nop so the call is not executed again.
    // All general registers are saved and restored around the work (pushad/popad).
    // NOTE(review): B8 CC CC CC presumably matches the `mov eax, 0CCCCCCCCh`
    // of the debug-build stack fill that precedes the call - confirm against
    // the generated code. The VirtualProtect result is not checked and the
    // previous page protection is not restored.
    inline __declspec(noinline) _SAFEBUFFERS void RemoveCodeOf_InitESPBuffer()
    {
        __asm pushad;
        {
            unsigned char* ptr;
            if (*(unsigned int*)((ptr = (unsigned char*)_ReturnAddress() - 17) + 5) == *(unsigned int*)"\xB8\xCC\xCC\xCC")
            {
                DWORD p;
                VirtualProtect(ptr, 17, PAGE_EXECUTE_READWRITE, &p);
                //memset(ptr, 0x90, 17);
                memcpy(ptr, "\xE9\x0C\x00\x00\x00\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90", 17);
            }
            else if (*(unsigned char*)(ptr = (unsigned char*)_ReturnAddress() - 5) == 0xE8u)
            {
                DWORD p;
                VirtualProtect(ptr, 5, PAGE_EXECUTE_READWRITE, &p);
                memset(ptr, 0x90, 5);
            }
        }
        __asm popad;
    }

    // Patches the CALLER's epilogue, located through _ReturnAddress():
    //  - the 5-byte E8 call that invoked this function is replaced with nop;
    //  - if the 4 bytes at (call + 14) are 00 3B EC E8, the 7 bytes starting at
    //    (call + 15) - the `cmp ebp, esp` / `call __RTC_CheckEsp` pair named in
    //    the comments below - are replaced with nop and then with
    //    8B E5 5D C3 (x86 `mov esp, ebp; pop ebp; ret`).
    // Breaks into the debugger if the byte 5 bytes before the return address is
    // not 0xE8.
    // NOTE(review): the fixed offsets 5 + 1 + 3 + 5 assume one specific
    // instruction sequence between the call and the ESP check - confirm against
    // the generated code. The VirtualProtect result is not checked.
    inline __declspec(noinline) _SAFEBUFFERS void RemoveCodeOf_CheckESP()
    {
        unsigned char* ptr;
        if (*(unsigned char*)(ptr = (unsigned char*)_ReturnAddress() - 5) == 0xE8u)
        {
            DWORD p;
            VirtualProtect(ptr, 22, PAGE_EXECUTE_READWRITE, &p);
            memset(ptr, 0x90, 5);
            if (*(unsigned int*)(ptr + 5 + 1 + 3 + 5) == *(unsigned int*)"\x00\x3B\xEC\xE8")
            {
                memset(ptr + 5 + 1 + 3 + 5 + 1, 0x90, 7);
                memcpy(ptr + 5 + 1 + 3 + 5 + 1, "\x8B\xE5\x5D\xC3", 4);
                // memcpy(ptr + 5, "\x5F\x5E\x5B\x81\xC4\xC0\x00\x00\x00\x8B\xE5\x5D\xC3", 13);
            }
            //3B EC cmp ebp, esp
            //E8 xxxxxxxx call __RTC_CheckEsp
        }
        else
        {
            __debugbreak();
        }
    }
#pragma runtime_checks( "s", restore )
}

namespace ZMemoryCopy
{
#pragma runtime_checks( "s", off)
    // Copies in groups of 128 bytes.
    //
    // Copies floor(size / 128) * 128 bytes from src to dest with unaligned
    // 16-byte SSE2 moves, 128 bytes per loop iteration, prefetching the source
    // 128 and 192 bytes ahead. Any remainder below 128 bytes is NOT copied, and
    // nothing is copied when size < 128.
    // ESI and EDI are loaded once and then advanced inside the loop.
    inline __declspec(noinline) _SAFEBUFFERS void __copy_group(char* dest, const char* src, int size)
    {
        __asm
        {
            mov esi, dword ptr[src];
            mov edi, dword ptr[dest];
        }
        while ((size -= 0x80) >= 0)
        {
            __asm
            {
                movdqu xmm0, xmmword ptr[esi + 0x00];
                movdqu xmm1, xmmword ptr[esi + 0x10];
                movdqu xmm2, xmmword ptr[esi + 0x20];
                movdqu xmm3, xmmword ptr[esi + 0x30];
                movdqu xmm4, xmmword ptr[esi + 0x40];
                movdqu xmm5, xmmword ptr[esi + 0x50];
                movdqu xmm6, xmmword ptr[esi + 0x60];
                movdqu xmm7, xmmword ptr[esi + 0x70];
                prefetchnta[esi + 0x80];
                prefetchnta[esi + 0xC0];
                movdqu xmmword ptr[edi + 0x00], xmm0;
                movdqu xmmword ptr[edi + 0x10], xmm1;
                movdqu xmmword ptr[edi + 0x20], xmm2;
                movdqu xmmword ptr[edi + 0x30], xmm3;
                movdqu xmmword ptr[edi + 0x40], xmm4;
                movdqu xmmword ptr[edi + 0x50], xmm5;
                movdqu xmmword ptr[edi + 0x60], xmm6;
                movdqu xmmword ptr[edi + 0x70], xmm7;
                add esi, 0x80;
                add edi, 0x80;
            }
        }
    }

    // If the block size is known (and is a constant expression), this version
    // can be used directly.
    //
    // Copies exactly `copysize` bytes from src to dest. The instruction
    // sequence is selected at compile time with __if_exists on z::If<cond>:
    //  - copysize >= 4000: plain memcpy;
    //  - otherwise: 128-byte SSE2 groups while at least 256 bytes remain, one
    //    more 128-byte group if copysize >= 128, then 64 / 32 / 16 / 8-byte
    //    pieces chosen by the bits of copysize, then the last 1..7 bytes.
    // The enums offset2..offset6 carry the running byte offset from one
    // __if_exists block to the next.
    // NOTE(review): this relies on MSVC not treating the braces of an
    // __if_exists block as a new scope (the enums declared inside are used by
    // later blocks) - confirm against the MSVC documentation.
    template<int copysize>
    static _SAFEBUFFERS __forceinline void copy(char* dest, const char* src)
    {
        // #pragma runtime_checks only takes effect for templates when it is
        // turned off at the end of the .cpp, so the stack-check code is instead
        // removed here by patching the machine code by hand at run time.
        // This speeds up function execution in Debug mode.
        z::RemoveCodeOf_InitESPBuffer();
        __if_exists(z::If<(copysize >= 4000)>::True)
        {
            memcpy(dest, src, copysize);
        }
        __if_exists(z::If<(copysize >= 4000)>::False)
        {
            __asm
            {
                mov esi, dword ptr[src];
                mov edi, dword ptr[dest];
            }
            // For copysize >= 384 the `while` header below has no body inside
            // this block: the statement that follows it textually is the
            // 128-byte __asm group of the next __if_exists block.
            // NOTE(review): presumably that group therefore becomes the loop
            // body, and runs just once when 256 <= copysize < 384 - confirm.
            __if_exists(z::If<(copysize >= 0x80 * 3)>::True)
            {
                __asm
                {
                    prefetchnta[esi + 0x40];
                }
                int vsize = copysize;
                while ((vsize -= 0x80) >= 0x80)
            }
            //////////////////////////////////////////////////////////////////////////
            __if_exists(z::If<(copysize >= 0x80 * 2)>::True)
            {
                __asm
                {
                    movdqu xmm0, xmmword ptr[esi + 0x00];
                    movdqu xmm1, xmmword ptr[esi + 0x10];
                    movdqu xmm2, xmmword ptr[esi + 0x20];
                    movdqu xmm3, xmmword ptr[esi + 0x30];
                    movdqu xmm4, xmmword ptr[esi + 0x40];
                    movdqu xmm5, xmmword ptr[esi + 0x50];
                    movdqu xmm6, xmmword ptr[esi + 0x60];
                    movdqu xmm7, xmmword ptr[esi + 0x70];
                    prefetchnta[esi + 0x80];
                    prefetchnta[esi + 0xC0];
                    movdqu xmmword ptr[edi + 0x00], xmm0;
                    movdqu xmmword ptr[edi + 0x10], xmm1;
                    movdqu xmmword ptr[edi + 0x20], xmm2;
                    movdqu xmmword ptr[edi + 0x30], xmm3;
                    movdqu xmmword ptr[edi + 0x40], xmm4;
                    movdqu xmmword ptr[edi + 0x50], xmm5;
                    movdqu xmmword ptr[edi + 0x60], xmm6;
                    movdqu xmmword ptr[edi + 0x70], xmm7;
                    add esi, 0x80;
                    add edi, 0x80;
                }
            }
            enum { offset1 = 0 };
            //////////////////////////////////////////////////////////////////////////
            // Last full 128-byte group. ESI/EDI are not advanced afterwards;
            // the following pieces address the data through offset2 instead.
            __if_exists(z::If<(copysize >= 0x80)>::True)
            {
                __asm
                {
                    movdqu xmm0, xmmword ptr[esi + 0x00];
                    movdqu xmm1, xmmword ptr[esi + 0x10];
                    movdqu xmm2, xmmword ptr[esi + 0x20];
                    movdqu xmm3, xmmword ptr[esi + 0x30];
                    movdqu xmm4, xmmword ptr[esi + 0x40];
                    movdqu xmm5, xmmword ptr[esi + 0x50];
                    movdqu xmm6, xmmword ptr[esi + 0x60];
                    movdqu xmm7, xmmword ptr[esi + 0x70];
                }
                __if_exists(z::If<(copysize & 0x60)>::True)
                {
                    __asm
                    {
                        prefetchnta[esi + 0x80];
                    }
                }
                __asm
                {
                    movdqu xmmword ptr[edi + 0x00], xmm0;
                    movdqu xmmword ptr[edi + 0x10], xmm1;
                    movdqu xmmword ptr[edi + 0x20], xmm2;
                    movdqu xmmword ptr[edi + 0x30], xmm3;
                    movdqu xmmword ptr[edi + 0x40], xmm4;
                    movdqu xmmword ptr[edi + 0x50], xmm5;
                    movdqu xmmword ptr[edi + 0x60], xmm6;
                    movdqu xmmword ptr[edi + 0x70], xmm7;
                    // add esi, 0x80;
                    // add edi, 0x80;
                }
                enum { offset2 = 0x80 };
            }
            __if_exists(z::If<(copysize >= 0x80)>::False)
            {
                enum { offset2 = 0 };
            }
            //////////////////////////////////////////////////////////////////////////
            // 64-byte piece
            __if_exists(z::If<(copysize & 0x40)>::True)
            {
                __asm
                {
                    movdqu xmm0, xmmword ptr[esi + offset2 + 0x00];
                    movdqu xmm1, xmmword ptr[esi + offset2 + 0x10];
                    movdqu xmm2, xmmword ptr[esi + offset2 + 0x20];
                    movdqu xmm3, xmmword ptr[esi + offset2 + 0x30];
                    movdqu xmmword ptr[edi + offset2 + 0x00], xmm0;
                    movdqu xmmword ptr[edi + offset2 + 0x10], xmm1;
                    movdqu xmmword ptr[edi + offset2 + 0x20], xmm2;
                    movdqu xmmword ptr[edi + offset2 + 0x30], xmm3;
                }
                enum { offset3 = offset2 + 0x40 };
            }
            __if_exists(z::If<(copysize & 0x40)>::False)
            {
                enum { offset3 = offset2 };
            }
            //////////////////////////////////////////////////////////////////////////
            // 32-byte piece
            __if_exists(z::If<(copysize & 0x20)>::True)
            {
                __asm
                {
                    movdqu xmm4, xmmword ptr[esi + offset3 + 0x00];
                    movdqu xmm5, xmmword ptr[esi + offset3 + 0x10];
                    movdqu xmmword ptr[edi + offset3 + 0x00], xmm4;
                    movdqu xmmword ptr[edi + offset3 + 0x10], xmm5;
                }
                enum { offset4 = offset3 + 0x20 };
            }
            __if_exists(z::If<(copysize & 0x20)>::False)
            {
                enum { offset4 = offset3 };
            }
            //////////////////////////////////////////////////////////////////////////
            // 16-byte piece
            __if_exists(z::If<(copysize & 0x10)>::True)
            {
                __asm
                {
                    movdqu xmm6, xmmword ptr[esi + offset4 + 0x00];
                    movdqu xmmword ptr[edi + offset4 + 0x00], xmm6;
                }
                enum { offset5 = offset4 + 0x10 };
            }
            __if_exists(z::If<(copysize & 0x10)>::False)
            {
                enum { offset5 = offset4 };
            }
            //////////////////////////////////////////////////////////////////////////
            // 8-byte piece
            __if_exists(z::If<(copysize & 0x8)>::True)
            {
                __asm
                {
                    movlpd xmm7, qword ptr[esi + offset5];
                    movlpd qword ptr[edi + offset5], xmm7;
                }
                enum { offset6 = offset5 + 0x8 };
            }
            __if_exists(z::If<(copysize & 0x8)>::False)
            {
                enum { offset6 = offset5 };
            }
            //////////////////////////////////////////////////////////////////////////
            // Last 1..7 bytes. `copydone` is a compile-time flag: once a branch
            // redefines it to true, the remaining branches are not selected.
            __if_exists(z::If<(copysize & 0x7)>::True)
            {
                enum { copydone = false };
                {
                    // Remainder 5, 6 or 7 with at least 8 bytes in total: 8-byte
                    // move variant. copy_offset is negative, so the move ends
                    // exactly on the last byte and re-copies some earlier bytes.
                    __if_exists(z::If < ((copysize & 0x7) > 4) && copysize >= 8 > ::True) // 5 6 7
                    {
                        enum{ copy_offset = (copysize & 0x7) - 8 };
                        __asm
                        {
                            movlpd xmm0, qword ptr[esi + offset6 + copy_offset];
                            movlpd qword ptr[edi + offset6 + copy_offset], xmm0;
                        }
                        enum { copydone = true };
                        //return;
                    }
                    // Remainder 4, 5, 6 or 7 when the buffer is too small for
                    // the 8-byte move: move 4 bytes first.
                    __if_exists(z::If <!copydone && ((copysize & 0x7) >= 4)>::True) // 4 5 6 7
                    {
                        __asm
                        {
                            mov eax, dword ptr[esi + offset6];
                            mov dword ptr[edi + offset6], eax;
                        }
                        enum{ offset6 = offset6 + 4 };
                    }
                    // 3 bytes left with at least 4 bytes in total: one 4-byte
                    // move that starts one byte early.
                    __if_exists(z::If <!copydone && ((copysize & 0x3) == 3) && (copysize >= 4)> ::True) //3
                    {
                        __asm
                        {
                            mov eax, dword ptr[esi + offset6 - 1];
                            mov dword ptr[edi + offset6 - 1], eax;
                        }
                        enum { copydone = true };
                        //return;
                    }
                    // 3 bytes left otherwise: a 2-byte move plus a 1-byte move.
                    __if_exists(z::If <!copydone && ((copysize & 0x3) == 3)> ::True) //3
                    {
                        __asm
                        {
                            mov ax, word ptr[esi + offset6];
                            mov word ptr[edi + offset6], ax;
                            mov al, byte ptr[esi + offset6 + 2];
                            mov byte ptr[edi + offset6 + 2], al;
                        }
                        enum { copydone = true };
                        //return;
                    }
                    __if_exists(z::If <!copydone && ((copysize & 0x3) == 2) > ::True) //2
                    {
                        __asm
                        {
                            mov ax, word ptr[esi + offset6];
                            mov word ptr[edi + offset6], ax;
                        }
                        enum { copydone = true };
                        //return;
                    }
                    __if_exists(z::If <!copydone && ((copysize & 0x3) == 1) > ::True) //1
                    {
                        __asm
                        {
                            mov al, byte ptr[esi + offset6];
                            mov byte ptr[edi + offset6], al;
                        }
                        enum { copydone = true };
                        //return;
                    }
                    __if_exists(z::If <!copydone && ((copysize & 0x3) == 0) > ::True) //0
                    {
                        enum { copydone = true };
                        //return;
                    }
                    // Compile-time guard: some branch above must have handled
                    // the remainder.
                    __if_exists(z::If<!copydone>::True)
                    {
                        static_assert(0, "");
                    }
                }
            }
        }
        z::RemoveCodeOf_CheckESP();
        __asm nop;
    }

    // Zero-byte copy: does nothing. Entry 0 of the dispatch tables.
    inline void ___copy_0(char* dest, const char* src) {}
#pragma runtime_checks( "s", restore)

// Each define/include pair below includes zmemcpyinc.h once with `copysize`
// set to one fixed size.
// NOTE(review): zmemcpyinc.h is not visible here; presumably it defines
// ___copy_<copysize> and then #undefs copysize - confirm.
#define copysize 1
#include "zmemcpyinc.h"
#define copysize 2
#include "zmemcpyinc.h"
#define copysize 3
#include "zmemcpyinc.h"
#define copysize 4
#include "zmemcpyinc.h"
#define copysize 5
#include "zmemcpyinc.h"
#define copysize 6
#include "zmemcpyinc.h"
#define copysize 7
#include "zmemcpyinc.h"
#define copysize 8
#include "zmemcpyinc.h"
#define copysize 9
#include "zmemcpyinc.h"
#define copysize 10
#include "zmemcpyinc.h"
#define copysize 11
#include "zmemcpyinc.h"
#define copysize 12
#include "zmemcpyinc.h"
#define copysize 13
#include "zmemcpyinc.h"
#define copysize 14
#include "zmemcpyinc.h"
#define copysize 15
#include "zmemcpyinc.h"
#define copysize 16
#include "zmemcpyinc.h"
#define copysize 17
#include "zmemcpyinc.h"
#define copysize 18
#include "zmemcpyinc.h"
#define copysize 19
#include "zmemcpyinc.h"
#define copysize 20
#include "zmemcpyinc.h"
#define copysize 21
#include "zmemcpyinc.h"
#define copysize 22
#include "zmemcpyinc.h"
#define copysize 23
#include "zmemcpyinc.h"
#define copysize 24
#include "zmemcpyinc.h"
#define copysize 25
#include "zmemcpyinc.h"
#define copysize 26
#include "zmemcpyinc.h"
#define copysize 27
#include "zmemcpyinc.h"
#define copysize 28
#include "zmemcpyinc.h"
#define copysize 29
#include "zmemcpyinc.h"
#define copysize 30
#include "zmemcpyinc.h"
#define copysize 31
#include "zmemcpyinc.h"
#define copysize 32
#include "zmemcpyinc.h"
#define copysize 33
#include "zmemcpyinc.h"
#define copysize 34
#include "zmemcpyinc.h"
#define copysize 35
#include "zmemcpyinc.h"
#define copysize 36
#include "zmemcpyinc.h"
#define copysize 37
#include "zmemcpyinc.h"
#define copysize 38
#include "zmemcpyinc.h"
#define copysize 39
#include "zmemcpyinc.h"
#define copysize 40
#include "zmemcpyinc.h"
#define copysize 41
#include "zmemcpyinc.h"
#define copysize 42
#include "zmemcpyinc.h"
#define copysize 43
#include "zmemcpyinc.h"
#define copysize 44
#include "zmemcpyinc.h"
#define copysize 45
#include "zmemcpyinc.h"
#define copysize 46
#include "zmemcpyinc.h"
#define copysize 47
#include "zmemcpyinc.h"
#define copysize 48
#include "zmemcpyinc.h"
#define copysize 49
#include "zmemcpyinc.h"
#define copysize 50
#include "zmemcpyinc.h"
#define copysize 51
#include "zmemcpyinc.h"
#define copysize 52
#include "zmemcpyinc.h"
#define copysize 53
#include "zmemcpyinc.h"
#define copysize 54
#include "zmemcpyinc.h"
#define copysize 55
#include "zmemcpyinc.h"
#define copysize 56
#include "zmemcpyinc.h"
#define copysize 57
#include "zmemcpyinc.h"
#define copysize 58
#include "zmemcpyinc.h"
#define copysize 59
#include "zmemcpyinc.h"
#define copysize 60
#include "zmemcpyinc.h"
#define copysize 61
#include "zmemcpyinc.h"
#define copysize 62
#include "zmemcpyinc.h"
#define copysize 63
#include "zmemcpyinc.h"
#define copysize 64
#include "zmemcpyinc.h"
#define copysize 65
#include "zmemcpyinc.h"
#define copysize 66
#include "zmemcpyinc.h"
#define copysize 67
#include "zmemcpyinc.h"
#define copysize 68
#include "zmemcpyinc.h"
#define copysize 69
#include "zmemcpyinc.h"
#define copysize 70
#include "zmemcpyinc.h"
#define copysize 71
#include "zmemcpyinc.h" #define copysize 72 #include "zmemcpyinc.h" #define copysize 73 #include "zmemcpyinc.h" #define copysize 74 #include "zmemcpyinc.h" #define copysize 75 #include "zmemcpyinc.h" #define copysize 76 #include "zmemcpyinc.h" #define copysize 77 #include "zmemcpyinc.h" #define copysize 78 #include "zmemcpyinc.h" #define copysize 79 #include "zmemcpyinc.h" #define copysize 80 #include "zmemcpyinc.h" #define copysize 81 #include "zmemcpyinc.h" #define copysize 82 #include "zmemcpyinc.h" #define copysize 83 #include "zmemcpyinc.h" #define copysize 84 #include "zmemcpyinc.h" #define copysize 85 #include "zmemcpyinc.h" #define copysize 86 #include "zmemcpyinc.h" #define copysize 87 #include "zmemcpyinc.h" #define copysize 88 #include "zmemcpyinc.h" #define copysize 89 #include "zmemcpyinc.h" #define copysize 90 #include "zmemcpyinc.h" #define copysize 91 #include "zmemcpyinc.h" #define copysize 92 #include "zmemcpyinc.h" #define copysize 93 #include "zmemcpyinc.h" #define copysize 94 #include "zmemcpyinc.h" #define copysize 95 #include "zmemcpyinc.h" #define copysize 96 #include "zmemcpyinc.h" #define copysize 97 #include "zmemcpyinc.h" #define copysize 98 #include "zmemcpyinc.h" #define copysize 99 #include "zmemcpyinc.h" #define copysize 100 #include "zmemcpyinc.h" #define copysize 101 #include "zmemcpyinc.h" #define copysize 102 #include "zmemcpyinc.h" #define copysize 103 #include "zmemcpyinc.h" #define copysize 104 #include "zmemcpyinc.h" #define copysize 105 #include "zmemcpyinc.h" #define copysize 106 #include "zmemcpyinc.h" #define copysize 107 #include "zmemcpyinc.h" #define copysize 108 #include "zmemcpyinc.h" #define copysize 109 #include "zmemcpyinc.h" #define copysize 110 #include "zmemcpyinc.h" #define copysize 111 #include "zmemcpyinc.h" #define copysize 112 #include "zmemcpyinc.h" #define copysize 113 #include "zmemcpyinc.h" #define copysize 114 #include "zmemcpyinc.h" #define copysize 115 #include "zmemcpyinc.h" #define copysize 116 
// Completes the instantiation for the copysize defined on the previous line,
// then instantiates the fixed-size routines ___copy_117 .. ___copy_128.
// zmemcpyinc.h #undefs copysize at its end, so each #define below is fresh.
#include "zmemcpyinc.h"
#define copysize 117
#include "zmemcpyinc.h"
#define copysize 118
#include "zmemcpyinc.h"
#define copysize 119
#include "zmemcpyinc.h"
#define copysize 120
#include "zmemcpyinc.h"
#define copysize 121
#include "zmemcpyinc.h"
#define copysize 122
#include "zmemcpyinc.h"
#define copysize 123
#include "zmemcpyinc.h"
#define copysize 124
#include "zmemcpyinc.h"
#define copysize 125
#include "zmemcpyinc.h"
#define copysize 126
#include "zmemcpyinc.h"
#define copysize 127
#include "zmemcpyinc.h"
#define copysize 128
#include "zmemcpyinc.h"

#pragma runtime_checks( "s", off )

// pCopyFunc(n) names the fixed-size routine ___copy_<n>.
// ZMEMCPY_FIXED_COPIERS is the initializer list for a 129-entry dispatch
// table whose index is the byte count (0..128). Both macros are removed
// again after the two functions that expand them.
#define pCopyFunc(x) ZMemoryCopy::___copy_##x
#define ZMEMCPY_FIXED_COPIERS \
    ZMemoryCopy::___copy_0, \
    pCopyFunc(1), pCopyFunc(2), pCopyFunc(3), pCopyFunc(4), pCopyFunc(5), pCopyFunc(6), pCopyFunc(7), pCopyFunc(8), \
    pCopyFunc(9), pCopyFunc(10), pCopyFunc(11), pCopyFunc(12), pCopyFunc(13), pCopyFunc(14), pCopyFunc(15), pCopyFunc(16), \
    pCopyFunc(17), pCopyFunc(18), pCopyFunc(19), pCopyFunc(20), pCopyFunc(21), pCopyFunc(22), pCopyFunc(23), pCopyFunc(24), \
    pCopyFunc(25), pCopyFunc(26), pCopyFunc(27), pCopyFunc(28), pCopyFunc(29), pCopyFunc(30), pCopyFunc(31), pCopyFunc(32), \
    pCopyFunc(33), pCopyFunc(34), pCopyFunc(35), pCopyFunc(36), pCopyFunc(37), pCopyFunc(38), pCopyFunc(39), pCopyFunc(40), \
    pCopyFunc(41), pCopyFunc(42), pCopyFunc(43), pCopyFunc(44), pCopyFunc(45), pCopyFunc(46), pCopyFunc(47), pCopyFunc(48), \
    pCopyFunc(49), pCopyFunc(50), pCopyFunc(51), pCopyFunc(52), pCopyFunc(53), pCopyFunc(54), pCopyFunc(55), pCopyFunc(56), \
    pCopyFunc(57), pCopyFunc(58), pCopyFunc(59), pCopyFunc(60), pCopyFunc(61), pCopyFunc(62), pCopyFunc(63), pCopyFunc(64), \
    pCopyFunc(65), pCopyFunc(66), pCopyFunc(67), pCopyFunc(68), pCopyFunc(69), pCopyFunc(70), pCopyFunc(71), pCopyFunc(72), \
    pCopyFunc(73), pCopyFunc(74), pCopyFunc(75), pCopyFunc(76), pCopyFunc(77), pCopyFunc(78), pCopyFunc(79), pCopyFunc(80), \
    pCopyFunc(81), pCopyFunc(82), pCopyFunc(83), pCopyFunc(84), pCopyFunc(85), pCopyFunc(86), pCopyFunc(87), pCopyFunc(88), \
    pCopyFunc(89), pCopyFunc(90), pCopyFunc(91), pCopyFunc(92), pCopyFunc(93), pCopyFunc(94), pCopyFunc(95), pCopyFunc(96), \
    pCopyFunc(97), pCopyFunc(98), pCopyFunc(99), pCopyFunc(100), pCopyFunc(101), pCopyFunc(102), pCopyFunc(103), pCopyFunc(104), \
    pCopyFunc(105), pCopyFunc(106), pCopyFunc(107), pCopyFunc(108), pCopyFunc(109), pCopyFunc(110), pCopyFunc(111), pCopyFunc(112), \
    pCopyFunc(113), pCopyFunc(114), pCopyFunc(115), pCopyFunc(116), pCopyFunc(117), pCopyFunc(118), pCopyFunc(119), pCopyFunc(120), \
    pCopyFunc(121), pCopyFunc(122), pCopyFunc(123), pCopyFunc(124), pCopyFunc(125), pCopyFunc(126), pCopyFunc(127), pCopyFunc(128),

// Copies `size` bytes from `src` to `dest`.
//
// For size >= 128, __copy_group(dest, src, size) is called first; it is
// defined outside this excerpt.
// NOTE(review): presumably it copies the leading whole 128-byte groups -- confirm.
// The remaining (size & 127) bytes, which start at offset (size & ~127), are
// then handed to the fixed-size routine selected from the dispatch table.
// Nothing is called for size == 0.
__forceinline void zmemcpy(char* dest, const char* src, size_t size)
{
    static void(*const static_copys[129])(char* dest, const char* src) = { ZMEMCPY_FIXED_COPIERS };

    const size_t tail_bytes = size & 127;         // bytes left after the 128-byte groups
    const size_t tail_start = size - tail_bytes;  // same value as size & ~127

    if (size >= 128)
    {
        __copy_group(dest, src, size);
    }
    if (tail_bytes != 0)
    {
        static_copys[tail_bytes](dest + tail_start, src + tail_start);
    }
}

// Copies `size` bytes from `src` to `dest` through a single table dispatch.
//
// The caller must guarantee size <= 128: that bound is only promised to the
// optimizer via __assume and is not checked at run time, so a larger value
// indexes past the end of the table.
__forceinline void zmemcpy_max128(char* dest, const char* src, size_t size)
{
    static void(*const static_copys[129])(char* dest, const char* src) = { ZMEMCPY_FIXED_COPIERS };

    __assume(size <= 128);
    void(*const fixed_copy)(char*, const char*) = static_copys[size];
    fixed_copy(dest, src);
}

#undef ZMEMCPY_FIXED_COPIERS
#undef pCopyFunc

#pragma runtime_checks( "s", restore)
} // closes the scope opened before this excerpt (presumably namespace ZMemoryCopy)

using ZMemoryCopy::zmemcpy;
using ZMemoryCopy::zmemcpy_max128;
zmemcpyinc.h :
/*
 * zmemcpyinc.h -- body template for one fixed-size copy routine.
 *
 * This file is meant to be included many times. Before every inclusion the
 * includer must "#define copysize <N>"; the file then defines
 *     inline void ___copy_<N>(char* dest, const char* src)
 * (the name is built by END_WITH_copysize through two-level token pasting, so
 * that copysize is expanded before it is pasted) and finally #undefs copysize.
 * The function is bracketed by #pragma runtime_checks("s", off) / ("s", restore).
 *
 * Structure of the generated body:
 *  - src and dest are loaded into esi and edi by the first __asm block; every
 *    later __asm block addresses memory relative to those two registers.
 *    NOTE(review): this is 32-bit x86 MSVC inline assembly and it assumes esi/edi
 *    keep their values between separate __asm blocks -- confirm against the
 *    MSVC inline assembler documentation for the compilers in use.
 *  - Each size-dependent piece is wrapped in __if_exists(z::If<cond>::True) or
 *    ::False. z::If is declared outside this file.
 *    NOTE(review): presumably If<true> only has a member True and If<false> only
 *    a member False, which makes each wrapper a compile-time "if" -- confirm.
 *  - The code relies on the braces of __if_exists not opening a scope: the
 *    enumerators offset2 .. offset6 declared inside one wrapper are used by the
 *    wrappers that follow.
 *
 * Steps, in order:
 *  1. copysize >= 0x180: prefetch [esi+0x40], then "int vsize = copysize;" and a
 *     bare "while ((vsize -= 0x80) >= 0x80)" header with no statement of its own.
 *     NOTE(review): the loop body appears to be the __asm block of step 2 -- confirm.
 *  2. copysize >= 0x100: copy 128 bytes through xmm0..xmm7 with movdqu, prefetch
 *     [esi+0x80] and [esi+0xC0], then advance esi and edi by 0x80.
 *  3. offset1 = 0 is declared; it is not referenced anywhere in this file.
 *  4. copysize >= 0x80: copy 128 bytes through xmm0..xmm7 without advancing the
 *     registers (the add instructions are commented out); offset2 becomes 0x80,
 *     otherwise 0. [esi+0x80] is prefetched only when (copysize & 0x60) != 0.
 *  5. Bits 0x40, 0x20, 0x10 and 0x8 of copysize: copy 64, 32, 16 and 8 bytes at
 *     the running compile-time offset (offset3 .. offset6 accumulate it). The
 *     8-byte piece uses movlpd through xmm7.
 *  6. (copysize & 0x7) != 0, the 1..7 byte tail. copydone = false is declared and
 *     an inner brace block follows, in which the wrappers re-declare
 *     copydone = true (and offset6 = offset6 + 4) so that later conditions see
 *     the updated values:
 *       - tail of 5..7 and copysize >= 8: one 8-byte move placed so that it ends
 *         on the last byte (copy_offset is negative, so it re-copies bytes that
 *         were already copied);
 *       - otherwise a tail >= 4 first moves 4 bytes and bumps offset6 by 4;
 *       - a remainder of 3 with copysize >= 4: one dword starting one byte
 *         earlier; a remainder of 3 otherwise: a word plus a byte;
 *       - a remainder of 2: a word; of 1: a byte; of 0: nothing left to do;
 *       - static_assert(0, "") fires if none of the cases set copydone.
 */
#ifndef END_WITH_copysize #ifndef _COMBINE2 #define _COMBINE2(x,y) x##y #define _COMBINE(x,y) _COMBINE2(x,y) #endif #define END_WITH_copysize(x) _COMBINE(x, copysize) #endif #pragma runtime_checks( "s", off ) inline void END_WITH_copysize(___copy_)(char* dest, const char* src) { __asm { mov esi, dword ptr[src]; mov edi, dword ptr[dest]; } __if_exists(z::If<(copysize >= 0x80 * 3)>::True) { __asm { prefetchnta[esi + 0x40]; } int vsize = copysize; while ((vsize -= 0x80) >= 0x80) } ////////////////////////////////////////////////////////////////////////// __if_exists(z::If<(copysize >= 0x80 * 2)>::True) { __asm { movdqu xmm0, xmmword ptr[esi + 0x00]; movdqu xmm1, xmmword ptr[esi + 0x10]; movdqu xmm2, xmmword ptr[esi + 0x20]; movdqu xmm3, xmmword ptr[esi + 0x30]; movdqu xmm4, xmmword ptr[esi + 0x40]; movdqu xmm5, xmmword ptr[esi + 0x50]; movdqu xmm6, xmmword ptr[esi + 0x60]; movdqu xmm7, xmmword ptr[esi + 0x70]; prefetchnta[esi + 0x80]; prefetchnta[esi + 0xC0]; movdqu xmmword ptr[edi + 0x00], xmm0; movdqu xmmword ptr[edi + 0x10], xmm1; movdqu xmmword ptr[edi + 0x20], xmm2; movdqu xmmword ptr[edi + 0x30], xmm3; movdqu xmmword ptr[edi + 0x40], xmm4; movdqu xmmword ptr[edi + 0x50], xmm5; movdqu xmmword ptr[edi + 0x60], xmm6; movdqu xmmword ptr[edi + 0x70], xmm7; add esi, 0x80; add edi, 0x80; } } enum { offset1 = 0 }; ////////////////////////////////////////////////////////////////////////// __if_exists(z::If<(copysize >= 0x80)>::True) { __asm { movdqu xmm0, xmmword ptr[esi + 0x00]; movdqu xmm1, xmmword ptr[esi + 0x10]; movdqu xmm2, xmmword ptr[esi + 0x20]; movdqu xmm3, xmmword ptr[esi + 0x30]; movdqu xmm4, xmmword ptr[esi + 0x40]; movdqu xmm5, xmmword ptr[esi + 0x50]; movdqu xmm6, xmmword ptr[esi + 0x60]; movdqu xmm7, xmmword ptr[esi + 0x70]; } __if_exists(z::If<(copysize & 0x60)>::True) { __asm { prefetchnta[esi + 0x80]; } } __asm { movdqu xmmword ptr[edi + 0x00], xmm0; movdqu xmmword ptr[edi + 0x10], xmm1; movdqu xmmword ptr[edi + 0x20], xmm2; movdqu xmmword ptr[edi + 
0x30], xmm3; movdqu xmmword ptr[edi + 0x40], xmm4; movdqu xmmword ptr[edi + 0x50], xmm5; movdqu xmmword ptr[edi + 0x60], xmm6; movdqu xmmword ptr[edi + 0x70], xmm7; // add esi, 0x80; // add edi, 0x80; } enum { offset2 = 0x80 }; } __if_exists(z::If<(copysize >= 0x80)>::False) { enum { offset2 = 0 }; } ////////////////////////////////////////////////////////////////////////// __if_exists(z::If<(copysize & 0x40)>::True) { __asm { movdqu xmm0, xmmword ptr[esi + offset2 + 0x00]; movdqu xmm1, xmmword ptr[esi + offset2 + 0x10]; movdqu xmm2, xmmword ptr[esi + offset2 + 0x20]; movdqu xmm3, xmmword ptr[esi + offset2 + 0x30]; movdqu xmmword ptr[edi + offset2 + 0x00], xmm0; movdqu xmmword ptr[edi + offset2 + 0x10], xmm1; movdqu xmmword ptr[edi + offset2 + 0x20], xmm2; movdqu xmmword ptr[edi + offset2 + 0x30], xmm3; } enum { offset3 = offset2 + 0x40 }; } __if_exists(z::If<(copysize & 0x40)>::False) { enum { offset3 = offset2 }; } ////////////////////////////////////////////////////////////////////////// __if_exists(z::If<(copysize & 0x20)>::True) { __asm { movdqu xmm4, xmmword ptr[esi + offset3 + 0x00]; movdqu xmm5, xmmword ptr[esi + offset3 + 0x10]; movdqu xmmword ptr[edi + offset3 + 0x00], xmm4; movdqu xmmword ptr[edi + offset3 + 0x10], xmm5; } enum { offset4 = offset3 + 0x20 }; } __if_exists(z::If<(copysize & 0x20)>::False) { enum { offset4 = offset3 }; } ////////////////////////////////////////////////////////////////////////// __if_exists(z::If<(copysize & 0x10)>::True) { __asm { movdqu xmm6, xmmword ptr[esi + offset4 + 0x00]; movdqu xmmword ptr[edi + offset4 + 0x00], xmm6; } enum { offset5 = offset4 + 0x10 }; } __if_exists(z::If<(copysize & 0x10)>::False) { enum { offset5 = offset4 }; } ////////////////////////////////////////////////////////////////////////// __if_exists(z::If<(copysize & 0x8)>::True) { __asm { movlpd xmm7, qword ptr[esi + offset5]; movlpd qword ptr[edi + offset5], xmm7; } enum { offset6 = offset5 + 0x8 }; } __if_exists(z::If<(copysize & 0x8)>::False) { 
enum { offset6 = offset5 }; } ////////////////////////////////////////////////////////////////////////// __if_exists(z::If<(copysize & 0x7)>::True) { enum { copydone = false }; { __if_exists(z::If < ((copysize & 0x7) > 4) && copysize >= 8 > ::True) // 5 6 7 //8-byte move variant { enum{ copy_offset = (copysize & 0x7) - 8 }; __asm { movlpd xmm0, qword ptr[esi + offset6 + copy_offset]; movlpd qword ptr[edi + offset6 + copy_offset], xmm0; } enum { copydone = true }; //return; } __if_exists(z::If <!copydone && ((copysize & 0x7) >= 4)>::True) // 4 5 6 7 //buffer too short for the 8-byte move, so move 4 bytes first { __asm { mov eax, dword ptr[esi + offset6]; mov dword ptr[edi + offset6], eax; } enum{ offset6 = offset6 + 4 }; } __if_exists(z::If <!copydone && ((copysize & 0x3) == 3) && (copysize >= 4)> ::True) //3 { __asm { mov eax, dword ptr[esi + offset6 - 1]; mov dword ptr[edi + offset6 - 1], eax; } enum { copydone = true }; //return; } __if_exists(z::If <!copydone && ((copysize & 0x3) == 3)> ::True) //3 { __asm { mov ax, word ptr[esi + offset6]; mov word ptr[edi + offset6], ax; mov al, byte ptr[esi + offset6 + 2]; mov byte ptr[edi + offset6 + 2], al; } enum { copydone = true }; //return; } __if_exists(z::If <!copydone && ((copysize & 0x3) == 2) > ::True) //2 { __asm { mov ax, word ptr[esi + offset6]; mov word ptr[edi + offset6], ax; } enum { copydone = true }; //return; } __if_exists(z::If <!copydone && ((copysize & 0x3) == 1) > ::True) //1 { __asm { mov al, byte ptr[esi + offset6]; mov byte ptr[edi + offset6], al; } enum { copydone = true }; //return; } __if_exists(z::If <!copydone && ((copysize & 0x3) == 0) > ::True) //0 { enum { copydone = true }; //return; } __if_exists(z::If<!copydone>::True) { static_assert(0, ""); } } } } #pragma runtime_checks( "s", restore ) #undef copysize