xmemcpy改进版

xmemcpy改进版:利用movdqu速度快的特点,通过内联和长度常量化来提高小内存memcpy的性能

xmemcpy来自github beyondszine/progs/C/c_progs/memcpy.c ,不知道是不是原作者,这里进行了部分改进

------2016-2-28注意1:以下测试中的缓冲区由于被反复读取,始终位于L1 cache中(类似于栈内存);如果数据总是位于超出cache的内存中,则受内存速度拖累,改进版与memcpy很难拉开差距,但仍然有一定的效果

------2016-2-28注意2:DEBUG下速度会很慢,除非关闭/GS或用 #pragma runtime_checks( "s", restore ) (此编译杂注对模板无效)

------2016-3-5  注意3:参看zmemcpy改进版,对debug模式有相当大的提高 http://blog.csdn.net/superzmy/article/details/50810343


预期结果:

All time to memcpy 80 * 100M is 0.248s in 3GHz (xmemcopy)
All time to memcpy 80 * 100M is 0.476s in 3GHz (xmemcpy)
All time to memcpy 80 * 100M is 0.778s in 3GHz (xmemcpy unknownSize)
All time to memcpy 80 * 100M is 0.232s in 3GHz (movdq)
All time to memcpy 80 * 100M is 0.257s in 3GHz (movdq  unalign)
All time to memcpy 81 * 100M is 0.298s in 3GHz (movdq)
All time to memcpy 81 * 100M is 0.264s in 3GHz (movdq unalign)
All time to memcpy 400 * 100M is 1.334s in 3GHz (xmemcopy)
All time to memcpy 400 * 100M is 1.236s in 3GHz (xmemcopy unalign)
All time to memcpy 400 * 100M is 1.819s in 3GHz (xmemcpy)
All time to memcpy 400 * 100M is 3.051s in 3GHz (rep movs)
All time to memcpy 400 * 100M is 2.984s in 3GHz (rep movs unalign)
All time to memcpy 400 * 100M is 3.015s in 3GHz (rep movs handwrite asm)
All time to memcpy 401 * 100M is 3.093s in 3GHz (rep movs)
All time to memcpy 401 * 100M is 3.193s in 3GHz (rep movs handwrite asm)
All time to memcpy 80 * 100M is 1.216s in 3GHz (rep movs handwrite asm)
All time to memcpy 4000 * 100M is 15.254s in 3GHz (rep movs handwrite asm)
All time to memcpy 80 * 100M is 1.824s in 3GHz (call _memcpy)
All time to memcpy 81 * 100M is 1.828s in 3GHz (call _memcpy)
All time to memcpy 81 * 100M is 1.779s in 3GHz (call _memcpy unalign)
All time to memcpy 400 * 100M is 2.554s in 3GHz (call _memcpy)
All time to memcpy 401 * 100M is 2.777s in 3GHz (call _memcpy)
All time to memcpy 401 * 100M is 2.725s in 3GHz (call _memcpy unalign)
All time to memcpy 4000 * 100M is 14.379s in 3GHz (call _memcpy)
以上代码vs2013编译 E3 1230V2上运行

// ConsoleApplication3.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include <windows.h>
#include <intrin.h>
#include <assert.h>
// Source buffers for the copy benchmarks. The array size is the number of
// bytes copied per call; elements not covered by the string initializer are
// zero-filled.

// 80-byte source: 36 characters of text, the remainder zero.
char data80[80] = "abcdefghijklmnopqrstuvwxyz0123456789";

// 400-byte source: the literal is 399 characters long, so together with its
// terminating NUL it fills the array exactly.
char data400[400] = 
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"012345678901234567890123456789012345678";

// 4000-byte source: 360 characters of text, the remainder zero.
char data4000[4000] =
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
;

// 401-byte source (one byte more than data400, so the length is not a
// multiple of 4): 360 characters of text, the remainder zero.
char data401[401] =
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
"abcdefghijklmnopqrstuvwxyz0123456789"
;
// 81-byte source (one byte more than data80): 36 characters of text, the
// remainder zero.
char data81[81] = "abcdefghijklmnopqrstuvwxyz0123456789";

// optimize memcpy less than 120bytes
// char a[32], b[32]; a = b;  is faster than memcpy(a, b, sizeof(b));

namespace com
{
	// Largest length, in bytes, that has its own fixed-size copy routine in
	// g_base; longer copies are assembled from chunks of this size.
	const static size_t _MAXSIZE_ = 80;

	// Dispatch table for lengths known only at run time: g_base[n] copies
	// exactly n bytes (0 <= n <= _MAXSIZE_). The entries are written by the
	// constructor of g_monitor during static initialization of this
	// translation unit, so the table must not be used before that constructor
	// has run (e.g. from static initializers that execute earlier).
	extern void* (*g_base[_MAXSIZE_+1])(void *dest, const void *src);
}

// Copies len bytes from src to dest and returns dest. Defined further down.
inline void *xmemcpy(void *dest, const void *src, size_t len);

namespace com
{
	// Trivially copyable block of exactly `size` bytes. Assigning one block to
	// another is a copy whose length is a compile-time constant. The payload
	// is a char array, so the block has no alignment requirement and can be
	// laid over a buffer at any address.
	template <size_t size>
	struct xmemcpy_t
	{
		char data[size];
	};

	// A zero-length array is not valid C++; the empty block carries no payload.
	template <>
	struct xmemcpy_t<0>
	{
	};

	// Fixed-size copier: xmemcopy<N>::copy(dest, src) copies exactly N bytes
	// from src to dest and returns dest. The buffers must not overlap.
	template <size_t size>
	class xmemcopy
	{
	public:
		inline static void * copy(void *dest, const void *src)
		{
			if (size > _MAXSIZE_)
			{
				// Too large for a single block: copy as many full
				// _MAXSIZE_-byte chunks as fit, then the remainder (if any).
				char *out = static_cast<char *>(dest);
				const char *in = static_cast<const char *>(src);
				size_t offset = 0;
				for (; offset + _MAXSIZE_ <= size; offset += _MAXSIZE_)
					xmemcopy<_MAXSIZE_>::copy(out + offset, in + offset);
				if (size % _MAXSIZE_)
					xmemcopy<size % _MAXSIZE_>::copy(out + offset, in + offset);
				return dest;
			}

			// size <= _MAXSIZE_ here, so the expression below equals size. The
			// modulo only bounds the block type when this (then unreachable)
			// code is instantiated for a larger size.
			typedef xmemcpy_t<(size - 1) % _MAXSIZE_ + 1> block_t;
			*static_cast<block_t *>(dest) = *static_cast<const block_t *>(src);
			return dest;
		}
	};

	// Copying zero bytes is a no-op; this specialization also terminates the
	// remainder case of the chunked path above.
	template <>
	class xmemcopy<0>
	{
	public:
		static void * copy(void *dest, const void *src) { return dest; }
	};

	// Storage for the dispatch table declared at the top of this section.
	void* (*g_base[_MAXSIZE_+1])(void *dest, const void *src);

	// Fills g_base[len], g_base[len - 1], ..., g_base[0] by compile-time
	// recursion.
	template <size_t len>
	void init() {
		g_base[len] = xmemcopy<len>::copy;
		init<len - 1>();
	}

	// End of the recursion.
	template <>
	void init<0>() {
		g_base[0] = xmemcopy<0>::copy;
	}

	// Helper whose constructor populates the whole dispatch table.
	struct xmem_monitor
	{
		xmem_monitor()
		{
			init<_MAXSIZE_>();
		}
	};

	// One instance per translation unit; constructing it fills g_base before
	// main() starts.
	static xmem_monitor g_monitor;
}

// memcpy replacement for lengths known only at run time.
//   len <= _MAXSIZE_        : one indirect call through g_base.
//   len <= 10 * _MAXSIZE_   : inline _MAXSIZE_-byte chunks, then one g_base
//                             call for the remaining 1.._MAXSIZE_ bytes.
//   otherwise               : forwarded to ::memcpy.
// Returns dest. The buffers must not overlap.
inline void *xmemcpy(void *dest, const void *src, size_t len)
{
	if (len <= com::_MAXSIZE_)
		return com::g_base[len](dest, src);

	if (len > com::_MAXSIZE_ * 10)
		return ::memcpy(dest, src, len);

	char *out = static_cast<char *>(dest);
	const char *in = static_cast<const char *>(src);
	size_t done = 0;
	// Strict '<' leaves between 1 and _MAXSIZE_ bytes for the table call.
	for (; done + com::_MAXSIZE_ < len; done += com::_MAXSIZE_)
		com::xmemcopy<com::_MAXSIZE_>::copy(out + done, in + done);
	com::g_base[len - done](out + done, in + done);
	return dest;
}



// Benchmark driver. First checks the templated copier on a few sizes, then
// runs each copy variant Count (100,000,000) times, measuring the elapsed
// time-stamp-counter ticks with __rdtsc and printing them as seconds using a
// fixed 3 GHz divisor (3000000000.0).
//
// Reading notes:
//  - In every benchmark scope the local reference named `dest` is bound to the
//    SOURCE array (data80, data400, ...); the destination is pb or pb1.
//  - The tag in parentheses in each message names the variant. Tags such as
//    (movdq), (rep movs) and (call _memcpy) presumably describe the code the
//    author observed the compiler generating for that memcpy call; this is
//    not verified by the program itself.
//  - The __asm blocks use 32-bit registers and the pointer-to-int casts in the
//    asserts assume 32-bit pointers.
int _tmain(int argc, _TCHAR* argv[])
{
	// Affinity mask 2 has only bit 1 set, restricting the process to that one
	// logical processor (presumably so all __rdtsc readings come from the
	// same core).
	SetProcessAffinityMask(GetCurrentProcess(), 2);

	// Destination buffer, zero-initialized. Sanity checks follow: copy 160,
	// 161 and 400 bytes with the compile-time copier and execute `int 3`
	// (breakpoint) if the result differs from the source.
	char buffer[10000] = {};
	com::xmemcopy<com::_MAXSIZE_ * 2>::copy(buffer, data400);
	if (memcmp(buffer, data400, com::_MAXSIZE_ * 2))
		__asm int 3;

	com::xmemcopy<com::_MAXSIZE_ * 2 + 1>::copy(buffer, data400);
	if (memcmp(buffer, data400, com::_MAXSIZE_ * 2 + 1))
		__asm int 3;

	com::xmemcopy<400>::copy(buffer, data400);
	if(memcmp(buffer, data400, 400))
		__asm int 3;
	

	// pb is the start of the buffer, pb1 is one byte past it (used by the
	// "unalign" runs). The pointers and sizes are volatile objects, so each
	// use is a real load rather than a known constant.
	// Despite their names, size40 and size41 hold 80 and 81.
	char* volatile pb = buffer;
	char* volatile pb1 = buffer + 1;
	size_t volatile size40 = sizeof(data80);
	size_t volatile size41 = sizeof(data81);

	// Alignment assumptions of the benchmark: pb is 4-byte aligned, pb1 is
	// not, and the source arrays are 8-byte aligned.
	assert((int)pb % 4 == 0);
	assert((int)pb1 % 4 == 1);
	assert((int)data80 % 8 == 0);
	assert((int)data400 % 8 == 0);
	assert((int)data4000 % 8 == 0);

	// Untimed warm-up: touch every source array and the destination.
	for (int i = 0; i < 10; ++i)
	{
		memcpy(pb, data80, size40);
		memcpy(pb, data81, size41);
		memcpy(pb, data400, sizeof(data400));
		memcpy(pb, data401, sizeof(data401));
		memcpy(pb, data4000, sizeof(data4000));
	}
	printf("\n");
	// Number of repetitions per timed run.
	enum { Count = 100000000 };
#if(1)
	// 80 bytes, compile-time copier.
	{
		auto& dest = data80;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			com::xmemcopy<sizeof(dest)>::copy(pb, dest);
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 80 bytes, run-time dispatcher called with a constant length.
	{
		auto& dest = data80;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			xmemcpy(pb, dest, sizeof(dest));
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 80 bytes, run-time dispatcher with the length read from a volatile.
	{
		auto& dest = data80;
		size_t volatile size = sizeof(dest);

		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			xmemcpy(pb, dest, size);
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy unknownSize)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}

	// 80 bytes, memcpy with a constant length, aligned destination.
	{
		auto& dest = data80;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb, dest, sizeof(dest));
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 80 bytes, memcpy with a constant length, destination offset by one byte.
	{
		auto& dest = data80;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb1, dest, sizeof(dest));
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq  unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 81 bytes, memcpy with a constant length, aligned destination.
	{
		auto& dest = data81;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb, dest, sizeof(dest));
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 81 bytes, memcpy with a constant length, destination offset by one byte.
	{
		auto& dest = data81;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb1, dest, sizeof(dest));
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	//////////////////////////////////////////////////////////////////////////	
	// 400 bytes, compile-time copier, aligned destination.
	{
		auto& dest = data400;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			com::xmemcopy<sizeof(dest)>::copy(pb, dest);
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 400 bytes, compile-time copier, destination offset by one byte.
	{
		auto& dest = data400;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			com::xmemcopy<sizeof(dest)>::copy(pb1, dest);
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
#endif
	// Clear the first 400 destination bytes before the remaining runs.
	memset(pb, 0, 400);
	// 400 bytes, run-time dispatcher called with a constant length.
	{
		auto& dest = data400;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			xmemcpy(pb, dest, sizeof(dest));
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 400 bytes, memcpy with a constant length, aligned destination.
	{
		auto& dest = data400;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb, dest, sizeof(dest));
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 400 bytes, memcpy with a constant length, destination offset by one byte.
	{
		auto& dest = data400;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb1, dest, sizeof(dest));
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 400 bytes, hand-written `rep movs`: edi = destination (loaded from pb),
	// ecx = number of dwords (array size in bytes / 4), esi = source address
	// (the value held by the reference `dest`).
	{
		auto& dest = data400;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
		{
			__asm
			{
				mov         edi, dword ptr[pb];
				mov         ecx, size data400 / 4;
				mov         esi, dest;
				rep movs    dword ptr es : [edi], dword ptr[esi];
			}
		}
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 401 bytes, memcpy with a constant length.
	{
		auto& dest = data401;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb, dest, sizeof(dest));
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 401 bytes, hand-written: 100 dwords via `rep movs`, then a single byte
	// `movs` for the one byte left over.
	{
		auto& dest = data401;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
		{
			__asm
			{
				mov         edi, dword ptr[pb];
				mov         ecx, size data401 / 4;
				mov         esi, dest;
				rep movs    dword ptr es : [edi], dword ptr[esi];
				movs        byte ptr es : [edi], byte ptr[esi]
			}
		}
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 80 bytes, hand-written `rep movs` (20 dwords).
	{
		auto& dest = data80;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
		{
			__asm
			{
				mov         edi, dword ptr[pb];
				mov         ecx, size data80 / 4;
				mov         esi, dest;
				rep movs    dword ptr es : [edi], dword ptr[esi];
			}
		}
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}
	// 4000 bytes, hand-written `rep movs` (1000 dwords).
	{
		auto& dest = data4000;
		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
		{
			__asm
			{
				mov         edi, dword ptr[pb];
				mov         ecx, size data4000 / 4;
				mov         esi, dest;
				rep movs    dword ptr es : [edi], dword ptr[esi];
			}
		}
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);
	}

	// The remaining runs call memcpy with the length read from a volatile, so
	// the length is not a compile-time constant at the call site.
	// 80 bytes, aligned destination.
	{
		auto& dest = data80;
		size_t volatile size = sizeof(dest);

		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb, dest, size);
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);
	}
	// 81 bytes, aligned destination.
	{
		auto& dest = data81;
		size_t volatile size = sizeof(dest);

		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb, dest, size);
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);
	}
	// 81 bytes, destination offset by one byte.
	{
		auto& dest = data81;
		size_t volatile size = sizeof(dest);

		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb1, dest, size);
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy unalign)\n", size, Count / 1000000, t / 3000000000.0);
	}

	// 400 bytes, aligned destination.
	{
		auto& dest = data400;
		size_t volatile size = sizeof(dest);

		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb, dest, size);
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);
	}
	// 401 bytes, aligned destination.
	{
		auto& dest = data401;
		size_t volatile size = sizeof(dest);

		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb, dest, size);
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);
	}
	// 401 bytes, destination offset by one byte.
	{
		auto& dest = data401;
		size_t volatile size = sizeof(dest);

		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb1, dest, size);
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy unalign)\n", size, Count / 1000000, t / 3000000000.0);
	}
	// 4000 bytes, aligned destination.
	{
		auto& dest = data4000;
		size_t volatile size = sizeof(dest);

		__int64 t = __rdtsc();
		for (int i = 0; i < Count; ++i)
			memcpy(pb, dest, size);
		t = __rdtsc() - t;
		printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);
	}
	return 0;
}






你可能感兴趣的:(C++,memcpy,Visual,Studio)