About Optimizing - 0010

现在看看实际的例子，在这个例子中，我们尝试使用两种不同的方法来优化memcpy函数，并对16K,128K,256K,512K,1M,2M,4M,16M,32M,64M的数据进行拷贝操作，同时还使用两种不同尺寸的预取块，然后对其结果进行比较。

所有数据都在16字节边界上对齐（下文使用的movaps/movntps指令要求内存操作数按16字节对齐）。

C++的实现

用C++写的作为参照的函数，使用for循环进行指针操作：

/*
 * Reference copy routine: copies data from src to dst one int at a time
 * using a plain for loop.
 *
 * src  : source buffer (note the argument order is source first).
 * dst  : destination buffer.
 * size : number of bytes requested. Only whole ints are copied, so the
 *        trailing size % sizeof(int) bytes are left untouched. A zero or
 *        negative size copies nothing.
 *
 * Both buffers are accessed through int pointers, so they must be suitably
 * aligned for int and must not overlap.
 */
void test_for(char* src, char* dst, int size) {
  if (size <= 0) {
    return; /* nothing to do; also keeps a negative size from becoming a huge unsigned count */
  }

  const int *in = (const int *)src;
  int *out = (int *)dst;
  int words = size / (int)sizeof(int); /* signed division: no int/size_t mixing */

  for (int i = 0; i < words; i++) {
    out[i] = in[i]; /* one int per iteration */
  }
}

汇编的实现

// Page size in bytes: 4096
#define PAGE_SIZE    4096

// Prefetch stride in bytes. Works best when it equals the cache-line length;
// the original author gives 32 for PIII and 128 for P4/Xeon. Other values
// were also used in the tests for comparison.
#define PREFETCH_SIZE 128

// Issues a non-temporal prefetch hint for the address held in x, using MSVC
// inline assembly. The instruction is emitted as raw opcode bytes via _emit
// rather than by mnemonic.
//
// x : address to prefetch. The function returns nothing.
__forceinline void __fastcall __prefetchnta(char* x) {
  __asm {
    mov eax, [x]   // load the pointer value stored in parameter x into EAX
    _emit 0xF      // 0F 18 /0 with ModRM 00h encodes: prefetchnta byte ptr [eax]
    _emit 0x18
    _emit 0x0
  } // asm
} // func

// Streaming copy of one 16-byte block from src to dst, using MSVC inline
// assembly with the SSE instructions emitted as raw opcode bytes.
//
// dst : destination address of the 16-byte block.
// src : source address of the 16-byte block.
//
// Both instructions used here are the aligned forms, so src and dst are
// expected to be 16-byte aligned. XMM0, EAX and EDX are overwritten.
__forceinline void __fastcall __stream_cpy(char* dst, char* src) {
  __asm {
    mov eax, [src]   // EAX = source pointer value
    mov edx, [dst]   // EDX = destination pointer value
    _emit 0xF        // 0F 28 00 encodes: movaps xmm0, [eax]  (aligned 16-byte load)
    _emit 0x28
    _emit 0
    _emit 0xF        // 0F 2B 02 encodes: movntps [edx], xmm0 (non-temporal 16-byte store)
    _emit 0x2B
    _emit 0x2
  } // asm
} // func

// Copy routine optimized with prefetching and loop unrolling (unrolling is
// a common optimization technique).
//
// The buffer is processed in chunks of at most PAGE_SIZE bytes. For each
// chunk the routine first reads one int from the start of the following
// chunk (presumably to get that page in flight early), then prefetches the
// current chunk in PREFETCH_SIZE steps, and finally copies it with
// __stream_cpy, unrolled to 128 bytes (8 x 16) per iteration.
//
// src, dst : source and destination buffers. They are passed to
//            __stream_cpy in 16-byte steps, so both are expected to be
//            16-byte aligned.
// size     : number of bytes to copy; zero or negative copies nothing.
// returns  : the int most recently read from a following chunk, or 0 when
//            no such read took place. The value only exists so that the
//            look-ahead read has an observable use.
//
// The routine never reads or writes beyond size bytes: the look-ahead read
// is skipped when fewer than sizeof(int) bytes follow the current chunk,
// and a size that is not a multiple of the page or of 128 bytes is finished
// with 16-byte and then single-byte copies.
//
// NOTE(review): the stores issued by __stream_cpy are non-temporal; no
// store fence is issued here - confirm whether callers need one before
// handing dst to another thread.
int test_perfetch(char* src, char* dst, int size) {
  int a, b, page_end;
  int temp = 0; // defined result even when nothing is touched

  for ( a = 0; a < size; a = page_end ) {
    // End of the current chunk, clamped to the end of the buffer.
    page_end = (size - a > PAGE_SIZE) ? a + PAGE_SIZE : size;

    // Touch the start of the next chunk only if a whole int is in bounds.
    if (size - page_end >= (int)sizeof(int)) {
        temp = *(int *)(src + page_end);
    }

    for(b = a; b < page_end; b += PREFETCH_SIZE) { // prefetch into cache
        __prefetchnta(src + b);
    } // for

    for(b = a; page_end - b >= 16 * 8; b += 16 * 8) { // copy, 128 bytes per iteration
        __stream_cpy(dst + b + 16 * 0, src + b + 16 * 0);
        __stream_cpy(dst + b + 16 * 1, src + b + 16 * 1);
        __stream_cpy(dst + b + 16 * 2, src + b + 16 * 2);
        __stream_cpy(dst + b + 16 * 3, src + b + 16 * 3);
        __stream_cpy(dst + b + 16 * 4, src + b + 16 * 4);
        __stream_cpy(dst + b + 16 * 5, src + b + 16 * 5);
        __stream_cpy(dst + b + 16 * 6, src + b + 16 * 6);
        __stream_cpy(dst + b + 16 * 7, src + b + 16 * 7);
    } // for

    for(; page_end - b >= 16; b += 16) { // remaining whole 16-byte blocks
        __stream_cpy(dst + b, src + b);
    } // for

    for(; b < page_end; b++) { // final bytes that do not fill a 16-byte block
        dst[b] = src[b];
    } // for
  } // for

  return temp;
}

使用QueryPerformanceCounter记录起止时刻的计数值，并用QueryPerformanceFrequency取得计数频率，从而计算花费的时间

// Declare the timing variables: counter frequency, start count and end count
#define DECL_PERF()     __int64 m_CounterFreq, m_CounterStart, m_CounterEnd;

// Initialize timing: store the performance-counter frequency in m_CounterFreq
#define INIT_PERF()     QueryPerformanceFrequency((LARGE_INTEGER *)&m_CounterFreq);

// Record the counter value at the start of the operation
#define BEGIN_PERF()    QueryPerformanceCounter((LARGE_INTEGER *)&m_CounterStart);

// Record the counter value at the end of the operation
#define END_PERF()      QueryPerformanceCounter((LARGE_INTEGER *)&m_CounterEnd);

// Elapsed time of the operation: (end - start) counts divided by the
// counter frequency, evaluated as FLOAT
#define GET_PERF()      ((m_CounterEnd - m_CounterStart) / (FLOAT)m_CounterFreq)

运行完成后，将统计数据导入到EXCEL中，制成图表，进行比较。

About Optimizing - 0010

所有数据都在16字节边界上对齐

C++的实现

汇编的实现

你可能感兴趣的:(c++,optimization)