今天心血来潮,想写个CPU版本的矩阵转置代码,过几天写GPU版本的。
按照我的想法,就是以下几种方式:
1> 整块矩阵转置,横读纵写或纵读横写
2> 将矩阵分成固定大小的block,block与block可以分成横读纵写或纵读横写,而block内部的数据也可以横读纵写或纵读横写。
经过试验:block横读,block内部纵读,能获得最好的平均性能。
代码如下(R=read,W=write,h=horizontal,v=vertical;函数名中"Block"前面的部分表示 block 与 block 之间的读写顺序,"Block"后面的部分表示 block 内部数据的读写顺序):
/*
 * CPU matrix-transpose micro-benchmark.
 *
 * Every routine treats inMat as an m-row by n-column row-major matrix and
 * writes its transpose (n rows by m columns, row-major) into outMat:
 *
 *     outMat[c * m + r] = inMat[r * n + c]
 *
 * Naming: R = read, W = write, h = horizontal (consecutive addresses),
 * v = vertical (one element per row, i.e. strided). In the tiled variants
 * the part before "Block" is the order in which tiles are visited and the
 * part after "Block" is the order used inside one tile.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Tick source. MSVC and GCC/Clang on x86 expose __rdtsc(); any other target
 * falls back to clock(), so the unit of the printed numbers depends on the
 * platform and results are only comparable within one run.
 */
#if defined(_MSC_VER)
#include <intrin.h>
#pragma intrinsic(__rdtsc)
#define MATRIX_BENCH_HAVE_RDTSC 1
#elif defined(__i386__) || defined(__x86_64__)
#include <x86intrin.h>
#define MATRIX_BENCH_HAVE_RDTSC 1
#else
#include <time.h>
#define MATRIX_BENCH_HAVE_RDTSC 0
#endif

#define LOOP 100     /* calls per timed measurement */
#define BLOCKCPU 128 /* tile edge length, in elements */

/* Benchmark buffers; large enough for the biggest size used in main(). */
float MatA[4096*4096];
float MatB[4096*4096];

/* Returns the current value of the platform tick counter. */
static unsigned long long read_ticks(void)
{
#if MATRIX_BENCH_HAVE_RDTSC
    return (unsigned long long)__rdtsc();
#else
    return (unsigned long long)clock();
#endif
}

/* Length of the tile starting at `start`, clipped so it never passes `limit`. */
static size_t tile_extent(size_t start, size_t limit)
{
    size_t remaining = limit - start;
    return remaining < (size_t)BLOCKCPU ? remaining : (size_t)BLOCKCPU;
}

/*
 * Transposes one tile, reading down each input column (stride n) and
 * writing the matching output row left to right.
 * src/dst point at the tile's first element in the m x n input and the
 * n x m output respectively.
 */
static void transpose_tile_RvWh(const float *src, float *dst,
                                size_t m, size_t n,
                                size_t tileRows, size_t tileCols)
{
    for (size_t c = 0; c < tileCols; c++) {
        const float *inCol = src + c;
        float *outRow = dst + c * m;
        for (size_t r = 0; r < tileRows; r++) {
            outRow[r] = inCol[r * n];
        }
    }
}

/*
 * Transposes one tile, reading each input row left to right and writing
 * down the matching output column (stride m).
 * src/dst point at the tile's first element in the m x n input and the
 * n x m output respectively.
 */
static void transpose_tile_RhWv(const float *src, float *dst,
                                size_t m, size_t n,
                                size_t tileRows, size_t tileCols)
{
    for (size_t r = 0; r < tileRows; r++) {
        const float *inRow = src + r * n;
        float *outCol = dst + r;
        for (size_t c = 0; c < tileCols; c++) {
            outCol[c * m] = inRow[c];
        }
    }
}

/*
 * Untiled transpose: strided reads down each input column, contiguous writes.
 * inMat is m x n, outMat receives n x m. Non-positive m or n is a no-op.
 * Always returns 0.
 */
int Matrix_Reverse_CPU_RvWh( float* inMat, float* outMat, int m, int n )
{
    if (m <= 0 || n <= 0) {
        return 0;
    }

    const size_t rows = (size_t)m;
    const size_t cols = (size_t)n;
    float *out = outMat;

    for (size_t c = 0; c < cols; c++) {
        for (size_t r = 0; r < rows; r++) {
            *out++ = inMat[r * cols + c];
        }
    }
    return 0;
}

/*
 * Tiled transpose: tiles visited column-of-tiles first, each tile copied
 * with strided reads and contiguous writes.
 * Edge tiles are clipped, so m and n need not be multiples of BLOCKCPU.
 * Non-positive m or n is a no-op. Always returns 0.
 */
int Matrix_Reverse_CPU_RvWh_Block_RvWh( float* inMat, float* outMat, int m, int n )
{
    if (m <= 0 || n <= 0) {
        return 0;
    }

    const size_t rows = (size_t)m;
    const size_t cols = (size_t)n;

    for (size_t c0 = 0; c0 < cols; c0 += BLOCKCPU) {
        for (size_t r0 = 0; r0 < rows; r0 += BLOCKCPU) {
            transpose_tile_RvWh(inMat + r0 * cols + c0,
                                outMat + c0 * rows + r0,
                                rows, cols,
                                tile_extent(r0, rows), tile_extent(c0, cols));
        }
    }
    return 0;
}

/*
 * Tiled transpose: tiles visited column-of-tiles first, each tile copied
 * with contiguous reads and strided writes.
 * Edge tiles are clipped, so m and n need not be multiples of BLOCKCPU.
 * Non-positive m or n is a no-op. Always returns 0.
 */
int Matrix_Reverse_CPU_RvWh_Block_RhWv( float* inMat, float* outMat, int m, int n )
{
    if (m <= 0 || n <= 0) {
        return 0;
    }

    const size_t rows = (size_t)m;
    const size_t cols = (size_t)n;

    for (size_t c0 = 0; c0 < cols; c0 += BLOCKCPU) {
        for (size_t r0 = 0; r0 < rows; r0 += BLOCKCPU) {
            transpose_tile_RhWv(inMat + r0 * cols + c0,
                                outMat + c0 * rows + r0,
                                rows, cols,
                                tile_extent(r0, rows), tile_extent(c0, cols));
        }
    }
    return 0;
}

/*
 * Tiled transpose: tiles visited row-of-tiles first, each tile copied
 * with strided reads and contiguous writes.
 * Edge tiles are clipped, so m and n need not be multiples of BLOCKCPU.
 * Non-positive m or n is a no-op. Always returns 0.
 */
int Matrix_Reverse_CPU_RhWv_Block_RvWh( float* inMat, float* outMat, int m, int n )
{
    if (m <= 0 || n <= 0) {
        return 0;
    }

    const size_t rows = (size_t)m;
    const size_t cols = (size_t)n;

    for (size_t r0 = 0; r0 < rows; r0 += BLOCKCPU) {
        for (size_t c0 = 0; c0 < cols; c0 += BLOCKCPU) {
            transpose_tile_RvWh(inMat + r0 * cols + c0,
                                outMat + c0 * rows + r0,
                                rows, cols,
                                tile_extent(r0, rows), tile_extent(c0, cols));
        }
    }
    return 0;
}

/*
 * Tiled transpose: tiles visited row-of-tiles first, each tile copied
 * with contiguous reads and strided writes.
 * Edge tiles are clipped, so m and n need not be multiples of BLOCKCPU.
 * Non-positive m or n is a no-op. Always returns 0.
 */
int Matrix_Reverse_CPU_RhWv_Block_RhWv( float* inMat, float* outMat, int m, int n )
{
    if (m <= 0 || n <= 0) {
        return 0;
    }

    const size_t rows = (size_t)m;
    const size_t cols = (size_t)n;

    for (size_t r0 = 0; r0 < rows; r0 += BLOCKCPU) {
        for (size_t c0 = 0; c0 < cols; c0 += BLOCKCPU) {
            transpose_tile_RhWv(inMat + r0 * cols + c0,
                                outMat + c0 * rows + r0,
                                rows, cols,
                                tile_extent(r0, rows), tile_extent(c0, cols));
        }
    }
    return 0;
}

/*
 * Untiled transpose: contiguous reads along each input row, strided writes.
 * inMat is m x n, outMat receives n x m. Non-positive m or n is a no-op.
 * Always returns 0.
 */
int Matrix_Reverse_CPU_RhWv( float* inMat, float* outMat, int m, int n )
{
    if (m <= 0 || n <= 0) {
        return 0;
    }

    const size_t rows = (size_t)m;
    const size_t cols = (size_t)n;
    const float *in = inMat;

    for (size_t r = 0; r < rows; r++) {
        for (size_t c = 0; c < cols; c++) {
            outMat[c * rows + r] = *in++;
        }
    }
    return 0;
}

/* Common signature of every transpose routine above. */
typedef int (*transpose_fn)(float *inMat, float *outMat, int m, int n);

/* One benchmark entry: the label printed and the routine timed. */
struct benchmark_case {
    const char *name;
    transpose_fn run;
};

/* Runs `run` LOOP times on MatA -> MatB and returns the elapsed ticks. */
static unsigned long long time_transpose(transpose_fn run, int m, int n)
{
    unsigned long long start = read_ticks();
    for (int i = 0; i < LOOP; i++) {
        run(MatA, MatB, m, n);
    }
    return read_ticks() - start;
}

/*
 * Times every transpose variant for each rows x columns combination and
 * prints one "name : ticks" line per variant under a "Size:" header.
 */
int main(void)
{
    static const int M[] = { 128, 256, 512, 1024, 2048, 4096 }; /* rows */
    static const int N[] = { 128, 256, 512, 1024, 2048, 4096 }; /* columns */
    static const struct benchmark_case cases[] = {
        { "Matrix_Reverse_CPU_RvWh_Block_RvWh", Matrix_Reverse_CPU_RvWh_Block_RvWh },
        { "Matrix_Reverse_CPU_RvWh_Block_RhWv", Matrix_Reverse_CPU_RvWh_Block_RhWv },
        { "Matrix_Reverse_CPU_RhWv_Block_RvWh", Matrix_Reverse_CPU_RhWv_Block_RvWh },
        { "Matrix_Reverse_CPU_RhWv_Block_RhWv", Matrix_Reverse_CPU_RhWv_Block_RhWv },
        { "Matrix_Reverse_CPU_RvWh",            Matrix_Reverse_CPU_RvWh },
        { "Matrix_Reverse_CPU_RhWv",            Matrix_Reverse_CPU_RhWv },
    };
    const size_t rowChoices = sizeof M / sizeof M[0];
    const size_t colChoices = sizeof N / sizeof N[0];
    const size_t caseCount = sizeof cases / sizeof cases[0];

    /* w indexes M and h indexes N, so each loop is bounded by its own array. */
    for (size_t w = 0; w < rowChoices; w++) {
        for (size_t h = 0; h < colChoices; h++) {
            printf("Size: %d * %d\n", M[w], N[h]);
            for (size_t k = 0; k < caseCount; k++) {
                unsigned long long ticks = time_transpose(cases[k].run, M[w], N[h]);
                printf("%s : %llu\n", cases[k].name, ticks);
            }
        }
    }
    return 0;
}
结果如下(每个数值是对应函数连续执行 100 次(LOOP)所消耗的 __rdtsc 计数,数值越小越快):
Size: 128 * 128 Matrix_Reverse_CPU_RvWh_Block_RvWh : 10417257 Matrix_Reverse_CPU_RvWh_Block_RhWv : 14665905 Matrix_Reverse_CPU_RhWv_Block_RvWh : 10622754 Matrix_Reverse_CPU_RhWv_Block_RhWv : 14981904 Matrix_Reverse_CPU_RvWh : 10275039 Matrix_Reverse_CPU_RhWv : 14443650 Size: 128 * 256 Matrix_Reverse_CPU_RvWh_Block_RvWh : 21198996 Matrix_Reverse_CPU_RvWh_Block_RhWv : 39430450 Matrix_Reverse_CPU_RhWv_Block_RvWh : 20058974 Matrix_Reverse_CPU_RhWv_Block_RhWv : 34941303 Matrix_Reverse_CPU_RvWh : 20690820 Matrix_Reverse_CPU_RhWv : 30270690 Size: 128 * 512 Matrix_Reverse_CPU_RvWh_Block_RvWh : 41558545 Matrix_Reverse_CPU_RvWh_Block_RhWv : 59876280 Matrix_Reverse_CPU_RhWv_Block_RvWh : 42976817 Matrix_Reverse_CPU_RhWv_Block_RhWv : 61637067 Matrix_Reverse_CPU_RvWh : 40461093 Matrix_Reverse_CPU_RhWv : 59937813 Size: 128 * 1024 Matrix_Reverse_CPU_RvWh_Block_RvWh : 87489181 Matrix_Reverse_CPU_RvWh_Block_RhWv : 122680602 Matrix_Reverse_CPU_RhWv_Block_RvWh : 84115530 Matrix_Reverse_CPU_RhWv_Block_RhWv : 125989632 Matrix_Reverse_CPU_RvWh : 87557526 Matrix_Reverse_CPU_RhWv : 126284472 Size: 128 * 2048 Matrix_Reverse_CPU_RvWh_Block_RvWh : 174393621 Matrix_Reverse_CPU_RvWh_Block_RhWv : 256453047 Matrix_Reverse_CPU_RhWv_Block_RvWh : 169418836 Matrix_Reverse_CPU_RhWv_Block_RhWv : 234190639 Matrix_Reverse_CPU_RvWh : 161152011 Matrix_Reverse_CPU_RhWv : 230227541 Size: 128 * 4096 Matrix_Reverse_CPU_RvWh_Block_RvWh : 576353700 Matrix_Reverse_CPU_RvWh_Block_RhWv : 478431693 Matrix_Reverse_CPU_RhWv_Block_RvWh : 570337947 Matrix_Reverse_CPU_RhWv_Block_RhWv : 502093638 Matrix_Reverse_CPU_RvWh : 600788664 Matrix_Reverse_CPU_RhWv : 487275859 Size: 256 * 128 Matrix_Reverse_CPU_RvWh_Block_RvWh : 20071099 Matrix_Reverse_CPU_RvWh_Block_RhWv : 29227086 Matrix_Reverse_CPU_RhWv_Block_RvWh : 20358459 Matrix_Reverse_CPU_RhWv_Block_RhWv : 29538414 Matrix_Reverse_CPU_RvWh : 20236393 Matrix_Reverse_CPU_RhWv : 29102473 Size: 256 * 256 Matrix_Reverse_CPU_RvWh_Block_RvWh : 41621678 
Matrix_Reverse_CPU_RvWh_Block_RhWv : 58808790 Matrix_Reverse_CPU_RhWv_Block_RvWh : 41086485 Matrix_Reverse_CPU_RhWv_Block_RhWv : 58896440 Matrix_Reverse_CPU_RvWh : 40269159 Matrix_Reverse_CPU_RhWv : 57798684 Size: 256 * 512 Matrix_Reverse_CPU_RvWh_Block_RvWh : 83462742 Matrix_Reverse_CPU_RvWh_Block_RhWv : 117741097 Matrix_Reverse_CPU_RhWv_Block_RvWh : 81348300 Matrix_Reverse_CPU_RhWv_Block_RhWv : 117998676 Matrix_Reverse_CPU_RvWh : 80092828 Matrix_Reverse_CPU_RhWv : 113777821 Size: 256 * 1024 Matrix_Reverse_CPU_RvWh_Block_RvWh : 166494672 Matrix_Reverse_CPU_RvWh_Block_RhWv : 237239486 Matrix_Reverse_CPU_RhWv_Block_RvWh : 163659491 Matrix_Reverse_CPU_RhWv_Block_RhWv : 242815373 Matrix_Reverse_CPU_RvWh : 163278197 Matrix_Reverse_CPU_RhWv : 239810364 Size: 256 * 2048 Matrix_Reverse_CPU_RvWh_Block_RvWh : 338305950 Matrix_Reverse_CPU_RvWh_Block_RhWv : 473304833 Matrix_Reverse_CPU_RhWv_Block_RvWh : 335302938 Matrix_Reverse_CPU_RhWv_Block_RhWv : 478345078 Matrix_Reverse_CPU_RvWh : 587043324 Matrix_Reverse_CPU_RhWv : 543861585 Size: 256 * 4096 Matrix_Reverse_CPU_RvWh_Block_RvWh : 1457239121 Matrix_Reverse_CPU_RvWh_Block_RhWv : 1238241527 Matrix_Reverse_CPU_RhWv_Block_RvWh : 1451646350 Matrix_Reverse_CPU_RhWv_Block_RhWv : 1247579846 Matrix_Reverse_CPU_RvWh : 1573425657 Matrix_Reverse_CPU_RhWv : 1624683015 Size: 512 * 128 Matrix_Reverse_CPU_RvWh_Block_RvWh : 41554792 Matrix_Reverse_CPU_RvWh_Block_RhWv : 59120343 Matrix_Reverse_CPU_RhWv_Block_RvWh : 40392802 Matrix_Reverse_CPU_RhWv_Block_RhWv : 58695670 Matrix_Reverse_CPU_RvWh : 40465396 Matrix_Reverse_CPU_RhWv : 58227831 Size: 512 * 256 Matrix_Reverse_CPU_RvWh_Block_RvWh : 82802924 Matrix_Reverse_CPU_RvWh_Block_RhWv : 117499176 Matrix_Reverse_CPU_RhWv_Block_RvWh : 82407870 Matrix_Reverse_CPU_RhWv_Block_RhWv : 125584362 Matrix_Reverse_CPU_RvWh : 79600294 Matrix_Reverse_CPU_RhWv : 115811190 Size: 512 * 512 Matrix_Reverse_CPU_RvWh_Block_RvWh : 165924747 Matrix_Reverse_CPU_RvWh_Block_RhWv : 233900001 
Matrix_Reverse_CPU_RhWv_Block_RvWh : 162587754 Matrix_Reverse_CPU_RhWv_Block_RhWv : 234426294 Matrix_Reverse_CPU_RvWh : 163152242 Matrix_Reverse_CPU_RhWv : 233270739 Size: 512 * 1024 Matrix_Reverse_CPU_RvWh_Block_RvWh : 354002130 Matrix_Reverse_CPU_RvWh_Block_RhWv : 486031347 Matrix_Reverse_CPU_RhWv_Block_RvWh : 331645778 Matrix_Reverse_CPU_RhWv_Block_RhWv : 479563658 Matrix_Reverse_CPU_RvWh : 580098789 Matrix_Reverse_CPU_RhWv : 685572742 Size: 512 * 2048 Matrix_Reverse_CPU_RvWh_Block_RvWh : 1091996829 Matrix_Reverse_CPU_RvWh_Block_RhWv : 1275984676 Matrix_Reverse_CPU_RhWv_Block_RvWh : 1151299638 Matrix_Reverse_CPU_RhWv_Block_RhWv : 1286450758 Matrix_Reverse_CPU_RvWh : 1528142759 Matrix_Reverse_CPU_RhWv : 1884699307 Size: 512 * 4096 Matrix_Reverse_CPU_RvWh_Block_RvWh : 3134379394 Matrix_Reverse_CPU_RvWh_Block_RhWv : 4226491962 Matrix_Reverse_CPU_RhWv_Block_RvWh : 3096178083 Matrix_Reverse_CPU_RhWv_Block_RhWv : 4209514281 Matrix_Reverse_CPU_RvWh : 3811554836 Matrix_Reverse_CPU_RhWv : 17024637267 Size: 1024 * 128 Matrix_Reverse_CPU_RvWh_Block_RvWh : 81138654 Matrix_Reverse_CPU_RvWh_Block_RhWv : 117979380 Matrix_Reverse_CPU_RhWv_Block_RvWh : 80910720 Matrix_Reverse_CPU_RhWv_Block_RhWv : 116910584 Matrix_Reverse_CPU_RvWh : 80473905 Matrix_Reverse_CPU_RhWv : 134668124 Size: 1024 * 256 Matrix_Reverse_CPU_RvWh_Block_RvWh : 164265812 Matrix_Reverse_CPU_RvWh_Block_RhWv : 234912950 Matrix_Reverse_CPU_RhWv_Block_RvWh : 164818620 Matrix_Reverse_CPU_RhWv_Block_RhWv : 235213893 Matrix_Reverse_CPU_RvWh : 161024939 Matrix_Reverse_CPU_RhWv : 235012006 Size: 1024 * 512 Matrix_Reverse_CPU_RvWh_Block_RvWh : 337517757 Matrix_Reverse_CPU_RvWh_Block_RhWv : 476269956 Matrix_Reverse_CPU_RhWv_Block_RvWh : 338091687 Matrix_Reverse_CPU_RhWv_Block_RhWv : 475558370 Matrix_Reverse_CPU_RvWh : 368677817 Matrix_Reverse_CPU_RhWv : 852082353 Size: 1024 * 1024 Matrix_Reverse_CPU_RvWh_Block_RvWh : 1125439649 Matrix_Reverse_CPU_RvWh_Block_RhWv : 1411009218 Matrix_Reverse_CPU_RhWv_Block_RvWh : 1121928480 
Matrix_Reverse_CPU_RhWv_Block_RhWv : 1366843328 Matrix_Reverse_CPU_RvWh : 1630837134 Matrix_Reverse_CPU_RhWv : 2185864037 Size: 1024 * 2048 Matrix_Reverse_CPU_RvWh_Block_RvWh : 2387519343 Matrix_Reverse_CPU_RvWh_Block_RhWv : 6137348453 Matrix_Reverse_CPU_RhWv_Block_RvWh : 2393332091 Matrix_Reverse_CPU_RhWv_Block_RhWv : 6200314578 Matrix_Reverse_CPU_RvWh : 5073037516 Matrix_Reverse_CPU_RhWv : 19062663525 Size: 1024 * 4096 Matrix_Reverse_CPU_RvWh_Block_RvWh : 8812405188 Matrix_Reverse_CPU_RvWh_Block_RhWv : 9203846886 Matrix_Reverse_CPU_RhWv_Block_RvWh : 8833045428 Matrix_Reverse_CPU_RhWv_Block_RhWv : 9621636156 Matrix_Reverse_CPU_RvWh : 13624540469 Matrix_Reverse_CPU_RhWv : 53447809886 Size: 2048 * 128 Matrix_Reverse_CPU_RvWh_Block_RvWh : 207078318 Matrix_Reverse_CPU_RvWh_Block_RhWv : 232919433 Matrix_Reverse_CPU_RhWv_Block_RvWh : 161324893 Matrix_Reverse_CPU_RhWv_Block_RhWv : 233421029 Matrix_Reverse_CPU_RvWh : 157768371 Matrix_Reverse_CPU_RhWv : 231983937 Size: 2048 * 256 Matrix_Reverse_CPU_RvWh_Block_RvWh : 326545506 Matrix_Reverse_CPU_RvWh_Block_RhWv : 492295339 Matrix_Reverse_CPU_RhWv_Block_RvWh : 332199397 Matrix_Reverse_CPU_RhWv_Block_RhWv : 482750596 Matrix_Reverse_CPU_RvWh : 371166166 Matrix_Reverse_CPU_RhWv : 841668939 Size: 2048 * 512 Matrix_Reverse_CPU_RvWh_Block_RvWh : 1035631045 Matrix_Reverse_CPU_RvWh_Block_RhWv : 1525221388 Matrix_Reverse_CPU_RhWv_Block_RvWh : 1127027358 Matrix_Reverse_CPU_RhWv_Block_RhWv : 1492769178 Matrix_Reverse_CPU_RvWh : 1276703820 Matrix_Reverse_CPU_RhWv : 2135876904 Size: 2048 * 1024 Matrix_Reverse_CPU_RvWh_Block_RvWh : 2494238617 Matrix_Reverse_CPU_RvWh_Block_RhWv : 3870908262 Matrix_Reverse_CPU_RhWv_Block_RvWh : 2488308246 Matrix_Reverse_CPU_RhWv_Block_RhWv : 3816585424 Matrix_Reverse_CPU_RvWh : 11187392157 Matrix_Reverse_CPU_RhWv : 12063028248 Size: 2048 * 2048 Matrix_Reverse_CPU_RvWh_Block_RvWh : 7293328750 Matrix_Reverse_CPU_RvWh_Block_RhWv : 6994836973 Matrix_Reverse_CPU_RhWv_Block_RvWh : 7256066662 
Matrix_Reverse_CPU_RhWv_Block_RhWv : 6997737213 Matrix_Reverse_CPU_RvWh : 19055086983 Matrix_Reverse_CPU_RhWv : 39151565973 Size: 2048 * 4096 Matrix_Reverse_CPU_RvWh_Block_RvWh : 15619297726 Matrix_Reverse_CPU_RvWh_Block_RhWv : 13155896816 Matrix_Reverse_CPU_RhWv_Block_RvWh : 15331781169 Matrix_Reverse_CPU_RhWv_Block_RhWv : 13917048795 Matrix_Reverse_CPU_RvWh : 41958742194 Matrix_Reverse_CPU_RhWv : 105749784415 Size: 4096 * 128 Matrix_Reverse_CPU_RvWh_Block_RvWh : 321151680 Matrix_Reverse_CPU_RvWh_Block_RhWv : 739762740 Matrix_Reverse_CPU_RhWv_Block_RvWh : 326692782 Matrix_Reverse_CPU_RhWv_Block_RhWv : 738515258 Matrix_Reverse_CPU_RvWh : 336008620 Matrix_Reverse_CPU_RhWv : 839885266 Size: 4096 * 256 Matrix_Reverse_CPU_RvWh_Block_RvWh : 983351071 Matrix_Reverse_CPU_RvWh_Block_RhWv : 1853335045 Matrix_Reverse_CPU_RhWv_Block_RvWh : 967479003 Matrix_Reverse_CPU_RhWv_Block_RhWv : 1833265656 Matrix_Reverse_CPU_RvWh : 1160984817 Matrix_Reverse_CPU_RhWv : 2078493867 Size: 4096 * 512 Matrix_Reverse_CPU_RvWh_Block_RvWh : 2202172822 Matrix_Reverse_CPU_RvWh_Block_RhWv : 3846165651 Matrix_Reverse_CPU_RhWv_Block_RvWh : 2125062334 Matrix_Reverse_CPU_RhWv_Block_RhWv : 3841032770 Matrix_Reverse_CPU_RvWh : 8793675153 Matrix_Reverse_CPU_RhWv : 5398407549 Size: 4096 * 1024 Matrix_Reverse_CPU_RvWh_Block_RvWh : 5964178851 Matrix_Reverse_CPU_RvWh_Block_RhWv : 8009663400 Matrix_Reverse_CPU_RhWv_Block_RvWh : 5888756484 Matrix_Reverse_CPU_RhWv_Block_RhWv : 7724640878 Matrix_Reverse_CPU_RvWh : 27156089034 Matrix_Reverse_CPU_RhWv : 15113791657 Size: 4096 * 2048 Matrix_Reverse_CPU_RvWh_Block_RvWh : 12148545420 Matrix_Reverse_CPU_RvWh_Block_RhWv : 16162558714 Matrix_Reverse_CPU_RhWv_Block_RvWh : 12166173739 Matrix_Reverse_CPU_RhWv_Block_RhWv : 15740501130 Matrix_Reverse_CPU_RvWh : 54296160336 Matrix_Reverse_CPU_RhWv : 85253695460 Size: 4096 * 4096 Matrix_Reverse_CPU_RvWh_Block_RvWh : 27819237664 Matrix_Reverse_CPU_RvWh_Block_RhWv : 32572810413 Matrix_Reverse_CPU_RhWv_Block_RvWh : 27575753498 
Matrix_Reverse_CPU_RhWv_Block_RhWv : 33486339519 Matrix_Reverse_CPU_RvWh : 101988909892 Matrix_Reverse_CPU_RhWv : 205211126718 Press any key to continue . . .