1. A brief introduction to HMPP
HMPP stands for Hybrid Multicore Parallel Programming. It is a standard for heterogeneous computing initiated by CAPS (http://www.caps-entreprise.com (English); www.caps-entreprise.com.cn (Chinese)), and it can greatly reduce the time we spend optimizing programs. You can refer to my earlier posts on HMPP for how to obtain a trial version.
HMPP is a directive-based standard (similar to OpenMP). The difference is that OpenMP is a parallel-programming standard for CPUs, while HMPP targets heterogeneous platforms (for example CPU+GPU or CPU+MIC). It supports both C and Fortran.
In addition, the HMPP compiler can generate CUDA code from your #pragma directives, and it can also compile CUDA code directly!
In short, the HMPP compiler is very powerful!
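To make the contrast with OpenMP concrete, here is a minimal sketch of the same trivial loop written both ways (the function names vec_add_omp / vec_add_hmpp and the codelet label vadd are my own; only the directive syntax mirrors the matrix example later in this post). With OpenMP you annotate a loop so it runs in parallel on the CPU; with HMPP you declare a whole function as a codelet and then mark the call that should be offloaded to the accelerator.

// OpenMP: the loop itself is split across the CPU cores.
void vec_add_omp(int n, const float *a, const float *b, float *c)
{
    #pragma omp parallel for
    for (int i = 0; i < n; i++)
        c[i] = a[i] + b[i];
}

// HMPP: the whole function becomes a CUDA codelet; data for every
// argument is transferred when the codelet is called (transfer=atcall).
#pragma hmpp vadd codelet, target=CUDA, args[*].transfer=atcall
void vec_add_hmpp(int n, float a[n], float b[n], float c[n])
{
    for (int i = 0; i < n; i++)
        c[i] = a[i] + b[i];
}

// At the point of use, only the call that should run on the GPU is marked:
//     #pragma hmpp vadd callsite
//     vec_add_hmpp(n, a, b, c);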
2. A recommended principle for using HMPP and OpenACC
The point of using HMPP is to obtain a speed-up of tens or even thousands of times by adding only a few #pragma directives, while leaving the original code as untouched as possible. The precondition, of course, is that the original code already runs correctly and does what the algorithm was designed to do.
3. Continuing to optimize the matrix-multiplication code
1) Here is the code to be optimized once more (note that this code comes from CAPS; it is their original code and I have made no substantive changes to it):
/*
 * Copyright 2008 - 2012 CAPS entreprise. All rights reserved.
 */
#include <getopt.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// Number of execution
#define NB_RUNS 5

// Size of the matrix
#define SIZE 256

// Initialization random value
#define SRAND_VALUE 5347

// Use to initialize the matrix
float randFloat(float low, float high)
{
    float t = (float)rand() / (float)RAND_MAX;
    return (1.0f - t) * low + t * high;
}

////////////////////////////////////////////////////////////////////////////////
// sgemm_codelet
////////////////////////////////////////////////////////////////////////////////
void mySgemm( int m, int n, int k, float alpha, float beta,
              float a[m][n], float b[n][k], float c[m][k] )
{
    int i, j, l;  // Induction variables
    float ab;     // Temporary result

    for( j = 0 ; j < m ; j++ ) {
        for( i = 0 ; i < k ; i++ ) {
            ab = 0.0f;
            for( l = 0 ; l < n ; l++ ){
                ab += a[j][l] * b[l][i];
            }
            c[j][i] = alpha * ab + beta * c[j][i];
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    int m = SIZE, n = SIZE, k = SIZE;
    float *my_a = NULL, *my_b = NULL, *c_hwa = NULL, *c_cpu = NULL;
    int i, j, ii;

    // For timer measures
    struct timeval tv_global_begin, tv_global_end;  // global timer (all iterations)
    struct timeval tv_begin, tv_end;                // local timer (1 iteration)

    unsigned long long int best_measure_GPU = 0;
    unsigned long long int sum_measure_GPU  = 0;
    unsigned long long int best_measure_CPU = 0;
    unsigned long long int sum_measure_CPU  = 0;
    unsigned long long int global_CPU_time  = 0;
    unsigned long long int global_GPU_time  = 0;
    unsigned long long int current;

    float alpha, beta;

    double error    = 0.0;
    int index_i     = 0;
    int index_j     = 0;
    double valueCPU = 0.0;
    double valueGPU = 0.0;

    // Allocating CPU memory
    my_a  = (float *)malloc(m * n * sizeof(float));
    my_b  = (float *)malloc(n * k * sizeof(float));
    c_hwa = (float *)malloc(m * k * sizeof(float));
    c_cpu = (float *)malloc(m * k * sizeof(float));

    if((my_a == NULL) || (my_b == NULL) || (c_hwa == NULL) || (c_cpu == NULL)) {
        fprintf( stderr, "\n**** error : memory allocation failed ****\n\n");
        return 1;
    }

    fprintf( stdout, "---- Initialization of the Matrices ----\n\n");
    srand(SRAND_VALUE);

    // Generate options set
    for(i = 0; i < m; i++){
        for(j = 0; j < n; j++){
            my_a[i*n+j] = randFloat(0.0001f, 1.0f);
        }
    }

    for(i = 0; i < n; i++){
        for(j = 0; j < k; j++){
            my_b[i*k+j] = randFloat(0.0001f, 1.0f);
        }
    }

    for(i = 0; i < m; i++){
        for(j = 0; j < k; j++) {
            c_cpu[i*k+j] = randFloat(1.0, 20.0f);
            c_hwa[i*k+j] = c_cpu[i*k+j];
        }
    }

    alpha = 0.5;
    beta  = randFloat(1.0, 2.0);

    fprintf( stdout, "---- Running calculations ----\n");

    // run sgemm on GPU (NB_RUNS iterations)
    printf("Run on GPU\n");

    // Start timer
    gettimeofday(&tv_global_begin, NULL);

    for( i = 0; i < NB_RUNS; i++ ) {
        printf("%d ", i);
        gettimeofday(&tv_begin, NULL);

        mySgemm( m, n, k, alpha, beta, my_a, my_b, c_hwa );

        gettimeofday(&tv_end, NULL);
        current = (tv_end.tv_sec - tv_begin.tv_sec) * 1e6 + tv_end.tv_usec - tv_begin.tv_usec;
        if( ( best_measure_GPU == 0 ) || ( best_measure_GPU > current ) ){
            best_measure_GPU = current;
        }
        sum_measure_GPU += current;
    }

    gettimeofday(&tv_global_end, NULL);
    global_GPU_time = (tv_global_end.tv_sec - tv_global_begin.tv_sec) * 1e6
                    + tv_global_end.tv_usec - tv_global_begin.tv_usec;

    // run sgemm on CPU (NB_RUNS iterations)
    printf("\n\nRun on CPU\n");

    // Start timer
    gettimeofday(&tv_global_begin, NULL);

    for( i = 0; i < NB_RUNS; i++ ) {
        printf("%d ", i);
        gettimeofday(&tv_begin, NULL);

        mySgemm( m, n, k, alpha, beta, my_a, my_b, c_cpu );

        gettimeofday(&tv_end, NULL);
        current = (tv_end.tv_sec - tv_begin.tv_sec) * 1e6 + tv_end.tv_usec - tv_begin.tv_usec;
        if( ( best_measure_CPU == 0 ) || ( best_measure_CPU > current ) ){
            best_measure_CPU = current;
        }
        sum_measure_CPU += current;
    }

    gettimeofday(&tv_global_end, NULL);
    global_CPU_time = (tv_global_end.tv_sec - tv_global_begin.tv_sec) * 1e6
                    + tv_global_end.tv_usec - tv_global_begin.tv_usec;

    // Compute error between GPU and CPU
    for( ii = 0; ii < m; ii++){
        for(j = 0; j < k; j++){
            double lerror = fabs((c_hwa[ii*k+j] - c_cpu[ii*k+j]) / c_cpu[ii*k+j]);
            if (lerror > error) {
                error = lerror;
                valueCPU = c_cpu[ii*k+j];
                valueGPU = c_hwa[ii*k+j];
                index_i = ii;
                index_j = j;
            }
        }
    }

    if (error > 2e-06) {
        fprintf( stdout, "\n\nThe error is %e with index %d:%d @ %e (CPU) / %e (GPU)\n",
                 error, index_i, index_j, valueCPU, valueGPU);
        fprintf( stdout, "The error is too big!\n");
        return -1;
    }

    fprintf( stdout, "\n\n---- Results ----");
    fprintf( stdout, "\n");
    fprintf( stdout, "Sizes of matrices: M:%i N:%i K:%i\n\n", m, n, k);
    fprintf( stdout, "Best HWA time : %f ms\n", best_measure_GPU / 1e3 );
    fprintf( stdout, "Mean HWA time : %f ms\n", sum_measure_GPU / NB_RUNS / 1e3);
    fprintf( stdout, "\n");
    fprintf( stdout, "Best CPU time : %f ms\n", best_measure_CPU / 1e3 );
    fprintf( stdout, "Mean CPU time : %f ms\n", sum_measure_CPU / NB_RUNS / 1e3);
    fprintf( stdout, "\n");
    fprintf( stdout, "Global HWA time : %f ms\n", global_GPU_time / 1e3 );
    fprintf( stdout, "Global CPU time : %f ms\n", global_CPU_time / 1e3 );
    fprintf( stdout, "\n");
    fprintf( stdout, "Speed-up : %f (computed on the best time)",
             ((float)best_measure_CPU) / best_measure_GPU);
    fprintf( stdout, "\n");

    free(my_a);
    free(my_b);
    free(c_hwa);
    free(c_cpu);

    return 0;
}
Note that the code above times the same function twice, once labeled as the GPU run and once as the CPU run. Let us now add two simple directives, compile, run, and see what the speed-up looks like.
Insert the following directives at lines 31 and 32:
#pragma hmpp mylab codelet, target=CUDA, args[*].transfer=atcall
#pragma hmpp mylab callsite
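For orientation, here is roughly how the two directives sit in the source once inserted (a sketch based on the listing above; the exact line numbers depend on your file layout): the codelet directive annotates the definition of mySgemm, and the callsite directive annotates the call we want offloaded to the GPU, i.e. the one that writes c_hwa.

// Turn mySgemm into a CUDA codelet; every argument is transferred at the call.
#pragma hmpp mylab codelet, target=CUDA, args[*].transfer=atcall
void mySgemm( int m, int n, int k, float alpha, float beta,
              float a[m][n], float b[n][k], float c[m][k] )
{
    /* ... loop nest unchanged ... */
}

    /* ... later, inside the "Run on GPU" timing loop in main() ... */
    // Offload this particular call to the GPU.
    #pragma hmpp mylab callsite
    mySgemm( m, n, k, alpha, beta, my_a, my_b, c_hwa );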
Compile with the HMPP driver (which wraps the host compiler, here gcc):

[]$hmpp --codelet-required gcc source.c

Running the resulting executable prints:
---- Initialization of the Matrices ----

---- Running calculations ----
Run on GPU
0 1 2 3 4

Run on CPU
0 1 2 3 4

---- Results ----
Sizes of matrices: M:256 N:256 K:256

Best HWA time : 1.436000 ms
Mean HWA time : 21.837000 ms

Best CPU time : 86.995000 ms
Mean CPU time : 87.583000 ms

Global HWA time : 109.192000 ms
Global CPU time : 437.922000 ms

Speed-up : 60.581478 (computed on the best time)
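The reported speed-up is simply the ratio of the best times: 86.995 ms / 1.436 ms ≈ 60.58. Notice also that the mean HWA time (about 21.8 ms) is much higher than the best HWA time (about 1.4 ms); presumably the first GPU run absorbs one-time costs such as device initialization and codelet loading, while the later runs reflect the steady-state offloaded time.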
Of course, there is more to HMPP than this. It provides many directives, and they are not hard to learn; in other words, we can conveniently tap the GPU's computing resources without having to learn CUDA or OpenCL directly. You will only appreciate all of its benefits once you try it yourself.
In later posts I will cover more directives, along with some interesting details. Stay tuned!