Please credit the source when reprinting: http://blog.csdn.net/bendanban/article/details/8897735
/*=======================================================================
 * Nice weather today; I was woken up by the noise.
 =======================================================================*/
Today we move on to matrices: the level-2 BLAS functions cover matrix-vector operations.
What I find most annoying about the cublas library is that it stores arrays column-major and indexes from 1. It does not matter all that much, though: we only need to focus on our own algorithm and keep the mathematics in mind, without worrying too much about the storage layout.
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
A scenario in which this macro is used:
Suppose the matrix you have been working on in C is float *A, and that mathematically it is an M x N matrix (M rows, N columns). How can C and CUBLAS then interpret the same block of memory with the same mathematical meaning?
C stores arrays row by row, so in memory the first row of A is laid out contiguously, then the row after it, and so on. Let us call this stored block Host_A.
So how will CUBLAS interpret Host_A?
In CUBLAS, matrices are stored column by column, so CUBLAS also reads the data in memory column-major. If we tell CUBLAS that Host_A holds an M x N matrix, CUBLAS will take the first M contiguous elements as the first column of the matrix. Judged against our original C usage this is clearly wrong, because in C we treat the first N contiguous elements of Host_A as the first row of A.
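To make the difference concrete, here is a minimal stand-alone sketch; the 2 x 3 matrix and the indices are made up purely for illustration:

#include <stdio.h>
#define IDX2C(i,j,ld) (((j)*(ld))+(i))

int main(void)
{
    /* A hypothetical 2 x 3 matrix written down row by row, as C code would. */
    float A[6] = { 0, 1, 2,
                   3, 4, 5 };
    int M = 2, N = 3, i = 0, j = 1;
    /* C (row-major) reads element (i,j) as A[i*N + j]; a column-major
     * reader such as CUBLAS would fetch A[IDX2C(i,j,M)] instead. */
    printf("row-major A(0,1) = %g, column-major A(0,1) = %g\n",
           A[i*N + j], A[IDX2C(i, j, M)]);   /* prints 1 and 2 */
    return 0;
}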
The description in Figure 1 should make this easier to see.
Figure 1: how CUBLAS and C interpret the same block of memory differently. The matrix has 2 rows and 3 columns.
Figure 1 shows that CUBLAS completely misreads my original C data. Now think about it the other way around: if we tell CUBLAS that we are handing it a 3 x 2 matrix, then memory elements 0 to 2 are taken as the first column and 3 to 5 as the second column, while C regards 0 to 2 as the first row and 3 to 5 as the second row. So how do we make CUBLAS read the memory the way C does? We cannot, but we can treat the matrix as its transpose whenever we use it inside CUBLAS.
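Put differently, a row-major M x N matrix in host memory is, byte for byte, a valid column-major N x M matrix holding the transpose. The call pattern below is only a sketch of what that means for SGEMV, assuming a created handle and device buffers d_A, d_x, d_y, for the case where we want y = alpha*A*x + beta*y with A being the row-major host matrix; the second listing further down follows the same pattern:

/* To CUBLAS the buffer is an N x M column-major matrix containing A^T,
 * so requesting CUBLAS_OP_T yields (A^T)^T = A; the dimensions and the
 * leading dimension are swapped accordingly. */
cublasSgemv(handle, CUBLAS_OP_T,
            N, M,          /* the matrix as CUBLAS sees it: N rows, M columns */
            &alpha,
            d_A, N,        /* leading dimension = N, the row length of A      */
            d_x, 1,        /* x has N elements */
            &beta,
            d_y, 1);       /* y has M elements */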
To sum up, there are two key points: first, a row-major M x N matrix produced by C code, handed to CUBLAS unchanged, is seen by CUBLAS as a column-major N x M matrix, that is, as the transpose of the original; second, to compute with the original mathematical meaning, describe the matrix to CUBLAS with the dimensions swapped (N rows, M columns, leading dimension N) and flip the transpose flag.
The environment configuration is the same as required in the second installment of this series.
The algorithm used is SGEMV; you can refer to it here.
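For reference, SGEMV computes y = alpha * op(A) * x + beta * y, where A is an m x n matrix, x and y are vectors, and op(A) is either A (CUBLAS_OP_N) or A^T (CUBLAS_OP_T). The mySgemv function in the listing below is a plain CPU implementation of this formula, used only to check the CUBLAS result.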
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "cublas_v2.h"
#include "cuda_runtime.h"

#define ROWS 2048
#define COLS 1024
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
#define CUBLAS_ERROR_CHECK(sdata) if(CUBLAS_STATUS_SUCCESS!=sdata)\
    {printf("ERROR at:%s:%d\n",__FILE__,__LINE__);exit(-1);}

void InitMemValue(float *x, int n, float down, float up)
{
    int i;
    //srand((unsigned int)time(NULL));
    srand(5778);
    for (i = 0; i < n; i++) {
        x[i] = (rand()*1.0f/RAND_MAX)*(up-down)+down;
    }
}

// CPU reference: y = beta*y + alpha*op(A)*x, where A is a siM x siN
// matrix stored column-major (hence the IDX2C addressing).
void mySgemv(cublasOperation_t trans,
             float *pfA, int siM, int siN,
             float *pfX, float *pfY, float alpha, float beta)
{
    int i, j;
    float fSum;
    if (CUBLAS_OP_N == trans) {
        for (i = 0; i < siM; i++) {
            fSum = 0.0f;
            for (j = 0; j < siN; j++) {
                fSum += pfA[IDX2C(i,j,siM)]*pfX[j];
            }
            fSum *= alpha;
            pfY[i] = pfY[i]*beta + fSum;
        }
    } else {
        for (i = 0; i < siN; i++) {
            fSum = 0.0f;
            for (j = 0; j < siM; j++) {
                fSum += pfA[IDX2C(j,i,siM)]*pfX[j];
            }
            fSum *= alpha;
            pfY[i] = pfY[i]*beta + fSum;
        }
    }
}

// Sum of the relative errors between the CPU and GPU results.
float computeError(int n, const float *x, const float *y)
{
    int i;
    float err, fSumError = 0.0f;
    for (i = 0; i < n; ++i) {
        err = fabsf((x[i]-y[i])/x[i]);
        if (err > 1e-5) {
            printf("%d error : %f\n", i, err);
        }
        fSumError += err;
    }
    return fSumError;
}

int main(int argc, char **argv)
{
    // 1.1 cublas helper variables
    cublasStatus_t cubStatus;
    cublasHandle_t cubHandle;

    // 1.2 vector, matrix, scalar
    int siM = ROWS, siN = COLS;
    float *pfA = 0, *d_pfA = 0, *pfY = 0;
    float *pfX = 0, *d_pfX = 0, *d_pfY = 0, *h_pfY = 0;
    cublasOperation_t cubTrans = CUBLAS_OP_N;
    float fAlpha = 0.2f;
    float fBeta  = 0.3f;
    int siIncx = 1; // need to be configured when used.

    // 1.3 cublas init
    cubStatus = cublasCreate(&cubHandle);CUBLAS_ERROR_CHECK(cubStatus)

    // 1.4 memory allocation
    pfA = (float*)malloc(siM*siN*sizeof(float));
    cudaMalloc((void**)&d_pfA, siM*siN*sizeof(float));
    if (CUBLAS_OP_N == cubTrans) {
        pfX   = (float*)malloc(siN*sizeof(float));
        pfY   = (float*)malloc(siM*sizeof(float));
        h_pfY = (float*)malloc(siM*sizeof(float));
        cudaMalloc((void**)&d_pfX, siN*sizeof(float));
        cudaMalloc((void**)&d_pfY, siM*sizeof(float));
    } else {
        pfX   = (float*)malloc(siM*sizeof(float));
        pfY   = (float*)malloc(siN*sizeof(float));
        h_pfY = (float*)malloc(siN*sizeof(float));
        cudaMalloc((void**)&d_pfX, siM*sizeof(float));
        cudaMalloc((void**)&d_pfY, siN*sizeof(float));
    }

    // 1.5 initialize CPU memory
    InitMemValue(pfA, siM*siN, 0.0f, 1.0f);
    if (CUBLAS_OP_N == cubTrans) {
        InitMemValue(pfX, siN, 0.0f, 1.0f);
        InitMemValue(pfY, siM, 0.0f, 1.0f);
    } else {
        InitMemValue(pfX, siM, 0.0f, 1.0f);
        InitMemValue(pfY, siN, 0.0f, 1.0f);
    }

    // 1.6 Initialize GPU memory (A is described to CUBLAS as siM x siN, lda = siM)
    cubStatus = cublasSetMatrix(siM, siN, sizeof(float), pfA, siM, d_pfA, siM);CUBLAS_ERROR_CHECK(cubStatus)
    if (CUBLAS_OP_N == cubTrans) {
        cubStatus = cublasSetVector(siN, sizeof(float), pfX, 1, d_pfX, 1);CUBLAS_ERROR_CHECK(cubStatus)
        cubStatus = cublasSetVector(siM, sizeof(float), pfY, 1, d_pfY, 1);CUBLAS_ERROR_CHECK(cubStatus)
    } else {
        cubStatus = cublasSetVector(siM, sizeof(float), pfX, 1, d_pfX, 1);CUBLAS_ERROR_CHECK(cubStatus)
        cubStatus = cublasSetVector(siN, sizeof(float), pfY, 1, d_pfY, 1);CUBLAS_ERROR_CHECK(cubStatus)
    }

    // 1.7 Invoke CUBLAS
    cubStatus = cublasSgemv(cubHandle, cubTrans, siM, siN, &fAlpha, d_pfA, siM,
                            d_pfX, 1, &fBeta, d_pfY, 1);CUBLAS_ERROR_CHECK(cubStatus)

    // 1.8 Get result
    if (CUBLAS_OP_N == cubTrans) {
        cubStatus = cublasGetVector(siM, sizeof(float), d_pfY, 1, h_pfY, 1);
    } else {
        cubStatus = cublasGetVector(siN, sizeof(float), d_pfY, 1, h_pfY, 1);
    }

    // 1.9 invoke cpu version and compare
    mySgemv(cubTrans, pfA, siM, siN, pfX, pfY, fAlpha, fBeta);
    printf("Sum of error is %f\n", computeError(CUBLAS_OP_N==cubTrans?siM:siN, pfY, h_pfY));

    // last: release
    cudaFree(d_pfA); cudaFree(d_pfX); cudaFree(d_pfY);
    free(pfA); free(pfX); free(pfY); free(h_pfY);
    cubStatus = cublasDestroy(cubHandle);CUBLAS_ERROR_CHECK(cubStatus)
    return 0;
}
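If you want to build and run the listing, a typical command line, assuming the CUDA toolkit is installed and the source is saved as sgemv_colmajor.cu (the file name is only a placeholder), would be:

nvcc sgemv_colmajor.cu -o sgemv_colmajor -lcublas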
In this example I did not actually work on the matrix on the host before calling the CUBLAS routine; I merely assume that such processing took place. That does not affect the analysis here, nor the correctness of the discussion above.
Pay special attention to lines 37, 50, 129 and 143 of the original numbered listing! The important differences from the first listing are the row-major indexing inside mySgemv, the arguments to cublasSetMatrix, and the transpose flag and dimensions passed to cublasSgemv.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "cublas_v2.h"
#include "cuda_runtime.h"

#define ROWS 2048
#define COLS 1024
#define CUBLAS_ERROR_CHECK(sdata) if(CUBLAS_STATUS_SUCCESS!=sdata)\
    {printf("ERROR at:%s:%d\n",__FILE__,__LINE__);exit(-1);}

void InitMemValue(float *x, int n, float down, float up)
{
    int i;
    srand(5778);
    for (i = 0; i < n; i++) {
        x[i] = (rand()*1.0f/RAND_MAX)*(up-down)+down;
    }
}

// CPU reference: y = beta*y + alpha*op(A)*x, where A is a siM x siN
// matrix stored row-major, exactly as ordinary C code would store it.
void mySgemv(cublasOperation_t trans,
             float *pfA, int siM, int siN,
             float *pfX, float *pfY, float alpha, float beta)
{
    int i, j;
    float fSum;
    if (CUBLAS_OP_N == trans) {
        for (i = 0; i < siM; i++) {
            fSum = 0.0f;
            for (j = 0; j < siN; j++) {
                fSum += pfA[i*siN+j]*pfX[j];
            }
            fSum *= alpha;
            pfY[i] = pfY[i]*beta + fSum;
        }
    } else {
        for (i = 0; i < siN; i++) {
            fSum = 0.0f;
            for (j = 0; j < siM; j++) {
                fSum += pfA[j*siN+i]*pfX[j];
            }
            fSum *= alpha;
            pfY[i] = pfY[i]*beta + fSum;
        }
    }
}

// Sum of the relative errors between the CPU and GPU results.
float computeError(int n, const float *x, const float *y)
{
    int i;
    float err, fSumError = 0.0f;
    for (i = 0; i < n; ++i) {
        err = fabsf((x[i]-y[i])/x[i]);
        if (err > 1e-5) {
            printf("%d error : %f\n", i, err);
        }
        fSumError += err;
    }
    return fSumError;
}

int main(int argc, char **argv)
{
    // 1.1 cublas helper variables
    cublasStatus_t cubStatus;
    cublasHandle_t cubHandle;

    // 1.2 vector, matrix, scalar
    int siM = ROWS, siN = COLS;
    float *pfA = 0, *d_pfA = 0, *pfY = 0;
    float *pfX = 0, *d_pfX = 0, *d_pfY = 0, *h_pfY = 0;
    cublasOperation_t cubTrans = CUBLAS_OP_T;
    float fAlpha = 0.2f;
    float fBeta  = 0.3f;
    int siIncx = 1; // need to be configured when used.

    // 1.3 cublas init
    cubStatus = cublasCreate(&cubHandle);CUBLAS_ERROR_CHECK(cubStatus)

    // 1.4 memory allocation
    pfA = (float*)malloc(siM*siN*sizeof(float));
    cudaMalloc((void**)&d_pfA, siM*siN*sizeof(float));
    if (CUBLAS_OP_N == cubTrans) {
        pfX   = (float*)malloc(siN*sizeof(float));
        pfY   = (float*)malloc(siM*sizeof(float));
        h_pfY = (float*)malloc(siM*sizeof(float));
        cudaMalloc((void**)&d_pfX, siN*sizeof(float));
        cudaMalloc((void**)&d_pfY, siM*sizeof(float));
    } else {
        pfX   = (float*)malloc(siM*sizeof(float));
        pfY   = (float*)malloc(siN*sizeof(float));
        h_pfY = (float*)malloc(siN*sizeof(float));
        cudaMalloc((void**)&d_pfX, siM*sizeof(float));
        cudaMalloc((void**)&d_pfY, siN*sizeof(float));
    }

    // 1.5 initialize CPU memory
    InitMemValue(pfA, siM*siN, 0.0f, 1.0f);
    if (CUBLAS_OP_N == cubTrans) {
        InitMemValue(pfX, siN, 0.0f, 1.0f);
        InitMemValue(pfY, siM, 0.0f, 1.0f);
    } else {
        InitMemValue(pfX, siM, 0.0f, 1.0f);
        InitMemValue(pfY, siN, 0.0f, 1.0f);
    }

    // 1.6 Initialize GPU memory: the row-major siM x siN host matrix is
    // described to CUBLAS as a column-major siN x siM matrix (lda = siN).
    cubStatus = cublasSetMatrix(siN, siM, sizeof(float), pfA, siN, d_pfA, siN);CUBLAS_ERROR_CHECK(cubStatus)
    if (CUBLAS_OP_N == cubTrans) {
        cubStatus = cublasSetVector(siN, sizeof(float), pfX, 1, d_pfX, 1);CUBLAS_ERROR_CHECK(cubStatus)
        cubStatus = cublasSetVector(siM, sizeof(float), pfY, 1, d_pfY, 1);CUBLAS_ERROR_CHECK(cubStatus)
    } else {
        cubStatus = cublasSetVector(siM, sizeof(float), pfX, 1, d_pfX, 1);CUBLAS_ERROR_CHECK(cubStatus)
        cubStatus = cublasSetVector(siN, sizeof(float), pfY, 1, d_pfY, 1);CUBLAS_ERROR_CHECK(cubStatus)
    }

    // 1.7 Invoke CUBLAS: flip the transpose flag and swap the dimensions so
    // that the column-major view still performs the intended operation.
    //dfTimer_device = getTimeInMicroSecs();
    cubStatus = cublasSgemv(cubHandle, cubTrans==CUBLAS_OP_N?CUBLAS_OP_T:CUBLAS_OP_N,
                            siN, siM, &fAlpha, d_pfA, siN,
                            d_pfX, 1, &fBeta, d_pfY, 1);CUBLAS_ERROR_CHECK(cubStatus)
    //dfTimer_device = getTimeInMicroSecs() - dfTimer_device;

    // 1.8 Get result
    if (CUBLAS_OP_N == cubTrans) {
        cubStatus = cublasGetVector(siM, sizeof(float), d_pfY, 1, h_pfY, 1);
    } else {
        cubStatus = cublasGetVector(siN, sizeof(float), d_pfY, 1, h_pfY, 1);
    }

    // 1.9 invoke cpu version and compare
    mySgemv(cubTrans, pfA, siM, siN, pfX, pfY, fAlpha, fBeta);
    printf("Sum of error is %f\n", computeError(CUBLAS_OP_N==cubTrans?siM:siN, pfY, h_pfY));

    // last: release
    cudaFree(d_pfA); cudaFree(d_pfX); cudaFree(d_pfY);
    free(pfA); free(pfX); free(pfY); free(h_pfY);
    cubStatus = cublasDestroy(cubHandle);CUBLAS_ERROR_CHECK(cubStatus)
    return 0;
}
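If SGEMV has to be called on row-major data in more than one place, the dimension swap and flag flip can be hidden behind a small helper. Here is a minimal sketch, assuming cublas_v2.h is included and the matrix already lives on the device; the wrapper name sgemvRowMajor is made up and not part of CUBLAS:

/* y = alpha*op(A)*x + beta*y for a rows x cols matrix A stored row-major.
 * CUBLAS assumes column-major storage, so the same bytes describe a
 * cols x rows matrix holding A^T; swapping the dimensions and flipping
 * the transpose flag restores the intended operation. */
static cublasStatus_t sgemvRowMajor(cublasHandle_t handle, cublasOperation_t trans,
                                    int rows, int cols, const float *alpha,
                                    const float *d_A, const float *d_x, int incx,
                                    const float *beta, float *d_y, int incy)
{
    cublasOperation_t t = (trans == CUBLAS_OP_N) ? CUBLAS_OP_T : CUBLAS_OP_N;
    return cublasSgemv(handle, t, cols, rows, alpha, d_A, cols,
                       d_x, incx, beta, d_y, incy);
}

With this helper, the call in step 1.7 of the listing would reduce to sgemvRowMajor(cubHandle, cubTrans, siM, siN, &fAlpha, d_pfA, d_pfX, 1, &fBeta, d_pfY, 1).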