The cuSPARSE Library in CUDA

VS2013 + CUDA 7.5

Project setup

1. Create a new CUDA project


Select the x64 platform, and pick Debug or Release mode as needed.


2. Project Properties -> Configuration Properties -> VC++ Directories


  Set the Include Directories.


  Set the Library Directories.


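With a default CUDA 7.5 installation, the two directories typically point at the toolkit paths below. This is a sketch assuming the installer's standard CUDA_PATH_V7_5 environment variable; adjust it if your toolkit lives elsewhere:

  Include Directories: $(CUDA_PATH_V7_5)\include
  Library Directories: $(CUDA_PATH_V7_5)\lib\$(Platform)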

3. Project Properties -> Configuration Properties -> Linker

General -> Additional Library Directories: $(CUDA_PATH_V7_5)\lib\$(Platform)


  Input -> Additional Dependencies (a note on which of these are actually required follows the list):
  cublas.lib
  cublas_device.lib
  cuda.lib
  cudadevrt.lib
  cudart.lib
  cudart_static.lib
  cufft.lib
  cufftw.lib
  curand.lib
  cusolver.lib
  cusparse.lib
  nppc.lib
  nppi.lib
  npps.lib
  nvblas.lib
  nvcuvid.lib
  nvrtc.lib
  OpenCL.lib
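
Note: listing every CUDA library here is harmless but not required; this particular example only links against cusparse.lib and cudart.lib. As a minimal alternative (an MSVC-specific sketch, not part of the original sample), the same dependencies can also be declared from source, leaving the project settings untouched:

  /* MSVC-only: equivalent to adding the libraries under
     Linker -> Input -> Additional Dependencies */
  #pragma comment(lib, "cudart.lib")
  #pragma comment(lib, "cusparse.lib")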


4. Set the local Windows debugger to x64

Because cusparse.lib ships only as a 64-bit library, the program must be built and run in x64 mode; otherwise the build fails, typically with linker error LNK1112 (module machine type 'x64' conflicts with target machine type 'X86').


To fix it, switch the active solution platform to x64 in the Configuration Manager.


Experimental result: the program prints the input data, the intermediate vector y, and the final matrix z, and finishes with "example test PASSED". The complete source is below.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "cusparse.h"

/* Free every host and device allocation that may have been made so far,
   release the cuSPARSE descriptor and handle, and reset the device. */
#define CLEANUP(s)                                                \
do {                                                              \
    printf("%s\n", s);                                            \
    if (yHostPtr)           free(yHostPtr);                       \
    if (zHostPtr)           free(zHostPtr);                       \
    if (xIndHostPtr)        free(xIndHostPtr);                    \
    if (xValHostPtr)        free(xValHostPtr);                    \
    if (cooRowIndexHostPtr) free(cooRowIndexHostPtr);             \
    if (cooColIndexHostPtr) free(cooColIndexHostPtr);             \
    if (cooValHostPtr)      free(cooValHostPtr);                  \
    if (y)                  cudaFree(y);                          \
    if (z)                  cudaFree(z);                          \
    if (xInd)               cudaFree(xInd);                       \
    if (xVal)               cudaFree(xVal);                       \
    if (csrRowPtr)          cudaFree(csrRowPtr);                  \
    if (cooRowIndex)        cudaFree(cooRowIndex);                \
    if (cooColIndex)        cudaFree(cooColIndex);                \
    if (cooVal)             cudaFree(cooVal);                     \
    if (descr)              cusparseDestroyMatDescr(descr);       \
    if (handle)             cusparseDestroy(handle);              \
    cudaDeviceReset();                                            \
    fflush(stdout);                                               \
} while (0)

int main(){
        cudaError_t cudaStat1, cudaStat2, cudaStat3, cudaStat4, cudaStat5, cudaStat6;
        cusparseStatus_t status;
        cusparseHandle_t handle = 0;
        cusparseMatDescr_t descr = 0;

        int *    cooRowIndexHostPtr = 0;
        int *    cooColIndexHostPtr = 0;
        double * cooValHostPtr = 0;

        int *    cooRowIndex = 0;
        int *    cooColIndex = 0;
        double * cooVal = 0;

        int *    xIndHostPtr = 0;
        double * xValHostPtr = 0;
        double * yHostPtr = 0;
        int *    xInd = 0;
        double * xVal = 0;
        double * y = 0;
        int *    csrRowPtr = 0;
        double * zHostPtr = 0;
        double * z = 0;
        int      n, nnz, nnz_vector;
        double dzero = 0.0;
        double dtwo = 2.0;
        double dthree = 3.0;
        double dfive = 5.0;
        printf("testing example\n");
        /* create the following sparse test matrix in COO format */
        /*
        |1.0     2.0 3.0|
        |    4.0        |
        |5.0     6.0 7.0|
        |    8.0     9.0| */
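        // COO stores one (row, column, value) triplet per nonzero; the nine triplets below are listed row by row.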
        n = 4; nnz = 9;
        cooRowIndexHostPtr = (int *)malloc(nnz*sizeof(cooRowIndexHostPtr[0]));
        cooColIndexHostPtr = (int *)malloc(nnz*sizeof(cooColIndexHostPtr[0]));
        cooValHostPtr = (double *)malloc(nnz*sizeof(cooValHostPtr[0]));
        if ((!cooRowIndexHostPtr) || (!cooColIndexHostPtr) || (!cooValHostPtr)){
            CLEANUP("Host malloc failed (matrix)");
            return 1;
        }
        cooRowIndexHostPtr[0] = 0; cooColIndexHostPtr[0] = 0; cooValHostPtr[0] = 1.0;
        cooRowIndexHostPtr[1] = 0; cooColIndexHostPtr[1] = 2; cooValHostPtr[1] = 2.0;
        cooRowIndexHostPtr[2] = 0; cooColIndexHostPtr[2] = 3; cooValHostPtr[2] = 3.0;
        cooRowIndexHostPtr[3] = 1; cooColIndexHostPtr[3] = 1; cooValHostPtr[3] = 4.0;
        cooRowIndexHostPtr[4] = 2; cooColIndexHostPtr[4] = 0; cooValHostPtr[4] = 5.0;
        cooRowIndexHostPtr[5] = 2; cooColIndexHostPtr[5] = 2; cooValHostPtr[5] = 6.0;
        cooRowIndexHostPtr[6] = 2; cooColIndexHostPtr[6] = 3; cooValHostPtr[6] = 7.0;
        cooRowIndexHostPtr[7] = 3; cooColIndexHostPtr[7] = 1; cooValHostPtr[7] = 8.0;
        cooRowIndexHostPtr[8] = 3; cooColIndexHostPtr[8] = 3; cooValHostPtr[8] = 9.0;
        //*
        printf("Input data:\n");
        for (int i = 0; i < nnz; i++){
            printf("cooRowIndexHostPtr[%d]=%d  ", i, cooRowIndexHostPtr[i]);
            printf("cooColIndexHostPtr[%d]=%d  ", i, cooColIndexHostPtr[i]);
            printf("cooValHostPtr[%d]=%f     \n", i, cooValHostPtr[i]);
        }
        //*/


        /* create a sparse and dense vector */
        /*
        xVal= [100.0 200.0 400.0]   (sparse)
        xInd= [0     1     3    ]
        y = [10.0 20.0 30.0 40.0 | 50.0 60.0 70.0 80.0] (dense)
        */
        nnz_vector = 3;
        xIndHostPtr = (int *)malloc(nnz_vector*sizeof(xIndHostPtr[0]));
        xValHostPtr = (double *)malloc(nnz_vector*sizeof(xValHostPtr[0]));
        yHostPtr = (double *)malloc(2 * n*sizeof(yHostPtr[0]));
        zHostPtr = (double *)malloc(2 * (n + 1)*sizeof(zHostPtr[0]));
        if ((!xIndHostPtr) || (!xValHostPtr) || (!yHostPtr) || (!zHostPtr)){
            CLEANUP("Host malloc failed (vectors)");
            return 1;
        }
        yHostPtr[0] = 10.0; xIndHostPtr[0] = 0; xValHostPtr[0] = 100.0;
        yHostPtr[1] = 20.0; xIndHostPtr[1] = 1; xValHostPtr[1] = 200.0;
        yHostPtr[2] = 30.0;
        yHostPtr[3] = 40.0; xIndHostPtr[2] = 3; xValHostPtr[2] = 400.0;
        yHostPtr[4] = 50.0;
        yHostPtr[5] = 60.0;
        yHostPtr[6] = 70.0;
        yHostPtr[7] = 80.0;
        // print the x and y vectors
        for (int j = 0; j < 2; j++){
            for (int i = 0; i < n; i++){
                printf("yHostPtr[%d,%d]=%f\n", i, j, yHostPtr[i + n*j]);
            }
        }
        for (int i = 0; i < nnz_vector; i++){
            printf("xIndHostPtr[%d]=%d ", i, xIndHostPtr[i]);
            printf("xValHostPtr[%d]=%f\n", i, xValHostPtr[i]);
        }

        /* allocate GPU memory and copy the matrix and vectors into it */
        cudaStat1 = cudaMalloc((void**)&cooRowIndex, nnz*sizeof(cooRowIndex[0]));
        cudaStat2 = cudaMalloc((void**)&cooColIndex, nnz*sizeof(cooColIndex[0]));
        cudaStat3 = cudaMalloc((void**)&cooVal, nnz*sizeof(cooVal[0]));
        cudaStat4 = cudaMalloc((void**)&y, 2 * n*sizeof(y[0]));
        cudaStat5 = cudaMalloc((void**)&xInd, nnz_vector*sizeof(xInd[0]));
        cudaStat6 = cudaMalloc((void**)&xVal, nnz_vector*sizeof(xVal[0]));
        if ((cudaStat1 != cudaSuccess) || (cudaStat2 != cudaSuccess)
            || (cudaStat3 != cudaSuccess) || (cudaStat4 != cudaSuccess)
            || (cudaStat5 != cudaSuccess) || (cudaStat6 != cudaSuccess)) {
            CLEANUP("Device malloc failed");
            return 1;
        }
        cudaStat1 = cudaMemcpy(cooRowIndex, cooRowIndexHostPtr,
            (size_t)(nnz*sizeof(cooRowIndex[0])),
            cudaMemcpyHostToDevice);
        cudaStat2 = cudaMemcpy(cooColIndex, cooColIndexHostPtr,
            (size_t)(nnz*sizeof(cooColIndexHostPtr[0])),
            cudaMemcpyHostToDevice);
        cudaStat3 = cudaMemcpy(cooVal, cooValHostPtr,
            (size_t)(nnz*sizeof(cooValHostPtr[0])),
            cudaMemcpyHostToDevice);
        cudaStat4 = cudaMemcpy(y, yHostPtr,
            (size_t)(2 * n*sizeof(y[0])),
            cudaMemcpyHostToDevice);
        cudaStat5 = cudaMemcpy(xInd, xIndHostPtr,
            (size_t)(nnz_vector*sizeof(xInd[0])),
            cudaMemcpyHostToDevice);
        cudaStat6 = cudaMemcpy(xVal, xValHostPtr,
            (size_t)(nnz_vector*sizeof(xVal[0])),
            cudaMemcpyHostToDevice);

        if ((cudaStat1 != cudaSuccess) ||
            (cudaStat2 != cudaSuccess) ||
            (cudaStat3 != cudaSuccess) ||
            (cudaStat4 != cudaSuccess) ||
            (cudaStat5 != cudaSuccess) ||
            (cudaStat6 != cudaSuccess)) {
            CLEANUP("Memcpy from Host to Device failed");
            return 1;
        }

        /* initialize cusparse library */
        status = cusparseCreate(&handle);
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("CUSPARSE Library initialization failed");
            return 1;
        }
        /* create and setup matrix descriptor */
        status = cusparseCreateMatDescr(&descr);
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Matrix descriptor initialization failed");
            return 1;
        }
        cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
        cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
        /* exercise conversion routines (convert matrix from COO 2 CSR format) */
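        // Only the row information changes representation: CSR replaces the nnz per-element
        // row indices with n+1 row offsets, while the column indices and values
        // (cooColIndex, cooVal) are reused unchanged.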
        cudaStat1 = cudaMalloc((void**)&csrRowPtr, (n + 1)*sizeof(csrRowPtr[0]));
        if (cudaStat1 != cudaSuccess) {
            CLEANUP("Device malloc failed (csrRowPtr)");
            return 1;
        }
        status = cusparseXcoo2csr(handle, cooRowIndex, nnz, n,
            csrRowPtr, CUSPARSE_INDEX_BASE_ZERO);
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Conversion from COO to CSR format failed");
            return 1;
        }
        //csrRowPtr = [0 3 4 7 9] 
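        // Row i occupies entries csrRowPtr[i] .. csrRowPtr[i+1]-1, i.e. the four rows hold 3, 1, 3 and 2 nonzeros.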
        // The following test only works for compute capability 1.3 and above 
        // because it needs double precision.
        int devId;
        cudaDeviceProp prop; // filled in by cudaGetDeviceProperties: device name, memory size, maximum thread counts, clock rate, compute capability, and so on
        cudaError_t cudaStat;
        cudaStat = cudaGetDevice(&devId);
        if (cudaStat != cudaSuccess){
            CLEANUP("cudaGetDevice failed");
            printf("Error: cudaStat %d, %s\n", cudaStat, cudaGetErrorString(cudaStat));
            return 1;
        }
        cudaStat = cudaGetDeviceProperties(&prop, devId);
        if (cudaStat != cudaSuccess){
            CLEANUP("cudaGetDeviceProperties failed");
            printf("Error: cudaStat %d, %s\n", cudaStat, cudaGetErrorString(cudaStat));
            return 1;
        }
        int cc = 100 * prop.major + 10 * prop.minor; // prop.major/prop.minor are the major and minor revision numbers of the device's compute capability
        if (cc < 130){
            CLEANUP("waive the test because only sm13 and above are supported\n");
            printf("the device has compute capability %d\n", cc);
            printf("example test WAIVED");
            return 2;
        }

        /* exercise Level 1 routines (scatter vector elements) */
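        // cusparseDsctr scatters the sparse vector into the second half of y:
        // y[n + xInd[i]] = xVal[i] for each of the nnz_vector entries.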
        status = cusparseDsctr(handle, nnz_vector, xVal, xInd,
            &y[n], CUSPARSE_INDEX_BASE_ZERO);

        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Scatter from sparse to dense vector failed");
            return 1;
        }
        //y = [10 20 30 40 | 100 200 70 400]


        /* exercise Level 2 routines (csrmv) */
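        // Computes y[n..2n-1] = dtwo * A * y[0..n-1] + dthree * y[n..2n-1],
        // i.e. alpha = 2 and beta = 3 in the usual y' = alpha*A*x + beta*y form.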
        status = cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, n, nnz,
            &dtwo, descr, cooVal, csrRowPtr, cooColIndex,
            &y[0], &dthree, &y[n]);

        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Matrix-vector multiplication failed");
            return 1;
        }
        //y = [10 20 30 40 | 680 760 1230 2240]
        cudaMemcpy(yHostPtr, y, (size_t)(2 * n*sizeof(y[0])), cudaMemcpyDeviceToHost);
        printf("Intermediate results:\n");
        for (int j = 0; j < 2; j++){
            for (int i = 0; i < n; i++){
                printf("yHostPtr[%d,%d]=%f\n", i, j, yHostPtr[i + n*j]);
            }
        }


        /* exercise Level 3 routines (csrmm) */
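        // y is treated as a dense n-by-2 matrix with leading dimension ldb = n, and
        // z = dfive * A * y + dzero * z is computed with ldc = n + 1; the extra row
        // is why each column of the expected z carries a trailing zero.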
        cudaStat1 = cudaMalloc((void**)&z, 2 * (n + 1)*sizeof(z[0]));
        if (cudaStat1 != cudaSuccess) {
            CLEANUP("Device malloc failed(z)");
            return 1;
        }
        cudaStat1 = cudaMemset((void *)z, 0, 2 * (n + 1)*sizeof(z[0]));
        if (cudaStat1 != cudaSuccess) {
            CLEANUP("Memset on Device failed");
            return 1;
        }
        status = cusparseDcsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, 2, n, nnz,
            &dfive, descr, cooVal, csrRowPtr, cooColIndex,
            y, n, &dzero, z, n + 1);
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Matrix-matrix multiplication failed");
            return 1;
        }
        //z = [950 400 2550 2600 0 | 49300 15200 132300 131200 0]
        /* print final results (z) */
        cudaStat1 = cudaMemcpy(zHostPtr, z,
            (size_t)(2 * (n + 1)*sizeof(z[0])),
            cudaMemcpyDeviceToHost);
        if (cudaStat1 != cudaSuccess) {
            CLEANUP("Memcpy from Device to Host failed");
            return 1;
        }
        printf("Final results:\n");
        for (int j = 0; j < 2; j++){
            for (int i = 0; i < n + 1; i++){
                printf("z[%d,%d]=%f\n", i, j, zHostPtr[i + (n + 1)*j]);
            }
        }
        /* destroy matrix descriptor */
        status = cusparseDestroyMatDescr(descr);
        descr = 0;
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("Matrix descriptor destruction failed");
            return 1;
        }
        /* destroy handle */
        status = cusparseDestroy(handle);
        handle = 0;
        if (status != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("CUSPARSE Library release of resources failed");
            return 1;
        }


        /* check the results */
        /* Notice that CLEANUP() contains a call to cusparseDestroy(handle) */
        if ((zHostPtr[0] != 950.0) ||
            (zHostPtr[1] != 400.0) ||
            (zHostPtr[2] != 2550.0) ||
            (zHostPtr[3] != 2600.0) ||
            (zHostPtr[4] != 0.0) ||
            (zHostPtr[5] != 49300.0) ||
            (zHostPtr[6] != 15200.0) ||
            (zHostPtr[7] != 132300.0) ||
            (zHostPtr[8] != 131200.0) ||
            (zHostPtr[9] != 0.0) ||
            (yHostPtr[0] != 10.0) ||
            (yHostPtr[1] != 20.0) ||
            (yHostPtr[2] != 30.0) ||
            (yHostPtr[3] != 40.0) ||
            (yHostPtr[4] != 680.0) ||
            (yHostPtr[5] != 760.0) ||
            (yHostPtr[6] != 1230.0) ||
            (yHostPtr[7] != 2240.0)){
            CLEANUP("example test FAILED");
            return 1;
        }
        else{
            CLEANUP("example test PASSED");
            //return 0; 
        }
        getchar(); // pause so the console window stays open
}
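
Two closing notes. First, this listing is tied to older toolkits: in CUDA 11 and later, the cusparseDcsrmv and cusparseDcsrmm entry points used above were removed in favor of the generic cusparseSpMV/cusparseSpMM API, so the code compiles as-is only against releases such as 7.5. Second, the repeated status checks can be folded into a small wrapper; the macro below is only a sketch of such a helper (CUSPARSE_CHECK is my own name, not part of the library):

/* Sketch of an error-checking helper (not part of cuSPARSE): print the
   failing status with its location and abort. */
#define CUSPARSE_CHECK(call)                                  \
do {                                                          \
    cusparseStatus_t s_ = (call);                             \
    if (s_ != CUSPARSE_STATUS_SUCCESS) {                      \
        fprintf(stderr, "cuSPARSE error %d at %s:%d\n",       \
                (int)s_, __FILE__, __LINE__);                 \
        exit(EXIT_FAILURE);                                   \
    }                                                         \
} while (0)

/* usage: CUSPARSE_CHECK(cusparseCreate(&handle)); */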
