1. 矩阵相乘再加 C = a*A*B + b*C
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include
#include
using namespace std;
int const M = 6;
int const N = 10;
int main()
{
cublasStatus_t status;
//Host memory malloc
float *h_A = (float*)malloc(N*M*sizeof(float));
float *h_B = (float*)malloc(N*M*sizeof(float));
float *h_C = (float*)malloc(M*M*sizeof(float));
float *h_C_cpu = (float*)malloc(M*M*sizeof(float));
memset(h_C_cpu,0,M*M*sizeof(float));
//Initialize and print
for (int i=0; i
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
/* This example demonstrates how to use the CUBLAS library
* by scaling an array of floating-point values on the device
* and comparing the result to the same operation performed
* on the host.
*/
/* Includes, system */
#include
#include
#include
#include
#include "time.h"
/* Includes, cuda */
#include
#include
#include
using namespace std;
/* Matrix size */
#define N (600)
/* Host implementation of a simple version of sgemm */
static void simple_sgemm(int n, float alpha, const float *A, const float *B,
float beta, float *C)
{
int i;
int j;
int k;
for (i = 0; i < n; ++i)
{
for (j = 0; j < n; ++j)
{
float prod = 0;
for (k = 0; k < n; ++k)
{
prod += A[k * n + i] * B[j * n + k];
}
C[j * n + i] = alpha * prod + beta * C[j * n + i];
}
}
}
/* Main */
int main(int argc, char **argv)
{
cublasStatus_t status;
float *h_A;
float *h_B;
float *h_C;
float *h_C_ref;
float *d_A = 0;
float *d_B = 0;
float *d_C = 0;
float alpha = 1.0f;
float beta = 0.0f;
int n2 = N * N;
int i;
float error_norm;
float ref_norm;
float diff;
cublasHandle_t handle;
clock_t cuStart,cuFinish;
clock_t start,finish;
int dev = findCudaDevice(argc, (const char **) argv);
if (dev == -1)
{
return EXIT_FAILURE;
}
/* Initialize CUBLAS */
printf("simpleCUBLAS test running..\n");
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
/* Allocate host memory for the matrices */
h_A = (float *)malloc(n2 * sizeof(h_A[0]));
if (h_A == 0)
{
fprintf(stderr, "!!!! host memory allocation error (A)\n");
return EXIT_FAILURE;
}
h_B = (float *)malloc(n2 * sizeof(h_B[0]));
if (h_B == 0)
{
fprintf(stderr, "!!!! host memory allocation error (B)\n");
return EXIT_FAILURE;
}
h_C = (float *)malloc(n2 * sizeof(h_C[0]));
if (h_C == 0)
{
fprintf(stderr, "!!!! host memory allocation error (C)\n");
return EXIT_FAILURE;
}
/* Fill the matrices with test data */
for (i = 0; i < n2; i++)
{
h_A[i] = rand() / (float)RAND_MAX;
h_B[i] = rand() / (float)RAND_MAX;
h_C[i] = rand() / (float)RAND_MAX;
}
cuStart = clock();
/* Allocate device memory for the matrices */
if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
{
fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
return EXIT_FAILURE;
}
if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
{
fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
return EXIT_FAILURE;
}
if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
{
fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
return EXIT_FAILURE;
}
/* Initialize the device matrices with the host matrices */
status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write A)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write B)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (write C)\n");
return EXIT_FAILURE;
}
/* Performs operation using cublas */
status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
/* Allocate host memory for reading back the result from device memory */
h_C_ref = (float *)malloc(n2 * sizeof(h_C[0]));
if (h_C_ref == 0)
{
fprintf(stderr, "!!!! host memory allocation error (C)\n");
return EXIT_FAILURE;
}
/* Read the result back */
status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C_ref, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
cuFinish = clock();
start = clock();
/* Performs operation using plain C code */
simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
finish = clock();
cout<<"GPU TIME:"<<(double)(cuFinish - cuStart)/CLOCKS_PER_SEC<