#ifndef _COMMON_H
#define _COMMON_H

#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#define CHECK(call)                                                         \
{                                                                           \
    const hipError_t error = call;                                          \
    if (error != hipSuccess)                                                \
    {                                                                       \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);              \
        fprintf(stderr, "code: %d, reason: %s\n", error,                    \
                hipGetErrorString(error));                                  \
    }                                                                       \
}

#define CHECK_CUBLAS(call)                                                  \
{                                                                           \
    hipblasStatus_t err;                                                    \
    if ((err = (call)) != HIPBLAS_STATUS_SUCCESS)                           \
    {                                                                       \
        fprintf(stderr, "Got hipBLAS error %d at %s:%d\n", err, __FILE__,   \
                __LINE__);                                                  \
        exit(1);                                                            \
    }                                                                       \
}

#define CHECK_CURAND(call)                                                  \
{                                                                           \
    hiprandStatus_t err;                                                    \
    if ((err = (call)) != HIPRAND_STATUS_SUCCESS)                           \
    {                                                                       \
        fprintf(stderr, "Got hipRAND error %d at %s:%d\n", err, __FILE__,   \
                __LINE__);                                                  \
        exit(1);                                                            \
    }                                                                       \
}

#define CHECK_CUFFT(call)                                                   \
{                                                                           \
    hipfftResult err;                                                       \
    if ((err = (call)) != HIPFFT_SUCCESS)                                   \
    {                                                                       \
        fprintf(stderr, "Got hipFFT error %d at %s:%d\n", err, __FILE__,    \
                __LINE__);                                                  \
        exit(1);                                                            \
    }                                                                       \
}

#define CHECK_CUSPARSE(call)                                                \
{                                                                           \
    hipsparseStatus_t err;                                                  \
    if ((err = (call)) != HIPSPARSE_STATUS_SUCCESS)                         \
    {                                                                       \
        fprintf(stderr, "Got hipSPARSE error %d at %s:%d\n", err, __FILE__, \
                __LINE__);                                                  \
        hipError_t hip_err = hipGetLastError();                             \
        if (hip_err != hipSuccess)                                          \
        {                                                                   \
            fprintf(stderr, "    HIP error \"%s\" also detected\n",         \
                    hipGetErrorString(hip_err));                            \
        }                                                                   \
        exit(1);                                                            \
    }                                                                       \
}
inline double seconds()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
#endif // _COMMON_H
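A quick illustration (not part of the original files) of how the CHECK macro and seconds() are meant to be used together; the buffer size here is arbitrary:

#include "common.h"
#include <stdio.h>

int main(void)
{
    float *d_buf = NULL;
    size_t nbytes = 1 << 20;

    double start = seconds();                   // host wall-clock timer
    CHECK(hipMalloc((void **)&d_buf, nbytes));  // prints file/line on failure
    CHECK(hipMemset(d_buf, 0, nbytes));
    CHECK(hipDeviceSynchronize());              // make the timing meaningful
    printf("setup took %8.2f ms\n", (seconds() - start) * 1000.0);

    CHECK(hipFree(d_buf));
    return 0;
}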
Makefile
SOURCE = $(wildcard *.cpp)
OBJS   = $(patsubst %.cpp,%,$(SOURCE))
HIPCC  = /opt/rocm/bin/hipcc
GCC    = /opt/rh/devtoolset-7/root/usr/bin/gcc

all: $(OBJS)

$(OBJS): %: %.cpp
	$(HIPCC) $^ -o $@

run:
	./matrixMaxDCU 2 4 4 4 1

clean:
	-rm $(OBJS)
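With this Makefile the example is driven by two targets (the hipcc and gcc paths above are site-specific and may need adjusting for another installation):

make        # compile every *.cpp in the directory with hipcc
make run    # ./matrixMaxDCU 2 4 4 4 1  (2 GPUs, 4 x 4 x 4 matrices, printing enabled)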
matrixMaxDCU.cpp
#include "common.h"
#include
#include
#include
#define BSIZE 2
#define BLOCK_SIZE 2
// .filename 2 4 4 4 0
// ./filename ngpus m n k 0/1 是否打印
// Naive square (n x n) matrix multiply; kept for reference, not launched in main().
__global__ void multiplicateMatrixOnDevice(float *array_A, float *array_B, float *array_C, int n)
{
    int ix = threadIdx.x + blockDim.x * blockIdx.x;    // column index
    int iy = threadIdx.y + blockDim.y * blockIdx.y;    // row index

    if (ix < n && iy < n)
    {
        double sum = 0;
        for (int k = 0; k < n; k++)
        {
            sum += array_A[iy * n + k] * array_B[k * n + ix];
        }
        array_C[iy * n + ix] = sum;
    }
}
__global__ void matrixMultiplyShared(float *A, float *B, float *C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns)
{
    __shared__ double sharedM[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ double sharedN[BLOCK_SIZE][BLOCK_SIZE];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int row = by * BLOCK_SIZE + ty;
    int col = bx * BLOCK_SIZE + tx;

    float Csub = 0.0;

    // march over the K dimension one BLOCK_SIZE x BLOCK_SIZE tile at a time
    for (int i = 0; i < (int)(ceil((double)numAColumns / BLOCK_SIZE)); i++)
    {
        if (i * BLOCK_SIZE + tx < numAColumns && row < numARows)
            sharedM[ty][tx] = A[row * numAColumns + i * BLOCK_SIZE + tx];
        else
            sharedM[ty][tx] = 0.0;

        if (i * BLOCK_SIZE + ty < numBRows && col < numBColumns)
            sharedN[ty][tx] = B[(i * BLOCK_SIZE + ty) * numBColumns + col];
        else
            sharedN[ty][tx] = 0.0;

        __syncthreads();

        for (int j = 0; j < BLOCK_SIZE; j++)
        {
            Csub += sharedM[ty][j] * sharedN[j][tx];
        }

        __syncthreads();
    }

    if (row < numCRows && col < numCColumns)
        C[row * numCColumns + col] = Csub;
}
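// Launch-geometry note (added for clarity, names illustrative): each thread
// produces one element of C, so grid.x must cover the columns of C
// (numCColumns) and grid.y its rows (numCRows), and the block must be
// BLOCK_SIZE x BLOCK_SIZE to match the shared-memory tiles. A minimal
// single-device, default-stream launch for an m x k by k x n product:
//
//     dim3 block(BLOCK_SIZE, BLOCK_SIZE);
//     dim3 grid((n + BLOCK_SIZE - 1) / BLOCK_SIZE,
//               (m + BLOCK_SIZE - 1) / BLOCK_SIZE);
//     hipLaunchKernelGGL(matrixMultiplyShared, grid, block, 0, 0,
//                        d_A, d_B, d_C, m, k, k, n, m, n);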
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = true;
    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = false;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n");
}
void initialData(float * const ip, int const size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() % 10 + 1);
    }
}
void printMatrix(const char *arrayname, float *array, int row, int col)
{
    printf("%s\n", arrayname);
    float *p = array;
    for (int y = 0; y < row; y++)
    {
        for (int x = 0; x < col; x++)
        {
            printf("%12lf", p[x]);
        }
        p = p + col;
        printf("\n");
    }
    printf("\n");
}

void printData(float * const ip, int const size)
{
    for (int i = 0; i < size * size; i++)
    {
        printf("%12lf", ip[i]);
    }
    printf("\n");
}
void multiplicateMatrixOnHost(float *array_A, float *array_B, float *array_C, int M_p, int K_p, int N_p)
{
    for (int i = 0; i < M_p; i++)
    {
        for (int j = 0; j < N_p; j++)
        {
            float sum = 0;
            for (int k = 0; k < K_p; k++)
            {
                sum += array_A[i * K_p + k] * array_B[k * N_p + j];
            }
            array_C[i * N_p + j] = sum;
        }
    }
}
int main(int argc, char **argv)
{
    int ngpus;
    // defaults match the usage comment above: m x k times k x n, no printing
    int m = 4, k = 4, n = 4;
    int ifprint = 0;

    printf("> starting %s", argv[0]);
    CHECK(hipGetDeviceCount(&ngpus));
    printf(" HIP-capable devices: %i\n", ngpus);

    if (argc > 1)
    {
        if (atoi(argv[1]) > ngpus)
        {
            fprintf(stderr, "Invalid number of GPUs specified: %d is greater "
                    "than the total number of GPUs in this platform (%d)\n",
                    atoi(argv[1]), ngpus);
            exit(1);
        }
        ngpus = atoi(argv[1]);
    }
    if (argc > 2) m = atoi(argv[2]);
    if (argc > 3) k = atoi(argv[3]);
    if (argc > 4) n = atoi(argv[4]);
    if (argc > 5) ifprint = atoi(argv[5]);

    int Axy = m * k;
    int Bxy = k * n;
    int Cxy = m * n;
    // allocate host arrays of per-device pointers
    float **d_A = (float **)malloc(sizeof(float *) * ngpus);
    float **d_B = (float **)malloc(sizeof(float *) * ngpus);
    float **d_C = (float **)malloc(sizeof(float *) * ngpus);
    float **h_A = (float **)malloc(sizeof(float *) * ngpus);
    float **h_B = (float **)malloc(sizeof(float *) * ngpus);
    float **hostRef = (float **)malloc(sizeof(float *) * ngpus);
    float **gpuRef = (float **)malloc(sizeof(float *) * ngpus);
    hipStream_t *stream = (hipStream_t *)malloc(sizeof(hipStream_t) * ngpus);

    for (int i = 0; i < ngpus; i++)
    {
        // set current device
        CHECK(hipSetDevice(i));

        // allocate device memory
        CHECK(hipMalloc((void **)&d_A[i], Axy * sizeof(float)));
        CHECK(hipMalloc((void **)&d_B[i], Bxy * sizeof(float)));
        CHECK(hipMalloc((void **)&d_C[i], Cxy * sizeof(float)));

        // allocate page-locked host memory for asynchronous data transfer
        CHECK(hipHostMalloc((void **)&h_A[i], Axy * sizeof(float)));
        CHECK(hipHostMalloc((void **)&h_B[i], Bxy * sizeof(float)));
        CHECK(hipHostMalloc((void **)&hostRef[i], Cxy * sizeof(float)));
        CHECK(hipHostMalloc((void **)&gpuRef[i], Cxy * sizeof(float)));

        // create streams for timing and synchronizing
        CHECK(hipStreamCreate(&stream[i]));
    }
    // thread-block and grid geometry: the block must be BLOCK_SIZE x BLOCK_SIZE,
    // grid.x covers the columns (n) of C and grid.y covers its rows (m)
    int dimx = BLOCK_SIZE;
    int dimy = BLOCK_SIZE;
    dim3 block(dimx, dimy);
    dim3 grid((n + block.x - 1) / block.x, (m + block.y - 1) / block.y);

    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipSetDevice(i));
        initialData(h_A[i], Axy);
        initialData(h_B[i], Bxy);
    }

    // compute the reference result on the host for each device's input
    for (int i = 0; i < ngpus; i++)
    {
        multiplicateMatrixOnHost(h_A[i], h_B[i], hostRef[i], m, k, n);
    }
    // record start time
    double iStart = seconds();

    // distribute the workload across multiple devices
    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipSetDevice(i));
        CHECK(hipMemcpyAsync(d_A[i], h_A[i], Axy * sizeof(float), hipMemcpyHostToDevice, stream[i]));
        CHECK(hipMemcpyAsync(d_B[i], h_B[i], Bxy * sizeof(float), hipMemcpyHostToDevice, stream[i]));

        // CUDA-style launch: matrixMultiplyShared<<<grid, block, 0, stream[i]>>>(d_A[i], d_B[i], d_C[i], m, k, k, n, m, n);
        hipLaunchKernelGGL(matrixMultiplyShared, grid, block, 0, stream[i],
                           d_A[i], d_B[i], d_C[i], m, k, k, n, m, n);

        CHECK(hipMemcpyAsync(gpuRef[i], d_C[i], Cxy * sizeof(float), hipMemcpyDeviceToHost,
                             stream[i]));
    }
    // synchronize streams so every device has finished before stopping the timer
    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipSetDevice(i));
        CHECK(hipStreamSynchronize(stream[i]));
    }

    // stop the timer before any host-side printing so it measures only the
    // transfers and kernel launches
    double iElaps = seconds() - iStart;
    printf("%d GPU timer elapsed: %8.2f ms\n", ngpus, iElaps * 1000.0);

    const char *array_A = "array_A";
    const char *array_B = "array_B";
    const char *array_hostRef = "array_hostRef";
    const char *array_gpuRef = "array_gpuRef";

    if (ifprint)
    {
        for (int i = 0; i < ngpus; i++)
        {
            CHECK(hipSetDevice(i));
            // printData(h_A[i], iSize);
            printMatrix(array_A, h_A[i], m, k);
            printMatrix(array_B, h_B[i], k, n);
            printMatrix(array_hostRef, hostRef[i], m, n);
            printMatrix(array_gpuRef, gpuRef[i], m, n);
        }
    }
    // compare each device's result against its host reference
    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipSetDevice(i));
        checkResult(hostRef[i], gpuRef[i], Cxy);
    }

    // release device memory, pinned host memory, and streams, then reset each device
    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipSetDevice(i));
        CHECK(hipFree(d_A[i]));
        CHECK(hipFree(d_B[i]));
        CHECK(hipFree(d_C[i]));
        CHECK(hipHostFree(h_A[i]));
        CHECK(hipHostFree(h_B[i]));
        CHECK(hipHostFree(hostRef[i]));
        CHECK(hipHostFree(gpuRef[i]));
        CHECK(hipStreamDestroy(stream[i]));
        CHECK(hipDeviceReset());
    }

    free(d_A);
    free(d_B);
    free(d_C);
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    free(stream);

    return 0;
}
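For cross-checking, the same product could also be computed with hipBLAS and compared against hostRef. The sketch below is not part of the original program: it assumes the row-major m x k and k x n inputs are already resident on the current device, reuses the CHECK_CUBLAS macro from common.h, and the hipblas.h include path may vary between ROCm versions.

#include "common.h"
#include <hipblas.h>

// Row-major C = A * B through a column-major BLAS: compute C^T = B^T * A^T,
// whose column-major storage is exactly the row-major C.
void gemmWithHipblas(const float *d_A, const float *d_B, float *d_C,
                     int m, int k, int n, hipStream_t stream)
{
    hipblasHandle_t handle;
    CHECK_CUBLAS(hipblasCreate(&handle));
    CHECK_CUBLAS(hipblasSetStream(handle, stream));

    const float alpha = 1.0f;
    const float beta  = 0.0f;
    CHECK_CUBLAS(hipblasSgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N,
                              n, m, k,
                              &alpha, d_B, n, d_A, k,
                              &beta,  d_C, n));

    CHECK_CUBLAS(hipblasDestroy(handle));
}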