Multi-machine multi-card technology test: single-node multi-DCU (task-partitioned matrix multiplication with no inter-device data transfer)

Directory layout: three files, common.h, Makefile, and matrixMaxDCU.cpp, shown below.
common.h

#include <stdio.h>
#include <sys/time.h>
#include <hip/hip_runtime.h>

#ifndef _COMMON_H
#define _COMMON_H

#define CHECK(call)                                                            \
{                                                                              \
    const hipError_t error = call;                                            \
    if (error != hipSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                hipGetErrorString(error));                                    \
    }                                                                          \
}

// The library-check macros below assume that the matching hipBLAS / hipRAND /
// hipFFT / hipSPARSE header has been included by the file that uses them.
#define CHECK_CUBLAS(call)                                                     \
{                                                                              \
    hipblasStatus_t err;                                                       \
    if ((err = (call)) != HIPBLAS_STATUS_SUCCESS)                              \
    {                                                                          \
        fprintf(stderr, "Got hipBLAS error %d at %s:%d\n", err, __FILE__,      \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
}

#define CHECK_CURAND(call)                                                     \
{                                                                              \
    hiprandStatus_t err;                                                        \
    if ((err = (call)) != HIPRAND_STATUS_SUCCESS)                               \
    {                                                                          \
        fprintf(stderr, "Got CURAND error %d at %s:%d\n", err, __FILE__,       \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
}

#define CHECK_CUFFT(call)                                                      \
{                                                                              \
    hipfftResult err;                                                           \
    if ( (err = (call)) != HIPFFT_SUCCESS)                                      \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", err, __FILE__,        \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
}

#define CHECK_CUSPARSE(call)                                                   \
{                                                                              \
    hipsparseStatus_t err;                                                      \
    if ((err = (call)) != HIPSPARSE_STATUS_SUCCESS)                             \
    {                                                                          \
        fprintf(stderr, "Got error %d at %s:%d\n", err, __FILE__, __LINE__);   \
        hipError_t cuda_err = hipGetLastError();                             \
        if (cuda_err != hipSuccess)                                           \
        {                                                                      \
            fprintf(stderr, "  CUDA error \"%s\" also detected\n",             \
                    hipGetErrorString(cuda_err));                             \
        }                                                                      \
        exit(1);                                                               \
    }                                                                          \
}

inline double seconds()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

#endif // _COMMON_H
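
A minimal sketch of how the CHECK macro is used (the file name is hypothetical; assumes a working ROCm/HIP installation):

// check_demo.cpp: wrap any HIP runtime call in CHECK; on failure it prints
// the file, line, numeric error code and a human-readable reason.
#include "common.h"

int main()
{
    float *d_buf = NULL;
    CHECK(hipMalloc((void **)&d_buf, 16 * sizeof(float)));
    CHECK(hipFree(d_buf));
    return 0;
}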

Makefile

SOURCE =$(wildcard *.cpp)
OBJS =$(patsubst %.cpp,%,$(SOURCE))
HIPCC = /opt/rocm/bin/hipcc
GCC=/opt/rh/devtoolset-7/root/usr/bin/gcc

all:$(OBJS) 
$(OBJS):%:%.cpp
	$(HIPCC)  $^ -o  $@
run:
	./matrixMaxDCU 2 4 4 4 1
clean:
	-rm $(OBJS)
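
With this Makefile, make compiles every .cpp in the directory with hipcc, make run launches the test binary on 2 DCUs with 4x4x4 matrices and printing enabled, and make clean removes the generated binaries.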

matrixMaxDCU.cpp

#include "common.h"
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define BSIZE 2
#define BLOCK_SIZE 2

// usage: ./matrixMaxDCU ngpus m k n 0|1
// e.g.   ./matrixMaxDCU 2 4 4 4 1    (last argument: whether to print the matrices)


// Naive kernel: one thread computes one element of C (square n x n case).
// Kept for reference; main() launches the tiled matrixMultiplyShared instead.
__global__ void multiplicateMatrixOnDevice(float *array_A, float *array_B, float *array_C, int n)
{
	int ix = threadIdx.x + blockDim.x * blockIdx.x; // column index
	int iy = threadIdx.y + blockDim.y * blockIdx.y; // row index

	if (ix < n && iy < n)
	{
		double sum = 0;
		for (int k = 0; k < n; k++)
		{
			sum += array_A[iy*n + k] * array_B[k*n + ix];
		}
		array_C[iy*n + ix] = sum;
	}

}
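
In this naive version every element of A and B is fetched from global memory n times, once per thread that needs it. The tiled kernel below reuses each loaded element BLOCK_SIZE times from shared memory, cutting global-memory traffic by roughly that factor, which is why main() launches it instead.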


// Tiled kernel: each block computes one BLOCK_SIZE x BLOCK_SIZE tile of C,
// staging matching tiles of A and B through shared memory.
__global__ void matrixMultiplyShared(float *A, float *B, float *C,
	int numARows, int numAColumns, int numBRows, int numBColumns, int numCRows, int numCColumns)
{
	__shared__ double sharedM[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ double sharedN[BLOCK_SIZE][BLOCK_SIZE];

	int bx = blockIdx.x;
	int by = blockIdx.y;
	int tx = threadIdx.x;
	int ty = threadIdx.y;

	int row = by * BLOCK_SIZE + ty;
	int col = bx * BLOCK_SIZE + tx;

	float Csub = 0.0;

	for (int i = 0; i < (int)(ceil((double)numAColumns / BLOCK_SIZE)); i++)
	{
		if (i*BLOCK_SIZE + tx < numAColumns && row < numARows)
			sharedM[ty][tx] = A[row*numAColumns + i * BLOCK_SIZE + tx];
		else
			sharedM[ty][tx] = 0.0;

		if (i*BLOCK_SIZE + ty < numBRows && col < numBColumns)
			sharedN[ty][tx] = B[(i*BLOCK_SIZE + ty)*numBColumns + col];
		else
			sharedN[ty][tx] = 0.0;
		__syncthreads();

		for (int j = 0; j < BLOCK_SIZE; j++)
		{
			Csub += sharedM[ty][j] * sharedN[j][tx];
		}

		__syncthreads();
	}

	if (row < numCRows && col < numCColumns)
		C[row*numCColumns + col] = Csub;

}
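
Each thread block computes one BLOCK_SIZE x BLOCK_SIZE tile of C, sweeping ceil(numAColumns / BLOCK_SIZE) tile pairs of A and B through shared memory; the boundary branches zero-pad tiles that run past the matrix edges. A single-device launch sketch (assuming d_A, d_B and d_C are device buffers for an m x k times k x n product; this mirrors the per-DCU launch in main()):

// grid.x covers the n columns of C, grid.y the m rows
dim3 block(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid((n + BLOCK_SIZE - 1) / BLOCK_SIZE,
          (m + BLOCK_SIZE - 1) / BLOCK_SIZE);
hipLaunchKernelGGL(matrixMultiplyShared, grid, block, 0, 0,
                   d_A, d_B, d_C, m, k, k, n, m, n);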


void checkResult(float *hostRef, float *gpuRef, const int N)
{
	double epsilon = 1.0E-8;

	for (int i = 0; i < N; i++)
	{
		if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
		{
			printf("Arrays do not match!\n");
			printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
				gpuRef[i], i);
			break;
		}
	}
}


void initialData(float * const ip, int const  size)
{
	for (int i = 0; i < size; i++)
	{
		ip[i] = (float)(rand() % 10 + 1);
	}
}


void printMatrix(const char *arrayname, float *array, int row, int col)
{
	printf("%s\n", arrayname);
	float *p = array;
	for (int y = 0; y < row; y++)
	{
		for (int x = 0; x < col; x++)
		{
			printf("%12lf", p[x]);
		}
		p = p + col;
		printf("\n");
	}
	printf("\n");
}


void printData(float * const ip, int const  size)
{
	for (int i = 0; i < size*size; i++)
	{
		printf("%12lf", ip[i]);
	}
	printf("\n");
}


void  multiplicateMatrixOnHost(float *array_A, float *array_B, float *array_C, int M_p, int K_p, int N_p)
{
	for (int i = 0; i < M_p; i++)
	{
		for (int j = 0; j < N_p; j++)
		{
			float sum = 0;
			for (int k = 0; k < K_p; k++)
			{
				sum += array_A[i*K_p + k] * array_B[k*N_p + j];
			}
			array_C[i*N_p + j] = sum;
		}
	}

}
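
The products computed here on the CPU become the hostRef matrices that checkResult later compares, element by element, against the device output.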


int main(int argc, char **argv)
{
	int ngpus;
	int m = 4, k = 4, n = 4; // default sizes, overridden by argv[2..4]
	int ifprint = 0;         // print matrices? overridden by argv[5]

	printf("> starting %s", argv[0]);

	CHECK(hipGetDeviceCount(&ngpus));
	printf(" CUDA-capable devices: %i\n", ngpus);

	if (argc > 1)
	{
		if (atoi(argv[1]) > ngpus)
		{
			fprintf(stderr, "Invalid number of GPUs specified: %d is greater "
				"than the total number of GPUs in this platform (%d)\n",
				atoi(argv[1]), ngpus);
			exit(1);
		}

		ngpus = atoi(argv[1]);
	}

	if (argc > 2)   m = atoi(argv[2]);
	if (argc > 3)   k = atoi(argv[3]);
	if (argc > 4)   n = atoi(argv[4]);
	if (argc > 5)   ifprint = atoi(argv[5]);

	int Axy = m * k;
	int Bxy = k * n;
	int Cxy = m * n;

	// host-side tables: one buffer pointer per DCU
	float **d_A = (float **)malloc(sizeof(float *) * ngpus);
	float **d_B = (float **)malloc(sizeof(float *) * ngpus);
	float **d_C = (float **)malloc(sizeof(float *) * ngpus);

	float **h_A = (float **)malloc(sizeof(float *) * ngpus);
	float **h_B = (float **)malloc(sizeof(float *) * ngpus);
	float **hostRef = (float **)malloc(sizeof(float *) * ngpus);
	float **gpuRef = (float **)malloc(sizeof(float *) * ngpus);
	hipStream_t *stream = (hipStream_t *)malloc(sizeof(hipStream_t) * ngpus);

	for (int i = 0; i < ngpus; i++)
	{
		// set current device
		CHECK(hipSetDevice(i));

		// allocate device memory
		CHECK(hipMalloc((void **)&d_A[i], Axy * sizeof(float)));
		CHECK(hipMalloc((void **)&d_B[i], Bxy * sizeof(float)));
		CHECK(hipMalloc((void **)&d_C[i], Cxy * sizeof(float)));

		// allocate page locked host memory for asynchronous data transfer
		CHECK(hipHostMalloc((void **)&h_A[i], Axy * sizeof(float)));
		CHECK(hipHostMalloc((void **)&h_B[i], Bxy * sizeof(float)));
		CHECK(hipHostMalloc((void **)&hostRef[i], Cxy * sizeof(float)));
		CHECK(hipHostMalloc((void **)&gpuRef[i], Cxy * sizeof(float)));

		// create streams for timing and synchronizing
		CHECK(hipStreamCreate(&stream[i]));
	}

	int dimx = BLOCK_SIZE; // the tiled kernel requires blockDim == BLOCK_SIZE
	int dimy = BLOCK_SIZE;
	dim3 block(dimx, dimy);
	// grid.x must cover the n columns of C, grid.y the m rows
	dim3 grid((n + block.x - 1) / block.x, (m + block.y - 1) / block.y);

	for (int i = 0; i < ngpus; i++)
	{
		CHECK(hipSetDevice(i));
		initialData(h_A[i], Axy);
		initialData(h_B[i], Bxy);
	}

	for (int i = 0; i < ngpus; i++)
	{
		multiplicateMatrixOnHost(h_A[i], h_B[i], hostRef[i], m, k, n);
	}

	// record start time
	double iStart = seconds();

	// distributing the workload across multiple devices
	for (int i = 0; i < ngpus; i++)
	{
		CHECK(hipSetDevice(i));
		CHECK(hipMemcpyAsync(d_A[i], h_A[i], Axy * sizeof(float), hipMemcpyHostToDevice, stream[i]));
		CHECK(hipMemcpyAsync(d_B[i], h_B[i], Bxy * sizeof(float), hipMemcpyHostToDevice, stream[i]));
	//	CUDA-style launch kept for reference:
	//	matrixMultiplyShared<<<grid, block, 0, stream[i]>>>(d_A[i], d_B[i], d_C[i], m, k, k, n, m, n);
		hipLaunchKernelGGL(matrixMultiplyShared, grid, block, 0, stream[i], d_A[i], d_B[i], d_C[i], m, k, k, n, m, n);
		CHECK(hipMemcpyAsync(gpuRef[i], d_C[i], Cxy * sizeof(float), hipMemcpyDeviceToHost,
			stream[i]));
	}
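	// Note: each DCU multiplies its own independent copies of A and B, so no
	// device-to-device traffic occurs; this is the "task-partitioned, no data
	// transfer" pattern named in the title.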

	// synchronize streams
	for (int i = 0; i < ngpus; i++)
	{
		CHECK(hipSetDevice(i));
		CHECK(hipStreamSynchronize(stream[i]));
	}

	// stop the timer once every device has finished its work, before any printing
	double iElaps = seconds() - iStart;
	printf("%d GPU timer elapsed: %8.2fms \n", ngpus, iElaps * 1000.0);

	const char* array_A = "array_A";
	const char* array_B = "array_B";
	const char* array_hostRef = "array_hostRef";
	const char* array_gpuRef = "array_gpuRef";

	if (ifprint)
		for (int i = 0; i < ngpus; i++)
		{
			printMatrix(array_A, h_A[i], m, k);
			printMatrix(array_B, h_B[i], k, n);
			printMatrix(array_hostRef, hostRef[i], m, n);
			printMatrix(array_gpuRef, gpuRef[i], m, n);
		}

	// verify each device's result against the CPU reference
	for (int i = 0; i < ngpus; i++)
	{
		checkResult(hostRef[i], gpuRef[i], Cxy);
	}

	for (int i = 0; i < ngpus; i++)
	{
		CHECK(hipSetDevice(i));
		CHECK(hipFree(d_A[i]));
		CHECK(hipFree(d_B[i]));
		CHECK(hipFree(d_C[i]));
		CHECK(hipHostFree(h_A[i]));
		CHECK(hipHostFree(h_B[i]));
		CHECK(hipHostFree(hostRef[i]));
		CHECK(hipHostFree(gpuRef[i]));
		CHECK(hipStreamDestroy(stream[i]));
		CHECK(hipDeviceReset());
	}

	free(d_A);
	free(d_B);
	free(d_C);
	free(h_A);
	free(h_B);
	free(hostRef);
	free(gpuRef);
	free(stream);

	return 0;
}
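
For reference, with the arguments baked into the Makefile (./matrixMaxDCU 2 4 4 4 1) the program prints the detected device count, the elapsed-time line ("2 GPU timer elapsed: ... ms"), and, since the last argument is 1, the A, B, hostRef and gpuRef matrices for each DCU; checkResult stays silent when the host and device results agree.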

Run result:
[Screenshot of the program output (second image of the original post); not reproduced here.]
