MPICC=/usr/local/mpich/bin/mpicxx
NVCC=/usr/local/cuda-10.2/bin/nvcc
MPI_INCLUDE= -I /usr/local/mpich/include
MPI_LIBS= -L /usr/local/mpich/lib -lmpich
CUDA_INCLUDE= -I /usr/local/cuda-10.2/include
CUDA_LIBS= -L /usr/local/cuda-10.2/lib64 -lcudart
CFILES=simpleMPI.c
CUFILES=simpleCu.cu
OBJECTS=simpleMPI.o simpleCu.o
all:
	$(MPICC) -c $(CFILES) -o simpleMPI.o
	$(NVCC) -c $(CUFILES) -o simpleCu.o
	$(MPICC) $(CUDA_LIBS) $(OBJECTS) -o simpleMPI
#	$(NVCC) $(MPI_LIBS) $(OBJECTS) -o simpleMPI

run:
	mpirun -n 2 ./simpleMPI

clean:
	rm -f simpleMPI *.o
simpleCu.cu
#include <stdio.h>
#include <stdlib.h>
#include "simpleMPI.h"
// Kernel: each thread squares one element of the input array
__global__ void simpleMPIKernel(float *input, float *output)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    output[tid] = pow(input[tid], 2);
}
// Fill the host buffer with random values in [1, 10]
void initData(float *data, int dataSize)
{
    for (int i = 0; i < dataSize; i++)
    {
        data[i] = (float)(rand() % 10 + 1);
    }
}
void printTotalData(const char *name, float *data, int dataSize)
{
    printf("%s\n", name);
    for (int i = 0; i < dataSize; i++)
    {
        printf("%10f", data[i]);
    }
    printf("\n");
}
void printNodeData(int commRank, float *data, int dataSize)
{
    printf("Process %d:\n", commRank);
    for (int i = 0; i < dataSize; i++)
    {
        printf("%10f", data[i]);
    }
    printf("\n");
}
// Allocate device buffers, copy this node's chunk in, square it on the GPU,
// and copy the result back into hostData in place
void computeGPU(float *hostData, int blockSize, int gridSize)
{
    int dataSize = blockSize * gridSize;
    float *deviceInputData = NULL;
    cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float));
    float *deviceOutputData = NULL;
    cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float));
    cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice);
    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);
    cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(deviceInputData);
    cudaFree(deviceOutputData);
}
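computeGPU above ignores the return codes of the CUDA runtime calls and never checks whether the kernel launch succeeded, so a failure would surface only as wrong output. A minimal sketch of how checking could be added, assuming only the standard CUDA runtime API (cudaError_t, cudaGetErrorString, cudaGetLastError, cudaDeviceSynchronize); the CUDA_CHECK macro and computeGPUChecked names are ours, and the sketch assumes it sits in simpleCu.cu after the kernel definition:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Hypothetical helper macro: abort with file/line info if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Same logic as computeGPU above, with every runtime call checked.
void computeGPUChecked(float *hostData, int blockSize, int gridSize)
{
    int dataSize = blockSize * gridSize;
    float *deviceInputData = NULL;
    float *deviceOutputData = NULL;

    CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float),
                          cudaMemcpyHostToDevice));

    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);
    CUDA_CHECK(cudaGetLastError());        // catches launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // catches errors raised during execution

    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(deviceInputData));
    CUDA_CHECK(cudaFree(deviceOutputData));
}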
simpleMPI.c
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include "simpleMPI.h"
int main(int argc, char *argv[])
{
    // Each node processes gridSize * blockSize elements
    int blockSize = 5;
    int gridSize = 1;
    int dataSizePerNode = gridSize * blockSize;

    MPI_Init(&argc, &argv);
    int commSize, commRank;
    MPI_Comm_size(MPI_COMM_WORLD, &commSize);
    MPI_Comm_rank(MPI_COMM_WORLD, &commRank);

    // Rank 0 allocates and initializes the full data set
    int dataSizeTotal = dataSizePerNode * commSize;
    float *dataRoot = NULL;
    if (commRank == 0)
    {
        printf("Running on %d nodes\n", commSize);
        dataRoot = (float *)malloc(dataSizeTotal * sizeof(float));
        initData(dataRoot, dataSizeTotal);
        printTotalData("Initial:", dataRoot, dataSizeTotal);
    }

    // Scatter one chunk to each rank, square it on the GPU, gather the results
    float *dataNode = (float *)malloc(dataSizePerNode * sizeof(float));
    MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD);
    computeGPU(dataNode, blockSize, gridSize);
    printNodeData(commRank, dataNode, dataSizePerNode);
    MPI_Gather(dataNode, dataSizePerNode, MPI_FLOAT, dataRoot, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD);

    if (commRank == 0)
    {
        printTotalData("Result:", dataRoot, dataSizeTotal);
        free(dataRoot);
    }
    free(dataNode);
    if (commRank == 0)
    {
        printf("PASSED\n");
    }
    MPI_Finalize();
    return 0;
}
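Note that the program prints PASSED unconditionally; it never compares the gathered results against the inputs. A minimal verification sketch that rank 0 could run, assuming a copy of the initial data is kept before MPI_Scatter (the dataRef buffer and verifyResult name are ours, not part of the original sample):

#include <math.h>
#include <stdio.h>

/* Hypothetical check on rank 0: each gathered value should be the square of
 * the corresponding initial value. dataRef must hold a copy of dataRoot taken
 * before MPI_Scatter, since MPI_Gather overwrites dataRoot with the results. */
int verifyResult(const float *dataRef, const float *dataRoot, int dataSizeTotal)
{
    for (int i = 0; i < dataSizeTotal; i++)
    {
        float expected = dataRef[i] * dataRef[i];
        if (fabsf(dataRoot[i] - expected) > 1e-3f)
        {
            printf("Mismatch at %d: got %f, expected %f\n", i, dataRoot[i], expected);
            return 0;
        }
    }
    return 1;
}

With this in place, rank 0 would allocate dataRef alongside dataRoot, copy the initial values into it right after initData, and print PASSED only when verifyResult returns 1.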
simpleMPI.h
#ifndef _SIMPLEMPI_H
#define _SIMPLEMPI_H
void initData(float *data, int dataSize);
void printTotalData(const char *name,float *data, int dataSize);
void printNodeData(int commRank,float *data, int dataSize);
void computeGPU(float *hostData, int blockSize, int gridSize);
#endif
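As written, every MPI rank uses the default CUDA device, which is fine on one GPU or when the launcher pins each rank to its own GPU. On a single node with several GPUs, a common pattern is to bind each rank to a device before the first CUDA call; a sketch, assuming only the standard cudaGetDeviceCount / cudaSetDevice runtime calls (the bindRankToDevice name is ours):

#include <cuda_runtime.h>

/* Hypothetical helper: map an MPI rank to a CUDA device round-robin.
 * Call it right after MPI_Comm_rank and before computeGPU. */
void bindRankToDevice(int commRank)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount > 0)
    {
        cudaSetDevice(commRank % deviceCount);
    }
}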