注意:【实现Jacobi(雅可比)迭代可以用CUDA 实现,也可以用MPI实现,想使用MPI实现Jacobi迭代的同学,请点击这里】;如果觉得写的不错,请点个赞,谢谢!
CUDA:CUDA(Compute Unified Device Architecture),是显卡厂商NVIDIA推出的运算平台。 CUDA™是一种由NVIDIA推出的通用并行计算架构,该架构使GPU能够解决复杂的计算问题
CUDA是运行在GPU(也就是我们所说的显卡),因此他的最大线程数是非常多的
就N * N的矩阵来说,我们可以为每一个点分配一个线程,所以在核心计算方法vector_calculation中不需要通过for循环遍历各点,只需要在外层添加一个迭代循环即可。
// NOTE(review): the original include list was garbled (bare "#include" lines
// with the header names stripped); reconstructed from the names used below.
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include "string.h"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
using namespace std;
// 256 - the number of N x N interior points. accordingly, the total threads is also N * N
// #define N 256
// define the number of iteration
// #define iteration_num 10000
// One Jacobi relaxation sweep: each thread owns a single point of the
// (Num_points+2) x (Num_points+2) row-major grid and, if that point is an
// interior point (rows/cols 1..Num_points), replaces it with the average of
// its four neighbours read from the previous iterate.
//
// Expects a 2D launch covering at least (Num_points+2) x (Num_points+2)
// threads; out-of-range and boundary threads do nothing, so the boundary
// ring in dev_cur keeps whatever values it was initialized with.
__global__ void vector_calculation(double *dev_prev, double *dev_cur, int Num_points)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int width = Num_points + 2; // row stride of the padded grid

    // Only interior points are updated; the guard also bounds-checks the
    // threads of partially-filled blocks at the grid edge.
    if (row >= 1 && row <= Num_points && col >= 1 && col <= Num_points)
    {
        dev_cur[row * width + col] = 0.25 * (dev_prev[(row - 1) * width + col]
                                           + dev_prev[(row + 1) * width + col]
                                           + dev_prev[row * width + col - 1]
                                           + dev_prev[row * width + col + 1]);
    }
}
/*
 * Host driver for the GPU Jacobi iteration on an (N+2) x (N+2) grid
 * (N x N interior points plus a fixed boundary ring).
 *
 * Command line: N is read from argv[2] and the iteration count from argv[4]
 * (e.g. "prog -n 128 -i 1000"); N is capped at 256, iterations at 10000.
 * Writes the final temperature field to "finalTemperatures.csv".
 * Returns 0 on success, 1 on bad arguments or any CUDA/allocation failure.
 */
static bool check_cuda(cudaError_t err, const char *what)
{
    // Helper so every CUDA API return code is actually inspected.
    if (err != cudaSuccess)
    {
        cout << "CUDA error (" << what << "): " << cudaGetErrorString(err) << endl;
        return false;
    }
    return true;
}

int main(int argc, char* argv[]){
    if(argc < 5)
    {
        // N lives in argv[2] and the iteration count in argv[4], so at least
        // four arguments are required (the old message said "two params").
        cout<<"You should input four params (N in argv[2], iterations in argv[4]), but you just did less than that"<<endl;
        return 1;
    }
    int N = atoi(argv[2]);
    int iteration_num = atoi(argv[4]);
    if(N <= 0 || iteration_num <= 0)
    {
        cout<<"Invalid parameters, please check your values."<<endl;
        return 1;
    }
    if(N > 256 || iteration_num > 10000)
    {
        cout<<" the first param is up to 256 and the second up to 10000."<<endl;
        return 1;
    }
    // 1, query device properties (used below to size the thread blocks)
    cudaDeviceProp prop;
    if (!check_cuda(cudaGetDeviceProperties(&prop, 0), "cudaGetDeviceProperties"))
        return 1;
    // 2, Allocate host memory for the full padded grid
    const size_t grid_bytes = sizeof(double) * (size_t)(N + 2) * (size_t)(N + 2);
    double *host_mem = (double*)malloc(grid_bytes);
    if (host_mem == NULL)
    {
        cout << "Host allocation failed." << endl;
        return 1;
    }
    // 3, Initialize host arrays: ambient 20.0 everywhere, with a 100.0 hot
    // segment between 30% and 70% of the first row (the flat index i is
    // compared against fractions of one row's width, so only row-0 cells
    // qualify). NOTE(review): presumably the intended boundary condition.
    for(int i = 0; i < (N + 2) * (N + 2); i++)
    {
        host_mem[i] = 20.00;
        if( i > 0.3*(N+2-1) && i < 0.7*(N+2-1))
        {
            host_mem[i] = 100.00;
        }
    }
    // 4, allocate the two GPU ping-pong buffers
    double *device_mem_prev = NULL, *device_mem_cur = NULL;
    bool ok = check_cuda(cudaMalloc((void**)&device_mem_prev, grid_bytes), "cudaMalloc prev")
           && check_cuda(cudaMalloc((void**)&device_mem_cur, grid_bytes), "cudaMalloc cur");
    // 5, transfer initial data from the host to the device; both buffers get
    // the initial field so the kernel-untouched boundary ring is identical
    ok = ok && check_cuda(cudaMemcpy(device_mem_prev, host_mem, grid_bytes, cudaMemcpyHostToDevice), "H2D prev")
            && check_cuda(cudaMemcpy(device_mem_cur, host_mem, grid_bytes, cudaMemcpyHostToDevice), "H2D cur");
    if (ok)
    {
        // 6, start iterations: one thread per grid point, square blocks as
        // large as the device allows, grid rounded up to cover (N+2)^2 points
        int block_dimension = (int)sqrt((double)prop.maxThreadsPerBlock);
        dim3 blockDim(block_dimension, block_dimension);
        dim3 gridDim((N + 2) / block_dimension + 1, (N + 2) / block_dimension + 1);
        struct timeval t1, t2;
        gettimeofday(&t1, 0);
        for (int iter = 0; iter < iteration_num; iter++)
        {
            vector_calculation<<<gridDim, blockDim>>>(device_mem_prev, device_mem_cur, N);
            // Swap the buffer pointers instead of the original full
            // device-to-device cudaMemcpy each iteration: identical result,
            // zero per-iteration copy cost.
            double *tmp = device_mem_prev;
            device_mem_prev = device_mem_cur;
            device_mem_cur = tmp;
        }
        // Kernel launches are asynchronous: synchronize so the stop timestamp
        // measures the actual work and any launch/execution error surfaces here
        // (the original read t2 without syncing, under-measuring the GPU time).
        ok = check_cuda(cudaGetLastError(), "kernel launch")
          && check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
        gettimeofday(&t2, 0);
        double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000.0;
        printf("GPU Time to generate: %3.2f ms \n", time);
    }
    // 7, Transfer data back to the host memory; after the final swap the most
    // recent field lives in device_mem_prev (same buffer the original read)
    ok = ok && check_cuda(cudaMemcpy(host_mem, device_mem_prev, grid_bytes, cudaMemcpyDeviceToHost), "D2H");
    if (ok)
    {
        // 8, output the result to the csv file, one grid row per line
        int total_points = (N + 2) * (N + 2);
        std::ofstream myfile;
        myfile.open ("finalTemperatures.csv");
        for(int p = 0; p < total_points; p++)
        {
            if(p % (N + 2) == 0 && p != 0)
            {
                myfile << "\n";
            }
            std::ostringstream out;
            out.precision(8);
            out<<host_mem[p];
            std::string str= out.str(); // pull the formatted value out of the stream
            myfile<< str << ", ";
        }
        myfile.close();
    }
    // 9, Deallocate device memory (cudaFree(NULL) is a harmless no-op, so the
    // error paths that skipped allocation are safe here)
    cudaFree(device_mem_cur);
    cudaFree(device_mem_prev);
    // 10, Deallocate host memory
    free(host_mem);
    // Conventional exit status: 0 = success, 1 = failure (the original had
    // these inverted: 0 on bad args, 1 on success).
    return ok ? 0 : 1;
}