CUDA实现Jacobi(雅可比)迭代算法[C++]

注意:【实现Jacobi(雅可比)迭代可以用CUDA实现,也可以用MPI实现,想使用MPI实现Jacobi迭代的同学,请点击这里】;如果觉得写的不错,请点个赞,谢谢!

1 what is CUDA

  • CUDA:CUDA(Compute Unified Device Architecture),是显卡厂商NVIDIA推出的运算平台。 CUDA™是一种由NVIDIA推出的通用并行计算架构,该架构使GPU能够解决复杂的计算问题

  • CUDA是运行在GPU(也就是我们所说的显卡)上的,因此它的最大线程数是非常多的

  • 就N * N的矩阵来说,我们可以为每一个点分配一个线程,所以我们在核心方法vector_calculation中不需要通过for循环遍历矩�阵,只需要在外层添加一个迭代循环即可。

2 使用指南,首先你必须确定你电脑上配有英伟达nvidia独立显卡,然后才能跑完这段代码(我用的是学校提供的服务器)

#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cstring>
#include "string.h"
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <utility>
#include <sys/time.h>
#include <cuda_runtime.h>

using namespace std;
// 256 - the number of N x N interior points. accordingly, the total threads is also N * N
// #define N 256

// define the number of iteration
// #define iteration_num 10000


// One Jacobi sweep: each thread owns one point of an
// (Num_points + 2) x (Num_points + 2) array (an Num_points x Num_points
// interior plus a one-cell boundary ring) and replaces its value in dev_cur
// with the average of its four neighbours read from dev_prev.
//
// Launch layout: 2D grid of 2D blocks covering at least
// (Num_points + 2) x (Num_points + 2) threads; surplus threads and the
// boundary ring are filtered out by the guard below, so the ring keeps its
// initial values.
//
// dev_prev is read-only here: const + __restrict__ lets the compiler route
// those loads through the read-only data cache.
__global__ void vector_calculation(const double* __restrict__ dev_prev,
                                   double* __restrict__ dev_cur,
                                   int Num_points)
{
    // Row/column of this thread. As in the original layout, the row index is
    // taken from the y launch dimension and the column from the x dimension.
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    int width = Num_points + 2;   // padded row length

    // Update only interior points (1..Num_points in both dimensions); this
    // also discards extra threads launched beyond the grid edge.
    if (row > 0 && row < Num_points + 1 && col > 0 && col < Num_points + 1)
    {
        dev_cur[row * width + col] = 0.25 * (dev_prev[(row - 1) * width + col] +
                                             dev_prev[(row + 1) * width + col] +
                                             dev_prev[row * width + col - 1] +
                                             dev_prev[row * width + col + 1]);
    }
}

// Abort the program with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Usage: prog <flag> N <flag> iteration_num
//   argv[2] = N              interior grid dimension, 1..256
//   argv[4] = iteration_num  number of Jacobi sweeps, 1..10000
// Runs iteration_num Jacobi sweeps on an (N+2)x(N+2) grid on the GPU, prints
// the elapsed GPU time, and writes the final grid to finalTemperatures.csv.
// Returns 0 on success, nonzero on bad arguments or CUDA failure (the
// original returned 1 on success and 0 on failure, inverting the shell
// convention).
int main(int argc, char* argv[])
{
    if (argc < 5)
    {
        cout << "You should input four params, but you just did less than that" << endl;
        return EXIT_FAILURE;
    }

    int N = atoi(argv[2]);
    int iteration_num = atoi(argv[4]);

    if (N <= 0 || iteration_num <= 0)
    {
        cout << "Invalid parameters, please check your values." << endl;
        return EXIT_FAILURE;
    }
    if (N > 256 || iteration_num > 10000)
    {
        cout << " the first param is up to 256 and the second up to 10000." << endl;
        return EXIT_FAILURE;
    }

    // 1, query device properties (used below to size the thread blocks)
    cudaDeviceProp prop;
    checkCuda(cudaGetDeviceProperties(&prop, 0), "cudaGetDeviceProperties");

    const int side = N + 2;                           // padded grid dimension
    const size_t bytes = sizeof(double) * side * side;

    // 2, allocate host memory
    double* host_mem = (double*)malloc(bytes);
    if (host_mem == NULL)
    {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        return EXIT_FAILURE;
    }

    // 3, initialize host array: everything 20.0 except the linear indices
    // strictly between 0.3*(side-1) and 0.7*(side-1) — i.e. the middle of
    // the first (boundary) row, which acts as the hot segment at 100.0
    for (int i = 0; i < side * side; i++)
    {
        host_mem[i] = (i > 0.3 * (side - 1) && i < 0.7 * (side - 1)) ? 100.00 : 20.00;
    }

    // 4, allocate the two device buffers (previous and current iterate)
    double *device_mem_prev, *device_mem_cur;
    checkCuda(cudaMalloc((void**)&device_mem_prev, bytes), "cudaMalloc(prev)");
    checkCuda(cudaMalloc((void**)&device_mem_cur, bytes), "cudaMalloc(cur)");

    // 5, copy the initial grid into BOTH buffers so the boundary ring (which
    // the kernel never writes) is identical in each
    checkCuda(cudaMemcpy(device_mem_prev, host_mem, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy host->prev");
    checkCuda(cudaMemcpy(device_mem_cur, host_mem, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy host->cur");

    // 6, iterate. Block is as square as the device allows; the grid is
    // rounded up to cover the padded array (the kernel guards stragglers).
    int block_dimension = (int)sqrt((double)prop.maxThreadsPerBlock);
    dim3 blockDim(block_dimension, block_dimension);
    dim3 gridDim(side / block_dimension + 1, side / block_dimension + 1);

    struct timeval t1, t2;
    gettimeofday(&t1, 0);

    for (int iter = 0; iter < iteration_num; iter++)
    {
        vector_calculation<<<gridDim, blockDim>>>(device_mem_prev, device_mem_cur, N);
        // Swap the buffer roles instead of the original per-iteration
        // device-to-device cudaMemcpy: the kernel rewrites every interior
        // point and never touches the ring, so the result is identical and
        // an O(N^2) copy per sweep is saved.
        std::swap(device_mem_prev, device_mem_cur);
    }
    checkCuda(cudaGetLastError(), "kernel launch");
    // Kernel launches are asynchronous; wait before stopping the clock so
    // the reported time covers the actual GPU work.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    gettimeofday(&t2, 0);
    double time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0;
    printf("GPU Time to generate:  %3.2f ms \n", time);

    // 7, copy the final iterate back (after the last swap it is in prev)
    checkCuda(cudaMemcpy(host_mem, device_mem_prev, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy device->host");

    // 8, write the result as CSV, one padded grid row per line
    std::ofstream myfile;
    myfile.open("finalTemperatures.csv");
    for (int p = 0; p < side * side; p++)
    {
        if (p % side == 0 && p != 0)
        {
            myfile << "\n";
        }
        std::ostringstream out;
        out.precision(8);          // 8 significant digits per value
        out << host_mem[p];
        myfile << out.str() << ", ";
    }
    myfile.close();

    // 9, release device memory
    cudaFree(device_mem_cur);
    cudaFree(device_mem_prev);

    // 10, release host memory
    free(host_mem);

    return 0;
}

你可能感兴趣的:(雅可比(Jacobi)迭代算法,CUDA编程,算法,c++,cuda)