原文链接
内核函数是CUDA 每个线程 执行的函数,它运行在GPU设备上。CUDA使用扩展的C语言编写内核函数,关键字为__global__。内核函数返回值只能是void。
__global__ void 函数名(参数……){ 程序指令集合 }
函数名<<<blocksPerGrid, threadsPerBlock>>>(参数……)
blocksPerGrid
:每个网格中线程块的排布方式(可以采用1维或2维)threadsPerBlock
:每个线程块中线程的排布方式(可以采用1维或2维)
// Element-wise sum of two N x N matrices: c[y][x] = a[y][x] + b[y][x].
//
// Expects a 2-D grid of 2-D blocks covering at least N threads in both x and
// y; each thread handles one element and surplus threads do nothing.
// N must be a compile-time constant visible at this point, because it fixes
// the row length of the array parameters.
__global__ void VecAdd(double a[][N], double b[][N], double c[][N])
{
    int x = blockIdx.x * blockDim.x + threadIdx.x; // column index of this thread
    int y = blockIdx.y * blockDim.y + threadIdx.y; // row index of this thread
    // The grid may overshoot the matrix, so guard both dimensions.
    if (x < N && y < N) {
        c[y][x] = a[y][x] + b[y][x];
    }
}
// Launches VecAdd over an N x N problem with 16 x 16 thread blocks.
// A, B and C are not declared in this snippet; they are assumed to be
// device-accessible N x N matrices defined elsewhere.
int main()
{
    // Threads inside one block: 16 x 16 = 256.
    dim3 threadsPerBlock(16, 16);
    // Blocks inside the grid, rounded up so that a partial tile at the edge is
    // still covered when N is not a multiple of the block size; the kernel's
    // bounds check is relied on to discard the surplus threads.
    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(A, B, C);
    // The launch returns immediately; wait for the kernel before exiting.
    cudaDeviceSynchronize();
    return 0;
}
nvcc filename.cu -o filename
cudaMalloc
cudaMalloc (void **devPtr, size_t size )
cudaMemcpy
主机到设备:cudaMemcpy(d_A,h_A,nBytes,cudaMemcpyHostToDevice)
设备到主机:cudaMemcpy(h_A,d_A,nBytes,cudaMemcpyDeviceToHost)
// NOTE(review): the standard header names were lost when this listing was
// extracted; they are reconstructed from the identifiers used below
// (printf, malloc/rand, clock, gettimeofday, namespace std) - confirm.
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sys/time.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define thread_num 256// number of threads in one thread block
using namespace std;
const int N = 6000;// matrix dimension: the matrices are N x N
const int blocks_num = (N + thread_num - 1) / thread_num;// number of thread blocks: N / thread_num rounded up
// Integer matrix product dc = da * db for square N x N matrices stored
// row-major in flat arrays of N * N ints.
//
// Launch layout: a 2-D grid of 2-D blocks that covers at least N threads in
// both x and y. Each thread computes one output element; threads that fall
// outside the matrix do nothing. No shared memory is used.
// Preconditions: da, db and dc are device pointers, and dc does not overlap
// da or db.
__global__ void mextix(int *da,int *db,int *dc)
{
    // x selects the column and y the row, so threads that are neighbours in x
    // read neighbouring elements of db and write neighbouring elements of dc
    // (contiguous accesses) instead of addresses N ints apart.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < N && col < N) {
        // Accumulate in a local variable and store once, rather than doing a
        // global-memory read-modify-write on every iteration.
        int sum = 0;
        for (int i = 0; i < N; i++) {
            sum += da[row*N+i] * db[i*N+col];
        }
        dc[row*N+col] = sum;
    }
}
// Fills the N x N matrix `a` (a flat array of N * N ints) with pseudo-random
// values in the range 1..10, visiting the elements in storage order.
void rands(int *a)
{
    int *const last = a + N * N;
    for (int *cell = a; cell != last; ++cell) {
        *cell = rand() % 10 + 1;
    }
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Multiplies two random N x N integer matrices on the GPU and then on the
// CPU, printing how long each stage took and cross-checking the two results.
// Returns 0 on success, EXIT_FAILURE if host allocation fails or the GPU and
// CPU results differ; any CUDA error aborts via checkCuda().
int main()
{
    int *a,*b,*c;
    int *da,*db,*dc;
    // Byte size of one matrix; computed in size_t rather than int.
    size_t size = (size_t)N * N * sizeof(int);
    //freopen("out.txt","w",stdout);

    // Host buffers.
    a = (int*)malloc(size);
    b = (int*)malloc(size);
    c = (int*)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Random input matrices.
    rands(a);
    rands(b);

    // Device buffers; the time spent allocating them is reported separately.
    clock_t t1 = clock();
    checkCuda(cudaMalloc((void**)&da,size), "cudaMalloc(da)");
    checkCuda(cudaMalloc((void**)&db,size), "cudaMalloc(db)");
    checkCuda(cudaMalloc((void**)&dc,size), "cudaMalloc(dc)");
    clock_t t2 = clock();
    double ts = (double)(t2-t1);
    // CLOCKS_PER_SEC is the number of clock() ticks per second.
    printf("GPU divide costtime : %lf ms\n",ts/CLOCKS_PER_SEC*1000);

    // Upload the inputs to the GPU.
    checkCuda(cudaMemcpy(da,a,size,cudaMemcpyHostToDevice), "cudaMemcpy(da)");
    checkCuda(cudaMemcpy(db,b,size,cudaMemcpyHostToDevice), "cudaMemcpy(db)");

    // GPU computation, timed with wall-clock timestamps.
    // struct timeval holds tv_sec (seconds) and tv_usec (microseconds).
    timeval start,finish1,finish2;
    gettimeofday(&start,0);
    dim3 dg(16,16);                                  // threads per block
    dim3 dbs((N+dg.x-1)/dg.x,(N+dg.y-1)/dg.y);       // blocks per grid, rounded up
    gettimeofday(&finish1,0);
    mextix<<<dbs,dg>>>(da,db,dc);
    // A bad launch configuration is only reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "mextix launch");
    // The launch is asynchronous: without this wait the second timestamp
    // would capture only the launch overhead, not the kernel's run time.
    checkCuda(cudaDeviceSynchronize(), "mextix execution");
    gettimeofday(&finish2, 0);
    double cost1 = 1e6 * (finish2.tv_sec - start.tv_sec) + finish2.tv_usec - start.tv_usec; // microseconds
    double cs = 1e6*(finish1.tv_sec - start.tv_sec) + (finish1.tv_usec - start.tv_usec);

    // Download the result from the GPU.
    checkCuda(cudaMemcpy(c,dc,size,cudaMemcpyDeviceToHost), "cudaMemcpy(c)");

    // GPU timings.
    printf("GPUCost time : %lf ms\n",cost1/1e3);
    printf("GPU divdided time : %lf ms\n",cs/1e3);
    // printf("GPUAnswer : \n");
    // for(int i = 0;i < N;i++){
    //     for(int j = 0;j < N;j++){
    //         printf("%d ",c[i*N+j]);
    //     }
    //     printf("\n");
    // }

    // Release device memory.
    checkCuda(cudaFree(da), "cudaFree(da)");
    checkCuda(cudaFree(db), "cudaFree(db)");
    checkCuda(cudaFree(dc), "cudaFree(dc)");

    // CPU computation. c still holds the GPU result at this point, so each
    // element is compared against the CPU value before being overwritten.
    long mismatches = 0;
    clock_t st = clock();
    for(int i = 0;i < N;i++){
        for(int j = 0;j < N;j++){
            int sum = 0;
            for(int k = 0;k < N;k++){
                sum += a[i*N+k] * b[k*N+j];
            }
            if(c[i*N+j] != sum){
                ++mismatches;
            }
            c[i*N+j] = sum;
        }
    }
    clock_t ed = clock();
    double ends = (double)(ed-st);
    // CPU timing.
    printf("CPUCost time : %lf ms\n",ends/CLOCKS_PER_SEC*1000);
    // printf("CPUAnswer : \n");
    // for(int i = 0;i < N;i++){
    //     for(int j = 0;j < N;j++){
    //         printf("%d ",c[i*N+j]);
    //     }
    //     printf("\n");
    // }

    // Release host memory.
    free(a);
    free(b);
    free(c);

    if(mismatches != 0){
        fprintf(stderr, "GPU and CPU results differ in %ld elements\n", mismatches);
        return EXIT_FAILURE;
    }
    return 0;
}
分析可得,GPU用大量线程并行计算矩阵乘积的各个元素,而CPU只能串行地逐个计算,所以GPU运行时间会更短。需要注意:内核启动是异步的,计时前必须先调用cudaDeviceSynchronize()等待内核执行完毕,否则测到的只是内核的启动开销,而不是实际运算时间。