【CUDA并行编程之四】矩阵相乘

前面介绍了基本的Cuda编程的相关知识,那么这一篇在此基础之上来看看GPU在处理数据计算上的高效能,我们拿矩阵相乘来作为例子。


1.CPU上执行矩阵相乘以及性能。

在CPU上进行矩阵相乘运算的代码:

mat_mul.cc:

//a[i]*b[i] + c[i] = d[i]
#include<iostream>
#include<vector>
#include<map>
#include<fstream>
#include"wtime.h" 

using namespace std;

const int N = 320;

//矩阵有两种表达的方法用二维矩阵或者用一维矩阵表示
int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1];
int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)];

void init()
{
	for(int i=0;i<N;i++)
		for(int j=0;j<N;j++)
		{
			a[i][j] = 1;
			b[i][j] = 2;
			c[i][j] = 3;
		}
}

void init1()
{
	for(int i=0;i<N;i++)
		for(int j=0;j<N;j++)
		{
			aa[i*N+j] = 1;
			bb[i*N+j] = 2;
			cc[i*N+j] = 3;
		}
}

void mul()
{
	for(int i=0;i<N;i++)	
	  for(int j=0;j<N;j++)
	  {
		for(int k=0;k<N;k++)
		{
			d[i][j] += a[i][k] * b[k][j];
		}
		d[i][j] += c[i][j];
	  }
}

void mul1()
{
	for(int i=0;i<N;i++)	
	  for(int j=0;j<N;j++)
	  {
		for(int k=0;k<N;k++)
		{
			dd[i*N+j] += aa[i*N+k] * bb[k*N+j];
		}
		dd[N*i+j] += cc[N*i+j];
	  }
}

void print()
{
	ofstream fout;
	fout.open("result.txt");
	if(!fout)
	{
		perror("can not open the file");
	}
	for(int i=0;i<N;i++)
	{
	  for(int j=0;j<N;j++)
	  {
	  	  fout<<d[i][j]<<" ";
	  }
	  fout<<endl;
	}
	fout.close();
}

int main()
{
	init1();	
	
	double t = wtime();
	mul1();
	t = wtime()-t;
	printf("computation timing = %10.10f sec\n",t);
	
	//print();

	return 0;
}
wtime.h:

#ifndef _WTIME_
#define _WTIME_

double wtime();

#endif


wtime.cc:

#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

double wtime(void)
{
	double now_time;
	struct timeval etstart;
	struct timezone tzp;

	if(gettimeofday(&etstart,&tzp)==-1)
	{
		perror("Error:calling gettimeofday() not successfully.\n");
	}

	now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;

	return now_time;
}

#if 0
int main()
{
	double time;
	time = wtime();

	printf("time of day = %10.4f\n",time);

	return 0;
}
#endif

makefile:

target:
	g++ mat_mul.cc wtime.cc
	./a.out

结果:



2.GPU上执行矩阵相乘以及性能。

代码:

cuda_mat_mul_v1.cu:

//matrix multiplication with global memory 
#include<iostream>
#include<fstream>
#include "wtime.h"

using namespace std;


const int BLOCK_SIZE = 16;
const int GRID_SIZE = 20;

//D = A * B + C;
__global__ void mat_mul(int *da,int *db,int *dc,int *dd,int N)
{
	int row = blockIdx.y * blockDim.y + threadIdx.y;
	int col = blockIdx.x * blockDim.x + threadIdx.x;

	int sum = 0;
	for(int i=0;i<N;i++)
	{
		sum += da[row*N + i] * db[row*i+col];
	}
	dd[row*N + col] = sum + dc[row*N + col];
}

int main()
{
	int N = BLOCK_SIZE * GRID_SIZE;
	int *ha,*hb,*hc,*hd;
	int *da,*db,*dc,*dd;
	double time;
	ha = new int[N*N];
	hb = new int[N*N];
	hc = new int[N*N];
	hd = new int[N*N];
	cudaError_t err;

	//initialize
	for(int i=0;i<N;i++)
		for(int j=0;j<N;j++)
		{
			ha[i*N+j] = 1;
			hb[i*N+j] = 2;
			hc[i*N+j] = 3;
		}

	//malloc</strong>
	cudaMalloc(&da,N*N*sizeof(int));
	cudaMalloc(&db,N*N*sizeof(int));
	cudaMalloc(&dc,N*N*sizeof(int));
	err = cudaMalloc(&dd,N*N*sizeof(int));
	printf("Cuda Malloc C : %s\n",cudaGetErrorString(err));

	//host to device
	cudaMemcpy(da,ha,N*N*sizeof(int),cudaMemcpyHostToDevice);
	cudaMemcpy(db,hb,N*N*sizeof(int),cudaMemcpyHostToDevice);
	cudaMemcpy(dc,hc,N*N*sizeof(int),cudaMemcpyHostToDevice);
	cudaMemcpy(dd,hd,N*N*sizeof(int),cudaMemcpyHostToDevice);

	dim3 threadBlock(BLOCK_SIZE,BLOCK_SIZE);
	dim3 grid(GRID_SIZE,GRID_SIZE);
	//kernel
	time = wtime();
	mat_mul<<<grid,threadBlock>>>(da,db,dc,dd,N);
	printf("Computation time is %10.10f\n",wtime()-time);

	//device to host
	cudaMemcpy(hd,dd,N*N*sizeof(int),cudaMemcpyDeviceToHost);

	//print result to file
	ofstream fout;
	fout.open("result_v1.txt");
	if(!fout)  
	{
		cerr<<"open the file error"<<endl;
		exit(-1);
	}
	for(int i=0;i<N;i++)	
	{
		for(int j=0;j<N;j++)
		{
			fout<<hd[i*N+j]<<" ";
		}
		fout<<endl;
	}
	
	delete []ha;delete []hb;delete []hc;delete []hd;
	cudaFree(da);cudaFree(db);cudaFree(dc);cudaFree(dd);

	return 0;
}
cuda_wtime.cu:

#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

double wtime(void)
{
	double now_time;
	struct timeval etstart;
	struct timezone tzp;

	if(gettimeofday(&etstart,&tzp)==-1)
	{
		perror("Error:calling gettimeofday() not successfully.\n");
	}

	now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;

	return now_time;
}

#if 0
int main()
{
	double time;
	time = wtime();

	printf("time of day = %10.4f\n",time);

	return 0;
}
#endif
wtime.h:
#ifndef _WTIME_
#define _WTIME_

double wtime();

#endif


cuda_wtime.cu:

#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

double wtime(void)
{
	double now_time;
	struct timeval etstart;
	struct timezone tzp;

	if(gettimeofday(&etstart,&tzp)==-1)
	{
		perror("Error:calling gettimeofday() not successfully.\n");
	}

	now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;

	return now_time;
}

#if 0
int main()
{
	double time;
	time = wtime();

	printf("time of day = %10.4f\n",time);

	return 0;
}
#endif

makefile:

cu:
	nvcc cuda_mat_mul_v1.cu cuda_wtime.cu
	./a.out

结果:



3.计算性能对比:

矩阵大小
1600*1600
1200*1200
800*800
320*320
串行时间/s
30.9
11.49865
2.597987
0.162311
并行时间
grid=100/block=16
grid=75/block=16
grid=50/block=16
grid=20/block=16
kernel执行时间/s
0.0000319
0.0000309944
0.0000309944
0.0000231266
并行计算总时间(分配内存加+数据拷贝+计算)/s
0.70796
0.439213
0.310214
0.237676

可见,在矩阵规模大的时候非常明显的体现出了GPU强大的计算能力。


注明出处:http://blog.csdn.net/lavorange/article/details/41896591



你可能感兴趣的:(CUDA,并行计算,矩阵相乘)