// CUDA matrix multiplication (CUDA矩阵乘法)

#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"

#include <cstdlib>
#include <iostream>

// Timing on Windows (QueryPerformanceCounter / QueryPerformanceFrequency)
#include <windows.h>

using namespace std;

// Edge length of the square matrices used throughout this file
#define numElements 256
// Returns the current reading of the Windows performance counter, expressed
// in seconds. Subtract two readings to obtain an elapsed wall-clock interval.
double get_time() {
	// Counter frequency (ticks per second); queried on the first call only
	// and then reused by every later call.
	static LARGE_INTEGER ticksPerSecond;
	static bool frequencyKnown = false;

	if (!frequencyKnown) {
		QueryPerformanceFrequency(&ticksPerSecond);
		frequencyKnown = true;
	}

	LARGE_INTEGER now;
	QueryPerformanceCounter(&now);

	// Multiplying by 1. promotes the 64-bit tick count to double before the
	// division, so the result keeps its fractional part.
	return now.QuadPart * 1. / ticksPerSecond.QuadPart;
}

// CPU reference implementation: C += A * B for square
// numElements x numElements integer matrices.
//
// The product is accumulated into C, so the caller must zero C beforehand
// when a plain product is wanted.
void Host_MarticMultiply(int A[][numElements], int B[][numElements], int C[][numElements]) {
	for (int row = 0; row < numElements; ++row) {
		int* outRow = C[row];
		int* lhsRow = A[row];
		for (int col = 0; col < numElements; ++col) {
			// Dot product of row `row` of A with column `col` of B,
			// added term by term directly into the output element.
			for (int inner = 0; inner < numElements; ++inner) {
				outRow[col] += lhsRow[inner] * B[inner][col];
			}
		}
	}
}

// GPU kernel: h_c += h_a * h_b for square numElements x numElements integer
// matrices, one output element per thread.
//
// Despite the h_ prefix, all three arguments are dereferenced on the device
// and must be device-accessible (main passes cudaMalloc'd buffers).
//
// Launch layout: 2D grid of 2D blocks. The x dimension selects the output
// column and the y dimension selects the output row, so threads with
// consecutive threadIdx.x touch consecutive elements of h_b and h_c. The grid
// must span at least numElements threads in both x and y; threads that fall
// outside the matrix return without touching memory.
//
// The product is accumulated into h_c, so h_c must hold the desired initial
// values (e.g. zeros) before the launch.
__global__ void MatixMultiply(int h_a[][numElements], int h_b[][numElements], int h_c[][numElements])
{
	const int col = threadIdx.x + blockIdx.x * blockDim.x;
	const int row = threadIdx.y + blockIdx.y * blockDim.y;

	// Guard against grids that overshoot the matrix bounds.
	if (row >= numElements || col >= numElements)
		return;

	// Keep the running sum in a thread-local variable: h_c is read once and
	// written once instead of being read-modified-written numElements times.
	int sum = h_c[row][col];
	for (int k = 0; k < numElements; k++) {
		sum += h_a[row][k] * h_b[k][col];
	}
	h_c[row][col] = sum;
}

// Prints three 5x5 sample windows of the matrix so results can be compared by
// eye: columns 0-4 of rows 0-4, rows 100-104 and rows 250-254. Windows are
// separated by a blank line; values within a row are tab-terminated.
void textValue(int h_c[][numElements]) {
	const int firstRows[] = { 0, 100, 250 };
	const int windowCount = sizeof(firstRows) / sizeof(firstRows[0]);
	const int windowSize = 5;

	cout << "测试程序:测试三组数据" << endl;
	for (int w = 0; w < windowCount; ++w) {
		// Blank separator line between windows, but not before the first
		// one and not after the last one.
		if (w > 0)
			cout << endl;

		const int lastRow = firstRows[w] + windowSize;
		for (int r = firstRows[w]; r < lastRow; ++r) {
			for (int c = 0; c < windowSize; ++c)
				cout << h_c[r][c] << "\t";
			cout << endl;
		}
	}
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation for the error message. Remaining allocations are
// left for process teardown to reclaim.
static void checkCuda(cudaError_t status, const char* what) {
	if (status != cudaSuccess) {
		cerr << "CUDA error during " << what << ": " << cudaGetErrorString(status) << endl;
		exit(EXIT_FAILURE);
	}
}

// Multiplies two numElements x numElements integer matrices once on the GPU
// and once on the CPU, prints the wall-clock time of each run, then prints
// sample windows of both results so they can be compared.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main()
{
	const size_t bytes = sizeof(int) * numElements * numElements;

	// Host buffers: inputs h_a/h_b, GPU result h_c, CPU result h_d.
	int(*h_a)[numElements] = new int[numElements][numElements];
	int(*h_b)[numElements] = new int[numElements][numElements];
	int(*h_c)[numElements] = new int[numElements][numElements];
	int(*h_d)[numElements] = new int[numElements][numElements];

	// Both multiply routines accumulate into their output, so the two result
	// buffers must start at zero.
	for (int i = 0; i < numElements; i++) {
		for (int j = 0; j < numElements; j++) {
			h_a[i][j] = i + 1;
			h_b[i][j] = i + 1;
			h_c[i][j] = 0;
			h_d[i][j] = 0;
		}
	}

	// Device buffers mirroring h_a, h_b and h_c.
	int(*d_a)[numElements], (*d_b)[numElements], (*d_c)[numElements];
	checkCuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
	checkCuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
	checkCuda(cudaMalloc((void**)&d_c, bytes), "cudaMalloc(d_c)");

	checkCuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy of A to device");
	checkCuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy of B to device");
	checkCuda(cudaMemcpy(d_c, h_c, bytes, cudaMemcpyHostToDevice), "copy of C to device");

	// Size the grid with a ceiling division so it covers the matrix in both
	// dimensions without launching blocks that lie entirely outside it.
	dim3 threads(32, 32);
	const unsigned int gridX = (numElements + threads.x - 1) / threads.x;
	const unsigned int gridY = (numElements + threads.y - 1) / threads.y;
	dim3 blocks(gridX, gridY);

	// GPU timing: drain pending device work first so only the kernel is timed.
	checkCuda(cudaDeviceSynchronize(), "pre-launch synchronize");
	double td = get_time();

	MatixMultiply<<<blocks, threads>>>(d_a, d_b, d_c);
	// A launch returns no status itself: configuration errors show up in
	// cudaGetLastError(), execution errors at the following synchronize.
	checkCuda(cudaGetLastError(), "kernel launch");
	checkCuda(cudaDeviceSynchronize(), "kernel execution");

	td = get_time() - td;
	cout << "GPU执行时间:" << td << endl; // total GPU time

	// Copy the GPU result back to host memory.
	checkCuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "copy of C to host");

	// CPU timing: host-only work, so no device synchronization is involved.
	double th = get_time();
	Host_MarticMultiply(h_a, h_b, h_d);
	th = get_time() - th;
	cout << "CPU执行时间:" << th << endl; // total CPU time

	cout << "GPU:" << endl;
	textValue(h_c);
	cout << endl;
	cout << "CPU:" << endl;
	textValue(h_d);
	cout << endl;

	// The host buffers came from new[], so they must be released with
	// delete[] (not free); h_d is released along with the others.
	delete[] h_a;
	delete[] h_b;
	delete[] h_c;
	delete[] h_d;

	checkCuda(cudaFree(d_a), "cudaFree(d_a)");
	checkCuda(cudaFree(d_b), "cudaFree(d_b)");
	checkCuda(cudaFree(d_c), "cudaFree(d_c)");
	return 0;
}

// 你可能感兴趣的:(cuda,笔记,cuda)