#include "stdafx.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
extern "C" int mulWithCuda(float *c, const float *a, const float *b, int size);
// Entry point: builds two size x size random float matrices, multiplies them
// on the CPU (timed, result in matrix_d) and on the GPU via mulWithCuda
// (timed, result in matrix_c), and prints all matrices so the two results can
// be compared by eye.
int _tmain(int argc, _TCHAR* argv[])
{
    int i = 0, j = 0, k = 0;
    float sum = 0;
    int size = 8;
    srand((unsigned)time(NULL));
    // Flat row-major size*size arrays: element (i, j) lives at [i * size + j].
    float *matrix_a = (float *)malloc(size * size * sizeof(float)); // input A
    float *matrix_b = (float *)malloc(size * size * sizeof(float)); // input B
    float *matrix_c = (float *)malloc(size * size * sizeof(float)); // GPU result
    float *matrix_d = (float *)malloc(size * size * sizeof(float)); // CPU reference
    if (matrix_a == NULL || matrix_b == NULL || matrix_c == NULL || matrix_d == NULL)
    {
        fprintf(stderr, "malloc failed!\n");
        free(matrix_a); free(matrix_b); free(matrix_c); free(matrix_d);
        return 1;
    }
    // Fill A and B with random values in roughly [0, 10).
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            matrix_a[i * size + j] = (float)rand() / (RAND_MAX / 10);
            matrix_b[i * size + j] = (float)rand() / (RAND_MAX / 10);
        }
    }
    // Print the inputs.
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            printf("%f ", matrix_a[i * size + j]);
        }
        printf("\n");
    }
    printf("\n");
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            printf("%f ", matrix_b[i * size + j]);
        }
        printf("\n");
    }
    printf("\n");
    // CPU reference multiply (triple loop), timed with clock().
    clock_t start = clock();
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            sum = 0;
            for (k = 0; k < size; k++)
            {
                sum = sum + matrix_a[i * size + k] * matrix_b[k * size + j];
            }
            matrix_d[i * size + j] = sum;
            printf("%f ", sum);
        }
        printf("\n");
    }
    clock_t end = clock();
    // CLOCKS_PER_SEC is the standard macro (CLK_TCK is an obsolete alias).
    double interval = double(end - start) / CLOCKS_PER_SEC;
    printf("CPU运行时间为:%lf\n", interval);
    // GPU multiply, timed around the whole host wrapper (alloc + copy + kernel).
    clock_t start1 = clock();
    int cudaStatus = mulWithCuda(matrix_c, matrix_a, matrix_b, size);
    clock_t end1 = clock();
    double interval1 = double(end1 - start1) / CLOCKS_PER_SEC;
    printf("GPU运行时间为:%lf\n", interval1);
    //printf("加速比为:%lf\n", interval / interval1);
    // Print the GPU result for visual comparison against the CPU output above.
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            printf("%f ", matrix_c[i * size + j]);
        }
        printf("\n");
    }
    // Release host buffers (the originals leaked all four).
    free(matrix_a);
    free(matrix_b);
    free(matrix_c);
    free(matrix_d);
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    return 0;
}
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define dimen 16
// Tiled matrix multiply: c = a * b for size x size row-major matrices.
// Expected launch: block = (dimen, dimen), grid = ceil(size/dimen) in x and y.
// Each block stages a dimen x dimen tile of a and b into shared memory,
// accumulates the partial inner product, then slides to the next tile pair.
// Out-of-range tile elements are loaded as 0 so the inner loop needs no guard.
__global__ void mulKernel1(float* c, float* a, float* b, int size)
{
    float sum = 0;
    // Shared-memory tiles for the current block of a and b.
    __shared__ float A[dimen][dimen];
    __shared__ float B[dimen][dimen];
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    // Global row/column of the output element this thread produces.
    int row = by * dimen + ty;
    int col = bx * dimen + tx;
    // Walk over ceil(size/dimen) tile pairs along the shared dimension.
    for (int k = 0; k < (size - 1) / dimen + 1; k++)
    {
        // Stage one element of each tile; zero-pad outside the matrix.
        if (k * dimen + tx < size && row < size)
        {
            A[ty][tx] = a[row * size + k * dimen + tx];
        }
        else
        {
            A[ty][tx] = 0;
        }
        if (k * dimen + ty < size && col < size)
        {
            B[ty][tx] = b[(k * dimen + ty) * size + col];
        }
        else
        {
            B[ty][tx] = 0;
        }
        __syncthreads(); // tiles fully written before any thread reads them
        // Accumulate this tile pair's contribution to the dot product.
        for (int m = 0; m < dimen; m++)
        {
            sum += A[ty][m] * B[m][tx];
        }
        __syncthreads(); // all reads done before the next iteration overwrites the tiles
    }
    // Write the result; threads past the matrix edge produce nothing.
    if (row < size && col < size)
    {
        c[row * size + col] = sum;
    }
}
// Naive global-memory matrix multiply: c = a * b, one thread per output
// element, for size x size row-major matrices. Expected launch: 2D block and
// a grid covering at least size x size threads.
__global__ void mulKernel(float *c, float *a, float *b, int size)
{
    int threadx = threadIdx.x;
    int thready = threadIdx.y;
    int blockx = blockIdx.x;
    int blocky = blockIdx.y;
    int row = blockx * blockDim.x + threadx;
    // Fix: y index must scale by blockDim.y (was blockDim.x — wrong for
    // non-square blocks).
    int col = blocky * blockDim.y + thready;
    // Fix: guard BEFORE the loop — the original only guarded the store, so
    // out-of-range threads still read a[] and b[] out of bounds.
    if (row >= size || col >= size)
    {
        return;
    }
    float sum = 0;
    // Dot product of row `row` of a with column `col` of b.
    for (int i = 0; i < size; i++)
    {
        sum += a[row * size + i] * b[i * size + col];
    }
    c[row * size + col] = sum;
}
// Helper function for using CUDA to add vectors in parallel.
// Host wrapper: multiplies two size x size row-major float matrices on the
// GPU using the tiled kernel mulKernel1 and returns the result in c.
// Parameters a and b are const to match the extern "C" declaration used by
// the caller (the original definition dropped the const qualifiers).
// Returns cudaSuccess (0) on success, or the first failing cudaError_t code.
extern "C" int mulWithCuda(float *c, const float *a, const float *b, int size)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate device buffers for the two inputs and the output.
    cudaStatus = cudaMalloc((void**)&dev_c, size * size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    // Copy the input matrices from host memory to the device.
    cudaStatus = cudaMemcpy(dev_a, a, size * size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    {
        // One dimen x dimen block per output tile; ceil-divide so partial
        // tiles at the edges are still covered (the kernels bounds-check).
        dim3 grid((size - 1) / dimen + 1, (size - 1) / dimen + 1, 1);
        dim3 block(dimen, dimen, 1);
        mulKernel1<<<grid, block>>>(dev_c, dev_a, dev_b, size);
    }
    // Kernel launches return no error directly: check the launch itself...
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // ...then wait for completion to surface any asynchronous execution error.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }
    // Copy the result back to host memory.
    // Fix: was sizeof(int) — correct only by coincidence of sizeof(int) ==
    // sizeof(float); use the element type actually being copied.
    cudaStatus = cudaMemcpy(c, dev_c, size * size * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
Error:
    // Single cleanup path; cudaFree(0) is a harmless no-op.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}