向量乘以矩阵(vector_matrix_multiplication)

/********************************************************************
created: 2009/09/19
created: 19:9:2009 12:00
filename: vector_matrix_multiplication.cu
file base: vector_matrix_multiplication
file ext: CUDA
author: zhao.kaiyong(at)gmail.com
purpose: vector matrix multiplication
copyright: everyone can use this code, please specify source.
任意使用,请注明出处;
http://www.hpctech.com
http://openhero.net
*********************************************************************/
#include <stdio.h><br>#include <stdlib.h><br>#include <cuda_runtime.h><br>#include <cutil.h></cutil.h></cuda_runtime.h></stdlib.h></stdio.h>

/************************************************************************/
/* Init CUDA */
/************************************************************************/
#if __DEVICE_EMULATION__

bool InitCUDA(void){return true;}

#else
bool InitCUDA(void)
{
int count = 0;
int i = 0;

cudaGetDeviceCount(&count);
if(count == 0) {
fprintf(stderr, "There is no device./n");
return false;
}

for(i = 0; i cudaDeviceProp prop;
if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
if(prop.major >= 1) {
break;
}
}
}
if(i == count) {
fprintf(stderr, "There is no device supporting CUDA./n");
return false;
}
cudaSetDevice(i);

printf("CUDA initialized./n");
return true;
}

#endif
/************************************************************************/
/* Example */
/************************************************************************/
__global__ static void vector_matrix_mult_kernel(float* A, long wA, float* B, long wB, float* C)
{
__shared__ float subA[64];

A = A + threadIdx.x;
B = B + blockIdx.x * 64 + threadIdx.x;
C = C + blockIdx.x * 64 + threadIdx.x;

float subC = 0.0;

for (int i = 0; i {
subA[threadIdx.x] = A[i];
__syncthreads();

#pragma unroll
for (int j = 0; j {
subC += subA[j] * B[0];
}
__syncthreads();
}

C[0] = subC;
}

//__global__ static void vector_matrix_mult_kernel_32T(float* A, long wA, float* B, long wB, float* C)
//{

//}

#define RUN_TEST

#define WA 32
#define WB 64
/************************************************************************/
/* HelloCUDA */
/************************************************************************/
int main(int argc, char* argv[])
{

if(!InitCUDA()) {
return 0;
}

srand(2009);

long wA = 64* WA;
long wB = 64* WB;

long size_A = wA;
long size_B = wA*wB;
long size_C = wB;

float *hA = (float*)malloc(sizeof(float) * size_A);
float *hB = (float*)malloc(sizeof(float) * size_B);
float *hC = (float*)malloc(sizeof(float) * size_C);
float *testhC = (float*)malloc(sizeof(float) * size_C);

for (int i = 0; i {
hA[i] = (float)rand()/(float)RAND_MAX;
}

for (int i = 0; i {
hB[i] = (float)rand()/(float)RAND_MAX;
}

float *dA = 0;
float *dB = 0;
float *dC = 0;

CUDA_SAFE_CALL( cudaMalloc((void**) &dA, sizeof(float) * size_A));
CUDA_SAFE_CALL( cudaMalloc((void**) &dB, sizeof(float) * size_B));
CUDA_SAFE_CALL( cudaMalloc((void**) &dC, sizeof(float) * size_C));

CUDA_SAFE_CALL( cudaMemcpy(dA, hA, sizeof(float)*size_A, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL( cudaMemcpy(dB, hB, sizeof(float)*size_B, cudaMemcpyHostToDevice));

unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));

dim3 threads = 64;
dim3 blocks = wB/64;
vector_matrix_mult_kernel>>(dA, wA, dB, wB, dC);

CUT_CHECK_ERROR("Kernel execution failed/n");

CUDA_SAFE_CALL( cudaMemcpy(hC, dC, sizeof(float) * size_C, cudaMemcpyDeviceToHost));
CUT_SAFE_CALL( cutStopTimer( timer));
printf("Processing time: %f (ms)/n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutResetTimer( timer));

for (int i = 0; i {
float subC = 0.0;
for (int j = 0; j <wa j></wa> {
subC += hA[j] * hB[j*wB + i];
}
testhC[i] = subC;
}

CUT_SAFE_CALL( cutStopTimer( timer));
printf("Processing time: %f (ms)/n", cutGetTimerValue( timer));

CUT_SAFE_CALL( cutDeleteTimer( timer));

#ifdef RUN_TEST

CUTBoolean res = cutCompareL2fe(testhC, hC, size_C, 1e-6f);
printf("Test %s /n", (1 == res) ? "PASSED":"FAILED");
#endif

CUDA_SAFE_CALL( cudaFree(dA));
CUDA_SAFE_CALL( cudaFree(dB));
CUDA_SAFE_CALL( cudaFree(dC));
free(hA);
free(hB);
free(hC);
CUT_EXIT(argc, argv);

return 0;
}

你可能感兴趣的:(vector)