主要是VS的compile和link配置:用CUDA.rule来编译.cu文件,同时把$(CUDA_INC_PATH)和$(CUDA_LIB_PATH)加入工程的路径设置,并禁用一个与INC相关的选项。
INC_PATH中如果含有空格(white space)会报错,估计是VS自己的问题。
贴个代码...Javaye别再不支持MF了。。。
#include <iostream>
#include <cstdlib>
using namespace std;
// Matrix multiplication kernel: each thread computes one element of
// Pd = Md * Nd, where all three are Width x Width row-major matrices.
//
// Launch layout: 2D thread blocks on a 2D grid. The thread with global
// coordinates (col, row) produces Pd[row * Width + col]. The grid must cover
// at least Width threads along each axis; threads that land outside the
// matrix return without reading or writing anything. A single
// Width x Width block on a 1 x 1 grid is still a valid launch.
// No shared memory is used.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    // Global 2D coordinates of this thread. For a one-block launch blockIdx
    // is 0, so these reduce to threadIdx.x / threadIdx.y.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: the launch may contain more threads than elements.
    if (row >= Width || col >= Width)
        return;

    // Pvalue accumulates the dot product of row `row` of Md and
    // column `col` of Nd.
    float Pvalue = 0.0f;
    for (int k = 0; k < Width; ++k)
    {
        Pvalue += Md[row * Width + k] * Nd[k * Width + col];
    }

    // Each in-range thread writes exactly one element of the result.
    Pd[row * Width + col] = Pvalue;
}
// Reference matrix multiplication on the host (CPU): P = M * N.
//
// M, N and P are Width x Width matrices stored row-major in flat arrays.
// Each dot product is accumulated in double precision and converted to
// float once, when it is stored into P. A non-positive Width does nothing.
void MatrixMulOnHost(float* M, float* N, float* P, int Width)
{
    for (int i = 0; i < Width; ++i) {
        for (int j = 0; j < Width; ++j) {
            double sum = 0.0;
            for (int k = 0; k < Width; ++k) {
                double a = M[i * Width + k];
                double b = N[k * Width + j];
                sum += a * b;
            }
            // Explicit narrowing: P holds single-precision values.
            P[i * Width + j] = static_cast<float>(sum);
        }
    }
}
// Stops the program with a diagnostic if a CUDA runtime call has failed.
// `what` names the operation so the message points at the failing step.
static void CheckCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Computes P = M * N on the GPU.
//
// M, N and P are host arrays holding Width x Width row-major matrices.
// The function copies M and N to the device, launches MatrixMulKernel and
// copies the result back into P. The whole matrix is computed by a single
// Width x Width thread block, so Width * Width must not exceed the device's
// threads-per-block limit; a launch that violates this is reported by the
// error check after the launch instead of silently producing garbage.
// Any CUDA failure terminates the program. A non-positive Width does nothing.
void MatrixMulOnDevice(float* M, float* N, float* P, int Width)
{
    if (Width <= 0)
        return;

    // size_t arithmetic avoids int overflow of the byte count.
    const size_t size = static_cast<size_t>(Width) * Width * sizeof(float);
    float* Md = NULL;
    float* Nd = NULL;
    float* Pd = NULL;

    // Allocate and load M, N to device memory
    CheckCuda(cudaMalloc((void**)&Md, size), "cudaMalloc(Md)");
    CheckCuda(cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice), "cudaMemcpy(M -> Md)");
    CheckCuda(cudaMalloc((void**)&Nd, size), "cudaMalloc(Nd)");
    CheckCuda(cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice), "cudaMemcpy(N -> Nd)");

    // Allocate P on the device
    CheckCuda(cudaMalloc((void**)&Pd, size), "cudaMalloc(Pd)");

    // Execution configuration: one block with one thread per output element.
    dim3 dimBlock(Width, Width);
    dim3 dimGrid(1, 1);

    // Launch the device computation threads. The kernel needs Width to
    // index the matrices, so it is passed along with the three pointers.
    MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);
    // A kernel launch returns no status; configuration errors (for example
    // a block that is too large) are picked up here.
    CheckCuda(cudaGetLastError(), "MatrixMulKernel launch");

    // Read P from the device. This blocking copy also reports errors that
    // occurred while the kernel was running.
    CheckCuda(cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost), "cudaMemcpy(Pd -> P)");

    // Free device matrices
    CheckCuda(cudaFree(Md), "cudaFree(Md)");
    CheckCuda(cudaFree(Nd), "cudaFree(Nd)");
    CheckCuda(cudaFree(Pd), "cudaFree(Pd)");
}
// Prints a width x width row-major matrix to standard output, one row per
// line, with the elements of a row separated by single spaces.
static void PrintMatrix(const float* m, int width)
{
    for (int r = 0; r < width; ++r) {
        for (int c = 0; c < width; ++c) {
            if (c > 0)
                std::cout << ' ';
            std::cout << m[r * width + c];
        }
        std::cout << std::endl;
    }
}

// Demo driver: fills two 3 x 3 matrices with rand() values, prints them,
// multiplies them with MatrixMulOnDevice and prints the product.
int main()
{
    const int width = 3;
    const int count = width * width;

    float A[count];
    float B[count];
    // Zero-initialised so the printed result is well defined even if the
    // device computation leaves some element unwritten.
    float C[count] = {0};

    // rand() is not seeded here, so it uses its default seed.
    for (int i = 0; i < count; i++)
    {
        A[i] = static_cast<float>(std::rand());
        B[i] = static_cast<float>(std::rand());
    }

    PrintMatrix(A, width);
    std::cout << std::endl;
    PrintMatrix(B, width);
    std::cout << std::endl;

    MatrixMulOnDevice(A, B, C, width);

    PrintMatrix(C, width);
    return 0;
}