[CUDA] First...

The main issue is getting VS to compile and link: use the CUDA.rules custom build rule to compile the .cu files, add $(CUDA_INC_PATH) and $(CUDA_LIB_PATH) to the include and library paths, and disable some INC-related setting (I don't remember exactly which one).
If the include path contains whitespace it throws an error; that's probably VS's own problem.
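For reference, the custom build step boils down to roughly the following nvcc call (a sketch from memory; the file name matrixmul.cu is a placeholder and the exact flags depend on the rule version):

nvcc -c matrixmul.cu -o "$(IntDir)\matrixmul.obj" -I"$(CUDA_INC_PATH)"

The linker then needs cudart.lib, with $(CUDA_LIB_PATH) added to the additional library directories.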
Posting the code below... (and Javaye, please stop refusing to support MF already...)
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>  // cudaMalloc, cudaMemcpy, cudaFree
using namespace std;

// Matrix multiplication kernel – per thread code

__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    // 2D Thread ID
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Pvalue is used to store the element of the matrix
    // that is computed by the thread
    float Pvalue = 0;

    for (int k = 0; k < Width; ++k)
    {
        float Melement = Md[ty * Width + k];
        float Nelement = Nd[k * Width + tx];
        Pvalue += Melement * Nelement;
    }

    // Write the computed element to device memory;
    // each thread writes one element
    Pd[ty * Width + tx] = Pvalue;
}
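The kernel above indexes only with threadIdx, so it relies on the whole matrix fitting into a single thread block (see the launch configuration further down). A minimal sketch of a grid-wide variant, with an assumed tile size TILE_WIDTH of my own choosing, would compute global row/column indices instead:

// Sketch only, not part of the original code: one thread per output element,
// spread across multiple blocks so Width is not limited by the block size.
#define TILE_WIDTH 16

__global__ void MatrixMulKernelGrid(float* Md, float* Nd, float* Pd, int Width)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < Width && col < Width)
    {
        float Pvalue = 0;
        for (int k = 0; k < Width; ++k)
            Pvalue += Md[row * Width + k] * Nd[k * Width + col];
        Pd[row * Width + col] = Pvalue;
    }
}

It would be launched with dim3 dimBlock(TILE_WIDTH, TILE_WIDTH) and dim3 dimGrid((Width + TILE_WIDTH - 1) / TILE_WIDTH, (Width + TILE_WIDTH - 1) / TILE_WIDTH).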


// Matrix multiplication on the (CPU) host in double precision
void MatrixMulOnHost(float* M, float* N, float* P, int Width)
{   
    for (int i = 0; i < Width; ++i)
        for (int j = 0; j < Width; ++j) {
            double sum = 0;
            for (int k = 0; k < Width; ++k) {
                double a = M[i * Width + k];
                double b = N[k * Width + j];
                sum += a * b;
            }
            P[i * Width + j] = sum;
        }
}
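MatrixMulOnHost is never actually called in main below; a small comparison helper makes it usable as a correctness check. This is a sketch: VerifyResult and the tolerance are my own choices.

// Sketch: compare the GPU result against the CPU reference element by element.
// The tolerance is an arbitrary allowance for float accumulation error.
// Needs: #include <cmath> for fabs.
bool VerifyResult(float* hostP, float* deviceP, int Width)
{
    for (int i = 0; i < Width * Width; ++i)
        if (fabs(hostP[i] - deviceP[i]) > 1e-3f * fabs(hostP[i]) + 1e-3f)
            return false;
    return true;
}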

void MatrixMulOnDevice(float* M, float* N, float* P, int Width)
{
    int size = Width * Width * sizeof(float);
    float *Md, *Nd, *Pd;

    // Allocate and load M, N to device memory
    cudaMalloc((void**)&Md, size);
    cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);

    cudaMalloc((void**)&Nd, size);
    cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice);

    // Allocate P on the device
    cudaMalloc((void**)&Pd, size);
	
    // Set up the execution configuration: one block covering the whole matrix.
    // Note: this only works while Width * Width stays within the per-block
    // thread limit (512 threads on early CUDA GPUs, 1024 on newer ones).
    dim3 dimBlock(Width, Width);
    dim3 dimGrid(1, 1);

    // Launch the device computation threads!
    MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);

    // Read P from the device
    cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost);

    // Free device matrices
    cudaFree(Md); cudaFree(Nd); cudaFree(Pd);
}
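None of the runtime calls above are checked for errors. A minimal checking sketch (CUDA_CHECK is an assumed helper name, not something shipped with the CUDA toolkit):

// Sketch: wrap runtime calls so failures are reported immediately.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            cerr << "CUDA error at " << __FILE__ << ":" << __LINE__   \
                 << ": " << cudaGetErrorString(err) << endl;          \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

Usage would look like CUDA_CHECK(cudaMalloc((void**)&Md, size)); and, right after the kernel launch, CUDA_CHECK(cudaGetLastError());.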

int main()
{
    float A[9];
    float B[9];
    float C[9];
    int width = 3;

    for (int i = 0; i < 9; i++)
    {
        A[i] = rand();
        B[i] = rand();
    }

    for (int i = 0; i < 9; i++)
    {
        cout << A[i] << " ";
        if (i % 3 == 2) cout << endl;
    }
    for (int i = 0; i < 9; i++)
    {
        cout << B[i] << " ";
        if (i % 3 == 2) cout << endl;
    }

    MatrixMulOnDevice(A, B, C, width);

    for (int i = 0; i < 9; i++)
    {
        cout << C[i] << " ";
        if (i % 3 == 2) cout << endl;
    }

    return 0;
}
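To tie the pieces together, a check against the CPU reference could be appended at the end of main, right after the MatrixMulOnDevice call (sketch; VerifyResult is the hypothetical helper from above):

float Ref[9];
MatrixMulOnHost(A, B, Ref, width);
cout << (VerifyResult(Ref, C, width) ? "PASSED" : "FAILED") << endl;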
