Calling CUDA from C++ (with VS2015), calling CUDA from MATLAB, and calling C++ from MATLAB: study notes

Versions used: VS2015, MATLAB 2016a, nvcc 9.0

Implemented functionality: multiplication of double-precision matrices (a templated version may follow later)

Note: the matrices handled here are stored in column-major order (MATLAB's convention), and the code below assumes that layout throughout; I am still exploring whether this can be changed.
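As a quick illustration of this column-major layout, which both the Eigen Map objects and the CUDA kernels below rely on, here is a minimal standalone C++ sketch; the matrix values and sizes are made up for the example:

// Column-major indexing sketch (illustrative values only):
// element (row, col) of an m-by-n matrix lives at flat index col*m + row.
#include <cstdio>

int main()
{
	const int m = 2, n = 3;                      // a 2-by-3 matrix
	double A[m * n] = { 1, 4, 2, 5, 3, 6 };      // columns stored back to back: [1 2 3; 4 5 6]
	int row = 1, col = 2;
	printf("A(%d,%d) = %f\n", row, col, A[col * m + row]);  // reads the element 6
	return 0;
}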

1. MATLAB calling C++ through mexFunction; the C++ extension uses the Eigen library, version 3.3

//Matrixs_Multiply.cpp
/* Function for matrix multiplication, based on Eigen 3.3.
   MATLAB calls this C++ function, which in turn uses the Eigen library.
   A .m file named function_compiler sets up the environment and builds the function;
   change the paths and libraries in that script according to your own project.
   */

#include <Eigen/Dense>
#include "mex.h"
using namespace Eigen;
using namespace std;

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
	int L_rows = mxGetM(prhs[0]);
	int L_cols = mxGetN(prhs[0]);
	Map< MatrixXd>  L_mat(mxGetPr(prhs[0]), L_rows, L_cols);
	int R_rows = mxGetM(prhs[1]);
	int R_cols = mxGetN(prhs[1]);
	Map< MatrixXd>  R_mat(mxGetPr(prhs[1]), R_rows, R_cols);
	plhs[0] = mxCreateDoubleMatrix(L_rows, R_cols, mxREAL);
	MatrixXd result;
	result = L_mat*R_mat;
	double* outdata = mxGetPr(plhs[0]);
	for (int i = 0; i < L_rows * R_cols; i++)
	{
		outdata[i] = result.data()[i];   // Eigen and MATLAB are both column-major, so a flat copy suffices
	}
}
The MATLAB .m script that compiles the C++ file is shown below; change the paths and library names at the marked places to match your own installation.
clear all;
% Get the architecture of this computer
is_64bit = strcmp(computer,'MACI64') || strcmp(computer,'GLNXA64') || strcmp(computer,'PCWIN64');
 

%----------------------------------------------------------------------------------------------  
 %% Compiler configuration
% Modify the paths below to match your own OpenCV and Eigen installations.
% Note: on a 64-bit system, the OpenCV build you link against must also be 64-bit!
out_dir='./';  
%CPPFLAGS = ' -O -DNDEBUG -I.\ -IF:\opencv\build\include -IF:\opencv\build\include\opencv -IF:\opencv\build\include\opencv2'; % your OpenCV "include" path  
%LDFLAGS = ' -LF:\opencv\build\x86\vc10\lib';                       % your OpenCV "lib" path   
%LIBS = ' -lopencv_calib3d249d -lopencv_contrib249d -lopencv_core249d -lopencv_features2d249d -lopencv_flann249d -lopencv_gpu249d -lopencv_highgui249d -lopencv_imgproc249d -lopencv_legacy249d -lopencv_ml249d -lopencv_nonfree249d -lopencv_objdetect249d -lopencv_photo249d -lopencv_stitching249d -lopencv_ts249d -lopencv_video249d -lopencv_videostab249d';
%LIBS = ' -lopencv_calib3d249 -lopencv_contrib249 -lopencv_core249 -lopencv_features2d249 -lopencv_flann249 -lopencv_gpu249 -lopencv_highgui249 -lopencv_imgproc249 -lopencv_legacy249 -lopencv_ml249 -lopencv_nonfree249 -lopencv_objdetect249 -lopencv_photo249 -lopencv_stitching249 -lopencv_ts249 -lopencv_video249 -lopencv_videostab249';
CPPFLAGS = ' -O -DNDEBUG -I.\ -IF:\opencv\build\include -IE:\opencv3.3\opencv\build\include -IE:\eigen_higher_version' ; % your OpenCV "include" path  
LDFLAGS = ' -LE:\opencv3.3\opencv\build\x64\vc14\lib';                       % your OpenCV "lib" path 
LIBS = ' -lopencv_world330d';   % the leading space keeps this a separate mex argument after the split below
if is_64bit  
   CPPFLAGS = [CPPFLAGS ' -largeArrayDims'];  
end  
 
% add your files here!!
compile_files = {
%the list of your code files which need to be compiled
    'Matrixs_Multiply.cpp'
};
%---------------------------------------------------------------------------------------------- 

%---------------------------------------------------------------------------------------------- 
%% compiling
for k = 1 : length(compile_files)  
     str = compile_files{k};  
     fprintf('compilation of: %s\n', str);  
     str = [str ' -outdir ' out_dir CPPFLAGS LDFLAGS LIBS];  
     args = regexp(str, '\s+', 'split');  
     mex(args{:});  
 end 
 fprintf('Congratulations, compilation successful!!!\n');
 %---------------------------------------------------------------------------------------------- 

2. Calling CUDA from C++ (with VS2015)

/* This .cu file computes the product of two matrices, using a CUDA kernel to speed up the C/C++ host code.
   Remember to declare the exported function with extern "C", as done below,
   so that the C++ host code can link to it without name mangling. */

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>



__global__ void MatrixMuiOnDevice(double *M, double *N, double *res, int L_rows, int L_cols, int R_cols) // matrices stored column-major; threads indexed row-major
{
	//int num = blockDim.x*threadIdx.y+threadIdx.x;
	int num = (gridDim.x*blockIdx.y + blockIdx.x)*blockDim.x*blockDim.y + blockDim.x*threadIdx.y + threadIdx.x;

	if (num >= L_rows * R_cols)   // guard: the rounded-up grid may launch a few extra threads
		return;
	int col = num / L_rows;
	int row = num % L_rows;
	double sum = 0;
	for (int i = 0; i < L_cols; i++)
	{
		sum += M[i*L_rows + row] * N[col*L_cols + i];   // M(row,i) * N(i,col), both column-major
	}
	res[num] = sum;
}

extern "C"
int  CUDA_Matrix_Mul(const double* a,const double* b, double* c,const int* size)
{
	double *M, *N, *P;
	//int width = 30;
	//int NUM = 900;
	//dim3 dimBlock(30, 30);
	int NUM_a = size[0] * size[1];   // element count of a (size[0] x size[1])
	int NUM_b = size[2] * size[3];   // element count of b (size[2] x size[3])
	int NUM_c = size[0] * size[3];   // element count of the result (size[0] x size[3])
	cudaEvent_t start, stop;
	float elapsedTime;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaMalloc((void**)&M, size[0]*size[1] * sizeof(double));
	cudaMalloc((void**)&N, size[2]*size[3] * sizeof(double));
	cudaMalloc((void**)&P, size[0]*size[3] * sizeof(double));
	cudaMemcpy(M, a, NUM_a * sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(N, b, NUM_b * sizeof(double), cudaMemcpyHostToDevice);
	cudaEventRecord(start, 0);
	int threadsPerBlock = 256;
	int blocksPerGrid = (size[0] * size[3] + threadsPerBlock - 1) / threadsPerBlock;
	printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
	MatrixMuiOnDevice <<<blocksPerGrid, threadsPerBlock>>> (M, N, P, size[0], size[1], size[3]);
	cudaMemcpy(c, P, NUM_c * sizeof(double), cudaMemcpyDeviceToHost);
	cudaDeviceSynchronize();
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&elapsedTime, start, stop);
	printf("%f\n", elapsedTime);
	cudaFree(M);
	cudaFree(N);
	cudaFree(P);
	return 0;
}
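
For completeness, here is a minimal sketch of how the host side of the VS2015 project might call this wrapper; the extern "C" declaration mirrors the one in the .cu file above. The file name main.cpp, the matrix values, and the sizes are illustrative assumptions, and error handling is omitted:

// main.cpp (illustrative only) -- built by the host compiler and linked against the nvcc-compiled .cu object
#include <cstdio>

extern "C" int CUDA_Matrix_Mul(const double* a, const double* b, double* c, const int* size);

int main()
{
	// size = { rows of a, cols of a, rows of b, cols of b }; cols of a must equal rows of b
	int size[4] = { 2, 3, 3, 2 };
	// column-major storage, as everywhere else in this post
	double a[6] = { 1, 4, 2, 5, 3, 6 };   // [1 2 3; 4 5 6]
	double b[6] = { 1, 2, 3, 4, 5, 6 };   // [1 4; 2 5; 3 6]
	double c[4] = { 0 };

	CUDA_Matrix_Mul(a, b, c, size);

	// print the 2x2 result, again reading it column-major
	for (int row = 0; row < size[0]; row++)
	{
		for (int col = 0; col < size[3]; col++)
			printf("%8.2f ", c[col * size[0] + row]);
		printf("\n");
	}
	return 0;
}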


3. MATLAB calling CUDA to multiply two matrices

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include "mex.h"
#include "gpu/mxGPUArray.h"

__global__ void matrix_mul(double *M, double *N, double *res, int L_rows, int L_cols, int R_cols) // matrices stored column-major; threads indexed row-major
{
	//int num = blockDim.x*threadIdx.y+threadIdx.x;
	int num = (gridDim.x*blockIdx.y + blockIdx.x)*blockDim.x*blockDim.y + blockDim.x*threadIdx.y + threadIdx.x;

	if (num >= L_rows * R_cols)   // guard: the rounded-up grid may launch a few extra threads
		return;
	int col = num / L_rows;
	int row = num % L_rows;
	double sum = 0;
	for (int i = 0; i < L_cols; i++)
	{
		sum += M[i*L_rows + row] * N[col*L_cols + i];   // M(row,i) * N(i,col), both column-major
	}
	res[num] = sum;
}


void mexFunction(int nlhs, mxArray *plhs[],int nrhs, mxArray const *prhs[])
{
	double* M;
	double* L_mat;
	double* N;
	double* R_mat;
	double* res_d;

	int L_rows = mxGetM(prhs[0]);
	int L_cols = mxGetN(prhs[0]);
	L_mat = mxGetPr(prhs[0]);                           

	int R_rows = mxGetM(prhs[1]);
	int R_cols = mxGetN(prhs[1]);
	R_mat = mxGetPr(prhs[1]);                           

	plhs[0] = mxCreateDoubleMatrix(L_rows, R_cols, mxREAL);

	cudaMalloc((void**)&M,  L_rows*L_cols*sizeof(double));
	cudaMalloc((void**)&N, R_rows*R_cols * sizeof(double));
	cudaMalloc((void**)&res_d, L_rows*R_cols * sizeof(double));
	cudaMemcpy(M, L_mat, L_rows*L_cols * sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(N, R_mat, R_rows*R_cols * sizeof(double), cudaMemcpyHostToDevice);
	int threadsPerBlock = 256;
	int blocksPerGrid = (L_rows*R_cols + threadsPerBlock - 1) / threadsPerBlock;   // one thread per output element, rounded up
	matrix_mul <<<blocksPerGrid, threadsPerBlock>>> (M, N, res_d, L_rows, L_cols, R_cols);
	cudaDeviceSynchronize();

	cudaMemcpy(mxGetPr(plhs[0]), res_d, L_rows*R_cols * sizeof(double), cudaMemcpyDeviceToHost);
	cudaFree(M);
	cudaFree(N);
	cudaFree(res_d);

}
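
Neither CUDA version above checks whether the inputs are valid (for example, that mxGetN(prhs[0]) equals mxGetM(prhs[1])) or whether the kernel launch succeeded. As a suggestion rather than part of the original code, a small helper along these lines could be compiled into the MEX file and called right after the kernel launch; the file name and error identifier are made up for illustration:

// check_cuda_launch.h (hypothetical helper, illustrative only)
#include "mex.h"
#include <cuda_runtime.h>

// Report the most recent CUDA error through MATLAB's error mechanism.
static void checkCudaLaunch(const char* where)
{
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
		mexErrMsgIdAndTxt("matrix_mul:cuda", "%s failed: %s", where, cudaGetErrorString(err));
}

Calling checkCudaLaunch("matrix_mul") immediately after the <<< >>> launch turns silent failures, such as an invalid launch configuration, into a readable MATLAB error.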




