cuda_opencv 矩阵相加

实现矩阵相加

 1 #include <stdlib.h>

 2 #include <stdio.h>

 3 #include <opencv/cv.h>

 4 #include <opencv/highgui.h>

 5 #include <opencv2/opencv.hpp>

 6 

 7 #include "cuda_runtime.h"

 8 #include "device_launch_parameters.h"

 9 using namespace std;

10 using namespace cv;

11 

// Element-wise addition of two row-major 2D arrays of int2: d_C = d_A + d_B.
//
// Launch layout: a 2D grid of 2D blocks; each thread handles the single
// element at column (x) / row (y) given by its global thread coordinates.
// Threads whose coordinates fall outside width x height do nothing.
//
// Parameters:
//   d_A, d_B  input arrays, indexed as [x + y*width] in units of int2
//   d_C       output array, indexed the same way
//   width     number of int2 elements per row
//   height    number of rows
//
// No shared memory is used.
__global__ void Add_kernel(const int2* d_A, const int2* d_B,int2*  d_C,int width, int height)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard: the grid may overhang the data, so surplus threads exit early.
    if (col >= width || row >= height)
    {
        return;
    }

    const int idx = row * width + col;

    // Load both operands as whole int2 values, then store the component-wise sum.
    const int2 lhs = d_A[idx];
    const int2 rhs = d_B[idx];
    d_C[idx] = make_int2(lhs.x + rhs.x, lhs.y + rhs.y);
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Demo: builds a 3x4 CV_32S matrix with img(i,j) = i + j, adds the matrix to
// itself on the GPU with Add_kernel, and prints the matrix before and after.
// The GPU result is copied back into img, overwriting the input.
int main()
{
    Mat img(3, 4, CV_32S, Scalar_<int>(0));

    cout<<img<<endl;
    cout<<endl;

    for(int i = 0 ; i < img.rows; i++)
    {
        for(int j = 0 ; j < img.cols; j++)
        {
            img.at<int>(i,j)=i+j;
        }
    }
    cout<<endl;

    cout<<img<<endl;

    // Add_kernel indexes its buffers in units of int2 (two ints per element),
    // so the int matrix is reinterpreted as rows of cols/2 int2 values. That is
    // only valid when rows are packed back to back and hold an even number of ints.
    if(!img.isContinuous() || img.cols % 2 != 0)
    {
        fprintf(stderr, "matrix must be continuous with an even number of columns\n");
        return EXIT_FAILURE;
    }
    const int width  = img.cols / 2;   // int2 elements per row
    const int height = img.rows;       // number of rows

    size_t memSize = img.step * img.rows;
    int2* d_A = NULL;
    int2* d_B = NULL;
    int2* d_C = NULL;
    checkCuda(cudaMalloc((void**)&d_A, memSize), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void**)&d_B, memSize), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void**)&d_C, memSize), "cudaMalloc d_C");

    // Both operands are the same host matrix, so the result is img + img.
    checkCuda(cudaMemcpy(d_A,img.data,memSize, cudaMemcpyHostToDevice), "copy to d_A");
    checkCuda(cudaMemcpy(d_B,img.data,memSize, cudaMemcpyHostToDevice), "copy to d_B");

    // Grid x covers columns (width) and grid y covers rows (height), matching
    // the kernel's x + y*width indexing; ceil-division handles partial blocks.
    dim3 threads(16, 16);
    dim3 grids((width + threads.x - 1)/threads.x,(height + threads.y - 1)/threads.y);
    Add_kernel<<<grids,threads>>>(d_A, d_B, d_C, width, height);
    // A kernel launch returns no status; launch-configuration errors must be
    // fetched explicitly.
    checkCuda(cudaGetLastError(), "Add_kernel launch");

    // The blocking copy also surfaces any error raised while the kernel ran.
    checkCuda(cudaMemcpy(img.data, d_C,memSize,cudaMemcpyDeviceToHost), "copy from d_C");
    cout<<"GPU"<<endl;
    cout<<img<<endl;
    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(d_C), "cudaFree d_C");

    // Keeps the console window open on Windows; the "pause" command does not
    // exist on other platforms.
    system("pause");
    return 0;
}

 

你可能感兴趣的:(opencv)