尝试了一下对两幅图像进行叠加融合:思路是用opencv读取图像(Mat类型),拷贝到device中,进行叠加,再拷出来host中,显示结果。
下面就是完整的代码:
/*****************************************
Copyright (c) 2016 Jingshuang Hu
@filename:kernel.cu
@datetime:2016.11.10
@author:HJS
@e-mail:[email protected]
@blog:http://blog.csdn.net/hujingshuang
*****************************************/
#include
#include
#include
#include
#include
#include
using namespace std;
using namespace cv;
// gray (overload)
__global__ void imageAdd(uchar* img1, uchar* img2, uchar* imgres, int length) {
// 一维数据索引计算(万能计算方法)
int tid = blockIdx.z * (gridDim.x * gridDim.y) * (blockDim.x * blockDim.y * blockDim.z) \
+ blockIdx.y * gridDim.x * (blockDim.x * blockDim.y * blockDim.z) \
+ blockIdx.x * (blockDim.x * blockDim.y * blockDim.z) \
+ threadIdx.z * (blockDim.x * blockDim.y) \
+ threadIdx.y * blockDim.x \
+ threadIdx.x;
if (tid < length) {
imgres[tid] = (img1[tid] + img2[tid]) / 2;
}
}
// rgb (overload)
__global__ void imageAdd(uchar3* img1, uchar3* img2, uchar3* imgres, int length) {
int tid = blockIdx.z * (gridDim.x * gridDim.y) * (blockDim.x * blockDim.y * blockDim.z) \
+ blockIdx.y * gridDim.x * (blockDim.x * blockDim.y * blockDim.z) \
+ blockIdx.x * (blockDim.x * blockDim.y * blockDim.z) \
+ threadIdx.z * (blockDim.x * blockDim.y) \
+ threadIdx.y * blockDim.x \
+ threadIdx.x;
if (tid < length) {
imgres[tid].x = (img1[tid].x + img2[tid].x) / 2;
imgres[tid].y = (img1[tid].y + img2[tid].y) / 2;
imgres[tid].z = (img1[tid].z + img2[tid].z) / 2;
}
}
int main() {
// source images, gray
Mat img1_host = imread("img1.jpg", IMREAD_GRAYSCALE);
Mat img2_host = imread("img2.jpg", IMREAD_GRAYSCALE);
int row = img1_host.rows;
int col = img1_host.cols;
int length = row * col;
// memory size
int memSize = length * sizeof(uchar);
// device memory
uchar* img1_device;
uchar* img2_device;
uchar* imgres_device;
cudaMalloc((void**)&img1_device, memSize);
cudaMalloc((void**)&img2_device, memSize);
cudaMalloc((void**)&imgres_device, memSize);
// copy host to device
cudaMemcpy(img1_device, img1_host.data, memSize, cudaMemcpyHostToDevice);
cudaMemcpy(img2_device, img2_host.data, memSize, cudaMemcpyHostToDevice);
// setting parameters and run the kernel function
dim3 grid(1 + (length / (32 * 32 + 1)), 1, 1); // grid
dim3 block(32, 32, 1); // block
imageAdd<<>>(img1_device, img2_device, imgres_device, length);
// copy device to host
Mat imgres_host = Mat::zeros(row, col, CV_8UC1);
cudaMemcpy(imgres_host.data, imgres_device, memSize, cudaMemcpyDeviceToHost);
// show source and result images
imshow("img1", img1_host);
imshow("img2", img2_host);
imshow("imgres", imgres_host);
waitKey(0);
// rgb
Mat img1rgb_host = imread("img1.jpg");
Mat img2rgb_host = imread("img2.jpg");
int rowrgb = img1rgb_host.rows;
int colrgb = img1rgb_host.cols;
int lengthrgb = rowrgb * colrgb;
int memSizergb = length * sizeof(uchar3);
uchar3* img1rgb_device;
uchar3* img2rgb_device;
uchar3* imgresrgb_device;
cudaMalloc((void**)&img1rgb_device, memSizergb);
cudaMalloc((void**)&img2rgb_device, memSizergb);
cudaMalloc((void**)&imgresrgb_device, memSizergb);
cudaMemcpy(img1rgb_device, img1rgb_host.data, memSizergb, cudaMemcpyHostToDevice);
cudaMemcpy(img2rgb_device, img2rgb_host.data, memSizergb, cudaMemcpyHostToDevice);
dim3 gridrgb(1 + (length / (32 * 32 + 1)), 1, 1);
dim3 blockrgb(32, 32, 1);
imageAdd<<>>(img1rgb_device, img2rgb_device, imgresrgb_device, lengthrgb);
Mat imgresrgb_host = Mat::zeros(rowrgb, colrgb, CV_8UC3);
cudaMemcpy(imgresrgb_host.data, imgresrgb_device, memSizergb, cudaMemcpyDeviceToHost);
imshow("img1rgb", img1rgb_host);
imshow("img2rgb", img2rgb_host);
imshow("imgrgbres", imgresrgb_host);
waitKey();
// free
cudaFree(img1_device);
cudaFree(img2_device);
cudaFree(imgres_device);
cudaFree(img1rgb_device);
cudaFree(img2rgb_device);
cudaFree(imgresrgb_device);
system("pause");
return 0;
}
显示结果:
另外,还要请教大家一个问题,就是在使用GPU写代码的时候,chrome总是崩溃,网上收了一下结果好多搞CUDA的遇到了,办法都试过一遍勒还是不能解决问题。迫使我把搜狗浏览器这个小三儿硬生生的扶成了正室,只有在不用GPU时,才用的chrome,我感觉到了它心里的委屈,各位大神有什么办法啊~