1、BGR转为灰度图的CUDA demo
/**
 * Converts an interleaved 3-channel 8-bit image into a 1-channel grayscale image.
 *
 * Launch layout: 2D grid of 2D blocks. Thread (x, y) processes the pixel in
 * column x, row y. The grid may be larger than the image; threads that fall
 * outside [0, w) x [0, h) return without touching memory.
 *
 * Channel order is taken to be B, G, R (what cv::imread produces in the
 * visible caller), so the standard luma weights are applied as
 * 0.114*B + 0.587*G + 0.299*R.
 *
 * @param din   device buffer holding h*w*3 bytes, row-major, interleaved channels
 * @param dout  device buffer receiving h*w bytes, row-major
 * @param h     image height in pixels
 * @param w     image width in pixels
 */
__global__ void Image2Gray(uchar* din, uchar* dout, int h, int w) {
    const int x = threadIdx.x + blockDim.x * blockIdx.x;
    const int y = threadIdx.y + blockDim.y * blockIdx.y;

    // The grid is sized with a ceil-division, so edge blocks can overshoot.
    if (x >= w || y >= h) {
        return;
    }

    const int idx = y * w + x;
    const int idx3 = idx * 3;

    // Float literals keep the arithmetic in single precision on the device.
    const float gray = 0.114f * din[idx3 + 0]   // B
                     + 0.587f * din[idx3 + 1]   // G
                     + 0.299f * din[idx3 + 2];  // R

    // Round to nearest instead of truncating; the weights sum to 1, so the
    // value stays within the 8-bit range.
    dout[idx] = static_cast<uchar>(gray + 0.5f);
}
// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool Img2GrayCudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

/**
 * Demo driver: loads an image from disk, resizes it to 480x640, converts it to
 * grayscale on the GPU with Image2Gray, prints the elapsed time and the first
 * five gray values, and optionally displays the input and the result.
 *
 * The reported time covers the host-to-device copy, the kernel and the
 * device-to-host copy (the blocking cudaMemcpy waits for the kernel).
 *
 * @return 0 on success, -1 if the image cannot be read or a CUDA call fails.
 */
int Img2GrayTestGPU() {
    bool display = true;

    cv::Mat img = cv::imread("C:/Users/fengjiahui/Desktop/demo/res.jpg");
    if (img.empty()) {
        fprintf(stderr, "Img2GrayTestGPU: failed to read the input image\n");
        return -1;
    }
    cv::resize(img, img, cv::Size(480, 640));
    // The raw buffer is copied as one contiguous block, so make sure it is one.
    if (!img.isContinuous()) {
        img = img.clone();
    }

    const int w = img.cols;
    const int h = img.rows;
    const int wh = w * h;
    const int len_size = wh * 3;  // the kernel reads three interleaved channels per pixel
    cv::Mat img_gray(h, w, CV_8UC1);

    uchar* gdata = NULL;  // device: grayscale output, wh bytes
    uchar* dataa = NULL;  // device: interleaved input, len_size bytes
    bool ok = Img2GrayCudaOk(cudaMalloc((void**)&gdata, sizeof(uchar) * wh), "cudaMalloc(gray)")
           && Img2GrayCudaOk(cudaMalloc((void**)&dataa, sizeof(uchar) * len_size), "cudaMalloc(input)");

    if (ok) {
        const double gtct_time = (double)cv::getTickCount();

        ok = Img2GrayCudaOk(
            cudaMemcpy(dataa, img.data, sizeof(uchar) * len_size, cudaMemcpyHostToDevice),
            "cudaMemcpy(host to device)");

        if (ok) {
            // 32*32 = 1024 threads, the per-block maximum; the grid is a
            // ceil-division so every pixel is covered.
            dim3 threadPerBlock(32, 32);
            dim3 blockNumber((w + threadPerBlock.x - 1) / threadPerBlock.x,
                             (h + threadPerBlock.y - 1) / threadPerBlock.y);
            printf("Block(%u,%u),Grid(%u,%u).\n", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);

            Image2Gray<<<blockNumber, threadPerBlock>>>(dataa, gdata, h, w);
            // A launch does not return a status; bad configurations surface here.
            ok = Img2GrayCudaOk(cudaGetLastError(), "Image2Gray launch");
        }

        if (ok) {
            // Blocking copy: also reports faults raised while the kernel ran.
            ok = Img2GrayCudaOk(
                cudaMemcpy(img_gray.data, gdata, sizeof(uchar) * wh, cudaMemcpyDeviceToHost),
                "cudaMemcpy(device to host)");
        }

        if (ok) {
            printf("=>need time:%.2f ms\n", ((double)cv::getTickCount() - gtct_time) / ((double)cv::getTickFrequency()) * 1000);

            const uchar* g = img_gray.data;
            for (int i = 0; i < 5; ++i) {
                printf("%d\n", g[i]);
            }

            if (display) {
                cv::imshow("gpuori", img);
                cv::imshow("gpugray", img_gray);
                cv::waitKey(0);
            }
        }
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(gdata);
    cudaFree(dataa);
    return ok ? 0 : -1;
}
2、图像归一化
/**
 * Normalizes an interleaved 3-channel 8-bit image to floats in [0, 1] and
 * writes it out as three consecutive planes with the channel order reversed.
 *
 * For pixel p = y*w + x the kernel writes
 *   dout[p]         = din[3p + 2] / 255
 *   dout[p + h*w]   = din[3p + 1] / 255
 *   dout[p + 2*h*w] = din[3p + 0] / 255
 * i.e. B,G,R interleaved input becomes R,G,B planar output.
 *
 * Launch layout: 2D grid of 2D blocks. Thread (x, y) processes the pixel in
 * column x, row y. Threads outside the image return without touching memory.
 *
 * @param din   device buffer holding h*w*3 bytes, row-major, interleaved channels
 * @param dout  device buffer receiving 3*h*w floats (three planes of h*w)
 * @param h     image height in pixels
 * @param w     image width in pixels
 */
__global__ void ImagesProcess(uchar* din, float* dout, int h, int w) {
    const int x = threadIdx.x + blockDim.x * blockIdx.x;
    const int y = threadIdx.y + blockDim.y * blockIdx.y;

    // The grid is sized with a ceil-division, so edge blocks can overshoot.
    if (x >= w || y >= h) {
        return;
    }

    const int pixel = y * w + x;
    const int src = pixel * 3;
    const int plane = h * w;  // number of elements in one output plane
    const float scale = 1.0f / 255.0f;

    dout[pixel]             = din[src + 2] * scale;  // plane 0: R (input channel 2)
    dout[pixel + plane]     = din[src + 1] * scale;  // plane 1: G (input channel 1)
    dout[pixel + 2 * plane] = din[src + 0] * scale;  // plane 2: B (input channel 0)
}
// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool ImgTestCudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

/**
 * Demo driver: loads an image from disk, resizes it to 640x640, runs
 * ImagesProcess on the GPU to produce normalized planar floats, copies all
 * three planes back, and prints the elapsed time plus elements 10..14 of the
 * result.
 *
 * The reported time covers the host-to-device copy, the kernel and the
 * device-to-host copy (the blocking cudaMemcpy waits for the kernel).
 *
 * @return 0 on success, -1 if the image cannot be read, is not 3-channel, or a
 *         CUDA call fails.
 */
int ImgTestGPU() {
    cv::Mat img = cv::imread("C:/Users/fengjiahui/Desktop/demo/res.jpg");
    if (img.empty()) {
        fprintf(stderr, "ImgTestGPU: failed to read the input image\n");
        return -1;
    }
    cv::resize(img, img, cv::Size(640, 640));
    // ImagesProcess indexes three interleaved channels per pixel.
    if (img.channels() != 3) {
        fprintf(stderr, "ImgTestGPU: expected a 3-channel image\n");
        return -1;
    }
    // The raw buffer is copied as one contiguous block, so make sure it is one.
    if (!img.isContinuous()) {
        img = img.clone();
    }

    const int size_len = img.channels() * img.rows * img.cols;
    float* res_host = new float[size_len];

    uchar* dataa = NULL;       // device: interleaved 8-bit input
    float* res_device = NULL;  // device: planar float output, size_len elements
    bool ok = ImgTestCudaOk(cudaMalloc((void**)&dataa, sizeof(uchar) * size_len), "cudaMalloc(input)")
           && ImgTestCudaOk(cudaMalloc((void**)&res_device, sizeof(float) * size_len), "cudaMalloc(output)");

    if (ok) {
        const double gtct_time = (double)cv::getTickCount();

        ok = ImgTestCudaOk(
            cudaMemcpy(dataa, img.data, sizeof(uchar) * size_len, cudaMemcpyHostToDevice),
            "cudaMemcpy(host to device)");

        if (ok) {
            // The grid is a ceil-division so every pixel is covered.
            dim3 threadPerBlock(32, 32);
            dim3 blockNumber((img.cols + threadPerBlock.x - 1) / threadPerBlock.x,
                             (img.rows + threadPerBlock.y - 1) / threadPerBlock.y);
            printf("Block(%u,%u),Grid(%u,%u).\n", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);

            ImagesProcess<<<blockNumber, threadPerBlock>>>(dataa, res_device, img.rows, img.cols);
            // A launch does not return a status; bad configurations surface here.
            ok = ImgTestCudaOk(cudaGetLastError(), "ImagesProcess launch");
        }

        if (ok) {
            // Copy every plane back: the kernel writes size_len floats, and
            // res_host is allocated for exactly that many.
            ok = ImgTestCudaOk(
                cudaMemcpy(res_host, res_device, sizeof(float) * size_len, cudaMemcpyDeviceToHost),
                "cudaMemcpy(device to host)");
        }

        if (ok) {
            printf("=>need time:%.2f ms\n", ((double)cv::getTickCount() - gtct_time) / ((double)cv::getTickFrequency()) * 1000);
            for (int i = 10; i < 15; ++i) {
                printf("%f\n", res_host[i]);
            }
        }
    }

    delete[] res_host;
    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(res_device);
    cudaFree(dataa);
    return ok ? 0 : -1;
}