CUDA实现图像处理

1、BGR转为灰度图的CUDA demo

// Converts an interleaved 3-channel 8-bit image to a single-channel grayscale image.
//
// Launch layout: 2D grid of 2D blocks; the thread at global (x, y) handles the
// pixel at column x, row y. The grid may overshoot the image (ceil-div launch);
// threads that fall outside w x h return without touching memory.
//
//   din  : device buffer of h*w*3 bytes, tightly packed, channels interleaved in
//          B,G,R order (the layout cv::imread produces).
//   dout : device buffer of h*w bytes receiving one gray value per pixel.
//   h, w : image height and width in pixels.
__global__ void Image2Gray(uchar* din, uchar* dout, int h, int w) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;
    // The grid is rounded up to whole blocks, so edge threads must be masked off.
    if (col >= w || row >= h) {
        return;
    }
    const int idx = row * w + col;
    const int idx3 = idx * 3;
    // Luma weights: 0.299 R + 0.587 G + 0.114 B. Channel 0 is blue and channel 2
    // is red in BGR data. Float literals keep the math in single precision.
    const float gray = 0.114f * din[idx3 + 0]
                     + 0.587f * din[idx3 + 1]
                     + 0.299f * din[idx3 + 2];
    // Round to nearest so that float error cannot turn e.g. pure white into 254.
    dout[idx] = (uchar)(gray + 0.5f);
}
// Loads an image from disk, resizes it, converts it to grayscale on the GPU with
// Image2Gray, prints the elapsed time and the first five gray values, and
// optionally shows the input and the result in OpenCV windows.
//
// Returns 0 on success, -1 if the image cannot be loaded or a CUDA call fails.
int Img2GrayTestGPU() {
    bool display = true;
    cv::Mat img = cv::imread("C:/Users/fengjiahui/Desktop/demo/res.jpg");
    if (img.empty()) {
        fprintf(stderr, "Img2GrayTestGPU: failed to load input image\n");
        return -1;
    }
    cv::resize(img, img, cv::Size(480, 640));
    // img.data is uploaded below as one flat buffer, so rows must be packed.
    if (!img.isContinuous()) {
        img = img.clone();
    }
    int w = img.cols;
    int h = img.rows;
    int wh = w * h;
    int len_size = wh * 3;
    cv::Mat img_gray(h, w, CV_8UC1);

    uchar *gdata = 0;  // device: gray output, wh bytes
    uchar *dataa = 0;  // device: BGR input, len_size bytes

    cudaError_t err = cudaMalloc((void**)&gdata, sizeof(uchar) * wh);
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&dataa, sizeof(uchar) * len_size);
    }

    if (err == cudaSuccess) {
        // Timed region: upload + kernel + download (allocation is excluded).
        double gtct_time = (double)cv::getTickCount();

        err = cudaMemcpy(dataa, img.data, sizeof(uchar) * len_size, cudaMemcpyHostToDevice);

        if (err == cudaSuccess) {
            dim3 threadPerBlock(32, 32);//block:thread number,max number is 1024=32*32
            // Ceil-div so the grid covers the whole image; the kernel masks the overshoot.
            dim3 blockNumber(int((w + threadPerBlock.x - 1) / threadPerBlock.x), int((h + threadPerBlock.y - 1) / threadPerBlock.y));
            printf("Block(%d,%d),Grid(%d,%d).\n", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);

            Image2Gray<<<blockNumber, threadPerBlock>>>(dataa, gdata, h, w);
            // Launch-configuration errors are only reported through cudaGetLastError().
            err = cudaGetLastError();
        }
        if (err == cudaSuccess) {
            // Blocking copy: returns only after the kernel has finished.
            err = cudaMemcpy(img_gray.data, gdata, sizeof(uchar) * wh, cudaMemcpyDeviceToHost);
        }
        if (err == cudaSuccess) {
            printf("=>need time:%.2f ms\n", ((double)cv::getTickCount() - gtct_time) / ((double)cv::getTickFrequency()) * 1000);

            uchar* g = img_gray.data;
            for (int i = 0; i < 5; ++i) {
                printf("%d\n", g[i]);
            }
            if (display) {
                cv::imshow("gpuori", img);
                cv::imshow("gpugray", img_gray);
                cv::waitKey(0);
            }
        }
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "Img2GrayTestGPU: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree on a null pointer is a no-op, so this is safe on every path.
    cudaFree(gdata);
    cudaFree(dataa);
    return err == cudaSuccess ? 0 : -1;
}

2、图像归一化

// Normalizes an interleaved 8-bit BGR image to [0, 1] floats and rearranges it
// into planar layout with the planes in R, G, B order.
//
// Launch layout: 2D grid of 2D blocks; the thread at global (x, y) handles the
// pixel at column x, row y. Threads outside w x h return without touching memory.
//
//   din  : device buffer of h*w*3 bytes, tightly packed, channels interleaved B,G,R.
//   dout : device buffer of 3*h*w floats; plane 0 = R, plane 1 = G, plane 2 = B,
//          each plane h*w floats in row-major order.
//   h, w : image height and width in pixels.
__global__ void ImagesProcess(uchar* din, float* dout, int h, int w) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;
    // The grid is rounded up to whole blocks, so edge threads must be masked off.
    if (col >= w || row >= h) {
        return;
    }
    const int idx = row * w + col;
    const int idx3 = idx * 3;
    const int hw = h * w;
    const int hw2 = hw * 2;
    // Float literals keep the computation in single precision.
    const float scale = 1.0f / 255.0f;
    dout[idx + 0] = din[idx3 + 2] * scale;    // R (input channel 2)
    dout[idx + hw] = din[idx3 + 1] * scale;   // G (input channel 1)
    dout[idx + hw2] = din[idx3 + 0] * scale;  // B (input channel 0)
}
// Loads an image from disk, resizes it to 640x640, runs ImagesProcess on the GPU
// (BGR uchar interleaved -> normalized float planes), prints the elapsed time and
// five sample values from the first output plane.
//
// Returns 0 on success, -1 if the image cannot be loaded or a CUDA call fails.
int ImgTestGPU() {
    cv::Mat img = cv::imread("C:/Users/fengjiahui/Desktop/demo/res.jpg");
    if (img.empty()) {
        fprintf(stderr, "ImgTestGPU: failed to load input image\n");
        return -1;
    }
    cv::resize(img, img, cv::Size(640, 640));
    // img.data is uploaded below as one flat buffer, so rows must be packed.
    if (!img.isContinuous()) {
        img = img.clone();
    }
    int size_len = img.channels() * img.rows * img.cols;
    float *res_host = new float[size_len];

    uchar *dataa = 0;       // device: BGR input, size_len bytes
    float *res_device = 0;  // device: planar float output, size_len floats

    cudaError_t err = cudaMalloc((void**)&dataa, sizeof(uchar) * size_len);
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&res_device, sizeof(float) * size_len);
    }

    if (err == cudaSuccess) {
        // Timed region: upload + kernel + download (allocation is excluded).
        double gtct_time = (double)cv::getTickCount();

        err = cudaMemcpy(dataa, img.data, sizeof(uchar) * size_len, cudaMemcpyHostToDevice);

        if (err == cudaSuccess) {
            dim3 threadPerBlock(32, 32);//block:thread number
            // Ceil-div so the grid covers the whole image; the kernel masks the overshoot.
            dim3 blockNumber(int((img.cols + threadPerBlock.x - 1) / threadPerBlock.x), int((img.rows + threadPerBlock.y - 1) / threadPerBlock.y));
            printf("Block(%d,%d),Grid(%d,%d).\n", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);
            ImagesProcess<<<blockNumber, threadPerBlock>>>(dataa, res_device, img.rows, img.cols);
            // Launch-configuration errors are only reported through cudaGetLastError().
            err = cudaGetLastError();
        }
        if (err == cudaSuccess) {
            // Copy all three planes back; the kernel writes size_len floats and
            // res_host is sized for exactly that. The blocking copy also waits
            // for the kernel to finish.
            err = cudaMemcpy(res_host, res_device, sizeof(float) * size_len, cudaMemcpyDeviceToHost);
        }
        if (err == cudaSuccess) {
            printf("=>need time:%.2f ms\n", ((double)cv::getTickCount() - gtct_time) / ((double)cv::getTickFrequency()) * 1000);
            for (int i = 10; i < 15; ++i) {
                printf("%f\n", res_host[i]);
            }
        }
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "ImgTestGPU: CUDA error: %s\n", cudaGetErrorString(err));
    }

    delete[] res_host;
    // cudaFree on a null pointer is a no-op, so this is safe on every path.
    cudaFree(res_device);
    cudaFree(dataa);
    return err == cudaSuccess ? 0 : -1;
}

你可能感兴趣的:(daily,opencv)