CUDA 并行算法Scan、Reduce 图像直方图均衡

1、并行算法介绍

Reduce:

对于一般可以加法运算,如进行8个数相加:


串行计算

由于对于符合交换律,其运算顺序可以改变,可以使用并行计算,如下:


并行计算

对比串行运算,与并行运算的速度:

运算量 8 ... N
并行Step 7 ... N-1
串行Step 3 ... log2(N)

对于并行计算的实现,可参照udacity的课程GitHub:reduce.cu

Scan

Scan 可称为prefix sum,即求前N项(或N-1项)的和,如下:

prefix sum

串行计算流程如上求和无太大差异,而并行计算起来相当讲技巧,以N=8举例:
黑\红实线:加法 黑虚线:赋值

详细可阅读:Parallel prefix sum (scan) with CUDA

2、图像的直方图均衡

假设存在8个灰度级的图像如下图,那么它实际可见的范围为1~4

1 2 3 4
1 2 3 4
1 2 3 4
1 2 3 4

频数图如下:


image.png
histogram_equalize.png

即原来的灰度级为1映射后为0,2映射后为2,3映射后为3,4映射后为5
均衡后新灰度频数表如下:

灰度级 0 1 2 3 4 5 6 7
频数 4 0 4 4 0 4 0 0
image.png

映射结果:

0 2 3 5
0 2 3 5
0 2 3 5
0 2 3 5

总的来说没有整个过程没有改变像素点灰度值点之间大小、数量关系,或者说将可视区间拉伸,让色彩对比度更加明显。

3、并行图像直方图均衡化

1)统计每个灰度级像素的数目

__global__ void  histogram_init(char *src,unsigned int *out)
//针对512*512大小灰度图 开启512*512线程,计算256个灰度级相应频数

2)应用Parallel Scan(prefix sum) 计算出映射关系

__global__ void scan_downsweep(unsigned int *in,unsigned int *out,int n)
//输入256点频数表,输出均衡化后256对应得映射值

3)将原图按照映射关系映射到结果图

__global__ void equal_remap(char *img,char *out,unsigned int *idetity)
//根据映射表,重新映射图像

程序效果图如下:


histogram_result

源代码:

#include "cuda.h"
#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_functions.h"

#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/opencv.hpp"
#include "stdio.h"
#include 
using namespace std;
using namespace cv;
#define IMAGE_DIR "/home/dzqiu/Documents/image/chaozhou.JPG"

__global__ void  histogram_init(char *src,unsigned int *out)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockDim.y;
    int offset = x + y * gridDim.x * blockDim.x;

    unsigned char value = src[offset];
    //原子操作,否则会出错。
    atomicAdd(&out[value],1);
}

//refer: Parallel Prefix Sum (Scan) with CUDA, Mark Harris, April 2007
//link : http: //developer.download.nvidia.com/compute/cuda/2_2/sdk/website/projects/scan/doc/scan.pdf
__global__ void scan_downsweep(unsigned int *in,unsigned int *out,int n)
{
    int Idx = threadIdx.x ;
     extern __shared__ float sdata[];
    sdata[Idx] = in[Idx];
    __syncthreads();

    unsigned int offset=1;

    for(unsigned int i=n>>1;i>0;i>>=1)  //build sum iin place up the tree
    {
        __syncthreads();
        if(Idx>= 1;
        __syncthreads();
        if(Idx>>(dev_gray,dev_histogram);
    int host_histogram[GRAY_IDENTITY]={0};
    cudaMemcpy(host_histogram,dev_histogram,GRAY_IDENTITY*sizeof(int),cudaMemcpyDeviceToHost);

    /*equalize the histogram*/
    unsigned int *dev_histogram_equal;
    cudaMalloc((void**)&dev_histogram_equal,GRAY_IDENTITY*sizeof(int));
    blocks=dim3(1,1);threads=dim3(GRAY_IDENTITY,1);
    scan_downsweep<<>>(dev_histogram,dev_histogram_equal,GRAY_IDENTITY);
    int host_histogram_equal[GRAY_IDENTITY]={0};
    cudaMemcpy(host_histogram_equal,dev_histogram_equal,GRAY_IDENTITY*sizeof(int),cudaMemcpyDeviceToHost);

    for(int i=0;i %6d| bin[%3d]:%6d -> %6d| bin[%3d]:%6d -> %6d| bin[%3d]:%6d -> %6d\n",
                4*i,host_histogram[4*i],host_histogram_equal[4*i],
                4*i+1,host_histogram[4*i+1],host_histogram_equal[4*i+1],
                4*i+2,host_histogram[4*i+2],host_histogram_equal[4*i+2],
                4*i+3,host_histogram[4*i+3],host_histogram_equal[4*i+3]);
    }

    /*remap the image use the equalized histogram*/
    char *dev_equalImg;
    cudaMalloc((void**)&dev_equalImg,DIM*DIM);
    blocks=dim3(DIM/16,DIM/16);threads=dim3(16,16);
    equal_remap<<>>(dev_gray,dev_equalImg,dev_histogram_equal);
    Mat equalImg(DIM,DIM,CV_8UC1,Scalar(255));
    cudaMemcpy(equalImg.data,dev_equalImg,DIM*DIM,cudaMemcpyDeviceToHost);




    cudaFree(dev_gray);
    cudaFree(dev_histogram);
    cudaFree(dev_histogram_equal);
    imshow("gray_img",img_gray);
    imshow("histogram equalization use GPU",equalImg);

    waitKey(0);
    return 0;
}

源码下载:GitHub
参考:Parallel Prefix Sum (Scan) with CUDA, Mark Harris, April 2007

你可能感兴趣的:(CUDA 并行算法Scan、Reduce 图像直方图均衡)