CUDA加速图像融合

在这篇文章之前,我们已经介绍过一篇在图像拼接过程中使用CUDA加速融合的文章,但那种做法还存在一些不足:需要配置C++的编译环境,比较麻烦,实际应用起来也不方便。本文借助python、pytorch、cuda来完成图像拼接的这一过程:使用python进行图片的预处理,然后通过torch上传到cuda,调用cuda进行融合。这个过程较为简单,而且效率极高。

1. 首先计算两张图片的单应性矩阵,并将一张图片向另一张图像上做投影变换,这个没什么需要解释的,直接上代码。

def stitch(path1, path2):
    """
    Align two images and return them on a common padded canvas.

    Both images are read from disk, the second one is resized to the size of
    the first, and both are padded with a black border. SIFT keypoints are
    matched with FLANN, filtered with Lowe's ratio test, and the surviving
    matches are used to estimate a RANSAC homography with which the padded
    second image is warped.

    The retained matches are displayed through matplotlib before the
    homography is estimated.

    :param path1: path of the reference image
    :param path2: path of the image that is warped onto the reference image
    :return: ``(srcImg, warpImg)`` -- the padded reference image and the
        warped, padded second image -- or ``None`` when no homography could
        be estimated
    :raises FileNotFoundError: if either image cannot be read
    """
    # Border padding in pixels (top, bottom, left, right).
    top, bot, left, right = 100, 100, 0, 800
    img1 = cv2.imread(path1)
    img2 = cv2.imread(path2)
    # cv2.imread reports an unreadable file by returning None, not by raising.
    for path, img in ((path1, img1), (path2, img2)):
        if img is None:
            raise FileNotFoundError("Cannot read image: {}".format(path))

    img1_size = img1.shape[:2]
    img2 = cv2.resize(img2, (img1_size[1], img1_size[0]))

    srcImg = cv2.copyMakeBorder(img1, top, bot, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    testImg = cv2.copyMakeBorder(img2, top, bot, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    img1gray = cv2.cvtColor(srcImg, cv2.COLOR_BGR2GRAY)
    img2gray = cv2.cvtColor(testImg, cv2.COLOR_BGR2GRAY)

    # Newer OpenCV builds expose SIFT in the main module; otherwise fall back
    # to the contrib constructor this function has always used.
    if hasattr(cv2, "SIFT_create"):
        sift = cv2.SIFT_create()
    else:
        sift = cv2.xfeatures2d_SIFT().create()
    # find the keypoints and descriptors with SIFT
    kp1, des1 = sift.detectAndCompute(img1gray, None)
    kp2, des2 = sift.detectAndCompute(img2gray, None)

    MIN_MATCH_COUNT = 10
    # Without descriptors (or with fewer than k=2 candidates to match against)
    # knnMatch cannot produce usable pairs.
    if des1 is None or des2 is None or len(des2) < 2:
        print("Not enough matches are found - {}/{}".format(0, MIN_MATCH_COUNT))
        return None

    # FLANN parameters
    FLANN_INDEX_KDTREE = 1
    index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
    search_params = dict(checks=50)
    flann = cv2.FlannBasedMatcher(index_params, search_params)
    matches = flann.knnMatch(des1, des2, k=2)

    # Need to draw only good matches, so create a mask
    matchesMask = [[0, 0] for _ in matches]
    good = []
    # ratio test as per Lowe's paper
    for i, pair in enumerate(matches):
        # A query may come back with fewer than two neighbours; skip it.
        if len(pair) < 2:
            continue
        m, n = pair
        if m.distance < 0.7 * n.distance:
            good.append(m)
            matchesMask[i] = [1, 0]

    draw_params = dict(matchColor=(0, 255, 0),
                       singlePointColor=(255, 0, 0),
                       matchesMask=matchesMask,
                       flags=0)
    img3 = cv2.drawMatchesKnn(img1gray, kp1, img2gray, kp2, matches, None, **draw_params)
    plt.imshow(img3)
    plt.show()

    if len(good) <= MIN_MATCH_COUNT:
        print("Not enough matches are found - {}/{}".format(len(good), MIN_MATCH_COUNT))
        return None

    src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
    dst_pts = np.float32([kp2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)

    M, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
    # findHomography yields no matrix when the estimation fails.
    if M is None:
        print("Homography estimation failed")
        return None

    # M maps srcImg coordinates to testImg coordinates, so it is applied as an
    # inverse map to bring testImg into the srcImg frame.
    warpImg = cv2.warpPerspective(testImg, M, (testImg.shape[1], testImg.shape[0]),
                                  flags=cv2.WARP_INVERSE_MAP)
    return srcImg, warpImg

2. 对得到的两张变换后的图像进行融合

def blend(srcImg, warpImg, savename=None):
    """
    Blend two aligned images on the GPU and return the result.

    The leftmost and rightmost columns in which both images contain a
    non-zero value delimit the overlap region. Both images are uploaded as
    int32 CUDA tensors and handed, together with those two column indices, to
    ``imgblend.imgblend_wrapper``, which fills the output tensor.

    The overlap bounds, the input shape and the elapsed kernel time (in
    seconds) are printed.

    :param srcImg: reference image as a numpy array indexed [row, column, ...]
    :param warpImg: warped image, a numpy array with the same shape as srcImg
    :param savename: accepted but not used by this function
    :return: blended image as a uint8 numpy array of shape (height, width, 3)
    :raises ValueError: if the shapes differ or the images do not overlap
    """
    # The overlap bounds are still published as module globals, as before.
    global left, right

    if srcImg.shape != warpImg.shape:
        raise ValueError(
            "srcImg and warpImg must have the same shape, got {} and {}".format(
                srcImg.shape, warpImg.shape))

    height, width = srcImg.shape[:2]

    # Per-column flags: True where the image has any non-zero value.
    src_cols = srcImg.reshape(height, width, -1).any(axis=(0, 2))
    warp_cols = warpImg.reshape(height, width, -1).any(axis=(0, 2))
    overlap = np.flatnonzero(src_cols & warp_cols)
    if overlap.size == 0:
        raise ValueError("srcImg and warpImg do not overlap in any column")

    # Plain Python ints, so the extension receives ordinary integers.
    left, right = int(overlap[0]), int(overlap[-1])
    print(left, right)
    print(srcImg.shape)

    # The extension reads raw int pointers and requires contiguous CUDA tensors.
    leftimg = torch.from_numpy(np.ascontiguousarray(srcImg)).int().cuda()
    rightimg = torch.from_numpy(np.ascontiguousarray(warpImg)).int().cuda()
    res = torch.zeros((height, width, 3), dtype=torch.int32, device="cuda")

    # CUDA work is queued asynchronously; synchronize on both sides so the
    # printed time covers the kernel itself and not only its launch.
    torch.cuda.synchronize()
    t1 = time.time()
    # Run the CUDA blend.
    imgblend.imgblend_wrapper(height, width, left, right, leftimg, rightimg, res)
    torch.cuda.synchronize()
    print(time.time() - t1)

    return res.cpu().numpy().astype(np.uint8)

3. 重点讲如何使用CUDA加速

cuda核函数在imgblend_gpu.cu中:grid划分成2维,block划分为1维。每个block中的线程数为1024=32*32;在grid中创建二维的block,为了保证总线程数和待融合图片的像素点个数相对应(一般会多一点),根据图片的宽高创建的grid大小为((height+32-1)/32, (width+32-1)/32),这样一个线程处理一个像素点的rgb三个值。

// imgblend_api.cpp
#include 
#include 
#include "imgblend_gpu.h"
// Defines the Python extension module, whose name is supplied by the
// TORCH_EXTENSION_NAME macro, and exposes imgblend_wrapper_cpp to Python
// under the name "imgblend_wrapper".
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    // Arguments: Python-visible name, function pointer, docstring.
    m.def("imgblend_wrapper", &imgblend_wrapper_cpp, "imgblend_wrapper_cpp");
}

// imgblend_gpu.h
#include 
#include 
#include 

// Host-side entry point bound to Python: takes the image height and width,
// the two overlap bounds `left` and `right`, two input tensors and one output
// tensor (`idx_tensor`). The definition forwards the tensors' int data to
// imgblend_kernel_launcher_cuda and returns 1.
int imgblend_wrapper_cpp(int height, int width, int left, int right,
	at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);

// Launches the CUDA blend kernel. `new_xyz` and `xyz` are the two input
// images and `idx` is the output buffer. The parameter names are listed in
// the same order as in the definition and at the call site, so the
// declaration no longer suggests the two inputs are swapped.
void imgblend_kernel_launcher_cuda(int height, int width, int left, int right,
	const int *new_xyz, const int *xyz, int *idx);
// imgblend.cpp (listed as cpp_CUDA_code/imgblend.cpp in setup.py)
#include 
#include 
#include 
#include 
#include "imgblend_gpu.h"

// Declares a THCState pointer named `state` that is defined in another
// translation unit.
// NOTE(review): nothing in the visible part of this file references `state`;
// confirm whether the declaration is still needed.
extern THCState *state;

// Input-validation macros. Each one prints a diagnostic naming the offending
// expression and its source location to stderr, then terminates the process
// with exit(-1). The argument is parenthesized so that any expression can be
// passed, and every macro expands to exactly one statement.

// Fails unless `x` is a CUDA tensor.
#define CHECK_CUDA(x) do { \
    if (!(x).is_cuda()) { \
        fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
        exit(-1); \
    } \
} while (0)

// Fails unless `x` is contiguous in memory.
#define CHECK_CONTIGUOUS(x) do { \
    if (!(x).is_contiguous()) { \
        fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
        exit(-1); \
    } \
} while (0)

// Runs both checks. Wrapped in do/while(0) so that it stays a single
// statement inside an unbraced if/else.
#define CHECK_INPUT(x) do { \
    CHECK_CUDA(x); \
    CHECK_CONTIGUOUS(x); \
} while (0)

// Host-side wrapper called from Python.
//
// Validates the three tensors, extracts their raw int pointers and forwards
// them to the CUDA launcher.
//
//   height, width   image size in pixels
//   left, right     overlap bounds forwarded unchanged to the kernel
//   new_xyz_tensor  first input image
//   xyz_tensor      second input image
//   idx_tensor      output image, written by the kernel
//
// Each tensor must be a contiguous CUDA tensor holding at least
// height * width * 3 int elements, because the kernel addresses three
// values per pixel. Returns 1; a failed check terminates the process.
int imgblend_wrapper_cpp(int height, int width, int left, int right,
    at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) {

    // Every tensor is dereferenced on the device, the output included.
    CHECK_INPUT(new_xyz_tensor);
    CHECK_INPUT(xyz_tensor);
    CHECK_INPUT(idx_tensor);

    // Reject undersized buffers instead of letting the kernel run past them.
    const long long required = 3LL * height * width;
    if (new_xyz_tensor.numel() < required || xyz_tensor.numel() < required ||
        idx_tensor.numel() < required) {
        fprintf(stderr, "imgblend tensors must hold at least %lld elements at %s:%d\n",
                required, __FILE__, __LINE__);
        exit(-1);
    }

    // Raw device pointers.
    const int *new_xyz = new_xyz_tensor.data<int>();
    const int *xyz = xyz_tensor.data<int>();
    int *idx = idx_tensor.data<int>();

    // Hand over to the CUDA implementation.
    imgblend_kernel_launcher_cuda(height, width, left, right, new_xyz, xyz, idx);
    return 1;
}
// imgblend_gpu.cu  这里主要实现cuda的融合 
#include 
#include 
#include 
#include "imgblend_gpu.h"


// CUDA使用__global__来定义kernel
// Blend kernel: one thread handles one pixel (three consecutive ints).
//
//   height, width  image size in pixels
//   left, right    first and last column of the overlap region
//   new_xyz        first input image
//   xyz            second input image
//   idx            output image
//
// A pixel whose three values sum to zero in one input is treated as empty and
// the other input is copied through. Where both inputs have content, the two
// are mixed with a weight that depends on the pixel's column position
// between `left` and `right`.
// NOTE(review): deriving the column as pt_idx % width assumes the buffers are
// laid out row by row as (height, width, 3); that is what blend() passes in.
__global__ void imgblend_kernel_cuda(int height, int width, int left, int right, const int *__restrict__ new_xyz, const int *__restrict__ xyz, int *__restrict__ idx) {

    // Flatten the 2-D grid of 1-D blocks into a linear pixel index.
    int blockId = blockIdx.y * gridDim.x + blockIdx.x;
    int pt_idx = blockId * blockDim.x + threadIdx.x;

    // The grid is rounded up, so surplus threads have nothing to do.
    if (pt_idx >= width * height) return;

    // Advance all three pointers to this pixel's first value.
    int offset = pt_idx * 3;
    new_xyz += offset;
    xyz += offset;
    idx += offset;

    int src1pix = new_xyz[0] + new_xyz[1] + new_xyz[2];
    int src2pix = xyz[0] + xyz[1] + xyz[2];

    if (src2pix == 0) {
        idx[0] = new_xyz[0];
        idx[1] = new_xyz[1];
        idx[2] = new_xyz[2];
    } else if (src1pix == 0) {
        idx[0] = xyz[0];
        idx[1] = xyz[1];
        idx[2] = xyz[2];
    } else {
        // The weight must come from the pixel's column, not from its linear
        // index: `left` and `right` are column positions.
        int col = pt_idx % width;
        float srclen = abs(col - left);
        float warplen = abs(col - right);
        float total = srclen + warplen;
        // left == right with col on that column would otherwise be 0/0.
        float d = total > 0.0f ? srclen / total : 0.5f;

        idx[0] = int(new_xyz[0] * (1 - d) + xyz[0] * d);
        idx[1] = int(new_xyz[1] * (1 - d) + xyz[1] * d);
        idx[2] = int(new_xyz[2] * (1 - d) + xyz[2] * d);
    }

}

// Launches imgblend_kernel_cuda over the whole image.
//
// The grid is 2-D with ceil(height / 32) x ceil(width / 32) blocks and each
// block has 1024 threads, so the total thread count is at least
// height * width; the kernel discards the surplus threads itself.
// An image with no pixels is a no-op. A launch error is printed to stderr and
// terminates the process with exit(-1).
void imgblend_kernel_launcher_cuda(int height, int width, int left, int right,
    const int *new_xyz, const int *xyz, int *idx) {

    // A zero-sized grid is an invalid launch configuration; with no pixels
    // there is nothing to blend anyway.
    if (height <= 0 || width <= 0) {
        return;
    }

    dim3 blocks((height + 32 - 1) / 32, (width + 32 - 1) / 32);
    dim3 threads(1024);

    // <<<blocks, threads>>> sets the number of blocks and threads per block.
    imgblend_kernel_cuda<<<blocks, threads>>>(height, width, left, right, new_xyz, xyz, idx);

    // Report a failed launch.
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

# setup.py 用于构建生成供python调用的库
# 使用python setup.py develop 才能生成python可以调用的包
from setuptools import find_packages, setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension


if __name__ == '__main__':

    # Native sources that are compiled into the extension module.
    extension_sources = [
        "cpp_CUDA_code/imgblend_api.cpp",
        "cpp_CUDA_code/imgblend.cpp",
        "cpp_CUDA_code/imgblend_gpu.cu",
    ]

    # The C++/CUDA extension, importable as cpp_CUDA_code.imgblend_cuda.
    imgblend_extension = CUDAExtension(
        name="cpp_CUDA_code.imgblend_cuda",
        sources=extension_sources,
    )

    # Package metadata and build configuration handed to setuptools.
    setup_options = {
        'name': 'example',
        'version': '0.0.0',
        'description': 'Examples illustrating how to use c++ and CUDA in python.',
        'install_requires': ['numpy', 'torch>=1.1'],
        'author': 'Jeff Wang',
        'license': 'Apache License 2.0',
        'packages': find_packages(),
        # BuildExtension is registered as the handler for the build_ext command.
        'cmdclass': {'build_ext': BuildExtension},
        'ext_modules': [imgblend_extension],
    }

    setup(**setup_options)

CUDA加速图像融合_第1张图片
最后,在920x1880的图像上,在gtx1660Ti上融合只用了1ms的时间

你可能感兴趣的:(OpenCV,python,图像融合,cuda加速)