In an earlier post we already covered accelerating the blending step of image stitching with CUDA, but that approach had a drawback: it required setting up a C++ build environment, which is inconvenient in practice. This post instead combines Python, PyTorch, and CUDA: the images are pre-processed in Python, uploaded to the GPU as torch tensors, and then blended by a custom CUDA kernel. The workflow is simple and very fast; a short end-to-end example follows the two Python functions below.
import time

import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt

# CUDA extension built by the setup.py shown at the end of this post
# (assumed import path for the module named cpp_CUDA_code.imgblend_cuda).
from cpp_CUDA_code import imgblend_cuda as imgblend


def stitch(path1, path2):
    """
    Read the two images, match SIFT features and return the padded source
    image together with the perspective-warped second image.
    :param path1: path of the left (reference) image
    :param path2: path of the right image to be warped
    :return: (srcImg, warpImg), or (None, None) if too few matches are found
    """
    # Border padding: room at the top/bottom and on the right for the warped image
    top, bot, left, right = 100, 100, 0, 800
    img1 = cv2.imread(path1)
    img2 = cv2.imread(path2)
    img1_size = img1.shape[:2]
    img2 = cv2.resize(img2, (img1_size[1], img1_size[0]))
    srcImg = cv2.copyMakeBorder(img1, top, bot, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    testImg = cv2.copyMakeBorder(img2, top, bot, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    img1gray = cv2.cvtColor(srcImg, cv2.COLOR_BGR2GRAY)
    img2gray = cv2.cvtColor(testImg, cv2.COLOR_BGR2GRAY)
    sift = cv2.xfeatures2d.SIFT_create()
    # find the keypoints and descriptors with SIFT
    kp1, des1 = sift.detectAndCompute(img1gray, None)
    kp2, des2 = sift.detectAndCompute(img2gray, None)
    # FLANN parameters
    FLANN_INDEX_KDTREE = 1
    index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
    search_params = dict(checks=50)
    flann = cv2.FlannBasedMatcher(index_params, search_params)
    matches = flann.knnMatch(des1, des2, k=2)
    # Need to draw only good matches, so create a mask
    matchesMask = [[0, 0] for i in range(len(matches))]
    good = []
    pts1 = []
    pts2 = []
    # ratio test as per Lowe's paper
    for i, (m, n) in enumerate(matches):
        if m.distance < 0.7 * n.distance:
            good.append(m)
            pts2.append(kp2[m.trainIdx].pt)
            pts1.append(kp1[m.queryIdx].pt)
            matchesMask[i] = [1, 0]
    draw_params = dict(matchColor=(0, 255, 0),
                       singlePointColor=(255, 0, 0),
                       matchesMask=matchesMask,
                       flags=0)
    img3 = cv2.drawMatchesKnn(img1gray, kp1, img2gray, kp2, matches, None, **draw_params)
    plt.imshow(img3), plt.show()
    MIN_MATCH_COUNT = 10
    if len(good) > MIN_MATCH_COUNT:
        src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
        dst_pts = np.float32([kp2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
        M, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
        warpImg = cv2.warpPerspective(testImg, np.array(M), (testImg.shape[1], testImg.shape[0]),
                                      flags=cv2.WARP_INVERSE_MAP)
        return srcImg, warpImg
    else:
        print("Not enough matches are found - {}/{}".format(len(good), MIN_MATCH_COUNT))
        return None, None

def blend(srcImg, warpImg, savename=None):
    """
    Blend the padded source image and the warped image on the GPU.
    """
    rows, cols = srcImg.shape[:2]
    # Find the left and right column boundaries of the overlap region
    left, right = 0, cols - 1
    for col in range(0, cols):
        if srcImg[:, col].any() and warpImg[:, col].any():
            left = col
            break
    for col in range(cols - 1, 0, -1):
        if srcImg[:, col].any() and warpImg[:, col].any():
            right = col
            break
    print(left, right)
    height, width = srcImg.shape[:2]
    print(srcImg.shape)
    # cv2.imwrite('../data/src.jpg', srcImg)
    # cv2.imwrite('../data/warp.jpg', warpImg)
    leftimg = torch.from_numpy(srcImg).int().cuda()
    rightimg = torch.from_numpy(warpImg).int().cuda()
    res = torch.ones([height, width, 3]).int().cuda()
    t1 = time.time()
    # Call the CUDA kernel to blend the two images
    imgblend.imgblend_wrapper(height, width, left, right, leftimg, rightimg, res)
    print(time.time() - t1)
    img = res.cpu().numpy()
    img = np.array(img, dtype=np.uint8)
    return img
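
With stitch() and blend() defined, a typical end-to-end call looks like the short sketch below; the image paths here are placeholders, not files from this post:

if __name__ == '__main__':
    # Placeholder paths -- substitute your own left/right image pair.
    srcImg, warpImg = stitch('left.jpg', 'right.jpg')
    if srcImg is not None:
        result = blend(srcImg, warpImg)
        cv2.imwrite('result.jpg', result)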
The CUDA kernel lives in imgblend_gpu.cu. The grid is two-dimensional and each block is one-dimensional with 1024 (= 32 * 32) threads. So that the total number of threads matches the number of pixels in the blended image (in general it is slightly larger), the grid is sized from the image height and width as ((height + 32 - 1) / 32, (width + 32 - 1) / 32), and each thread processes the R, G and B values of a single pixel.
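To make the index arithmetic concrete, the following sketch (plain Python, helper names invented purely for illustration) mirrors the launch configuration and the way a thread index is mapped back to a pixel:

def launch_config(height, width, threads_per_block=1024):
    """Grid sizing used by imgblend_kernel_launcher_cuda."""
    grid_x = (height + 32 - 1) // 32          # blocks along blockIdx.x
    grid_y = (width + 32 - 1) // 32           # blocks along blockIdx.y
    total_threads = grid_x * grid_y * threads_per_block
    assert total_threads >= height * width    # at least one thread per pixel
    return grid_x, grid_y, total_threads


def pixel_of_thread(block_x, block_y, thread_x, grid_x, width):
    """Mapping used inside the kernel: flat pixel index, then (row, col)."""
    block_id = block_y * grid_x + block_x     # blockIdx.y * gridDim.x + blockIdx.x
    pt_idx = block_id * 1024 + thread_x       # blockId * blockDim.x + threadIdx.x
    row, col = divmod(pt_idx, width)          # pixel coordinates in the (H, W, 3) image
    return pt_idx, row, col


# Example for a 1080 x 1920 blended image:
print(launch_config(1080, 1920))              # (34, 60, 2088960) -- covers 2073600 pixels
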
// imgblend_api.cpp
#include <torch/serialize/tensor.h>
#include <torch/extension.h>

#include "imgblend_gpu.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("imgblend_wrapper", &imgblend_wrapper_cpp, "imgblend_wrapper_cpp");
}

// imgblend_gpu.h
#include <torch/serialize/tensor.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

int imgblend_wrapper_cpp(int height, int width, int left, int right,
                         at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);

void imgblend_kernel_launcher_cuda(int height, int width, int left, int right,
                                   const int *new_xyz, const int *xyz, int *idx);

// imgblend_gpu.cpp
#include <torch/serialize/tensor.h>
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

#include "imgblend_gpu.h"

extern THCState *state;

#define CHECK_CUDA(x) do { \
    if (!x.type().is_cuda()) { \
        fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
        exit(-1); \
    } \
} while (0)
#define CHECK_CONTIGUOUS(x) do { \
    if (!x.is_contiguous()) { \
        fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
        exit(-1); \
    } \
} while (0)
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)

int imgblend_wrapper_cpp(int height, int width, int left, int right,
                         at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
    // Check that the inputs are contiguous CUDA tensors
    CHECK_INPUT(new_xyz_tensor);
    CHECK_INPUT(xyz_tensor);
    CHECK_INPUT(idx_tensor);
    // Get raw int pointers to the tensor data
    const int *new_xyz = new_xyz_tensor.data<int>();
    const int *xyz = xyz_tensor.data<int>();
    int *idx = idx_tensor.data<int>();
    // Hand off to CUDA, where the actual blending is implemented
    imgblend_kernel_launcher_cuda(height, width, left, right, new_xyz, xyz, idx);
    return 1;
}

// imgblend_gpu.cu -- the actual CUDA blending kernel
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "imgblend_gpu.h"

// CUDA kernels are defined with __global__
__global__ void imgblend_kernel_cuda(int height, int width, int left, int right,
                                     const int *__restrict__ new_xyz, const int *__restrict__ xyz,
                                     int *__restrict__ idx) {
    int blockId = blockIdx.y * gridDim.x + blockIdx.x;
    int pt_idx = blockId * blockDim.x + threadIdx.x;
    if (pt_idx >= width * height) return;

    // Advance the pointers so that each one points at the RGB triple of this
    // thread's pixel, i.e. new_xyz[pt_idx * 3 + c] becomes new_xyz[c].
    int offset = pt_idx * 3;
    new_xyz += offset;
    xyz += offset;
    idx += offset;

    // Column of this pixel inside the image; the blend weight only depends on
    // the horizontal position within the overlap region [left, right].
    int col = pt_idx % width;

    int src1pix = new_xyz[0] + new_xyz[1] + new_xyz[2];
    int src2pix = xyz[0] + xyz[1] + xyz[2];
    if (src2pix == 0) {
        // Only the source image has content here
        idx[0] = new_xyz[0];
        idx[1] = new_xyz[1];
        idx[2] = new_xyz[2];
    } else if (src1pix == 0) {
        // Only the warped image has content here
        idx[0] = xyz[0];
        idx[1] = xyz[1];
        idx[2] = xyz[2];
    } else {
        // Overlap region: linear blend whose weight shifts from the source
        // image at the left boundary to the warped image at the right boundary
        float srclen = fabsf((float)(col - left));
        float warplen = fabsf((float)(col - right));
        float d = srclen / (srclen + warplen);
        idx[0] = int(new_xyz[0] * (1 - d) + xyz[0] * d);
        idx[1] = int(new_xyz[1] * (1 - d) + xyz[1] * d);
        idx[2] = int(new_xyz[2] * (1 - d) + xyz[2] * d);
    }
}

void imgblend_kernel_launcher_cuda(int height, int width, int left, int right,
                                   const int *new_xyz, const int *xyz, int *idx) {
    // cudaError_t records the CUDA error state; it is checked after the launch
    cudaError_t err;
    dim3 blocks((height + 32 - 1) / 32, (width + 32 - 1) / 32);
    dim3 threads(1024);
    // Kernels are launched with <<<blocks, threads>>>; the total thread count is
    // gridDim.x * gridDim.y * 1024, which is at least width * height
    imgblend_kernel_cuda<<<blocks, threads>>>(height, width, left, right, new_xyz, xyz, idx);
    // Print the error message if the kernel launch failed
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

# setup.py -- builds the extension library that Python can call
# Run `python setup.py develop` to build and install the importable package
# (a short import example follows the script).
from setuptools import find_packages, setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

if __name__ == '__main__':
    setup(
        name='example',
        version='0.0.0',
        description='Examples illustrating how to use c++ and CUDA in python.',
        install_requires=[
            'numpy',
            'torch>=1.1',
        ],
        author='Jeff Wang',
        license='Apache License 2.0',
        packages=find_packages(),
        cmdclass={
            'build_ext': BuildExtension,
        },
        ext_modules=[
            CUDAExtension(
                name="cpp_CUDA_code.imgblend_cuda",
                sources=[
                    "cpp_CUDA_code/imgblend_api.cpp",
                    "cpp_CUDA_code/imgblend_gpu.cpp",
                    "cpp_CUDA_code/imgblend_gpu.cu",
                ]
            ),
        ],
    )
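
After `python setup.py develop` finishes, the compiled module can be imported directly. A minimal sketch, assuming the package name cpp_CUDA_code.imgblend_cuda defined above and a CUDA-capable GPU:

import torch

# Import path follows the CUDAExtension name passed to setup() above (assumption).
from cpp_CUDA_code import imgblend_cuda as imgblend

height, width = 4, 8
left, right = 2, 5    # pretend overlap boundaries
src = torch.randint(0, 256, (height, width, 3), dtype=torch.int32).cuda()
warp = torch.randint(0, 256, (height, width, 3), dtype=torch.int32).cuda()
out = torch.zeros((height, width, 3), dtype=torch.int32).cuda()

imgblend.imgblend_wrapper(height, width, left, right, src, warp, out)
print(out.cpu().numpy().astype('uint8').shape)    # (4, 8, 3)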