opencv gpu

opencv gpu

  • 普通的copy操作
  • opencv cuda
  • cuda 实现 resize crop
  • tip

普通的copy操作

// Load the image into host (CPU) memory.
cv::Mat src = imread(szfile);
cv::Mat dst;
// Start from an empty device matrix: constructing a GpuMat directly from a
// cv::Mat already performs an upload, which would make the explicit upload()
// below a redundant second host-to-device copy.
cv::gpu::GpuMat mat1;
mat1.upload(src);    // host -> device
mat1.download(dst);  // device -> host

CPU 和 GPU 之间的 copy 操作:upload() 用于将 host 数据传入 device,download() 用于将数据从 device 传回 host。

opencv cuda

#include "cudaCrop.h"
void percepUnit::applyAlpha() {
    cv::gpu::GpuMat tmpImage, tmpMask, tmpAlphaImage;
    std::vector<cv::gpu::GpuMat> channels;
    tmpImage.upload(this->image);
    tmpMask.upload(this->mask);
    cv::gpu::split(tmpImage,channels); // break image into channels
    channels.push_back(tmpMask); // append alpha channel
    cv::gpu::merge(channels,tmpAlphaImage); // combine channels
    tmpAlphaImage.download(this->alphaImage);
    tmpAlphaImage.release();
    tmpImage.release();
    tmpMask.release();
    channels[0].release();
    channels[1].release();
    channels[2].release();
}

cuda 实现 resize crop

#include "cudaResize.h"
// Resize kernel: one thread per output pixel. Each thread scales its output
// coordinate by `scale`, truncates the result to an integer source coordinate,
// and copies that single source element (no interpolation).
//   scale   - per-axis factor applied to the output coordinate
//   input   - source buffer, indexed as row * iWidth + column
//   output  - destination buffer, indexed as row * oWidth + column
// Expects a 2D launch whose x/y extents cover oWidth x oHeight.
template <typename T>
__global__ void gpuResize( float2 scale, T* input, int iWidth, T* output, int oWidth, int oHeight )
{
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	// threads that fall outside the output image have nothing to do
	if( col < oWidth && row < oHeight )
	{
		// float -> int conversion truncates toward zero
		const int srcCol = ((float)col * scale.x);
		const int srcRow = ((float)row * scale.y);

		output[row * oWidth + col] = input[srcRow * iWidth + srcCol];
	}
}
// Resize kernel for interleaved 3-byte pixels: one thread per output pixel.
// The source pixel is chosen by scaling the output coordinate and truncating
// to an integer; its three consecutive bytes are copied unchanged.
//   input  - source bytes, pixel (r, c) starts at r * iWidth * 3 + c * 3
//   output - destination bytes, pixel (r, c) starts at r * oWidth * 3 + c * 3
// Expects a 2D launch whose x/y extents cover oWidth x oHeight.
__global__ void gpuRGBResize(float2 scale, uchar* input, int iWidth, uchar* output, int oWidth, int oHeight)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= oWidth || row >= oHeight)
        return;

    // float -> int conversion truncates toward zero
    const int srcCol = ((float)col * scale.x);
    const int srcRow = ((float)row * scale.y);

    // byte offsets of the first channel of the destination / source pixel
    const int dstBase = row * oWidth * 3 + col * 3;
    const int srcBase = srcRow * iWidth * 3 + srcCol * 3;

    for (int channel = 0; channel < 3; ++channel)
        output[dstBase + channel] = input[srcBase + channel];
}

// cudaResize
// Launches gpuResize<float> to rescale a single-channel float image.
// Returns cudaErrorInvalidDevicePointer when either buffer is null,
// cudaErrorInvalidValue when any dimension is zero, otherwise the result of
// cudaGetLastError() after the launch (passed through the CUDA() macro).
cudaError_t cudaResize( float* input, size_t inputWidth, size_t inputHeight,
				        float* output, size_t outputWidth, size_t outputHeight )
{
	// both buffers are mandatory
	if( !(input && output) )
		return cudaErrorInvalidDevicePointer;

	// every dimension must be non-zero
	if( !(inputWidth && outputWidth && inputHeight && outputHeight) )
		return cudaErrorInvalidValue;

	// input/output size ratio per axis, applied by the kernel to output coordinates
	float2 scale;
	scale.x = float(inputWidth) / float(outputWidth);
	scale.y = float(inputHeight) / float(outputHeight);

	// 8x8 threads per block, enough blocks to cover the whole output image
	const dim3 threads(8, 8);
	const dim3 blocks(iDivUp(outputWidth,threads.x), iDivUp(outputHeight,threads.y));

	gpuResize<float><<<blocks, threads>>>(scale, input, inputWidth, output, outputWidth, outputHeight);

	return CUDA(cudaGetLastError());
}


// cudaResizeRGBA
// Launches gpuResize<float4> to rescale an image of float4 pixels.
// Returns cudaErrorInvalidDevicePointer when either buffer is null,
// cudaErrorInvalidValue when any dimension is zero, otherwise the result of
// cudaGetLastError() after the launch (passed through the CUDA() macro).
cudaError_t cudaResizeRGBA( float4* input,  size_t inputWidth, size_t inputHeight,
				            float4* output, size_t outputWidth, size_t outputHeight )
{
	// both buffers are mandatory
	if( !(input && output) )
		return cudaErrorInvalidDevicePointer;

	// every dimension must be non-zero
	if( !(inputWidth && outputWidth && inputHeight && outputHeight) )
		return cudaErrorInvalidValue;

	// input/output size ratio per axis, applied by the kernel to output coordinates
	float2 scale;
	scale.x = float(inputWidth) / float(outputWidth);
	scale.y = float(inputHeight) / float(outputHeight);

	// 8x8 threads per block, enough blocks to cover the whole output image
	const dim3 threads(8, 8);
	const dim3 blocks(iDivUp(outputWidth,threads.x), iDivUp(outputHeight,threads.y));

	gpuResize<float4><<<blocks, threads>>>(scale, input, inputWidth, output, outputWidth, outputHeight);

	return CUDA(cudaGetLastError());
}

// cudaResizeRGB
// Launches gpuRGBResize to rescale an interleaved 3-byte-per-pixel image.
// Returns cudaErrorInvalidDevicePointer when either buffer is null,
// cudaErrorInvalidValue when any dimension is zero, otherwise the result of
// cudaGetLastError() after the launch (passed through the CUDA() macro).
cudaError_t cudaResizeRGB(uchar* input, size_t inputWidth, size_t inputHeight,
    uchar* output, size_t outputWidth, size_t outputHeight) {
    // both buffers are mandatory
    if (!(input && output))
        return cudaErrorInvalidDevicePointer;

    // every dimension must be non-zero
    if (!(inputWidth && outputWidth && inputHeight && outputHeight))
        return cudaErrorInvalidValue;

    // input/output size ratio per axis, applied by the kernel to output coordinates
    float2 scale;
    scale.x = float(inputWidth) / float(outputWidth);
    scale.y = float(inputHeight) / float(outputHeight);

    // 8x8 threads per block, enough blocks to cover the whole output image
    const dim3 threads(8, 8);
    const dim3 blocks(iDivUp(outputWidth, threads.x), iDivUp(outputHeight, threads.y));

    gpuRGBResize<<<blocks, threads>>>(scale, input, inputWidth, output, outputWidth, outputHeight);

    return CUDA(cudaGetLastError());
}
#ifndef __CUDA_CROP_H__
#define __CUDA_CROP_H__
#include "cudaUtility.h"
#include "imageFormat.h"

// Crop an image to a region of interest.
//   roi         - (x, y, z, w) = (left, top, right, bottom); the output is
//                 (z - x) pixels wide and (w - y) pixels high
//   inputWidth  - width of the input image, in pixels
//   inputHeight - height of the input image, in pixels
// One overload per supported pixel type.
cudaError_t cudaCrop( uint8_t* input, uint8_t* output, const int4& roi, size_t inputWidth, size_t inputHeight );
cudaError_t cudaCrop( float* input, float* output, const int4& roi, size_t inputWidth, size_t inputHeight );
cudaError_t cudaCrop( uchar3* input, uchar3* output, const int4& roi, size_t inputWidth, size_t inputHeight );
cudaError_t cudaCrop( uchar4* input, uchar4* output, const int4& roi, size_t inputWidth, size_t inputHeight );
cudaError_t cudaCrop( float3* input, float3* output, const int4& roi, size_t inputWidth, size_t inputHeight );
cudaError_t cudaCrop( float4* input, float4* output, const int4& roi, size_t inputWidth, size_t inputHeight );

// Untyped variant: selects the typed overload that matches `format`.
cudaError_t cudaCrop( void* input, void* output, const int4& roi, size_t inputWidth, size_t inputHeight, imageFormat format );
#endif


/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

//#include "cudaCrop.h"
//void percepUnit::applyAlpha() {
//
//    cv::gpu::GpuMat tmpImage, tmpMask, tmpAlphaImage;
//    std::vector channels;
//
//    tmpImage.upload(this->image);
//    tmpMask.upload(this->mask);
//
//    cv::gpu::split(tmpImage,channels); // break image into channels
//    channels.push_back(tmpMask); // append alpha channel
//    cv::gpu::merge(channels,tmpAlphaImage); // combine channels
//
//    tmpAlphaImage.download(this->alphaImage);
//
//    tmpAlphaImage.release();
//    tmpImage.release();
//    tmpMask.release();
//
//    channels[0].release();
//    channels[1].release();
//    channels[2].release();
//}


// gpuCrop
// Copy kernel: one thread per output pixel. Output pixel (col, row) is read
// from input pixel (col + offsetX, row + offsetY).
//   input  - source buffer, indexed as row * inWidth + column
//   output - destination buffer, indexed as row * outWidth + column
// Expects a 2D launch whose x/y extents cover outWidth x outHeight.
template<typename T>
__global__ void gpuCrop( T* input, T* output, int offsetX, int offsetY, 
					int inWidth, int outWidth, int outHeight )
{
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	// threads that fall outside the output image have nothing to do
	if( col < outWidth && row < outHeight )
	{
		const int srcIndex = (row + offsetY) * inWidth + (col + offsetX);
		const int dstIndex = row * outWidth + col;

		output[dstIndex] = input[srcIndex];
	}
}


// launchCrop
// Validates the ROI and launches gpuCrop<T> to copy it into `output`.
//   roi - (x, y, z, w) = (left, top, right, bottom). The output is
//         (z - x) by (w - y) pixels, so right/bottom are exclusive bounds.
// Returns cudaErrorInvalidDevicePointer for a null buffer,
// cudaErrorInvalidValue for a zero-sized input or an invalid ROI, otherwise
// the result of cudaGetLastError() after the launch (via the CUDA() macro).
template<typename T>
static cudaError_t launchCrop( T* input, T* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
	if( !input || !output )
		return cudaErrorInvalidDevicePointer;

	if( inputWidth == 0 || inputHeight == 0 )
		return cudaErrorInvalidValue;

	// reject negative coordinates first, so the comparisons against the
	// unsigned image dimensions below never see a negative value
	if( roi.x < 0 || roi.y < 0 || roi.z < 0 || roi.w < 0 )
		return cudaErrorInvalidValue;

	// get the ROI/output dimensions
	const int outputWidth = roi.z - roi.x;
	const int outputHeight = roi.w - roi.y;

	// the ROI must be non-empty
	if( outputWidth <= 0 || outputHeight <= 0 )
		return cudaErrorInvalidValue;

	// The last pixel read is (roi.z - 1, roi.w - 1), so a ROI whose right or
	// bottom edge equals the image size is still in bounds; only reject ROIs
	// that extend past it. Together with roi.x/roi.y >= 0 this also guarantees
	// the output is no larger than the input.
	if( (size_t)roi.z > inputWidth || (size_t)roi.w > inputHeight )
		return cudaErrorInvalidValue;

	// launch kernel: 8x8 threads per block, enough blocks to cover the output
	const dim3 blockDim(8, 8);
	const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y));

	gpuCrop<T><<<gridDim, blockDim>>>(input, output, roi.x, roi.y, inputWidth, outputWidth, outputHeight);

	return CUDA(cudaGetLastError());
}

// cudaCrop (uint8_t pixels)
// Forwards to launchCrop; the pixel type is deduced from the pointer arguments.
cudaError_t cudaCrop( uint8_t* input, uint8_t* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
	const cudaError_t result = launchCrop(input, output, roi, inputWidth, inputHeight);
	return result;
}

// cudaCrop (float pixels)
// Forwards to launchCrop; the pixel type is deduced from the pointer arguments.
cudaError_t cudaCrop( float* input, float* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
	const cudaError_t result = launchCrop(input, output, roi, inputWidth, inputHeight);
	return result;
}
// cudaCrop (uchar3 pixels)
// Forwards to launchCrop; the pixel type is deduced from the pointer arguments.
cudaError_t cudaCrop( uchar3* input, uchar3* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
	const cudaError_t result = launchCrop(input, output, roi, inputWidth, inputHeight);
	return result;
}

// cudaCrop (uchar4 pixels)
// Forwards to launchCrop; the pixel type is deduced from the pointer arguments.
cudaError_t cudaCrop( uchar4* input, uchar4* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
	const cudaError_t result = launchCrop(input, output, roi, inputWidth, inputHeight);
	return result;
}
// cudaCrop (float3 pixels)
// Forwards to launchCrop; the pixel type is deduced from the pointer arguments.
cudaError_t cudaCrop( float3* input, float3* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
	const cudaError_t result = launchCrop(input, output, roi, inputWidth, inputHeight);
	return result;
}
// cudaCrop (float4 pixels)
// Forwards to launchCrop; the pixel type is deduced from the pointer arguments.
cudaError_t cudaCrop( float4* input, float4* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
	const cudaError_t result = launchCrop(input, output, roi, inputWidth, inputHeight);
	return result;
}
// cudaCrop (untyped buffers)
// Casts the buffers to the pixel type that corresponds to `format` and calls
// the matching typed cudaCrop overload. For a format that is not handled
// below, logs the list of supported formats and returns cudaErrorInvalidValue.
cudaError_t cudaCrop( void* input, void* output, const int4& roi, size_t inputWidth, size_t inputHeight, imageFormat format )
{
	// single-channel formats
	if( format == IMAGE_GRAY8 )
		return cudaCrop((uint8_t*)input, (uint8_t*)output, roi, inputWidth, inputHeight);

	if( format == IMAGE_GRAY32F )
		return cudaCrop((float*)input, (float*)output, roi, inputWidth, inputHeight);

	// 8-bit colour formats
	if( format == IMAGE_RGB8 || format == IMAGE_BGR8 )
		return cudaCrop((uchar3*)input, (uchar3*)output, roi, inputWidth, inputHeight);

	if( format == IMAGE_RGBA8 || format == IMAGE_BGRA8 )
		return cudaCrop((uchar4*)input, (uchar4*)output, roi, inputWidth, inputHeight);

	// 32-bit float colour formats
	if( format == IMAGE_RGB32F || format == IMAGE_BGR32F )
		return cudaCrop((float3*)input, (float3*)output, roi, inputWidth, inputHeight);

	if( format == IMAGE_RGBA32F || format == IMAGE_BGRA32F )
		return cudaCrop((float4*)input, (float4*)output, roi, inputWidth, inputHeight);

	// no branch matched: report the offending format and what is accepted
	LogError(LOG_CUDA "cudaCrop() -- invalid image format '%s'\n", imageFormatToStr(format));
	LogError(LOG_CUDA "              supported formats are:\n");
	LogError(LOG_CUDA "                  * gray8\n");
	LogError(LOG_CUDA "                  * gray32f\n");
	LogError(LOG_CUDA "                  * rgb8, bgr8\n");
	LogError(LOG_CUDA "                  * rgba8, bgra8\n");
	LogError(LOG_CUDA "                  * rgb32f, bgr32f\n");
	LogError(LOG_CUDA "                  * rgba32f, bgra32f\n");

	return cudaErrorInvalidValue;
}

tip

CUDA 的向量数据类型(如 int2、int3、int4)通过成员 x、y、z、w 来获取其中的值。
对于 cudaCrop 的 roi 参数:roi.x -> left,roi.y -> top,roi.z -> right,roi.w -> bottom。
Xavier 和普通的台式机不同,它的 CPU 和 GPU 在同一块硬件设备上,共享同一块物理内存。
所以现成的 CPU 版 cv::resize() 函数效率太低,不适合使用,这里改用上面的 CUDA 实现。

// Total number of bytes to copy: step[0] (presumably the byte size of one
// image row -- confirm against the cv::Mat documentation) times the row count.
size_t isizeOfImage = img.step[0] * img.rows;

// Allocate the buffer pair only while the host pointer is still null;
// terminate the process if the allocation fails.
if (h_resize_input == nullptr &&
    !cudaAllocMapped((void**)&h_resize_input, (void**)&d_resize_input, isizeOfImage)) {
    exit(-1);
}

// Copy the image bytes into the host-side pointer.
memcpy(h_resize_input, img.data, isizeOfImage);

cudaAllocMapped() 分配一块 host 和 device 共享的映射(zero-copy)内存,并同时返回 CPU 指针和 GPU 指针;之后只需用简单的 memcpy() 把数据拷到 CPU 指针,GPU 端即可通过对应的 device 指针访问同一份数据。

你可能感兴趣的:(cuda,c++)