// Load the image from disk into host (CPU) memory.
cv::Mat src = imread(szfile);
cv::Mat dst;
// Start from an empty device matrix: constructing GpuMat directly from `src`
// would already perform an upload, making the explicit upload() below a
// redundant second host->device copy.
cv::gpu::GpuMat mat1;
mat1.upload(src);    // host -> device
mat1.download(dst);  // device -> host
CPU 和 GPU 之间的拷贝操作:upload() 用于将 host 端数据传入 device,download() 用于将数据从 device 传回 host。
#include "cudaCrop.h"
void percepUnit::applyAlpha() {
cv::gpu::GpuMat tmpImage, tmpMask, tmpAlphaImage;
std::vector<cv::gpu::GpuMat> channels;
tmpImage.upload(this->image);
tmpMask.upload(this->mask);
cv::gpu::split(tmpImage,channels); // break image into channels
channels.push_back(tmpMask); // append alpha channel
cv::gpu::merge(channels,tmpAlphaImage); // combine channels
tmpAlphaImage.download(this->alphaImage);
tmpAlphaImage.release();
tmpImage.release();
tmpMask.release();
channels[0].release();
channels[1].release();
channels[2].release();
}
#include "cudaResize.h"
// Resize kernel: each thread produces one output element by copying the input
// element found at the output coordinate multiplied by `scale` (truncated to int).
//   scale    - per-axis factor applied to the output coordinate
//   input    - source buffer, indexed as row * iWidth + column
//   output   - destination buffer, indexed as row * oWidth + column
//   oWidth / oHeight - output extent; threads outside it do nothing
// Expects a 2D launch (x -> column, y -> row).
template <typename T>
__global__ void gpuResize( float2 scale, T* input, int iWidth, T* output, int oWidth, int oHeight )
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // only threads that land on an output element do any work
    if( col < oWidth && row < oHeight )
    {
        // float -> int conversion truncates toward zero
        const int srcCol = ((float)col * scale.x);
        const int srcRow = ((float)row * scale.y);

        output[row * oWidth + col] = input[srcRow * iWidth + srcCol];
    }
}
// Resize kernel for interleaved 3-byte pixels: each thread copies the three
// consecutive bytes of one source pixel to one destination pixel. The source
// pixel is the output coordinate multiplied by `scale` (truncated to int).
// Widths are in pixels; both buffers are indexed as (row * width + column) * 3.
// Expects a 2D launch (x -> column, y -> row).
__global__ void gpuRGBResize(float2 scale, uchar* input, int iWidth, uchar* output, int oWidth, int oHeight)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // only threads that land on an output pixel do any work
    if (col < oWidth && row < oHeight)
    {
        // float -> int conversion truncates toward zero
        const int srcCol = ((float)col * scale.x);
        const int srcRow = ((float)row * scale.y);

        // byte offset of the first component of each pixel
        const int srcBase = srcRow * iWidth * 3 + srcCol * 3;
        const int dstBase = row * oWidth * 3 + col * 3;

        for (int c = 0; c < 3; ++c)
            output[dstBase + c] = input[srcBase + c];
    }
}
// cudaResize
// Launches gpuResize<float> to rescale a float image to the requested output size.
// Returns cudaErrorInvalidDevicePointer if either buffer is null,
// cudaErrorInvalidValue if any dimension is zero, otherwise the result of
// cudaGetLastError() after the launch (passed through the CUDA() macro).
cudaError_t cudaResize( float* input, size_t inputWidth, size_t inputHeight,
                        float* output, size_t outputWidth, size_t outputHeight )
{
    if( !(input && output) )
        return cudaErrorInvalidDevicePointer;

    const bool hasEmptyDim = (inputWidth == 0) || (outputWidth == 0) ||
                             (inputHeight == 0) || (outputHeight == 0);
    if( hasEmptyDim )
        return cudaErrorInvalidValue;

    // input/output ratio per axis: maps an output coordinate to an input coordinate
    const float scaleX = float(inputWidth) / float(outputWidth);
    const float scaleY = float(inputHeight) / float(outputHeight);
    const float2 scale = make_float2(scaleX, scaleY);

    // one thread per output pixel, 8x8 threads per block
    const dim3 threads(8, 8);
    const dim3 blocks(iDivUp(outputWidth, threads.x), iDivUp(outputHeight, threads.y));

    gpuResize<float><<<blocks, threads>>>(scale, input, inputWidth, output, outputWidth, outputHeight);

    return CUDA(cudaGetLastError());
}
// cudaResizeRGBA
// Launches gpuResize<float4> to rescale a float4 image to the requested output size.
// Returns cudaErrorInvalidDevicePointer if either buffer is null,
// cudaErrorInvalidValue if any dimension is zero, otherwise the result of
// cudaGetLastError() after the launch (passed through the CUDA() macro).
cudaError_t cudaResizeRGBA( float4* input, size_t inputWidth, size_t inputHeight,
                            float4* output, size_t outputWidth, size_t outputHeight )
{
    if( !(input && output) )
        return cudaErrorInvalidDevicePointer;

    const bool hasEmptyDim = (inputWidth == 0) || (outputWidth == 0) ||
                             (inputHeight == 0) || (outputHeight == 0);
    if( hasEmptyDim )
        return cudaErrorInvalidValue;

    // input/output ratio per axis: maps an output coordinate to an input coordinate
    const float scaleX = float(inputWidth) / float(outputWidth);
    const float scaleY = float(inputHeight) / float(outputHeight);
    const float2 scale = make_float2(scaleX, scaleY);

    // one thread per output pixel, 8x8 threads per block
    const dim3 threads(8, 8);
    const dim3 blocks(iDivUp(outputWidth, threads.x), iDivUp(outputHeight, threads.y));

    gpuResize<float4><<<blocks, threads>>>(scale, input, inputWidth, output, outputWidth, outputHeight);

    return CUDA(cudaGetLastError());
}
// cudaResizeRGB
// Launches gpuRGBResize to rescale an interleaved 3-byte-per-pixel image.
// Widths and heights are in pixels.
// Returns cudaErrorInvalidDevicePointer if either buffer is null,
// cudaErrorInvalidValue if any dimension is zero, otherwise the result of
// cudaGetLastError() after the launch (passed through the CUDA() macro).
cudaError_t cudaResizeRGB(uchar* input, size_t inputWidth, size_t inputHeight,
                          uchar* output, size_t outputWidth, size_t outputHeight) {
    if (!(input && output))
        return cudaErrorInvalidDevicePointer;

    const bool hasEmptyDim = (inputWidth == 0) || (outputWidth == 0) ||
                             (inputHeight == 0) || (outputHeight == 0);
    if (hasEmptyDim)
        return cudaErrorInvalidValue;

    // input/output ratio per axis: maps an output coordinate to an input coordinate
    const float scaleX = float(inputWidth) / float(outputWidth);
    const float scaleY = float(inputHeight) / float(outputHeight);
    const float2 scale = make_float2(scaleX, scaleY);

    // one thread per output pixel, 8x8 threads per block
    const dim3 threads(8, 8);
    const dim3 blocks(iDivUp(outputWidth, threads.x), iDivUp(outputHeight, threads.y));

    gpuRGBResize<<<blocks, threads>>>(scale, input, inputWidth, output, outputWidth, outputHeight);

    return CUDA(cudaGetLastError());
}
#ifndef __CUDA_CROP_H__
#define __CUDA_CROP_H__
#include "cudaUtility.h"
#include "imageFormat.h"

/*
 * cudaCrop -- copy a rectangular region of interest out of an image on the GPU.
 *
 *   input        source image buffer
 *   output       destination buffer; receives (roi.z - roi.x) x (roi.w - roi.y) pixels
 *   roi          region of interest: x = left, y = top, z = right, w = bottom
 *   inputWidth   width of the source image, in pixels
 *   inputHeight  height of the source image, in pixels
 *
 * Returns cudaErrorInvalidDevicePointer for null buffers and
 * cudaErrorInvalidValue for an empty input image or an ROI that is empty,
 * negative or outside the input image.
 *
 * One overload per supported pixel type.
 */
cudaError_t cudaCrop( uint8_t* input, uint8_t* output, const int4& roi, size_t inputWidth, size_t inputHeight );
cudaError_t cudaCrop( float* input, float* output, const int4& roi, size_t inputWidth, size_t inputHeight );
cudaError_t cudaCrop( uchar3* input, uchar3* output, const int4& roi, size_t inputWidth, size_t inputHeight );
cudaError_t cudaCrop( uchar4* input, uchar4* output, const int4& roi, size_t inputWidth, size_t inputHeight );
cudaError_t cudaCrop( float3* input, float3* output, const int4& roi, size_t inputWidth, size_t inputHeight );
cudaError_t cudaCrop( float4* input, float4* output, const int4& roi, size_t inputWidth, size_t inputHeight );

/*
 * Untyped variant: the pixel type is selected at run time from `format`.
 * Returns cudaErrorInvalidValue (after logging the supported formats) when
 * `format` is not one of the supported gray / rgb / bgr / rgba / bgra formats.
 */
cudaError_t cudaCrop( void* input, void* output, const int4& roi, size_t inputWidth, size_t inputHeight, imageFormat format );
#endif
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
//#include "cudaCrop.h"
//void percepUnit::applyAlpha() {
//
// cv::gpu::GpuMat tmpImage, tmpMask, tmpAlphaImage;
// std::vector channels;
//
// tmpImage.upload(this->image);
// tmpMask.upload(this->mask);
//
// cv::gpu::split(tmpImage,channels); // break image into channels
// channels.push_back(tmpMask); // append alpha channel
// cv::gpu::merge(channels,tmpAlphaImage); // combine channels
//
// tmpAlphaImage.download(this->alphaImage);
//
// tmpAlphaImage.release();
// tmpImage.release();
// tmpMask.release();
//
// channels[0].release();
// channels[1].release();
// channels[2].release();
//}
// gpuCrop
// Crop kernel: each thread copies one element from the input image, shifted by
// (offsetX, offsetY), into the output image.
//   input / inWidth   - source buffer, indexed as row * inWidth + column
//   output / outWidth - destination buffer, indexed as row * outWidth + column
//   outHeight         - number of output rows; threads outside the output do nothing
// Expects a 2D launch (x -> column, y -> row).
template<typename T>
__global__ void gpuCrop( T* input, T* output, int offsetX, int offsetY,
                         int inWidth, int outWidth, int outHeight )
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // only threads that land on an output element do any work
    if( col < outWidth && row < outHeight )
    {
        const int srcCol = col + offsetX;
        const int srcRow = row + offsetY;

        output[row * outWidth + col] = input[srcRow * inWidth + srcCol];
    }
}
// launchCrop
// Validates the arguments and launches gpuCrop<T> to copy the region of
// interest out of `input` into `output`.
//
//   roi.x / roi.y  - left / top of the region (inclusive)
//   roi.z / roi.w  - right / bottom of the region (exclusive, because the
//                    output size is roi.z - roi.x by roi.w - roi.y)
//
// Returns cudaErrorInvalidDevicePointer for null buffers,
// cudaErrorInvalidValue for an empty input image or an ROI that is empty,
// negative or extends past the input image, otherwise the result of
// cudaGetLastError() after the launch (passed through the CUDA() macro).
template<typename T>
static cudaError_t launchCrop( T* input, T* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
    if( !input || !output )
        return cudaErrorInvalidDevicePointer;

    if( inputWidth == 0 || inputHeight == 0 )
        return cudaErrorInvalidValue;

    // reject negative coordinates first, so the subtractions below cannot
    // overflow and the size_t casts further down are safe
    if( roi.x < 0 || roi.y < 0 || roi.z < 0 || roi.w < 0 )
        return cudaErrorInvalidValue;

    // get the ROI/output dimensions
    const int outputWidth = roi.z - roi.x;
    const int outputHeight = roi.w - roi.y;

    // validate the requested ROI
    if( outputWidth <= 0 || outputHeight <= 0 )
        return cudaErrorInvalidValue;

    // The right/bottom edges are exclusive: the last pixel read is at
    // (roi.z - 1, roi.w - 1), so an ROI that touches the image border
    // (roi.z == inputWidth, roi.w == inputHeight) is still in bounds.
    // With roi.x, roi.y >= 0 this also guarantees the output is no larger
    // than the input.
    if( size_t(roi.z) > inputWidth || size_t(roi.w) > inputHeight )
        return cudaErrorInvalidValue;

    // launch kernel: one thread per output pixel, 8x8 threads per block
    const dim3 blockDim(8, 8);
    const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y));

    gpuCrop<T><<<gridDim, blockDim>>>(input, output, roi.x, roi.y, inputWidth, outputWidth, outputHeight);

    return CUDA(cudaGetLastError());
}
// cudaCrop (uint8 grayscale)
// Forwards to launchCrop with uint8_t elements.
cudaError_t cudaCrop( uint8_t* input, uint8_t* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
    // T is deduced as uint8_t from the pointer arguments
    const cudaError_t status = launchCrop(input, output, roi, inputWidth, inputHeight);
    return status;
}
// cudaCrop (float)
// Forwards to launchCrop with float elements.
cudaError_t cudaCrop( float* input, float* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
    // T is deduced as float from the pointer arguments
    const cudaError_t status = launchCrop(input, output, roi, inputWidth, inputHeight);
    return status;
}
// cudaCrop (uchar3)
// Forwards to launchCrop with uchar3 elements.
cudaError_t cudaCrop( uchar3* input, uchar3* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
    // T is deduced as uchar3 from the pointer arguments
    const cudaError_t status = launchCrop(input, output, roi, inputWidth, inputHeight);
    return status;
}
// cudaCrop (uchar4)
// Forwards to launchCrop with uchar4 elements.
cudaError_t cudaCrop( uchar4* input, uchar4* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
    // T is deduced as uchar4 from the pointer arguments
    const cudaError_t status = launchCrop(input, output, roi, inputWidth, inputHeight);
    return status;
}
// cudaCrop (float3)
// Forwards to launchCrop with float3 elements.
cudaError_t cudaCrop( float3* input, float3* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
    // T is deduced as float3 from the pointer arguments
    const cudaError_t status = launchCrop(input, output, roi, inputWidth, inputHeight);
    return status;
}
// cudaCrop (float4)
// Forwards to launchCrop with float4 elements.
cudaError_t cudaCrop( float4* input, float4* output, const int4& roi, size_t inputWidth, size_t inputHeight )
{
    // T is deduced as float4 from the pointer arguments
    const cudaError_t status = launchCrop(input, output, roi, inputWidth, inputHeight);
    return status;
}
// cudaCrop (untyped)
// Selects the typed cudaCrop overload that matches `format`, reinterpreting the
// untyped buffers accordingly. For any other format it logs the list of
// supported formats and returns cudaErrorInvalidValue.
cudaError_t cudaCrop( void* input, void* output, const int4& roi, size_t inputWidth, size_t inputHeight, imageFormat format )
{
    // 3 x 8-bit
    if( format == IMAGE_RGB8 || format == IMAGE_BGR8 )
        return cudaCrop((uchar3*)input, (uchar3*)output, roi, inputWidth, inputHeight);

    // 4 x 8-bit
    if( format == IMAGE_RGBA8 || format == IMAGE_BGRA8 )
        return cudaCrop((uchar4*)input, (uchar4*)output, roi, inputWidth, inputHeight);

    // 3 x float
    if( format == IMAGE_RGB32F || format == IMAGE_BGR32F )
        return cudaCrop((float3*)input, (float3*)output, roi, inputWidth, inputHeight);

    // 4 x float
    if( format == IMAGE_RGBA32F || format == IMAGE_BGRA32F )
        return cudaCrop((float4*)input, (float4*)output, roi, inputWidth, inputHeight);

    // single 8-bit channel
    if( format == IMAGE_GRAY8 )
        return cudaCrop((uint8_t*)input, (uint8_t*)output, roi, inputWidth, inputHeight);

    // single float channel
    if( format == IMAGE_GRAY32F )
        return cudaCrop((float*)input, (float*)output, roi, inputWidth, inputHeight);

    // no overload matches this format: report what is supported and fail
    LogError(LOG_CUDA "cudaCrop() -- invalid image format '%s'\n", imageFormatToStr(format));
    LogError(LOG_CUDA " supported formats are:\n");
    LogError(LOG_CUDA " * gray8\n");
    LogError(LOG_CUDA " * gray32f\n");
    LogError(LOG_CUDA " * rgb8, bgr8\n");
    LogError(LOG_CUDA " * rgba8, bgra8\n");
    LogError(LOG_CUDA " * rgb32f, bgr32f\n");
    LogError(LOG_CUDA " * rgba32f, bgra32f\n");

    return cudaErrorInvalidValue;
}
CUDA 的向量数据类型(如 int2、int3、int4)通过成员 x、y、z、w 访问其中的各个分量。
在 cudaCrop 的 roi 参数中:roi.x -> left,roi.y -> top,roi.z -> right,roi.w -> bottom;输出宽度为 roi.z - roi.x,输出高度为 roi.w - roi.y。
Xavier 和普通的台式机不同,它的 CPU 和 GPU 在同一块硬件设备上。
所以现成的 cv::cpu::resize() 函数不适合使用,效率太低了。
// Byte size of the image buffer: first-dimension step times the number of rows.
// NOTE(review): presumably `img` is a cv::Mat, in which case step[0] is the
// row stride in bytes and this assumes the rows are stored contiguously -- confirm.
size_t isizeOfImage = img.step[0] * img.rows;

// Allocate the host/device buffer pair only if it has not been allocated yet;
// terminate the process if the allocation fails.
if (!h_resize_input &&
    !cudaAllocMapped((void**)&h_resize_input, (void**)&d_resize_input, isizeOfImage)) {
    exit(-1);
}

// Copy the image bytes into the host-side pointer of the pair.
memcpy(h_resize_input, img.data, isizeOfImage);
cudaAllocMapped() 分配一块 host 与 device 共享(映射)的内存,并同时返回 host 指针和 device 指针;由于两者位于同一块硬件上,只需用简单的 memcpy() 把数据拷入 host 指针,device 端即可直接访问。