Python OpenCV with CUDA: Accessing OpenCV CUDA Functions from Python (Without PyCUDA)

I am writing a Python application that uses OpenCV's Python bindings to do marker detection and other image processing. I would like to use OpenCV's CUDA modules to CUDA-accelerate certain parts of my application, and noticed in their .hpp files that they seem to be using the OpenCV export macros for Python and Java. However, I do not seem to be able to access those CUDA functions, even though I am building OpenCV with WITH_CUDA=ON.

Is it necessary to use a wrapper such as PyCUDA in order to access the GPU functions, such as threshold in cudaarithm? Or, are these CUDA-accelerated functions already being used if I call cv2.threshold() in my Python code (rather than the regular, CPU-based implementation)?

For reference, the declaration in `opencv2/cudaarithm.hpp` is:

```cpp
CV_EXPORTS double threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
```

The submodules I see for `cv2` are the following: `Error`, `aruco`, `detail`, `fisheye`, `flann`, `instr`, `ml`, `ocl`, `ogl`, and `videostab`. `cv2.cuda`, `cv2.gpu`, and `cv2.cudaarithm` all raise an `AttributeError`.
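For reference, this is roughly how I am checking (the listing snippet below is only illustrative, not my exact code):

```python
import inspect
import cv2

# List the submodules actually exposed on the cv2 namespace
print(sorted(name for name in dir(cv2) if inspect.ismodule(getattr(cv2, name))))

cv2.cudaarithm  # raises AttributeError, as do cv2.cuda and cv2.gpu
```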

The CMake instruction I am running to build OpenCV is as follows:

```bash
cmake -DOPENCV_EXTRA_MODULES_PATH=/usr/local/lib/opencv_contrib/modules/ \
    -D WITH_CUDA=ON -D CUDA_FAST_MATH=1 \
    -D ENABLE_PRECOMPILED_HEADERS=OFF \
    -D BUILD_TESTS=OFF -D BUILD_PERF_TESTS=OFF -D BUILD_EXAMPLES=OFF \
    -D BUILD_opencv_java=OFF \
    -DBUILD_opencv_bgsegm=OFF -DBUILD_opencv_bioinspired=OFF -DBUILD_opencv_ccalib=OFF \
    -DBUILD_opencv_cnn_3dobj=OFF -DBUILD_opencv_contrib_world=OFF -DBUILD_opencv_cvv=OFF \
    -DBUILD_opencv_datasets=OFF -DBUILD_opencv_dnn=OFF -DBUILD_opencv_dnns_easily_fooled=OFF \
    -DBUILD_opencv_dpm=OFF -DBUILD_opencv_face=OFF -DBUILD_opencv_fuzzy=OFF \
    -DBUILD_opencv_hdf=OFF -DBUILD_opencv_line_descriptor=OFF -DBUILD_opencv_matlab=OFF \
    -DBUILD_opencv_optflow=OFF -DBUILD_opencv_plot=OFF -DBUILD_opencv_README.md=OFF \
    -DBUILD_opencv_reg=OFF -DBUILD_opencv_rgbd=OFF -DBUILD_opencv_saliency=OFF \
    -DBUILD_opencv_sfm=OFF -DBUILD_opencv_stereo=OFF -DBUILD_opencv_structured_light=OFF \
    -DBUILD_opencv_surface_matching=OFF -DBUILD_opencv_text=OFF -DBUILD_opencv_tracking=OFF \
    -DBUILD_opencv_viz=OFF -DBUILD_opencv_xfeatures2d=OFF -DBUILD_opencv_ximgproc=OFF \
    -DBUILD_opencv_xobjdetect=OFF -DBUILD_opencv_xphoto=OFF ..
```

Solution

So as confirmed in the answer and comment thread with @NAmorim, there are no accessible Python bindings to OpenCV's various CUDA modules.
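As a quick runtime sanity check (a minimal sketch, not part of the original answer: `cv2.getBuildInformation()` reports the compile-time configuration, and the `hasattr` tests mirror the `AttributeError` seen above):

```python
import cv2

# Show the CUDA-related lines of the build configuration (WITH_CUDA was ON)
print([line for line in cv2.getBuildInformation().splitlines() if 'CUDA' in line])

# Even so, no CUDA submodule is exposed to Python in this build
print(hasattr(cv2, 'cuda'), hasattr(cv2, 'cudaarithm'))  # False False
```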

I was able to get around this restriction by using Cython to gain access to the CUDA functions I needed and implementing the necessary logic to convert my Python objects (mainly NumPy arrays) to OpenCV C/C++ objects and back.

Working Code

I first wrote a Cython definition file, GpuWrapper.pxd. The purpose of this file is to reference external C/C++ classes and methods, such as the CUDA methods I am interested in.

```cython
from libcpp cimport bool
from cpython.ref cimport PyObject

# References PyObject to OpenCV object conversion code borrowed from OpenCV's own conversion file, cv2.cpp
cdef extern from 'pyopencv_converter.cpp':
    cdef PyObject* pyopencv_from(const Mat& m)
    cdef bool pyopencv_to(PyObject* o, Mat& m)

cdef extern from 'opencv2/imgproc.hpp' namespace 'cv':
    cdef enum InterpolationFlags:
        INTER_NEAREST = 0
    cdef enum ColorConversionCodes:
        COLOR_BGR2GRAY

cdef extern from 'opencv2/core/core.hpp':
    cdef int CV_8UC1
    cdef int CV_32FC1

cdef extern from 'opencv2/core/core.hpp' namespace 'cv':
    cdef cppclass Size_[T]:
        Size_() except +
        Size_(T width, T height) except +
        T width
        T height
    ctypedef Size_[int] Size2i
    ctypedef Size2i Size
    cdef cppclass Scalar[T]:
        Scalar() except +
        Scalar(T v0) except +

cdef extern from 'opencv2/core/core.hpp' namespace 'cv':
    cdef cppclass Mat:
        Mat() except +
        void create(int, int, int) except +
        void* data
        int rows
        int cols

cdef extern from 'opencv2/core/cuda.hpp' namespace 'cv::cuda':
    cdef cppclass GpuMat:
        GpuMat() except +
        void upload(Mat arr) except +
        void download(Mat dst) const
    cdef cppclass Stream:
        Stream() except +

cdef extern from 'opencv2/cudawarping.hpp' namespace 'cv::cuda':
    cdef void warpPerspective(GpuMat src, GpuMat dst, Mat M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& stream)
    # Function using default values
    cdef void warpPerspective(GpuMat src, GpuMat dst, Mat M, Size dsize, int flags)
```

We also need the ability to convert Python objects to OpenCV objects. I copied the first couple hundred lines from OpenCV's modules/python/src2/cv2.cpp. You can find that code below in the appendix.

We can finally write our Cython wrapper methods to call OpenCV's CUDA functions! This is part of the Cython implementation file, GpuWrapper.pyx.

```cython
import numpy as np   # Import Python functions, attributes, submodules of numpy
cimport numpy as np  # Import numpy C/C++ API

def cudaWarpPerspectiveWrapper(np.ndarray[np.uint8_t, ndim=2] _src,
                               np.ndarray[np.float32_t, ndim=2] _M,
                               _size_tuple,
                               int _flags=INTER_NEAREST):
    # Create GPU/device InputArray for src
    cdef Mat src_mat
    cdef GpuMat src_gpu
    pyopencv_to(<PyObject*> _src, src_mat)
    src_gpu.upload(src_mat)

    # Create CPU/host InputArray for M
    cdef Mat M_mat = Mat()
    pyopencv_to(<PyObject*> _M, M_mat)

    # Create Size object from size tuple
    # Note that size/shape in Python is handled in row-major-order -- therefore, width is [1] and height is [0]
    cdef Size size = Size(<int> _size_tuple[1], <int> _size_tuple[0])

    # Create empty GPU/device OutputArray for dst
    cdef GpuMat dst_gpu = GpuMat()
    warpPerspective(src_gpu, dst_gpu, M_mat, size, INTER_NEAREST)

    # Get result of dst
    cdef Mat dst_host
    dst_gpu.download(dst_host)
    cdef np.ndarray out = <np.ndarray> pyopencv_from(dst_host)
    return out
```

After running a setup script to cythonize and compile this code (see appendix), we can import GpuWrapper as a Python module and run cudaWarpPerspectiveWrapper. This allowed me to run the code through CUDA with only a mismatch of 0.34722222222222854% -- quite exciting!
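For reference, calling the compiled module looks roughly like this (the input file, identity homography, and grayscale input are placeholders rather than values from my original test):

```python
import cv2
import numpy as np
import GpuWrapper  # the Cython extension built by setupGpuWrapper.py below

src = cv2.imread('input.png', cv2.IMREAD_GRAYSCALE)  # 2-D uint8 image
M = np.eye(3, dtype=np.float32)                      # 3x3 perspective matrix
dst = GpuWrapper.cudaWarpPerspectiveWrapper(src, M, src.shape[:2])
print(dst.shape, dst.dtype)
```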


Appendix

pyopencv_converter.cpp

```cpp
#include <Python.h>
#include "numpy/ndarrayobject.h"
#include "opencv2/core/core.hpp"

static PyObject* opencv_error = 0;

// === FAIL MESSAGE ====================================================================================================

static int failmsg(const char *fmt, ...)
{
    char str[1000];

    va_list ap;
    va_start(ap, fmt);
    vsnprintf(str, sizeof(str), fmt, ap);
    va_end(ap);

    PyErr_SetString(PyExc_TypeError, str);
    return 0;
}

struct ArgInfo
{
    const char * name;
    bool outputarg;
    // more fields may be added if necessary

    ArgInfo(const char * name_, bool outputarg_)
        : name(name_)
        , outputarg(outputarg_) {}

    // to match with older pyopencv_to function signature
    operator const char *() const { return name; }
};

// === THREADING =======================================================================================================

class PyAllowThreads
{
public:
    PyAllowThreads() : _state(PyEval_SaveThread()) {}
    ~PyAllowThreads()
    {
        PyEval_RestoreThread(_state);
    }
private:
    PyThreadState* _state;
};

class PyEnsureGIL
{
public:
    PyEnsureGIL() : _state(PyGILState_Ensure()) {}
    ~PyEnsureGIL()
    {
        PyGILState_Release(_state);
    }
private:
    PyGILState_STATE _state;
};

// === ERROR HANDLING ==================================================================================================

#define ERRWRAP2(expr) \
try \
{ \
    PyAllowThreads allowThreads; \
    expr; \
} \
catch (const cv::Exception &e) \
{ \
    PyErr_SetString(opencv_error, e.what()); \
    return 0; \
}

// === USING NAMESPACE CV ==============================================================================================

using namespace cv;

// === NUMPY ALLOCATOR =================================================================================================

class NumpyAllocator : public MatAllocator
{
public:
    NumpyAllocator() { stdAllocator = Mat::getStdAllocator(); }
    ~NumpyAllocator() {}

    UMatData* allocate(PyObject* o, int dims, const int* sizes, int type, size_t* step) const
    {
        UMatData* u = new UMatData(this);
        u->data = u->origdata = (uchar*)PyArray_DATA((PyArrayObject*) o);
        npy_intp* _strides = PyArray_STRIDES((PyArrayObject*) o);
        for( int i = 0; i < dims - 1; i++ )
            step[i] = (size_t)_strides[i];
        step[dims-1] = CV_ELEM_SIZE(type);
        u->size = sizes[0]*step[0];
        u->userdata = o;
        return u;
    }

    UMatData* allocate(int dims0, const int* sizes, int type, void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
    {
        if( data != 0 )
        {
            CV_Error(Error::StsAssert, "The data should normally be NULL!");
            // probably this is safe to do in such extreme case
            return stdAllocator->allocate(dims0, sizes, type, data, step, flags, usageFlags);
        }
        PyEnsureGIL gil;

        int depth = CV_MAT_DEPTH(type);
        int cn = CV_MAT_CN(type);
        const int f = (int)(sizeof(size_t)/8);
        int typenum = depth == CV_8U ? NPY_UBYTE : depth == CV_8S ? NPY_BYTE :
                      depth == CV_16U ? NPY_USHORT : depth == CV_16S ? NPY_SHORT :
                      depth == CV_32S ? NPY_INT : depth == CV_32F ? NPY_FLOAT :
                      depth == CV_64F ? NPY_DOUBLE : f*NPY_ULONGLONG + (f^1)*NPY_UINT;
        int i, dims = dims0;
        cv::AutoBuffer<npy_intp> _sizes(dims + 1);
        for( i = 0; i < dims; i++ )
            _sizes[i] = sizes[i];
        if( cn > 1 )
            _sizes[dims++] = cn;
        PyObject* o = PyArray_SimpleNew(dims, _sizes, typenum);
        if(!o)
            CV_Error_(Error::StsError, ("The numpy array of typenum=%d, ndims=%d can not be created", typenum, dims));
        return allocate(o, dims0, sizes, type, step);
    }

    bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
    {
        return stdAllocator->allocate(u, accessFlags, usageFlags);
    }

    void deallocate(UMatData* u) const
    {
        if(!u)
            return;
        PyEnsureGIL gil;
        CV_Assert(u->urefcount >= 0);
        CV_Assert(u->refcount >= 0);
        if(u->refcount == 0)
        {
            PyObject* o = (PyObject*)u->userdata;
            Py_XDECREF(o);
            delete u;
        }
    }

    const MatAllocator* stdAllocator;
};

// === ALLOCATOR INITIALIZATION ========================================================================================

NumpyAllocator g_numpyAllocator;

// === CONVERTOR FUNCTIONS =============================================================================================

template<typename T> static
bool pyopencv_to(PyObject* obj, T& p, const char* name = "<unknown>");

template<typename T> static
PyObject* pyopencv_from(const T& src);

enum { ARG_NONE = 0, ARG_MAT = 1, ARG_SCALAR = 2 };

// special case, when the convertor needs full ArgInfo structure
static bool pyopencv_to(PyObject* o, Mat& m, const ArgInfo info)
{
    bool allowND = true;
    if(!o || o == Py_None)
    {
        if( !m.data )
            m.allocator = &g_numpyAllocator;
        return true;
    }

    if( PyInt_Check(o) )
    {
        double v[] = {static_cast<double>(PyInt_AsLong((PyObject*)o)), 0., 0., 0.};
        m = Mat(4, 1, CV_64F, v).clone();
        return true;
    }
    if( PyFloat_Check(o) )
    {
        double v[] = {PyFloat_AsDouble((PyObject*)o), 0., 0., 0.};
        m = Mat(4, 1, CV_64F, v).clone();
        return true;
    }
    if( PyTuple_Check(o) )
    {
        int i, sz = (int)PyTuple_Size((PyObject*)o);
        m = Mat(sz, 1, CV_64F);
        for( i = 0; i < sz; i++ )
        {
            PyObject* oi = PyTuple_GET_ITEM(o, i);
            if( PyInt_Check(oi) )
                m.at<double>(i) = (double)PyInt_AsLong(oi);
            else if( PyFloat_Check(oi) )
                m.at<double>(i) = (double)PyFloat_AsDouble(oi);
            else
            {
                failmsg("%s is not a numerical tuple", info.name);
                m.release();
                return false;
            }
        }
        return true;
    }

    if( !PyArray_Check(o) )
    {
        failmsg("%s is not a numpy array, neither a scalar", info.name);
        return false;
    }

    PyArrayObject* oarr = (PyArrayObject*) o;

    bool needcopy = false, needcast = false;
    int typenum = PyArray_TYPE(oarr), new_typenum = typenum;
    int type = typenum == NPY_UBYTE ? CV_8U :
               typenum == NPY_BYTE ? CV_8S :
               typenum == NPY_USHORT ? CV_16U :
               typenum == NPY_SHORT ? CV_16S :
               typenum == NPY_INT ? CV_32S :
               typenum == NPY_INT32 ? CV_32S :
               typenum == NPY_FLOAT ? CV_32F :
               typenum == NPY_DOUBLE ? CV_64F : -1;

    if( type < 0 )
    {
        if( typenum == NPY_INT64 || typenum == NPY_UINT64 || typenum == NPY_LONG )
        {
            needcopy = needcast = true;
            new_typenum = NPY_INT;
            type = CV_32S;
        }
        else
        {
            failmsg("%s data type = %d is not supported", info.name, typenum);
            return false;
        }
    }

#ifndef CV_MAX_DIM
    const int CV_MAX_DIM = 32;
#endif

    int ndims = PyArray_NDIM(oarr);
    if(ndims >= CV_MAX_DIM)
    {
        failmsg("%s dimensionality (=%d) is too high", info.name, ndims);
        return false;
    }

    int size[CV_MAX_DIM+1];
    size_t step[CV_MAX_DIM+1];
    size_t elemsize = CV_ELEM_SIZE1(type);
    const npy_intp* _sizes = PyArray_DIMS(oarr);
    const npy_intp* _strides = PyArray_STRIDES(oarr);
    bool ismultichannel = ndims == 3 && _sizes[2] <= CV_CN_MAX;

    for( int i = ndims-1; i >= 0 && !needcopy; i-- )
    {
        // these checks handle cases of
        //  a) multi-dimensional (ndims > 2) arrays, as well as simpler 1- and 2-dimensional cases
        //  b) transposed arrays, where _strides[] elements go in non-descending order
        //  c) flipped arrays, where some of _strides[] elements are negative
        // the _sizes[i] > 1 is needed to avoid spurious copies when NPY_RELAXED_STRIDES is set
        if( (i == ndims-1 && _sizes[i] > 1 && (size_t)_strides[i] != elemsize) ||
            (i < ndims-1 && _sizes[i] > 1 && _strides[i] < _strides[i+1]) )
            needcopy = true;
    }

    if( ismultichannel && _strides[1] != (npy_intp)elemsize*_sizes[2] )
        needcopy = true;

    if (needcopy)
    {
        if (info.outputarg)
        {
            failmsg("Layout of the output array %s is incompatible with cv::Mat (step[ndims-1] != elemsize or step[1] != elemsize*nchannels)", info.name);
            return false;
        }

        if( needcast ) {
            o = PyArray_Cast(oarr, new_typenum);
            oarr = (PyArrayObject*) o;
        }
        else {
            oarr = PyArray_GETCONTIGUOUS(oarr);
            o = (PyObject*) oarr;
        }

        _strides = PyArray_STRIDES(oarr);
    }

    // Normalize strides in case NPY_RELAXED_STRIDES is set
    size_t default_step = elemsize;
    for ( int i = ndims - 1; i >= 0; --i )
    {
        size[i] = (int)_sizes[i];
        if ( size[i] > 1 )
        {
            step[i] = (size_t)_strides[i];
            default_step = step[i] * size[i];
        }
        else
        {
            step[i] = default_step;
            default_step *= size[i];
        }
    }

    // handle degenerate case
    if( ndims == 0) {
        size[ndims] = 1;
        step[ndims] = elemsize;
        ndims++;
    }

    if( ismultichannel )
    {
        ndims--;
        type |= CV_MAKETYPE(0, size[2]);
    }

    if( ndims > 2 && !allowND )
    {
        failmsg("%s has more than 2 dimensions", info.name);
        return false;
    }

    m = Mat(ndims, size, type, PyArray_DATA(oarr), step);
    m.u = g_numpyAllocator.allocate(o, ndims, size, type, step);
    m.addref();

    if( !needcopy )
    {
        Py_INCREF(o);
    }
    m.allocator = &g_numpyAllocator;

    return true;
}

template<>
bool pyopencv_to(PyObject* o, Mat& m, const char* name)
{
    return pyopencv_to(o, m, ArgInfo(name, 0));
}

template<>
PyObject* pyopencv_from(const Mat& m)
{
    if( !m.data )
        Py_RETURN_NONE;
    Mat temp, *p = (Mat*)&m;
    if(!p->u || p->allocator != &g_numpyAllocator)
    {
        temp.allocator = &g_numpyAllocator;
        ERRWRAP2(m.copyTo(temp));
        p = &temp;
    }
    PyObject* o = (PyObject*)p->u->userdata;
    Py_INCREF(o);
    return o;
}
```

setupGpuWrapper.py

````python
import subprocess
import os
import numpy as np
from distutils.core import setup, Extension
from Cython.Build import cythonize
from Cython.Distutils import build_ext

"""
Run setup with the following command:
```
python setupGpuWrapper.py build_ext --inplace
```
"""

# Determine current directory of this setup file to find our module
CUR_DIR = os.path.dirname(__file__)
# Use pkg-config to determine library locations and include locations
opencv_libs_str = subprocess.check_output("pkg-config --libs opencv".split()).decode()
opencv_incs_str = subprocess.check_output("pkg-config --cflags opencv".split()).decode()
# Parse into usable format for Extension call
opencv_libs = [str(lib) for lib in opencv_libs_str.strip().split()]
opencv_incs = [str(inc) for inc in opencv_incs_str.strip().split()]

extensions = [
    Extension('GpuWrapper',
              sources=[os.path.join(CUR_DIR, 'GpuWrapper.pyx')],
              language='c++',
              include_dirs=[np.get_include()] + opencv_incs,
              extra_link_args=opencv_libs)
]

setup(
    cmdclass={'build_ext': build_ext},
    name="GpuWrapper",
    ext_modules=cythonize(extensions)
)
````
