本文测试Vitis-Tutorials中的第3个例子:Using Multiple Compute Units。
在Vitis中创建一个新的Application Project,平台选择zcu106vcu_base。
class Filter2DDispatcher {
cl_device_id &Device,
cl_context &Context,
cl_program &Program )
mKernel = clCreateKernel(Program, "Filter2DKernel", &mErr);
mQueue = clCreateCommandQueue(Context, Device, CL_QUEUE_PROFILING_ENABLE|CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &mErr);
mContext = Context;
mCounter = 0;
Filter2DRequest* operator() (
short *coeffs,
unsigned char *src,
unsigned int width,
unsigned int height,
unsigned int stride,
unsigned char *dst )
Filter2DRequest* req = new Filter2DRequest(mCounter++);
unsigned nbytes = (stride*height);
// Create input buffers for coefficients (host to device)
mSrcExt[0].flags = XCL_MEM_DDR_BANK0;
mSrcExt[0].param = 0;
mSrcExt[0].obj = coeffs;
mSrcBuf[0] = clCreateBuffer(mContext, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, (FILTER2D_KERNEL_V_SIZE*FILTER2D_KERNEL_V_SIZE)*sizeof(short), &mSrcExt[0], &mErr);
// Create input buffer for src (host to device)
mSrcExt[1].flags = XCL_MEM_DDR_BANK0;
mSrcExt[1].param = 0;
mSrcExt[1].obj = src;
mSrcBuf[1] = clCreateBuffer(mContext, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, nbytes, &mSrcExt[1], &mErr);
// Create output buffer for dst (device to host)
mDstExt[0].flags = XCL_MEM_DDR_BANK0;
mDstExt[0].param = 0;
mDstExt[0].obj = dst;
mDstBuf[0] = clCreateBuffer(mContext, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, nbytes, &mDstExt[0], &mErr);
// Schedule the writing of the inputs
clEnqueueMigrateMemObjects(mQueue, 1, mSrcBuf, 0, 0, nullptr, &req->mEvent[0]);
// Set the kernel arguments
clSetKernelArg(mKernel, 0, sizeof(cl_mem), &mSrcBuf[0]);
clSetKernelArg(mKernel, 1, sizeof(cl_mem), &mSrcBuf[1]);
clSetKernelArg(mKernel, 2, sizeof(unsigned int), &width);
clSetKernelArg(mKernel, 3, sizeof(unsigned int), &height);
clSetKernelArg(mKernel, 4, sizeof(unsigned int), &stride);
clSetKernelArg(mKernel, 5, sizeof(cl_mem), &mDstBuf[0]);
// Schedule the execution of the kernel
clEnqueueTask(mQueue, mKernel, 1, &req->mEvent[0], &req->mEvent[1]);
// Schedule the reading of the outputs
clEnqueueMigrateMemObjects(mQueue, 1, mDstBuf, CL_MIGRATE_MEM_OBJECT_HOST, 1, &req->mEvent[1], &req->mEvent[2]);
// Register call back to notify of kernel completion
clSetEventCallback(req->mEvent[1], CL_COMPLETE, event_cb, &req->mId);
return req;
cl_kernel mKernel;
cl_command_queue mQueue;
cl_context mContext;
cl_mem_ext_ptr_t mSrcExt[2];
cl_mem_ext_ptr_t mDstExt[1];
cl_mem mSrcBuf[2];
cl_mem mDstBuf[1];
cl_int mErr;
int mCounter;
使用XRT生成的platform中没有包含OpenCV的头文件,原因不明,因此这里手动将OpenCV的include文件复制到zcu106 platform中。
// Table of 15x15 filter kernels, indexed as filterCoeffs[filter][row][column].
// Each filter needs its own enclosing braces: without them, brace elision
// would turn every 15-element row into a separate (mostly zero) filter.
// The Identity, Blur and Motion Blur coefficients each sum to 225 (15*15);
// the Emboss coefficients sum to 0.
const short filterCoeffs[][15][15] = {
  // filterCoeffs[0] -> Identity
  {
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 225, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  },
  // filterCoeffs[1] -> Blur
  {
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
  },
  // filterCoeffs[2] -> Motion Blur
  {
    {15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0},
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15}
  },
  // filterCoeffs[3] -> Emboss
  {
    {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0},
    {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 1},
    {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 1, 1},
    {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 1, 1, 1},
    {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 1, 1, 1, 1},
    {-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 1, 1, 1, 1, 1},
    {-1,-1,-1,-1,-1,-1,-1,-1, 0, 1, 1, 1, 1, 1, 1},
    {-1,-1,-1,-1,-1,-1,-1, 0, 1, 1, 1, 1, 1, 1, 1},
    {-1,-1,-1,-1,-1,-1, 0, 1, 1, 1, 1, 1, 1, 1, 1},
    {-1,-1,-1,-1,-1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {-1,-1,-1,-1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {-1,-1,-1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {-1,-1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    {-1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
    { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
  }
};
板子上的OpenCV库缺少不带版本号的符号链接,因此需要手动创建链接:
root@zcu106vcu_base:~# ln -s /usr/lib/libopencv_core.so.3.4 /usr/lib/libopencv_core.so
root@zcu106vcu_base:~# ln -s /usr/lib/libopencv_highgui.so.3.4 /usr/lib/libopencv_highgui.so
root@zcu106vcu_base:~# ln -s /usr/lib/libopencv_imgcodecs.so.3.4 /usr/lib/libopencv_imgcodecs.so
root@zcu106vcu_base:~# ln -s /usr/lib/libopencv_imgproc.so.3.4 /usr/lib/libopencv_imgproc.so
root@zcu106vcu_base:~# /mnt/multiple_cu.exe -x /mnt/multiple_cu_container.xclbin -i /mnt/img/ov.bmp -f 1 -n 5
Xilinx 2D Filter Example Application
FPGA binary : /mnt/multiple_cu_container.xclbin
Input image : /mnt/img/ov.bmp
Number of runs : 5
Filter type : 1
Programming FPGA
Debug platform vendorXilinx
Debug platform NameXilinx
src size 1920 1080 8 3
Running FPGA version
Convert Image Format
Running Software version
MATCH PASS: Output matches reference
FPGA Time: 0.148503 s
FPGA Throughput: 199.748 MB/s
CPU Time: 85.1083 s
CPU Throughput: 0.348534 MB/s
FPGA Speedup: 573.11 x
按照150MHz、1920×1080来计算,假如一个clock处理一个像素,则处理1920×1080 = 2,073,600个像素的理论耗时为2,073,600 ÷ 150,000,000 ≈ 13.8毫秒。
使用Vitis和自定义的ZCU106 XRT平台完成了Vitis-Tutorials中的Using Multiple Compute Units功能测试。