#ifndef TINYTRT_DECODE_H
#define TINYTRT_DECODE_H
#include "Trt.h"
#include "utils.h"
#include "YoloLayerPlugin/YoloLayerPlugin.hpp"
using namespace std;
// Axis-aligned bounding box in pixel coordinates with class id and confidence.
// NOTE(review): not used by the two decode functions below (they fill
// Detection from YoloLayerPlugin instead) — presumably consumed elsewhere.
struct Bbox {
int left, right, top, bottom;
int clsId;
float score;
};
// GPU-side decode of one raw YOLO output tensor (flattened host vector) into
// Detection boxes. Returns the last CUDA error status.
cudaError_t decode_gpu(vector<float> input,YoloKernel yolo_kernel,vector<Detection>& output);
// CPU reference implementation of the same decoding.
// NOTE(review): declared to return float but the value carries no meaning —
// callers should ignore it.
float my_decode(vector<float> intput,YoloKernel yolo_kernel,vector<Detection>& output);
#endif //TINYTRT_DECODE_H
可以看出decode.h只有两个函数:一个是cpu端的解码,一个是gpu端的解码。可以将cpu端解码的代码写进decode.cu文件中,也可以单独写进decode.cpp中.
#include "decode.h"
// Sigmoid (logistic) activation: maps any real input into the open interval (0, 1).
float my_Logist(float data){
    float denom = 1. + exp(-data);
    return 1. / denom;
}
// CPU-side decode of one raw YOLO output tensor.
//
// Tensor layout (per image): [anchor (3)][attribute (5+classes)][cell (w*h)],
// i.e. the attributes of one cell are `stride` floats apart.
//
// Params:
//   intput      - flattened network output (host memory).
//   yolo_kernel - grid size and anchor definitions for this YOLO head.
//   output      - decoded detections are appended here.
// Returns 0.f (the return value is not meaningful; see header).
float my_decode(vector<float> intput,YoloKernel yolo_kernel,vector<Detection>& output)
{
    YoloKernel yolo = yolo_kernel;
    int stride = yolo.width * yolo.height;     // cells per attribute plane
    const int num_classes = 2;                 // hard-coded, matches GPU path
    const int info_len = 5 + num_classes;      // x,y,w,h,objectness + class scores
    for (int i = 0; i < stride; i++)
    {
        for (int j = 0; j < 3; j++)            // 3 anchors per cell
        {
            int begin_id = info_len * stride * j + i;
            int obj_id = begin_id + 4 * stride;
            float obj_prob = my_Logist(intput[obj_id]);
            if (obj_prob < 0.7)
                continue;
            // Pick the best class whose (class * objectness) score clears 0.7.
            int class_id = -1;
            float max_prob = 0.7;
            for (int k = 0; k < num_classes; k++)
            {
                float temp_prob = my_Logist(intput[begin_id + (5 + k) * stride]) * obj_prob;
                if (temp_prob > max_prob)
                {
                    class_id = k;
                    max_prob = temp_prob;
                }
            }
            // BUG FIX: this emission block used to live INSIDE the class loop
            // above, so a box that matched on class 0 was pushed again on every
            // later iteration, producing duplicate detections.
            if (class_id >= 0)
            {
                Detection det;
                int row = i / yolo.width;
                int cols = i % yolo.width;     // BUG FIX: was `% yolo.height`
                float a = my_Logist(intput[begin_id]);
                float b = my_Logist(intput[begin_id + stride]);
                // Center coordinates normalized to [0,1] of the grid.
                det.bbox[0] = (cols + a) / yolo.width;
                det.bbox[1] = (row + b) / yolo.height;
                // Width/height scaled by the matching anchor (same as GPU path).
                det.bbox[2] = exp(intput[begin_id + 2 * stride]) * yolo.anchors[2 * j];
                det.bbox[3] = exp(intput[begin_id + 3 * stride]) * yolo.anchors[2 * j + 1];
                det.classId = class_id;
                det.prob = max_prob;
                output.emplace_back(det);
            }
        }
    }
    return 0.f; // BUG FIX: function is declared float but had no return (UB)
}
cpu端的解码没有什么可讲的,理解模型最后输出的是什么就可以.
模型最后的输出是一个 batch × (channel × (box+1+class)) × yolo.w × yolo.h 的float数组,box指的是x,y,w,h,1代表的是这个anchor内有没有目标,class代表的是种类.
#include "decode.h"
#include "Trt.h"
#include "utils.h"
#include "math.h"
#include <vector>
#include <iostream>
#include <cmath>
#include "../plugin/YoloLayerPlugin/YoloLayerPlugin.hpp"
// Device-side sigmoid. Uses single-precision literals and expf so the whole
// expression stays in float (the original `1./(1.+exp(...))` promoted every
// operation to double, which is needlessly slow on consumer GPUs).
__device__ float Logist1(float data){ return 1.0f / (1.0f + expf(-data)); }
// One thread per grid cell (over all batch images). Each thread scans the
// 3 anchors of its cell and appends qualifying Detections to the per-image
// output slab, whose first float is an atomic detection counter.
//
// Layout of `input` per image: [anchor (3)][attribute (5+classes)][cell].
// Layout of `output` per image: [count (1 float)][Detection x capacity].
__global__ void caldetection(const float* input,float* output,int noelements,int yolowidth,int yoloheight,
const float anchors[6],int classes,int outputElem)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= noelements) return;          // BUG FIX: was `>`, letting thread
                                            // `noelements` read out of bounds
    int stride = yoloheight * yolowidth;    // cells per attribute plane
    int bnidx = idx / stride;               // batch-image index
    int curidx = idx - bnidx * stride;      // cell index within the plane
    const int info_len = 5 + classes;       // was hard-coded 7; now honors the
                                            // `classes` parameter (7 == 5 + 2)
    const float* curinput = input + bnidx * info_len * stride * 3;
    for (int k = 0; k < 3; ++k)             // 3 anchors per cell
    {
        int beginidx = (info_len * stride) * k + curidx;
        int objidx = beginidx + stride * 4;
        float objprob = Logist1(curinput[objidx]);
        if (objprob <= 0.7f)
            continue;
        int row = curidx / yolowidth;
        int cols = curidx % yolowidth;
        // Best class whose combined score beats the threshold.
        int classId = -1;
        float maxProb = IGNORE_THRESH;
        for (int c = 0; c < classes; ++c) {
            float cProb = Logist1(curinput[beginidx + (5 + c) * stride]) * objprob;
            if (cProb > maxProb) {
                maxProb = cProb;
                classId = c;
            }
        }
        if (classId >= 0) {
            float* curOutput = output + bnidx * outputElem;
            // Atomically claim a slot; the counter lives in the first float
            // and must be zeroed by the host before launch.
            int resCount = (int)atomicAdd(curOutput, 1);
            char* data = (char*)curOutput + sizeof(float) + resCount * sizeof(Detection);
            Detection* det = (Detection*)(data);
            det->bbox[0] = (cols + Logist1(curinput[beginidx])) / yolowidth;
            det->bbox[1] = (row + Logist1(curinput[beginidx + stride])) / yoloheight;
            det->bbox[2] = expf(curinput[beginidx + 2 * stride]) * anchors[2 * k];
            det->bbox[3] = expf(curinput[beginidx + 3 * stride]) * anchors[2 * k + 1];
            det->classId = classId;  // was llround(double(float(classId))) — a no-op detour
            det->prob = maxProb;
        }
    }
}
// Host wrapper for caldetection: uploads the raw YOLO tensor and anchors,
// launches the decode kernel, and copies the detections back into `output`.
//
// Params:
//   input       - flattened network output on the host.
//   yolo_kernel - grid size and anchors for this YOLO head.
//   output      - decoded detections are appended here.
// Returns the last CUDA error observed.
cudaError_t decode_gpu(vector<float> input,YoloKernel yolo_kernel,vector<Detection>& output)
{
    // Upload the raw tensor.
    float* temp_input = nullptr;
    CUDA_CHECK(cudaMalloc(&temp_input, input.size() * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(temp_input, input.data(), input.size() * sizeof(float), cudaMemcpyHostToDevice));

    // Upload the anchors.
    void* devAnchor = nullptr;
    size_t AnchorLen = sizeof(float) * CHECK_COUNT * 2;
    CUDA_CHECK(cudaMalloc(&devAnchor, AnchorLen));
    CUDA_CHECK(cudaMemcpy(devAnchor, yolo_kernel.anchors, AnchorLen, cudaMemcpyHostToDevice));

    // Output slab: 1 float counter + room for every possible detection
    // (3 anchors per cell).
    const int maxDet = yolo_kernel.width * yolo_kernel.height * 3;
    int outputElem = 1 + maxDet * sizeof(Detection) / sizeof(float);
    float* output1 = nullptr;
    CUDA_CHECK(cudaMalloc(&output1, sizeof(float) * outputElem));
    // BUG FIX: the kernel atomicAdd()s the counter in output1[0], so the
    // buffer must start at zero; it was previously left uninitialized, making
    // the returned detection count garbage.
    CUDA_CHECK(cudaMemset(output1, 0, sizeof(float) * outputElem));

    int numelem = yolo_kernel.width * yolo_kernel.height;
    caldetection<<<(numelem + 512 - 1) / 512, 512>>>
        (temp_input, output1, numelem, yolo_kernel.width, yolo_kernel.height, (float*)devAnchor, 2, outputElem);
    // Catch launch-configuration errors immediately.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "CUDA error " << cudaGetErrorString(cudaStatus) << " at " << __FILE__ << ":" << __LINE__ << std::endl;
    }

    // The blocking cudaMemcpy below also synchronizes with the kernel.
    float* out_host = nullptr;
    CUDA_CHECK(cudaMallocHost(&out_host, sizeof(float) * outputElem));
    CUDA_CHECK(cudaMemcpy(out_host, output1, sizeof(float) * outputElem, cudaMemcpyDeviceToHost));

    // BUG FIX: read back through Detection* instead of float slots — the old
    // out_host[6*k+5] reinterpreted the classId field's bytes as float.
    // Also clamp the count to the buffer capacity for safety.
    int count = (int)out_host[0];
    if (count > maxDet) count = maxDet;
    const Detection* dets = (const Detection*)(out_host + 1);
    for (int k = 0; k < count; k++) {
        output.push_back(dets[k]);
    }

    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "CUDA error " << cudaGetErrorString(cudaStatus) << " at " << __FILE__ << ":" << __LINE__ << std::endl;
    }

    // BUG FIX: release everything — temp_input, output1 and out_host leaked
    // before (only devAnchor was freed).
    cudaFree(temp_input);
    cudaFree(devAnchor);
    cudaFree(output1);
    cudaFreeHost(out_host);
    return cudaStatus;
}
.cu文件中要注意,kernel函数中不能debug进去,也不能将中间结果输出到txt中查看,(目前水平是这样),因此只能靠printf调试,还是很麻烦的.
注意__device__,__global__,__host__的区别:__device__函数由gpu调用并在gpu上执行,__global__函数由cpu调用并在gpu上执行,所以decode.cu的核函数为__global__。在进入__global__函数前,要将cpu中的数据拷贝到gpu,还要在gpu上申请输出的空间块(这个空间要大于等于你所需要的空间)
这一块指针,空间申请之类的c++,cuda语法还要多多研究,先谈点现在简单的理解.
float* temp_input;
CUDA_CHECK(cudaMalloc(&temp_input,input.size()*sizeof(float)));
CUDA_CHECK(cudaMemcpy(temp_input,&input[0],input.size()*sizeof(float),cudaMemcpyHostToDevice));
void* devAnchor;
size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen));
CUDA_CHECK(cudaMemcpy(devAnchor,yolo_kernel.anchors,AnchorLen,cudaMemcpyHostToDevice));
float* output1;
int outputElem = 1;
outputElem+=yolo_kernel.width*yolo_kernel.height*3*sizeof(Detection)/sizeof(float);
CUDA_CHECK(cudaMalloc(&output1,sizeof(float)*outputElem));
关于cudaMalloc()和cudaMemcpy()函数的使用详见cuda编程手册.
注意核函数中要用到的所有host数据都要cpy到device在送入核函数.
float* out_host{};
CUDA_CHECK(cudaMallocHost(&out_host,sizeof(float)*outputElem));
CUDA_CHECK(cudaMemcpy(out_host,output1,sizeof(float)*outputElem,cudaMemcpyDeviceToHost));
这一步是将device中的值cpy到host处理,要不gpu中的值是不可见状态.
caldetection<<<(yolo_kernel.width*yolo_kernel.height+512-1)/512,512>>>
(temp_input,output1,numelem,yolo_kernel.width,yolo_kernel.height,(float *)devAnchor,2,outputElem);
注意下kernel函数的调用,使用的是 <<<grid, block>>> 语法.
cudaFree(devAnchor)
cudaFree(output1);
cudaFreeHost(out_host);
最后使用完要记得释放指针,避免报段错误.
int nByte=sizeof(float)*nElem;
float *res_h=(float*)malloc(nByte);
memset(res_h,0,nByte);
free(res_h);
常见的c++内存拷贝
float *curOutput = output + bnidx*outputElem;
int resCount = (int)atomicAdd(curOutput,1);
char* data = (char * )curOutput + sizeof(float) + resCount*sizeof(Detection);
Detection* det = (Detection*)(data);
det->bbox[0] = (cols + Logist1(curinput[beginidx]))/ yolowidth;
注意下atomicAdd()的使用,大概是用来多线程计数的.指针数组申请时一定要分配空间块大小及地址.还有强制类型转换的用法.