tensorrt cuda函数编写

cu文件的编写

  • .h文件的编写
  • .cpp
  • .cu

cuda编程手册地址, cuda基础

.h文件的编写

#ifndef TINYTRT_DECODE_H
#define TINYTRT_DECODE_H
#include "Trt.h"
#include "utils.h"
#include "YoloLayerPlugin/YoloLayerPlugin.hpp"
// NOTE(review): `using namespace std;` in a header leaks into every translation
// unit that includes it — prefer explicit std:: qualification in headers.
using namespace std;
// Axis-aligned box in pixel coordinates plus class id and confidence.
// NOTE(review): not used by the two decode functions below — presumably
// consumed elsewhere in the project; confirm before removing.
struct Bbox {
    int left, right, top, bottom;
    int clsId;
    float score;
};
// GPU decode: uploads `input` to the device, runs the decode kernel and
// appends decoded Detection structs to `output`. Returns the CUDA status.
cudaError_t decode_gpu(vector<float> input,YoloKernel yolo_kernel,vector<Detection>& output);
// CPU reference decode of the same tensor layout; appends to `output`.
float my_decode(vector<float> intput,YoloKernel yolo_kernel,vector<Detection>& output);
#endif //TINYTRT_DECODE_H

可以看出decode.h只有两个函数,一个是cpu端的解码,一个是gpu端的解码,可以将cpu端解码的代码写进decode.cu文件中,也可以单独写进decode.cpp中.

.cpp

#include "decode.h"
float my_Logist(float data){ return 1./(1. + exp(-data)); }
// CPU reference implementation of the YOLO decode.
// Tensor layout per anchor j (3 anchors, 2 classes => 7 channels each):
//   [x, y, w, h, objectness, cls0, cls1], each channel spanning width*height floats.
// Appends every detection whose objectness*class probability clears 0.7 to `output`.
// Returns the number of detections appended (the original fell off the end of a
// float-returning function, which is undefined behaviour in C++).
float my_decode(vector<float> intput,YoloKernel yolo_kernel,vector<Detection>& output)
{
    YoloKernel yolo=yolo_kernel;
    int stride=yolo.width*yolo.height;        // floats per channel
    size_t before=output.size();
    for(int i=0;i<stride;i++)                 // spatial cell
    {
        for(int j=0;j<3;j++)                  // anchor index
        {
            int begin_id=7*stride*j+i;        // channel 0 (x) for this anchor/cell
            int obj_id=begin_id+4*stride;     // objectness channel
            float obj_prob=my_Logist(intput[obj_id]);
            if(obj_prob<0.7)
                continue;
            // Pick the best class whose combined probability clears 0.7.
            int class_id=-1;
            float max_prob=0.7;
            for(int k=0;k<2;k++)
            {
                float temp_prob=my_Logist(intput[begin_id+(5+k)*stride])*obj_prob;
                if(temp_prob>max_prob)
                {
                    class_id=k;
                    max_prob=temp_prob;
                }
            }
            // BUG FIX: emit exactly once per anchor, AFTER the class loop.
            // The original emitted inside the loop, appending duplicate
            // detections, and computed cols with `% yolo.height` instead of
            // `% yolo.width` (the GPU kernel uses width — this now matches it).
            if(class_id>=0)
            {
                Detection det;
                int row=i/yolo.width;
                int cols=i%yolo.width;
                float a=my_Logist(intput[begin_id]);
                float b=my_Logist(intput[begin_id+stride]);
                det.bbox[0]=(cols+a)/yolo.width;    // normalised centre x
                det.bbox[1]=(row+b)/yolo.height;    // normalised centre y
                det.bbox[2]=exp(intput[begin_id+2*stride])*yolo.anchors[2*j];
                det.bbox[3]=exp(intput[begin_id+3*stride])*yolo.anchors[2*j+1];
                det.classId=class_id;
                det.prob=max_prob;
                output.emplace_back(det);
            }
        }
    }
    return float(output.size()-before);
}

cpu端的解码没有什么可讲的,理解模型最后输出的是什么就可以.
模型最后的输出是一个 batch\*(channel\*(box+1+class))\*yolo.w\*yolo.h 的float数组,box指的是x,y,w,h,1代表的是这个anchor内有没有目标,class代表的是种类.

.cu

#include "decode.h"
#include "Trt.h"
#include "utils.h"
#include "math.h"
#include <vector>
#include <iostream>
#include <cstdio>
#include "../plugin/YoloLayerPlugin/YoloLayerPlugin.hpp"
__device__ float Logist1(float data){ return 1./(1. + exp(-data)); };
// Decode kernel: one thread per spatial cell, idx in [0, noelements).
// Each thread scans 3 anchors at its cell; detections clearing the
// objectness / class thresholds are appended to `output` via an atomic
// counter. Per-batch output layout: [count(float), Detection, Detection, ...],
// `outputElem` floats total — sized by the host for the worst case
// (width*height*3 detections), so the atomic slot index cannot overflow it.
__global__ void caldetection(const float* input,float* output,int noelements,int yolowidth,int yoloheight,
                             const float anchors[6],int classes,int outputElem)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // BUG FIX: was `idx > noelements`, which let thread idx == noelements
    // read one cell past the end of the tensor.
    if (idx >= noelements) return;
    int stride = yoloheight * yolowidth;      // floats per channel
    int bnidx  = idx / stride;                // batch index
    int curidx = idx - bnidx * stride;        // cell within the feature map
    const float* curinput = input + bnidx * 7 * stride * 3;   // 3 anchors * 7 channels
    for (int k = 0; k < 3; ++k)               // anchor loop
    {
        int beginidx = 7 * stride * k + curidx;  // channel 0 (x) for this anchor/cell
        int objidx   = beginidx + stride * 4;    // objectness channel
        float objprob = Logist1(curinput[objidx]);
        if (objprob <= 0.7f)                  // float literal: avoid a double compare
            continue;
        int row  = curidx / yolowidth;
        int cols = curidx % yolowidth;
        // Best class whose combined probability clears IGNORE_THRESH.
        // FIX: iterate `classes` (the caller passes 2) instead of a hard-coded 2,
        // so the parameter is no longer dead.
        int classId = -1;
        float maxProb = IGNORE_THRESH;
        for (int c = 0; c < classes; ++c) {
            float cProb = Logist1(curinput[beginidx + (5 + c) * stride]) * objprob;
            if (cProb > maxProb) {
                maxProb = cProb;
                classId = c;
            }
        }
        if (classId >= 0) {
            // Reserve a slot: element 0 of this batch's output is the float count.
            float* curOutput = output + bnidx * outputElem;
            int resCount = (int)atomicAdd(curOutput, 1);
            char* data = (char*)curOutput + sizeof(float) + resCount * sizeof(Detection);
            Detection* det = (Detection*)(data);
            det->bbox[0] = (cols + Logist1(curinput[beginidx])) / yolowidth;
            det->bbox[1] = (row + Logist1(curinput[beginidx + stride])) / yoloheight;
            det->bbox[2] = expf(curinput[beginidx + 2 * stride]) * anchors[2 * k];
            det->bbox[3] = expf(curinput[beginidx + 3 * stride]) * anchors[2 * k + 1];
            det->classId = classId;   // was a pointless int -> float -> llround round-trip
            det->prob = maxProb;
        }
    }
}
// Host wrapper for caldetection: uploads the raw network output and the anchor
// table, launches the decode kernel (one thread per cell, 512 threads/block),
// downloads the packed [count, Detection...] buffer and appends the decoded
// detections to `output`. Returns the last CUDA status.
cudaError_t decode_gpu(vector<float> input,YoloKernel yolo_kernel,vector<Detection>& output)
{
    // Device copy of the network output.
    float* temp_input = nullptr;
    CUDA_CHECK(cudaMalloc(&temp_input, input.size() * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(temp_input, &input[0], input.size() * sizeof(float), cudaMemcpyHostToDevice));

    // Anchor table for this YOLO layer.
    void* devAnchor = nullptr;
    size_t AnchorLen = sizeof(float) * CHECK_COUNT * 2;
    CUDA_CHECK(cudaMalloc(&devAnchor, AnchorLen));
    CUDA_CHECK(cudaMemcpy(devAnchor, yolo_kernel.anchors, AnchorLen, cudaMemcpyHostToDevice));

    // Output buffer: 1 float counter + worst-case (w*h*3) Detection structs.
    int outputElem = 1;
    outputElem += yolo_kernel.width * yolo_kernel.height * 3 * sizeof(Detection) / sizeof(float);
    float* output1 = nullptr;
    CUDA_CHECK(cudaMalloc(&output1, sizeof(float) * outputElem));
    // BUG FIX: the original never zeroed output1, so the atomicAdd counter
    // started from garbage and the host-side count loop read nonsense.
    CUDA_CHECK(cudaMemset(output1, 0, sizeof(float) * outputElem));

    int numelem = yolo_kernel.width * yolo_kernel.height;
    caldetection<<<(numelem + 512 - 1) / 512, 512>>>
        (temp_input, output1, numelem, yolo_kernel.width, yolo_kernel.height, (float*)devAnchor, 2, outputElem);
    CUDA_CHECK(cudaGetLastError());   // catch a bad launch configuration immediately

    // Pinned host buffer; the blocking memcpy also synchronises with the kernel.
    float* out_host = nullptr;
    CUDA_CHECK(cudaMallocHost(&out_host, sizeof(float) * outputElem));
    CUDA_CHECK(cudaMemcpy(out_host, output1, sizeof(float) * outputElem, cudaMemcpyDeviceToHost));

    // out_host[0] is the detection count; each Detection occupies 6 floats
    // {bbox[4], classId, prob}.
    // NOTE(review): this assumes sizeof(Detection) == 6*sizeof(float) — confirm
    // against the Detection definition in the plugin header.
    for (int k = 0; k < int(out_host[0]); k++)
    {
        Detection temp;
        temp.bbox[0] = out_host[6 * k + 1];
        temp.bbox[1] = out_host[6 * k + 2];
        temp.bbox[2] = out_host[6 * k + 3];
        temp.bbox[3] = out_host[6 * k + 4];
        temp.classId = out_host[6 * k + 5];
        temp.prob    = out_host[6 * k + 6];
        output.push_back(temp);
    }

    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "CUDA error " << cudaGetErrorString(cudaStatus) << " at " << __FILE__ << ":" << __LINE__ << std::endl;
    }
    // BUG FIX: the original leaked temp_input, output1 and out_host, freed
    // devAnchor before the results were copied back, and was missing the
    // function's closing brace entirely.
    cudaFree(temp_input);
    cudaFree(devAnchor);
    cudaFree(output1);
    cudaFreeHost(out_host);
    return cudaStatus;
}

.cu文件中要注意,kernel函数中不能debug进去,也不能将中间结果输出到txt中查看,(目前水平是这样),因此只能靠printf调试,还是很麻烦的.
注意\_\_device\_\_,\_\_global\_\_,\_\_host\_\_的区别,\_\_device\_\_由gpu调用在gpu上执行,\_\_global\_\_由cpu调用在GPU上执行.所以decode.cu的核函数为\_\_global\_\_,在进入\_\_global\_\_函数前,要将cpu中的数据拷贝到gpu,还要在gpu上申请输出的空间块(这个空间要大于等于你所需要的空间)
这一块指针,空间申请之类的c++,cuda语法还要多多研究,先谈点现在简单的理解.

 float* temp_input;
    CUDA_CHECK(cudaMalloc(&temp_input,input.size()*sizeof(float)));
    CUDA_CHECK(cudaMemcpy(temp_input,&input[0],input.size()*sizeof(float),cudaMemcpyHostToDevice));
    void* devAnchor;
    size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
    CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen));
    CUDA_CHECK(cudaMemcpy(devAnchor,yolo_kernel.anchors,AnchorLen,cudaMemcpyHostToDevice));
    float* output1;
    int outputElem = 1;
    outputElem+=yolo_kernel.width*yolo_kernel.height*3*sizeof(Detection)/sizeof(float);
    CUDA_CHECK(cudaMalloc(&output1,sizeof(float)*outputElem));

关于cudaMalloc()和cudaMemcpy()函数的使用详见cuda编程手册.
注意核函数中要用到的所有host数据都要cpy到device在送入核函数.

float* out_host{};
CUDA_CHECK(cudaMallocHost(&out_host,sizeof(float)*outputElem));
CUDA_CHECK(cudaMemcpy(out_host,output1,sizeof(float)*outputElem,cudaMemcpyDeviceToHost));

这一步是将device中的值cpy到host处理,要不gpu中的值是不可见状态.

caldetection<<<(yolo_kernel.width*yolo_kernel.height+512-1)/512,512>>>
    (temp_input,output1,numelem,yolo_kernel.width,yolo_kernel.height,(float *)devAnchor,2,outputElem);

注意下kernel函数的调用,使用的是 `<<<grid, block>>>` 形式,其block,thread可以是一维,也可以是二维,每个的idx是确认好的.核函数简单理解就是同时进行很多for循环,这一部分还要多看多写.

cudaFree(devAnchor);
cudaFree(output1);
cudaFreeHost(out_host);

最后使用完要记得释放指针,避免报段错误.

int nByte=sizeof(float)*nElem;
float *res_h=(float*)malloc(nByte);
memset(res_h,0,nByte);
free(res_h);

常见的c++内存拷贝

float *curOutput = output + bnidx*outputElem;
int resCount = (int)atomicAdd(curOutput,1);
char* data = (char * )curOutput + sizeof(float) + resCount*sizeof(Detection);
Detection* det =  (Detection*)(data);
det->bbox[0] = (cols + Logist1(curinput[beginidx]))/ yolowidth;

注意下atomicAdd()的使用,它对同一地址做原子累加,多线程同时写时不会丢失更新,这里用它给检测结果计数并分配输出槽位.指针数组申请时一定要分配空间块大小及地址.还有强制类型转换的用法.

你可能感兴趣的:(c++,tensorrt,cuda,tensorrt)