Getting Faster R-CNN to support the TX1's fp16 (half float, float16) feature


Why use float16?

In one sentence: float16 arithmetic runs twice as fast as float32. That's NVIDIA's number, so don't blame me if it's off. (≖ ‿ ≖)✧

The downside, also in one sentence: you lose precision. (Yes, yes, obviously. (*´Д`*))


How much precision do we lose?

IEEE 754 (see the Wikipedia article) spells out the rules for all the float formats; Baidu Baike gets a side-eye here. 눈_눈

float32: negative values from -3.4028235E+38 to -1.401298E-45, positive values from 1.401298E-45 to 3.4028235E+38

float16: half precision takes 2 bytes, made up of 1 sign bit, 5 exponent bits, and 10 mantissa bits, which gives you roughly 3 decimal digits of precision (about 0.001). As for the range, work it out yourself (´・ω・`), and if you get stuck, go ask your teacher (。◕ˇ∀ˇ◕)。
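If you would rather not bother your teacher, here is a quick back-of-the-envelope check. This is my own sketch in plain C++, using only the 1/5/10 bit layout above and the standard binary16 exponent bias of 15:

// Back-of-the-envelope fp16 range check from the 1 sign / 5 exponent / 10 mantissa
// bit layout (IEEE 754 binary16, exponent bias 15).
#include <cstdio>
#include <cmath>

int main()
{
    // largest finite value: exponent field 11110 (= 30), mantissa all ones
    double max_half    = (2.0 - std::pow(2.0, -10)) * std::pow(2.0, 30 - 15); // 65504
    // smallest positive normal: exponent field 00001, mantissa zero
    double min_normal  = std::pow(2.0, 1 - 15);                               // ~6.10e-5
    // smallest positive subnormal: exponent field 00000, mantissa = 1
    double min_subnorm = std::pow(2.0, -24);                                  // ~5.96e-8
    printf("max %.0f, min normal %g, min subnormal %g\n",
           max_half, min_normal, min_subnorm);
    return 0;
}

The short version: the largest finite float16 value is 65504, which is worth remembering once sums and activations start growing.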

So it looks like float16 can actually hold its own against float32. (Hey, is it really okay to jump to that conclusion so quickly?)

Since it's supposedly that good, and a TX1 happens to be sitting on my desk, let's just try it (no, I'm not going to justify this any further)! The plan: a huge matrix multiplication. Conveniently, matrix multiplication is about the only float16 operation CUDA provides at the moment anyway. _(:3 」∠)_

First, the code. Feel free to skip straight past it; it's a bit long and a bit messy, and feel free to be as judgmental as you like, because I didn't write it. (。◕ˇ∀ˇ◕)

#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>
using namespace std;


#define  IDX2C(i,j,leading) (((j)*(leading))+(i))


typedef struct _data *PDATA;
typedef struct _data
{
    int _rows;
    int _cols;
    float *data;
} Data;

typedef struct _hdata *PHDATA;
typedef struct _hdata
{
    int _rows;
    int _cols;
    half *data;
} HData;
void free_mat(PDATA mat)
{
    free(mat->data);
    free(mat);
}
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;

half uint16_as_fp16 (uint16_t a)
{
    half res;
#if defined (__cplusplus)
    memcpy (&res, &a, sizeof (res));
#else /* __cplusplus */
    volatile union {
        half f;
        uint16_t i;
    } cvt;
    cvt.i = a;
    res = cvt.f;
#endif /* __cplusplus */
    return res;
}

uint32_t fp32_as_uint32 (float a)
{
    uint32_t res;
#if defined (__cplusplus)
    memcpy (&res, &a, sizeof (res));
#else /* __cplusplus */
    volatile union {
        float f;
        uint32_t i;
    } cvt;
    cvt.f = a;
    res = cvt.i;
#endif /* __cplusplus */
    return res;
}

/* host version of device function __float2half_rn() */
half float2half_rn (float a)
{
    uint32_t ia = fp32_as_uint32 (a);
    uint16_t ir;

    ir = (ia >> 16) & 0x8000;
    if ((ia & 0x7f800000) == 0x7f800000) {
        if ((ia & 0x7fffffff) == 0x7f800000) {
            ir |= 0x7c00; /* infinity */
        } else {
            ir = 0x7fff; /* canonical NaN */
        }
    } else if ((ia & 0x7f800000) >= 0x33000000) {
        int shift = (int)((ia >> 23) & 0xff) - 127;
        if (shift > 15) {
            ir |= 0x7c00; /* infinity */
        } else {
            ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
            if (shift < -14) { /* denormal */  
                ir |= ia >> (-1 - shift);
                ia = ia << (32 - (-1 - shift));
            } else { /* normal */
                ir |= ia >> (24 - 11);
                ia = ia << (32 - (24 - 11));
                ir = ir + ((14 + shift) << 10);
            }
            /* IEEE-754 round to nearest of even */
            if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
                ir++;
            }
        }
    }
    return uint16_as_fp16 (ir);
}

PHDATA mat_product(PHDATA mat1,PHDATA mat2)
{
    if(mat1->_cols!=mat2->_rows)
    {
        printf("this is not right\n");
            return NULL;
    }
    PHDATA mat3=new HData;
    mat3->data=(half *)malloc(sizeof(half)*(mat1->_rows)*(mat2->_cols));
    mat3->_rows=mat1->_rows;
    mat3->_cols=mat2->_cols;
    /*
     * set up the device matrices; cuBLAS expects column-major storage
     */
    {
        half *d_a,*d_b,*d_c;
        /* device buffers; cudaMalloc/cudaFree are used instead of the legacy
         * cublasAlloc/cublasFree so everything stays on the cuBLAS v2 API */
        cudaMalloc((void **)&d_a,sizeof(half)*(mat1->_cols)*(mat1->_rows));
        cudaMalloc((void **)&d_b,sizeof(half)*(mat2->_cols)*(mat2->_rows));
        cudaMalloc((void **)&d_c,sizeof(half)*(mat3->_rows)*(mat3->_cols));
        cudaMemcpy(d_a,mat1->data,sizeof(half)*(mat1->_cols)*(mat1->_rows),cudaMemcpyHostToDevice);
        cudaMemcpy(d_b,mat2->data,sizeof(half)*(mat2->_rows)*(mat2->_cols),cudaMemcpyHostToDevice);
        cublasHandle_t handle;
        cublasCreate(&handle);
        half alpha=float2half_rn(float(1.0));
        half beta=float2half_rn(float(0.0));
        /* time the half-precision GEMM with CUDA events */
        cudaEvent_t start,stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start,0);
        cublasHgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,mat1->_rows,mat2->_cols,
            mat2->_rows,&alpha,d_a,mat1->_rows,d_b,mat2->_rows,&beta,d_c,mat1->_rows);
        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);

        float ela=0;
        cudaEventElapsedTime(&ela,start,stop);
        cout<<"GPU: "<<ela<<" ms"<<endl;
        cudaMemcpy(mat3->data,d_c,sizeof(half)*(mat3->_rows)*(mat3->_cols),cudaMemcpyDeviceToHost);
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);
        cublasDestroy(handle);
    }
    /* mat3 is column-major; transpose it if a row-major result is needed */
    return mat3;
}

void ele_mat_show(PDATA mat)
{
    for (int i=0;i<mat->_rows;i++){
        for (int j=0;j<mat->_cols;j++){
            cout<<mat->data[IDX2C(i,j,mat->_rows)]<<"\t";
        }
        cout<<endl;
    }
}

int main()
{
    /* float32 source matrices and their half-precision copies */
    PDATA mat1=new Data;
    PDATA mat2=new Data;
    PHDATA mat4=new HData;
    PHDATA mat5=new HData;
    PHDATA mat6=NULL;

    mat1->_rows=5000;
    mat1->_cols=50000;
    mat4->_rows=5000;
    mat4->_cols=50000;
    mat1->data=(float *)malloc(sizeof(float)*mat1->_rows*mat1->_cols);
    mat4->data=(half *)malloc(sizeof(half)*mat1->_rows*mat1->_cols);
    for (int i=0;i<mat1->_rows;i++)
        for (int j=0;j<mat1->_cols;j++)
            mat1->data[IDX2C(i,j,mat1->_rows)]=i+j;
    for (int i=0;i<mat1->_rows;i++)
        for (int j=0;j<mat1->_cols;j++)
            mat4->data[IDX2C(i,j,mat1->_rows)]=float2half_rn(mat1->data[IDX2C(i,j,mat1->_rows)]);

    mat2->_rows=50000;
    mat2->_cols=2000;
    mat5->_rows=50000;
    mat5->_cols=2000;
    mat2->data=(float *)malloc(sizeof(float)*mat2->_rows*mat2->_cols);
    mat5->data=(half *)malloc(sizeof(half)*mat2->_rows*mat2->_cols);
    for (int i=0;i<mat2->_rows;i++)
        for (int j=0;j<mat2->_cols;j++)
            mat2->data[IDX2C(i,j,mat2->_rows)]=i+j;
    for (int i=0;i<mat2->_rows;i++)
        for (int j=0;j<mat2->_cols;j++)
            mat5->data[IDX2C(i,j,mat2->_rows)]=float2half_rn(mat2->data[IDX2C(i,j,mat2->_rows)]);

    mat6=mat_product(mat4,mat5);

    return 0;
}


After a long wait, it finished running. So, what's the verdict? (´・ω・`)

Well, damn. (╯‵□′)╯︵┻━┻ It really is about twice as fast. NVIDIA, you win.
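For what it's worth, the "twice as fast" is relative to the same multiplication done in float32 and timed the same way. Below is a minimal sketch of such a float32 baseline (my own code, not the original benchmark), with the same dimensions and the same CUDA-event timing; on the TX1's 4 GB of shared memory these sizes are tight, so shrink them if an allocation fails:

// float32 GEMM baseline sketch, same (5000 x 50000) * (50000 x 2000) shape as above.
// Build with something like: nvcc -arch=sm_53 sgemm_baseline.cu -lcublas
// (sgemm_baseline.cu is a hypothetical file name.)
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main()
{
    const int m = 5000, k = 50000, n = 2000;
    std::vector<float> a(size_t(m) * k, 1.0f), b(size_t(k) * n, 1.0f);

    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, sizeof(float) * a.size());
    cudaMalloc(&d_b, sizeof(float) * b.size());
    cudaMalloc(&d_c, sizeof(float) * size_t(m) * n);
    cudaMemcpy(d_a, a.data(), sizeof(float) * a.size(), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b.data(), sizeof(float) * b.size(), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    float alpha = 1.0f, beta = 0.0f;

    // time cublasSgemm with CUDA events, mirroring the fp16 run
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                &alpha, d_a, m, d_b, k, &beta, d_c, m);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    printf("float32 GEMM: %f ms\n", ms);

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    cublasDestroy(handle);
    return 0;
}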

So now the real work begins: making Faster R-CNN support half-precision computation on the TX1.


How to do it

1. Environment: CUDA 7.5, Python, the TX1, and so on
2. Get Caffe to support half-precision computation (a rough sketch of what that involves follows this list)
3. Add the layers that Faster R-CNN introduces: roi_pooling_layer and smooth_L1_loss_layer
4. Test (the results will be published on my blog once this is done)
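About step 2: the heart of "Caffe supporting half precision" is keeping blob data in half on the GPU and converting to and from float at the boundaries. The snippet below is not Caffe code, just my sketch of the kind of conversion kernels involved, built on the __float2half / __half2float intrinsics from cuda_fp16.h (the wrapper float_to_half is a hypothetical helper name; compile with -arch=sm_53 for the TX1):

// Sketch: bulk float <-> half conversion kernels of the sort an fp16 Caffe port
// needs at layer boundaries (not Caffe's actual code).
#include <cuda_fp16.h>

__global__ void float_to_half_kernel(const float *src, half *dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = __float2half(src[i]);   // per-element conversion via the cuda_fp16.h intrinsic
}

__global__ void half_to_float_kernel(const half *src, float *dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = __half2float(src[i]);
}

// Hypothetical host wrapper: launch enough 256-thread blocks to cover n elements.
void float_to_half(const float *d_src, half *d_dst, int n)
{
    int threads = 256;
    int blocks  = (n + threads - 1) / threads;
    float_to_half_kernel<<<blocks, threads>>>(d_src, d_dst, n);
}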

Current status

1. Faster R-CNN is running on the TX1.
2. MNIST runs with half precision on the TX1.
3. roi_pooling_layer and smooth_L1_loss_layer are being rewritten (see the sketch after this list).
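For context on item 3: roi_pooling_layer just does max pooling over each region of interest projected onto the conv feature map. Below is my stripped-down float32 sketch of that forward pass, in the spirit of the public py-faster-rcnn layer rather than a drop-in replacement; the fp16 port mostly means swapping the data pointers to half and converting with __half2float around the comparison:

// Stripped-down ROI max-pooling forward pass (float32 sketch, one thread per
// output element, grid-stride loop). Each roi is [batch_idx, x1, y1, x2, y2] in
// image coordinates; spatial_scale maps it onto the conv feature map.
#include <cfloat>

__global__ void roi_pool_forward(int nthreads, const float *bottom, float spatial_scale,
                                 int channels, int height, int width,
                                 int pooled_h, int pooled_w,
                                 const float *rois, float *top)
{
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nthreads;
         idx += blockDim.x * gridDim.x) {
        // decompose the linear output index into (n, c, ph, pw)
        int pw = idx % pooled_w;
        int ph = (idx / pooled_w) % pooled_h;
        int c  = (idx / pooled_w / pooled_h) % channels;
        int n  = idx / pooled_w / pooled_h / channels;

        const float *roi = rois + n * 5;
        int batch = (int)roi[0];
        int x1 = (int)roundf(roi[1] * spatial_scale);
        int y1 = (int)roundf(roi[2] * spatial_scale);
        int x2 = (int)roundf(roi[3] * spatial_scale);
        int y2 = (int)roundf(roi[4] * spatial_scale);
        int roi_w = max(x2 - x1 + 1, 1);
        int roi_h = max(y2 - y1 + 1, 1);

        // the sub-window of the ROI covered by this output bin, clamped to the map
        int hstart = min(max(y1 + (ph * roi_h) / pooled_h, 0), height);
        int hend   = min(max(y1 + ((ph + 1) * roi_h + pooled_h - 1) / pooled_h, 0), height);
        int wstart = min(max(x1 + (pw * roi_w) / pooled_w, 0), width);
        int wend   = min(max(x1 + ((pw + 1) * roi_w + pooled_w - 1) / pooled_w, 0), width);

        float maxval = (hstart >= hend || wstart >= wend) ? 0.0f : -FLT_MAX;
        const float *slice = bottom + (batch * channels + c) * height * width;
        for (int h = hstart; h < hend; ++h)
            for (int w = wstart; w < wend; ++w)
                maxval = fmaxf(maxval, slice[h * width + w]);
        top[idx] = maxval;
    }
}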

Related links

Will official Caffe support the TX1's fp16 feature?
New Features in CUDA 7.5
A look at the NVIDIA Jetson TX1

