In one sentence: float16 arithmetic is twice as fast as float32. So says NVIDIA; if the numbers are off, don't blame me. (≖ ‿ ≖)✧
Now the downside, also in one sentence: you lose precision. (Well, obviously. (*´Д`*))
IEEE 754 (wiki) is where the rules for every float format live; a contemptuous glance at Baidu Baike here. 눈_눈
float32: negative -3.4028235E+38 to -1.401298E-45, positive 1.401298E-45 to 3.4028235E+38
float16: half precision takes 2 bytes: 1 sign bit, 5 exponent bits, and 10 mantissa bits, which works out to roughly 3 decimal digits of precision (the "0.001" people quote). As for its range: work it out yourselves (´・ω・`), and whoever can't, go ask the teacher (。◕ˇ∀ˇ◕).
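(Fine, a hint for the homework: with 5 exponent bits the bias is 15, so the largest normal float16 value is (2 − 2^-10) × 2^15 = 65504, the smallest normal one is 2^-14 ≈ 6.1E-5, and subnormals stretch down to 2^-24 ≈ 5.96E-8. Noticeably narrower than float32, so overflow is a real concern.)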
Looks like it can actually stand up to float32 after all. (Hey, hey, is it really fine to jump to conclusions like that?)
Well, since it's that impressive, and a TX1 happens to be in hand anyway, let's give it a spin (yeah, still not going to explain that)! The plan: one giant matrix multiply. Though as it turns out, the only float16 operation CUDA provides so far is matrix multiplication anyway, cublasHgemm in cuBLAS. _(:3 」∠)_
Code first. Feel free to skip it; it's long and messy, so do mind it all you like, because I didn't write it. (。◕ˇ∀ˇ◕)
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cstdint>
#include <iostream>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas.h>
#include <cublas_v2.h>
using namespace std;
#define IDX2C(i,j,leading) (((j)*(leading))+(i)) /* column-major indexing, the layout cuBLAS expects */
typedef struct _data *PDATA;
typedef struct _data
{
int _rows;
int _cols;
float *data;
} Data;
typedef struct _hdata *PHDATA;
typedef struct _hdata
{
int _rows;
int _cols;
half *data;
} HData;
void free_mat(PDATA mat)
{
free(mat->data);
free(mat);
}
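/* note: nothing in this listing ever frees the HData matrices (mat4/mat5/mat6 below just leak) */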
/* uint16_t and uint32_t come from <cstdint> above */
/* host-side bit cast: reinterpret a 16-bit pattern as an IEEE-754 half */
half uint16_as_fp16 (uint16_t a)
{
half res;
#if defined (__cplusplus)
memcpy (&res, &a, sizeof (res));
#else /* __cplusplus */
volatile union {
half f;
uint16_t i;
} cvt;
cvt.i = a;
res = cvt.f;
#endif /* __cplusplus */
return res;
}
/* host-side bit cast: view a float's raw 32-bit pattern */
uint32_t fp32_as_uint32 (float a)
{
uint32_t res;
#if defined (__cplusplus)
memcpy (&res, &a, sizeof (res));
#else /* __cplusplus */
volatile union {
float f;
uint32_t i;
} cvt;
cvt.f = a;
res = cvt.i;
#endif /* __cplusplus */
return res;
}
/* host version of device function __float2half_rn() */
half float2half_rn (float a)
{
uint32_t ia = fp32_as_uint32 (a);
uint16_t ir;
ir = (ia >> 16) & 0x8000;
if ((ia & 0x7f800000) == 0x7f800000) {
if ((ia & 0x7fffffff) == 0x7f800000) {
ir |= 0x7c00; /* infinity */
} else {
ir = 0x7fff; /* canonical NaN */
}
} else if ((ia & 0x7f800000) >= 0x33000000) {
int shift = (int)((ia >> 23) & 0xff) - 127;
if (shift > 15) {
ir |= 0x7c00; /* infinity */
} else {
ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
if (shift < -14) { /* denormal */
ir |= ia >> (-1 - shift);
ia = ia << (32 - (-1 - shift));
} else { /* normal */
ir |= ia >> (24 - 11);
ia = ia << (32 - (24 - 11));
ir = ir + ((14 + shift) << 10);
}
/* IEEE-754 round to nearest, ties to even */
if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
ir++;
}
}
}
return uint16_as_fp16 (ir);
}
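/* A quick host-side sanity check I'm adding here (not part of the original
   listing): round-trip a float through float2half_rn and dump the 16-bit
   pattern, to watch the mantissa get cut to 10 bits. */
void show_half_bits (float a)
{
uint16_t bits;
half h = float2half_rn (a); /* assumes half is 2 bytes, as cuda_fp16.h defines it */
memcpy (&bits, &h, sizeof (bits));
printf ("%g -> 0x%04x\n", a, bits);
}
/* e.g. show_half_bits(1.0f) prints 0x3c00, show_half_bits(65504.0f) prints
   0x7bff (the largest half), and show_half_bits(65536.0f) overflows to
   0x7c00 (infinity). */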
/* C = mat1 * mat2 in half precision on the GPU via cublasHgemm */
PHDATA mat_product(PHDATA mat1,PHDATA mat2)
{
if(mat1->_cols!=mat2->_rows)
{
printf("this is not right\n");
return NULL;
}
PHDATA mat3=new HData;
mat3->data=(half *)malloc(sizeof(half)*(mat1->_rows)*(mat2->_cols));
mat3->_rows=mat1->_rows;
mat3->_cols=mat2->_cols;
/*
 * the input matrices are already initialized on the host,
 * stored column-major (what cuBLAS wants)
 */
{
half *d_a,*d_b,*d_c;
cublasInit(); /* legacy helper API, mixed here with the v2 handle API below; works, but ugly */
cublasAlloc((mat1->_cols)*(mat1->_rows),sizeof(half),(void **)&d_a);
cublasAlloc((mat2->_cols)*(mat2->_rows),sizeof(half),(void **)&d_b);
cublasAlloc((mat3->_rows)*(mat3->_cols),sizeof(half),(void **)&d_c);
cudaMemcpy(d_a,mat1->data,sizeof(half)*(mat1->_cols)*(mat1->_rows),cudaMemcpyHostToDevice);
cudaMemcpy(d_b,mat2->data,sizeof(half)*(mat2->_rows)*(mat2->_cols),cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
half alpha=float2half_rn(float(1.0));
half beta=float2half_rn(float(0.0));
cudaEvent_t start,stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
cublasHgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,mat1->_rows,mat2->_cols,
mat2->_rows,&alpha,d_a,mat1->_rows,d_b,mat2->_rows,&beta,d_c,mat1->_rows);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float ela=0;
cudaEventElapsedTime(&ela,start,stop);
cout<<"GPU: "<data,d_c,sizeof(half)*(mat3->_rows)*(mat3->_cols),cudaMemcpyDeviceToHost);
cublasFree(d_a);
cublasFree(d_b);
cublasFree(d_c);
cublasShutdown();
}
/* mat3 comes back column-major like the inputs; transpose it (or just index with IDX2C) before reading row by row */
return mat3;
}
void ele_mat_show(PDATA mat)
{
for (int i=0;i<mat->_rows;i++){
for (int j=0;j<mat->_cols;j++){
cout<<mat->data[IDX2C(i,j,mat->_rows)]<<"\t";
}
cout<<endl;
}
}
int main()
{
PDATA mat1=(PDATA)malloc(sizeof(Data));
PDATA mat2=(PDATA)malloc(sizeof(Data));
PHDATA mat4=(PHDATA)malloc(sizeof(HData));
PHDATA mat5=(PHDATA)malloc(sizeof(HData));
PHDATA mat6;
mat1->_rows=5000;
mat1->_cols=50000;
mat4->_rows=5000;
mat4->_cols=50000;
mat1->data=(float *)malloc(sizeof(float)*mat1->_rows*mat1->_cols);
mat4->data=(half *)malloc(sizeof(half)*mat1->_rows*mat1->_cols);
for (int i=0;i<mat1->_rows;i++)
for (int j=0;j<mat1->_cols;j++)
mat1->data[IDX2C(i,j,mat1->_rows)]=i+j;
for (int i=0;i<mat1->_rows;i++)
for (int j=0;j<mat1->_cols;j++)
mat4->data[IDX2C(i,j,mat1->_rows)]=float2half_rn(mat1->data[IDX2C(i,j,mat1->_rows)]);
mat2->_rows=50000;
mat2->_cols=2000;
mat5->_rows=50000;
mat5->_cols=2000;
mat2->data=(float *)malloc(sizeof(float)*mat2->_rows*mat2->_cols);
mat5->data=(half *)malloc(sizeof(half)*mat2->_rows*mat2->_cols);
for (int i=0;i<mat2->_rows;i++)
for (int j=0;j<mat2->_cols;j++)
mat2->data[IDX2C(i,j,mat2->_rows)]=i+j;
for (int i=0;i<mat2->_rows;i++)
for (int j=0;j<mat2->_cols;j++)
mat5->data[IDX2C(i,j,mat2->_rows)]=float2half_rn(mat2->data[IDX2C(i,j,mat2->_rows)]);
mat6=mat_product(mat4,mat5);
return 0;
}
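The "two times" verdict below comes from racing the half version against a plain float32 multiply. For anyone who wants to reproduce the race, a float32 counterpart can be sketched like this (same shape as mat_product above, with cublasSgemm in place of cublasHgemm; the name mat_product_fp32 and the cudaMalloc-based allocation are my own, not from the original listing):
/* fp32 baseline: C = mat1 * mat2 via cublasSgemm, timed the same way */
PDATA mat_product_fp32(PDATA mat1,PDATA mat2)
{
if(mat1->_cols!=mat2->_rows)
{
printf("this is not right\n");
return NULL;
}
PDATA mat3=new Data;
mat3->data=(float *)malloc(sizeof(float)*(mat1->_rows)*(mat2->_cols));
mat3->_rows=mat1->_rows;
mat3->_cols=mat2->_cols;
float *d_a,*d_b,*d_c;
cudaMalloc((void **)&d_a,sizeof(float)*(mat1->_rows)*(mat1->_cols));
cudaMalloc((void **)&d_b,sizeof(float)*(mat2->_rows)*(mat2->_cols));
cudaMalloc((void **)&d_c,sizeof(float)*(mat3->_rows)*(mat3->_cols));
cudaMemcpy(d_a,mat1->data,sizeof(float)*(mat1->_rows)*(mat1->_cols),cudaMemcpyHostToDevice);
cudaMemcpy(d_b,mat2->data,sizeof(float)*(mat2->_rows)*(mat2->_cols),cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
float alpha=1.0f,beta=0.0f;
cudaEvent_t start,stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,mat1->_rows,mat2->_cols,
mat2->_rows,&alpha,d_a,mat1->_rows,d_b,mat2->_rows,&beta,d_c,mat1->_rows);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float ela=0;
cudaEventElapsedTime(&ela,start,stop);
cout<<"GPU fp32: "<<ela<<" ms"<<endl;
cudaMemcpy(mat3->data,d_c,sizeof(float)*(mat3->_rows)*(mat3->_cols),cudaMemcpyDeviceToHost);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cublasDestroy(handle);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return mat3;
}
Call it on mat1 and mat2 from main and compare the two printed times.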
Well damn, (╯‵□′)╯︵┻━┻ it really is about twice as fast. NVIDIA, you win.
So with that settled, time to get to work: make faster rcnn run half precision on the TX1.