The YUV-to-RGB conversion formulas are fixed, and YUV-to-RGB code is easy to find online. But drop that standard code into a real project and you will find the performance is still not good enough.
Recently, while implementing YUYV-to-RGB24 conversion on an Ingenic X1000 CPU,
I used the following integer-optimized formulas to avoid floating-point arithmetic:
B = y + ((443 * (u - 128)) >> 8);
G = y - ((179 * (v - 128) + 86 * (u - 128)) >> 8);
R = y + ((351 * (v - 128)) >> 8);
Based on the formulas above I implemented YUYV-to-RGB24 in standard C. Even so, the real-world performance was still unsatisfactory.
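For reference, here is a minimal sketch of such a scalar baseline (the function name, the clamp_u8 helper, and the loop framing are my own illustration; the original C version is not reproduced in this post):

#include <stdint.h>
#include <stddef.h>

static inline uint8_t clamp_u8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : v);
}

/* One YUYV macropixel (y0, u, y1, v) yields two RGB24 pixels;
   both y samples share the same u and v. */
static void yuyv_to_rgb24_scalar(const uint8_t *src, uint8_t *dst, size_t pixel_pairs)
{
    for (size_t i = 0; i < pixel_pairs; i++, src += 4, dst += 6) {
        int u = src[1] - 128;
        int v = src[3] - 128;
        int rd = (351 * v) >> 8;
        int gd = (179 * v + 86 * u) >> 8;
        int bd = (443 * u) >> 8;
        for (int k = 0; k < 2; k++) {
            int y = src[k * 2];
            dst[k * 3 + 0] = clamp_u8(y + rd); /* R */
            dst[k * 3 + 1] = clamp_u8(y - gd); /* G */
            dst[k * 3 + 2] = clamp_u8(y + bd); /* B */
        }
    }
}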
So I turned to XBurst's SIMD extension instruction set (MXU) to optimize the YUYV-to-RGB24 function and push the efficiency further.
For the XBurst MXU instruction set, see ftp://ftp.ingenic.com/SOC/X1000/X1000_M200_XBurst_ISA_MXU_PM.pdf
Below is the complete YUYV-to-RGB24 implementation, still based on the YUV-to-RGB formulas above. Translating the C code into MXU code made the source nearly 10x longer, but the payoff is clear: CPU usage dropped from 40% to 18%. Two details worth keeping in mind while reading: the listing halves the coefficients (222 ≈ 443/2, 175 ≈ 351/2) and pairs them with a 7-bit instead of an 8-bit right shift, and the MXU register xr0 is hardwired to zero, which the code uses to zero-extend bytes into 16-bit lanes.
#include <stddef.h>
#include <stdint.h>
#include <mxu.h> /* MXU intrinsic macros (S32LDDV, S32SFL, ...); the exact header name depends on the Ingenic toolchain */
#define i_pref(hint, base, offset) \
({ __asm__ __volatile__("pref %0,%2(%1)" :: "i"(hint), "r"(base), "i"(offset) : "memory"); })
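/* MIPS32 `pref` hints 4 (load_streamed) and 5 (store_streamed) mark the source
   and destination as streaming data, so the frame buffers are fetched ahead of
   use without evicting hot data from the cache. */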
int YUYV_to_RGB888(void *src_buf, void *dst_buf, size_t src_width, size_t src_height, size_t srcStride, size_t dstStride)
{
int line = 0;
int col = 0;
const uint8_t *src = (uint8_t *)src_buf;
uint8_t *dst = (uint8_t *)dst_buf;
uint32_t blue4, green4, red4;
i_pref(4, src, 0); // load_streamed
i_pref(5, dst, 0); // store_streamed
for(line = 0; line < src_height; line++)
{
i_pref(4, src, 0); // load_streamed
i_pref(5, dst, 0); // store_streamed
for( col = 0; col < src_width; col = col + 4)
{
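/* Each pass of this loop consumes four YUYV pixels (8 source bytes)
   and produces four RGB24 pixels (12 destination bytes). */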
S32LDDV(xr3, src, col, 1); // xr3:v0 ,y01,u0 ,y00
S32LDDV(xr4, src, col + 2, 1); // xr4:v1 ,y11,u1 ,y10
S32SFL(xr3, xr4, xr3, xr1, ptn1); // xr1:y11,y10,y01,y00
// xr3:v1 ,u1 ,v0 ,u0
S32LUI(xr13,128,ptn7); // xr13 = 128,128,128,128
Q8ADDE_SS(xr3, xr3, xr13, xr2); // xr2[15:0 ] = xr3[07:00] - xr13[07:00] u0-128
// xr2[31:16] = xr3[15:08] - xr13[15:08] v0-128
// xr3[15:0 ] = xr3[23:16] - xr13[23:16] u1-128
// xr3[31:16] = xr3[31:24] - xr13[31:24] v1-128
S32SFL(xr3, xr3, xr2, xr2, ptn3); // xr2[15:0 ] = u0-128
// xr2[31:16] = u1-128
// xr3[15:0 ] = v0-128
// xr3[31:16] = v1-128
///////////// now xr1 -> y, xr2 -> u-128, xr3 -> v-128; these stay live until the end of the iteration.
///////////// each (u,v) pair is shared by two adjacent y samples, so every chroma term below is duplicated into both 16-bit lanes.
/*******************************************BLUE**************/
S32LUI(xr14,222,ptn4); // xr14 = 0,222,0,222 (222 ≈ 443/2, paired with >>7 below)
D16MUL_WW(xr6,xr2,xr14,xr5); // xr5[31:00] = xr2[15:00] * xr14[15:00] (u0 - 128)*222
// xr6[31:00] = xr2[31:16] * xr14[31:16] (u1 - 128)*222
D32SAR(xr6,xr6,xr5,xr5,7); // xr5[31:00] = xr5[31:00] >>7 (u0 - 128)*222/128
// xr6[31:00] = xr6[31:00] >>7 (u1 - 128)*222/128
// compact to short
S32SFL(xr14,xr5,xr5,xr5,ptn3); // xr5[15:00] = (u0 - 128)*222/128
// xr5[31:16] = (u0 - 128)*222/128
// xr14 unused
// convert 16bit
S32SFL(xr14,xr6,xr6,xr6,ptn3); // xr6[15:00] = (u1 - 128)*222/128
// xr6[31:16] = (u1 - 128)*222/128
// xr14 unused
// expand y3,y2,y1,y0 to short
S32LUI(xr14,0,ptn7);
S32SFL(xr8,xr14,xr1,xr7,ptn0); //xr7[15:00] = xr1[07:00] y00
//xr7[31:16] = xr1[15:08] y01
//xr8[15:00] = xr1[23:16] y10
//xr8[31:16] = xr1[31:24] y11
Q16ACCM_AA(xr6, xr8, xr7 ,xr5); //xr5[15:0 ] += xr7[15:0 ] y00 + (u0 -128)*222/128
//xr5[31:16] += xr7[31:16] y01 + (u0 -128)*222/128
//xr6[15:0 ] += xr8[15:0 ] y10 + (u1 -128)*222/128
//xr6[31:16] += xr8[31:16] y11 + (u1 -128)*222/128
// xr9 {b3,b2,b1,b0}
Q16SAT (xr9, xr6, xr5); //xr9[07:00] = sat8(xr5[15:0 ]) y00 + (u0 -128)*222/128
//xr9[15:08] = sat8(xr5[31:16]) y01 + (u0 -128)*222/128
//xr9[23:16] = sat8(xr6[15:0 ]) y10 + (u1 -128)*222/128
//xr9[31:24] = sat8(xr6[31:16]) y11 + (u1 -128)*222/128
// blue4 {b3,b2,b1,b0}
S32STD(xr9,&blue4,0);
/*******************************************GREEN**************/
S32LUI(xr14,86,ptn4); //xr14 = 0,86,0,86
D16MUL_WW(xr6,xr2,xr14,xr5); //xr5[31:00] = xr2[15:00] * xr14[15:00] (u0 - 128)*86
//xr6[31:00] = xr2[31:16] * xr14[31:16] (u1 - 128)*86
// compact to short
S32SFL(xr6,xr6,xr5,xr5,ptn3); // xr5[15:00] = (u0 - 128)*86
// xr5[31:16] = (u1 - 128)*86
S32LUI(xr14,179,ptn4); // xr14 = 0,179,0,179
D16MADL_AA_WW(xr5,xr3,xr14,xr5); //xr5[15:00] += xr3[15:00] * xr14[15:00] (u0 - 128)*86 + (v0 - 128)*179
//xr5[31:16] += xr3[31:16] * xr14[31:16] (u1 - 128)*86 + (v1 - 128)*179
// expand to xr5 xr6
S32SFL(xr6,xr5,xr5,xr5,ptn3); //xr5[15:00] (u0 - 128)*86 + (v0 - 128)*179
//xr5[31:16] (u0 - 128)*86 + (v0 - 128)*179
//xr6[15:00] (u1 - 128)*86 + (v1 - 128)*179
//xr6[31:16] (u1 - 128)*86 + (v1 - 128)*179
Q16SAR(xr6,xr6,xr5,xr5,8); //xr5[15:00] = xr5[15:00]>>8 ((u0 - 128)*86 + (v0 - 128)*179)/256
//xr5[31:16] = xr5[31:16]>>8 ((u0 - 128)*86 + (v0 - 128)*179)/256
//xr6[15:00] = xr6[15:00]>>8 ((u1 - 128)*86 + (v1 - 128)*179)/256
//xr6[31:16] = xr6[31:16]>>8 ((u1 - 128)*86 + (v1 - 128)*179)/256
// expand y3,y2,y1,y0 to short
S32SFL(xr8,xr0,xr1,xr7,ptn0); //xr7[15:00] = xr1[07:00] y00
//xr7[31:16] = xr1[15:08] y01
//xr8[15:00] = xr1[23:16] y10
//xr8[31:16] = xr1[31:24] y11
Q16ACCM_SS(xr8, xr6, xr5 ,xr7); //xr7[15:00] -= xr5[15:00] y00 - ((u0 - 128)*86 + (v0 - 128)*179)/256
//xr7[31:16] -= xr5[31:16] y01 - ((u0 - 128)*86 + (v0 - 128)*179)/256
//xr8[15:00] -= xr6[15:00] y10 - ((u1 - 128)*86 + (v1 - 128)*179)/256
//xr8[31:16] -= xr6[31:16] y11 - ((u1 - 128)*86 + (v1 - 128)*179)/256
// (subtract variant: the G formula subtracts the chroma term from y)
// xr9 {g3,g2,g1,g0}
Q16SAT (xr9, xr8, xr7); //xr9[07:00] = xr7[15:00]
//xr9[15:08] = xr7[31:16]
//xr9[23:16] = xr8[15:00]
//xr9[31:24] = xr8[31:16]
// green4 {g3,g2,g1,g0}
S32STD(xr9,&green4,0);
/*****************************************RED****************/
S32LUI(xr14,175,ptn4); //xr14 = 0,175,0,175 (175 ≈ 351/2, paired with >>7 below)
D16MUL_WW(xr6,xr3,xr14,xr5); // xr5[31:00] = xr3[15:00] * xr14[15:00] (v0 - 128)*175
// xr6[31:00] = xr3[31:16] * xr14[31:16] (v1 - 128)*175
D32SAR(xr6,xr6,xr5,xr5,7); // xr5[31:00] = xr5[31:00] >>7 (v0 - 128)*175/128
// xr6[31:00] = xr6[31:00] >>7 (v1 - 128)*175/128
// compact to short
S32SFL(xr6,xr6,xr5,xr5,ptn3); // xr5[15:00] = (v0 - 128)*175/128
// xr5[31:16] = (v1 - 128)*175/128
S32SFL(xr6,xr5,xr5,xr5,ptn3); // xr5[15:00] = (v0 - 128)*175/128
// xr5[31:16] = (v0 - 128)*175/128
// xr6[15:00] = (v1 - 128)*175/128
// xr6[31:16] = (v1 - 128)*175/128
// expand y3,y2,y1,y0 to short
S32SFL(xr8,xr0,xr1,xr7,ptn0); //xr7[15:00] = xr1[07:00] y00
//xr7[31:16] = xr1[15:08] y01
//xr8[15:00] = xr1[23:16] y10
//xr8[31:16] = xr1[31:24] y11
Q16ACCM_AA(xr6, xr8, xr7 ,xr5); //xr5[15:0 ] += xr7[15:0 ] y00 + (v0 -128)*175/128
//xr5[31:16] += xr7[31:16] y01 + (v0 -128)*175/128
//xr6[15:0 ] += xr8[15:0 ] y10 + (v1 -128)*175/128
//xr6[31:16] += xr8[31:16] y11 + (v1 -128)*175/128
// xr9 {r3,r2,r1,r0}
Q16SAT (xr9, xr6, xr5); //xr9[07:00] = sat8(xr5[15:0 ]) y00 + (v0 -128)*175/128
//xr9[15:08] = sat8(xr5[31:16]) y01 + (v0 -128)*175/128
//xr9[23:16] = sat8(xr6[15:0 ]) y10 + (v1 -128)*175/128
//xr9[31:24] = sat8(xr6[31:16]) y11 + (v1 -128)*175/128
// red4 {r3,r2,r1,r0}
S32STD(xr9,&red4,0);
/*****************************************COMPACT TO RGB24****************/
// green
S32LDD(xr2,&green4,0); // xr2 {g3,g2,g1,g0}
S32SFL(xr3,xr2,xr9,xr2,ptn0); // xr3 {g3,r3,g2,r2}
// xr2 {g1,r1,g0,r0}
S32LDD(xr4,&blue4,0); // xr4 {b3,b2,b1,b0}
S32SFL(xr5,xr0,xr4,xr4,ptn0); // xr5 {0,b3,0,b2}
// xr4 {0,b1,0,b0}
S32SFL(xr5,xr5,xr3,xr3,ptn3); // xr5 {0,b3,g3,r3}
// xr3 {0,b2,g2,r2}
S32SFL(xr4,xr4,xr2,xr2,ptn3); // xr4 {0,b1,g1,r1}
// xr2 {0,b0,g0,r0}
D32SLL(xr3,xr3,xr2,xr2,8); // xr3 {b2,g2,r2,0} xr2 {b0,g0,r0,0}
// final xr8,xr7,xr6
S32ALNI(xr8,xr5,xr3,ptn1); // xr8 {b3,g3,r3,b2}
D32SARL(xr7,xr3,xr4,8); // xr7 {g2,r2,b1,g1} pack the shifted low halfwords of xr3 and xr4
S32ALNI(xr6,xr4,xr2,ptn3); // xr6 {r1,b0,g0,r0}
/*****************************************STORE****************/
S32STDV(xr6, dst, (col >> 1) * 3, 1); // 4 pixels -> 12 output bytes, so the byte offset is 3*col, not 2*col
S32STDV(xr7, dst, (col >> 1) * 3 + 2, 1);
S32STDV(xr8, dst, (col >> 1) * 3 + 4, 1);
}
src += srcStride;
dst += dstStride;
}
return 0;
}
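For completeness, here is a sketch of how the converter might be driven for one frame (the frame size and the malloc-based buffers are illustrative assumptions; in practice the source buffer would come from a capture device such as a V4L2 queue):

#include <stdint.h>
#include <stdlib.h>

#define WIDTH  640 /* must be a multiple of 4: the inner loop handles 4 pixels at a time */
#define HEIGHT 480

int main(void)
{
    size_t src_stride = WIDTH * 2; /* YUYV: 2 bytes per pixel  */
    size_t dst_stride = WIDTH * 3; /* RGB24: 3 bytes per pixel */
    uint8_t *yuyv  = malloc(src_stride * HEIGHT);
    uint8_t *rgb24 = malloc(dst_stride * HEIGHT);
    if (!yuyv || !rgb24)
        return 1;
    /* ... fill yuyv with a captured frame ... */
    YUYV_to_RGB888(yuyv, rgb24, WIDTH, HEIGHT, src_stride, dst_stride);
    /* ... hand rgb24 to the display or the encoder ... */
    free(yuyv);
    free(rgb24);
    return 0;
}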