YUV转RGB(NV21-ARGB)的Neon优化代码

说明

此代码仅限于 NV21 格式转 ARGB 格式。
NV21 格式中,Y 分量单独存储(每像素一个字节),其后是交错存储的 UV 分量;每 2x2 个像素共用一组 UV。

使用如下公式:
R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128);
B = Y + 1.772*(U-128);
浮点系数用 6 位定点精度处理,即 a*b ≈ (round(a*64) * b) >> 6;边角料部分的标量代码使用 7 位精度(系数乘以 128,再右移 7 位)。

代码

#ifdef HAS_NEON
#include <arm_neon.h>
#endif
/* Clamp an integer to the 0..255 range of one 8-bit colour channel. */
static inline unsigned char nv21_clamp_u8(int value)
{
    if (value < 0)
    {
        return 0;
    }
    if (value > 255)
    {
        return 255;
    }
    return (unsigned char)value;
}

/*
 * Store one pixel as the four bytes B, G, R, A (alpha fixed at 0xFF).
 * r_off / g_off / b_off are the chroma contributions already scaled back to
 * integer range; green is subtracted, red and blue are added.
 */
static inline void nv21_store_pixel(unsigned char* dst, int luma,
                                    int r_off, int g_off, int b_off)
{
    dst[0] = nv21_clamp_u8(luma + b_off);
    dst[1] = nv21_clamp_u8(luma - g_off);
    dst[2] = nv21_clamp_u8(luma + r_off);
    dst[3] = 0xFF;
}

/*
 * Convert a semi-planar YUV 4:2:0 image to 32-bit pixels.
 *
 *   yuv  : w*h luma bytes, followed by interleaved chroma rows located at
 *          yuv + w*h + w*(row/2); each chroma pair is shared by two
 *          horizontally adjacent pixels and by two consecutive rows.
 *   w, h : image width and height in pixels.
 *   rgba : output, w*h ints; every pixel is written as the bytes B, G, R, A
 *          in memory (0xAARRGGBB when read as a little-endian int).
 *
 * Conversion used:
 *   R = Y + 1.402*(V-128)
 *   G = Y - 0.34414*(U-128) - 0.71414*(V-128)
 *   B = Y + 1.772*(U-128)
 * The NEON path uses coefficients scaled by 64, the scalar path by 128, so
 * the two paths may differ by one count in the last bit.
 *
 * NOTE(review): the first byte of every chroma pair is treated as U and the
 * second as V. Standard NV21 stores V first (NV12 stores U first) -- confirm
 * which layout the callers actually pass.
 * NOTE(review): for odd w the chroma row stride is taken as w bytes and the
 * last pixel reads one chroma pair past that; confirm the buffer layout.
 */
void convertToRGBA(unsigned char* yuv, int w, int h, int* rgba)
{
    for (int row = 0; row < h; ++row)
    {
        unsigned char* dst = (unsigned char*)(rgba + w*row);
        unsigned char* y = yuv + w*row;
        unsigned char* uv = yuv + w*h + w*(row/2);
        int count = w;
#ifdef HAS_NEON
        /* ARMv7 (AArch32) NEON syntax: 16 pixels per loop iteration. */
        int c = count/16;
        /*
         * The zero-block case is tested in C. Branching on the flags after a
         * plain "mov" is unreliable because "mov" does not update them, and
         * entering the loop with a zero counter would wrap around.
         */
        if (c > 0)
        {
            __asm__ volatile(
                "vmov.i8 d3, #255\n\t"          /* alpha, pixels 0-7  */
                "vmov.i8 d7, #255\n\t"          /* alpha, pixels 8-15 */
                "vmov.i16 q11, #90\n\t"         /* 1.402   * 64 */
                "vmov.i16 q12, #128\n\t"        /* chroma bias  */
                "vmov.i16 q13, #22\n\t"         /* 0.34414 * 64 */
                "vmov.i16 q14, #46\n\t"         /* 0.71414 * 64 */
                "vmov.i16 q15, #113\n\t"        /* 1.772   * 64 */
                "1:\n\t"
                /* d8 = even-indexed Y, d9 = odd-indexed Y; lane k of both
                 * shares chroma pair k. */
                "vld2.8 {d8, d9}, [%[y]]!\n\t"
                /* De-interleave the chroma bytes: d0 = U, d1 = V. */
                "vld2.8 {d0, d1}, [%[uv]]!\n\t"
                "vmovl.u8 q5, d0\n\t"
                "vmovl.u8 q6, d1\n\t"
                "vsub.i16 q5, q5, q12\n\t"      /* U - 128 */
                "vsub.i16 q6, q6, q12\n\t"      /* V - 128 */

                /* Even pixels: q7 = R, q8 = G, q9 = B, all scaled by 64. */
                "vshll.u8 q7, d8, #6\n\t"
                "vshll.u8 q8, d8, #6\n\t"
                "vshll.u8 q9, d8, #6\n\t"
                "vmla.i16 q7, q6, q11\n\t"
                "vmls.i16 q8, q5, q13\n\t"
                "vmls.i16 q8, q6, q14\n\t"
                "vmla.i16 q9, q5, q15\n\t"
                "vshr.s16 q7, q7, #6\n\t"
                "vshr.s16 q8, q8, #6\n\t"
                "vshr.s16 q9, q9, #6\n\t"
                /* Clamp to 0..255 before narrowing. */
                "vmov.i16 q10, #0\n\t"
                "vmax.s16 q7, q7, q10\n\t"
                "vmax.s16 q8, q8, q10\n\t"
                "vmax.s16 q9, q9, q10\n\t"
                "vmov.i16 q10, #255\n\t"
                "vmin.u16 q7, q7, q10\n\t"
                "vmin.u16 q8, q8, q10\n\t"
                "vmin.u16 q9, q9, q10\n\t"
                "vmovn.i16 d2, q7\n\t"          /* R */
                "vmovn.i16 d1, q8\n\t"          /* G */
                "vmovn.i16 d0, q9\n\t"          /* B */

                /* Odd pixels, same arithmetic on d9. */
                "vshll.u8 q7, d9, #6\n\t"
                "vshll.u8 q8, d9, #6\n\t"
                "vshll.u8 q9, d9, #6\n\t"
                "vmla.i16 q7, q6, q11\n\t"
                "vmls.i16 q8, q5, q13\n\t"
                "vmls.i16 q8, q6, q14\n\t"
                "vmla.i16 q9, q5, q15\n\t"
                "vshr.s16 q7, q7, #6\n\t"
                "vshr.s16 q8, q8, #6\n\t"
                "vshr.s16 q9, q9, #6\n\t"
                "vmov.i16 q10, #0\n\t"
                "vmax.s16 q7, q7, q10\n\t"
                "vmax.s16 q8, q8, q10\n\t"
                "vmax.s16 q9, q9, q10\n\t"
                "vmov.i16 q10, #255\n\t"
                "vmin.u16 q7, q7, q10\n\t"
                "vmin.u16 q8, q8, q10\n\t"
                "vmin.u16 q9, q9, q10\n\t"
                "vmovn.i16 d6, q7\n\t"          /* R */
                "vmovn.i16 d5, q8\n\t"          /* G */
                "vmovn.i16 d4, q9\n\t"          /* B */

                /*
                 * Each channel is split into even pixels (d0/d1/d2) and odd
                 * pixels (d4/d5/d6). Zipping the pair restores pixel order:
                 * the first register gets pixels 0-7, the second 8-15.
                 */
                "vzip.8 d2, d6\n\t"
                "vzip.8 d1, d5\n\t"
                "vzip.8 d0, d4\n\t"

                /* Interleaved store: B, G, R, A for eight pixels at a time. */
                "vst4.8 {d0-d3}, [%[dst]]!\n\t"
                "vst4.8 {d4-d7}, [%[dst]]!\n\t"

                "subs %[c], %[c], #1\n\t"
                "bne 1b\n\t"
                : [dst] "+r" (dst), [y] "+r" (y), [uv] "+r" (uv), [c] "+r" (c)
                :
                : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
                );
        }
        count %= 16;
#endif

        /* Scalar tail (the whole row when NEON is unavailable). */
        while (count > 0)
        {
            /* One chroma pair serves up to two horizontally adjacent pixels. */
            int u = uv[0] - 128;
            int v = uv[1] - 128;
            int r_off = (179*v) >> 7;           /* 1.402 * V             */
            int g_off = (44*u + 91*v) >> 7;     /* 0.344 * U + 0.714 * V */
            int b_off = (227*u) >> 7;           /* 1.772 * U             */
            int pixels = (count > 1) ? 2 : 1;

            nv21_store_pixel(dst, y[0], r_off, g_off, b_off);
            if (pixels == 2)
            {
                nv21_store_pixel(dst + 4, y[1], r_off, g_off, b_off);
            }

            y += pixels;
            dst += 4*pixels;
            uv += 2;
            count -= pixels;
        }
    }
}

你可能感兴趣的:(android,SIMD,RGB,yuv,neon)