基于AVX2的YUV420转RGB C++代码

第一次写博文,不好意思,写的应该不好,另外我只是C++的业余程序员,C++功底很一般,见谅!


我在做视频识别的工程中要用到YUV转RGB的功能。以前我用过基于MMX指令的代码,那是从网上找来的,我当时并不懂MMX,也不懂汇编,只知道MMX比普通代码要快,而且确实很快。现在了解到SSE2大约比MMX快一倍,AVX2大约比SSE2快一倍,所以想尝试用AVX2来实现YUV转RGB的功能。我在网上找了很多次,也没找到AVX2的现成代码,只发现libyuv库中有AVX2的实现,但测试发现它的性能并没有比MMX快4倍,只快一倍多一点。分析后发现,它里面还多用了一步SSSE3指令(pshufb)的转换:

// Converts one row of 4-byte ARGB pixels to 3-byte RGB24 pixels using the
// SSSE3 pshufb instruction (MSVC 32-bit inline assembly).
//
// The function is declared naked, so there is no compiler-generated prologue
// or epilogue: the three arguments are read directly from the stack at
// [esp + 4], [esp + 8] and [esp + 12], and the routine ends with an explicit ret.
//
// Each loop iteration consumes 64 source bytes (16 pixels) and produces 48
// destination bytes. The loop repeats while the remaining count is still
// positive after subtracting 16, so a width that is not a multiple of 16 makes
// the final iteration read and write a full 16-pixel group.
//
// kShuffleMaskARGBToRGB24 is defined outside this excerpt; from its use here it
// is the pshufb control that packs the 12 colour bytes of four pixels into the
// low 12 bytes of a register and zeroes the rest (the por merges below rely on
// those zeroes).
__declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
	__asm {
		mov       eax, [esp + 4]   // src_argb
		mov       edx, [esp + 8]   // dst_rgb
		mov       ecx, [esp + 12]  // width
		movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24

		convertloop :
		movdqu    xmm0, [eax]   // fetch 16 pixels of argb
			movdqu    xmm1, [eax + 16]
			movdqu    xmm2, [eax + 32]
			movdqu    xmm3, [eax + 48]
			lea       eax, [eax + 64]  // advance source by 16 pixels (64 bytes)
			pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
			pshufb    xmm1, xmm6
			pshufb    xmm2, xmm6
			pshufb    xmm3, xmm6
			movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
			psrldq    xmm1, 4      // 8 bytes from 1
			pslldq    xmm4, 12     // 4 bytes from 1 for 0
			movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
			por       xmm0, xmm4   // 4 bytes from 1 for 0
			pslldq    xmm5, 8      // 8 bytes from 2 for 1
			movdqu[edx], xmm0  // store 0
			por       xmm1, xmm5   // 8 bytes from 2 for 1
			psrldq    xmm2, 8      // 4 bytes from 2
			pslldq    xmm3, 4      // 12 bytes from 3 for 2
			por       xmm2, xmm3   // 12 bytes from 3 for 2
			movdqu[edx + 16], xmm1   // store 1
			movdqu[edx + 32], xmm2   // store 2
			lea       edx, [edx + 48]  // advance destination by 16 pixels (48 bytes)
			sub       ecx, 16          // 16 pixels done
			jg        convertloop      // loop while pixels remain
			ret                        // naked function: return explicitly
	}
}
这是将RGBA转成RGB的代码,这里降低了性能

下面是AVX2的YUV420转RGBA代码,libyuv里面的,汇编格式的宏

// Inline-assembly macro (MSVC syntax) that turns already-loaded YUV data into
// 8-bit B, G and R values using AVX2.
//
// Register contract as visible in the instructions below:
//   in : ymm0 - bytes that are multiply-added against the per-channel UV
//               coefficient tables (the interleaved U/V samples);
//        ymm4 - 16-bit values multiplied (unsigned, high half kept) by the
//               Y coefficient table.
//        How these registers are loaded is outside this excerpt.
//   out: ymm0 = B, ymm1 = G, ymm2 = R. Each is packed against itself with
//        unsigned saturation, so both 64-bit halves of every 128-bit lane hold
//        the same eight bytes.
//   clobbered: ymm3 (bias scratch) and ymm4.
//
// The first group of instructions (no "Step 1" comment in the source) computes
// bias - UV contribution for each channel; Step 2 adds the Y contribution with
// signed saturation and shifts right by 6 to drop the fixed-point fraction.
//
// KUVTOR, KUVTOG, KUVTOB, KUVBIASR, KUVBIASG, KUVBIASB and KYTORGB are defined
// outside this excerpt; presumably they are byte offsets of the corresponding
// tables inside the structure addressed by YuvConstants - confirm against libyuv.
#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
    __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
    __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
    __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
    __asm vpsubw     ymm2, ymm3, ymm2                                          \
    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
    __asm vpsubw     ymm1, ymm3, ymm1                                          \
    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
    __asm vpsubw     ymm0, ymm3, ymm0                                          \
    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
    __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
    __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \
    __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \
    __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \
    __asm vpsraw     ymm0, ymm0, 6                                             \
    __asm vpsraw     ymm1, ymm1, 6                                             \
    __asm vpsraw     ymm2, ymm2, 6                                             \
    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
  }

所以我尝试自己写AVX2的代码,但我没有汇编基础,很难直接在libyuv的基础上改,所以一直在网上找其它方法,

后来才知道C++有AVX2、SSE2等系列的非汇编调用的方法,但基本上要在VC2005以上的版本下才可以使用。

这种非汇编的方式我应该可以尝试,后来我在网上找到有人用SSE2的非汇编方式的YUV转RGB32的代码,我就是从这里开始升级到AVX2,下面是SSE2的代码:

/*
 * Convert a planar YUV 4:2:0 image to packed 32-bit pixels using SSE2.
 *
 * Every output pixel is written to memory as the bytes B, G, R, 0 (the
 * uint32_t value 0x00RRGGBB on a little-endian machine).  The arithmetic is
 * 6-bit fixed point:
 *
 *     R = (74*(Y-16) + 102*(V-128)) >> 6
 *     G = (74*(Y-16) -  25*(U-128) - 52*(V-128)) >> 6
 *     B = (74*(Y-16) + 129*(U-128)) >> 6
 *
 * with each channel clamped to [0, 255].  The 16-bit sums use saturating
 * adds/subtracts: 74*(Y-16) + 129*(U-128) exceeds INT16_MAX for bright,
 * strongly blue input (e.g. Y = 255, U = 255) and a wrapping add would turn
 * such a pixel's blue channel into 0 instead of 255.
 *
 * yp, up, vp  first byte of the Y, U and V planes
 * sy, suv     row stride in bytes of the Y plane and of the U/V planes
 * width       pixels per row; pixels are processed 16 at a time, so width
 *             must be a multiple of 16 (or every row, source and destination,
 *             must be padded up to one)
 * height      number of rows; if it is odd the last row is converted alone
 * rgb         destination buffer
 * srgb        destination row stride in pixels (uint32_t units), not bytes
 *
 * All loads and stores are unaligned, so no buffer needs 16-byte alignment.
 */
void yuv420_to_argb8888( uint8_t *yp, uint8_t *up, uint8_t *vp,
                         uint32_t sy, uint32_t suv,
                         int width, int height,
                         uint32_t *rgb, uint32_t srgb )
{
    const __m128i zero  = _mm_setzero_si128();
    const __m128i ysub  = _mm_set1_epi16( 16 );
    const __m128i uvsub = _mm_set1_epi16( 128 );
    const __m128i facy  = _mm_set1_epi16( 74 );     /* ~1.164 * 64 */
    const __m128i facrv = _mm_set1_epi16( 102 );    /* ~1.596 * 64 */
    const __m128i facgu = _mm_set1_epi16( 25 );     /* ~0.391 * 64 */
    const __m128i facgv = _mm_set1_epi16( 52 );     /* ~0.813 * 64 */
    const __m128i facbu = _mm_set1_epi16( 129 );    /* ~2.018 * 64 */
    int x, y, row;

    for( y = 0; y < height; y += 2 ) {
        /* One chroma row is shared by two luma rows. */
        const uint8_t *urow = up + suv*(y/2);
        const uint8_t *vrow = vp + suv*(y/2);

        for( x = 0; x < width; x += 16 ) {
            __m128i u, v, u_lo, u_hi, v_lo, v_hi;
            __m128i rc_lo, rc_hi, gc_lo, gc_hi, bc_lo, bc_hi;

            /* 8 chroma samples cover 16 pixels: widen to 16 bits, then
             * duplicate each sample so it lines up with its two pixels. */
            u = _mm_unpacklo_epi8( _mm_loadl_epi64( (const __m128i *)(urow + x/2) ), zero );
            v = _mm_unpacklo_epi8( _mm_loadl_epi64( (const __m128i *)(vrow + x/2) ), zero );
            u_lo = _mm_sub_epi16( _mm_unpacklo_epi16( u, u ), uvsub );
            u_hi = _mm_sub_epi16( _mm_unpackhi_epi16( u, u ), uvsub );
            v_lo = _mm_sub_epi16( _mm_unpacklo_epi16( v, v ), uvsub );
            v_hi = _mm_sub_epi16( _mm_unpackhi_epi16( v, v ), uvsub );

            /* Chroma contributions, identical for both luma rows.  The two
             * green terms are pre-summed; |25*u + 52*v| <= 9856 cannot overflow. */
            rc_lo = _mm_mullo_epi16( facrv, v_lo );
            rc_hi = _mm_mullo_epi16( facrv, v_hi );
            gc_lo = _mm_add_epi16( _mm_mullo_epi16( facgu, u_lo ), _mm_mullo_epi16( facgv, v_lo ) );
            gc_hi = _mm_add_epi16( _mm_mullo_epi16( facgu, u_hi ), _mm_mullo_epi16( facgv, v_hi ) );
            bc_lo = _mm_mullo_epi16( facbu, u_lo );
            bc_hi = _mm_mullo_epi16( facbu, u_hi );

            for( row = 0; row < 2 && y + row < height; row++ ) {
                const uint8_t *ysrc = yp + sy*(uint32_t)(y + row) + x;
                uint32_t *dst = rgb + srgb*(uint32_t)(y + row) + x;
                __m128i yv, y_lo, y_hi, r, g, b, bg_lo, bg_hi, r0_lo, r0_hi;

                yv   = _mm_loadu_si128( (const __m128i *)ysrc );
                y_lo = _mm_mullo_epi16( _mm_sub_epi16( _mm_unpacklo_epi8( yv, zero ), ysub ), facy );
                y_hi = _mm_mullo_epi16( _mm_sub_epi16( _mm_unpackhi_epi8( yv, zero ), ysub ), facy );

                /* Saturating sums, drop the 6 fraction bits, clamp to bytes. */
                r = _mm_packus_epi16( _mm_srai_epi16( _mm_adds_epi16( y_lo, rc_lo ), 6 ),
                                      _mm_srai_epi16( _mm_adds_epi16( y_hi, rc_hi ), 6 ) );
                g = _mm_packus_epi16( _mm_srai_epi16( _mm_subs_epi16( y_lo, gc_lo ), 6 ),
                                      _mm_srai_epi16( _mm_subs_epi16( y_hi, gc_hi ), 6 ) );
                b = _mm_packus_epi16( _mm_srai_epi16( _mm_adds_epi16( y_lo, bc_lo ), 6 ),
                                      _mm_srai_epi16( _mm_adds_epi16( y_hi, bc_hi ), 6 ) );

                /* Interleave the planar channels into B,G,R,0 quadruplets. */
                bg_lo = _mm_unpacklo_epi8( b, g );
                bg_hi = _mm_unpackhi_epi8( b, g );
                r0_lo = _mm_unpacklo_epi8( r, zero );
                r0_hi = _mm_unpackhi_epi8( r, zero );

                _mm_storeu_si128( (__m128i *)(dst +  0), _mm_unpacklo_epi16( bg_lo, r0_lo ) );
                _mm_storeu_si128( (__m128i *)(dst +  4), _mm_unpackhi_epi16( bg_lo, r0_lo ) );
                _mm_storeu_si128( (__m128i *)(dst +  8), _mm_unpacklo_epi16( bg_hi, r0_hi ) );
                _mm_storeu_si128( (__m128i *)(dst + 12), _mm_unpackhi_epi16( bg_hi, r0_hi ) );
            }
        }
    }
}

这个代码出自这个网址: 点击打开链接


我英语不好,是初中文化,分析这个代码还是非常吃力的,都是各种翻译

分析后认为他的方式还有些问题

1:精度不高,因为他用的常量系数只有8位(一个字节)的精度,像这些

facy  = _mm_set1_epi32( 0x004a004a );
2:只输出RGB32格式,看这里

_mm_store_si128( dstrgb128r0++, rgb0123 );

我原来用的MMX的定量就是short,像这个:mmw_mult_Y = 0x2568256825682568;

还有这个版本只输出RGB32,但识别用到的是RGB24,所以他这个代码必须还要改。


MMX版的代码我就不贴出了。

后来我在CSDN中找到一个大神,汇编很厉害,他做了很多计算性能的研究,下面是他的链接

YUV视频格式到RGB32格式转换的速度优化 上篇 

YUV视频格式到RGB32格式转换的速度优化 中篇 


我分析了好几天他的代码,但我很难直接用他的方式改,他里面用的汇编代码我还不能完全理解,

后来结合外国人的SSE2代码与这位大神的代码,我改出了下面这个版本(主体是SSE2,其中RGB32转RGB24用到了SSSE3的_mm_shuffle_epi8指令,所以需要CPU支持SSSE3),并能正确转换,看下面

/*
 * Convert a planar YUV 4:2:0 image to packed 24-bit pixels, 16 pixels per step.
 *
 * Despite the "_sse3" suffix the routine needs SSSE3, because
 * _mm_shuffle_epi8 is used to squeeze 32-bit pixels down to 24 bits.
 *
 * Output bytes are written in the order B, G, R for each pixel.  Arithmetic:
 * each (sample - offset) is shifted left by 3 and multiplied by a 16-bit
 * coefficient, keeping the high 16 bits of the product, i.e. the effective
 * coefficient is value/8192:
 *
 *     Y: 0x2543/8192 ~ 1.164     V->R: 0x3313/8192 ~ 1.596
 *     U->G: -3209/8192 ~ -0.392  V->G: -6660/8192 ~ -0.813
 *     U->B: 0x408D/8192 ~ 2.017
 *
 * The three channels are clamped to [0, 255] by the unsigned-saturating pack.
 *
 * yp, up, vp  first byte of the Y, U and V planes
 * sy, suv     row stride in bytes of the Y plane and of the U/V planes
 * width       pixels per row; must be a multiple of 16 (or all rows padded)
 * height      number of rows; if it is odd the last row is converted alone
 * rgb         destination buffer
 * srgb        destination row stride in bytes
 *
 * All loads and stores are unaligned.  The destination pointer advances in
 * 12-byte steps, so aligned stores cannot be used for it.
 */
#if defined(__GNUC__) && !defined(__SSSE3__)
/* Allow this function to be built in a translation unit compiled without
 * -mssse3; the caller is then responsible for checking CPU support. */
__attribute__((target("ssse3")))
#endif
void yuv420_to_rgb24_sse3(uint8_t *yp, uint8_t *up, uint8_t *vp, int sy, int suv, int width, int height,
	uint8_t *rgb, int srgb)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i ysub = _mm_set1_epi16(16);
	const __m128i uvsub = _mm_set1_epi16(128);
	const __m128i facy = _mm_set1_epi16(0x2543);
	const __m128i facrv = _mm_set1_epi16(0x3313);
	const __m128i facgu = _mm_set1_epi16(-3209);	/* 0xF377 as a signed 16-bit value */
	const __m128i facgv = _mm_set1_epi16(-6660);	/* 0xE5FC as a signed 16-bit value */
	const __m128i facbu = _mm_set1_epi16(0x408D);
	/* Keeps bytes B,G,R of each of the four pixels and drops the padding byte;
	 * an index with the high bit set (-128) yields zero. */
	const __m128i pack24 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
		-128, -128, -128, -128);

	for (int y = 0; y < height; y += 2) {
		/* One chroma row is shared by two luma rows. */
		const uint8_t *urow = up + suv * (y / 2);
		const uint8_t *vrow = vp + suv * (y / 2);

		for (int x = 0; x < width; x += 16) {
			/* 8 chroma samples cover 16 pixels: widen to 16 bits, duplicate
			 * each sample for its two pixels, remove the offset, pre-shift. */
			__m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(urow + x / 2)), zero);
			__m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(vrow + x / 2)), zero);
			__m128i u_lo = _mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi16(u, u), uvsub), 3);
			__m128i u_hi = _mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi16(u, u), uvsub), 3);
			__m128i v_lo = _mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi16(v, v), uvsub), 3);
			__m128i v_hi = _mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi16(v, v), uvsub), 3);

			/* Chroma contributions, identical for both luma rows. */
			__m128i rc_lo = _mm_mulhi_epi16(facrv, v_lo);
			__m128i rc_hi = _mm_mulhi_epi16(facrv, v_hi);
			__m128i gc_lo = _mm_add_epi16(_mm_mulhi_epi16(facgu, u_lo), _mm_mulhi_epi16(facgv, v_lo));
			__m128i gc_hi = _mm_add_epi16(_mm_mulhi_epi16(facgu, u_hi), _mm_mulhi_epi16(facgv, v_hi));
			__m128i bc_lo = _mm_mulhi_epi16(facbu, u_lo);
			__m128i bc_hi = _mm_mulhi_epi16(facbu, u_hi);

			for (int row = 0; row < 2 && y + row < height; row++) {
				const uint8_t *ysrc = yp + sy * (y + row) + x;
				uint8_t *dst = rgb + srgb * (y + row) + x * 3;

				__m128i yv = _mm_loadu_si128((const __m128i *)ysrc);
				__m128i y_lo = _mm_mulhi_epi16(
					_mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi8(yv, zero), ysub), 3), facy);
				__m128i y_hi = _mm_mulhi_epi16(
					_mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi8(yv, zero), ysub), 3), facy);

				/* Sum luma and chroma, clamp to bytes (pixels 0..15 in order). */
				__m128i r = _mm_packus_epi16(_mm_add_epi16(y_lo, rc_lo), _mm_add_epi16(y_hi, rc_hi));
				__m128i g = _mm_packus_epi16(_mm_add_epi16(y_lo, gc_lo), _mm_add_epi16(y_hi, gc_hi));
				__m128i b = _mm_packus_epi16(_mm_add_epi16(y_lo, bc_lo), _mm_add_epi16(y_hi, bc_hi));

				/* Interleave into B,G,R,0 quadruplets, four pixels per register. */
				__m128i bg_lo = _mm_unpacklo_epi8(b, g);
				__m128i bg_hi = _mm_unpackhi_epi8(b, g);
				__m128i r0_lo = _mm_unpacklo_epi8(r, zero);
				__m128i r0_hi = _mm_unpackhi_epi8(r, zero);
				__m128i px0 = _mm_shuffle_epi8(_mm_unpacklo_epi16(bg_lo, r0_lo), pack24);
				__m128i px1 = _mm_shuffle_epi8(_mm_unpackhi_epi16(bg_lo, r0_lo), pack24);
				__m128i px2 = _mm_shuffle_epi8(_mm_unpacklo_epi16(bg_hi, r0_hi), pack24);
				__m128i px3 = _mm_shuffle_epi8(_mm_unpackhi_epi16(bg_hi, r0_hi), pack24);

				/* Each register holds 12 payload bytes followed by 4 zero bytes.
				 * The first three 16-byte stores overlap: the zero tail of one
				 * is overwritten by the next.  The last chunk is copied as
				 * exactly 12 bytes so nothing is written past the 48-byte group. */
				_mm_storeu_si128((__m128i *)(dst), px0);
				_mm_storeu_si128((__m128i *)(dst + 12), px1);
				_mm_storeu_si128((__m128i *)(dst + 24), px2);
				memcpy(dst + 36, &px3, 12);
			}
		}
	}
}
这个输出的是RGB24,符合我的需求,其效率也是MMX的两倍。

接下来是AVX2版本的,这是花了好多天才成功实现的,看下面

// Conversion constants; each value is replicated into all sixteen 16-bit lanes.
// Offsets removed from the raw samples: 16 (0x0010) and 128 (0x0080).
static const __m256i ysub = _mm256_set1_epi16(0x0010);
static const __m256i uvsub = _mm256_set1_epi16(0x0080);
static const __m256i zero = _mm256_set1_epi16(0x0000);

// Fixed-point coefficients. Read as signed 16-bit numbers and divided by 8192
// (the scale that results when the other operand is pre-shifted left by 3 and
// the high 16 bits of the product are kept) they are approximately:
//   facy  0x2543 =  9539 ->  1.164
//   facrv 0x3313 = 13075 ->  1.596
//   facgu 0xF377 = -3209 -> -0.392
//   facgv 0xE5FC = -6660 -> -0.813
//   facbu 0x408D = 16525 ->  2.017
static const __m256i facy = _mm256_set1_epi16(0x2543);
static const __m256i facrv = _mm256_set1_epi16(0x3313);
static const __m256i facgu = _mm256_set1_epi16(0xF377);
static const __m256i facgv = _mm256_set1_epi16(0xE5FC);
static const __m256i facbu = _mm256_set1_epi16(0x408D);

// Byte-shuffle control for packing 4-byte pixels into 3-byte pixels.
// _mm256_set_epi8 lists the highest byte first, so within each 128-bit lane the
// selected source bytes are 0,1,2, 4,5,6, 8,9,10, 12,13,14 (every fourth byte
// is dropped) and the top four bytes are zeroed (index 128 has its high bit set).
static const __m256i maskrgb = _mm256_set_epi8(128u, 128u, 128u, 128u, 14u, 13u, 12u, 10u, 9u, 8u, 6u, 5u, 4u, 2u, 1u, 0u,
	128u, 128u, 128u, 128u, 14u, 13u, 12u, 10u, 9u, 8u, 6u, 5u, 4u, 2u, 1u, 0u);
// 32-bit element index vectors (for _mm256_permutevar8x32_epi32).
// _mm256_set_epi32 lists element 7 first, so for result elements 0..7 the
// source elements are:
//   offsetyuv0: 0,4,2,6,1,5,3,7
//   offsetyuv1: 0,2,4,6,1,3,5,7
//   offsetrgb : 0,1,2,4,5,6,3,7  (moves the two 12-byte payloads left by
//               maskrgb next to each other; the zeroed elements 3 and 7 go last)
static const __m256i offsetyuv0 = _mm256_set_epi32(7, 3, 5, 1, 6, 2, 4, 0);
static const __m256i offsetyuv1 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
static const __m256i offsetrgb = _mm256_set_epi32(7, 3, 6, 5, 4, 2, 1, 0);
 
/*
 * Convert a planar YUV 4:2:0 image to packed 24-bit pixels with AVX2,
 * 32 pixels per step.
 *
 * Output bytes are written in the order B, G, R for each pixel.  Arithmetic:
 * each (sample - offset) is shifted left by 3 and multiplied by a 16-bit
 * coefficient, keeping the high 16 bits of the product, i.e. the effective
 * coefficient is value/8192:
 *
 *     Y: 0x2543/8192 ~ 1.164     V->R: 0x3313/8192 ~ 1.596
 *     U->G: -3209/8192 ~ -0.392  V->G: -6660/8192 ~ -0.813
 *     U->B: 0x408D/8192 ~ 2.017
 *
 * The three channels are clamped to [0, 255] by the unsigned-saturating pack.
 *
 * yp, up, vp  first byte of the Y, U and V planes
 * sy, suv     row stride in bytes of the Y plane and of the U/V planes
 * width       pixels per row; must be a multiple of 32 (or all rows padded)
 * height      number of rows; if it is odd the last row is converted alone
 * rgb         destination buffer
 * srgb        destination row stride in bytes
 *
 * All loads and stores are unaligned: the chroma pointers advance by 16 bytes
 * and the destination by 24 bytes per step, so 32-byte-aligned accesses are
 * not possible.  Exactly 16 chroma bytes are read per step.
 *
 * The constants are kept local so the function does not depend on file-scope
 * objects that need run-time initialisation.
 */
#if defined(__GNUC__) && !defined(__AVX2__)
/* Allow this function to be built in a translation unit compiled without
 * -mavx2; the caller is then responsible for checking CPU support. */
__attribute__((target("avx2")))
#endif
void yuv420_to_rgb24(uint8_t *yp, uint8_t *up, uint8_t *vp, int sy, int suv, int width, int height,
	uint8_t *rgb, int srgb)
{
	const __m256i k_zero = _mm256_setzero_si256();
	const __m256i k_ysub = _mm256_set1_epi16(16);
	const __m256i k_uvsub = _mm256_set1_epi16(128);
	const __m256i k_facy = _mm256_set1_epi16(0x2543);
	const __m256i k_facrv = _mm256_set1_epi16(0x3313);
	const __m256i k_facgu = _mm256_set1_epi16(-3209);	/* 0xF377 as a signed 16-bit value */
	const __m256i k_facgv = _mm256_set1_epi16(-6660);	/* 0xE5FC as a signed 16-bit value */
	const __m256i k_facbu = _mm256_set1_epi16(0x408D);
	/* Per 128-bit lane: keep B,G,R of four pixels, drop the padding byte;
	 * an index with the high bit set (-128) yields zero. */
	const __m256i k_pack24 = _mm256_setr_epi8(
		0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -128, -128, -128, -128,
		0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -128, -128, -128, -128);
	/* Moves even 4-pixel groups to the low lane and odd groups to the high
	 * lane so the in-lane unpacks below emit pixels in natural order. */
	const __m256i k_split = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
	/* Joins the two 12-byte lane payloads into 24 contiguous bytes. */
	const __m256i k_join = _mm256_setr_epi32(0, 1, 2, 4, 5, 6, 3, 7);

	for (int y = 0; y < height; y += 2) {
		/* One chroma row is shared by two luma rows. */
		const uint8_t *urow = up + suv * (y >> 1);
		const uint8_t *vrow = vp + suv * (y >> 1);

		for (int x = 0; x < width; x += 32) {
			/* 16 chroma samples cover 32 pixels.  Zero-extend to 16 bits
			 * (samples 0-7 in the low lane, 8-15 in the high lane), then
			 * duplicate each sample for its two pixels. */
			__m256i u = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(urow + (x >> 1))));
			__m256i v = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(vrow + (x >> 1))));
			__m256i u_lo = _mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpacklo_epi16(u, u), k_uvsub), 3);
			__m256i u_hi = _mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpackhi_epi16(u, u), k_uvsub), 3);
			__m256i v_lo = _mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpacklo_epi16(v, v), k_uvsub), 3);
			__m256i v_hi = _mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpackhi_epi16(v, v), k_uvsub), 3);

			/* Chroma contributions, identical for both luma rows. */
			__m256i rc_lo = _mm256_mulhi_epi16(k_facrv, v_lo);
			__m256i rc_hi = _mm256_mulhi_epi16(k_facrv, v_hi);
			__m256i gc_lo = _mm256_add_epi16(_mm256_mulhi_epi16(k_facgu, u_lo), _mm256_mulhi_epi16(k_facgv, v_lo));
			__m256i gc_hi = _mm256_add_epi16(_mm256_mulhi_epi16(k_facgu, u_hi), _mm256_mulhi_epi16(k_facgv, v_hi));
			__m256i bc_lo = _mm256_mulhi_epi16(k_facbu, u_lo);
			__m256i bc_hi = _mm256_mulhi_epi16(k_facbu, u_hi);

			for (int row = 0; row < 2 && y + row < height; row++) {
				const uint8_t *ysrc = yp + sy * (y + row) + x;
				uint8_t *dst = rgb + srgb * (y + row) + x * 3;

				/* In-lane unpack: y_lo = pixels 0-7 | 16-23, y_hi = 8-15 | 24-31,
				 * which matches the layout of the chroma terms above. */
				__m256i yv = _mm256_loadu_si256((const __m256i *)ysrc);
				__m256i y_lo = _mm256_mulhi_epi16(
					_mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpacklo_epi8(yv, k_zero), k_ysub), 3), k_facy);
				__m256i y_hi = _mm256_mulhi_epi16(
					_mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpackhi_epi8(yv, k_zero), k_ysub), 3), k_facy);

				/* The in-lane pack restores natural pixel order 0..31. */
				__m256i r = _mm256_packus_epi16(_mm256_add_epi16(y_lo, rc_lo), _mm256_add_epi16(y_hi, rc_hi));
				__m256i g = _mm256_packus_epi16(_mm256_add_epi16(y_lo, gc_lo), _mm256_add_epi16(y_hi, gc_hi));
				__m256i b = _mm256_packus_epi16(_mm256_add_epi16(y_lo, bc_lo), _mm256_add_epi16(y_hi, bc_hi));

				/* Low lane: groups 0,2,4,6; high lane: groups 1,3,5,7
				 * (a group is four consecutive pixels). */
				r = _mm256_permutevar8x32_epi32(r, k_split);
				g = _mm256_permutevar8x32_epi32(g, k_split);
				b = _mm256_permutevar8x32_epi32(b, k_split);

				/* Interleave into B,G,R,0 quadruplets; px0..px3 each hold
				 * eight consecutive pixels (0-7, 8-15, 16-23, 24-31). */
				__m256i bg_lo = _mm256_unpacklo_epi8(b, g);
				__m256i bg_hi = _mm256_unpackhi_epi8(b, g);
				__m256i r0_lo = _mm256_unpacklo_epi8(r, k_zero);
				__m256i r0_hi = _mm256_unpackhi_epi8(r, k_zero);
				__m256i px0 = _mm256_unpacklo_epi16(bg_lo, r0_lo);
				__m256i px1 = _mm256_unpackhi_epi16(bg_lo, r0_lo);
				__m256i px2 = _mm256_unpacklo_epi16(bg_hi, r0_hi);
				__m256i px3 = _mm256_unpackhi_epi16(bg_hi, r0_hi);

				/* 32-bit pixels -> 24-bit pixels: 24 payload bytes + 8 zero bytes. */
				px0 = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(px0, k_pack24), k_join);
				px1 = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(px1, k_pack24), k_join);
				px2 = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(px2, k_pack24), k_join);
				px3 = _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(px3, k_pack24), k_join);

				/* The first three 32-byte stores overlap: the zero tail of one
				 * is overwritten by the next.  The last chunk is copied as
				 * exactly 24 bytes so nothing is written past the 96-byte group
				 * (a full store would clobber the start of the next row). */
				_mm256_storeu_si256((__m256i *)(dst), px0);
				_mm256_storeu_si256((__m256i *)(dst + 24), px1);
				_mm256_storeu_si256((__m256i *)(dst + 48), px2);
				memcpy(dst + 72, &px3, 24);
			}
		}
	}
}
原来AVX2不能像SSE2那样把寄存器宽度加倍就直接完成升级:AVX2的大多数整数指令(例如unpack、pack、shuffle)是按两个128位通道(lane)分别处理的,所以会出现数据顺序颠倒的问题,这是困扰我好几天的一大原因。我这个AVX2版本是直接输出RGB24的,由于输出RGB24和重新排列顺序都有额外开销,这个版本并不比SSE2那一版快一倍,只快50%。

使用AVX2要加上immintrin.h头文件

调用方式

yuv420_to_rgb24(yuv[0], yuv[1], yuv[2], WIDTH, WIDTH >> 1, WIDTH, HEIGHT, pRGBBuf, WIDTH * 3);
yuv[0]:Y地址

yuv[1]:U地址

yuv[2]:V地址

pRGBBuf:RGB缓冲地址

WIDTH:图像宽,SSE2中必须是16的倍数,AVX2中必须是32的倍数
HEIGHT:图像高,必须是2的倍数

libyuv中的调用并不限定宽高,那里代码里做了处理,但我所用的图像都是从摄像机里出的YUV420数据,目前的摄像机的图像尺寸都是32的倍数,所以我并没写未对齐的处理。


关于性能,我认为上面AVX2的版本再做优化,还可以提升50%或更高,如果哪位大神在这个版本上做了优化提升,希望指教指教。

你可能感兴趣的:(基于AVX2的YUV420转RGB C++代码)