第一次写博文,不好意思,写的应该不好,另外我只是C++的业余程序员,C++功底很一般,见谅!
我在做视频识别的工程中要用到YUV转RGB的功能。以前我用过MMX指令的代码,那是网上找的代码,我当时并不懂MMX,也不懂汇编,只是知道MMX比普通代码要快,确实很快。现在知道SSE2比MMX要快一倍,AVX2比SSE2要快一倍,所以想尝试用AVX2来实现YUV转RGB的功能。在网上寻找多次,也没找到AVX2的现成代码,只找到libyuv库中有用AVX2来实现,但测试发现它的性能没有比MMX快4倍,只快一倍多一点。分析发现里面还用了SSSE3指令(下面代码中的pshufb,注意是SSSE3而不是SSE3):
// Packs one row of 4-byte ARGB pixels into 3-byte pixels using SSSE3 (pshufb).
//
// src_argb: source row, 4 bytes per pixel (read with unaligned loads).
// dst_rgb:  destination row, 3 bytes per pixel (written with unaligned stores).
// width:    number of pixels. Each loop iteration consumes 16 pixels
//           (64 source bytes) and produces 48 destination bytes; the loop runs
//           while the remaining count is > 0, so a width that is not a
//           multiple of 16 still processes a full final group of 16.
//
// The function is naked (no compiler-generated prologue/epilogue): arguments
// are read directly from the stack and the routine ends with its own ret.
// kShuffleMaskARGBToRGB24 is defined elsewhere; it is loaded with movdqa, so
// it must be 16-byte aligned. Registers xmm0-xmm6, eax, ecx and edx are used.
__declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // width
movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 // byte shuffle mask, kept in xmm6 for the whole loop
convertloop :
movdqu xmm0, [eax] // fetch 16 pixels of argb
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
lea eax, [eax + 64] // advance source by 16 pixels
pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm6
pshufb xmm2, xmm6
pshufb xmm3, xmm6
// Each register now holds 12 useful bytes. The shifts and ORs below merge
// the four 12-byte groups into three full 16-byte registers (48 bytes).
movdqa xmm4, xmm1 // 4 bytes from 1 for 0
psrldq xmm1, 4 // 8 bytes from 1
pslldq xmm4, 12 // 4 bytes from 1 for 0
movdqa xmm5, xmm2 // 8 bytes from 2 for 1
por xmm0, xmm4 // 4 bytes from 1 for 0
pslldq xmm5, 8 // 8 bytes from 2 for 1
movdqu[edx], xmm0 // store 0
por xmm1, xmm5 // 8 bytes from 2 for 1
psrldq xmm2, 8 // 4 bytes from 2
pslldq xmm3, 4 // 12 bytes from 3 for 2
por xmm2, xmm3 // 12 bytes from 3 for 2
movdqu[edx + 16], xmm1 // store 1
movdqu[edx + 32], xmm2 // store 2
lea edx, [edx + 48] // advance destination by 16 packed pixels
sub ecx, 16 // 16 pixels done
jg convertloop // repeat while pixels remain
ret
}
}
这是将32位ARGB转成24位RGB(RGB24)的代码,多出来的这一步格式转换降低了整体性能
下面是AVX2的YUV420转RGBA代码,libyuv里面的,汇编格式的宏
/*
 * YUVTORGB_AVX2(YuvConstants): MSVC inline-assembly fragment that turns
 * chroma and luma held in registers into B, G and R bytes.
 *
 * Register usage, as read from the instructions below:
 *   in : ymm0 = U/V bytes (unsigned operand of vpmaddubsw)
 *        ymm4 = Y values as 16-bit words (operand of vpmulhuw)
 *   out: ymm0 = B, ymm1 = G, ymm2 = R, packed to bytes with unsigned
 *        saturation (each vpackuswb uses the same register for both sources,
 *        so the bytes are repeated in both halves of each 128-bit lane)
 *   scratch: ymm3 (bias values); ymm4 is overwritten with the scaled Y.
 *
 * YuvConstants is the base address of a constant table; KUVTOR, KUVTOG,
 * KUVTOB, KUVBIASR, KUVBIASG, KUVBIASB and KYTORGB are offsets into it that
 * are defined elsewhere.
 *
 * Step 1: multiply-accumulate the U/V pairs by per-channel coefficients and
 *         subtract the result from the per-channel bias.
 */
#define YUVTORGB_AVX2(YuvConstants) __asm { \
__asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
__asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
__asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
__asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] /* R bias */ \
__asm vpsubw ymm2, ymm3, ymm2 /* R = bias - UV term */ \
__asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] /* G bias */ \
__asm vpsubw ymm1, ymm3, ymm1 /* G = bias - UV term */ \
__asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] /* B bias */ \
__asm vpsubw ymm0, ymm3, ymm0 /* B = bias - UV term */ \
/* Step 2: Find Y contribution to 16 R,G,B values */ \
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
__asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
__asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
__asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
__asm vpsraw ymm0, ymm0, 6 /* drop the 6 fractional bits */ \
__asm vpsraw ymm1, ymm1, 6 \
__asm vpsraw ymm2, ymm2, 6 \
__asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
__asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
__asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
}
后来才知道C++有AVX2、SSE2等系列的非汇编调用的方法,但基本上要在VC2005以上的版本下才可以使用。
这种非汇编的方式我应该可以尝试,后来我在网上找到有人用SSE2的非汇编方式的YUV转RGB32的代码,我就是从这里开始升级到AVX2,下面是SSE2的代码:
/*
 * yuv420_to_argb8888 - convert a planar YUV 4:2:0 image to 32-bit pixels
 * using SSE2 intrinsics.
 *
 * yp, up, vp : Y, U and V planes (U and V are subsampled 2x2).
 * sy, suv    : stride of the Y plane and of the U/V planes, in bytes.
 * width      : pixels per row. Pixels are converted in groups of 16, so the
 *              caller must provide rows that are a multiple of 16 pixels (a
 *              partial final group is converted as a full group).
 * height     : number of rows. Rows are converted in pairs that share one
 *              chroma row, so it must be even.
 * rgb, srgb  : destination and its stride in 32-bit pixels. Each pixel is
 *              written to memory as the bytes B, G, R, 0.
 *
 * The arithmetic is 16-bit fixed point with 6 fractional bits:
 *   R = (74*(Y-16) + 102*(V-128))               >> 6
 *   G = (74*(Y-16) -  25*(U-128) - 52*(V-128))  >> 6
 *   B = (74*(Y-16) + 129*(U-128))               >> 6
 * followed by clamping to 0..255.
 *
 * The sums are formed with saturating adds/subtracts: 74*(Y-16) + 129*(U-128)
 * exceeds 32767 for very bright pixels with large U (e.g. Y = U = 255), and a
 * wrapping add would turn such a pixel's blue channel into 0 instead of 255.
 *
 * Loads and stores use the unaligned forms, so the buffers do not have to be
 * 16-byte aligned.
 */
void yuv420_to_argb8888( uint8_t *yp, uint8_t *up, uint8_t *vp,
                         uint32_t sy, uint32_t suv,
                         int width, int height,
                         uint32_t *rgb, uint32_t srgb )
{
    const __m128i zero  = _mm_setzero_si128();
    const __m128i ysub  = _mm_set1_epi16( 16 );
    const __m128i uvsub = _mm_set1_epi16( 128 );
    const __m128i facy  = _mm_set1_epi16( 74 );
    const __m128i facrv = _mm_set1_epi16( 102 );
    const __m128i facgu = _mm_set1_epi16( 25 );
    const __m128i facgv = _mm_set1_epi16( 52 );
    const __m128i facbu = _mm_set1_epi16( 129 );
    int x, y, row;

    for( y = 0; y < height; y += 2 ) {
        const uint8_t *usrc = up + suv*(y/2);
        const uint8_t *vsrc = vp + suv*(y/2);

        for( x = 0; x < width; x += 16 ) {
            /* 8 U and 8 V samples, widened to 16 bits. */
            const __m128i u = _mm_unpacklo_epi8(
                _mm_loadl_epi64( (const __m128i *)(usrc + x/2) ), zero );
            const __m128i v = _mm_unpacklo_epi8(
                _mm_loadl_epi64( (const __m128i *)(vsrc + x/2) ), zero );

            /* Duplicate every chroma sample so it lines up with two luma
             * samples: *_lo covers pixels 0..7, *_hi covers pixels 8..15. */
            const __m128i u_lo = _mm_sub_epi16( _mm_unpacklo_epi16( u, u ), uvsub );
            const __m128i u_hi = _mm_sub_epi16( _mm_unpackhi_epi16( u, u ), uvsub );
            const __m128i v_lo = _mm_sub_epi16( _mm_unpacklo_epi16( v, v ), uvsub );
            const __m128i v_hi = _mm_sub_epi16( _mm_unpackhi_epi16( v, v ), uvsub );

            /* Chroma contributions, shared by both rows of the pair. */
            const __m128i rv_lo  = _mm_mullo_epi16( facrv, v_lo );
            const __m128i rv_hi  = _mm_mullo_epi16( facrv, v_hi );
            const __m128i guv_lo = _mm_add_epi16( _mm_mullo_epi16( facgu, u_lo ),
                                                  _mm_mullo_epi16( facgv, v_lo ) );
            const __m128i guv_hi = _mm_add_epi16( _mm_mullo_epi16( facgu, u_hi ),
                                                  _mm_mullo_epi16( facgv, v_hi ) );
            const __m128i bu_lo  = _mm_mullo_epi16( facbu, u_lo );
            const __m128i bu_hi  = _mm_mullo_epi16( facbu, u_hi );

            for( row = 0; row < 2; row++ ) {
                const uint8_t *ysrc = yp + sy*(y + row) + x;
                uint32_t *dst = rgb + srgb*(y + row) + x;

                /* Luma contribution for 16 pixels of this row. */
                const __m128i yv   = _mm_loadu_si128( (const __m128i *)ysrc );
                const __m128i y_lo = _mm_mullo_epi16(
                    _mm_sub_epi16( _mm_unpacklo_epi8( yv, zero ), ysub ), facy );
                const __m128i y_hi = _mm_mullo_epi16(
                    _mm_sub_epi16( _mm_unpackhi_epi8( yv, zero ), ysub ), facy );

                /* Saturating sums, drop the fraction, clamp to bytes. */
                const __m128i r = _mm_packus_epi16(
                    _mm_srai_epi16( _mm_adds_epi16( y_lo, rv_lo ), 6 ),
                    _mm_srai_epi16( _mm_adds_epi16( y_hi, rv_hi ), 6 ) );
                const __m128i g = _mm_packus_epi16(
                    _mm_srai_epi16( _mm_subs_epi16( y_lo, guv_lo ), 6 ),
                    _mm_srai_epi16( _mm_subs_epi16( y_hi, guv_hi ), 6 ) );
                const __m128i b = _mm_packus_epi16(
                    _mm_srai_epi16( _mm_adds_epi16( y_lo, bu_lo ), 6 ),
                    _mm_srai_epi16( _mm_adds_epi16( y_hi, bu_hi ), 6 ) );

                /* Interleave the planes into B,G,R,0 quadruplets. */
                const __m128i bg_lo = _mm_unpacklo_epi8( b, g );
                const __m128i bg_hi = _mm_unpackhi_epi8( b, g );
                const __m128i r0_lo = _mm_unpacklo_epi8( r, zero );
                const __m128i r0_hi = _mm_unpackhi_epi8( r, zero );

                _mm_storeu_si128( (__m128i *)(dst + 0),  _mm_unpacklo_epi16( bg_lo, r0_lo ) );
                _mm_storeu_si128( (__m128i *)(dst + 4),  _mm_unpackhi_epi16( bg_lo, r0_lo ) );
                _mm_storeu_si128( (__m128i *)(dst + 8),  _mm_unpacklo_epi16( bg_hi, r0_hi ) );
                _mm_storeu_si128( (__m128i *)(dst + 12), _mm_unpackhi_epi16( bg_hi, r0_hi ) );
            }
        }
    }
}
我英语不好,是初中文化,分析这个代码还是非常吃力的,都是各种翻译
分析后认为他的方式还有些问题
1:精度不高,因为他用的定点系数(常量)只有字节大小,小数部分只有6位,像这些
facy = _mm_set1_epi32( 0x004a004a );
2:只输出RGB32格式,看这里
_mm_store_si128( dstrgb128r0++, rgb0123 );
我原来用的MMX的定量就是short,像这个:mmw_mult_Y = 0x2568256825682568;
还有这个版本只输出RGB32,但识别用到的是RGB24,所以他这个代码必须还要改。
MMX版的代码我就不贴出了。
后来我在CSDN中找到一个大神,汇编很厉害,他做了很多计算性能的研究,下面是他的链接
YUV视频格式到RGB32格式转换的速度优化 上篇
YUV视频格式到RGB32格式转换的速度优化 中篇
我分析了好几天他的代码,但我很难直接用他的方式改,他里面用的汇编代码我还不能完全理解,
后来结合外国人的SSE2代码与这位大神的代码,我改出了下面这个版本,并且能正确转换。它主体是SSE2,但RGB32转RGB24那一步用到了_mm_shuffle_epi8,这是SSSE3指令(头文件tmmintrin.h),所以CPU需要支持SSSE3,看下面
/*
 * On GCC/Clang this lets the one function below be compiled for SSSE3 without
 * enabling -mssse3 for the whole file. Other compilers see an empty macro.
 * The caller is still responsible for running it only on SSSE3-capable CPUs.
 */
#ifndef YUV420_SSSE3_TARGET
#if defined(__GNUC__) || defined(__clang__)
#define YUV420_SSSE3_TARGET __attribute__((target("ssse3")))
#else
#define YUV420_SSSE3_TARGET
#endif
#endif

/*
 * yuv420_to_rgb24_sse3 - convert a planar YUV 4:2:0 image to packed 24-bit
 * pixels using SSE2 arithmetic and the SSSE3 byte shuffle.
 *
 * yp, up, vp : Y, U and V planes (U and V are subsampled 2x2).
 * sy, suv    : stride of the Y plane and of the U/V planes, in bytes.
 * width      : pixels per row; converted in groups of 16, so it must be a
 *              multiple of 16 (a partial final group is converted as a full
 *              group).
 * height     : number of rows; converted in pairs that share one chroma row,
 *              so it must be even.
 * rgb, srgb  : destination and its stride in bytes. Each pixel is written as
 *              the three bytes B, G, R.
 *
 * Fixed-point scheme: inputs are pre-shifted left by 3 and multiplied with
 * _mm_mulhi_epi16 (which keeps the high 16 bits), so each coefficient is
 * effectively divided by 8192:
 *   0x2543/8192 = 1.164 (Y)     0x3313/8192 = 1.596 (V -> R)
 *   -3209/8192 = -0.392 (U -> G)  -6660/8192 = -0.813 (V -> G)
 *   0x408D/8192 = 2.017 (U -> B)
 *
 * Every group of 16 pixels is emitted as exactly three unaligned 16-byte
 * stores (48 bytes), so nothing is written outside the converted pixels and
 * neither source nor destination needs to be 16-byte aligned.
 */
YUV420_SSSE3_TARGET
void yuv420_to_rgb24_sse3(uint8_t *yp, uint8_t *up, uint8_t *vp, int sy, int suv, int width, int height,
	uint8_t *rgb, int srgb)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i ysub = _mm_set1_epi16(16);
	const __m128i uvsub = _mm_set1_epi16(128);
	const __m128i facy = _mm_set1_epi16(0x2543);
	const __m128i facrv = _mm_set1_epi16(0x3313);
	const __m128i facgu = _mm_set1_epi16(-3209);	/* 0xF377 as a signed 16-bit value */
	const __m128i facgv = _mm_set1_epi16(-6660);	/* 0xE5FC as a signed 16-bit value */
	const __m128i facbu = _mm_set1_epi16(0x408D);
	/* Keeps bytes 0-2 of every 4-byte pixel and zeroes the top 4 bytes. */
	const __m128i maskrgb = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
		-128, -128, -128, -128);

	for (int y = 0; y < height; y += 2) {
		const uint8_t *usrc = up + suv * (y / 2);
		const uint8_t *vsrc = vp + suv * (y / 2);

		for (int x = 0; x < width; x += 16) {
			/* 8 U and 8 V samples, widened to 16 bits. */
			const __m128i u = _mm_unpacklo_epi8(
				_mm_loadl_epi64((const __m128i *)(usrc + x / 2)), zero);
			const __m128i v = _mm_unpacklo_epi8(
				_mm_loadl_epi64((const __m128i *)(vsrc + x / 2)), zero);

			/* Duplicate each chroma sample for its two pixels, remove the
			 * 128 offset and pre-scale by 8. *_lo: pixels 0..7, *_hi: 8..15. */
			const __m128i u_lo = _mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi16(u, u), uvsub), 3);
			const __m128i u_hi = _mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi16(u, u), uvsub), 3);
			const __m128i v_lo = _mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi16(v, v), uvsub), 3);
			const __m128i v_hi = _mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi16(v, v), uvsub), 3);

			/* Chroma contributions, shared by both rows of the pair. */
			const __m128i rv_lo = _mm_mulhi_epi16(facrv, v_lo);
			const __m128i rv_hi = _mm_mulhi_epi16(facrv, v_hi);
			const __m128i guv_lo = _mm_add_epi16(_mm_mulhi_epi16(facgu, u_lo),
				_mm_mulhi_epi16(facgv, v_lo));
			const __m128i guv_hi = _mm_add_epi16(_mm_mulhi_epi16(facgu, u_hi),
				_mm_mulhi_epi16(facgv, v_hi));
			const __m128i bu_lo = _mm_mulhi_epi16(facbu, u_lo);
			const __m128i bu_hi = _mm_mulhi_epi16(facbu, u_hi);

			for (int row = 0; row < 2; row++) {
				const uint8_t *ysrc = yp + sy * (y + row) + x;
				uint8_t *dst = rgb + srgb * (y + row) + x * 3;

				/* Luma contribution for 16 pixels of this row. */
				const __m128i yv = _mm_loadu_si128((const __m128i *)ysrc);
				const __m128i y_lo = _mm_mulhi_epi16(
					_mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi8(yv, zero), ysub), 3), facy);
				const __m128i y_hi = _mm_mulhi_epi16(
					_mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi8(yv, zero), ysub), 3), facy);

				/* Sum and clamp to bytes: 16 R, 16 G and 16 B values. */
				const __m128i r = _mm_packus_epi16(_mm_add_epi16(y_lo, rv_lo),
					_mm_add_epi16(y_hi, rv_hi));
				const __m128i g = _mm_packus_epi16(_mm_add_epi16(y_lo, guv_lo),
					_mm_add_epi16(y_hi, guv_hi));
				const __m128i b = _mm_packus_epi16(_mm_add_epi16(y_lo, bu_lo),
					_mm_add_epi16(y_hi, bu_hi));

				/* Interleave to B,G,R,0 and squeeze out the zero byte: each
				 * p* holds 4 pixels as 12 bytes followed by 4 zero bytes. */
				const __m128i bg_lo = _mm_unpacklo_epi8(b, g);
				const __m128i bg_hi = _mm_unpackhi_epi8(b, g);
				const __m128i r0_lo = _mm_unpacklo_epi8(r, zero);
				const __m128i r0_hi = _mm_unpackhi_epi8(r, zero);
				const __m128i p0 = _mm_shuffle_epi8(_mm_unpacklo_epi16(bg_lo, r0_lo), maskrgb);
				const __m128i p1 = _mm_shuffle_epi8(_mm_unpackhi_epi16(bg_lo, r0_lo), maskrgb);
				const __m128i p2 = _mm_shuffle_epi8(_mm_unpacklo_epi16(bg_hi, r0_hi), maskrgb);
				const __m128i p3 = _mm_shuffle_epi8(_mm_unpackhi_epi16(bg_hi, r0_hi), maskrgb);

				/* Merge 4 x 12 bytes into 3 x 16 bytes and store them. */
				_mm_storeu_si128((__m128i *)(dst + 0),
					_mm_or_si128(p0, _mm_slli_si128(p1, 12)));
				_mm_storeu_si128((__m128i *)(dst + 16),
					_mm_or_si128(_mm_srli_si128(p1, 4), _mm_slli_si128(p2, 8)));
				_mm_storeu_si128((__m128i *)(dst + 32),
					_mm_or_si128(_mm_srli_si128(p2, 8), _mm_slli_si128(p3, 4)));
			}
		}
	}
}
这个输出的是RGB24,符合我的需求,其效率也是MMX的两倍。
接下来是AVX2版本的,这是花了好多天才成功实现的,看下面
/*
 * On GCC/Clang this lets the one function below be compiled for AVX2 without
 * enabling -mavx2 for the whole file. Other compilers see an empty macro.
 * The caller is still responsible for running it only on AVX2-capable CPUs.
 */
#ifndef YUV420_AVX2_TARGET
#if defined(__GNUC__) || defined(__clang__)
#define YUV420_AVX2_TARGET __attribute__((target("avx2")))
#else
#define YUV420_AVX2_TARGET
#endif
#endif

/*
 * yuv420_to_rgb24 - convert a planar YUV 4:2:0 image to packed 24-bit pixels
 * using AVX2 intrinsics (declared in immintrin.h).
 *
 * yp, up, vp : Y, U and V planes (U and V are subsampled 2x2).
 * sy, suv    : stride of the Y plane and of the U/V planes, in bytes.
 * width      : pixels per row; converted in groups of 32, so it must be a
 *              multiple of 32 (a partial final group is converted as a full
 *              group).
 * height     : number of rows; converted in pairs that share one chroma row,
 *              so it must be even.
 * rgb, srgb  : destination and its stride in bytes. Each pixel is written as
 *              the three bytes B, G, R.
 *
 * Fixed-point scheme: inputs are pre-shifted left by 3 and multiplied with
 * _mm256_mulhi_epi16 (which keeps the high 16 bits), so each coefficient is
 * effectively divided by 8192:
 *   0x2543/8192 = 1.164 (Y)     0x3313/8192 = 1.596 (V -> R)
 *   -3209/8192 = -0.392 (U -> G)  -6660/8192 = -0.813 (V -> G)
 *   0x408D/8192 = 2.017 (U -> B)
 *
 * Memory access: exactly 16 U bytes, 16 V bytes and 32 Y bytes are read per
 * group, and exactly 96 bytes are written, all with unaligned loads/stores.
 * No buffer has to be 32-byte aligned and nothing outside the converted
 * pixels is read or written.
 *
 * Lane handling: the AVX2 unpack/pack/shuffle instructions work on the two
 * 128-bit lanes independently. Here the low lane always carries pixels 0..15
 * of a group and the high lane pixels 16..31; the lanes are only recombined
 * by the three _mm256_permute2x128_si256 calls right before the stores.
 */
YUV420_AVX2_TARGET
void yuv420_to_rgb24(uint8_t *yp, uint8_t *up, uint8_t *vp, int sy, int suv, int width, int height,
	uint8_t *rgb, int srgb)
{
	/* Conversion constants. */
	const __m256i zero = _mm256_setzero_si256();
	const __m256i ysub = _mm256_set1_epi16(16);
	const __m256i uvsub = _mm256_set1_epi16(128);
	const __m256i facy = _mm256_set1_epi16(0x2543);
	const __m256i facrv = _mm256_set1_epi16(0x3313);
	const __m256i facgu = _mm256_set1_epi16(-3209);	/* 0xF377 as a signed 16-bit value */
	const __m256i facgv = _mm256_set1_epi16(-6660);	/* 0xE5FC as a signed 16-bit value */
	const __m256i facbu = _mm256_set1_epi16(0x408D);
	/* Per lane: keep bytes 0-2 of every 4-byte pixel, zero the top 4 bytes. */
	const __m256i maskrgb = _mm256_setr_epi8(
		0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -128, -128, -128, -128,
		0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -128, -128, -128, -128);

	for (int y = 0; y < height; y += 2) {
		const uint8_t *usrc = up + suv * (y >> 1);
		const uint8_t *vsrc = vp + suv * (y >> 1);

		for (int x = 0; x < width; x += 32) {
			/* 16 U and 16 V samples, zero-extended to 16 bits:
			 * low lane = samples 0..7, high lane = samples 8..15. */
			const __m256i u = _mm256_cvtepu8_epi16(
				_mm_loadu_si128((const __m128i *)(usrc + (x >> 1))));
			const __m256i v = _mm256_cvtepu8_epi16(
				_mm_loadu_si128((const __m128i *)(vsrc + (x >> 1))));

			/* Duplicate each chroma sample for its two pixels, remove the
			 * 128 offset and pre-scale by 8.
			 * *_lo: pixels 0..7 | 16..23, *_hi: pixels 8..15 | 24..31,
			 * which is the same layout the luma unpacks produce below. */
			const __m256i u_lo = _mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpacklo_epi16(u, u), uvsub), 3);
			const __m256i u_hi = _mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpackhi_epi16(u, u), uvsub), 3);
			const __m256i v_lo = _mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpacklo_epi16(v, v), uvsub), 3);
			const __m256i v_hi = _mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpackhi_epi16(v, v), uvsub), 3);

			/* Chroma contributions, shared by both rows of the pair. */
			const __m256i rv_lo = _mm256_mulhi_epi16(facrv, v_lo);
			const __m256i rv_hi = _mm256_mulhi_epi16(facrv, v_hi);
			const __m256i guv_lo = _mm256_add_epi16(_mm256_mulhi_epi16(facgu, u_lo),
				_mm256_mulhi_epi16(facgv, v_lo));
			const __m256i guv_hi = _mm256_add_epi16(_mm256_mulhi_epi16(facgu, u_hi),
				_mm256_mulhi_epi16(facgv, v_hi));
			const __m256i bu_lo = _mm256_mulhi_epi16(facbu, u_lo);
			const __m256i bu_hi = _mm256_mulhi_epi16(facbu, u_hi);

			for (int row = 0; row < 2; row++) {
				const uint8_t *ysrc = yp + sy * (y + row) + x;
				uint8_t *dst = rgb + srgb * (y + row) + x * 3;

				/* Luma contribution for 32 pixels of this row. */
				const __m256i yv = _mm256_loadu_si256((const __m256i *)ysrc);
				const __m256i y_lo = _mm256_mulhi_epi16(
					_mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpacklo_epi8(yv, zero), ysub), 3), facy);
				const __m256i y_hi = _mm256_mulhi_epi16(
					_mm256_slli_epi16(_mm256_sub_epi16(_mm256_unpackhi_epi8(yv, zero), ysub), 3), facy);

				/* Sum and clamp to bytes. The per-lane pack undoes the
				 * per-lane unpack, so r/g/b hold pixels 0..31 in order. */
				const __m256i r = _mm256_packus_epi16(_mm256_add_epi16(y_lo, rv_lo),
					_mm256_add_epi16(y_hi, rv_hi));
				const __m256i g = _mm256_packus_epi16(_mm256_add_epi16(y_lo, guv_lo),
					_mm256_add_epi16(y_hi, guv_hi));
				const __m256i b = _mm256_packus_epi16(_mm256_add_epi16(y_lo, bu_lo),
					_mm256_add_epi16(y_hi, bu_hi));

				/* Interleave to B,G,R,0 and squeeze out the zero byte. Each
				 * lane of p0..p3 holds 4 pixels as 12 bytes + 4 zero bytes:
				 * p0 = 0..3 | 16..19, p1 = 4..7 | 20..23,
				 * p2 = 8..11 | 24..27, p3 = 12..15 | 28..31. */
				const __m256i bg_lo = _mm256_unpacklo_epi8(b, g);
				const __m256i bg_hi = _mm256_unpackhi_epi8(b, g);
				const __m256i r0_lo = _mm256_unpacklo_epi8(r, zero);
				const __m256i r0_hi = _mm256_unpackhi_epi8(r, zero);
				const __m256i p0 = _mm256_shuffle_epi8(_mm256_unpacklo_epi16(bg_lo, r0_lo), maskrgb);
				const __m256i p1 = _mm256_shuffle_epi8(_mm256_unpackhi_epi16(bg_lo, r0_lo), maskrgb);
				const __m256i p2 = _mm256_shuffle_epi8(_mm256_unpacklo_epi16(bg_hi, r0_hi), maskrgb);
				const __m256i p3 = _mm256_shuffle_epi8(_mm256_unpackhi_epi16(bg_hi, r0_hi), maskrgb);

				/* Within each lane merge 4 x 12 bytes into 3 x 16 bytes:
				 * low lanes of o0,o1,o2 = output bytes 0..47 (pixels 0..15),
				 * high lanes of o0,o1,o2 = output bytes 48..95 (pixels 16..31). */
				const __m256i o0 = _mm256_or_si256(p0, _mm256_slli_si256(p1, 12));
				const __m256i o1 = _mm256_or_si256(_mm256_srli_si256(p1, 4), _mm256_slli_si256(p2, 8));
				const __m256i o2 = _mm256_or_si256(_mm256_srli_si256(p2, 8), _mm256_slli_si256(p3, 4));

				/* Put the six 16-byte pieces into memory order and store. */
				_mm256_storeu_si256((__m256i *)(dst + 0),
					_mm256_permute2x128_si256(o0, o1, 0x20));	/* o0.low,  o1.low  */
				_mm256_storeu_si256((__m256i *)(dst + 32),
					_mm256_permute2x128_si256(o2, o0, 0x30));	/* o2.low,  o0.high */
				_mm256_storeu_si256((__m256i *)(dst + 64),
					_mm256_permute2x128_si256(o1, o2, 0x31));	/* o1.high, o2.high */
			}
		}
	}
}
原来AVX2不能像SSE2那样直接升级:AVX2的大多数整数指令(unpack、pack、shuffle等)是在两个128位通道(lane)内各自独立处理的,并不是把256位当成一个整体,所以直接照搬SSE2的写法会出现像素顺序颠倒的问题,需要额外的跨通道重排指令来纠正,这是困扰我好几天的一大原因。我这个AVX2版本是直接输出RGB24的,由于输出RGB24和纠正顺序都有额外开销,这个版本并不比SSE2那一版快一倍,只快50%。
使用AVX2要加上immintrin.h头文件
调用方式
yuv420_to_rgb24(yuv[0], yuv[1], yuv[2], WIDTH, WIDTH >> 1, WIDTH, HEIGHT, pRGBBuf, WIDTH * 3);
yuv[0]:Y地址
yuv[1]:U地址
yuv[2]:V地址
pRGBBuf:RGB缓冲地址
WIDTH:图像宽,SSE2中必须是16的倍数,AVX2中必须是32的倍数
HEIGHT:图像高,必须是2的倍数
libyuv中的调用并不限定宽高,那里代码里做了处理,但我所用的图像都是从摄像机里出的YUV420数据,目前的摄像机的图像尺寸都是32的倍数,所以我并没写未对齐的处理。
关于性能,我认为上面AVX2的版本再做优化,还可以提升50%或更高,如果哪位大神在这个版本上做了优化提升,希望指教指教。