SIMD相关头文件包括:
//#include <ivec.h>//MMX //#include <fvec.h>//SSE(also include ivec.h) //#include <dvec.h>//SSE2(also include fvec.h) #include <mmintrin.h> //MMX #include <xmmintrin.h> //SSE(include mmintrin.h) #include <emmintrin.h> //SSE2(include xmmintrin.h) #include <pmmintrin.h> //SSE3(include emmintrin.h) #include <tmmintrin.h>//SSSE3(include pmmintrin.h) #include <smmintrin.h>//SSE4.1(include tmmintrin.h) #include <nmmintrin.h>//SSE4.2(include smmintrin.h) #include <wmmintrin.h>//AES(include nmmintrin.h) #include <immintrin.h>//AVX(include wmmintrin.h) #include <intrin.h>//(include immintrin.h)
/*
 * __m64: 8-byte vector type.
 *
 * Declared as a union so the same storage can be read as one 64-bit integer,
 * as two floats, or as 8-, 16- or 32-bit integer lanes (signed and unsigned).
 * m64_u64 is the first member, so a brace initialiser fills that view.
 *
 * NOTE(review): __declspec(intrin_type) and _CRT_ALIGN are not defined in this
 * file; presumably the MSVC intrinsic-type marker and an 8-byte alignment
 * request -- confirm against the compiler headers.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
    unsigned __int64    m64_u64;
    float               m64_f32[2];
    __int8              m64_i8[8];
    __int16             m64_i16[4];
    __int32             m64_i32[2];
    __int64             m64_i64;
    unsigned __int8     m64_u8[8];
    unsigned __int16    m64_u16[4];
    unsigned __int32    m64_u32[2];
} __m64;
/*
 * __m128: 16-byte vector type.
 *
 * The union overlays four floats with 8-, 16-, 32- and 64-bit integer lane
 * views (signed and unsigned) of the same storage. m128_f32 is the first
 * member, so a brace initialiser fills the float lanes.
 *
 * NOTE(review): __declspec(intrin_type) and _CRT_ALIGN are not defined in this
 * file; presumably the MSVC intrinsic-type marker and a 16-byte alignment
 * request -- confirm against the compiler headers.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128
{
    float               m128_f32[4];
    unsigned __int64    m128_u64[2];
    __int8              m128_i8[16];
    __int16             m128_i16[8];
    __int32             m128_i32[4];
    __int64             m128_i64[2];
    unsigned __int8     m128_u8[16];
    unsigned __int16    m128_u16[8];
    unsigned __int32    m128_u32[4];
} __m128;
/*
 * __m128i: 16-byte integer vector type.
 *
 * The union exposes the same storage as 8-, 16-, 32- or 64-bit integer lanes,
 * each in a signed and an unsigned flavour. m128i_i8 is the first member, so a
 * brace initialiser fills the signed byte lanes.
 *
 * NOTE(review): __declspec(intrin_type) and _CRT_ALIGN are not defined in this
 * file; presumably the MSVC intrinsic-type marker and a 16-byte alignment
 * request -- confirm against the compiler headers.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i
{
    __int8              m128i_i8[16];
    __int16             m128i_i16[8];
    __int32             m128i_i32[4];
    __int64             m128i_i64[2];
    unsigned __int8     m128i_u8[16];
    unsigned __int16    m128i_u16[8];
    unsigned __int32    m128i_u32[4];
    unsigned __int64    m128i_u64[2];
} __m128i;

/*
 * __m128d: 16-byte vector type holding two doubles.
 *
 * Unlike the types above this is a struct with a single array member, so
 * there are no alternative lane views.
 */
typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d
{
    double              m128d_f64[2];
} __m128d;
/*New Single precision vector instructions*/ //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=a0-b0, r1=a1+b1, r2=a2-b2, r3=a3+b3 extern __m128 _mm_addsub_ps(__m128 a, __m128 b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=a0+a1, r1=a2+a3, r2=b0+b1, r3=b2+b3 extern __m128 _mm_hadd_ps(__m128 a, __m128 b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=a0-a1, r1=a2-a3, r2=b0-b1, r3=b2-b3 extern __m128 _mm_hsub_ps(__m128 a, __m128 b); //a=(a0, a1, a2, a3), 则r0=a1, r1=a1, r2=a3, r3=a3 extern __m128 _mm_movehdup_ps(__m128 a); //a=(a0, a1, a2, a3), 则r0=a0, r1=a0, r2=a2, r3=a2 extern __m128 _mm_moveldup_ps(__m128 a); /*New double precision vector instructions*/ //a=(a0, a1), b=(b0, b1), 则r0=a0-b0, r1=a1+b1 extern __m128d _mm_addsub_pd(__m128d a, __m128d b); //a=(a0, a1), b=(b0, b1), 则r0=a0+a1, r1=b0+b1 extern __m128d _mm_hadd_pd(__m128d a, __m128d b); //a=(a0, a1), b=(b0, b1), 则r0=a0-a1, r1=b0-b1 extern __m128d _mm_hsub_pd(__m128d a, __m128d b); //r0=r1=dp[0] extern __m128d _mm_loaddup_pd(double const * dp); //a=(a0, a1),则r0=r1=a0 extern __m128d _mm_movedup_pd(__m128d a); /*New unaligned integer vector load instruction*/ //load unaligned data using _mm_lddqu_si128 for best performance //If the address is not 16-byte aligned, the load begins at the //highest 16-byte-aligned address less than the address of Data extern __m128i _mm_lddqu_si128(__m128i const *p); /*Miscellaneous new instructions, For _mm_monitor p goes in eax, extensions goes in ecx, hints goes in edx*/ //The monitor instruction sets up an address range for hardware monitoring. //The values of extensions and hints correspond to the values in ECX and EDX //used by the monitor instruction. They are reserved for future use and should //be zero for the SSE3-enabled processor. For more information, //see the Intel or AMD documentation as appropriate. 
extern void _mm_monitor(void const *p, unsigned extensions, unsigned hints); /*Miscellaneous new instructions, For _mm_mwait, extensions goes in ecx, hints goes in eax*/ //The mwait instruction instructs the processor to enter a wait state in which the //processor is instructed to monitor the address range between extensions and hints //and wait for an event or a store to that address range. The values of extensions //and hints are loaded into the ECX and EAX registers. For more information, //see the Intel or AMD documentation as appropriate. extern void _mm_mwait(unsigned extensions, unsigned hints);
/*Add horizonally packed [saturated] words, double words, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //则r0=a0+a1, r1=a2+a3, r2=a4+a5, r3=a6+a7, r4=b0+b1, r5=b2+b3, r6=b4+b5, r7=b6+b7 extern __m128i _mm_hadd_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=a0+a1, r1=a2+a3, r2=b0+b1, r3=b2+b3 extern __m128i _mm_hadd_epi32 (__m128i a, __m128i b); //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x)) //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //则r0=SATURATE_16(a0+a1), ..., r3=SATURATE_16(a6+a7), //r4=SATURATE_16(b0+b1), ..., r7=SATURATE_16(b6+b7) extern __m128i _mm_hadds_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=a0+a1, r1=a2+a3, r2=b0+b1, r3=b2+b3 extern __m64 _mm_hadd_pi16 (__m64 a, __m64 b); //a=(a0, a1), b=(b0, b1), 则r0=a0+a1, r1=b0+b1 extern __m64 _mm_hadd_pi32 (__m64 a, __m64 b); //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x)) //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=SATURATE_16(a0+a1), r1=SATURATE_16(a2+a3), //r2=SATURATE_16(b0+b1), r3=SATURATE_16(b2+b3) extern __m64 _mm_hadds_pi16 (__m64 a, __m64 b); /*Subtract horizonally packed [saturated] words, double words, {X,}MM2/m{128,64} (b) from {X,}MM1 (a).*/ //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //则r0=a0-a1, r1=a2-a3, r2=a4-a5, r3=a6-a7, r4=b0-b1, r5=b2-b3, r6=b4-b5, r7=b6-b7 extern __m128i _mm_hsub_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=a0-a1, r1=a2-a3, r2=b0-b1, r3=b2-b3 extern __m128i _mm_hsub_epi32 (__m128i a, __m128i b); //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? 
-32768 : x)) //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //则r0=SATURATE_16(a0-a1), ..., r3=SATURATE_16(a6-a7), //r4=SATURATE_16(b0-b1), ..., r7=SATURATE_16(b6-b7) extern __m128i _mm_hsubs_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=a0-a1, r1=a2-a3, r2=b0-b1, r3=b2-b3 extern __m64 _mm_hsub_pi16 (__m64 a, __m64 b); //a=(a0, a1), b=(b0, b1), 则r0=a0-a1, r1=b0-b1 extern __m64 _mm_hsub_pi32 (__m64 a, __m64 b); //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x)) //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=SATURATE_16(a0-a1), r1=SATURATE_16(a2-a3), //r2=SATURATE_16(b0-b1), r3=SATURATE_16(b2-b3) extern __m64 _mm_hsubs_pi16 (__m64 a, __m64 b); /*Multiply and add packed words, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x)) //a=(a0, a1, a2, ..., a13, a14, a15), b=(b0, b1, b2, ..., b13, b14, b15) //则r0=SATURATE_16((a0*b0)+(a1*b1)), ..., r7=SATURATE_16((a14*b14)+(a15*b15)) //Parameter a contains unsigned bytes. Parameter b contains signed bytes. extern __m128i _mm_maddubs_epi16 (__m128i a, __m128i b); //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x)) //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //则r0=SATURATE_16((a0*b0)+(a1*b1)), ..., r3=SATURATE_16((a6*b6)+(a7*b7)) //Parameter a contains unsigned bytes. Parameter b contains signed bytes. 
extern __m64 _mm_maddubs_pi16 (__m64 a, __m64 b); /*Packed multiply high integers with round and scaling, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //则r0=INT16(((a0*b0)+0x4000) >> 15), ..., r7=INT16(((a7*b7)+0x4000) >> 15) extern __m128i _mm_mulhrs_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=INT16(((a0*b0)+0x4000) >> 15), ..., r3=INT16(((a3*b3)+0x4000) >> 15) extern __m64 _mm_mulhrs_pi16 (__m64 a, __m64 b); /*Packed shuffle bytes {X,}MM2/m{128,64} (b) by {X,}MM1 (a).*/ //SELECT(a, n) extracts the nth 8-bit parameter from a. The 0th 8-bit parameter //is the least significant 8-bits, b=(b0, b1, b2, ..., b13, b14, b15), b is mask //则r0 = (b0 & 0x80) ? 0 : SELECT(a, b0 & 0x0f), ..., //r15 = (b15 & 0x80) ? 0 : SELECT(a, b15 & 0x0f) extern __m128i _mm_shuffle_epi8 (__m128i a, __m128i b); //SELECT(a, n) extracts the nth 8-bit parameter from a. The 0th 8-bit parameter //is the least significant 8-bits, b=(b0, b1, ..., b7), b is mask //则r0= (b0 & 0x80) ? 0 : SELECT(a, b0 & 0x07),..., //r7=(b7 & 0x80) ? 0 : SELECT(a, b7 & 0x07) extern __m64 _mm_shuffle_pi8 (__m64 a, __m64 b); /*Packed byte, word, double word sign, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //a=(a0, a1, a2, ..., a13, a14, a15), b=(b0, b1, b2, ..., b13, b14, b15) //则r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ..., //r15= (b15 < 0) ? -a15 : ((b15 == 0) ? 0 : a15) extern __m128i _mm_sign_epi8 (__m128i a, __m128i b); //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ..., //r7= (b7 < 0) ? -a7 : ((b7 == 0) ? 0 : a7) extern __m128i _mm_sign_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ..., //r3= (b3 < 0) ? -a3 : ((b3 == 0) ? 0 : a3) extern __m128i _mm_sign_epi32 (__m128i a, __m128i b); //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //则r0=(b0 < 0) ? 
-a0 : ((b0 == 0) ? 0 : a0), ..., //r7= (b7 < 0) ? -a7 : ((b7 == 0) ? 0 : a7) extern __m64 _mm_sign_pi8 (__m64 a, __m64 b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //则r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ..., //r3= (b3 < 0) ? -a3 : ((b3 == 0) ? 0 : a3) extern __m64 _mm_sign_pi16 (__m64 a, __m64 b); //a=(a0, a1), b=(b0, b1), 则r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), //r1= (b1 < 0) ? -a1 : ((b1 == 0) ? 0 : a1) extern __m64 _mm_sign_pi32 (__m64 a, __m64 b); /*Packed align and shift right by n*8 bits, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //n: A constant that specifies how many bytes the interim result will be //shifted to the right, If n > 32, the result value is zero //CONCAT(a, b) is the 256-bit unsigned intermediate value that is a concatenation of //parameters a and b. The result is this intermediate value shifted right by n bytes. //则r= (CONCAT(a, b) >> (n * 8)) & 0xffffffffffffffff extern __m128i _mm_alignr_epi8 (__m128i a, __m128i b, int n); //n: An integer constant that specifies how many bytes to shift the interim //result to the right,If n > 16, the result value is zero //CONCAT(a, b) is the 128-bit unsigned intermediate value that is formed by //concatenating parameters a and b. The result value is the rightmost 64 bits after //shifting this intermediate result right by n bytes //则r = (CONCAT(a, b) >> (n * 8)) & 0xffffffff extern __m64 _mm_alignr_pi8 (__m64 a, __m64 b, int n); /*Packed byte, word, double word absolute value, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //a=(a0, a1, a2, ..., a13, a14, a15) //则r0 = (a0 < 0) ? -a0 : a0, ..., r15 = (a15 < 0) ? -a15 : a15 extern __m128i _mm_abs_epi8 (__m128i a); //a=(a0, a1, a2, a3, a4, a5, a6, a7) //则r0 = (a0 < 0) ? -a0 : a0, ..., r7 = (a7 < 0) ? -a7 : a7 extern __m128i _mm_abs_epi16 (__m128i a); //a=(a0, a1, a2, a3) //则r0 = (a0 < 0) ? -a0 : a0, ..., r3 = (a3 < 0) ? -a3 : a3 extern __m128i _mm_abs_epi32 (__m128i a); //a=(a0, a1, a2, a3, a4, a5, a6, a7) //则r0 = (a0 < 0) ? -a0 : a0, ..., r7 = (a7 < 0) ? 
-a7 : a7 extern __m64 _mm_abs_pi8 (__m64 a); //a=(a0, a1, a2, a3) //则r0 = (a0 < 0) ? -a0 : a0, ..., r3 = (a3 < 0) ? -a3 : a3 extern __m64 _mm_abs_pi16 (__m64 a); //a=(a0, a1), 则r0 = (a0 < 0) ? -a0 : a0, r1 = (a1 < 0) ? -a1 : a1 extern __m64 _mm_abs_pi32 (__m64 a);