SIMD相关头文件包括:
//#include <ivec.h> //MMX
//#include <fvec.h> //SSE(also include ivec.h)
//#include <dvec.h> //SSE2(also include fvec.h)
#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h> //SSSE3(include pmmintrin.h)
#include <smmintrin.h> //SSE4.1(include tmmintrin.h)
#include <nmmintrin.h> //SSE4.2(include smmintrin.h)
#include <wmmintrin.h> //AES(include nmmintrin.h)
#include <immintrin.h> //AVX(include wmmintrin.h)
#include <intrin.h> //(include immintrin.h)
mmintrin.h为MMX 头文件,其中__m64的定义为:
typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64 { unsigned __int64 m64_u64; float m64_f32[2]; __int8 m64_i8[8]; __int16 m64_i16[4]; __int32 m64_i32[2]; __int64 m64_i64; unsigned __int8 m64_u8[8]; unsigned __int16 m64_u16[4]; unsigned __int32 m64_u32[2]; } __m64;
xmmintrin.h为SSE 头文件,此头文件里包含MMX头文件,其中__m128的定义为:
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 { float m128_f32[4]; unsigned __int64 m128_u64[2]; __int8 m128_i8[16]; __int16 m128_i16[8]; __int32 m128_i32[4]; __int64 m128_i64[2]; unsigned __int8 m128_u8[16]; unsigned __int16 m128_u16[8]; unsigned __int32 m128_u32[4]; } __m128;
emmintrin.h为SSE2头文件,此头文件里包含SSE头文件,其中__m128i和__m128d的定义为:
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i { __int8 m128i_i8[16]; __int16 m128i_i16[8]; __int32 m128i_i32[4]; __int64 m128i_i64[2]; unsigned __int8 m128i_u8[16]; unsigned __int16 m128i_u16[8]; unsigned __int32 m128i_u32[4]; unsigned __int64 m128i_u64[2]; } __m128i; typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d { double m128d_f64[2]; } __m128d;
emmintrin.h文件中各函数的介绍:
1 /*----Floating-Point Intrinsics Using Streaming SIMD Extension 2 Instructions----*/ 2 //Arithmetic Operations(Floating Point):add、sub、mul、div、sqrt、min、max 3 //返回一个__m128d的寄存器,r0=_A0+_B0, r1=_A1 4 extern __m128d _mm_add_sd(__m128d _A, __m128d _B); 5 //返回一个__m128d的寄存器,r0=_A0+_B0, r1=_A1+_B1 6 extern __m128d _mm_add_pd(__m128d _A, __m128d _B); 7 //返回一个__m128d的寄存器,r0=_A0-_B0, r1=_A1 8 extern __m128d _mm_sub_sd(__m128d _A, __m128d _B); 9 //返回一个__m128d的寄存器,r0=_A0-_B0, r1=_A1-_B1 10 extern __m128d _mm_sub_pd(__m128d _A, __m128d _B); 11 //返回一个__m128d的寄存器,r0=_A0*_B0, r1=_A1 12 extern __m128d _mm_mul_sd(__m128d _A, __m128d _B); 13 //返回一个__m128d的寄存器,r0=_A0*_B0, r1=_A1*_B1 14 extern __m128d _mm_mul_pd(__m128d _A, __m128d _B); 15 //返回一个__m128d的寄存器,r0=sqrt(_B0), r1=_A1 16 extern __m128d _mm_sqrt_sd(__m128d _A, __m128d _B); 17 //返回一个__m128d的寄存器,r0=sqrt(_A0), r1=sqrt(_A1) 18 extern __m128d _mm_sqrt_pd(__m128d _A); 19 //返回一个__m128d的寄存器,r0=_A0/_B0, r1=_A1 20 extern __m128d _mm_div_sd(__m128d _A, __m128d _B); 21 //返回一个__m128d的寄存器,r0=_A0/_B0, r1=_A1/_B1 22 extern __m128d _mm_div_pd(__m128d _A, __m128d _B); 23 //返回一个__m128d的寄存器,r0=min(_A0,_B0), r1=_A1 24 extern __m128d _mm_min_sd(__m128d _A, __m128d _B); 25 //返回一个__m128d的寄存器,r0=min(_A0,_B0), r1=min(_A1,_B1) 26 extern __m128d _mm_min_pd(__m128d _A, __m128d _B); 27 //返回一个__m128d的寄存器,r0=max(_A0,_B0), r1=_A1 28 extern __m128d _mm_max_sd(__m128d _A, __m128d _B); 29 //返回一个__m128d的寄存器,r0=max(_A0,_B0), r1=max(_A1,_B1) 30 extern __m128d _mm_max_pd(__m128d _A, __m128d _B); 31 32 //Logical Operations(Floating Point SSE2 Intrinsics):and、or、xor、 andnot 33 //返回一个__m128d的寄存器,r0=_A0 & _B0, r1=_A1 & _B1 34 extern __m128d _mm_and_pd(__m128d _A, __m128d _B); 35 //返回一个__m128d的寄存器,r0=(~_A0) & _B0, r1=(~_A1) & _B1 36 extern __m128d _mm_andnot_pd(__m128d _A, __m128d _B); 37 //返回一个__m128d的寄存器,r0=_A0 | _B0, r1=_A1 | _B1 38 extern __m128d _mm_or_pd(__m128d _A, __m128d _B); 39 //返回一个__m128d的寄存器,r0=_A0 ^ _B0, r1=_A1 ^ _B1 40 extern __m128d _mm_xor_pd(__m128d 
_A, __m128d _B); 41 42 //Comparisions:==、<、<=、>、>=、!= 43 //返回一个__m128d的寄存器,r0=(_A0 == _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 44 extern __m128d _mm_cmpeq_sd(__m128d _A, __m128d _B); 45 //返回一个__m128d的寄存器,r0=(_A0 == _B0) ? 0xffffffffffffffff : 0x0, 46 //r1=(_A1 == _B1) ? 0xffffffffffffffff : 0x0 47 extern __m128d _mm_cmpeq_pd(__m128d _A, __m128d _B); 48 //返回一个__m128d的寄存器,r0=(_A0 < _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 49 extern __m128d _mm_cmplt_sd(__m128d _A, __m128d _B); 50 //返回一个__m128d的寄存器,r0=(_A0 < _B0) ? 0xffffffffffffffff : 0x0, 51 //r1=(_A1 < _B1) ? 0xffffffffffffffff : 0x0 52 extern __m128d _mm_cmplt_pd(__m128d _A, __m128d _B); 53 //返回一个__m128d的寄存器,r0=(_A0 <= _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 54 extern __m128d _mm_cmple_sd(__m128d _A, __m128d _B); 55 //返回一个__m128d的寄存器,r0=(_A0 <= _B0) ? 0xffffffffffffffff : 0x0, 56 //r1=(_A1 <= _B1) ? 0xffffffffffffffff : 0x0 57 extern __m128d _mm_cmple_pd(__m128d _A, __m128d _B); 58 //返回一个__m128d的寄存器,r0=(_A0 > _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 59 extern __m128d _mm_cmpgt_sd(__m128d _A, __m128d _B); 60 //返回一个__m128d的寄存器,r0=(_A0 > _B0) ? 0xffffffffffffffff : 0x0, 61 //r1=(_A1 > _B1) ? 0xffffffffffffffff : 0x0 62 extern __m128d _mm_cmpgt_pd(__m128d _A, __m128d _B); 63 //返回一个__m128d的寄存器,r0=(_A0 >= _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 64 extern __m128d _mm_cmpge_sd(__m128d _A, __m128d _B); 65 //返回一个__m128d的寄存器,r0=(_A0 >= _B0) ? 0xffffffffffffffff : 0x0, 66 //r1=(_A1 >= _B1) ? 0xffffffffffffffff : 0x0 67 extern __m128d _mm_cmpge_pd(__m128d _A, __m128d _B); 68 //返回一个__m128d的寄存器,r0=(_A0 != _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 69 extern __m128d _mm_cmpneq_sd(__m128d _A, __m128d _B); 70 //返回一个__m128d的寄存器,r0=(_A0 != _B0) ? 0xffffffffffffffff : 0x0, 71 //r1=(_A1 != _B1) ? 0xffffffffffffffff : 0x0 72 extern __m128d _mm_cmpneq_pd(__m128d _A, __m128d _B); 73 //返回一个__m128d的寄存器,r0=!(_A0 < _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 74 extern __m128d _mm_cmpnlt_sd(__m128d _A, __m128d _B); 75 //返回一个__m128d的寄存器,r0=!(_A0 < _B0) ? 
0xffffffffffffffff : 0x0, 76 //r1=!(_A1 < _B1) ? 0xffffffffffffffff : 0x0 77 extern __m128d _mm_cmpnlt_pd(__m128d _A, __m128d _B); 78 //返回一个__m128d的寄存器,r0=!(_A0 <= _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 79 extern __m128d _mm_cmpnle_sd(__m128d _A, __m128d _B); 80 //返回一个__m128d的寄存器,r0=!(_A0 <= _B0) ? 0xffffffffffffffff : 0x0, 81 //r1=!(_A1 <= _B1) ? 0xffffffffffffffff : 0x0 82 extern __m128d _mm_cmpnle_pd(__m128d _A, __m128d _B); 83 //返回一个__m128d的寄存器,r0=!(_A0 > _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 84 extern __m128d _mm_cmpngt_sd(__m128d _A, __m128d _B); 85 //返回一个__m128d的寄存器,r0=!(_A0 > _B0) ? 0xffffffffffffffff : 0x0, 86 //r1=!(_A1 > _B1) ? 0xffffffffffffffff : 0x0 87 extern __m128d _mm_cmpngt_pd(__m128d _A, __m128d _B); 88 //返回一个__m128d的寄存器,r0=!(_A0 >= _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 89 extern __m128d _mm_cmpnge_sd(__m128d _A, __m128d _B); 90 //返回一个__m128d的寄存器,r0=!(_A0 >= _B0) ? 0xffffffffffffffff : 0x0, 91 //r1=!(_A1 >= _B1) ? 0xffffffffffffffff : 0x0 92 extern __m128d _mm_cmpnge_pd(__m128d _A, __m128d _B); 93 //返回一个__m128d的寄存器,r0=(_A0 ord _B0) ? 0xffffffffffffffff : 0x0, 94 //r1=(_A1 ord _B1) ? 0xffffffffffffffff : 0x0 95 extern __m128d _mm_cmpord_pd(__m128d _A, __m128d _B); 96 //返回一个__m128d的寄存器,r0=(_A0 ord _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 97 extern __m128d _mm_cmpord_sd(__m128d _A, __m128d _B); 98 //返回一个__m128d的寄存器,r0=(_A0 unord _B0) ? 0xffffffffffffffff : 0x0, 99 //r1=(_A1 unord _B1) ? 0xffffffffffffffff : 0x0 100 extern __m128d _mm_cmpunord_pd(__m128d _A, __m128d _B); 101 //返回一个__m128d的寄存器,r0=(_A0 unord _B0) ? 0xffffffffffffffff : 0x0, r1=_A1 102 extern __m128d _mm_cmpunord_sd(__m128d _A, __m128d _B); 103 //返回一个0或1的整数,r=(_A0 == _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 1 is returned 104 extern int _mm_comieq_sd(__m128d _A, __m128d _B); 105 //返回一个0或1的整数,r=(_A0 < _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 1 is returned 106 extern int _mm_comilt_sd(__m128d _A, __m128d _B); 107 //返回一个0或1的整数,r=(_A0 <= _B0) ? 
0x1 : 0x0, If _A and _B is a NaN, 1 is returned 108 extern int _mm_comile_sd(__m128d _A, __m128d _B); 109 //返回一个0或1的整数,r=(_A0 > _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 0 is returned 110 extern int _mm_comigt_sd(__m128d _A, __m128d _B); 111 //返回一个0或1的整数,r=(_A0 >= _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 0 is returned 112 extern int _mm_comige_sd(__m128d _A, __m128d _B); 113 //返回一个0或1的整数,r=(_A0 != _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 0 is returned 114 extern int _mm_comineq_sd(__m128d _A, __m128d _B); 115 //返回一个0或1的整数,r=(_A0 == _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 1 is returned 116 extern int _mm_ucomieq_sd(__m128d _A, __m128d _B); 117 //返回一个0或1的整数,r=(_A0 < _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 1 is returned 118 extern int _mm_ucomilt_sd(__m128d _A, __m128d _B); 119 //返回一个0或1的整数,r=(_A0 <= _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 1 is returned 120 extern int _mm_ucomile_sd(__m128d _A, __m128d _B); 121 //返回一个0或1的整数,r=(_A0 > _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 0 is returned 122 extern int _mm_ucomigt_sd(__m128d _A, __m128d _B); 123 //返回一个0或1的整数,r=(_A0 >= _B0) ? 0x1 : 0x0, If _A and _B is a NaN, 0 is returned 124 extern int _mm_ucomige_sd(__m128d _A, __m128d _B); 125 //返回一个0或1的整数,r=(_A0 != _B0) ? 
0x1 : 0x0, If _A and _B is a NaN, 0 is returned 126 extern int _mm_ucomineq_sd(__m128d _A, __m128d _B); 127 128 //Conversion Operations 129 //返回一个__m128d的寄存器,r0=(double)_A0, r1=(double)_A1 130 extern __m128d _mm_cvtepi32_pd(__m128i _A); 131 //返回一个__m128i的寄存器,r0=(int)_A0, r1=(int)_A1, r2=0x0, r3=0x0 132 extern __m128i _mm_cvtpd_epi32(__m128d _A); 133 //返回一个__m128i的寄存器,r0=(int)_A0, r1=(int)_A1, r2=0x0, r3=0x0,using truncate 134 extern __m128i _mm_cvttpd_epi32(__m128d _A); 135 //返回一个__m128的寄存器,r0=(float)_A0, r1=(float)_A1, r2=(float)_A2, r3=(float)_A3 136 extern __m128 _mm_cvtepi32_ps(__m128i _A); 137 //返回一个__m128i的寄存器,r0=(int)_A0, r1=(int)_A1, r2=(int)_A2, r3=(int)_A3 138 extern __m128i _mm_cvtps_epi32(__m128 _A); 139 //返回一个__m128i的寄存器,r0=(int)_A0, r1=(int)_A1, r2=(int)_A2, r3=(int)_A3,using truncate 140 extern __m128i _mm_cvttps_epi32(__m128 _A); 141 //返回一个__m128的寄存器,r0=(float)_A0, r1=(float)_A1, r2=0.0, r3=0.0 142 extern __m128 _mm_cvtpd_ps(__m128d _A); 143 //返回一个__m128d的寄存器,r0=(double)_A0, r1=(double)_A1 144 extern __m128d _mm_cvtps_pd(__m128 _A); 145 //返回一个__m128的寄存器,r0=(float)_B0, r1=_A1, r2=_A2, r3=_A3 146 extern __m128 _mm_cvtsd_ss(__m128 _A, __m128d _B); 147 //返回一个__m128d的寄存器,r0=(double)_B0, r1=_A1 148 extern __m128d _mm_cvtss_sd(__m128d _A, __m128 _B); 149 //返回一个32bit整数,r=(int)_A0 150 extern int _mm_cvtsd_si32(__m128d _A); 151 //返回一个32bit整数,r=(int)_A0,using truncate 152 extern int _mm_cvttsd_si32(__m128d _A); 153 //返回一个__m128d的寄存器,r0=(double)_B, r1=_A1 154 extern __m128d _mm_cvtsi32_sd(__m128d _A, int _B); 155 //返回一个__m64的寄存器,r0=(int)_A0, r1=(int)_A1 156 extern __m64 _mm_cvtpd_pi32(__m128d _A); 157 //返回一个__m64的寄存器,r0=(int)_A0, r1=(int)_A1,using truncate 158 extern __m64 _mm_cvttpd_pi32(__m128d _A); 159 //返回一个__m128d的寄存器,r0=(double)_A0, r1=(double)_A1 160 extern __m128d _mm_cvtpi32_pd(__m64 _A); 161 162 //Miscellaneous Operations(Floating-Point SSE2 Intrinsics) 163 //返回一个__m128d的寄存器,r0=_A1, r1=_B1 164 extern __m128d _mm_unpackhi_pd(__m128d _A, __m128d _B); 
165 //返回一个__m128d的寄存器,r0=_A0, r1=_B0 166 extern __m128d _mm_unpacklo_pd(__m128d _A, __m128d _B); 167 //返回一个2bit整数,r=sign(_A1) << 1 | sign(_A0) 168 extern int _mm_movemask_pd(__m128d _A); 169 //返回一个__m128d的寄存器,Selects two specific double-precision, 170 // floating-point values from _A and _B, based on the mask _I, 171 //The mask must be an immediate 172 extern __m128d _mm_shuffle_pd(__m128d _A, __m128d _B, int _I); 173 174 //Load Operations(Floating-Point SSE2 Intrinsics) 175 //返回一个__m128d的寄存器,r0=_Dp[0], r1=_Dp[1], The address _Dp must be 16-byte aligned 176 extern __m128d _mm_load_pd(double const*_Dp); 177 //返回一个__m128d的寄存器,r0=*_Dp, r1=*_Dp, The address _Dp does not need 178 //to be 16-byte aligned 179 extern __m128d _mm_load1_pd(double const*_Dp); 180 //返回一个__m128d的寄存器,r0=_Dp[1], r1=_Dp[0], The address _Dp must be 16-byte aligned 181 extern __m128d _mm_loadr_pd(double const*_Dp); 182 //返回一个__m128d的寄存器,r0=_Dp[0], r1=_Dp[1], The address _Dp does not 183 //need to be 16-byte aligned 184 extern __m128d _mm_loadu_pd(double const*_Dp); 185 //返回一个__m128d的寄存器,r0=*_Dp, r1=0.0, The address _Dp does not 186 //need to be 16-byte aligned 187 extern __m128d _mm_load_sd(double const*_Dp); 188 //返回一个__m128d的寄存器,r0=_A0, r1=*_Dp, The address _Dp does not 189 //need to be 16-byte aligned 190 extern __m128d _mm_loadh_pd(__m128d _A, double const*_Dp); 191 //返回一个__m128d的寄存器,r0=*_Dp, r1=_A1, The address _Dp does not 192 //need to be 16-byte aligned 193 extern __m128d _mm_loadl_pd(__m128d _A, double const*_Dp); 194 195 //Set Operations(Floating-Point SSE2 Intrinsics) 196 //返回一个__m128d的寄存器,r0=_W, r1=0.0 197 extern __m128d _mm_set_sd(double _W); 198 //返回一个__m128d的寄存器,r0=_A, r1=_A 199 extern __m128d _mm_set1_pd(double _A); 200 //返回一个__m128d的寄存器,r0=_Y, r1=_Z 201 extern __m128d _mm_set_pd(double _Z, double _Y); 202 //返回一个__m128d的寄存器,r0=_Y, r1=_Z 203 extern __m128d _mm_setr_pd(double _Y, double _Z); 204 //返回一个__m128d的寄存器,r0=0.0, r1=0.0 205 extern __m128d _mm_setzero_pd(void); 206 
//返回一个__m128d的寄存器,r0=_B0, r1=_A1 207 extern __m128d _mm_move_sd(__m128d _A, __m128d _B); 208 209 //Store Operations(Floating-Point SSE2 Intrinsics) 210 //返回为空,*_Dp=_A0, The address _Dp does not need to be 16-byte aligned 211 extern void _mm_store_sd(double *_Dp, __m128d _A); 212 //返回为空,_Dp[0]=_A0, _Dp[1]=_A0, The address _Dp must be 16-byte aligned 213 extern void _mm_store1_pd(double *_Dp, __m128d _A); 214 //返回为空,_Dp[0]=_A0, _Dp[1]=_A1, The address _Dp must be 16-byte aligned 215 extern void _mm_store_pd(double *_Dp, __m128d _A); 216 //返回为空,_Dp[0]=_A0, _Dp[1]=_A1, The address _Dp does not need to be 16-byte aligned 217 extern void _mm_storeu_pd(double *_Dp, __m128d _A); 218 //返回为空,_Dp[0]=_A1, _Dp[1]=_A0, The address _Dp must be 16-byte aligned 219 extern void _mm_storer_pd(double *_Dp, __m128d _A); 220 //返回为空,*_Dp=_A1 221 extern void _mm_storeh_pd(double *_Dp, __m128d _A); 222 //返回为空,*_Dp=_A0 223 extern void _mm_storel_pd(double *_Dp, __m128d _A); 224 225 //new convert to float 226 //返回一个64bit double类型,r=_A0, Extracts the lower order floating point value 227 extern double _mm_cvtsd_f64(__m128d _A); 228 229 //Cache Support for Streaming SIMD Extensions 2 Floating-Point Operations 230 //返回为空,_Dp[0]=_A0, _Dp[1]=_A1, Stores the data in _A to the address _Dp without 231 //polluting caches. The address _Dp must be 16-byte aligned. If the cache line 232 //containing address _Dp is already in the cache, the cache will be updated 233 extern void _mm_stream_pd(double *_Dp, __m128d _A); 234 235 /*------------Integer Intrinsics Using Streaming SIMD Extensions 2-------------*/ 236 //Arithmetic Operations(Integer SSE2 Intrinsics):add、sub、mul、avg、min、max 237 //返回一个__m128i的寄存器,r0=_A0+_B0, r1=_A1+_B1, ... r15=_A15+_B15 238 extern __m128i _mm_add_epi8(__m128i _A, __m128i _B); 239 //返回一个__m128i的寄存器,将_A和_B中对应位置的16bit有符号或无符号整数分别相加, 240 //即ri=_Ai+_Bi(r0=_A0+_B0, r1=_A1+_B1, ... 
r7=_A7+_B7) 241 extern __m128i _mm_add_epi16(__m128i _A, __m128i _B); 242 //返回一个__m128i的寄存器,r0=_A0+_B0, r1=_A1+_B1, r2=_A2+_B2, r3=_A3+_B3 243 extern __m128i _mm_add_epi32(__m128i _A, __m128i _B); 244 //返回一个__m64的寄存器,r=_A+_B 245 extern __m64 _mm_add_si64(__m64 _A, __m64 _B); 246 //返回一个__m128i的寄存器,r0=_A0+_B0, r1=_A1+_B1 247 extern __m128i _mm_add_epi64(__m128i _A, __m128i _B); 248 //返回一个__m128i的寄存器,r0=SignedSaturate(_A0+_B0), r1=SignedSaturate(_A1+_B1), ... 249 //r15=SignedSaturate(_A15+_B15), saturates 250 extern __m128i _mm_adds_epi8(__m128i _A, __m128i _B); 251 //返回一个__m128i的寄存器,将_A和_B中对应位置的16bit有符号或无符号整数分别相加, 252 //r0=SignedSaturate(_A0+_B0), r1=SignedSaturate(_A1+_B1), ... 253 //r7=SignedSaturate(_A7+_B7), 当计算结果溢出时将其置为边界值(saturates) 254 extern __m128i _mm_adds_epi16(__m128i _A, __m128i _B); 255 //返回一个__m128i的寄存器,r0=UnsignedSaturate(_A0+_B0), r1=UnsignedSaturate(_A1+_B1), ... 256 //r15=UnsignedSaturate(_A15+_B15), saturates 257 extern __m128i _mm_adds_epu8(__m128i _A, __m128i _B); 258 //返回一个__m128i的寄存器,r0=UnsignedSaturate(_A0+_B0), r1=UnsignedSaturate(_A1+_B1), ... 259 //r7=UnsignedSaturate(_A7+_B7), saturates 260 extern __m128i _mm_adds_epu16(__m128i _A, __m128i _B); 261 //返回一个__m128i的寄存器,r0=(_A0+_B0)/2, r1=(_A1+_B1)/2, ... r15=(_A15+_B15)/2, rounds 262 extern __m128i _mm_avg_epu8(__m128i _A, __m128i _B); 263 //返回一个__m128i的寄存器,将_A和_B中对应位置的16bit无符号整数取平均, 264 //即ri=(_Ai+_Bi)/2(r0=(_A0+_B0)/2, r1=(_A1+_B1)/2, ... r7=(_A7+_B7)/2), rounds 265 extern __m128i _mm_avg_epu16(__m128i _A, __m128i _B); 266 //返回一个__m128i的寄存器,它含有4个有符号或无符号32bit的整数, 267 //分别满足:r0=(_A0*_B0)+(_A1*_B1), r1=(_A2*_B2)+(_A3*_B3), 268 //r2=(_A4*_B4)+(_A5*_B5), r3=(_A6*_B6)+(_A7*_B7) 269 extern __m128i _mm_madd_epi16(__m128i _A, __m128i _B); 270 //返回一个__m128i的寄存器,取_A和_B中对应位置的16bit有符号整数的最大值, 271 //即ri=max(_Ai,_Bi) (r0=max(_A0,_B0), r1=max(_A1,_B1), ... r7=max(_A7,_B7)) 272 extern __m128i _mm_max_epi16(__m128i _A, __m128i _B); 273 //返回一个__m128i的寄存器,r0=max(_A0,_B0), r1=max(_A1,_B1), ... 
r15=max(_A15,_B15) 274 extern __m128i _mm_max_epu8(__m128i _A, __m128i _B); 275 //返回一个__m128i的寄存器,取_A和_B中对应位置的16bit有符号整数的最小值, 276 //即ri=min(_Ai, _Bi)(r0=min(_A0,_B0), r1=min(_A1,_B1), ... r7=min(_A7,_B7)) 277 extern __m128i _mm_min_epi16(__m128i _A, __m128i _B); 278 //返回一个__m128i的寄存器,r0=min(_A0,_B0), r1=min(_A1,_B1), ... r15=min(_A15,_B15) 279 extern __m128i _mm_min_epu8(__m128i _A, __m128i _B); 280 //返回一个__m128i的寄存器,它含8个有符号或无符号16bit的整数,分别为_A和_B对应位置的16bit 281 //有符号或无符号整数相乘结果的高16bit数据,即ri=(_Ai*_Bi)[31:16](r0=(_A0*_B0)[31:16], 282 //r1=(_A1*_B1)[31:16] ... r7=(_A7*_B7)[31:16]) 283 extern __m128i _mm_mulhi_epi16(__m128i _A, __m128i _B); 284 //返回一个__m128i的寄存器,r0=(_A0*_B0)[31:16], r1=(_A1*_B1)[31:16] ... r7=(_A7*_B7)[31:16] 285 extern __m128i _mm_mulhi_epu16(__m128i _A, __m128i _B); 286 //返回一个__m128i的寄存器,它含8个有符号或无符号16bit的整数,分别为_A和_B对应位置的16bit 287 //有符号或无符号整数相乘结果的低16bit数据,即ri=(_Ai*_Bi)[15:0](r0=(_A0*_B0)[15:0], 288 //r1=(_A1*_B1)[15:0] ... r7=(_A7*_B7)[15:0]) 289 extern __m128i _mm_mullo_epi16(__m128i _A, __m128i _B); 290 //返回一个__m64的寄存器,r=_A0*_B0 291 extern __m64 _mm_mul_su32(__m64 _A, __m64 _B); 292 //返回一个__m128i的寄存器,r0=_A0*_B0, r1=_A2*_B2 293 extern __m128i _mm_mul_epu32(__m128i _A, __m128i _B); 294 //返回一个__m128i的寄存器,r0=abs(_A0-_B0) + abs(_A1-_B1) + ... + abs(_A7-_B7), 295 //r1=0x0,r2=0x0, r3=0x0, r4=abs(_A8-_B8) + abs(_A9-_B9) + ... + abs(_A15-_B15), 296 //r5=0x0, r6=0x0, r7=0x0 297 extern __m128i _mm_sad_epu8(__m128i _A, __m128i _B); 298 //返回一个__m128i的寄存器,r0=_A0-_B0, r1=_A1-_B1, ... r15=_A15-_B15 299 extern __m128i _mm_sub_epi8(__m128i _A, __m128i _B); 300 //返回一个__m128i的寄存器,将_A和_B中对应位置的16bit有符号或无符号整数分别相减, 301 //即ri=_Ai-_Bi(r0=_A0-_B0, r1=_A1-_B1, ... 
r7=_A7-_B7) 302 extern __m128i _mm_sub_epi16(__m128i _A, __m128i _B); 303 //返回一个__m128i的寄存器,r0=_A0-_B0, r1=_A1-_B1, r2=_A2-_B2, r3=_A3-_B3 304 extern __m128i _mm_sub_epi32(__m128i _A, __m128i _B); 305 //返回一个__m64的寄存器,r=_A-_B 306 extern __m64 _mm_sub_si64(__m64 _A, __m64 _B); 307 //返回一个__m128i的寄存器,r0=_A0-_B0, r1=_A1-_B1 308 extern __m128i _mm_sub_epi64(__m128i _A, __m128i _B); 309 //返回一个__m128i的寄存器,r0=SignedSaturate(_A0-_B0), r1=SignedSaturate(_A1-_B1), ... 310 //r15=SignedSaturate(_A15-_B15), saturate 311 extern __m128i _mm_subs_epi8(__m128i _A, __m128i _B); 312 //返回一个__m128i的寄存器,将_A和_B中对应位置的16bit有符号或无符号整数分别相减, 313 //当计算结果溢出时将其置为边界值(saturate), r0=SignedSaturate(_A0-_B0), 314 //r1=SignedSaturate(_A1-_B1), ... r7=SignedSaturate(_A7-_B7) 315 extern __m128i _mm_subs_epi16(__m128i _A, __m128i _B); 316 //返回一个__m128i的寄存器,r0=UnsignedSaturate(_A0-_B0), r1=UnsignedSaturate(_A1-_B1), ... 317 //r15=UnsignedSaturate(_A15-_B15), saturate 318 extern __m128i _mm_subs_epu8(__m128i _A, __m128i _B); 319 //返回一个__m128i的寄存器,r0=UnsignedSaturate(_A0-_B0), r1=UnsignedSaturate(_A1-_B1), ... 
320 //r7=UnsignedSaturate(_A7-_B7), saturate 321 extern __m128i _mm_subs_epu16(__m128i _A, __m128i _B); 322 323 //Logical Operations(Integer SSE2 Intrinsics):and、or、xor、andnot 324 //返回一个__m128i的寄存器,将寄存器_A和寄存器_B的对应位进行按位与运算, r=_A & _B 325 extern __m128i _mm_and_si128(__m128i _A, __m128i _B); 326 //返回一个__m128i的寄存器,将寄存器_A每一位取非,然后和寄存器_B的每一位进行按位与运算, 327 //r=(~_A) & _B 328 extern __m128i _mm_andnot_si128(__m128i _A, __m128i _B); 329 //返回一个__m128i的寄存器,将寄存器_A和寄存器_B的对应位进行按位或运算, r=_A | _B 330 extern __m128i _mm_or_si128(__m128i _A, __m128i _B); 331 //返回一个__m128i的寄存器,将寄存器_A和寄存器_B的对应位进行按位异或运算, r=_A ^ _B 332 extern __m128i _mm_xor_si128(__m128i _A, __m128i _B); 333 334 //Shift Operations 335 //返回一个__m128i的寄存器,r=_A << (_Imm * 8), _Imm must be an immediate, 336 //shifting in zeros 337 extern __m128i _mm_slli_si128(__m128i _A, int _Imm); 338 //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count进行相同的逻辑左移, 339 //r0=_A0 << _Count, r1=_A1 << _Count, ... r7=_A7 << _Count, shifting in zeros 340 extern __m128i _mm_slli_epi16(__m128i _A, int _Count); 341 //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count寄存器中对应位置的整数 342 //进行逻辑左移, r0=_A0 << _Count, r1=_A1 << _Count, ... r7=_A7 << _Count, shifting in zeros 343 extern __m128i _mm_sll_epi16(__m128i _A, __m128i _Count); 344 //返回一个__m128i的寄存器,r0=_A0 << _Count, r1=_A1 << _Count, r2=_A2 << _Count, 345 //r3=_A3 << _Count, shifting in zeros 346 extern __m128i _mm_slli_epi32(__m128i _A, int _Count); 347 //返回一个__m128i的寄存器,r0=_A0 << _Count, r1=_A1 << _Count, r2=_A2 << _Count, 348 //r3=_A3 << _Count, shifting in zeros 349 extern __m128i _mm_sll_epi32(__m128i _A, __m128i _Count); 350 //返回一个__m128i的寄存器,r0=_A0 << _Count, r1=_A1 << _Count, shifting in zeros 351 extern __m128i _mm_slli_epi64(__m128i _A, int _Count); 352 //返回一个__m128i的寄存器,r0=_A0 << _Count, r1=_A1 << _Count, shifting in zeros 353 extern __m128i _mm_sll_epi64(__m128i _A, __m128i _Count); 354 //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count进行相同的算术右移, 355 //r0=_A0 >> _Count, r1=_A1 >> _Count, ... 
r7=_A7 >> _Count, shifting in the sign bit 356 extern __m128i _mm_srai_epi16(__m128i _A, int _Count); 357 //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count寄存器中对应位置的整数进行 358 //算术右移,r0=_A0 >> _Count, r1=_A1 >> _Count, ... r7=_A7 >> _Count, shifting in the sign bit 359 extern __m128i _mm_sra_epi16(__m128i _A, __m128i _Count); 360 //返回一个__m128i的寄存器,r0=_A0 >> _Count, r1=_A1 >> _Count, r2=_A2 >> _Count, 361 //r3=_A3 >> _Count, shifting in the sign bit 362 extern __m128i _mm_srai_epi32(__m128i _A, int _Count); 363 //返回一个__m128i的寄存器,r0=_A0 >> _Count, r1=_A1 >> _Count, r2=_A2 >> _Count, 364 //r3=_A3 >> _Count, shifting in the sign bit 365 extern __m128i _mm_sra_epi32(__m128i _A, __m128i _Count); 366 //返回一个__m128i的寄存器,r=srl(_A, _Imm * 8), _Imm must be an immediate, 367 //shifting in zeros 368 extern __m128i _mm_srli_si128(__m128i _A, int _Imm); 369 //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count进行相同的逻辑右移, 370 //移位填充值为0,r0=srl(_A0, _Count), r1=srl(_A1, _Count), ... r7=srl(_A7, _Count), 371 //shifting in zeros 372 extern __m128i _mm_srli_epi16(__m128i _A, int _Count); 373 //返回一个__m128i的寄存器,将寄存器_A中的8个16bit整数按照_Count寄存器中对应位置的整数 374 //进行逻辑右移,移位填充值为0, r0=srl(_A0, _Count), r1=srl(_A1, _Count), ... 
375 //r7=srl(_A7, _Count), shifting in zeros 376 extern __m128i _mm_srl_epi16(__m128i _A, __m128i _Count); 377 //返回一个__m128i的寄存器,r0=srl(_A0, _Count), r1=srl(_A1, _Count), r2=srl(_A2, _Count), 378 //r3=srl(_A3, _Count), shifting in zeros 379 extern __m128i _mm_srli_epi32(__m128i _A, int _Count); 380 //返回一个__m128i的寄存器,r0=srl(_A0, _Count), r1=srl(_A1, _Count), r2=srl(_A2, _Count), 381 //r3=srl(_A3, _Count), shifting in zeros 382 extern __m128i _mm_srl_epi32(__m128i _A, __m128i _Count); 383 //返回一个__m128i的寄存器,r0=srl(_A0, _Count), r1=srl(_A1, _Count), shifting in zeros 384 extern __m128i _mm_srli_epi64(__m128i _A, int _Count); 385 //返回一个__m128i的寄存器,r0=srl(_A0, _Count), r1=srl(_A1, _Count), shifting in zeros 386 extern __m128i _mm_srl_epi64(__m128i _A, __m128i _Count); 387 388 //Comparison Intrinsics(SSE2):==、>、< 389 //返回一个__m128i的寄存器,r0=(_A0 == _B0) ? 0xff : 0x00, 390 //r1=(_A1 == _B1) ? 0xff : 0x0, ... r15=(_A15 == _B15) ? 0xff : 0x0 391 extern __m128i _mm_cmpeq_epi8(__m128i _A, __m128i _B); 392 //返回一个__m128i的寄存器,分别比较寄存器_A和寄存器_B对应位置16bit整数是否相等,若相等, 393 //该位置返回0xffff,否则返回0x0,即ri=(_Ai==_Bi)?0xffff:0x0(r0=(_A0 == _B0) ? 0xffff : 0x00, 394 //r1=(_A1 == _B1) ? 0xffff : 0x0, ... r7=(_A7 == _B7) ? 0xffff : 0x0) 395 extern __m128i _mm_cmpeq_epi16(__m128i _A, __m128i _B); 396 //返回一个__m128i的寄存器,r0=(_A0 == _B0) ? 0xffffffff : 0x00, 397 //r1=(_A1 == _B1) ? 0xffffffff : 0x0, 398 //r2=(_A2 == _B2) ? 0xffffffff : 0x0, r3=(_A3 == _B3) ? 0xffffffff : 0x0 399 extern __m128i _mm_cmpeq_epi32(__m128i _A, __m128i _B); 400 //返回一个__m128i的寄存器,r0=(_A0 > _B0) ? 0xff : 0x00, r1=(_A1 > _B1) ? 0xff : 0x0, ... 401 //r15=(_A15 > _B15) ? 0xff : 0x0 402 extern __m128i _mm_cmpgt_epi8(__m128i _A, __m128i _B); 403 //返回一个__m128i的寄存器,分别比较寄存器_A的每个16bit整数是否大于寄存器_B对应位置16bit的整数, 404 //若大于,该位置返回0xffff,否则返回0x0, 405 //即ri=(_Ai>_Bi)?0xffff:0x0(r0=(_A0 > _B0) ? 0xffff : 0x00, 406 //r1=(_A1 > _B1) ? 0xffff : 0x0, ... r7=(_A7 > _B7) ? 
0xffff : 0x0) 407 extern __m128i _mm_cmpgt_epi16(__m128i _A, __m128i _B); 408 //返回一个__m128i的寄存器,r0=(_A0 > _B0) ? 0xffffffff : 0x00, 409 //r1=(_A1 > _B1) ? 0xffffffff : 0x0, 410 //r2=(_A2 > _B2) ? 0xffffffff : 0x0, r3=(_A3 > _B3) ? 0xffffffff : 0x0 411 extern __m128i _mm_cmpgt_epi32(__m128i _A, __m128i _B); 412 //返回一个__m128i的寄存器,r0=(_A0 < _B0) ? 0xff : 0x00, r1=(_A1 < _B1) ? 0xff : 0x0, ... 413 //r15=(_A15 < _B15) ? 0xff : 0x0 414 extern __m128i _mm_cmplt_epi8(__m128i _A, __m128i _B); 415 //返回一个__m128i的寄存器,分别比较寄存器_A的每个16bit整数是否小于寄存器_B对应位置16bit整数, 416 //若小于,该位置返回0xffff,否则返回0x0, 417 //即ri=(_Ai<_Bi)?0xffff:0x0(r0=(_A0 < _B0) ? 0xffff : 0x00, 418 //r1=(_A1 < _B1) ? 0xffff : 0x0, ... r7=(_A7 < _B7) ? 0xffff : 0x0) 419 extern __m128i _mm_cmplt_epi16(__m128i _A, __m128i _B); 420 //返回一个__m128i的寄存器,r0=(_A0 < _B0) ? 0xffffffff : 0x00, 421 //r1=(_A1 < _B1) ? 0xffffffff : 0x0, 422 //r2=(_A2 < _B2) ? 0xffffffff : 0x0, r3=(_A3 < _B3) ? 0xffffffff : 0x0 423 extern __m128i _mm_cmplt_epi32(__m128i _A, __m128i _B); 424 425 //Conversion Intrinsics: int <-----> __m128i 426 //返回一个__m128i的寄存器,r0=_A, r1=0x0, r2=0x0, r3=0x0 427 extern __m128i _mm_cvtsi32_si128(int _A); 428 //返回一个32bit整数,r=_A0 429 extern int _mm_cvtsi128_si32(__m128i _A); 430 431 //Miscellaneous Operations(Integer SSE2 Intrinsics) 432 //返回一个__m128i的寄存器,r0=SignedSaturate(_A0), r1=SignedSaturate(_A1), ... 433 //r7=SignedSaturate(_A7), r8=SignedSaturate(_B0), r9=SignedSaturate(_B1), ... 434 //r15=SignedSaturate(_B7), saturate 435 extern __m128i _mm_packs_epi16(__m128i _A, __m128i _B); 436 //返回一个__m128i的寄存器,r0=SignedSaturate(_A0), r1=SignedSaturate(_A1), 437 //r2=SignedSaturate(_A2),r3=SignedSaturate(_A3), r4=SignedSaturate(_B0), 438 //r5=SignedSaturate(_B1), r6=SignedSaturate(_B2), r7=SignedSaturate(_B3), saturate 439 extern __m128i _mm_packs_epi32(__m128i _A, __m128i _B); 440 //返回一个__m128i的寄存器,r0=UnsignedSaturate(_A0), r1=UnsignedSaturate(_A1), ... 
441 //r7=UnsignedSaturate(_A7),r8=UnsignedSaturate(_B0), r9=UnsignedSaturate(_B1), ... 442 //r15=UnsignedSaturate(_B7), saturate 443 extern __m128i _mm_packus_epi16(__m128i _A, __m128i _B); 444 //返回一个16bit整数,根据_Imm从_A中8个16bit数中选取对应编号的数, 445 //r=(_Imm == 0) ? _A0 : ((_Imm == 1) ? _A1 : ... (_Imm == 7) ? _A7), 446 //_Imm must be an immediate, zero extends 447 extern int _mm_extract_epi16(__m128i _A, int _Imm); 448 //返回一个__m128i的寄存器,根据_Imm将_A中8个16bit数中对应编号的数替换为_B, 449 //r0=(_Imm == 0) ? _B : _A0; r1=(_Imm == 1) : _B : _A1, ... r7=(_Imm == 7) ? _B : _A7 450 extern __m128i _mm_insert_epi16(__m128i _A, int _B, int _Imm); 451 //返回一个16bit整数,r=(_A15[7] << 15) | (_A14[7] << 14) ... (_A1[7] << 1) | _A0[7], 452 //zero extends the upper bits 453 extern int _mm_movemask_epi8(__m128i _A); 454 //返回一个__m128i的寄存器,它是将_A中128bit数据以32bit为单位重新排列得到的,_Imm为有 455 //一个四元组,表示重新排列的顺序。当_A中原本存储的整数为16bit时,这条指令将其两两一组 456 //进行排列。例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7), _Imm=(2,3,0,1),其中_Ai为16bit整数, 457 //_A0为低位,返回结果为(_A2,_A3,_A0,_A1,_A6,_A7,_A4,_A5), _Imm must be an immediate 458 extern __m128i _mm_shuffle_epi32(__m128i _A, int _Imm); 459 //返回一个__m128i的寄存器,它是将_A中高64bit数据以16bit为单位重新排列得到的,_Imm为一个四元组, 460 //表示重新排列的顺序。_A中低64bit数据顺序不变。例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7), 461 //_Imm=(2,3,0,1),其中_Ai为16bit整数,_A0为低位,返回结果为(_A0,_A1,_A2,_A3,_A5,_A4,_A7,_A6), 462 //_Imm must be an immediate 463 extern __m128i _mm_shufflehi_epi16(__m128i _A, int _Imm); 464 //返回一个__m128i的寄存器,它是将_A中低64bit数据以16bit为单位重新排列得到的,_Imm为一个四元组, 465 //表示重新排列的顺序。_A中高64bit数据顺序不变。例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7), 466 //_Imm=(2,3,0,1),其中_Ai为16bit整数,_A0为低位,返回结果为(_A1,_A0,_A3,_A2,_A5,_A4,_A7,_A6), 467 //_Imm must be an immediate 468 extern __m128i _mm_shufflelo_epi16(__m128i _A, int _Imm); 469 //返回一个__m128i的寄存器,r0=_A8, r1=_B8, r2=_A9, r3=_B9, ... 
r14=_A15, r15=_B15 470 extern __m128i _mm_unpackhi_epi8(__m128i _A, __m128i _B); 471 //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的高64bit数以16bit为单位交织在一块。 472 //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7), 473 //其中_Ai,_Bi为16bit整数,_A0,_B0为低位,返回结果为(_A4,_B4,_A5,_B5,_A6,_B6,_A7,_B7), 474 //r0=_A4, r1=_B4, r2=_A5, r3=_B5, r4=_A6, r5=_B6, r6=_A7, r7=_B7 475 extern __m128i _mm_unpackhi_epi16(__m128i _A, __m128i _B); 476 //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的高64bit数以32bit为单位交织在一块。 477 //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7), 478 //其中_Ai,_Bi为16bit整数,_A0,_B0为低位,返回结果为(_A4,_A5,_B4,_B5,_A6,_A7,_B6,_B7), 479 //r0=_A2, r1=_B2, r2=_A3, r3=_B3 480 extern __m128i _mm_unpackhi_epi32(__m128i _A, __m128i _B); 481 //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的高64bit数以64bit为单位交织在一块。 482 //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7), 483 //其中_Ai,_Bi为16bit整数,_A0,_B0为低位, 484 //返回结果为(_A4,_A5,_A6,_A7,_B4,_B5,_B6,_B7), r0=_A1, r1=_B1 485 extern __m128i _mm_unpackhi_epi64(__m128i _A, __m128i _B); 486 //返回一个__m128i的寄存器,r0=_A0, r1=_B0, r2=_A1, r3=_B1, ... 
r14=_A7, r15=_B7 487 extern __m128i _mm_unpacklo_epi8(__m128i _A, __m128i _B); 488 //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的低64bit数以16bit为单位交织在一块。 489 //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7), 490 //其中_Ai,_Bi为16bit整数,_A0,_B0为低位,返回结果为(_A0,_B0,_A1,_B1,_A2,_B2,_A3,_B3), 491 //r0=_A0, r1=_B0, r2=_A1, r3=_B1, r4=_A2, r5=_B2, r6=_A3, r7=_B3 492 extern __m128i _mm_unpacklo_epi16(__m128i _A, __m128i _B); 493 //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的低64bit数以32bit为单位交织在一块。 494 //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7), 495 //其中_Ai,_Bi为16bit整数,_A0,_B0为低位,返回结果为(_A0,_A1,_B0,_B1,_A2,_A3,_B2,_B3), 496 //r0=_A0, r1=_B0, r2=_A1, r3=_B1 497 extern __m128i _mm_unpacklo_epi32(__m128i _A, __m128i _B); 498 //返回一个__m128i的寄存器,它将寄存器_A和寄存器_B的低64bit数以64bit为单位交织在一块。 499 //例如,_A=(_A0,_A1,_A2,_A3,_A4,_A5,_A6,_A7),_B=(_B0,_B1,_B2,_B3,_B4,_B5,_B6,_B7), 500 //其中_Ai,_Bi为16bit整数,_A0,_B0为低位,返回结果为(_A0,_A1,_A2,_A3,_B0,_B1,_B2,_B3), 501 //r0=_A0, r1=_B0 502 extern __m128i _mm_unpacklo_epi64(__m128i _A, __m128i _B); 503 504 //Load Operations(Integer SSE2 Intrinsics) 505 //返回为一个__m128i的寄存器,它将_P指向的数据读到指定寄存器中,实际使用时, 506 //_P一般是通过类型转换得到的, Address _P must be 16-byte aligned 507 extern __m128i _mm_load_si128(__m128i const*_P); 508 //返回一个__m128i的寄存器,Loads 128-bit value, Address _P does not need to be 16-byte aligned 509 extern __m128i _mm_loadu_si128(__m128i const*_P); 510 //返回一个__m128i的寄存器,r0=*p[63:0], r1=0x0, zeroing the upper 64 bits of the result 511 extern __m128i _mm_loadl_epi64(__m128i const*_P); 512 513 //Set Operations(Integer SSE2 Intrinsics) 514 //返回一个__m128i的寄存器,r0=_Q0, r1=_Q1 515 extern __m128i _mm_set_epi64(__m64 _Q1, __m64 _Q0); 516 //返回一个__m128i的寄存器,r0=_I0, r1=_I1, r2=_I2, r3=_I3 517 extern __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0); 518 //返回一个__m128i的寄存器,使用8个具体的short型数据来设置寄存器存放数据, 519 //r0=_W0, r1=_W1, ... 
r7=_W7 520 extern __m128i _mm_set_epi16(short _W7, short _W6, short _W5, short _W4, 521 short _W3, short _W2, short _W1, short _W0); 522 //返回一个__m128i的寄存器,r0=_B0, r1=_B1, ... r15=_B15 523 extern __m128i _mm_set_epi8(char _B15, char _B14, char _B13, char _B12, char _B11, 524 char _B10, char _B9,char _B8, char _B7, char _B6, char _B5, char _B4, 525 char _B3, char _B2, char _B1, char _B0); 526 //返回一个__m128i的寄存器,r0=_Q, r1=_Q 527 extern __m128i _mm_set1_epi64(__m64 _Q); 528 //返回一个__m128i的寄存器,r0=_I, r1=_I, r2=_I, r3=_I 529 extern __m128i _mm_set1_epi32(int _I); 530 //返回一个__m128i的寄存器,r0=_W, r1=_W, ... r7=_W 531 extern __m128i _mm_set1_epi16(short _W); 532 //返回一个__m128i的寄存器,r0=_B, r1=_B, ... r15=_B 533 extern __m128i _mm_set1_epi8(char _B); 534 //返回一个__m128i的寄存器,r=_Q 535 extern __m128i _mm_setl_epi64(__m128i _Q); 536 //返回一个__m128i的寄存器,r0=_Q0, r1=_Q1 537 extern __m128i _mm_setr_epi64(__m64 _Q0, __m64 _Q1); 538 //返回一个__m128i的寄存器,r0=_I0, r1=_I1, r2=_I2, r3=_I3 539 extern __m128i _mm_setr_epi32(int _I0, int _I1, int _I2, int _I3); 540 //返回一个__m128i的寄存器,r0=_W0, r1=_W1, ... r7=_W7 541 extern __m128i _mm_setr_epi16(short _W0, short _W1, short _W2, short _W3, 542 short _W4, short _W5, short _W6, short _W7); 543 //返回一个__m128i的寄存器,r0=_B15, r1=_B14, ... 
r15=_B0 544 extern __m128i _mm_setr_epi8(char _B15, char _B14, char _B13, char _B12, char _B11, 545 char _B10, char _B9, char _B8, char _B7, char _B6, char _B5, char _B4, 546 char _B3, char _B2, char _B1, char _B0); 547 //返回一个__m128i的寄存器,r=0x0 548 extern __m128i _mm_setzero_si128(void); 549 550 //Store Operations(Integer SSE2 Intrinsics) 551 //返回为空,它将寄存器_B中的数据存储到_P指向的地址中,实际使用时, 552 //_P一般是通过类型转换得到的, *_P = _B, Address _P must be 16-byte aligned 553 extern void _mm_store_si128(__m128i *_P, __m128i _B); 554 //返回为空,*_P=_B, Address _P does not need to be 16-byte aligned 555 extern void _mm_storeu_si128(__m128i *_P, __m128i _B); 556 //返回为空,*_P[63:0] =_Q0, lower 64 bits 557 extern void _mm_storel_epi64(__m128i *_P, __m128i _Q); 558 //返回为空,if(_N0[7]) _P[0]=_D0, if(_N1[7]) _P[1]=_D1, ... if(_N15[7]) _P[15]=_D15, 559 //The high bit of each byte in the selector _N determines whether the corresponding byte 560 //in _D will be stored. Address _P does not need to be 16-byte aligned 561 extern void _mm_maskmoveu_si128(__m128i _D, __m128i _N, char *_P); 562 563 //Integer, moves 564 //返回一个__m128i的寄存器,r0=_Q0, r1=0x0, zeroing the upper bits 565 extern __m128i _mm_move_epi64(__m128i _Q); 566 //返回一个__m128i的寄存器,r0=_Q, r1=0x0, zeroing the upper bits 567 extern __m128i _mm_movpi64_epi64(__m64 _Q); 568 //返回一个__m64的寄存器,r=_Q0 569 extern __m64 _mm_movepi64_pi64(__m128i _Q); 570 571 //Cache Support for Streaming SIMD Extensions 2 Integer Operations 572 //返回为空,*_P=_A, Stores the data in _A to the address _P without polluting the caches. 573 //If the cache line containing address _P is already in the cache, the cache will be updated.
574 //Address _P must be 16-byte aligned 575 extern void _mm_stream_si128(__m128i *_P, __m128i _A); 576 //返回为空,Cache line containing _P is flushed and invalidated from 577 //all caches in the coherency domain 578 extern void _mm_clflush(void const*_P); 579 //返回为空,Guarantees that every load instruction that precedes, in program order, the load 580 //fence instruction is globally visible before any load instruction 581 //that follows the fence in program order 582 extern void _mm_lfence(void); 583 //返回为空,Guarantees that every memory access that precedes, in program order, 584 //the memory fence instruction is globally visible before any memory instruction 585 //that follows the fence in program order 586 extern void _mm_mfence(void); 587 //返回为空,*_P=_I, Stores the data in _I to the address _P without polluting the caches. 588 //If the cache line containing address _P is already in the cache, the cache will be updated 589 extern void _mm_stream_si32(int *_P, int _I); 590 //返回为空,The execution of the next instruction is delayed an implementation specific 591 //amount of time. The instruction does not modify the architectural state. When used in 592 //spin-wait loops, this intrinsic provides especially significant performance gain 593 extern void _mm_pause(void); 594 595 /*---Support for casting between various SP, DP, INT vector types.
Note that these do no 596 conversion of values, they just change the type----*/ 597 //返回一个__m128的寄存器,Applies a type cast to reinterpret two 64-bit floating 598 //point values passed in as a 128-bit parameter as packed 32-bit floating point values 599 extern __m128 _mm_castpd_ps(__m128d); 600 //返回一个__m128i的寄存器,Applies a type cast to reinterpret two 64-bit 601 //floating point values passed in as a 128-bit parameter as packed 32-bit integers 602 extern __m128i _mm_castpd_si128(__m128d); 603 //返回一个__m128d的寄存器,Applies a type cast to reinterpret four 32-bit floating 604 //point values passed in as a 128-bit parameter as packed 64-bit floating point values 605 extern __m128d _mm_castps_pd(__m128); 606 //返回一个__m128i的寄存器,Applies a type cast to reinterpret four 32-bit floating 607 //point values passed in as a 128-bit parameter as packed 32-bit integers 608 extern __m128i _mm_castps_si128(__m128); 609 //返回一个__m128的寄存器,Applies a type cast to reinterpret four 32-bit integers 610 //passed in as a 128-bit parameter as packed 32-bit floating point values 611 extern __m128 _mm_castsi128_ps(__m128i); 612 //返回一个__m128d的寄存器,Applies a type cast to reinterpret four 32-bit 613 //integers passed in as a 128-bit parameter as packed 64-bit floating point values 614 extern __m128d _mm_castsi128_pd(__m128i);
reference:
http://blog.csdn.net/fengbingchun/article/details/18460199