加速主要是一条128位的指令可以一次处理多个运算
比如说 int 是32位的,128/32=4,如果是加法,一条指令就可以处理4个整数的加法
-----------------------------------------------------------------Load--------------------------------------------------------------------------------------------------------------------------
__m128i _mm_load_si128 (__m128i *p);
Loads 128-bit value. Address p must be 16-byte aligned. 必须对齐
r := *p
__m128i _mm_loadu_si128 (__m128i *p);
Loads 128-bit value. Address p does not need to be 16-byte aligned. 可以不对齐
r := *p
__m128i _mm_loadl_epi64(__m128i const*p);
Load the lower 64 bits of the value pointed to by p into the lower 64 bits of the result, zeroing the upper 64 bits of the result. 只加载低64位,结果的高64位补零
r0:= *p[63:0] r1:=0x0
-------------------------------------------------------------Set---------------------------------------------------------------------------------------------------
__m128i _mm_set_epi64 (__m64 q1, __m64 q0); Sets the 2 64-bit integer values.
r0 := q0 r1 := q1
#include <iostream>
#include <mmintrin.h>
#include <emmintrin.h>
using namespace std;
// Demo for _mm_set_epi64: packs two 64-bit values into one 128-bit vector
// and prints both lanes. Relies on the MSVC-specific union members
// m64_u64 (of __m64) and m128i_i64 (of __m128i) to access the lanes.
int main()
{
    __m64 a, b;
    a.m64_u64 = 0xA;
    b.m64_u64 = 0xB;
    // The first argument becomes the upper 64-bit lane (r1), the second the
    // lower lane (r0), so lane 0 holds 0xB and lane 1 holds 0xA.
    __m128i c = _mm_set_epi64(a, b);
    // Print the lanes separated by a space; without a separator the output
    // would be the ambiguous "1110" instead of "11 10".
    cout << c.m128i_i64[0] << " " << c.m128i_i64[1] << endl;
    return 0;
}
__m128i _mm_set_epi32 (int i3, int i2, int i1, int i0);
Sets the 4 signed 32-bit integer values.
r0 := i0 r1 := i1 r2 := i2 r3 := i3
__m128i _mm_set_epi16 (short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0);
Sets the 8 signed 16-bit integer values.
r0 := w0 r1 := w1 ... r7 := w7
__m128i _mm_set_epi8 (类似上面的)Sets the 16 signed 8-bit integer values.
__m128i _mm_set1_epi32 (int i);Sets the 4 signed 32-bit integer values to i .
r0 := i r1 := i r2 := i r3 := i
__m128i _mm_set1_epi64 (__m64 q);
__m128i _mm_set1_epi16 (short w);
__m128i _mm_set1_epi8 (char b); (类似set1_epi32)
--------------------------------------------------------------------Store----------------------------------------------------------------------------------
void _mm_store_si128 (__m128i *p, __m128i a);
Stores 128-bit value. Address p must be 16-byte aligned.
*p := a
void _mm_storeu_si128 (__m128i *p, __m128i a);(参考Load里面的)
void _mm_storel_epi64(__m128i *p, __m128i a);(参考Load里面的)
void _mm_maskmoveu_si128(__m128i d, __m128i n, char *p);
The high bit of each byte in the selector n determines whether the corresponding byte in d will be stored. Address p does not need to be 16-byte aligned.
if (n0[7]) p[0] := d0 if (n1[7]) p[1] := d1 ... if (n15[7]) p[15] := d15
-----------------------------------------------Logical--------------------------------------------------------------
Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b.
__m128i _mm_and_si128 (__m128i a, __m128i b);
r := a & b
__m128i _mm_andnot_si128 (__m128i a, __m128i b);
r := (~a) & b
__m128i _mm_or_si128 (__m128i a, __m128i b);
r := a | b
__m128i _mm_xor_si128 ( __m128i a, __m128i b);
r := a ^ b
-----------------------------------------------------------------------------------------------------------------------------------------------
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b);
Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b.
r0 := a0 ; r1 := b0 r2 := a1 ; r3 := b1 r4 := a2 ; r5 := b2 r6 := a3 ; r7 := b3
__m128 _mm_cvtepi32_ps (__m128i a); Converts the four signed 32-bit integer values of a to single-precision, floating-point values.
r0 := (float) a0 r1 := (float) a1 r2 := (float) a2 r3 := (float) a3
还需要别的可以访问下面的链接:
http://msdn.microsoft.com/en-us/library/hfhxtdwx(v=vs.100).aspx