SSE2的简单理解,主要针对opencv 中的优化

加速主要是一条128位的指令可以一次处理多个运算

比如说 int 是32位的,128/32=4,如果是加法,一条指令就可以同时处理4个整数的加法

-----------------------------------------------------------------Load--------------------------------------------------------------------------------------------------------------------------

__m128i _mm_load_si128 (__m128i *p);

Loads 128-bit value. Address p must be 16-byte aligned. 必须对齐

r := *p

__m128i _mm_loadu_si128 (__m128i *p);

Loads 128-bit value. Address p does not need to be 16-byte aligned. 可以不对齐

r := *p


__m128i _mm_loadl_epi64(__m128i const*p);

Load the lower 64 bits of the value pointed to by p into the lower 64 bits of the result, zeroing the upper 64 bits of the result. 只加载低64位,结果的高64位补零

r0:= *p[63:0]
r1:=0x0

-------------------------------------------------------------Set---------------------------------------------------------------------------------------------------

__m128i _mm_set_epi64 (__m64 q1, __m64 q0);
Sets the 2 64-bit integer values.

#include #include #include using namespace std;
 
int main()
{
    __m64 a, b;
    a.m64_u64 = 0xA;
    b.m64_u64 = 0xB;
 
    __m128i c = _mm_set_epi64(a, b);
    cout << c.m128i_i64[0] << c.m128i_i64[1] << endl;
 
    return 0;
}

__m128i _mm_set_epi32 (int i3, int i2, int i1, int i0);
Sets the 4 signed 32-bit integer values.
r0 := i0
r1 := i1
r2 := i2
r3 := i3


__m128i _mm_set_epi16 (short w7, short w6,    short w5, short w4,   short w3, short w2,   short w1, short w0);
Sets the 8 signed 16-bit integer values.
r0 := w0
r1 := w1
...
r7 := w7

__m128i _mm_set_epi8 (类似上面的) 
Sets the 16 signed 8-bit integer values.


__m128i _mm_set1_epi32 (int i);
Sets the 4 signed 32-bit integer values to  i .

r0 := i
r1 := i
r2 := i
r3 := i
__m128i _mm_set1_epi64 (__m64 q);
__m128i _mm_set1_epi16 (short w);
__m128i _mm_set1_epi8 (char b); (类似set1_epi32)


--------------------------------------------------------------------Store----------------------------------------------------------------------------------

void _mm_store_si128 (__m128i *p, __m128i a);

Stores 128-bit value. Address p must be 16-byte aligned.

*p := a

void _mm_storeu_si128 (__m128i *p, __m128i a);(参考Load里面的)

void _mm_storel_epi64(__m128i *p, __m128i a);(参考Load里面的)


void _mm_maskmoveu_si128(__m128i d, __m128i n, char *p);

The high bit of each byte in the selector n determines whether the corresponding byte in d will be stored. Address p does not need to be 16-byte aligned.

if (n0[7]) p[0] := d0
if (n1[7]) p[1] := d1
...
if (n15[7]) p[15] := d15

-----------------------------------------------Logical--------------------------------------------------------------

Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b.

__m128i _mm_and_si128 (__m128i a, __m128i b);

r := a & b

__m128i _mm_andnot_si128 (__m128i a, __m128i b);

r := (~a) & b

__m128i _mm_or_si128 (__m128i a, __m128i b);

r := a | b

__m128i _mm_xor_si128 ( __m128i a, __m128i b);

r := a ^ b

-----------------------------------------------------------------------------------------------------------------------------------------------

__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b);
Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b.
r0 := a0 ; r1 := b0
r2 := a1 ; r3 := b1
r4 := a2 ; r5 := b2
r6 := a3 ; r7 := b3

__m128 _mm_cvtepi32_ps (__m128i a);
Converts the four signed 32-bit integer values of  a  to single-precision, floating-point values.

r0 := (float) a0
r1 := (float) a1
r2 := (float) a2
r3 := (float) a3


还需要别的可以访问下面的链接:

http://msdn.microsoft.com/en-us/library/hfhxtdwx(v=vs.100).aspx

你可能感兴趣的:(学习)