iteye_13202

AVX Intrinsics各函数介绍

SIMD相关头文件包括：

//#include //MMX
//#include //SSE(also include ivec.h)
//#include //SSE2(also include fvec.h)

#include  //MMX
#include  //SSE(include mmintrin.h)
#include  //SSE2(include xmmintrin.h)
#include  //SSE3(include emmintrin.h)
#include //SSSE3(include pmmintrin.h)
#include //SSE4.1(include tmmintrin.h)
#include //SSE4.2(include smmintrin.h)
#include //AES(include nmmintrin.h)
#include //AVX(include wmmintrin.h)
#include //(include immintrin.h)

mmintrin.h为MMX 头文件，其中__m64的定义为：

typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
    unsigned __int64    m64_u64;
    float               m64_f32[2];
    __int8              m64_i8[8];
    __int16             m64_i16[4];
    __int32             m64_i32[2];    
    __int64             m64_i64;
    unsigned __int8     m64_u8[8];
    unsigned __int16    m64_u16[4];
    unsigned __int32    m64_u32[2];
} __m64;

xmmintrin.h为SSE 头文件，此头文件里包含MMX头文件，其中__m128的定义为：

typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
     float               m128_f32[4];
     unsigned __int64    m128_u64[2];
     __int8              m128_i8[16];
     __int16             m128_i16[8];
     __int32             m128_i32[4];
     __int64             m128_i64[2];
     unsigned __int8     m128_u8[16];
     unsigned __int16    m128_u16[8];
     unsigned __int32    m128_u32[4];
 } __m128;

emmintrin.h为SSE2头文件，此头文件里包含SSE头文件，其中__m128i和__m128d的定义为：

typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
    __int8              m128i_i8[16];
    __int16             m128i_i16[8];
    __int32             m128i_i32[4];    
    __int64             m128i_i64[2];
    unsigned __int8     m128i_u8[16];
    unsigned __int16    m128i_u16[8];
    unsigned __int32    m128i_u32[4];
    unsigned __int64    m128i_u64[2];
} __m128i;

typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d {
    double              m128d_f64[2];
} __m128d;

immintrin.h为AVX头文件，此头文件里包含AES头文件，其中__m256、__m256d、__m256i的定义为：

typedef union __declspec(intrin_type) _CRT_ALIGN(32) __m256 { 
    float m256_f32[8];
} __m256;
    
typedef struct __declspec(intrin_type) _CRT_ALIGN(32) {
    double m256d_f64[4]; 
} __m256d;

typedef union  __declspec(intrin_type) _CRT_ALIGN(32) __m256i {
    __int8              m256i_i8[32];
    __int16             m256i_i16[16];
    __int32             m256i_i32[8];
    __int64             m256i_i64[4];
    unsigned __int8     m256i_u8[32];
    unsigned __int16    m256i_u16[16];
    unsigned __int32    m256i_u32[8];
    unsigned __int64    m256i_u64[4];
} __m256i;

immintrin.h文件中各函数的介绍：

/*
* Add Packed Double Precision Floating-Point Values
* **** VADDPD ymm1, ymm2, ymm3/m256
* Performs an SIMD add of the four packed double-precision floating-point
* values from the first source operand to the second source operand, and
* stores the packed double-precision floating-point results in the
* destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10+m20, r1=m11+m21, r2=m12+m22, r3=m13+m23
extern __m256d __cdecl _mm256_add_pd(__m256d m1, __m256d m2);

/*
* Add Packed Single Precision Floating-Point Values
* **** VADDPS ymm1, ymm2, ymm3/m256
* Performs an SIMD add of the eight packed single-precision floating-point
* values from the first source operand to the second source operand, and
* stores the packed single-precision floating-point results in the
* destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=m10+m20, r1=m11+m21, ..., r7=m17+m27
extern __m256 __cdecl _mm256_add_ps(__m256 m1, __m256 m2);

/*
* Add/Subtract Double Precision Floating-Point Values
* **** VADDSUBPD ymm1, ymm2, ymm3/m256
* Adds odd-numbered double-precision floating-point values of the first
* source operand with the corresponding double-precision floating-point
* values from the second source operand; stores the result in the odd-numbered
* values of the destination. Subtracts the even-numbered double-precision
* floating-point values from the second source operand from the corresponding
* double-precision floating values in the first source operand; stores the
* result into the even-numbered values of the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10-m20, r1=m11+m21, r2=m12-m22, r3=m13-m23
extern __m256d __cdecl _mm256_addsub_pd(__m256d m1, __m256d m2);

/*
* Add/Subtract Packed Single Precision Floating-Point Values
* **** VADDSUBPS ymm1, ymm2, ymm3/m256
* Adds odd-numbered single-precision floating-point values of the first source
* operand with the corresponding single-precision floating-point values from
* the second source operand; stores the result in the odd-numbered values of
* the destination. Subtracts the even-numbered single-precision floating-point
* values from the second source operand from the corresponding
* single-precision floating values in the first source operand; stores the
* result into the even-numbered values of the destination
*/
//m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27)
//则r0=m10-m20, r1=m11+m21, ... , r6=m16-m26, r7=m17+m27
extern __m256 __cdecl _mm256_addsub_ps(__m256 m1, __m256 m2);

/*
* Bitwise Logical AND of Packed Double Precision Floating-Point Values
* **** VANDPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND of the four packed double-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=(m10 & m20), r1=(m11 & m21), r2=(m12 & m22), r3=(m13 & m23)
extern __m256d __cdecl _mm256_and_pd(__m256d m1, __m256d m2);

/*
* Bitwise Logical AND of Packed Single Precision Floating-Point Values
* **** VANDPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND of the eight packed single-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27)
//则r0=(m10 & m20), r1=(m11 & m21), ..., r6=(m16 & m26), r7=(m17 & m27)
extern __m256 __cdecl _mm256_and_ps(__m256 m1, __m256 m2);

/*
* Bitwise Logical AND NOT of Packed Double Precision Floating-Point Values
* **** VANDNPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND NOT of the four packed double-precision
* floating-point values from the first source operand and the second source
* operand, and stores the result in the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=(~m10) & m20, r1=(~m11) & m21, r2=(~m12) & m22, r3=(~m13) & m23
extern __m256d __cdecl _mm256_andnot_pd(__m256d m1, __m256d m2);

/*
* Bitwise Logical AND NOT of Packed Single Precision Floating-Point Values
* **** VANDNPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND NOT of the eight packed single-precision
* floating-point values from the first source operand and the second source
* operand, and stores the result in the destination
*/
//m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27)
//则r0=(~m10) & m20, r1=(~m11) & m21), ..., r6=(~m16) & m26, r7=(~m17) & m27
extern __m256 __cdecl _mm256_andnot_ps(__m256 m1, __m256 m2);

/*
* Blend Packed Double Precision Floating-Point Values
* **** VBLENDPD ymm1, ymm2, ymm3/m256, imm8
* Double-Precision Floating-Point values from the second source operand are
* conditionally merged with values from the first source operand and written
* to the destination. The immediate bits [3:0] determine whether the
* corresponding Double-Precision Floating Point value in the destination is
* copied from the second source or first source. If a bit in the mask,
* orresponding to a word, is "1", then the Double-Precision Floating-Point
* value in the second source operand is copied, else the value in the first
* source operand is copied
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23), mask=[b3 b2 b1 b0]
//如果bn=1,则rn=m2n，如果bn=0, 则rn=m1n, 其中n为0,1,2,3
extern __m256d __cdecl _mm256_blend_pd(__m256d m1, __m256d m2, const int mask);

/*
* Blend Packed Single Precision Floating-Point Values
* **** VBLENDPS ymm1, ymm2, ymm3/m256, imm8
* Single precision floating point values from the second source operand are
* conditionally merged with values from the first source operand and written
* to the destination. The immediate bits [7:0] determine whether the
* corresponding single precision floating-point value in the destination is
* copied from the second source or first source. If a bit in the mask,
* corresponding to a word, is "1", then the single-precision floating-point
* value in the second source operand is copied, else the value in the first
* source operand is copied
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)，mask=[b7 b6...b1 b0]
//如果bn=1,则rn=m2n，如果bn=0, 则rn=m1n, 其中n为0,1,2,3,4,5,6,7
extern __m256 __cdecl _mm256_blend_ps(__m256 m1, __m256 m2, const int mask);

/*
* Blend Packed Double Precision Floating-Point Values
* **** VBLENDVPD ymm1, ymm2, ymm3/m256, ymm4
* Conditionally copy each quadword data element of double-precision
* floating-point value from the second source operand (third operand) and the
* first source operand (second operand) depending on mask bits defined in the
* mask register operand (fourth operand).
*/
extern __m256d __cdecl _mm256_blendv_pd(__m256d m1, __m256d m2, __m256d m3);

/*
* Blend Packed Single Precision Floating-Point Values
* **** VBLENDVPS ymm1, ymm2, ymm3/m256, ymm4
* Conditionally copy each dword data element of single-precision
* floating-point value from the second source operand (third operand) and the
* first source operand (second operand) depending on mask bits defined in the
* mask register operand (fourth operand).
*/
extern __m256 __cdecl _mm256_blendv_ps(__m256 m1, __m256 m2, __m256 mask);

/*
* Divide Packed Double-Precision Floating-Point Values
* **** VDIVPD ymm1, ymm2, ymm3/m256
* Performs an SIMD divide of the four packed double-precision floating-point
* values in the first source operand by the four packed double-precision
* floating-point values in the second source operand
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10/m20, r1=m11/m21, r2=m12/m22, r3=m13/m23
extern __m256d __cdecl _mm256_div_pd(__m256d m1, __m256d m2);

/* 
* Divide Packed Single-Precision Floating-Point Values
* **** VDIVPS ymm1, ymm2, ymm3/m256
* Performs an SIMD divide of the eight packed single-precision
* floating-point values in the first source operand by the eight packed
* single-precision floating-point values in the second source operand
*/
//m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27)
//则r0=m10/m20, r1=m11/m21, ..., r6=m16/m26, r7=m17/m27
extern __m256 __cdecl _mm256_div_ps(__m256 m1, __m256 m2);

/*
* Dot Product of Packed Single-Precision Floating-Point Values
* **** VDPPS ymm1, ymm2, ymm3/m256, imm8
* Multiplies the packed single precision floating point values in the
* first source operand with the packed single-precision floats in the
* second source. Each of the four resulting single-precision values is
* conditionally summed depending on a mask extracted from the high 4 bits
* of the immediate operand. This sum is broadcast to each of 4 positions
* in the destination if the corresponding bit of the mask selected from
* the low 4 bits of the immediate operand is "1". If the corresponding
* low bit 0-3 of the mask is zero, the destination is set to zero.
* The process is replicated for the high elements of the destination.
*/
//m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27)
//mask=[b7 b6 ... b0], mask的低四位决定结果值是0，还是m1和m2相应位相乘后再求和
//若b0b1b2b3为0001，则r0=r1=r2=0,m4=m5=m6=0,此时如果b4b5b6b7为1001，
//则r3=m10*m20+m13*m23, r7=m14*m24+m17*m27,其它依次类推
extern __m256 __cdecl _mm256_dp_ps(__m256 m1, __m256 m2, const int mask);

/*
* Add Horizontal Double Precision Floating-Point Values
* **** VHADDPD ymm1, ymm2, ymm3/m256
* Adds pairs of adjacent double-precision floating-point values in the
* first source operand and second source operand and stores results in
* the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10+m11, r1=m20+m21, r2=m12+m13, r3=m22+m23
extern __m256d __cdecl _mm256_hadd_pd(__m256d m1, __m256d m2);

/*
* Add Horizontal Single Precision Floating-Point Values
* **** VHADDPS ymm1, ymm2, ymm3/m256
* Adds pairs of adjacent single-precision floating-point values in the
* first source operand and second source operand and stores results in
* the destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=m10+m11, r1=m12+m13, r2=m20+m21, r3=m22+m23, 
//r4=m14+m15, r5=m16+m17, r6=m24+m25, r7=m26+m27
extern __m256 __cdecl _mm256_hadd_ps(__m256 m1, __m256 m2);

/*
* Subtract Horizontal Double Precision Floating-Point Values
* **** VHSUBPD ymm1, ymm2, ymm3/m256
* Subtract pairs of adjacent double-precision floating-point values in
* the first source operand and second source operand and stores results
* in the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10-m11, r1=m20-m21, r2=m12-m13, r3=m22-m23
extern __m256d __cdecl _mm256_hsub_pd(__m256d m1, __m256d m2);

/*
* Subtract Horizontal Single Precision Floating-Point Values
* **** VHSUBPS ymm1, ymm2, ymm3/m256
* Subtract pairs of adjacent single-precision floating-point values in
* the first source operand and second source operand and stores results
* in the destination.
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=m10-m11, r1=m12-m13, r2=m20-m21, r3=m22-m23, 
//r4=m14-m15, r5=m16-m17, r6=m24-m25, r7=m26-m27
extern __m256 __cdecl _mm256_hsub_ps(__m256 m1, __m256 m2);

/*
* Maximum of Packed Double Precision Floating-Point Values
* **** VMAXPD ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed double-precision floating-point
* values in the first source operand and the second source operand and
* returns the maximum value for each pair of values to the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=max(m10,m20), r1=max(m11,m21), r2=max(m12,m22), r3=max(m13,m23)
extern __m256d __cdecl _mm256_max_pd(__m256d m1, __m256d m2);

/*
* Maximum of Packed Single Precision Floating-Point Values
* **** VMAXPS ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed single-precision floating-point
* values in the first source operand and the second source operand and
* returns the maximum value for each pair of values to the destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=max(m10,m20), r1=max(m11,m21), ..., r6=max(m16,m26), r7=max(m17,m27) 
extern __m256 __cdecl _mm256_max_ps(__m256 m1, __m256 m2);

/*
* Minimum of Packed Double Precision Floating-Point Values
* **** VMINPD ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed double-precision floating-point
* values in the first source operand and the second source operand and
* returns the minimum value for each pair of values to the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=min(m10,m20), r1=min(m11,m21), r2=min(m12,m22), r3=min(m13,m23)
extern __m256d __cdecl _mm256_min_pd(__m256d m1, __m256d m2);

/*
* Minimum of Packed Single Precision Floating-Point Values
* **** VMINPS ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed single-precision floating-point
* values in the first source operand and the second source operand and
* returns the minimum value for each pair of values to the destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=min(m10,m20), r1=min(m11,m21), ..., r6=min(m16,m26), r7=min(m17,m27) 
extern __m256 __cdecl _mm256_min_ps(__m256 m1, __m256 m2);

/*
* Multiply Packed Double Precision Floating-Point Values
* **** VMULPD ymm1, ymm2, ymm3/m256
* Performs a SIMD multiply of the four packed double-precision floating-point
* values from the first Source operand to the Second Source operand, and
* stores the packed double-precision floating-point results in the
* destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10*m20, r1=m11*m21, r2=m12*m22, r3=m13*m23
extern __m256d __cdecl _mm256_mul_pd(__m256d m1, __m256d m2);

/*
* Multiply Packed Single Precision Floating-Point Values
* **** VMULPS ymm1, ymm2, ymm3/m256
* Performs an SIMD multiply of the eight packed single-precision
* floating-point values from the first source operand to the second source
* operand, and stores the packed double-precision floating-point results in
* the destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=m10*m20, r1=m11*m21, ..., r6=m16*m26, r7=m17*m27 
extern __m256 __cdecl _mm256_mul_ps(__m256 m1, __m256 m2);

/*
* Bitwise Logical OR of Packed Double Precision Floating-Point Values
* **** VORPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical OR of the four packed double-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//注意：有时得到的结果并不是m1和m2按位或的结果?
extern __m256d __cdecl _mm256_or_pd(__m256d m1, __m256d m2);

/*
* Bitwise Logical OR of Packed Single Precision Floating-Point Values
* **** VORPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical OR of the eight packed single-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//注意：有时得到的结果并不是m1和m2按位或的结果?
extern __m256 __cdecl _mm256_or_ps(__m256 m1, __m256 m2);

/*
* Shuffle Packed Double Precision Floating-Point Values
* **** VSHUFPD ymm1, ymm2, ymm3/m256, imm8
* Moves either of the two packed double-precision floating-point values from
* each double quadword in the first source operand into the low quadword
* of each double quadword of the destination; moves either of the two packed
* double-precision floating-point values from the second source operand into
* the high quadword of each double quadword of the destination operand.
* The selector operand determines which values are moved to the destination
*/
extern __m256d __cdecl _mm256_shuffle_pd(__m256d m1, __m256d m2, const int select);

/*
* Shuffle Packed Single Precision Floating-Point Values
* **** VSHUFPS ymm1, ymm2, ymm3/m256, imm8
* Moves two of the four packed single-precision floating-point values
* from each double qword of the first source operand into the low
* quadword of each double qword of the destination; moves two of the four
* packed single-precision floating-point values from each double qword of
* the second source operand into to the high quadword of each double qword
* of the destination. The selector operand determines which values are moved
* to the destination.
*/
extern __m256 __cdecl _mm256_shuffle_ps(__m256 m1, __m256 m2, const int select);

/*
* Subtract Packed Double Precision Floating-Point Values
* **** VSUBPD ymm1, ymm2, ymm3/m256
* Performs an SIMD subtract of the four packed double-precision floating-point
* values of the second Source operand from the first Source operand, and
* stores the packed double-precision floating-point results in the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10-m20, r1=m11-m21, r2=m12-m22, r3=m13-m23
extern __m256d __cdecl _mm256_sub_pd(__m256d m1, __m256d m2);

/*
* Subtract Packed Single Precision Floating-Point Values
* **** VSUBPS ymm1, ymm2, ymm3/m256
* Performs an SIMD subtract of the eight packed single-precision
* floating-point values in the second Source operand from the First Source
* operand, and stores the packed single-precision floating-point results in
* the destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=m10-m20, r1=m11-m21, ..., r6=m16-m26, r7=m17-m27 
extern __m256 __cdecl _mm256_sub_ps(__m256 m1, __m256 m2);

/*
* Bitwise Logical XOR of Packed Double Precision Floating-Point Values
* **** VXORPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical XOR of the four packed double-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//注意：有时得到的结果并不是m1和m2按位异或的结果?
extern __m256d __cdecl _mm256_xor_pd(__m256d m1, __m256d m2);

/*
* Bitwise Logical XOR of Packed Single Precision Floating-Point Values
* **** VXORPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical XOR of the eight packed single-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//注意：有时得到的结果并不是m1和m2按位异或的结果?
extern __m256 __cdecl _mm256_xor_ps(__m256 m1, __m256 m2);

/*
* Compare Packed Double-Precision Floating-Point Values
* **** VCMPPD xmm1, xmm2, xmm3/m128, imm8
* **** VCMPPD ymm1, ymm2, ymm3/m256, imm8
* Performs an SIMD compare of the four packed double-precision floating-point
* values in the second source operand (third operand) and the first source
* operand (second operand) and returns the results of the comparison to the
* destination operand (first operand). The comparison predicate operand
* (immediate) specifies the type of comparison performed on each of the pairs
* of packed values.
* For 128-bit intrinsic with compare predicate values in range 0-7 compiler
* may generate SSE2 instructions if it is warranted for performance reasons.
*/
extern __m128d __cdecl _mm_cmp_pd(__m128d m1, __m128d m2, const int predicate);
extern __m256d __cdecl _mm256_cmp_pd(__m256d m1, __m256d m2, const int predicate);

/*
* Compare Packed Single-Precision Floating-Point Values
* **** VCMPPS xmm1, xmm2, xmm3/m256, imm8
* **** VCMPPS ymm1, ymm2, ymm3/m256, imm8
* Performs a SIMD compare of the packed single-precision floating-point values
* in the second source operand (third operand) and the first source operand
* (second operand) and returns the results of the comparison to the destination
* operand (first operand). The comparison predicate operand (immediate)
* specifies the type of comparison performed on each of the pairs of packed
* values.
* For 128-bit intrinsic with compare predicate values in range 0-7 compiler
* may generate SSE2 instructions if it is warranted for performance reasons.
*/
extern __m128 __cdecl _mm_cmp_ps(__m128 m1, __m128 m2, const int predicate);
extern __m256 __cdecl _mm256_cmp_ps(__m256 m1, __m256 m2, const int predicate);

/*
* Compare Scalar Double-Precision Floating-Point Values
* **** VCMPSD xmm1, xmm2, xmm3/m64, imm8
* Compares the low double-precision floating-point values in the second source
* operand (third operand) and the first source operand (second operand) and
* returns the results in of the comparison to the destination operand (first
* operand). The comparison predicate operand (immediate operand) specifies the
* type of comparison performed.
* For compare predicate values in range 0-7 compiler may generate SSE2
* instructions if it is warranted for performance reasons.
*/
extern __m128d __cdecl _mm_cmp_sd(__m128d m1, __m128d m2, const int predicate);

/*
* Compare Scalar Single-Precision Floating-Point Values
* **** VCMPSS xmm1, xmm2, xmm3/m64, imm8
* Compares the low single-precision floating-point values in the second source
* operand (third operand) and the first source operand (second operand) and
* returns the results of the comparison to the destination operand (first
* operand). The comparison predicate operand (immediate operand) specifies
* the type of comparison performed.
* For compare predicate values in range 0-7 compiler may generate SSE2
* instructions if it is warranted for performance reasons.
*/
extern __m128 __cdecl _mm_cmp_ss(__m128 m1, __m128 m2, const int predicate);

/*
* Convert Packed Doubleword Integers to
* Packed Double-Precision Floating-Point Values
* **** VCVTDQ2PD ymm1, xmm2/m128
* Converts four packed signed doubleword integers in the source operand to
* four packed double-precision floating-point values in the destination
*/
//从__int32类型转换到double类型
extern __m256d __cdecl _mm256_cvtepi32_pd(__m128i m1);

/*
* Convert Packed Doubleword Integers to
* Packed Single-Precision Floating-Point Values
* **** VCVTDQ2PS ymm1, ymm2/m256
* Converts eight packed signed doubleword integers in the source operand to
* eight packed double-precision floating-point values in the destination
*/
//从__int32类型转换到float类型
extern __m256  __cdecl _mm256_cvtepi32_ps(__m256i m1);

/*
* Convert Packed Double-Precision Floating-point values to
* Packed Single-Precision Floating-Point Values
* **** VCVTPD2PS xmm1, ymm2/m256
* Converts four packed double-precision floating-point values in the source
* operand to four packed single-precision floating-point values in the
* destination
*/
//从double类型转换到float类型
extern __m128  __cdecl _mm256_cvtpd_ps(__m256d m1);

/*
* Convert Packed Single Precision Floating-Point Values to
* Packed Singed Doubleword Integer Values
* **** VCVTPS2DQ ymm1, ymm2/m256
* Converts eight packed single-precision floating-point values in the source
* operand to eight signed doubleword integers in the destination
*/
//从float类型转换到__int32类型
extern __m256i __cdecl _mm256_cvtps_epi32(__m256 m1);

/*
* Convert Packed Single Precision Floating-point values to
* Packed Double Precision Floating-Point Values
* **** VCVTPS2PD ymm1, xmm2/m128
* Converts four packed single-precision floating-point values in the source
* operand to four packed double-precision floating-point values in the
* destination
*/
//从float类型转换到double类型
extern __m256d __cdecl _mm256_cvtps_pd(__m128 m1);

/*
* Convert with Truncation Packed Double-Precision Floating-Point values to
* Packed Doubleword Integers
* **** VCVTTPD2DQ xmm1, ymm2/m256
* Converts four packed double-precision floating-point values in the source
* operand to four packed signed doubleword integers in the destination.
* When a conversion is inexact, a truncated (round toward zero) value is
* returned. If a converted result is larger than the maximum signed doubleword
* integer, the floating-point invalid exception is raised, and if this
* exception is masked, the indefinite integer value (80000000H) is returned
*/
//从double类型转换到__int32类型，truncated
extern __m128i __cdecl _mm256_cvttpd_epi32(__m256d m1);

/*
* Convert Packed Double-Precision Floating-point values to
* Packed Doubleword Integers
* **** VCVTPD2DQ xmm1, ymm2/m256
* Converts four packed double-precision floating-point values in the source
* operand to four packed signed doubleword integers in the destination
*/
//从double类型转换到__int32类型
extern __m128i __cdecl _mm256_cvtpd_epi32(__m256d m1);

/*
* Convert with Truncation Packed Single Precision Floating-Point Values to
* Packed Singed Doubleword Integer Values
* **** VCVTTPS2DQ ymm1, ymm2/m256
* Converts eight packed single-precision floating-point values in the source
* operand to eight signed doubleword integers in the destination.
* When a conversion is inexact, a truncated (round toward zero) value is
* returned. If a converted result is larger than the maximum signed doubleword
* integer, the floating-point invalid exception is raised, and if this
* exception is masked, the indefinite integer value (80000000H) is returned
*/
//从float类型转换到__int32类型,truncated
extern __m256i __cdecl _mm256_cvttps_epi32(__m256 m1);

/*
* Extract packed floating-point values
* **** VEXTRACTF128 xmm1/m128, ymm2, imm8
* Extracts 128-bits of packed floating-point values from the source operand
* at an 128-bit offset from imm8[0] into the destination
*/
//offset:a constant integer value that represents the 128-bit offset from 
//where extraction must start
//从256位中提取128位，offset决定提取的起始位置
extern __m128  __cdecl _mm256_extractf128_ps(__m256 m1, const int offset);
extern __m128d __cdecl _mm256_extractf128_pd(__m256d m1, const int offset);
extern __m128i __cdecl _mm256_extractf128_si256(__m256i m1, const int offset);

/*
* Zero All YMM registers
* **** VZEROALL
* Zeros contents of all YMM registers
*/
extern void __cdecl _mm256_zeroall(void);

/*
* Zero Upper bits of YMM registers
* **** VZEROUPPER
* Zeros the upper 128 bits of all YMM registers. The lower 128-bits of the
* registers (the corresponding XMM registers) are unmodified
*/
extern void __cdecl _mm256_zeroupper(void);

/*
* Permute Single-Precision Floating-Point Values
* **** VPERMILPS ymm1, ymm2, ymm3/m256
* **** VPERMILPS xmm1, xmm2, xmm3/m128
* Permute Single-Precision Floating-Point values in the first source operand
* using 8-bit control fields in the low bytes of corresponding elements the
* shuffle control and store results in the destination
*/
//control:a vector with 2-bit control fields, one for each corresponding element 
//of the source vector, for the 256-bit m1 source vector this control vector
//contains eight 2-bit control fields,for the 128-bit m1 source vector this 
//control vector contains four 2-bit control fields
extern __m256  __cdecl _mm256_permutevar_ps(__m256 m1, __m256i control);
extern __m128  __cdecl _mm_permutevar_ps(__m128 a, __m128i control);

/*
* Permute Single-Precision Floating-Point Values
* **** VPERMILPS ymm1, ymm2/m256, imm8
* **** VPERMILPS xmm1, xmm2/m128, imm8
* Permute Single-Precision Floating-Point values in the first source operand
* using four 2-bit control fields in the 8-bit immediate and store results
* in the destination
*/
//control:an integer specified as an 8-bit immediate;for the 256-bit m1 vector
//this integer contains four 2-bit control fields in the low 8 bits of 
//the immediate, for the 128-bit m1 vector this integer contains two 2-bit
//control fields in the low 4 bits of the immediate
extern __m256  __cdecl _mm256_permute_ps(__m256 m1, int control);
extern __m128  __cdecl _mm_permute_ps(__m128 a, int control);

/*
* Permute Double-Precision Floating-Point Values
* **** VPERMILPD ymm1, ymm2, ymm3/m256
* **** VPERMILPD xmm1, xmm2, xmm3/m128
* Permute Double-Precision Floating-Point values in the first source operand
* using 8-bit control fields in the low bytes of the second source operand
* and store results in the destination
*/
//control:a vector with 1-bit control fields, one for each corresponding element
//of the source vector, for the 256-bit m1 source vector this control vector 
//contains four 1-bit control fields in the low 4 bits of the immediate, for the 
//128-bit m1 source vector this control vector contains two 1-bit control fields
//in the low 2 bits of the immediate
extern __m256d __cdecl _mm256_permutevar_pd(__m256d m1, __m256i control);
extern __m128d __cdecl _mm_permutevar_pd(__m128d a, __m128i control);

/*
* Permute Double-Precision Floating-Point Values
* **** VPERMILPD ymm1, ymm2/m256, imm8
* **** VPERMILPD xmm1, xmm2/m128, imm8
* Permute Double-Precision Floating-Point values in the first source operand
* using two, 1-bit control fields in the low 2 bits of the 8-bit immediate
* and store results in the destination
*/
//control:an integer specified as an 8-bit immediate; for the 256-bit m1 vector
//this integer contains four 1-bit control fields in the low 4 bits of the 
//immediate, for the 128-bit m1 vector this integer contains two 1-bit
//control fields in the low 2 bits of the immediate
extern __m256d __cdecl _mm256_permute_pd(__m256d m1, int control);
extern __m128d __cdecl _mm_permute_pd(__m128d a, int control);

/*
* Permute Floating-Point Values
* **** VPERM2F128 ymm1, ymm2, ymm3/m256, imm8
* Permute 128 bit floating-point-containing fields from the first source
* operand and second source operand using bits in the 8-bit immediate and
* store results in the destination
*/
//control:an immediate byte that specifies two 2-bit control fields and two 
//additional bits which specify zeroing behavior
extern __m256  __cdecl _mm256_permute2f128_ps(__m256 m1, __m256 m2, int control);
extern __m256d __cdecl _mm256_permute2f128_pd(__m256d m1, __m256d m2, int control);
extern __m256i __cdecl _mm256_permute2f128_si256(__m256i m1, __m256i m2, int control);

/*
* Load with Broadcast
* **** VBROADCASTSS ymm1, m32
* **** VBROADCASTSS xmm1, m32
* Load floating point values from the source operand and broadcast to all
* elements of the destination
*/
//*a:pointer to a memory location that can hold constant 256-bit or
//128-bit float32 values, 则r0=r1=...=rn=a[0]
extern __m256  __cdecl _mm256_broadcast_ss(float const *a);
extern __m128  __cdecl _mm_broadcast_ss(float const *a);

/*
* Load with Broadcast
* **** VBROADCASTSD ymm1, m64
* Load floating point values from the source operand and broadcast to all
* elements of the destination
*/
//则r0=r1=r2=r3=a[0]
extern __m256d __cdecl _mm256_broadcast_sd(double const *a);

/*
* Load with Broadcast
* **** VBROADCASTF128 ymm1, m128
* Load floating point values from the source operand and broadcast to all
* elements of the destination
*/
//若*a为a[0],a[1],则r0=r2=a[0], r1=r3=a[1]
extern __m256  __cdecl _mm256_broadcast_ps(__m128 const *a);
extern __m256d __cdecl _mm256_broadcast_pd(__m128d const *a);

/*
* Insert packed floating-point values
* **** VINSERTF128 ymm1, ymm2, xmm3/m128, imm8
* Performs an insertion of 128-bits of packed floating-point values from the
* second source operand into an the destination at an 128-bit offset from
* imm8[0]. The remaining portions of the destination are written by the
* corresponding fields of the first source operand
*/
//offset:an integer value that represents the 128-bit offset
//where the insertion must start
//The remaining portions of the destination are written by the corresponding
//elements of the first source vector, a
extern __m256  __cdecl _mm256_insertf128_ps(__m256 a, __m128 b, int offset);
extern __m256d __cdecl _mm256_insertf128_pd(__m256d a, __m128d b, int offset);
extern __m256i __cdecl _mm256_insertf128_si256(__m256i a, __m128i b, int offset);

/*
* Move Aligned Packed Double-Precision Floating-Point Values
* **** VMOVAPD ymm1, m256
* **** VMOVAPD m256, ymm1
* Moves 4 double-precision floating-point values from the source operand to
* the destination
*/
//*a:the address must be 32-byte aligned
extern __m256d __cdecl _mm256_load_pd(double const *a);
extern void    __cdecl _mm256_store_pd(double *a, __m256d b);

/*
* Move Aligned Packed Single-Precision Floating-Point Values
* **** VMOVAPS ymm1, m256
* **** VMOVAPS m256, ymm1
* Moves 8 single-precision floating-point values from the source operand to
* the destination
*/
//*a:the address must be 32-byte aligned
extern __m256  __cdecl _mm256_load_ps(float const *a);
extern void    __cdecl _mm256_store_ps(float *a, __m256 b);

/*
* Move Unaligned Packed Double-Precision Floating-Point Values
* **** VMOVUPD ymm1, m256
* **** VMOVUPD m256, ymm1
* Moves 256 bits of packed double-precision floating-point values from the
* source operand to the destination
*/
//The address a does not need to be 32-byte aligned  
extern __m256d __cdecl _mm256_loadu_pd(double const *a);
extern void    __cdecl _mm256_storeu_pd(double *a, __m256d b);

/*
* Move Unaligned Packed Single-Precision Floating-Point Values
* **** VMOVUPS ymm1, m256
* **** VMOVUPS m256, ymm1
* Moves 256 bits of packed single-precision floating-point values from the
* source operand to the destination
*/
//The address a does not need to be 32-byte aligned  
extern __m256  __cdecl _mm256_loadu_ps(float const *a);
extern void    __cdecl _mm256_storeu_ps(float *a, __m256 b);

/*
* Move Aligned Packed Integer Values
* **** VMOVDQA ymm1, m256
* **** VMOVDQA m256, ymm1
* Moves 256 bits of packed integer values from the source operand to the
* destination
*/
//The address a does not need to be 32-byte aligned  
extern __m256i __cdecl _mm256_load_si256(__m256i const *a);
extern void    __cdecl _mm256_store_si256(__m256i *a, __m256i b);

/*
* Move Unaligned Packed Integer Values
* **** VMOVDQU ymm1, m256
* **** VMOVDQU m256, ymm1
* Moves 256 bits of packed integer values from the source operand to the
* destination
*/
//The address a does not need to be 32-byte aligned  
extern __m256i __cdecl _mm256_loadu_si256(__m256i const *a);
extern void    __cdecl _mm256_storeu_si256(__m256i *a, __m256i b); 

/*
* Conditional SIMD Packed Loads and Stores
* **** VMASKMOVPD xmm1, xmm2, m128
* **** VMASKMOVPD ymm1, ymm2, m256
* **** VMASKMOVPD m128, xmm1, xmm2
* **** VMASKMOVPD m256, ymm1, ymm2
*
* Load forms:
* Load packed values from the 128-bit (XMM forms) or 256-bit (YMM forms)
* memory location (third operand) into the destination XMM or YMM register
* (first operand) using a mask in the first source operand (second operand).
*
* Store forms:
* Stores packed values from the XMM or YMM register in the second source
* operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms)
* memory location using a mask in first source operand (second operand).
* Stores are atomic.
*/
//The mask is calculated from the most significant bit of each qword of the mask
//register. If any of the bits of the mask is set to zero, the corresponding value
//from the memory location is not loaded, and the corresponding field of the
//destination vector is set to zero.
extern __m256d __cdecl _mm256_maskload_pd(double const *a, __m256i mask);
extern void    __cdecl _mm256_maskstore_pd(double *a, __m256i mask, __m256d b);
extern __m128d __cdecl _mm_maskload_pd(double const *a, __m128i mask);
extern void    __cdecl _mm_maskstore_pd(double *a, __m128i mask, __m128d b); 

/*
* Conditional SIMD Packed Loads and Stores
* **** VMASKMOVPS xmm1, xmm2, m128
* **** VMASKMOVPS ymm1, ymm2, m256
* **** VMASKMOVPS m128, xmm1, xmm2
* **** VMASKMOVPS m256, ymm1, ymm2
*
* Load forms:
* Load packed values from the 128-bit (XMM forms) or 256-bit (YMM forms)
* memory location (third operand) into the destination XMM or YMM register
* (first operand) using a mask in the first source operand (second operand).
*
* Store forms:
* Stores packed values from the XMM or YMM register in the second source
* operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms)
* memory location using a mask in first source operand (second operand).
* Stores are atomic.
*/
//The mask is calculated from the most significant bit of each dword of the mask 
//register. If any of the bits of the mask is set to zero, the corresponding 
//value from the memory location is not loaded, and the corresponding field of
//the destination vector is set to zero.
extern __m256  __cdecl _mm256_maskload_ps(float const *a, __m256i mask);
extern void    __cdecl _mm256_maskstore_ps(float *a, __m256i mask, __m256 b); 
extern __m128  __cdecl _mm_maskload_ps(float const *a, __m128i mask);
extern void    __cdecl _mm_maskstore_ps(float *a, __m128i mask, __m128 b); 

/*
* Replicate Single-Precision Floating-Point Values
* **** VMOVSHDUP ymm1, ymm2/m256
* Duplicates odd-indexed single-precision floating-point values from the
* source operand
*/
//a=(a0, a1, a2, a3, a4, a5, a6, a7);则r=(a1, a1, a3, a3, a5, a5, a7, a7)
extern __m256  __cdecl _mm256_movehdup_ps(__m256 a);

/*
* Replicate Single-Precision Floating-Point Values
* **** VMOVSLDUP ymm1, ymm2/m256
* Duplicates even-indexed single-precision floating-point values from the
* source operand
*/
//a=(a0, a1, a2, a3, a4, a5, a6, a7);则r=(a0, a0, a2, a2, a4, a4, a6, a6)
extern __m256  __cdecl _mm256_moveldup_ps(__m256 a);

/*
* Replicate Double-Precision Floating-Point Values
* **** VMOVDDUP ymm1, ymm2/m256
* Duplicates even-indexed double-precision floating-point values from the
* source operand
*/
//a=(a0, a1, a2, a3), 则r=(a0, a0, a2, a2)
extern __m256d __cdecl _mm256_movedup_pd(__m256d a);

/*
* Move Unaligned Integer
* **** VLDDQU ymm1, m256
* The instruction is functionally similar to VMOVDQU YMM, m256 for loading
* from memory. That is: 32 bytes of data starting at an address specified by
* the source memory operand are fetched from memory and placed in a
* destination
*/
//*a:points to a memory location from where unaligned integer value must be moved
extern __m256i __cdecl _mm256_lddqu_si256(__m256i const *a);

/*
* Store Packed Integers Using Non-Temporal Hint
* **** VMOVNTDQ m256, ymm1
* Moves the packed integers in the source operand to the destination using a
* non-temporal hint to prevent caching of the data during the write to memory
*/
//the address must be 32-byte aligned
extern void    __cdecl _mm256_stream_si256(__m256i *p, __m256i a);

/*
* Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
* **** VMOVNTPD m256, ymm1
* Moves the packed double-precision floating-point values in the source
* operand to the destination operand using a non-temporal hint to prevent
* caching of the data during the write to memory
*/
//the address must be 32-byte aligned
extern void    __cdecl _mm256_stream_pd(double *p, __m256d a);

/*
* Store Packed Single-Precision Floating-Point Values Using Non-Temporal Hint
* **** VMOVNTPS m256, ymm1
* Moves the packed single-precision floating-point values in the source
* operand to the destination operand using a non-temporal hint to prevent
* caching of the data during the write to memory
*/
//the address must be 32-byte aligned
extern void    __cdecl _mm256_stream_ps(float *p, __m256 a);

/*
* Compute Approximate Reciprocals of Packed Single-Precision Floating-Point Values
* **** VRCPPS ymm1, ymm2/m256
* Performs an SIMD computation of the approximate reciprocals of the eight
* packed single precision floating-point values in the source operand and
* stores the packed single-precision floating-point results in the destination
*/
//a=(a0, a1, a2, ..., a6, a7);
//则r=(1/a0, 1/a1, ..., 1/a6, 1/a7), 求倒数
extern __m256  __cdecl _mm256_rcp_ps(__m256 a);

/*
* Compute Approximate Reciprocals of Square Roots of
* Packed Single-Precision Floating-point Values
* **** VRSQRTPS ymm1, ymm2/m256
* Performs an SIMD computation of the approximate reciprocals of the square
* roots of the eight packed single precision floating-point values in the
* source operand and stores the packed single-precision floating-point results
* in the destination
*/
//a=(a0, a1, a2, ..., a6, a7);
//则r=(1/sqrt(a0), 1/sqrt(a1), ..., 1/sqrt(a6), 1/sqrt(a7)), 先开方再求倒数
extern __m256  __cdecl _mm256_rsqrt_ps(__m256 a);

/*
* Square Root of Double-Precision Floating-Point Values
* **** VSQRTPD ymm1, ymm2/m256
* Performs an SIMD computation of the square roots of the two or four packed
* double-precision floating-point values in the source operand and stores
* the packed double-precision floating-point results in the destination
*/
//a=(a0, a1, a2, a3, a4);则r=(sqrt(a0),sqrt(a1), sqrt(a2), sqrt(a3)), 求开方
extern __m256d __cdecl _mm256_sqrt_pd(__m256d a);

/*
* Square Root of Single-Precision Floating-Point Values
* **** VSQRTPS ymm1, ymm2/m256
* Performs an SIMD computation of the square roots of the eight packed
* single-precision floating-point values in the source operand stores the
* packed double-precision floating-point results in the destination
*/
//a=(a0, a1, a2, ..., a3, a4);则r=(sqrt(a0),sqrt(a1), ..., sqrt(a2), sqrt(a3)), 求开方
extern __m256  __cdecl _mm256_sqrt_ps(__m256 a);

/*
* Round Packed Double-Precision Floating-Point Values
* **** VROUNDPD ymm1,ymm2/m256,imm8
* Round the four Double-Precision Floating-Point Values values in the source
* operand by the rounding mode specified in the immediate operand and place
* the result in the destination. The rounding process rounds the input to an
* integral value and returns the result as a double-precision floating-point
* value. The Precision Floating Point Exception is signaled according to the
* immediate operand. If any source operand is an SNaN then it will be
* converted to a QNaN.
*/
//a=(22.8, -11.3, -33.8, 4.3),
//若iRoundMode=0x0A, 则r=(23, -11, -33, 5)
//若iRoundMode=0x09, 则r=(22, -12, -34, 4)
extern __m256d __cdecl _mm256_round_pd(__m256d a, int iRoundMode);
#define _mm256_ceil_pd(val)   _mm256_round_pd((val), 0x0A);
#define _mm256_floor_pd(val)  _mm256_round_pd((val), 0x09);

/*
* Round Packed Single-Precision Floating-Point Values
* **** VROUNDPS ymm1,ymm2/m256,imm8
* Round the four single-precision floating-point values values in the source
* operand by the rounding mode specified in the immediate operand and place
* the result in the destination. The rounding process rounds the input to an
* integral value and returns the result as a double-precision floating-point
* value. The Precision Floating Point Exception is signaled according to the
* immediate operand. If any source operand is an SNaN then it will be
* converted to a QNaN.
*/
//用法与_mm256_round_pd相同
extern __m256  __cdecl _mm256_round_ps(__m256 a, int iRoundMode);
#define _mm256_ceil_ps(val)   _mm256_round_ps((val), 0x0A);
#define _mm256_floor_ps(val)  _mm256_round_ps((val), 0x09);

/*
* Unpack and Interleave High Packed Double-Precision Floating-Point Values
* **** VUNPCKHPD ymm1,ymm2,ymm3/m256
* Performs an interleaved unpack of the high double-precision floating-point
* values from the first source operand and the second source operand.
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r=(m11, m21, m13, m23)
extern __m256d __cdecl _mm256_unpackhi_pd(__m256d m1, __m256d m2);

/*
* Unpack and Interleave High Packed Single-Precision Floating-Point Values
* **** VUNPCKHPS ymm1,ymm2,ymm3
* Performs an interleaved unpack of the high single-precision floating-point
* values from the first source operand and the second source operand
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r=(m12, m22, m13, m23, m16, m26, m17, m27)
extern __m256  __cdecl _mm256_unpackhi_ps(__m256 m1, __m256 m2); 

/*
* Unpack and Interleave Low Packed Double-Precision Floating-Point Values
* **** VUNPCKLPD ymm1,ymm2,ymm3/m256
* Performs an interleaved unpack of the low double-precision floating-point
* values from the first source operand and the second source operand
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r=(m10, m20, m12, m22)
extern __m256d __cdecl _mm256_unpacklo_pd(__m256d m1, __m256d m2);

/*
* Unpack and Interleave Low Packed Single-Precision Floating-Point Values
* **** VUNPCKLPS ymm1,ymm2,ymm3
* Performs an interleaved unpack of the low single-precision floating-point
* values from the first source operand and the second source operand
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r=(m10, m20, m11, m21, m14, m24, m15, m25)
extern __m256  __cdecl _mm256_unpacklo_ps(__m256 m1, __m256 m2);

/*
* Packed Bit Test
* **** VPTEST ymm1, ymm2/m256
* VPTEST set the ZF flag if all bits in the result are 0 of the bitwise AND
* of the first source operand and the second source operand. VPTEST sets the
* CF flag if all bits in the result are 0 of the bitwise AND of the second
* source operand and the logical NOT of the destination.
*/
extern int     __cdecl _mm256_testz_si256(__m256i s1, __m256i s2);
extern int     __cdecl _mm256_testc_si256(__m256i s1, __m256i s2);
extern int     __cdecl _mm256_testnzc_si256(__m256i s1, __m256i s2);

/*
* Packed Bit Test
* **** VTESTPD ymm1, ymm2/m256
* **** VTESTPD xmm1, xmm2/m128
* VTESTPD performs a bitwise comparison of all the sign bits of the
* double-precision elements in the first source operation and corresponding
* sign bits in the second source operand. If the AND of the two sets of bits
* produces all zeros, the ZF is set else the ZF is clear. If the AND NOT of
* the source sign bits with the dest sign bits produces all zeros the CF is
* set else the CF is clear
*/
extern int     __cdecl _mm256_testz_pd(__m256d s1, __m256d s2);
extern int     __cdecl _mm256_testc_pd(__m256d s1, __m256d s2);
extern int     __cdecl _mm256_testnzc_pd(__m256d s1, __m256d s2);
extern int     __cdecl _mm_testz_pd(__m128d s1, __m128d s2);
extern int     __cdecl _mm_testc_pd(__m128d s1, __m128d s2);
extern int     __cdecl _mm_testnzc_pd(__m128d s1, __m128d s2);

/*
* Packed Bit Test
* **** VTESTPS ymm1, ymm2/m256
* **** VTESTPS xmm1, xmm2/m128
* VTESTPS performs a bitwise comparison of all the sign bits of the packed
* single-precision elements in the first source operation and corresponding
* sign bits in the second source operand. If the AND of the two sets of bits
* produces all zeros, the ZF is set else the ZF is clear. If the AND NOT of
* the source sign bits with the dest sign bits produces all zeros the CF is
* set else the CF is clear
*/
extern int     __cdecl _mm256_testz_ps(__m256 s1, __m256 s2);
extern int     __cdecl _mm256_testc_ps(__m256 s1, __m256 s2);
extern int     __cdecl _mm256_testnzc_ps(__m256 s1, __m256 s2);
extern int     __cdecl _mm_testz_ps(__m128 s1, __m128 s2);
extern int     __cdecl _mm_testc_ps(__m128 s1, __m128 s2);
extern int     __cdecl _mm_testnzc_ps(__m128 s1, __m128 s2);

/*
* Extract Double-Precision Floating-Point Sign mask
* **** VMOVMSKPD r32, ymm2
* Extracts the sign bits from the packed double-precision floating-point
* values in the source operand, formats them into a 4-bit mask, and stores
* the mask in the destination
*/
extern int     __cdecl _mm256_movemask_pd(__m256d a);

/*
* Extract Single-Precision Floating-Point Sign mask
* **** VMOVMSKPS r32, ymm2
* Extracts the sign bits from the packed single-precision floating-point
* values in the source operand, formats them into a 8-bit mask, and stores
* the mask in the destination
*/
extern int     __cdecl _mm256_movemask_ps(__m256 a);

/*
* Return 256-bit vector with all elements set to 0
*/
//则r0=r1=...=rn=0
extern __m256d __cdecl _mm256_setzero_pd(void);
extern __m256  __cdecl _mm256_setzero_ps(void);
extern __m256i __cdecl _mm256_setzero_si256(void);

/*
* Return 256-bit vector intialized to specified arguments
*/
//则r = (d, c, b, a)
extern __m256d __cdecl _mm256_set_pd(double a, double b, double c, double d);
extern __m256  __cdecl _mm256_set_ps(float, float, float, float, float, float, float, float);
extern __m256i __cdecl _mm256_set_epi8(char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char);
extern __m256i __cdecl _mm256_set_epi16(short, short, short, short, short, short, short, short,
	short, short, short, short, short, short, short, short);
extern __m256i __cdecl _mm256_set_epi32(int, int, int, int, int, int, int, int);
extern __m256i __cdecl _mm256_set_epi64x(long long, long long, long long, long long);

//则r = (a, b, c, d)
extern __m256d __cdecl _mm256_setr_pd(double a, double b, double c, double d);
extern __m256  __cdecl _mm256_setr_ps(float, float, float, float, float, float, float, float);
extern __m256i __cdecl _mm256_setr_epi8(char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char);
extern __m256i __cdecl _mm256_setr_epi16(short, short, short, short, short, short, short, short,
	short, short, short, short, short, short, short, short);
extern __m256i __cdecl _mm256_setr_epi32(int, int, int, int, int, int, int, int);
extern __m256i __cdecl _mm256_setr_epi64x(long long, long long, long long, long long);

/*
* Return 256-bit vector with all elements intialized to specified scalar
*/
//则r0 =  ... = rn = a
extern __m256d __cdecl _mm256_set1_pd(double a);
extern __m256  __cdecl _mm256_set1_ps(float);
extern __m256i __cdecl _mm256_set1_epi8(char);
extern __m256i __cdecl _mm256_set1_epi16(short);
extern __m256i __cdecl _mm256_set1_epi32(int);
extern __m256i __cdecl _mm256_set1_epi64x(long long);

/*
* Support intrinsics to do vector type casts. These intrinsics do not introduce
* extra moves to generated code. When cast is done from a 128 to 256-bit type
* the low 128 bits of the 256-bit result contain source parameter value; the
* upper 128 bits of the result are undefined
*/
//类型转换
extern __m256  __cdecl _mm256_castpd_ps(__m256d a);
extern __m256d __cdecl _mm256_castps_pd(__m256 a);
extern __m256i __cdecl _mm256_castps_si256(__m256 a);
extern __m256i __cdecl _mm256_castpd_si256(__m256d a);
extern __m256  __cdecl _mm256_castsi256_ps(__m256i a);
extern __m256d __cdecl _mm256_castsi256_pd(__m256i a);
extern __m128  __cdecl _mm256_castps256_ps128(__m256 a);
extern __m128d __cdecl _mm256_castpd256_pd128(__m256d a);
extern __m128i __cdecl _mm256_castsi256_si128(__m256i a);
extern __m256  __cdecl _mm256_castps128_ps256(__m128 a);
extern __m256d __cdecl _mm256_castpd128_pd256(__m128d a);
extern __m256i __cdecl _mm256_castsi128_si256(__m128i a);

/* Start of new intrinsics for Dev10 SP1
*
* The list of extended control registers.
* Currently, the list includes only one register.
*/
#define _XCR_XFEATURE_ENABLED_MASK 0

/* Returns the content of the specified extended control register */
extern unsigned __int64 __cdecl _xgetbv(unsigned int ext_ctrl_reg);

/* Writes the value to the specified extended control register */
extern void __cdecl _xsetbv(unsigned int ext_ctrl_reg, unsigned __int64 val);


/* 
* Performs a full or partial save of the enabled processor state components
* using the the specified memory address location and a mask.
*/
extern void __cdecl _xsave(void *mem, unsigned __int64 save_mask);
extern void __cdecl _xsave64(void *mem, unsigned __int64 save_mask);

/* 
* Performs a full or partial save of the enabled processor state components
* using the the specified memory address location and a mask.
* Optimize the state save operation if possible.
*/
extern void __cdecl _xsaveopt(void *mem, unsigned __int64 save_mask);
extern void __cdecl _xsaveopt64(void *mem, unsigned __int64 save_mask);

/* 
* Performs a full or partial restore of the enabled processor states
* using the state information stored in the specified memory address location
* and a mask.
*/
extern void __cdecl _xrstor(void *mem, unsigned __int64 restore_mask);
extern void __cdecl _xrstor64(void *mem, unsigned __int64 restore_mask);

/* 
* Saves the current state of the x87 FPU, MMX technology, XMM,
* and MXCSR registers to the specified 512-byte memory location.
*/
extern void __cdecl _fxsave(void *mem);
extern void __cdecl _fxsave64(void *mem);

/* 
* Restore the current state of the x87 FPU, MMX technology, XMM,
* and MXCSR registers from the specified 512-byte memory location.
*/
extern void __cdecl _fxrstor(void *mem);
extern void __cdecl _fxrstor64(void *mem);

你可能感兴趣的:(AVX Intrinsics各函数介绍)

使用 Docker 部署 MySQL 8
使用Docker部署MySQL8详细指南MySQL是一个广泛使用的开源关系型数据库管理系统。通过Docker部署MySQL8可以快速搭建一个可移植、可扩展的数据库环境。本文将详细介绍如何使用Docker部署MySQL8，并讲解如何根据需求配置MySQL。从拉取镜像开始的详细步骤1.拉取MySQL8镜像首先，从DockerHub拉取MySQL8的官方镜像。dockerpullmysql:8.0mys
python中的*args 和 **kwargs Hi_kenyon python python
简单来说，它们允许一个函数接收不定数量的参数。这在我们预先不知道会传递多少个参数给函数时非常有用。*args(任意数量的位置参数)*args用于在一个函数中接收任意数量的位置参数(positionalarguments)。当你在函数定义中使用*args时，Python会将所有传入的多余的位置参数收集到一个元组(tuple)中。这个名字args只是一个约定俗成的惯例(arguments的缩写)，你也
computed()、watch() 与 watchEffect() 前端岳大宝前端框架Vue vue.js javascript 前端
下面，我们来系统的梳理关于computed、watch与watchEffect的基本知识点：一、核心概念与响应式基础1.1响应式依赖关系Vue的响应式系统基于依赖收集和触发更新的机制：响应式数据依赖收集创建依赖关系数据变更触发更新执行副作用1.2三大API对比特性computedwatchwatchEffect返回值Ref对象停止函数停止函数依赖收集自动手动指定自动执行时机惰性求值响应变化立即执行
从零开始理解零样本学习：AI人工智能必学技术 AI天才研究院 Agentic AI 实战 AI人工智能与大数据 AI大模型企业级应用开发实战 ai
从零开始理解零样本学习：AI人工智能必学技术关键词：零样本学习、人工智能、机器学习、知识迁移、语义嵌入摘要：本文旨在全面深入地介绍零样本学习这一在人工智能领域具有重要意义的技术。首先阐述零样本学习的背景和基本概念，通过详细的解释和直观的示意图让读者建立起对零样本学习的初步认识。接着深入剖析其核心算法原理，结合Python代码进行详细说明，同时引入相关数学模型和公式并举例阐释。通过项目实战部分，带领
高斯混合模型（Gaussian Mixture Model, GMM）不想秃头的程序神经网络语音识别人工智能深度学习网络
高斯混合模型（GaussianMixtureModel,GMM）是一种概率模型，用于表示数据点由多个高斯分布（GaussianDistribution）混合生成的过程。它广泛应用于聚类分析、密度估计、图像分割、语音识别等领域，尤其适合处理非球形簇或多模态数据。以下是GMM的详细介绍：一、核心思想GMM假设数据是由多个高斯分布混合生成的，每个高斯分布代表一个簇（Cluster），并引入隐变量（Lat
卷积神经网络（Convolutional Neural Network, CNN）不想秃头的程序神经网络语音识别人工智能深度学习网络卷积神经网络
卷积神经网络（ConvolutionalNeuralNetwork,CNN）是一种专门用于处理图像、视频等网格数据的深度学习模型。它通过卷积层自动提取数据的特征，并利用空间共享权重和池化层减少参数量和计算复杂度，成为计算机视觉领域的核心技术。以下是CNN的详细介绍：一、核心思想CNN的核心目标是从图像中自动学习层次化特征，并通过空间共享权重和平移不变性减少参数量和计算成本。其关键组件包括：卷积层（
ResNet（Residual Network）不想秃头的程序神经网络语音识别人工智能深度学习网络残差网络神经网络
ResNet（ResidualNetwork）是深度学习中一种经典的卷积神经网络（CNN）架构，由微软研究院的KaimingHe等人在2015年提出。它通过引入残差连接（SkipConnection）解决了深度神经网络中的梯度消失问题，使得网络可以训练极深的模型（如上百层），并在图像分类、目标检测、语义分割等任务中取得了突破性成果。以下是ResNet的详细介绍：一、核心思想ResNet的核心创新是
P25：LSTM实现糖尿病探索与预测 ?Agony lstm 人工智能 rnn
本文为365天深度学习训练营中的学习记录博客原作者：K同学啊一、相关技术1.LSTM基本概念LSTM（长短期记忆网络）是RNN（循环神经网络）的一种变体，它通过引入特殊的结构来解决传统RNN中的梯度消失和梯度爆炸问题，特别适合处理序列数据。结构组成：遗忘门：决定丢弃哪些信息，通过sigmoid函数输出0-1之间的值，表示保留或遗忘的程度。输入门：决定更新哪些信息，同样通过sigmoid函数控制更新
Python的一点基础教程------文件读写卡提西亚 python 开发语言
最近在看大佬写的Python教程自学,但是感觉有点头痛,因为大佬讲了一些底层的结构和原理,但是又没那么详细,然后作为一个初学者自学的情况下,看的很费劲.看完就有感而发,想写一篇更基础的教程,教会大家怎么去用它,尽量少的去讲原理.但是当然,你也需要有一定的编程语言基础,了解基本的语法和函数等功能.正所谓师傅领进门,修行在个人,有时候我们学了一个东西,如果觉得很有趣,自然就会去了解关于它的更多信息,但
Unity脚本--01-脚本书写规则-脚本生命周期-脚本调试-常用API 秦果开发语言
一、脚本书写规则脚本：.cs的文本文件类文件作用：附加到游戏物体中，定义游戏对象行为指令的代码与C#类的区别：脚本只有字段和方法，没有自动属性和构造函数publicintA{get{returna;}set{a=value;}}属性定义了在unity中不会显示publicLifecycle(){Debug.Log("构造函数")//b=Time.time;}不要在脚本中写构造函数，因为不能在子线程
【C++】简单学——类和对象（下） CtrlZ小牛码 C++简单学 c++开发语言
初始化列表前提：对象实例化，成员变量就整体定义了，那么成员变量是在哪里单体定义初始化的？构造函数处吗？概念概念：初始化列表是每个的成员定义初始化的位置位置：在构造函数底下结构：：代表开始，代表分点classDate{public:////初始化列表Date(intyear,intmonth,intday):_year(year),_month(month),_day(day){}}语法一个成员变量
【C++】简单学——类和对象（中） CtrlZ小牛码 C++简单学 c++开发语言
六个默认成员函数共性你如果没有写这六个成员函数，编译器就会自动帮你写编译器会自动调用构造函数析构函数拷贝构造函数赋值运算符重载取地址运算符重载被const修饰的取地址运算符重载构造函数作用帮助你初始化以前的初始化的问题：总是会忘记初始化，然后用着用着就崩了使用的位置：对象实例化的时候这几个词要区分开来默认成员函数：类里“隐藏”的6个特殊函数（包括构造函数、析构函数、拷贝构造等），不写时编译器自动生
[学习] C语言编程中线程安全的实现方法（示例）极客不孤独学习 c语言安全
C语言编程中线程安全的实现方法在多线程编程中，线程安全（ThreadSafety）是一个非常重要的概念。当多个线程同时访问共享资源时，如果没有合理的同步机制，就可能导致数据竞争、死锁甚至程序崩溃。本文将详细介绍在C语言中如何实现线程安全的几种主要方式，并提供可以实际运行的代码示例。文章目录C语言编程中线程安全的实现方法一、什么是线程安全？二、C语言中线程安全的实现方式方法一：互斥锁（Mutex）✅
C++程序实现阻止屏保、阻止系统自动关闭屏幕、阻止系统待机（附源码） dvlinker C/C++实战专栏阻止屏保阻止系统自动关闭屏幕阻止系统待机 API Monitor
目录1、概述2、设置屏幕保护程序，修改自动关闭显示器和待机的时间2.1、设置屏保程序2.2、修改自动关闭显示器和待机的时间3、通过屏保的通知消息来阻止屏保4、调用API函数SystemParametersInfo关闭/启用屏保，但存在问题4.1、初步确定处理策略4.2、启动监控进程去监控主进程4.3、系统强行关机的情况无法处理5、使用APIMonitor监测到目标程序对API的调用，找到了问题的突
Go Lang Fiber介绍技术的游戏 golang 开发语言后端
利用GoLangFiber进行高性能Web开发在不断发展的Web开发世界中，选择合适的框架至关重要。速度、简洁性和强大的功能集是每个开发者都追求的品质。在使用Go构建Web应用时，“Fiber”作为一个强大且轻量级的框架在众多选择中脱颖而出。在这份全面的指南中，我们将介绍GoLangFiber，涵盖其安装和设置，指导您创建一个基本的Fiber应用，并帮助您了解构成与Fiber进行Web开发旅程基础
Golang高性能并发：Goroutine调度器优化技巧 Golang编程笔记 golang 爬虫网络 ai
Golang高性能并发：Goroutine调度器优化技巧关键词：Golang、高性能并发、Goroutine、调度器、优化技巧摘要：本文深入探讨了Golang中Goroutine调度器的优化技巧，旨在帮助开发者充分发挥Golang在并发编程方面的优势，提升程序的性能。首先介绍了相关背景知识，包括目的范围、预期读者等，接着解释了核心概念，如Goroutine、调度器等，阐述了它们之间的关系。然后详细
Go插件性能优化：如何减少内存占用和提升加载速度 Golang编程笔记 golang 性能优化网络 ai
Go插件性能优化：如何减少内存占用和提升加载速度关键词：Go插件、性能优化、内存占用、加载速度、编译优化、动态链接、插件架构摘要：本文将深入探讨Go语言插件的性能优化策略，从内存管理和加载速度两个核心维度出发，详细分析插件系统的运行机制，并提供一系列实用的优化技巧和最佳实践。通过本文，您将学会如何诊断插件性能瓶颈，应用有效的优化手段，并构建高效可靠的Go插件系统。背景介绍目的和范围本文旨在为Go开
Golang Fiber框架最佳实践：如何构建企业级应用 Golang编程笔记 Golang编程笔记 Golang开发实战 golang 开发语言后端 ai
GolangFiber框架最佳实践：如何构建企业级应用关键词：Golang、Fiber框架、企业级应用、最佳实践、Web开发摘要：本文聚焦于GolangFiber框架在企业级应用构建中的最佳实践。详细介绍了Fiber框架的背景、核心概念、算法原理、数学模型等基础知识，通过具体的代码案例展示了如何搭建开发环境、实现和解读源代码。同时探讨了Fiber框架在实际应用场景中的应用，推荐了相关的学习资源、开
Unity知识点-Renderer常用材质变量徐子竣 unity 材质游戏引擎
本篇总结了Unity中renderer的3种常用的材质相关的变量：renderer.material,renderer.sharedMaterial,renderer.MaterialPropertyBlock。以及三者对SRPBatcher的影响。一.介绍及对比1.概念介绍1.material定义：material是Render组件（如MeshRenderer）的实例化材质。特点：访问rende
深入研究 Golang 领域的 Fiber 框架架构 Golang编程笔记 golang 架构网络 ai
深入研究Golang领域的Fiber框架架构关键词：Golang、Fiber框架、架构、高性能、Web开发摘要：本文将深入探讨Golang领域的Fiber框架架构。我们会先介绍背景知识，包括目的、预期读者等。接着用通俗易懂的方式解释核心概念，如Fiber框架的各个组成部分，以及它们之间的关系。然后详细阐述核心算法原理、数学模型，通过实际代码案例展示其应用。还会介绍Fiber框架的实际应用场景、推荐
【问题解决】pnpm : 无法将“pnpm”项识别为 cmdlet、函数、脚本文件或可运行程序的名称。 aPurpleBerry 问题解决前端
今天配置完poetry环境变量之后pnpm不能用了具体报错pnpm:无法将“pnpm”项识别为cmdlet、函数、脚本文件或可运行程序的名称。请检查名称的拼写，如果包括路径，请确保路径正确，然后再试一次。所在位置行:1字符:1+pnpmrundev+~~~~+CategoryInfo:ObjectNotFound:(pnpm:String)[],CommandNotFoundException+F
[Python] 使用 dataclass 简化数据结构：定义、功能与实战踏雪无痕老爷子 Python python 开发语言
在经典面向对象编程中，为了保存和操作数据往往需要定义多个类，手写__init__()、__repr__()、__eq__()等方法。Python3.7引入了@dataclass装饰器，它能自动生成这些常见方法，大幅减少样板代码。本文将介绍dataclass的定义与参数、比较与普通类的差别、实战示例，以及常见注意事项。一、什么是dataclass@dataclass是一种类装饰器，它通过类成员的类型
算法竞赛备考冲刺必刷题（C++） | 洛谷 P8814 解密热爱编程的通信人算法 c++开发语言
本文分享的必刷题目是从蓝桥云课、洛谷、AcWing等知名刷题平台精心挑选而来，并结合各平台提供的算法标签和难度等级进行了系统分类。题目涵盖了从基础到进阶的多种算法和数据结构，旨在为不同阶段的编程学习者提供一条清晰、平稳的学习提升路径。欢迎大家订阅我的专栏：算法题解：C++与Python实现！附上汇总贴：算法竞赛备考冲刺必刷题（C++）|汇总【题目来源】洛谷：P8814[CSP-J2022]解密-洛
HarmonyOS从入门到精通：WebView开发逻极 harmonyos 华为鸿蒙 webview UI 前端实战
引言WebView是现代移动应用中不可或缺的组件，它使应用能够显示Web内容，实现混合开发。本文将详细介绍鸿蒙系统中WebView的开发技术，包括基本使用、性能优化和最佳实践。WebView基础知识1.WebView类型鸿蒙系统支持多种WebView实现：系统WebView自定义WebViewWeb组件2.WebView权限配置在开发WebView应用前，需要在配置文件中添加相关权限：{"modu
Python爬虫实战：爬取知乎问答与用户信息 Python爬虫项目 python 爬虫 php 数据分析开发语言开源
简介随着网络信息量的爆炸，如何有效获取有价值的内容，成为了数据分析、机器学习等领域的基础之一。爬虫作为数据采集的基本工具之一，常常被用来获取互联网上的公开数据。在这篇博客中，我们将结合最新的Python爬虫技术，详细讲解如何爬取知乎问答与用户信息。本文将会介绍：Python爬虫的基础知识知乎问答网页结构分析使用Python进行知乎数据爬取爬取知乎问答内容与用户信息如何处理和存储爬取的数据使用最新的
【机器学习&深度学习】反向传播机制
目录一、一句话定义二、类比理解三、为什重要？四、用生活例子解释：神经网络=烹饪机器人4.1第一步：尝一口（前向传播）4.2第二步：倒着推原因（反向传播）五、换成人工智能流程说一遍六、图示类比：找山顶（最优参数）七、总结一句人话八、PyTorch代码示例：亲眼看到每一层的梯度九、梯度=损失函数对参数的偏导数十、类比总结反向传播（Backpropagation）是神经网络中训练过程的核心机制，它就像“
【C++】atoi和std::stoi bluebonnet27 编程语言 #C++c++算法开发语言
两个将字符串转为int的方法atoi（C语言）atoi是C库中的一个函数，它定义在头文件里。其作用是把一个字符串转换为对应的整数。/*Convertastringtoaninteger.*/externintatoi(constchar*__nptr)__THROW__attribute_pure____nonnull((1))__wur;转换的原则如下：此函数接收一个以空字符'\0'结尾的字符串
python实战项目79：采集知乎话题下的所有回答 wp_tao Python副业接单实战项目 python 开发语言
python实战项目79：采集知乎话题下的所有回答一、项目介绍二、代码使用方法三、drissionpage的优缺点四、完整代码五、注意事项一、项目介绍需求是采集知乎某话题下的所有回答，这里以话题“大学宿舍相处之间遇到莫名其妙的冷落怎么办呢？”为例，网页链接为https://www.zhihu.com/question/1898156781215146265，其中189815678121514626
GitHub Actions与AWS OIDC实现安全的ECR/ECS自动化部署 ivwdcwso 运维与云原生 github aws 安全 ecr ecs oldc CI/CD
引言在现代云原生应用开发中，实现安全、高效的CI/CD流程至关重要。本文将详细介绍如何利用GitHubActions和AWSOIDC（OpenIDConnect）构建一个无需长期凭证的安全部署管道，将容器化应用自动部署到AmazonECR和ECS服务。架构概述整个解决方案的架构包含三个主要部分：GitHub端：代码仓库和GitHubActions工作流AWS端：OIDC身份验证、ECR容器仓库和E
AWS Lambda与RDS连接优化之旅 t0_54manong 编程问题解决手册 aws 云计算个人开发
在云计算的时代，AWSLambda与RDS的结合为开发者提供了高效且灵活的解决方案。然而，在实际应用中，我们常常会遇到一些性能瓶颈。本文将通过一个真实案例，探讨如何优化AWSLambda与RDS之间的连接，以提高API的响应速度。背景介绍最近，我们在AWS上部署了一个使用Dotnet6开发的API，它通过APIGateway暴露给外部，并连接到同VPC内的MySQLAuroraRDS数据库。部署前
springmvc 下 freemarker页面枚举的遍历输出杨白白 enum freemarker
spring mvc freemarker 中遍历枚举 1枚举类型有一个本地方法叫values（），这个方法可以直接返回枚举数组。所以可以利用这个遍历。 enum public enum BooleanEnum { TRUE(Boolean.TRUE, "是"), FALSE(Boolean.FALSE, "否");
实习简要总结 byalias 工作
来白虹不知不觉中已经一个多月了，因为项目还在需求分析及项目架构阶段，自己在这段时间都是在学习相关技术知识，现在对这段时间的工作及学习情况做一个总结：（1）工作技能方面大体分为两个阶段，Java Web 基础阶段和Java EE阶段 1）Java Web阶段在这个阶段，自己主要着重学习了 JSP, Servlet, JDBC, MySQL，这些知识的核心点都过了一遍，也
Quartz——DateIntervalTrigger触发器 eksliang quartz
转载请出自出处：http://eksliang.iteye.com/blog/2208559 一.概述 simpleTrigger 内部实现机制是通过计算间隔时间来计算下次的执行时间，这就导致他有不适合调度的定时任务。例如我们想每天的 1：00AM 执行任务，如果使用 SimpleTrigger，间隔时间就是一天。注意这里就会有一个问题，即当有 misfired 的任务并且恢复执行时，该执行时间
Unix快捷键 18289753290 unix Unix；快捷键;
复制，删除，粘贴： dd:删除光标所在的行 &nbs
获取Android设备屏幕的相关参数酷的飞上天空 android
包含屏幕的分辨率以及屏幕宽度的最大dp 高度最大dp TextView text = (TextView)findViewById(R.id.text); DisplayMetrics dm = new DisplayMetrics(); text.append("getResources().ge
要做物联网？先保护好你的数据蓝儿唯美数据
根据Beecham Research的说法，那些在行业中希望利用物联网的关键领域需要提供更好的安全性。在Beecham的物联网安全威胁图谱上，展示了那些可能产生内外部攻击并且需要通过快速发展的物联网行业加以解决的关键领域。 Beecham Research的技术主管Jon Howes说：“之所以我们目前还没有看到与物联网相关的严重安全事件，是因为目前还没有在大型客户和企业应用中进行部署，也就
Java取模（求余）运算随便小屋 java
整数之间的取模求余运算很好求，但几乎没有遇到过对负数进行取模求余，直接看下面代码： /** * * @author Logic * */ public class Test { public static void main(String[] args) { // TODO A
SQL注入介绍 aijuans sql注入
二、SQL注入范例这里我们根据用户登录页面 <form action="" > 用户名：<input type="text" name="username"><br/> 密码：<input type="password" name="passwor
优雅代码风格 aoyouzi 代码
总结了几点关于优雅代码风格的描述：代码简单：不隐藏设计者的意图，抽象干净利落，控制语句直截了当。接口清晰：类型接口表现力直白，字面表达含义，API 相互呼应以增强可测试性。依赖项少：依赖关系越少越好，依赖少证明内聚程度高，低耦合利于自动测试，便于重构。没有重复：重复代码意味着某些概念或想法没有在代码中良好的体现，及时重构消除重复。战术分层：代码分层清晰，隔离明确，
布尔数组百合不是茶 java 布尔数组
androi中提到了布尔数组; 布尔数组默认的是false, 并且只会打印false或者是true 布尔数组的例子; 根据字符数组创建布尔数组 char[] c = {'p','u','b','l','i','c'}; //根据字符数组的长度创建布尔数组的个数 boolean[] b = new bool
web.xml之welcome-file-list、error-page bijian1013 java web.xml servlet error-page
welcome-file-list 1.定义： <welcome-file-list> <welcome-file>login.jsp</welcome> </welcome-file-list> 2.作用：用来指定WEB应用首页名称。 error-page1.定义： <error-page&g
richfaces 4 fileUpload组件删除上传的文件 sunjing clear Richfaces 4 fileupload
页面代码 <h:form id="fileForm"> <rich:
技术文章备忘 bit1129 技术文章
Zookeeper http://wenku.baidu.com/view/bab171ffaef8941ea76e05b8.html http://wenku.baidu.com/link?url=8thAIwFTnPh2KL2b0p1V7XSgmF9ZEFgw4V_MkIpA9j8BX2rDQMPgK5l3wcs9oBTxeekOnm5P3BK8c6K2DWynq9nfUCkRlTt9uV
org.hibernate.hql.ast.QuerySyntaxException: unexpected token: on near line 1解决方案白糖_ Hibernate
文章摘自：http://blog.csdn.net/yangwawa19870921/article/details/7553181 在编写HQL时，可能会出现这种代码： select a.name,b.age from TableA a left join TableB b on a.id=b.id 如果这是HQL，那么这段代码就是错误的，因为HQL不支持
sqlserver按照字段内容进行排序 bozch 按照内容排序
在做项目的时候，遇到了这样的一个需求：从数据库中取出的数据集，首先要将某个数据或者多个数据按照地段内容放到前面显示，例如:从学生表中取出姓李的放到数据集的前面； select * fro
编程珠玑-第一章-位图排序 bylijinnan java 编程珠玑
import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.Writer; import java.util.Random; public class BitMapSearch {
Java关于==和equals chenbowen00 java
关于==和equals概念其实很简单，一个是比较内存地址是否相同，一个比较的是值内容是否相同。虽然理解上不难，但是有时存在一些理解误区，如下情况： 1、 String a = "aaa"; a=="aaa"; ==> true 2、 new String("aaa")==new String("aaa
[IT与资本]软件行业需对外界投资热情保持警惕 comsci it
我还是那个看法,软件行业需要增强内生动力,尽量依靠自有资金和营业收入来进行经营,避免在资本市场上经受各种不同类型的风险,为企业自主研发核心技术和产品提供稳定,温和的外部环境... 如果我们在自己尚未掌握核心技术之前,企图依靠上市来筹集资金,然后使劲往某个领域砸钱,然
oracle 数据块结构 daizj oracle 块数据块块结构行目录
oracle 数据块是数据库存储的最小单位，一般为操作系统块的N倍。其结构为：块头－－〉空行－－〉数据，其实际为纵行结构。块的标准大小由初始化参数DB_BLOCK_SIZE指定。具有标准大小的块称为标准块（Standard Block）。块的大小和标准块的大小不同的块叫非标准块（Nonstandard Block）。同一数据库中，Oracle9i及以上版本支持同一数据库中同时使用标
github上一些觉得对自己工作有用的项目收集 dengkane github
github上一些觉得对自己工作有用的项目收集技能类 markdown语法中文说明回到顶部全文检索 elasticsearch bigdesk elasticsearch管理插件回到顶部 nosql mapdb 支持亿级别map, list, 支持事务. 可考虑做为缓存使用 C
初二上学期难记单词二 dcj3sjt126com english word
dangerous 危险的 panda 熊猫 lion 狮子 elephant 象 monkey 猴子 tiger 老虎 deer 鹿 snake 蛇 rabbit 兔子 duck 鸭 horse 马 forest 森林 fall 跌倒；落下 climb 爬；攀登 finish 完成；结束 cinema 电影院；电影 seafood 海鲜；海产食品 bank 银行
8、mysql外键(FOREIGN KEY)的简单使用 dcj3sjt126com mysql
一、基本概念 1、MySQL中“键”和“索引”的定义相同，所以外键和主键一样也是索引的一种。不同的是MySQL会自动为所有表的主键进行索引，但是外键字段必须由用户进行明确的索引。用于外键关系的字段必须在所有的参照表中进行明确地索引，InnoDB不能自动地创建索引。 2、外键可以是一对一的，一个表的记录只能与另一个表的一条记录连接，或者是一对多的，一个表的记录与另一个表的多条记录连接。 3、如
java循环标签 Foreach shuizhaosi888 标签 java循环 foreach
1. 简单的for循环 public static void main(String[] args) { for (int i = 1, y = i + 10; i < 5 && y < 12; i++, y = i * 2) { System.err.println("i=" + i + " y="
Spring Security（05）——异常信息本地化 234390216 exception Spring Security 异常信息本地化
异常信息本地化 Spring Security支持将展现给终端用户看的异常信息本地化，这些信息包括认证失败、访问被拒绝等。而对于展现给开发者看的异常信息和日志信息（如配置错误）则是不能够进行本地化的，它们是以英文硬编码在Spring Security的代码中的。在Spring-Security-core-x
DUBBO架构服务端告警Failed to send message Response javamingtingzhao 架构 DUBBO
废话不多说，警告日志如下，不知道有哪位遇到过，此异常在服务端抛出(服务器启动第一次运行会有这个警告)，后续运行没问题，找了好久真心不知道哪里错了。 WARN 2015-07-18 22:31:15,272 com.alibaba.dubbo.remoting.transport.dispatcher.ChannelEventRunnable.run(84)
JS中Date对象中几个用法 leeqq JavaScript Date 最后一天
近来工作中遇到这样的两个需求 1. 给个Date对象，找出该时间所在月的第一天和最后一天 2. 给个Date对象，找出该时间所在周的第一天和最后一天需求1中的找月第一天很简单，我记得api中有setDate方法可以使用使用setDate方法前，先看看getDate var date = new Date(); console.log(date); // Sat J
MFC中使用ado技术操作数据库你不认识的休道人 sql mfc
1.在stdafx.h中导入ado动态链接库 #import"C:\Program Files\Common Files\System\ado\msado15.dll" no_namespace rename("EOF","end")2.在CTestApp文件的InitInstance()函数中domodal之前写::CoIniti
Android Studio加速 rensanning android studio
Android Studio慢、吃内存！启动时后会立即通过Gradle来sync & build工程。（1）设置Android Studio a) 禁用插件 File -> Settings... Plugins 去掉一些没有用的插件。比如：Git Integration、GitHub、Google Cloud Testing、Google Cloud
各数据库的批量Update操作 tomcat_oracle java oracle sql mysql sqlite
MyBatis的update元素的用法与insert元素基本相同，因此本篇不打算重复了。本篇仅记录批量update操作的 sql语句，懂得SQL语句，那么MyBatis部分的操作就简单了。　　注意：下列批量更新语句都是作为一个事务整体执行，要不全部成功，要不全部回滚。 MSSQL的SQL语句　WITH R AS（　　SELECT 'John' as name, 18 as
html禁止清除input文本输入缓存 xp9802 input
多数浏览器默认会缓存input的值，只有使用ctl+F5强制刷新的才可以清除缓存记录。如果不想让浏览器缓存input的值，有2种方法：方法一：在不想使用缓存的input中添加 autocomplete="off"; eg: <input type="text" autocomplete="off" name