hemmingway

ARM Neon Intrinsics各函数介绍

ARM NEON Optimization. An Example

[cpp] view plain copy

 
     
 /* Stop the build with an explanatory message when the __ARM_NEON__ macro is not defined.
    NOTE(review): ACLE spells the feature macro __ARM_NEON; a toolchain that defines only that
    spelling would trip this guard -- confirm against the compilers you target. */
 #ifndef __ARM_NEON__  
 #error You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) to use arm_neon.h  
 #endif  
   
 /*(1)、正常指令：生成大小相同且类型通常与操作数向量相同的结果向量； 
 (2)、长指令：对双字向量操作数执行运算，生成四字向量的结果。所生成的元素一般是操作数元素宽度的两倍， 
 并属于同一类型； 
 (3)、宽指令：一个双字向量操作数和一个四字向量操作数执行运算，生成四字向量结果。所生成的元素和第一个 
 操作数的元素是第二个操作数元素宽度的两倍； 
 (4)、窄指令：四字向量操作数执行运算，并生成双字向量结果，所生成的元素一般是操作数元素宽度的一半； 
 (5)、饱和指令：当超过数据类型指定的范围则自动限制在该范围内。*/  
   
 /******************************************************Addition*************************/  
 /*--1、Vector add(正常指令): vadd -> ri = ai + bi; r, a, b have equal lane sizes--*/  
 int8x8_t vadd_s8 (int8x8_t __a, int8x8_t __b);//_mm_add_epi8  
 int16x4_t vadd_s16 (int16x4_t __a, int16x4_t __b);//_mm_add_epi16  
 int32x2_t vadd_s32 (int32x2_t __a, int32x2_t __b);//_mm_add_epi32  
 int64x1_t vadd_s64 (int64x1_t __a, int64x1_t __b);//_mm_add_epi64  
 //_mm_add_ps, SSE, use only low 64 bits  
 float32x2_t vadd_f32 (float32x2_t __a, float32x2_t __b);  
 uint8x8_t vadd_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_add_epi8  
 uint16x4_t vadd_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_add_epi16  
 uint32x2_t vadd_u32 (uint32x2_t __a, uint32x2_t __b);//_mm_add_epi32  
 uint64x1_t vadd_u64 (uint64x1_t __a, uint64x1_t __b);//_mm_add_epi64  
 int8x16_t vaddq_s8 (int8x16_t __a, int8x16_t __b);//_mm_add_epi8  
 int16x8_t vaddq_s16 (int16x8_t __a, int16x8_t __b);//_mm_add_epi16  
 int32x4_t vaddq_s32 (int32x4_t __a, int32x4_t __b);//_mm_add_epi32  
 int64x2_t vaddq_s64 (int64x2_t __a, int64x2_t __b);//_mm_add_epi64  
 float32x4_t vaddq_f32 (float32x4_t __a, float32x4_t __b);//_mm_add_ps  
 uint8x16_t vaddq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_add_epi8  
 uint16x8_t vaddq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_add_epi16  
 uint32x4_t vaddq_u32 (uint32x4_t __a, uint32x4_t __b);//_mm_add_epi32  
 uint64x2_t vaddq_u64 (uint64x2_t __a, uint64x2_t __b);//_mm_add_epi64  
 /*--2、Vector long add(长指令): vaddl -> ri = ai + bi; a, b have equal lane sizes,  
 result is a 128 bit vector of lanes that are twice the width--*/  
 int16x8_t vaddl_s8 (int8x8_t __a, int8x8_t __b);  
 int32x4_t vaddl_s16 (int16x4_t __a, int16x4_t __b);  
 int64x2_t vaddl_s32 (int32x2_t __a, int32x2_t __b);  
 uint16x8_t vaddl_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint32x4_t vaddl_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint64x2_t vaddl_u32 (uint32x2_t __a, uint32x2_t __b);  
 /*--3、Vector wide add(宽指令): vaddw -> ri = ai + bi--*/  
 int16x8_t vaddw_s8 (int16x8_t __a, int8x8_t __b);  
 int32x4_t vaddw_s16 (int32x4_t __a, int16x4_t __b);  
 int64x2_t vaddw_s32 (int64x2_t __a, int32x2_t __b);  
 uint16x8_t vaddw_u8 (uint16x8_t __a, uint8x8_t __b);  
 uint32x4_t vaddw_u16 (uint32x4_t __a, uint16x4_t __b);  
 uint64x2_t vaddw_u32 (uint64x2_t __a, uint32x2_t __b);  
 /*--4、Vector halving add: vhadd -> ri = (ai + bi) >> 1;  
 shifts each result right one bit, Results are truncated--*/  
 int8x8_t vhadd_s8 (int8x8_t __a, int8x8_t __b);  
 int16x4_t vhadd_s16 (int16x4_t __a, int16x4_t __b);  
 int32x2_t vhadd_s32 (int32x2_t __a, int32x2_t __b);  
 uint8x8_t vhadd_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint16x4_t vhadd_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint32x2_t vhadd_u32 (uint32x2_t __a, uint32x2_t __b);  
 int8x16_t vhaddq_s8 (int8x16_t __a, int8x16_t __b);  
 int16x8_t vhaddq_s16 (int16x8_t __a, int16x8_t __b)  
 int32x4_t vhaddq_s32 (int32x4_t __a, int32x4_t __b)  
 uint8x16_t vhaddq_u8 (uint8x16_t __a, uint8x16_t __b)  
 uint16x8_t vhaddq_u16 (uint16x8_t __a, uint16x8_t __b)  
 uint32x4_t vhaddq_u32 (uint32x4_t __a, uint32x4_t __b);  
 /*--5、Vector rounding halving add: vrhadd -> ri = (ai + bi + 1) >> 1;  
 shifts each result right one bit, Results are rounded(四舍五入)--*/  
 int8x8_t vrhadd_s8 (int8x8_t __a, int8x8_t __b);  
 int16x4_t vrhadd_s16 (int16x4_t __a, int16x4_t __b);  
 int32x2_t vrhadd_s32 (int32x2_t __a, int32x2_t __b);  
 uint8x8_t vrhadd_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_avg_epu8  
 uint16x4_t vrhadd_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_avg_epu16  
 uint32x2_t vrhadd_u32 (uint32x2_t __a, uint32x2_t __b);  
 int8x16_t vrhaddq_s8 (int8x16_t __a, int8x16_t __b);  
 int16x8_t vrhaddq_s16 (int16x8_t __a, int16x8_t __b);  
 int32x4_t vrhaddq_s32 (int32x4_t __a, int32x4_t __b);  
 uint8x16_t vrhaddq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_avg_epu8  
 uint16x8_t vrhaddq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_avg_epu16  
 uint32x4_t vrhaddq_u32 (uint32x4_t __a, uint32x4_t __b);  
 /*--6、Vector saturating add(饱和指令): vqadd -> ri = sat(ai + bi);  
 the results are saturated if they overflow--*/  
 int8x8_t vqadd_s8 (int8x8_t __a, int8x8_t __b);//_mm_adds_epi8  
 int16x4_t vqadd_s16 (int16x4_t __a, int16x4_t __b);//_mm_adds_epi16  
 int32x2_t vqadd_s32 (int32x2_t __a, int32x2_t __b);  
 int64x1_t vqadd_s64 (int64x1_t __a, int64x1_t __b);  
 uint8x8_t vqadd_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_adds_epu8  
 uint16x4_t vqadd_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_adds_epu16  
 uint32x2_t vqadd_u32 (uint32x2_t __a, uint32x2_t __b);  
 uint64x1_t vqadd_u64 (uint64x1_t __a, uint64x1_t __b);  
 int8x16_t vqaddq_s8 (int8x16_t __a, int8x16_t __b);//_mm_adds_epi8  
 int16x8_t vqaddq_s16 (int16x8_t __a, int16x8_t __b);//_mm_adds_epi16  
 int32x4_t vqaddq_s32 (int32x4_t __a, int32x4_t __b);  
 int64x2_t vqaddq_s64 (int64x2_t __a, int64x2_t __b);  
 uint8x16_t vqaddq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_adds_epu8  
 uint16x8_t vqaddq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_adds_epu16  
 uint32x4_t vqaddq_u32 (uint32x4_t __a, uint32x4_t __b);  
 uint64x2_t vqaddq_u64 (uint64x2_t __a, uint64x2_t __b);  
 /*--7. Vector add high half (narrowing instruction): vaddhn -> ri = high half of (ai + bi);
 the sum is not saturated; only the most significant half of each sum is kept,
 and the results are truncated--*/
 int8x8_t vaddhn_s16 (int16x8_t __a, int16x8_t __b);  
 int16x4_t vaddhn_s32 (int32x4_t __a, int32x4_t __b);  
 int32x2_t vaddhn_s64 (int64x2_t __a, int64x2_t __b);  
 uint8x8_t vaddhn_u16 (uint16x8_t __a, uint16x8_t __b);  
 uint16x4_t vaddhn_u32 (uint32x4_t __a, uint32x4_t __b);  
 uint32x2_t vaddhn_u64 (uint64x2_t __a, uint64x2_t __b);  
 /*--8、Vector rounding add high half(窄指令): vraddhn -> ri = ai + bi;  
 selecting High half, The results are rounded--*/  
 int8x8_t vraddhn_s16 (int16x8_t __a, int16x8_t __b);  
 int16x4_t vraddhn_s32 (int32x4_t __a, int32x4_t __b)  
 int32x2_t vraddhn_s64 (int64x2_t __a, int64x2_t __b)  
 uint8x8_t vraddhn_u16 (uint16x8_t __a, uint16x8_t __b)  
 uint16x4_t vraddhn_u32 (uint32x4_t __a, uint32x4_t __b)  
 uint32x2_t vraddhn_u64 (uint64x2_t __a, uint64x2_t __b);  
 /*******************************************Multiplication******************************/  
 /*--1、Vector multiply(正常指令): vmul -> ri = ai * bi;--*/  
 int8x8_t vmul_s8 (int8x8_t __a, int8x8_t __b);  
 int16x4_t vmul_s16 (int16x4_t __a, int16x4_t __b);//_mm_mullo_epi16  
 int32x2_t vmul_s32 (int32x2_t __a, int32x2_t __b);  
 float32x2_t vmul_f32 (float32x2_t __a, float32x2_t __b);//_mm_mul_ps  
 uint8x8_t vmul_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint16x4_t vmul_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_mullo_epi16  
 uint32x2_t vmul_u32 (uint32x2_t __a, uint32x2_t __b);  
 poly8x8_t vmul_p8 (poly8x8_t __a, poly8x8_t __b);  
 int8x16_t vmulq_s8 (int8x16_t __a, int8x16_t __b);  
 int16x8_t vmulq_s16 (int16x8_t __a, int16x8_t __b);//_mm_mullo_epi16  
 int32x4_t vmulq_s32 (int32x4_t __a, int32x4_t __b);  
 float32x4_t vmulq_f32 (float32x4_t __a, float32x4_t __b);//_mm_mul_ps  
 uint8x16_t vmulq_u8 (uint8x16_t __a, uint8x16_t __b);  
 uint16x8_t vmulq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_mullo_epi16  
 uint32x4_t vmulq_u32 (uint32x4_t __a, uint32x4_t __b);  
 poly8x16_t vmulq_p8 (poly8x16_t __a, poly8x16_t __b);  
 /*--2、Vector multiply accumulate: vmla -> ri = ai + bi * ci; --*/  
 int8x8_t vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c);  
 int16x4_t vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c);  
 int32x2_t vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c);  
 float32x2_t vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c);  
 uint8x8_t vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c);  
 uint16x4_t vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c);  
 uint32x2_t vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c);  
 int8x16_t vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c);  
 int16x8_t vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c);  
 int32x4_t vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c);  
 float32x4_t vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c);  
 uint8x16_t vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c);  
 uint16x8_t vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c);  
 uint32x4_t vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c);  
 /*--3、Vector multiply accumulate long: vmlal -> ri = ai + bi * ci --*/  
 int16x8_t vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c);  
 int32x4_t vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c);  
 int64x2_t vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c);  
 uint16x8_t vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c);  
 uint32x4_t vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c);  
 uint64x2_t vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c);  
 /*--4、Vector multiply subtract: vmls -> ri = ai - bi * ci --*/  
 int8x8_t vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c);  
 int16x4_t vmls_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c);  
 int32x2_t vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c);  
 float32x2_t vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c);  
 uint8x8_t vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c);  
 uint16x4_t vmls_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c);  
 uint32x2_t vmls_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c);  
 int8x16_t vmlsq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c);  
 int16x8_t vmlsq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c);  
 int32x4_t vmlsq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c);  
 float32x4_t vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c);  
 uint8x16_t vmlsq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c);  
 uint16x8_t vmlsq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c);  
 uint32x4_t vmlsq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c);  
 /*--5、Vector multiply subtract long：vmlsl -> ri = ai - bi * ci --*/  
 int16x8_t vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c);  
 int32x4_t vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c);  
 int64x2_t vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c);  
 uint16x8_t vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c);  
 uint32x4_t vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c);  
 uint64x2_t vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c);  
 /*--6. Vector saturating doubling multiply high: vqdmulh -> ri = high half of sat(2 * ai * bi);
 doubles the results and returns only the high half of the truncated results--*/
 int16x4_t vqdmulh_s16 (int16x4_t __a, int16x4_t __b);  
 int32x2_t vqdmulh_s32 (int32x2_t __a, int32x2_t __b);  
 int16x8_t vqdmulhq_s16 (int16x8_t __a, int16x8_t __b);  
 int32x4_t vqdmulhq_s32 (int32x4_t __a, int32x4_t __b);  
 /*--7、Vector saturating rounding doubling multiply high vqrdmulh -> ri = ai * bi:  
 doubles the results and returns only the high half of the rounded results.  
 The results are saturated if they overflow--*/  
 int16x4_t vqrdmulh_s16 (int16x4_t __a, int16x4_t __b);  
 int32x2_t vqrdmulh_s32 (int32x2_t __a, int32x2_t __b);  
 int16x8_t vqrdmulhq_s16 (int16x8_t __a, int16x8_t __b);  
 int32x4_t vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b);  
 /*--8. Vector saturating doubling multiply accumulate long:
 vqdmlal -> ri = sat(ai + sat(2 * bi * ci));
 multiplies the elements in the second and third vectors, doubles the results and adds the
 results to the values in the first vector. The results are saturated if they overflow--*/
 int32x4_t vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c);  
 int64x2_t  vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c);  
 /*--9. Vector saturating doubling multiply subtract long:
 vqdmlsl -> ri = sat(ai - sat(2 * bi * ci));
 multiplies the elements in the second and third vectors, doubles the results and subtracts
 the results from the elements in the first vector.
 The results are saturated if they overflow--*/
 int32x4_t vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c);  
 int64x2_t vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c);  
 /*--10、Vector long multiply(长指令): vmull -> ri = ai * bi;--*/  
 int16x8_t vmull_s8 (int8x8_t __a, int8x8_t __b);  
 int32x4_t vmull_s16 (int16x4_t __a, int16x4_t __b);  
 int64x2_t vmull_s32 (int32x2_t __a, int32x2_t __b);  
 uint16x8_t vmull_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint32x4_t vmull_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint64x2_t vmull_u32 (uint32x2_t __a, uint32x2_t __b);  
 poly16x8_t vmull_p8 (poly8x8_t __a, poly8x8_t __b);  
 /*--11. Vector saturating doubling long multiply: vqdmull -> ri = sat(2 * ai * bi);
 multiplies corresponding elements and doubles the products.
 If any of the results overflow, they are saturated--*/
 int32x4_t vqdmull_s16 (int16x4_t __a, int16x4_t __b);  
 int64x2_t vqdmull_s32 (int32x2_t __a, int32x2_t __b);  
 /*--12、Fused multiply accumulate: vfma -> ri = ai + bi * ci;  
 The result of the multiply is not rounded before the accumulation--*/  
 float32x2_t vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)  
 float32x4_t vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c);  
 /*--13、Fused multiply subtract: vfms -> ri = ai - bi * ci;  
 The result of the multiply is not rounded before the subtraction--*/  
 float32x2_t vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c);  
 float32x4_t vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c);  
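 /******************************************************Round to integral****************/
 /* NOTE(review): ACLE spells the 128-bit forms vrndnq_f32, vrndaq_f32, vrndpq_f32 and
 vrndmq_f32; the vrndqn/vrndqa/vrndqp/vrndqm spellings listed below appear to come from an
 older compiler header -- confirm against the arm_neon.h actually in use. */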
 /*--1、to nearest, ties to even--*/  
 float32x2_t vrndn_f32 (float32x2_t __a);  
 float32x4_t vrndqn_f32 (float32x4_t __a);  
 /*--2、to nearest, ties away from zero--*/  
 float32x2_t vrnda_f32 (float32x2_t __a);  
 float32x4_t vrndqa_f32 (float32x4_t __a);  
 /*--3、towards +Inf--*/  
 float32x2_t vrndp_f32 (float32x2_t __a);  
 float32x4_t vrndqp_f32 (float32x4_t __a);  
 /*--4、towards -Inf--*/  
 float32x2_t vrndm_f32 (float32x2_t __a);  
 float32x4_t vrndqm_f32 (float32x4_t __a);  
 /*--5、towards 0--*/  
 float32x2_t vrnd_f32 (float32x2_t __a);  
 float32x4_t vrndq_f32 (float32x4_t __a);  
 /**********************************************Subtraction******************************/  
 /*--1、Vector subtract(正常指令):vsub -> ri = ai - bi;--*/  
 int8x8_t vsub_s8 (int8x8_t __a, int8x8_t __b);//_mm_sub_epi8  
 int16x4_t vsub_s16 (int16x4_t __a, int16x4_t __b);//_mm_sub_epi16  
 int32x2_t vsub_s32 (int32x2_t __a, int32x2_t __b);//_mm_sub_epi32  
 int64x1_t vsub_s64 (int64x1_t __a, int64x1_t __b);//_mm_sub_epi64  
 float32x2_t vsub_f32 (float32x2_t __a, float32x2_t __b);//_mm_sub_ps  
 uint8x8_t vsub_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_sub_epi8  
 uint16x4_t vsub_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_sub_epi16  
 uint32x2_t vsub_u32 (uint32x2_t __a, uint32x2_t __b);//_mm_sub_epi32  
 uint64x1_t vsub_u64 (uint64x1_t __a, uint64x1_t __b);//_mm_sub_epi64  
 int8x16_t vsubq_s8 (int8x16_t __a, int8x16_t __b);//_mm_sub_epi8  
 int16x8_t vsubq_s16 (int16x8_t __a, int16x8_t __b);//_mm_sub_epi16  
 int32x4_t vsubq_s32 (int32x4_t __a, int32x4_t __b);//_mm_sub_epi32  
 int64x2_t vsubq_s64 (int64x2_t __a, int64x2_t __b);//_mm_sub_epi64  
 float32x4_t vsubq_f32 (float32x4_t __a, float32x4_t __b);//_mm_sub_ps  
 uint8x16_t vsubq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_sub_epi8  
 uint16x8_t vsubq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_sub_epi16  
 uint32x4_t vsubq_u32 (uint32x4_t __a, uint32x4_t __b);//_mm_sub_epi32  
 uint64x2_t vsubq_u64 (uint64x2_t __a, uint64x2_t __b);//_mm_sub_epi64  
 /*--2、Vector long subtract(长指令): vsubl -> ri = ai - bi; --*/  
 int16x8_t vsubl_s8 (int8x8_t __a, int8x8_t __b);  
 int32x4_t vsubl_s16 (int16x4_t __a, int16x4_t __b);  
 int64x2_t vsubl_s32 (int32x2_t __a, int32x2_t __b);  
 uint16x8_t vsubl_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint32x4_t vsubl_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint64x2_t vsubl_u32 (uint32x2_t __a, uint32x2_t __b);  
 /*--3、Vector wide subtract(宽指令): vsubw -> ri = ai - bi;--*/  
 int16x8_t vsubw_s8 (int16x8_t __a, int8x8_t __b);  
 int32x4_t vsubw_s16 (int32x4_t __a, int16x4_t __b);  
 int64x2_t vsubw_s32 (int64x2_t __a, int32x2_t __b);  
 uint16x8_t vsubw_u8 (uint16x8_t __a, uint8x8_t __b);  
 uint32x4_t vsubw_u16 (uint32x4_t __a, uint16x4_t __b);  
 uint64x2_t vsubw_u32 (uint64x2_t __a, uint32x2_t __b);  
 /*--4. Vector saturating subtract (saturating instruction): vqsub -> ri = sat(ai - bi); 
 If any of the results overflow, they are saturated.
 Trailing comments name the corresponding SSE intrinsic where one is listed--*/  
 int8x8_t vqsub_s8 (int8x8_t __a, int8x8_t __b);//_mm_subs_epi8  
 int16x4_t vqsub_s16 (int16x4_t __a, int16x4_t __b);//_mm_subs_epi16  
 int32x2_t vqsub_s32 (int32x2_t __a, int32x2_t __b);//no SSE equivalent (there is no 32-bit _mm_subs_*)  
 int64x1_t vqsub_s64 (int64x1_t __a, int64x1_t __b);  
 uint8x8_t vqsub_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_subs_epu8  
 uint16x4_t vqsub_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_subs_epu16  
 uint32x2_t vqsub_u32 (uint32x2_t __a, uint32x2_t __b);//no SSE equivalent (there is no 32-bit _mm_subs_*)  
 uint64x1_t vqsub_u64 (uint64x1_t __a, uint64x1_t __b);  
 int8x16_t vqsubq_s8 (int8x16_t __a, int8x16_t __b);//_mm_subs_epi8  
 int16x8_t vqsubq_s16 (int16x8_t __a, int16x8_t __b);//_mm_subs_epi16  
 int32x4_t vqsubq_s32 (int32x4_t __a, int32x4_t __b);//no SSE equivalent (there is no 32-bit _mm_subs_*)  
 int64x2_t vqsubq_s64 (int64x2_t __a, int64x2_t __b);  
 uint8x16_t vqsubq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_subs_epu8  
 uint16x8_t vqsubq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_subs_epu16  
 uint32x4_t vqsubq_u32 (uint32x4_t __a, uint32x4_t __b);//no SSE equivalent (there is no 32-bit _mm_subs_*)  
 uint64x2_t vqsubq_u64 (uint64x2_t __a, uint64x2_t __b);  
 /*--5、Vector halving subtract: vhsub -> ri = (ai - bi) >> 1;  
 shifts each result right one bit, The results are truncated.--*/  
 int8x8_t vhsub_s8 (int8x8_t __a, int8x8_t __b);  
 int16x4_t vhsub_s16 (int16x4_t __a, int16x4_t __b);  
 int32x2_t vhsub_s32 (int32x2_t __a, int32x2_t __b);  
 uint8x8_t vhsub_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint16x4_t vhsub_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint32x2_t vhsub_u32 (uint32x2_t __a, uint32x2_t __b);  
 int8x16_t vhsubq_s8 (int8x16_t __a, int8x16_t __b);  
 int16x8_t vhsubq_s16 (int16x8_t __a, int16x8_t __b);  
 int32x4_t vhsubq_s32 (int32x4_t __a, int32x4_t __b);  
 uint8x16_t vhsubq_u8 (uint8x16_t __a, uint8x16_t __b);  
 uint16x8_t vhsubq_u16 (uint16x8_t __a, uint16x8_t __b);  
 uint32x4_t vhsubq_u32 (uint32x4_t __a, uint32x4_t __b);  
 /*--6、Vector subtract high half(窄指令): vsubhn -> ri = ai - bi; 
 It returns the most significant halves of the results. The results are truncated--*/  
 int8x8_t vsubhn_s16 (int16x8_t __a, int16x8_t __b);  
 int16x4_t vsubhn_s32 (int32x4_t __a, int32x4_t __b);  
 int32x2_t vsubhn_s64 (int64x2_t __a, int64x2_t __b);  
 uint8x8_t vsubhn_u16 (uint16x8_t __a, uint16x8_t __b);  
 uint16x4_t vsubhn_u32 (uint32x4_t __a, uint32x4_t __b);  
 uint32x2_t vsubhn_u64 (uint64x2_t __a, uint64x2_t __b);  
 /*--7、Vector rounding subtract high half(窄指令): vrsubhn -> ai - bi;  
 It returns the most significant halves of the results. The results are rounded--*/  
 int8x8_t vrsubhn_s16 (int16x8_t __a, int16x8_t __b);  
 int16x4_t vrsubhn_s32 (int32x4_t __a, int32x4_t __b);  
 int32x2_t vrsubhn_s64 (int64x2_t __a, int64x2_t __b)  
 uint8x8_t vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b);  
 uint16x4_t vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b);  
 uint32x2_t vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b);  
 /******************************************************Comparison***********************/  
 /*--1、Vector compare equal(正常指令): vceq -> ri = ai == bi ? 1...1 : 0...0;  
 If they are equal, the corresponding element in the destination vector is set to all ones. 
 Otherwise, it is set to all zeros--*/  
 uint8x8_t vceq_s8 (int8x8_t __a, int8x8_t __b);//_mm_cmpeq_epi8  
 uint16x4_t vceq_s16 (int16x4_t __a, int16x4_t __b);//_mm_cmpeq_epi16  
 uint32x2_t vceq_s32 (int32x2_t __a, int32x2_t __b);//_mm_cmpeq_epi32  
 uint32x2_t vceq_f32 (float32x2_t __a, float32x2_t __b);  
 uint8x8_t vceq_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_cmpeq_epi8  
 uint16x4_t vceq_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_cmpeq_epi16  
 uint32x2_t vceq_u32 (uint32x2_t __a, uint32x2_t __b);//_mm_cmpeq_epi32  
 uint8x8_t vceq_p8 (poly8x8_t __a, poly8x8_t __b);//_mm_cmpeq_epi8  
 uint8x16_t vceqq_s8 (int8x16_t __a, int8x16_t __b);//_mm_cmpeq_epi8  
 uint16x8_t vceqq_s16 (int16x8_t __a, int16x8_t __b);//_mm_cmpeq_epi16  
 uint32x4_t vceqq_s32 (int32x4_t __a, int32x4_t __b);//_mm_cmpeq_epi32  
 uint32x4_t vceqq_f32 (float32x4_t __a, float32x4_t __b);  
 uint8x16_t vceqq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_cmpeq_epi8  
 uint16x8_t vceqq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_cmpeq_epi16  
 uint32x4_t vceqq_u32 (uint32x4_t __a, uint32x4_t __b);//_mm_cmpeq_epi32  
 uint8x16_t vceqq_p8 (poly8x16_t __a, poly8x16_t __b);//_mm_cmpeq_epi8  
 /*--2、Vector compare greater-than or equal(正常指令): vcge-> ri = ai >= bi ? 1...1:0...0; 
 If it is greater than or equal to it, the corresponding element in the destination  
 vector is set to all ones. Otherwise, it is set to all zeros.--*/  
 uint8x8_t vcge_s8 (int8x8_t __a, int8x8_t __b);  
 uint16x4_t vcge_s16 (int16x4_t __a, int16x4_t __b);  
 uint32x2_t vcge_s32 (int32x2_t __a, int32x2_t __b);  
 uint32x2_t vcge_f32 (float32x2_t __a, float32x2_t __b);  
 uint8x8_t vcge_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint16x4_t vcge_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint32x2_t vcge_u32 (uint32x2_t __a, uint32x2_t __b);  
 uint8x16_t vcgeq_s8 (int8x16_t __a, int8x16_t __b);  
 uint16x8_t vcgeq_s16 (int16x8_t __a, int16x8_t __b);  
 uint32x4_t vcgeq_s32 (int32x4_t __a, int32x4_t __b);  
 uint32x4_t vcgeq_f32 (float32x4_t __a, float32x4_t __b);  
 uint8x16_t vcgeq_u8 (uint8x16_t __a, uint8x16_t __b);  
 uint16x8_t vcgeq_u16 (uint16x8_t __a, uint16x8_t __b);  
 uint32x4_t vcgeq_u32 (uint32x4_t __a, uint32x4_t __b);  
 /*--3、Vector compare less-than or equal(正常指令): vcle -> ri = ai <= bi ? 1...1:0...0; 
 If it is less than or equal to it, the corresponding element in the destination vector  
 is set to all ones. Otherwise, it is set to all zeros.--*/  
 uint8x8_t vcle_s8 (int8x8_t __a, int8x8_t __b);  
 uint16x4_t vcle_s16 (int16x4_t __a, int16x4_t __b);  
 uint32x2_t vcle_s32 (int32x2_t __a, int32x2_t __b);  
 uint32x2_t vcle_f32 (float32x2_t __a, float32x2_t __b);  
 uint8x8_t vcle_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint16x4_t vcle_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint32x2_t vcle_u32 (uint32x2_t __a, uint32x2_t __b);  
 uint8x16_t vcleq_s8 (int8x16_t __a, int8x16_t __b);  
 uint16x8_t vcleq_s16 (int16x8_t __a, int16x8_t __b);  
 uint32x4_t vcleq_s32 (int32x4_t __a, int32x4_t __b);  
 uint32x4_t vcleq_f32 (float32x4_t __a, float32x4_t __b);  
 uint8x16_t vcleq_u8 (uint8x16_t __a, uint8x16_t __b);  
 uint16x8_t vcleq_u16 (uint16x8_t __a, uint16x8_t __b);  
 uint32x4_t vcleq_u32 (uint32x4_t __a, uint32x4_t __b);  
 /*--4、Vector compare greater-than(正常指令): vcgt -> ri = ai > bi ? 1...1:0...0; 
 If it is greater than it, the corresponding element in the destination vector is 
 set to all ones. Otherwise, it is set to all zeros--*/  
 uint8x8_t vcgt_s8 (int8x8_t __a, int8x8_t __b);  
 uint16x4_t vcgt_s16 (int16x4_t __a, int16x4_t __b);  
 uint32x2_t vcgt_s32 (int32x2_t __a, int32x2_t __b);  
 uint32x2_t vcgt_f32 (float32x2_t __a, float32x2_t __b);  
 uint8x8_t vcgt_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint16x4_t vcgt_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint32x2_t vcgt_u32 (uint32x2_t __a, uint32x2_t __b);  
 uint8x16_t vcgtq_s8 (int8x16_t __a, int8x16_t __b);  
 uint16x8_t vcgtq_s16 (int16x8_t __a, int16x8_t __b);  
 uint32x4_t vcgtq_s32 (int32x4_t __a, int32x4_t __b);  
 uint32x4_t vcgtq_f32 (float32x4_t __a, float32x4_t __b);  
 uint8x16_t vcgtq_u8 (uint8x16_t __a, uint8x16_t __b);  
 uint16x8_t vcgtq_u16 (uint16x8_t __a, uint16x8_t __b);  
 uint32x4_t vcgtq_u32 (uint32x4_t __a, uint32x4_t __b);  
 /*--5、Vector compare less-than(正常指令): vclt -> ri = ai < bi ? 1...1:0...0; 
 If it is less than it, the corresponding element in the destination vector is set  
 to all ones.Otherwise, it is set to all zeros--*/  
 uint8x8_t vclt_s8 (int8x8_t __a, int8x8_t __b);  
 uint16x4_t vclt_s16 (int16x4_t __a, int16x4_t __b);  
 uint32x2_t vclt_s32 (int32x2_t __a, int32x2_t __b);  
 uint32x2_t vclt_f32 (float32x2_t __a, float32x2_t __b);  
 uint8x8_t vclt_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint16x4_t vclt_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint32x2_t vclt_u32 (uint32x2_t __a, uint32x2_t __b);  
 uint8x16_t vcltq_s8 (int8x16_t __a, int8x16_t __b);  
 uint16x8_t vcltq_s16 (int16x8_t __a, int16x8_t __b);  
 uint32x4_t vcltq_s32 (int32x4_t __a, int32x4_t __b);  
 uint32x4_t vcltq_f32 (float32x4_t __a, float32x4_t __b);  
 uint8x16_t vcltq_u8 (uint8x16_t __a, uint8x16_t __b);  
 uint16x8_t vcltq_u16 (uint16x8_t __a, uint16x8_t __b);  
 uint32x4_t vcltq_u32 (uint32x4_t __a, uint32x4_t __b);  
 /*--6、Vector compare absolute greater-than or equal(正常指令):  
 vcage -> ri = |ai| >= |bi| ? 1...1:0...0; 
 compares the absolute value of each element in a vector with the absolute value of the  
 corresponding element of a second vector. If it is greater than or equal to it,  
 the corresponding element in the destination vector is set to all ones. 
 Otherwise, it is set to all zeros.--*/  
 uint32x2_t vcage_f32 (float32x2_t __a, float32x2_t __b);  
 uint32x4_t vcageq_f32 (float32x4_t __a, float32x4_t __b);  
 /*--7、Vector compare absolute less-than or equal(正常指令): 
 vcale -> ri = |ai| <= |bi| ? 1...1:0...0; 
 compares the absolute value of each element in a vector with the absolute value of the  
 corresponding element of a second vector. If it is less than or equal to it,  
 the corresponding element in the destination vector is set to all ones. 
 Otherwise, it is set to all zeros--*/  
 uint32x2_t vcale_f32 (float32x2_t __a, float32x2_t __b);  
 uint32x4_t vcaleq_f32 (float32x4_t __a, float32x4_t __b);  
 /*--8. Vector compare absolute greater-than (normal instruction):
 vcagt -> ri = |ai| > |bi| ? 1...1:0...0; 
 compares the absolute value of each element in a vector with the absolute value of the 
 corresponding element of a second vector. If it is greater than it,  
 the corresponding element in the destination vector is set to all ones.  
 Otherwise, it is set to all zeros.--*/  
 uint32x2_t vcagt_f32 (float32x2_t __a, float32x2_t __b);  
 uint32x4_t vcagtq_f32 (float32x4_t __a, float32x4_t __b);  
 /*--9、Vector compare absolute less-than(正常指令): 
 vcalt -> ri = |ai| < |bi| ? 1...1:0...0; 
 compares the absolute value of each element in a vector with the absolute value of the 
 corresponding element of a second vector.If it is less than it, the corresponding  
 element in the destination vector is set to all ones. Otherwise,it is set to all zeros--*/  
 uint32x2_t vcalt_f32 (float32x2_t __a, float32x2_t __b);  
 uint32x4_t vcaltq_f32 (float32x4_t __a, float32x4_t __b);  
 /**********************************************Vector test bits*************************/  
 /*--Normal instruction, vtst -> ri = ((ai & bi) != 0) ? 1...1:0...0; 
 bitwise logical ANDs each element in a vector with the corresponding element of a second  
 vector.If the result is not zero, the corresponding element in the destination vector  
 is set to all ones. Otherwise, it is set to all zeros--*/  
 uint8x8_t vtst_s8 (int8x8_t __a, int8x8_t __b);  
 uint16x4_t vtst_s16 (int16x4_t __a, int16x4_t __b);  
 uint32x2_t vtst_s32 (int32x2_t __a, int32x2_t __b);  
 uint8x8_t vtst_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint16x4_t vtst_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint32x2_t vtst_u32 (uint32x2_t __a, uint32x2_t __b);  
 uint8x8_t vtst_p8 (poly8x8_t __a, poly8x8_t __b);  
 uint8x16_t vtstq_s8 (int8x16_t __a, int8x16_t __b);  
 uint16x8_t vtstq_s16 (int16x8_t __a, int16x8_t __b);  
 uint32x4_t vtstq_s32 (int32x4_t __a, int32x4_t __b);  
 uint8x16_t vtstq_u8 (uint8x16_t __a, uint8x16_t __b);  
 uint16x8_t vtstq_u16 (uint16x8_t __a, uint16x8_t __b);  
 uint32x4_t vtstq_u32 (uint32x4_t __a, uint32x4_t __b);  
 uint8x16_t vtstq_p8 (poly8x16_t __a, poly8x16_t __b);  
 /**********************************************Absolute difference**********************/  
 /*--1、Absolute difference between the arguments(正常指令): vabd -> ri = |ai - bi|; 
 returns the absolute values of the results--*/  
 int8x8_t vabd_s8 (int8x8_t __a, int8x8_t __b);  
 int16x4_t vabd_s16 (int16x4_t __a, int16x4_t __b);  
 int32x2_t vabd_s32 (int32x2_t __a, int32x2_t __b);  
 float32x2_t vabd_f32 (float32x2_t __a, float32x2_t __b);  
 uint8x8_t vabd_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint16x4_t vabd_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint32x2_t vabd_u32 (uint32x2_t __a, uint32x2_t __b);  
 int8x16_t vabdq_s8 (int8x16_t __a, int8x16_t __b);  
 int16x8_t vabdq_s16 (int16x8_t __a, int16x8_t __b);  
 int32x4_t vabdq_s32 (int32x4_t __a, int32x4_t __b);  
 float32x4_t vabdq_f32 (float32x4_t __a, float32x4_t __b);  
 uint8x16_t vabdq_u8 (uint8x16_t __a, uint8x16_t __b);  
 uint16x8_t vabdq_u16 (uint16x8_t __a, uint16x8_t __b);  
 uint32x4_t vabdq_u32 (uint32x4_t __a, uint32x4_t __b);  
 /*--2、Absolute difference - long(长指令): vabdl -> ri = |ai - bi|;  
 The elements in the result vector are wider--*/  
 int16x8_t vabdl_s8 (int8x8_t __a, int8x8_t __b);  
 int32x4_t vabdl_s16 (int16x4_t __a, int16x4_t __b);  
 int64x2_t vabdl_s32 (int32x2_t __a, int32x2_t __b);  
 uint16x8_t vabdl_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint32x4_t vabdl_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint64x2_t vabdl_u32 (uint32x2_t __a, uint32x2_t __b);  
 /*--3、Absolute difference and accumulate: vaba -> ri = ai + |bi - ci|;--*/  
 int8x8_t vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c);  
 int16x4_t vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c);  
 int32x2_t vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c);  
 uint8x8_t vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c);  
 uint16x4_t vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c);  
 uint32x2_t vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c);  
 int8x16_t vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c);  
 int16x8_t vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c);  
 int32x4_t vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c);  
 uint8x16_t vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c);  
 uint16x8_t vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c);  
 uint32x4_t vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c);  
 /*--4、Absolute difference and accumulate - long: vabal -> ri = ai + |bi - ci|;  
 The elements in the result are wider--*/  
 int16x8_t vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c);  
 int32x4_t vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c);  
 int64x2_t vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c);  
 uint16x8_t vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c);  
 uint32x4_t vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c);  
 uint64x2_t vabal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c);  
 /***********************************************Max*************************************/
 /*--Normal instruction: vmax -> ri = ai >= bi ? ai : bi;
 returns the larger of each pair of corresponding elements.
 The trailing line comments appear to name the closest x86 SSE intrinsic.--*/
 int8x8_t vmax_s8 (int8x8_t __a, int8x8_t __b);//_mm_max_epi8
 int16x4_t vmax_s16 (int16x4_t __a, int16x4_t __b);//_mm_max_epi16
 int32x2_t vmax_s32 (int32x2_t __a, int32x2_t __b);//_mm_max_epi32
 float32x2_t vmax_f32 (float32x2_t __a, float32x2_t __b);//_mm_max_ps
 uint8x8_t vmax_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_max_epu8
 uint16x4_t vmax_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_max_epu16
 uint32x2_t vmax_u32 (uint32x2_t __a, uint32x2_t __b);//_mm_max_epu32
 int8x16_t vmaxq_s8 (int8x16_t __a, int8x16_t __b);//_mm_max_epi8
 int16x8_t vmaxq_s16 (int16x8_t __a, int16x8_t __b);//_mm_max_epi16
 int32x4_t vmaxq_s32 (int32x4_t __a, int32x4_t __b);//_mm_max_epi32
 float32x4_t vmaxq_f32 (float32x4_t __a, float32x4_t __b);//_mm_max_ps
 uint8x16_t vmaxq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_max_epu8
 uint16x8_t vmaxq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_max_epu16
 uint32x4_t vmaxq_u32 (uint32x4_t __a, uint32x4_t __b);//_mm_max_epu32
 /****************************************************Min********************************/
 /*--Normal instruction: vmin -> ri = ai >= bi ? bi : ai;
 returns the smaller of each pair of corresponding elements.
 The trailing line comments appear to name the closest x86 SSE intrinsic.--*/
 int8x8_t vmin_s8 (int8x8_t __a, int8x8_t __b);//_mm_min_epi8
 int16x4_t vmin_s16 (int16x4_t __a, int16x4_t __b);//_mm_min_epi16
 int32x2_t vmin_s32 (int32x2_t __a, int32x2_t __b);//_mm_min_epi32
 float32x2_t vmin_f32 (float32x2_t __a, float32x2_t __b);//_mm_min_ps
 uint8x8_t vmin_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_min_epu8
 uint16x4_t vmin_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_min_epu16
 uint32x2_t vmin_u32 (uint32x2_t __a, uint32x2_t __b);//_mm_min_epu32
 int8x16_t vminq_s8 (int8x16_t __a, int8x16_t __b);//_mm_min_epi8
 int16x8_t vminq_s16 (int16x8_t __a, int16x8_t __b);//_mm_min_epi16
 int32x4_t vminq_s32 (int32x4_t __a, int32x4_t __b);//_mm_min_epi32
 float32x4_t vminq_f32 (float32x4_t __a, float32x4_t __b);//_mm_min_ps
 uint8x16_t vminq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_min_epu8
 uint16x8_t vminq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_min_epu16
 uint32x4_t vminq_u32 (uint32x4_t __a, uint32x4_t __b);//_mm_min_epu32
 /*******************************************Pairwise addition***************************/
 /*--1. Pairwise add (normal instruction):
 vpadd -> r0 = a0 + a1, ..., r3 = a6 + a7, r4 = b0 + b1, ..., r7 = b6 + b7
 (indices shown for the 8-lane variants).
 Adds adjacent pairs of elements of the two input vectors and places the sums in the
 destination vector: sums from __a fill the lower half, sums from __b the upper half.--*/
 // 8-lane example: r0 = a0 + a1, ..., r3 = a6 + a7, r4 = b0 + b1, ..., r7 = b6 + b7
 int8x8_t vpadd_s8 (int8x8_t __a, int8x8_t __b);
 int16x4_t vpadd_s16 (int16x4_t __a, int16x4_t __b);
 int32x2_t vpadd_s32 (int32x2_t __a, int32x2_t __b);
 float32x2_t vpadd_f32 (float32x2_t __a, float32x2_t __b);
 uint8x8_t vpadd_u8 (uint8x8_t __a, uint8x8_t __b);
 uint16x4_t vpadd_u16 (uint16x4_t __a, uint16x4_t __b);
 uint32x2_t vpadd_u32 (uint32x2_t __a, uint32x2_t __b);
 /*--2. Long pairwise add: vpaddl -> r0 = a0 + a1, ..., r3 = a6 + a7
 (indices shown for an 8-lane input).
 Adds adjacent pairs of elements of a vector, sign extends or zero extends the results
 to twice their original width, and places the final results in the destination vector.
 The result therefore has half as many elements as the input, each twice as wide.--*/
 int16x4_t vpaddl_s8 (int8x8_t __a);
 int32x2_t vpaddl_s16 (int16x4_t __a);
 int64x1_t vpaddl_s32 (int32x2_t __a);
 uint16x4_t vpaddl_u8 (uint8x8_t __a);
 uint32x2_t vpaddl_u16 (uint16x4_t __a);
 uint64x1_t vpaddl_u32 (uint32x2_t __a);
 int16x8_t vpaddlq_s8 (int8x16_t __a);
 int32x4_t vpaddlq_s16 (int16x8_t __a);
 int64x2_t vpaddlq_s32 (int32x4_t __a);
 uint16x8_t vpaddlq_u8 (uint8x16_t __a);
 uint32x4_t vpaddlq_u16 (uint16x8_t __a);
 uint64x2_t vpaddlq_u32 (uint32x4_t __a);
 /*--3. Long pairwise add and accumulate:
 vpadal -> r0 = a0 + (b0 + b1), ..., r3 = a3 + (b6 + b7)
 (indices shown for an 8-lane __b).
 Adds adjacent pairs of elements in the second vector (__b), sign extends or zero
 extends the sums to twice the original width, then adds each sum to the corresponding
 element of the first vector (__a) and places the final results in the destination
 vector.--*/
 int16x4_t vpadal_s8 (int16x4_t __a, int8x8_t __b);
 int32x2_t vpadal_s16 (int32x2_t __a, int16x4_t __b);
 int64x1_t vpadal_s32 (int64x1_t __a, int32x2_t __b);
 uint16x4_t vpadal_u8 (uint16x4_t __a, uint8x8_t __b);
 uint32x2_t vpadal_u16 (uint32x2_t __a, uint16x4_t __b);
 uint64x1_t vpadal_u32 (uint64x1_t __a, uint32x2_t __b);
 int16x8_t vpadalq_s8 (int16x8_t __a, int8x16_t __b);
 int32x4_t vpadalq_s16 (int32x4_t __a, int16x8_t __b);
 int64x2_t vpadalq_s32 (int64x2_t __a, int32x4_t __b);
 uint16x8_t vpadalq_u8 (uint16x8_t __a, uint8x16_t __b);
 uint32x4_t vpadalq_u16 (uint32x4_t __a, uint16x8_t __b);
 uint64x2_t vpadalq_u32 (uint64x2_t __a, uint32x4_t __b);
 /**********************************************Folding maximum**************************/
 /*--Normal instruction (no saturation is involved):
 vpmax -> r0 = a0 >= a1 ? a0 : a1, ..., r4 = b0 >= b1 ? b0 : b1, ...
 (indices shown for the 8-lane variants).
 Compares adjacent pairs of elements and copies the larger of each pair into the
 destination vector. The maximums from each pair of the first input vector are stored
 in the lower half of the destination vector. The maximums from each pair of the second
 input vector are stored in the higher half of the destination vector.--*/
 int8x8_t vpmax_s8 (int8x8_t __a, int8x8_t __b);
 int16x4_t vpmax_s16 (int16x4_t __a, int16x4_t __b);
 int32x2_t vpmax_s32 (int32x2_t __a, int32x2_t __b);
 float32x2_t vpmax_f32 (float32x2_t __a, float32x2_t __b);
 uint8x8_t vpmax_u8 (uint8x8_t __a, uint8x8_t __b);
 uint16x4_t vpmax_u16 (uint16x4_t __a, uint16x4_t __b);
 uint32x2_t vpmax_u32 (uint32x2_t __a, uint32x2_t __b);
 /***************************************************Folding minimum*********************/
 /*--Normal instruction (no saturation is involved):
 vpmin -> r0 = a0 >= a1 ? a1 : a0, ..., r4 = b0 >= b1 ? b1 : b0, ...
 (indices shown for the 8-lane variants).
 Compares adjacent pairs of elements and copies the smaller of each pair into the
 destination vector. The minimums from each pair of the first input vector are stored
 in the lower half of the destination vector. The minimums from each pair of the second
 input vector are stored in the higher half of the destination vector.--*/
 int8x8_t vpmin_s8 (int8x8_t __a, int8x8_t __b);
 int16x4_t vpmin_s16 (int16x4_t __a, int16x4_t __b);
 int32x2_t vpmin_s32 (int32x2_t __a, int32x2_t __b);
 float32x2_t vpmin_f32 (float32x2_t __a, float32x2_t __b);
 uint8x8_t vpmin_u8 (uint8x8_t __a, uint8x8_t __b);
 uint16x4_t vpmin_u16 (uint16x4_t __a, uint16x4_t __b);
 uint32x2_t vpmin_u32 (uint32x2_t __a, uint32x2_t __b);
 /***************************************************Reciprocal**************************/
 /*--1. Reciprocal step (floating-point, no saturation is involved):
 vrecps -> ri = 2 - ai * bi;
 Performs a Newton-Raphson iteration step for finding the reciprocal. It multiplies
 the elements of one vector by the corresponding elements of another vector, subtracts
 each of the results from 2, and places the final results into the elements of the
 destination vector.--*/
 float32x2_t vrecps_f32 (float32x2_t __a, float32x2_t __b);
 float32x4_t vrecpsq_f32 (float32x4_t __a, float32x4_t __b);
 /*--2. Reciprocal square root step (floating-point, no saturation is involved):
 vrsqrts -> ri = (3 - ai * bi) / 2;
 Performs a Newton-Raphson step for finding the reciprocal square root. It multiplies
 the elements of one vector by the corresponding elements of another vector, subtracts
 each of the results from 3, divides these results by two, and places the final results
 into the elements of the destination vector.--*/
 float32x2_t vrsqrts_f32 (float32x2_t __a, float32x2_t __b);
 float32x4_t vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b);
 /************************************************Shifts by signed variable**************/
 /*--1. Vector shift left (normal instruction): vshl -> ri = ai << bi;
 (negative values shift right)
 Left shifts each element of __a by the amount specified in the corresponding element
 of __b. The shift amount is the signed integer value of the least significant byte of
 that element. The bits shifted out of each element are lost. If the signed integer
 value is negative, it results in a right shift. Note that the shift vector __b is a
 signed type even for the unsigned variants.--*/
 int8x8_t vshl_s8 (int8x8_t __a, int8x8_t __b);
 int16x4_t vshl_s16 (int16x4_t __a, int16x4_t __b);
 int32x2_t vshl_s32 (int32x2_t __a, int32x2_t __b);
 int64x1_t vshl_s64 (int64x1_t __a, int64x1_t __b);
 uint8x8_t vshl_u8 (uint8x8_t __a, int8x8_t __b);
 uint16x4_t vshl_u16 (uint16x4_t __a, int16x4_t __b);
 uint32x2_t vshl_u32 (uint32x2_t __a, int32x2_t __b);
 uint64x1_t vshl_u64 (uint64x1_t __a, int64x1_t __b);
 int8x16_t vshlq_s8 (int8x16_t __a, int8x16_t __b);
 int16x8_t vshlq_s16 (int16x8_t __a, int16x8_t __b);
 int32x4_t vshlq_s32 (int32x4_t __a, int32x4_t __b);
 int64x2_t vshlq_s64 (int64x2_t __a, int64x2_t __b);
 uint8x16_t vshlq_u8 (uint8x16_t __a, int8x16_t __b);
 uint16x8_t vshlq_u16 (uint16x8_t __a, int16x8_t __b);
 uint32x4_t vshlq_u32 (uint32x4_t __a, int32x4_t __b);
 uint64x2_t vshlq_u64 (uint64x2_t __a, int64x2_t __b);
 /*--2. Vector saturating shift left (saturating instruction):
 vqshl -> ri = sat(ai << bi); (negative values shift right)
 If the shift value is positive, the operation is a left shift. Otherwise, it is a
 truncating right shift. Left shifts each element in a vector of integers and places
 the results in the destination vector. It is similar to VSHL; the difference is that
 results are saturated and the sticky QC flag is set if saturation occurs.--*/
 int8x8_t vqshl_s8 (int8x8_t __a, int8x8_t __b);
 int16x4_t vqshl_s16 (int16x4_t __a, int16x4_t __b);
 int32x2_t vqshl_s32 (int32x2_t __a, int32x2_t __b);
 int64x1_t vqshl_s64 (int64x1_t __a, int64x1_t __b);
 uint8x8_t vqshl_u8 (uint8x8_t __a, int8x8_t __b);
 uint16x4_t vqshl_u16 (uint16x4_t __a, int16x4_t __b);
 uint32x2_t vqshl_u32 (uint32x2_t __a, int32x2_t __b);
 uint64x1_t vqshl_u64 (uint64x1_t __a, int64x1_t __b);
 int8x16_t vqshlq_s8 (int8x16_t __a, int8x16_t __b);
 int16x8_t vqshlq_s16 (int16x8_t __a, int16x8_t __b);
 int32x4_t vqshlq_s32 (int32x4_t __a, int32x4_t __b);
 int64x2_t vqshlq_s64 (int64x2_t __a, int64x2_t __b);
 uint8x16_t vqshlq_u8 (uint8x16_t __a, int8x16_t __b);
 uint16x8_t vqshlq_u16 (uint16x8_t __a, int16x8_t __b);
 uint32x4_t vqshlq_u32 (uint32x4_t __a, int32x4_t __b);
 uint64x2_t vqshlq_u64 (uint64x2_t __a, int64x2_t __b);
 /*--3. Vector rounding shift left (normal instruction):
 vrshl -> ri = ai << bi; (negative values shift right)
 If the shift value is positive, the operation is a left shift. Otherwise, it is a
 rounding right shift. Left shifts each element in a vector of integers and places
 the results in the destination vector. It is similar to VSHL; the difference is that
 the shifted value is then rounded.--*/
 int8x8_t vrshl_s8 (int8x8_t __a, int8x8_t __b);
 int16x4_t vrshl_s16 (int16x4_t __a, int16x4_t __b);
 int32x2_t vrshl_s32 (int32x2_t __a, int32x2_t __b);
 int64x1_t vrshl_s64 (int64x1_t __a, int64x1_t __b);
 uint8x8_t vrshl_u8 (uint8x8_t __a, int8x8_t __b);
 uint16x4_t vrshl_u16 (uint16x4_t __a, int16x4_t __b);
 uint32x2_t vrshl_u32 (uint32x2_t __a, int32x2_t __b);
 uint64x1_t vrshl_u64 (uint64x1_t __a, int64x1_t __b);
 int8x16_t vrshlq_s8 (int8x16_t __a, int8x16_t __b);
 int16x8_t vrshlq_s16 (int16x8_t __a, int16x8_t __b);
 int32x4_t vrshlq_s32 (int32x4_t __a, int32x4_t __b);
 int64x2_t vrshlq_s64 (int64x2_t __a, int64x2_t __b);
 uint8x16_t vrshlq_u8 (uint8x16_t __a, int8x16_t __b);
 uint16x8_t vrshlq_u16 (uint16x8_t __a, int16x8_t __b);
 uint32x4_t vrshlq_u32 (uint32x4_t __a, int32x4_t __b);
 uint64x2_t vrshlq_u64 (uint64x2_t __a, int64x2_t __b);
 /*--4. Vector saturating rounding shift left (saturating instruction):
 vqrshl -> ri = sat(ai << bi); (negative values shift right)
 Left shifts each element in a vector of integers and places the results in the
 destination vector. It is similar to VSHL; the difference is that the shifted value
 is rounded, and the sticky QC flag is set if saturation occurs.--*/
 int8x8_t vqrshl_s8 (int8x8_t __a, int8x8_t __b);
 int16x4_t vqrshl_s16 (int16x4_t __a, int16x4_t __b);
 int32x2_t vqrshl_s32 (int32x2_t __a, int32x2_t __b);
 int64x1_t vqrshl_s64 (int64x1_t __a, int64x1_t __b);
 uint8x8_t vqrshl_u8 (uint8x8_t __a, int8x8_t __b);
 uint16x4_t vqrshl_u16 (uint16x4_t __a, int16x4_t __b);
 uint32x2_t vqrshl_u32 (uint32x2_t __a, int32x2_t __b);
 uint64x1_t vqrshl_u64 (uint64x1_t __a, int64x1_t __b);
 int8x16_t vqrshlq_s8 (int8x16_t __a, int8x16_t __b);
 int16x8_t vqrshlq_s16 (int16x8_t __a, int16x8_t __b);
 int32x4_t vqrshlq_s32 (int32x4_t __a, int32x4_t __b);
 int64x2_t vqrshlq_s64 (int64x2_t __a, int64x2_t __b);
 uint8x16_t vqrshlq_u8 (uint8x16_t __a, int8x16_t __b);
 uint16x8_t vqrshlq_u16 (uint16x8_t __a, int16x8_t __b);
 uint32x4_t vqrshlq_u32 (uint32x4_t __a, int32x4_t __b);
 uint64x2_t vqrshlq_u64 (uint64x2_t __a, int64x2_t __b);
 /****************************************Shifts by a constant***************************/
 /*--1. Vector shift right by constant: vshr -> ri = ai >> b; the results are truncated.
 Right shifts each element in a vector by the immediate value __b and places the
 results in the destination vector.--*/
 int8x8_t vshr_n_s8 (int8x8_t __a, const int __b);
 int16x4_t vshr_n_s16 (int16x4_t __a, const int __b);
 int32x2_t vshr_n_s32 (int32x2_t __a, const int __b);
 int64x1_t vshr_n_s64 (int64x1_t __a, const int __b);
 uint8x8_t vshr_n_u8 (uint8x8_t __a, const int __b);
 uint16x4_t vshr_n_u16 (uint16x4_t __a, const int __b);
 uint32x2_t vshr_n_u32 (uint32x2_t __a, const int __b);
 uint64x1_t vshr_n_u64 (uint64x1_t __a, const int __b);
 int8x16_t vshrq_n_s8 (int8x16_t __a, const int __b);
 int16x8_t vshrq_n_s16 (int16x8_t __a, const int __b);
 int32x4_t vshrq_n_s32 (int32x4_t __a, const int __b);
 int64x2_t vshrq_n_s64 (int64x2_t __a, const int __b);
 uint8x16_t vshrq_n_u8 (uint8x16_t __a, const int __b);
 uint16x8_t vshrq_n_u16 (uint16x8_t __a, const int __b);
 uint32x4_t vshrq_n_u32 (uint32x4_t __a, const int __b);
 uint64x2_t vshrq_n_u64 (uint64x2_t __a, const int __b);
 /*--2. Vector shift left by constant: vshl -> ri = ai << b;
 Left shifts each element in a vector by the immediate value __b and places the
 results in the destination vector. The bits shifted out of the left of each element
 are lost.--*/
 int8x8_t vshl_n_s8 (int8x8_t __a, const int __b);
 int16x4_t vshl_n_s16 (int16x4_t __a, const int __b);
 int32x2_t vshl_n_s32 (int32x2_t __a, const int __b);
 int64x1_t vshl_n_s64 (int64x1_t __a, const int __b);
 uint8x8_t vshl_n_u8 (uint8x8_t __a, const int __b);
 uint16x4_t vshl_n_u16 (uint16x4_t __a, const int __b);
 uint32x2_t vshl_n_u32 (uint32x2_t __a, const int __b);
 uint64x1_t vshl_n_u64 (uint64x1_t __a, const int __b);
 int8x16_t vshlq_n_s8 (int8x16_t __a, const int __b);
 int16x8_t vshlq_n_s16 (int16x8_t __a, const int __b);
 int32x4_t vshlq_n_s32 (int32x4_t __a, const int __b);
 int64x2_t vshlq_n_s64 (int64x2_t __a, const int __b);
 uint8x16_t vshlq_n_u8 (uint8x16_t __a, const int __b);
 uint16x8_t vshlq_n_u16 (uint16x8_t __a, const int __b);
 uint32x4_t vshlq_n_u32 (uint32x4_t __a, const int __b);
 uint64x2_t vshlq_n_u64 (uint64x2_t __a, const int __b);
 /*--3. Vector rounding shift right by constant: vrshr -> ri = round(ai >> b);
 Right shifts each element in a vector by the immediate value __b and places the
 results in the destination vector. The shifted values are rounded.--*/
 int8x8_t vrshr_n_s8 (int8x8_t __a, const int __b);
 int16x4_t vrshr_n_s16 (int16x4_t __a, const int __b);
 int32x2_t vrshr_n_s32 (int32x2_t __a, const int __b);
 int64x1_t vrshr_n_s64 (int64x1_t __a, const int __b);
 uint8x8_t vrshr_n_u8 (uint8x8_t __a, const int __b);
 uint16x4_t vrshr_n_u16 (uint16x4_t __a, const int __b);
 uint32x2_t vrshr_n_u32 (uint32x2_t __a, const int __b);
 uint64x1_t vrshr_n_u64 (uint64x1_t __a, const int __b);
 int8x16_t vrshrq_n_s8 (int8x16_t __a, const int __b);
 int16x8_t vrshrq_n_s16 (int16x8_t __a, const int __b);
 int32x4_t vrshrq_n_s32 (int32x4_t __a, const int __b);
 int64x2_t vrshrq_n_s64 (int64x2_t __a, const int __b);
 uint8x16_t vrshrq_n_u8 (uint8x16_t __a, const int __b);
 uint16x8_t vrshrq_n_u16 (uint16x8_t __a, const int __b);
 uint32x4_t vrshrq_n_u32 (uint32x4_t __a, const int __b);
 uint64x2_t vrshrq_n_u64 (uint64x2_t __a, const int __b);
 /*--4. Vector shift right by constant and accumulate: vsra -> ri = ai + (bi >> c);
 The shifted values are truncated. Right shifts each element of __b by the immediate
 value __c and accumulates the shifted values onto the corresponding elements of __a
 (the accumulator itself is not shifted).--*/
 int8x8_t vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c);
 int16x4_t vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c);
 int32x2_t vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c);
 int64x1_t vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c);
 uint8x8_t vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c);
 uint16x4_t vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c);
 uint32x2_t vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c);
 uint64x1_t vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c);
 int8x16_t vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c);
 int16x8_t vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c);
 int32x4_t vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c);
 int64x2_t vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c);
 uint8x16_t vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c);
 uint16x8_t vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c);
 uint32x4_t vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c);
 uint64x2_t vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c);
 /*--5. Vector rounding shift right by constant and accumulate:
 vrsra -> ri = ai + round(bi >> c);
 The shifted values are rounded. Right shifts each element of __b by the immediate
 value __c and accumulates the rounded results onto the corresponding elements of __a
 (the accumulator itself is not shifted).--*/
 int8x8_t vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c);
 int16x4_t vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c);
 int32x2_t vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c);
 int64x1_t vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c);
 uint8x8_t vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c);
 uint16x4_t vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c);
 uint32x2_t vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c);
 uint64x1_t vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c);
 int8x16_t vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c);
 int16x8_t vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c);
 int32x4_t vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c);
 int64x2_t vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c);
 uint8x16_t vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c);
 uint16x8_t vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c);
 uint32x4_t vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c);
 uint64x2_t vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c);
 /*--6. Vector saturating shift left by constant: vqshl -> ri = sat(ai << b);
 Left shifts each element in a vector of integers by the immediate value __b and
 places the results in the destination vector. The sticky QC flag is set if
 saturation occurs.--*/
 int8x8_t vqshl_n_s8 (int8x8_t __a, const int __b);
 int16x4_t vqshl_n_s16 (int16x4_t __a, const int __b);
 int32x2_t vqshl_n_s32 (int32x2_t __a, const int __b);
 int64x1_t vqshl_n_s64 (int64x1_t __a, const int __b);
 uint8x8_t vqshl_n_u8 (uint8x8_t __a, const int __b);
 uint16x4_t vqshl_n_u16 (uint16x4_t __a, const int __b);
 uint32x2_t vqshl_n_u32 (uint32x2_t __a, const int __b);
 uint64x1_t vqshl_n_u64 (uint64x1_t __a, const int __b);
 int8x16_t vqshlq_n_s8 (int8x16_t __a, const int __b);
 int16x8_t vqshlq_n_s16 (int16x8_t __a, const int __b);
 int32x4_t vqshlq_n_s32 (int32x4_t __a, const int __b);
 int64x2_t vqshlq_n_s64 (int64x2_t __a, const int __b);
 uint8x16_t vqshlq_n_u8 (uint8x16_t __a, const int __b);
 uint16x8_t vqshlq_n_u16 (uint16x8_t __a, const int __b);
 uint32x4_t vqshlq_n_u32 (uint32x4_t __a, const int __b);
 uint64x2_t vqshlq_n_u64 (uint64x2_t __a, const int __b);
 /*--7. Vector signed->unsigned saturating shift left by constant:
 vqshlu -> ri = sat(ai << b);
 Left shifts each element in a vector of signed integers by the immediate value __b
 and places the results in the destination vector. The results are unsigned even
 though the operands are signed, and the sticky QC flag is set if saturation occurs.--*/
 uint8x8_t vqshlu_n_s8 (int8x8_t __a, const int __b);
 uint16x4_t vqshlu_n_s16 (int16x4_t __a, const int __b);
 uint32x2_t vqshlu_n_s32 (int32x2_t __a, const int __b);
 uint64x1_t vqshlu_n_s64 (int64x1_t __a, const int __b);
 uint8x16_t vqshluq_n_s8 (int8x16_t __a, const int __b);
 uint16x8_t vqshluq_n_s16 (int16x8_t __a, const int __b);
 uint32x4_t vqshluq_n_s32 (int32x4_t __a, const int __b);
 uint64x2_t vqshluq_n_s64 (int64x2_t __a, const int __b);
 /*--8. Vector narrowing shift right by constant (narrow instruction):
 vshrn -> ri = ai >> b; the results are truncated.
 Right shifts each element in the input vector by the immediate value __b. It then
 narrows the result by storing only the least significant half of each element into
 the destination vector.--*/
 int8x8_t vshrn_n_s16 (int16x8_t __a, const int __b);
 int16x4_t vshrn_n_s32 (int32x4_t __a, const int __b);
 int32x2_t vshrn_n_s64 (int64x2_t __a, const int __b);
 uint8x8_t vshrn_n_u16 (uint16x8_t __a, const int __b);
 uint16x4_t vshrn_n_u32 (uint32x4_t __a, const int __b);
 uint32x2_t vshrn_n_u64 (uint64x2_t __a, const int __b);
 /*--9. Vector signed->unsigned narrowing saturating shift right by constant:
 vqshrun -> ri = sat(ai >> b);
 Results are truncated. Right shifts each element in a quadword vector of integers by
 the immediate value __b and places the results in a doubleword vector. The results
 are unsigned, although the operands are signed. The sticky QC flag is set if
 saturation occurs.--*/
 uint8x8_t vqshrun_n_s16 (int16x8_t __a, const int __b);
 uint16x4_t vqshrun_n_s32 (int32x4_t __a, const int __b);
 uint32x2_t vqshrun_n_s64 (int64x2_t __a, const int __b);
 /*--10. Vector signed->unsigned rounding narrowing saturating shift right by constant:
 vqrshrun -> ri = sat(round(ai >> b));
 Results are rounded. Right shifts each element in a quadword vector of integers by
 the immediate value __b and places the rounded results in a doubleword vector. The
 results are unsigned, although the operands are signed.--*/
 uint8x8_t vqrshrun_n_s16 (int16x8_t __a, const int __b);
 uint16x4_t vqrshrun_n_s32 (int32x4_t __a, const int __b);
 uint32x2_t vqrshrun_n_s64 (int64x2_t __a, const int __b);
 /*--11. Vector narrowing saturating shift right by constant:
 vqshrn -> ri = sat(ai >> b);
 Results are truncated. Right shifts each element in a quadword vector of integers by
 the immediate value __b and places the results in a doubleword vector. The sticky QC
 flag is set if saturation occurs.--*/
 int8x8_t vqshrn_n_s16 (int16x8_t __a, const int __b);
 int16x4_t vqshrn_n_s32 (int32x4_t __a, const int __b);
 int32x2_t vqshrn_n_s64 (int64x2_t __a, const int __b);
 uint8x8_t vqshrn_n_u16 (uint16x8_t __a, const int __b);
 uint16x4_t vqshrn_n_u32 (uint32x4_t __a, const int __b);
 uint32x2_t vqshrn_n_u64 (uint64x2_t __a, const int __b);
 /*--12. Vector rounding narrowing shift right by constant:
 vrshrn -> ri = round(ai >> b);
 The results are rounded. Right shifts each element in a vector by the immediate
 value __b and places the rounded, narrowed results in the destination vector.--*/
 int8x8_t vrshrn_n_s16 (int16x8_t __a, const int __b);
 int16x4_t vrshrn_n_s32 (int32x4_t __a, const int __b);
 int32x2_t vrshrn_n_s64 (int64x2_t __a, const int __b);
 uint8x8_t vrshrn_n_u16 (uint16x8_t __a, const int __b);
 uint16x4_t vrshrn_n_u32 (uint32x4_t __a, const int __b);
 uint32x2_t vrshrn_n_u64 (uint64x2_t __a, const int __b);
 /*--13. Vector rounding narrowing saturating shift right by constant:
 vqrshrn -> ri = sat(round(ai >> b));
 Results are rounded. Right shifts each element in a quadword vector of integers by
 the immediate value __b and places the rounded, narrowed results in a doubleword
 vector. The sticky QC flag is set if saturation occurs.--*/
 int8x8_t vqrshrn_n_s16 (int16x8_t __a, const int __b);
 int16x4_t vqrshrn_n_s32 (int32x4_t __a, const int __b);
 int32x2_t vqrshrn_n_s64 (int64x2_t __a, const int __b);
 uint8x8_t vqrshrn_n_u16 (uint16x8_t __a, const int __b);
 uint16x4_t vqrshrn_n_u32 (uint32x4_t __a, const int __b);
 uint32x2_t vqrshrn_n_u64 (uint64x2_t __a, const int __b);
 /*--14. Vector widening shift left by constant (long instruction):
 vshll -> ri = ai << b;
 Left shifts each element in a vector of integers by the immediate value __b and
 places the results in the destination vector, whose elements are twice as wide as
 the input elements. Bits shifted out of the left of each element are lost and
 values are sign extended or zero extended.--*/
 int16x8_t vshll_n_s8 (int8x8_t __a, const int __b);
 int32x4_t vshll_n_s16 (int16x4_t __a, const int __b);
 int64x2_t vshll_n_s32 (int32x2_t __a, const int __b);
 uint16x8_t vshll_n_u8 (uint8x8_t __a, const int __b);
 uint32x4_t vshll_n_u16 (uint16x4_t __a, const int __b);
 uint64x2_t vshll_n_u32 (uint32x2_t __a, const int __b);
 /********************************************Shifts with insert*************************/
 /*--1. Vector shift right and insert: vsri.
 Right shifts each element in the second input vector (__b) by the immediate value
 __c and inserts the results in the destination vector. The highest n bits of each
 destination element are not affected, where n is the shift amount (for example, a
 shift by 2 leaves the two most significant bits unchanged). Bits shifted out of the
 right of each element are lost. The first input vector (__a) holds the elements of
 the destination vector before the operation is performed.--*/
 int8x8_t vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c);
 int16x4_t vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c);
 int32x2_t vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c);
 int64x1_t vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c);
 uint8x8_t vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c);
 uint16x4_t vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c);
 uint32x2_t vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c);
 uint64x1_t vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c);
 poly8x8_t vsri_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c);
 poly16x4_t vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c);
 int8x16_t vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c);
 int16x8_t vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c);
 int32x4_t vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c);
 int64x2_t vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c);
 uint8x16_t vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c);
 uint16x8_t vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c);
 uint32x4_t vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c);
 uint64x2_t vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c);
 poly8x16_t vsriq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c);
 poly16x8_t vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c);
 /*--2. Vector shift left and insert: vsli.
 Left shifts each element in the second input vector (__b) by the immediate value
 __c and inserts the results in the destination vector. The lowest n bits of each
 destination element are not affected, where n is the shift amount (for example, a
 shift by 1 leaves the least significant bit unchanged). Bits shifted out of the
 left of each element are lost. The first input vector (__a) holds the elements of
 the destination vector before the operation is performed.--*/
 int8x8_t vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c);
 int16x4_t vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c);
 int32x2_t vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c);
 int64x1_t vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c);
 uint8x8_t vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c);
 uint16x4_t vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c);
 uint32x2_t vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c);
 uint64x1_t vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c);
 poly8x8_t vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c);
 poly16x4_t vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c);
 int8x16_t vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c);
 int16x8_t vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c);
 int32x4_t vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c);
 int64x2_t vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c);
 uint8x16_t vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c);
 uint16x8_t vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c);
 uint32x4_t vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c);
 uint64x2_t vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c);
 poly8x16_t vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c);
 poly16x8_t vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c);
 /*****************************************Absolute value********************************/
 /*--1. Absolute (normal instruction): vabs -> ri = |ai|;
 returns the absolute value of each element in a vector.
 The trailing line comments appear to name the closest x86 SSE intrinsic.--*/
 int8x8_t vabs_s8 (int8x8_t __a);//_mm_abs_epi8
 int16x4_t vabs_s16 (int16x4_t __a);//_mm_abs_epi16
 int32x2_t vabs_s32 (int32x2_t __a);//_mm_abs_epi32
 float32x2_t vabs_f32 (float32x2_t __a);
 int8x16_t vabsq_s8 (int8x16_t __a);//_mm_abs_epi8
 int16x8_t vabsq_s16 (int16x8_t __a);//_mm_abs_epi16
 int32x4_t vabsq_s32 (int32x4_t __a);//_mm_abs_epi32
 float32x4_t vabsq_f32 (float32x4_t __a);
 /*--2. Saturating absolute (saturating instruction): vqabs -> ri = sat(|ai|);
 returns the absolute value of each element in a vector. If any of the results
 overflow, they are saturated and the sticky QC flag is set.--*/
 int8x8_t vqabs_s8 (int8x8_t __a);
 int16x4_t vqabs_s16 (int16x4_t __a);
 int32x2_t vqabs_s32 (int32x2_t __a);
 int8x16_t vqabsq_s8 (int8x16_t __a);
 int16x8_t vqabsq_s16 (int16x8_t __a);
 int32x4_t vqabsq_s32 (int32x4_t __a);
 /***************************************************Negation****************************/
 /*--1. Negate (normal instruction): vneg -> ri = -ai;
 negates each element in a vector.--*/
 int8x8_t vneg_s8 (int8x8_t __a);
 int16x4_t vneg_s16 (int16x4_t __a);
 int32x2_t vneg_s32 (int32x2_t __a);
 float32x2_t vneg_f32 (float32x2_t __a);
 int8x16_t vnegq_s8 (int8x16_t __a);
 int16x8_t vnegq_s16 (int16x8_t __a);
 int32x4_t vnegq_s32 (int32x4_t __a);
 float32x4_t vnegq_f32 (float32x4_t __a);
 /*--2. Saturating negate (saturating instruction): vqneg -> ri = sat(-ai);
 negates each element in a vector. If any of the results overflow,
 they are saturated and the sticky QC flag is set.--*/
 int8x8_t vqneg_s8 (int8x8_t __a);
 int16x4_t vqneg_s16 (int16x4_t __a);
 int32x2_t vqneg_s32 (int32x2_t __a);
 int8x16_t vqnegq_s8 (int8x16_t __a);
 int16x8_t vqnegq_s16 (int16x8_t __a);
 int32x4_t vqnegq_s32 (int32x4_t __a);
 /********************************************Logical operations*************************/
 /*--1. Bitwise not (normal instruction): vmvn -> ri = ~ai;
 performs a bitwise inversion of each element from the input vector.--*/
 int8x8_t vmvn_s8 (int8x8_t __a);
 int16x4_t vmvn_s16 (int16x4_t __a);
 int32x2_t vmvn_s32 (int32x2_t __a);
 uint8x8_t vmvn_u8 (uint8x8_t __a);
 uint16x4_t vmvn_u16 (uint16x4_t __a);
 uint32x2_t vmvn_u32 (uint32x2_t __a);
 poly8x8_t vmvn_p8 (poly8x8_t __a);
 int8x16_t vmvnq_s8 (int8x16_t __a);
 int16x8_t vmvnq_s16 (int16x8_t __a);
 int32x4_t vmvnq_s32 (int32x4_t __a);
 uint8x16_t vmvnq_u8 (uint8x16_t __a);
 uint16x8_t vmvnq_u16 (uint16x8_t __a);
 uint32x4_t vmvnq_u32 (uint32x4_t __a);
 poly8x16_t vmvnq_p8 (poly8x16_t __a);
 /*--2. Bitwise and (normal instruction): vand -> ri = ai & bi;
 performs a bitwise AND between corresponding elements of the input vectors.
 The trailing line comments appear to name the closest x86 SSE intrinsic.--*/
 int8x8_t vand_s8 (int8x8_t __a, int8x8_t __b);//_mm_and_si128
 int16x4_t vand_s16 (int16x4_t __a, int16x4_t __b);//_mm_and_si128
 int32x2_t vand_s32 (int32x2_t __a, int32x2_t __b);//_mm_and_si128
 uint8x8_t vand_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_and_si128
 uint16x4_t vand_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_and_si128
 uint32x2_t vand_u32 (uint32x2_t __a, uint32x2_t __b);//_mm_and_si128
 int64x1_t vand_s64 (int64x1_t __a, int64x1_t __b);//_mm_and_si128
 uint64x1_t vand_u64 (uint64x1_t __a, uint64x1_t __b);//_mm_and_si128
 int8x16_t vandq_s8 (int8x16_t __a, int8x16_t __b);//_mm_and_si128
 int16x8_t vandq_s16 (int16x8_t __a, int16x8_t __b);//_mm_and_si128
 int32x4_t vandq_s32 (int32x4_t __a, int32x4_t __b);//_mm_and_si128
 int64x2_t vandq_s64 (int64x2_t __a, int64x2_t __b);//_mm_and_si128
 uint8x16_t vandq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_and_si128
 uint16x8_t vandq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_and_si128
 uint32x4_t vandq_u32 (uint32x4_t __a, uint32x4_t __b);//_mm_and_si128
 uint64x2_t vandq_u64 (uint64x2_t __a, uint64x2_t __b);//_mm_and_si128
 /*--3、Bitwise or(正常指令): vorr -> ri = ai | bi; performs a bitwise OR between 
 corresponding elements of the input vectors.--*/  
 int8x8_t vorr_s8 (int8x8_t __a, int8x8_t __b);//_mm_or_si128  
 int16x4_t vorr_s16 (int16x4_t __a, int16x4_t __b);//_mm_or_si128  
 int32x2_t vorr_s32 (int32x2_t __a, int32x2_t __b);//_mm_or_si128  
 uint8x8_t vorr_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_or_si128  
 uint16x4_t vorr_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_or_si128  
 uint32x2_t vorr_u32 (uint32x2_t __a, uint32x2_t __b);//_mm_or_si128  
 int64x1_t vorr_s64 (int64x1_t __a, int64x1_t __b);//_mm_or_si128  
 uint64x1_t vorr_u64 (uint64x1_t __a, uint64x1_t __b);//_mm_or_si128  
 int8x16_t vorrq_s8 (int8x16_t __a, int8x16_t __b);//_mm_or_si128  
 int16x8_t vorrq_s16 (int16x8_t __a, int16x8_t __b);//_mm_or_si128  
 int32x4_t vorrq_s32 (int32x4_t __a, int32x4_t __b);//_mm_or_si128  
 int64x2_t vorrq_s64 (int64x2_t __a, int64x2_t __b);//_mm_or_si128  
 uint8x16_t vorrq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_or_si128  
 uint16x8_t vorrq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_or_si128  
 uint32x4_t vorrq_u32 (uint32x4_t __a, uint32x4_t __b);//_mm_or_si128  
 uint64x2_t vorrq_u64 (uint64x2_t __a, uint64x2_t __b);//_mm_or_si128  
 /*--4、Bitwise exclusive or (EOR or XOR)(正常指令): veor -> ri = ai ^ bi;  
 performs a bitwise exclusive-OR between corresponding elements of the input vectors.--*/  
 int8x8_t veor_s8 (int8x8_t __a, int8x8_t __b);//_mm_xor_si128  
 int16x4_t veor_s16 (int16x4_t __a, int16x4_t __b);//_mm_xor_si128  
 int32x2_t veor_s32 (int32x2_t __a, int32x2_t __b);//_mm_xor_si128  
 uint8x8_t veor_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_xor_si128  
 uint16x4_t veor_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_xor_si128  
 uint32x2_t veor_u32 (uint32x2_t __a, uint32x2_t __b);//_mm_xor_si128  
 int64x1_t veor_s64 (int64x1_t __a, int64x1_t __b);//_mm_xor_si128  
 uint64x1_t veor_u64 (uint64x1_t __a, uint64x1_t __b);//_mm_xor_si128  
 int8x16_t veorq_s8 (int8x16_t __a, int8x16_t __b);//_mm_xor_si128  
 int16x8_t veorq_s16 (int16x8_t __a, int16x8_t __b);//_mm_xor_si128  
 int32x4_t veorq_s32 (int32x4_t __a, int32x4_t __b);//_mm_xor_si128  
 int64x2_t veorq_s64 (int64x2_t __a, int64x2_t __b);//_mm_xor_si128  
 uint8x16_t veorq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_xor_si128  
 uint16x8_t veorq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_xor_si128  
 uint32x4_t veorq_u32 (uint32x4_t __a, uint32x4_t __b);//_mm_xor_si128  
 uint64x2_t veorq_u64 (uint64x2_t __a, uint64x2_t __b);//_mm_xor_si128  
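 /*--5、Bit Clear(正常指令): vbic -> ri = ai & (~bi); 
 VBIC (Vector Bitwise Clear) performs a bitwise AND of the first operand with the 
 complement of the second operand, and places the results in the destination register. 
 Note: _mm_andnot_si128(x, y) computes (~x) & y, so the SSE equivalent of vbic(a, b) 
 takes the operands in swapped order: _mm_andnot_si128(b, a).--*/  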
 int8x8_t vbic_s8 (int8x8_t __a, int8x8_t __b);//_mm_andnot_si128  
 int16x4_t vbic_s16 (int16x4_t __a, int16x4_t __b);//_mm_andnot_si128  
 int32x2_t vbic_s32 (int32x2_t __a, int32x2_t __b);//_mm_andnot_si128  
 uint8x8_t vbic_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_andnot_si128  
 uint16x4_t vbic_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_andnot_si128  
 uint32x2_t vbic_u32 (uint32x2_t __a, uint32x2_t __b);//_mm_andnot_si128  
 int64x1_t vbic_s64 (int64x1_t __a, int64x1_t __b);//_mm_andnot_si128  
 uint64x1_t vbic_u64 (uint64x1_t __a, uint64x1_t __b);//_mm_andnot_si128  
 int8x16_t vbicq_s8 (int8x16_t __a, int8x16_t __b);//_mm_andnot_si128  
 int16x8_t vbicq_s16 (int16x8_t __a, int16x8_t __b);//_mm_andnot_si128  
 int32x4_t vbicq_s32 (int32x4_t __a, int32x4_t __b);//_mm_andnot_si128  
 int64x2_t vbicq_s64 (int64x2_t __a, int64x2_t __b);//_mm_andnot_si128  
 uint8x16_t vbicq_u8 (uint8x16_t __a, uint8x16_t __b);//_mm_andnot_si128  
 uint16x8_t vbicq_u16 (uint16x8_t __a, uint16x8_t __b);//_mm_andnot_si128  
 uint32x4_t vbicq_u32 (uint32x4_t __a, uint32x4_t __b);//_mm_andnot_si128  
 uint64x2_t vbicq_u64 (uint64x2_t __a, uint64x2_t __b);//_mm_andnot_si128  
 /*--6、Bitwise OR complement(正常指令): vorn -> ri = ai | (~bi);  
 performs a bitwise logical OR NOT operation  
 between values in two registers, and places the results in the destination register.--*/  
 int8x8_t vorn_s8 (int8x8_t __a, int8x8_t __b);  
 int16x4_t vorn_s16 (int16x4_t __a, int16x4_t __b);  
 int32x2_t vorn_s32 (int32x2_t __a, int32x2_t __b);  
 uint8x8_t vorn_u8 (uint8x8_t __a, uint8x8_t __b);  
 uint16x4_t vorn_u16 (uint16x4_t __a, uint16x4_t __b);  
 uint32x2_t vorn_u32 (uint32x2_t __a, uint32x2_t __b);  
 int64x1_t vorn_s64 (int64x1_t __a, int64x1_t __b);  
 uint64x1_t vorn_u64 (uint64x1_t __a, uint64x1_t __b);  
 int8x16_t vornq_s8 (int8x16_t __a, int8x16_t __b);  
 int16x8_t vornq_s16 (int16x8_t __a, int16x8_t __b);  
 int32x4_t vornq_s32 (int32x4_t __a, int32x4_t __b);  
 int64x2_t vornq_s64 (int64x2_t __a, int64x2_t __b);  
 uint8x16_t vornq_u8 (uint8x16_t __a, uint8x16_t __b);  
 uint16x8_t vornq_u16 (uint16x8_t __a, uint16x8_t __b);  
 uint32x4_t vornq_u32 (uint32x4_t __a, uint32x4_t __b);  
 uint64x2_t vornq_u64 (uint64x2_t __a, uint64x2_t __b);  
 /****************************************Count leading sign bits************************/  
 /*--正常指令, vcls -> ; counts the number of consecutive bits following the most  
 significant bit that are the same as the most significant bit, in each element in a  
 vector, and places the count in the result vector (the sign bit itself is not counted, 
 e.g. an 8-bit element equal to 0 yields 7).--*/  
 int8x8_t vcls_s8 (int8x8_t __a);  
 int16x4_t vcls_s16 (int16x4_t __a);  
 int32x2_t vcls_s32 (int32x2_t __a);  
 int8x16_t vclsq_s8 (int8x16_t __a);  
 int16x8_t vclsq_s16 (int16x8_t __a);  
 int32x4_t vclsq_s32 (int32x4_t __a);  
 /*******************************************Count leading zeros*************************/  
 /*--正常指令, vclz -> ; counts the number of consecutive zeros, starting from the most 
 significant bit, in each element in a vector, and places the count in result vector.--*/  
 int8x8_t vclz_s8 (int8x8_t __a);  
 int16x4_t vclz_s16 (int16x4_t __a);  
 int32x2_t vclz_s32 (int32x2_t __a);  
 uint8x8_t vclz_u8 (uint8x8_t __a);  
 uint16x4_t vclz_u16 (uint16x4_t __a);  
 uint32x2_t vclz_u32 (uint32x2_t __a);  
 int8x16_t vclzq_s8 (int8x16_t __a);  
 int16x8_t vclzq_s16 (int16x8_t __a);  
 int32x4_t vclzq_s32 (int32x4_t __a);  
 uint8x16_t vclzq_u8 (uint8x16_t __a);  
 uint16x8_t vclzq_u16 (uint16x8_t __a);  
 uint32x4_t vclzq_u32 (uint32x4_t __a);  
 /*******************************************Count number of set bits********************/  
 /*--正常指令, vcnt -> counts the number of bits that are one in each element in a vector,  
 and places the count in the result vector.--*/  
 int8x8_t vcnt_s8 (int8x8_t __a);  
 uint8x8_t vcnt_u8 (uint8x8_t __a);  
 poly8x8_t vcnt_p8 (poly8x8_t __a);  
 int8x16_t vcntq_s8 (int8x16_t __a);  
 uint8x16_t vcntq_u8 (uint8x16_t __a);  
 poly8x16_t vcntq_p8 (poly8x16_t __a);  
 /*****************************************Reciprocal estimate***************************/  
 /*--正常指令, vrecpe -> ; finds an approximate reciprocal of each element in a vector,  
 and places it in the result vector.--*/  
 float32x2_t vrecpe_f32 (float32x2_t __a);  
 uint32x2_t vrecpe_u32 (uint32x2_t __a);  
 float32x4_t vrecpeq_f32 (float32x4_t __a);  
 uint32x4_t vrecpeq_u32 (uint32x4_t __a);  
 /****************************************Reciprocal square-root estimate****************/  
 /*--正常指令, vrsqrte -> ; finds an approximate reciprocal square root of each element 
 in a vector, and places it in the return vector.--*/  
 float32x2_t vrsqrte_f32 (float32x2_t __a);  
 uint32x2_t vrsqrte_u32 (uint32x2_t __a);  
 float32x4_t vrsqrteq_f32 (float32x4_t __a);  
 uint32x4_t vrsqrteq_u32 (uint32x4_t __a);  
 /*******************************************Get lanes from a vector*********************/  
 /*--vmov -> r = a[b]; returns the value from the specified lane of a vector. 
 Extract lanes from a vector and put into a register.  
 These intrinsics extract a single lane (element) from a vector.--*/  
 int8_t vget_lane_s8 (int8x8_t __a, const int __b);//_mm_extract_epi8  
 int16_t vget_lane_s16 (int16x4_t __a, const int __b);//_mm_extract_epi16  
 int32_t vget_lane_s32 (int32x2_t __a, const int __b);//_mm_extract_epi32  
 float32_t vget_lane_f32 (float32x2_t __a, const int __b);  
 uint8_t vget_lane_u8 (uint8x8_t __a, const int __b);//_mm_extract_epi8  
 uint16_t vget_lane_u16 (uint16x4_t __a, const int __b);//_mm_extract_epi16  
 uint32_t vget_lane_u32 (uint32x2_t __a, const int __b);//_mm_extract_epi32  
 poly8_t vget_lane_p8 (poly8x8_t __a, const int __b);//_mm_extract_epi8  
 poly16_t vget_lane_p16 (poly16x4_t __a, const int __b);//_mm_extract_epi16  
 int64_t vget_lane_s64 (int64x1_t __a, const int __b);//_mm_extract_epi64  
 uint64_t vget_lane_u64 (uint64x1_t __a, const int __b);//_mm_extract_epi64  
 int8_t vgetq_lane_s8 (int8x16_t __a, const int __b);//_mm_extract_epi8  
 int16_t vgetq_lane_s16 (int16x8_t __a, const int __b);//_mm_extract_epi16  
 int32_t vgetq_lane_s32 (int32x4_t __a, const int __b);//_mm_extract_epi32  
 float32_t vgetq_lane_f32 (float32x4_t __a, const int __b);  
 uint8_t vgetq_lane_u8 (uint8x16_t __a, const int __b);//_mm_extract_epi8  
 uint16_t vgetq_lane_u16 (uint16x8_t __a, const int __b);//_mm_extract_epi16  
 uint32_t vgetq_lane_u32 (uint32x4_t __a, const int __b);//_mm_extract_epi32  
 poly8_t vgetq_lane_p8 (poly8x16_t __a, const int __b);//_mm_extract_epi8  
 poly16_t vgetq_lane_p16 (poly16x8_t __a, const int __b);//_mm_extract_epi16  
 int64_t vgetq_lane_s64 (int64x2_t __a, const int __b);//_mm_extract_epi64  
 uint64_t vgetq_lane_u64 (uint64x2_t __a, const int __b);//_mm_extract_epi64  
 /*********************************************Set lanes in a vector********************/  
 /*--vmov -> ; sets the value of the specified lane of a vector. It returns the vector  
 with the new value. Load a single lane of a vector from a literal. These intrinsics set  
 a single lane (element) within a vector.--*/  
 int8x8_t vset_lane_s8 (int8_t __a, int8x8_t __b, const int __c);  
 int16x4_t vset_lane_s16 (int16_t __a, int16x4_t __b, const int __c);  
 int32x2_t vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c);  
 float32x2_t vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c);  
 uint8x8_t vset_lane_u8 (uint8_t __a, uint8x8_t __b, const int __c);  
 uint16x4_t vset_lane_u16 (uint16_t __a, uint16x4_t __b, const int __c);  
 uint32x2_t vset_lane_u32 (uint32_t __a, uint32x2_t __b, const int __c);  
 poly8x8_t vset_lane_p8 (poly8_t __a, poly8x8_t __b, const int __c);  
 poly16x4_t vset_lane_p16 (poly16_t __a, poly16x4_t __b, const int __c);  
 int64x1_t vset_lane_s64 (int64_t __a, int64x1_t __b, const int __c);  
 uint64x1_t vset_lane_u64 (uint64_t __a, uint64x1_t __b, const int __c);  
 int8x16_t vsetq_lane_s8 (int8_t __a, int8x16_t __b, const int __c);  
 int16x8_t vsetq_lane_s16 (int16_t __a, int16x8_t __b, const int __c);  
 int32x4_t vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c);  
 float32x4_t vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c);  
 uint8x16_t vsetq_lane_u8 (uint8_t __a, uint8x16_t __b, const int __c);  
 uint16x8_t vsetq_lane_u16 (uint16_t __a, uint16x8_t __b, const int __c);  
 uint32x4_t vsetq_lane_u32 (uint32_t __a, uint32x4_t __b, const int __c);  
 poly8x16_t vsetq_lane_p8 (poly8_t __a, poly8x16_t __b, const int __c);  
 poly16x8_t vsetq_lane_p16 (poly16_t __a, poly16x8_t __b, const int __c);  
 int64x2_t vsetq_lane_s64 (int64_t __a, int64x2_t __b, const int __c);  
 uint64x2_t vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c);  
 /****************************************Create vector from literal bit pattern*********/  
 /*--vmov -> ; creates a vector from a 64-bit pattern.  
 Initialize a vector from a literal bit pattern.--*/  
 int8x8_t vcreate_s8 (uint64_t __a);//_mm_loadl_epi64  
 int16x4_t vcreate_s16 (uint64_t __a);//_mm_loadl_epi64  
 int32x2_t vcreate_s32 (uint64_t __a);//_mm_loadl_epi64  
 int64x1_t vcreate_s64 (uint64_t __a);//_mm_loadl_epi64  
 float32x2_t vcreate_f32 (uint64_t __a);  
 uint8x8_t vcreate_u8 (uint64_t __a);//_mm_loadl_epi64  
 uint16x4_t vcreate_u16 (uint64_t __a);//_mm_loadl_epi64  
 uint32x2_t vcreate_u32 (uint64_t __a);//_mm_loadl_epi64  
 uint64x1_t vcreate_u64 (uint64_t __a);//_mm_loadl_epi64  
 poly8x8_t vcreate_p8 (uint64_t __a);//_mm_loadl_epi64  
 poly16x4_t vcreate_p16 (uint64_t __a);//_mm_loadl_epi64  
 /*****************************************Set all lanes to the same value***************/  
 /*--1、Load all lanes of vector to the same literal value: vdup/vmov -> ri = a;  
 duplicates a scalar into every element of the destination vector.  
 Load all lanes of vector to the same literal value--*/  
 int8x8_t vdup_n_s8 (int8_t __a);//_mm_set1_epi8  
 int16x4_t vdup_n_s16 (int16_t __a);//_mm_set1_epi16  
 int32x2_t vdup_n_s32 (int32_t __a);//_mm_set1_epi32  
 float32x2_t vdup_n_f32 (float32_t __a);//_mm_set1_ps  
 uint8x8_t vdup_n_u8 (uint8_t __a);//_mm_set1_epi8  
 uint16x4_t vdup_n_u16 (uint16_t __a);//_mm_set1_epi16  
 uint32x2_t vdup_n_u32 (uint32_t __a);//_mm_set1_epi32  
 poly8x8_t vdup_n_p8 (poly8_t __a);//_mm_set1_epi8  
 poly16x4_t vdup_n_p16 (poly16_t __a);//_mm_set1_epi16  
 int64x1_t vdup_n_s64 (int64_t __a);  
 uint64x1_t vdup_n_u64 (uint64_t __a);  
 int8x16_t vdupq_n_s8 (int8_t __a);//_mm_set1_epi8  
 int16x8_t vdupq_n_s16 (int16_t __a);//_mm_set1_epi16  
 int32x4_t vdupq_n_s32 (int32_t __a);//_mm_set1_epi32  
 float32x4_t vdupq_n_f32 (float32_t __a);//_mm_set1_ps  
 uint8x16_t vdupq_n_u8 (uint8_t __a);//_mm_set1_epi8  
 uint16x8_t vdupq_n_u16 (uint16_t __a);//_mm_set1_epi16  
 uint32x4_t vdupq_n_u32 (uint32_t __a);//_mm_set1_epi32  
 poly8x16_t vdupq_n_p8 (poly8_t __a);//_mm_set1_epi8  
 poly16x8_t vdupq_n_p16 (poly16_t __a);//_mm_set1_epi16  
 int64x2_t vdupq_n_s64 (int64_t __a);  
 uint64x2_t vdupq_n_u64 (uint64_t __a);  
 int8x8_t vmov_n_s8 (int8_t __a);//_mm_set1_epi8  
 int16x4_t vmov_n_s16 (int16_t __a);//_mm_set1_epi16  
 int32x2_t vmov_n_s32 (int32_t __a);//_mm_set1_epi32  
 float32x2_t vmov_n_f32 (float32_t __a);//_mm_set1_ps  
 uint8x8_t vmov_n_u8 (uint8_t __a);//_mm_set1_epi8  
 uint16x4_t vmov_n_u16 (uint16_t __a);//_mm_set1_epi16  
 uint32x2_t vmov_n_u32 (uint32_t __a);//_mm_set1_epi32  
 poly8x8_t vmov_n_p8 (poly8_t __a);//_mm_set1_epi8  
 poly16x4_t vmov_n_p16 (poly16_t __a);//_mm_set1_epi16  
 int64x1_t vmov_n_s64 (int64_t __a);  
 uint64x1_t vmov_n_u64 (uint64_t __a);  
 int8x16_t vmovq_n_s8 (int8_t __a);//_mm_set1_epi8  
 int16x8_t vmovq_n_s16 (int16_t __a);//_mm_set1_epi16  
 int32x4_t vmovq_n_s32 (int32_t __a);//_mm_set1_epi32  
 float32x4_t vmovq_n_f32 (float32_t __a);//_mm_set1_ps  
 uint8x16_t vmovq_n_u8 (uint8_t __a);//_mm_set1_epi8  
 uint16x8_t vmovq_n_u16 (uint16_t __a);//_mm_set1_epi16  
 uint32x4_t vmovq_n_u32 (uint32_t __a);//_mm_set1_epi32  
 poly8x16_t vmovq_n_p8 (poly8_t __a);//_mm_set1_epi8  
 poly16x8_t vmovq_n_p16 (poly16_t __a);//_mm_set1_epi16  
 int64x2_t vmovq_n_s64 (int64_t __a);  
 uint64x2_t vmovq_n_u64 (uint64_t __a);  
 /*--2、Load all lanes of the vector to the value of a lane of a vector:  
 vdup/vmov -> ri = a[b]; 
 duplicates a scalar into every element of the destination vector.--*/  
 int8x8_t vdup_lane_s8 (int8x8_t __a, const int __b);  
 int16x4_t vdup_lane_s16 (int16x4_t __a, const int __b);  
 int32x2_t vdup_lane_s32 (int32x2_t __a, const int __b);  
 float32x2_t vdup_lane_f32 (float32x2_t __a, const int __b);  
 uint8x8_t vdup_lane_u8 (uint8x8_t __a, const int __b);  
 uint16x4_t vdup_lane_u16 (uint16x4_t __a, const int __b);  
 uint32x2_t vdup_lane_u32 (uint32x2_t __a, const int __b);  
 poly8x8_t vdup_lane_p8 (poly8x8_t __a, const int __b);  
 poly16x4_t vdup_lane_p16 (poly16x4_t __a, const int __b);  
 int64x1_t vdup_lane_s64 (int64x1_t __a, const int __b);  
 uint64x1_t vdup_lane_u64 (uint64x1_t __a, const int __b);  
 int8x16_t vdupq_lane_s8 (int8x8_t __a, const int __b);  
 int16x8_t vdupq_lane_s16 (int16x4_t __a, const int __b);  
 int32x4_t vdupq_lane_s32 (int32x2_t __a, const int __b);  
 float32x4_t vdupq_lane_f32 (float32x2_t __a, const int __b);  
 uint8x16_t vdupq_lane_u8 (uint8x8_t __a, const int __b);  
 uint16x8_t vdupq_lane_u16 (uint16x4_t __a, const int __b);  
 uint32x4_t vdupq_lane_u32 (uint32x2_t __a, const int __b);  
 poly8x16_t vdupq_lane_p8 (poly8x8_t __a, const int __b);  
 poly16x8_t vdupq_lane_p16 (poly16x4_t __a, const int __b);  
 int64x2_t vdupq_lane_s64 (int64x1_t __a, const int __b);//_mm_unpacklo_epi64  
 uint64x2_t vdupq_lane_u64 (uint64x1_t __a, const int __b);//_mm_unpacklo_epi64  
 /********************************************Combining vectors**************************/  
 /*--长指令, -> r0 = a0, ..., r7 = a7, r8 = b0, ..., r15 = b7; 
 joins two 64-bit vectors into a single 128-bit vector.  
 The output vector contains twice the number of elements as each input vector.  
 The lower half of the output vector contains the elements of the first input vector.--*/  
 int8x16_t vcombine_s8 (int8x8_t __a, int8x8_t __b);//_mm_unpacklo_epi64  
 int16x8_t vcombine_s16 (int16x4_t __a, int16x4_t __b);//_mm_unpacklo_epi64  
 int32x4_t vcombine_s32 (int32x2_t __a, int32x2_t __b);//_mm_unpacklo_epi64  
 int64x2_t vcombine_s64 (int64x1_t __a, int64x1_t __b);//_mm_unpacklo_epi64  
 float32x4_t vcombine_f32 (float32x2_t __a, float32x2_t __b);  
 uint8x16_t vcombine_u8 (uint8x8_t __a, uint8x8_t __b);//_mm_unpacklo_epi64  
 uint16x8_t vcombine_u16 (uint16x4_t __a, uint16x4_t __b);//_mm_unpacklo_epi64  
 uint32x4_t vcombine_u32 (uint32x2_t __a, uint32x2_t __b);//_mm_unpacklo_epi64  
 uint64x2_t vcombine_u64 (uint64x1_t __a, uint64x1_t __b);//_mm_unpacklo_epi64  
 poly8x16_t vcombine_p8 (poly8x8_t __a, poly8x8_t __b);//_mm_unpacklo_epi64  
 poly16x8_t vcombine_p16 (poly16x4_t __a, poly16x4_t __b);//_mm_unpacklo_epi64  
 /***************************************Splitting vectors*******************************/  
 /*--1、窄指令, -> ri = a(i+4); returns the higher half of the 128-bit input vector. The 
 output is a 64-bit vector that has half the number of elements as the input vector.--*/  
 int8x8_t vget_high_s8 (int8x16_t __a);//_mm_unpackhi_epi64  
 int16x4_t vget_high_s16 (int16x8_t __a);//_mm_unpackhi_epi64  
 int32x2_t vget_high_s32 (int32x4_t __a);//_mm_unpackhi_epi64  
 int64x1_t vget_high_s64 (int64x2_t __a);//_mm_unpackhi_epi64  
 float32x2_t vget_high_f32 (float32x4_t __a);  
 uint8x8_t vget_high_u8 (uint8x16_t __a);//_mm_unpackhi_epi64  
 uint16x4_t vget_high_u16 (uint16x8_t __a);//_mm_unpackhi_epi64  
 uint32x2_t vget_high_u32 (uint32x4_t __a);//_mm_unpackhi_epi64  
 uint64x1_t vget_high_u64 (uint64x2_t __a);//_mm_unpackhi_epi64  
 poly8x8_t vget_high_p8 (poly8x16_t __a);//_mm_unpackhi_epi64  
 poly16x4_t vget_high_p16 (poly16x8_t __a);//_mm_unpackhi_epi64  
 /*--2、窄指令, -> ri = ai; returns the lower half of the 128-bit input vector. The 
 output is a 64-bit vector that has half the number of elements as the input vector.--*/  
 int8x8_t vget_low_s8 (int8x16_t __a);  
 int16x4_t vget_low_s16 (int16x8_t __a);  
 int32x2_t vget_low_s32 (int32x4_t __a);  
 float32x2_t vget_low_f32 (float32x4_t __a);  
 uint8x8_t vget_low_u8 (uint8x16_t __a);  
 uint16x4_t vget_low_u16 (uint16x8_t __a);  
 uint32x2_t vget_low_u32 (uint32x4_t __a);  
 poly8x8_t vget_low_p8 (poly8x16_t __a);  
 poly16x4_t vget_low_p16 (poly16x8_t __a);  
 int64x1_t vget_low_s64 (int64x2_t __a);  
 uint64x1_t vget_low_u64 (uint64x2_t __a);  
 /****************************************************Conversions************************/  
 /*--1、Convert from float: vcvt ->, convert from floating-point to integer.--*/  
 int32x2_t vcvt_s32_f32 (float32x2_t __a);  
 uint32x2_t vcvt_u32_f32 (float32x2_t __a);  
 int32x4_t vcvtq_s32_f32 (float32x4_t __a);  
 uint32x4_t vcvtq_u32_f32 (float32x4_t __a);  
 int32x2_t vcvt_n_s32_f32 (float32x2_t __a, const int __b);  
 uint32x2_t vcvt_n_u32_f32 (float32x2_t __a, const int __b);  
 int32x4_t vcvtq_n_s32_f32 (float32x4_t __a, const int __b);  
 uint32x4_t vcvtq_n_u32_f32 (float32x4_t __a, const int __b);  
 /*--2、Convert to float: vcvt ->, convert from integer to floating-point.--*/  
 float32x2_t vcvt_f32_s32 (int32x2_t __a);  
 float32x2_t vcvt_f32_u32 (uint32x2_t __a);  
 float32x4_t vcvtq_f32_s32 (int32x4_t __a);  
 float32x4_t vcvtq_f32_u32 (uint32x4_t __a);  
 float32x2_t vcvt_n_f32_s32 (int32x2_t __a, const int __b);  
 float32x2_t vcvt_n_f32_u32 (uint32x2_t __a, const int __b);  
 float32x4_t vcvtq_n_f32_s32 (int32x4_t __a, const int __b);  
 float32x4_t vcvtq_n_f32_u32 (uint32x4_t __a, const int __b);  
 /*--3、Convert between half-precision and single-precision floating-point numbers: vcvt ->--*/  
 float16x4_t vcvt_f16_f32(float32x4_t a);  
 float32x4_t vcvt_f32_f16(float16x4_t a);  
 /*************************************************Move**********************************/  
 /*--1、Vector narrow integer(窄指令): vmovn -> ri = low half of ai (e.g. bits [7:0] of  
 a 16-bit element); copies the least significant half of each element of a quadword  
 vector into the corresponding elements of a doubleword vector.--*/  
 int8x8_t vmovn_s16 (int16x8_t __a);  
 int16x4_t vmovn_s32 (int32x4_t __a);  
 int32x2_t vmovn_s64 (int64x2_t __a);  
 uint8x8_t vmovn_u16 (uint16x8_t __a);  
 uint16x4_t vmovn_u32 (uint32x4_t __a);  
 uint32x2_t vmovn_u64 (uint64x2_t __a);  
 /*--2、Vector long move(长指令): vmovl -> sign extends or zero extends each element 
 in a doubleword vector to twice its original length, 
 and places the results in a quadword vector.--*/  
 int16x8_t vmovl_s8 (int8x8_t __a);//_mm_cvtepi8_epi16  
 int32x4_t vmovl_s16 (int16x4_t __a);//_mm_cvtepi16_epi32  
 int64x2_t vmovl_s32 (int32x2_t __a);//_mm_cvtepi32_epi64  
 uint16x8_t vmovl_u8 (uint8x8_t __a);//_mm_cvtepu8_epi16  
 uint32x4_t vmovl_u16 (uint16x4_t __a);//_mm_cvtepu16_epi32  
 uint64x2_t vmovl_u32 (uint32x2_t __a);////_mm_cvtepu32_epi64  
 /*--3、Vector saturating narrow integer(窄指令): vqmovn -> copies each element of the 
 operand vector to the corresponding element of the destination vector.  
 The result element is half the width of  
 the operand element, and values are saturated to the result width. 
 The results are the same type as the operands.--*/  
 int8x8_t vqmovn_s16 (int16x8_t __a);//_mm_packs_epi16  
 int16x4_t vqmovn_s32 (int32x4_t __a);//_mm_packs_epi32  
 int32x2_t vqmovn_s64 (int64x2_t __a);  
 uint8x8_t vqmovn_u16 (uint16x8_t __a);  
 uint16x4_t vqmovn_u32 (uint32x4_t __a);  
 uint32x2_t vqmovn_u64 (uint64x2_t __a);  
 /*--4、Vector saturating narrow integer signed->unsigned(窄指令): copies each element of 
 the operand vector to the corresponding element of the destination vector. 
 The result element is half the width of the operand element, 
 and values are saturated to the result width. 
 The elements in the operand are signed and the elements in the result are unsigned.--*/  
 uint8x8_t vqmovun_s16 (int16x8_t __a);//_mm_packus_epi16  
 uint16x4_t vqmovun_s32 (int32x4_t __a);//_mm_packus_epi32  
 uint32x2_t vqmovun_s64 (int64x2_t __a);  
 /******************************************************Table lookup*********************/  
 /*--1、Table lookup: vtbl -> uses byte indexes in a control vector to look up byte  
 values in a table and generate a new vector. Indexes out of range return 0.  
 The table is in Vector1 and uses one(or two or three or four)D registers.--*/  
 int8x8_t vtbl1_s8 (int8x8_t __a, int8x8_t __b);  
 uint8x8_t vtbl1_u8 (uint8x8_t __a, uint8x8_t __b);  
 poly8x8_t vtbl1_p8 (poly8x8_t __a, uint8x8_t __b);  
 int8x8_t vtbl2_s8 (int8x8x2_t __a, int8x8_t __b);  
 uint8x8_t vtbl2_u8 (uint8x8x2_t __a, uint8x8_t __b);  
 poly8x8_t vtbl2_p8 (poly8x8x2_t __a, uint8x8_t __b);  
 int8x8_t vtbl3_s8 (int8x8x3_t __a, int8x8_t __b);  
 uint8x8_t vtbl3_u8 (uint8x8x3_t __a, uint8x8_t __b);  
 poly8x8_t vtbl3_p8 (poly8x8x3_t __a, uint8x8_t __b);  
 int8x8_t vtbl4_s8 (int8x8x4_t __a, int8x8_t __b);  
 uint8x8_t vtbl4_u8 (uint8x8x4_t __a, uint8x8_t __b);  
 poly8x8_t vtbl4_p8 (poly8x8x4_t __a, uint8x8_t __b);  
 /*--2、Extended table lookup: vtbx -> uses byte indexes in a control vector to look up 
 byte values in a table and generate a new vector. Indexes out of range leave the  
 destination element unchanged. The table is in Vector2 and uses one(or two or three or 
 four) D register. Vector1 contains the elements of the destination vector.--*/  
 int8x8_t vtbx1_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c);  
 uint8x8_t vtbx1_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c);  
 poly8x8_t vtbx1_p8 (poly8x8_t __a, poly8x8_t __b, uint8x8_t __c);  
 int8x8_t vtbx2_s8 (int8x8_t __a, int8x8x2_t __b, int8x8_t __c);  
 uint8x8_t vtbx2_u8 (uint8x8_t __a, uint8x8x2_t __b, uint8x8_t __c);  
 poly8x8_t vtbx2_p8 (poly8x8_t __a, poly8x8x2_t __b, uint8x8_t __c);  
 int8x8_t vtbx3_s8 (int8x8_t __a, int8x8x3_t __b, int8x8_t __c);  
 uint8x8_t vtbx3_u8 (uint8x8_t __a, uint8x8x3_t __b, uint8x8_t __c);  
 poly8x8_t vtbx3_p8 (poly8x8_t __a, poly8x8x3_t __b, uint8x8_t __c);  
 int8x8_t vtbx4_s8 (int8x8_t __a, int8x8x4_t __b, int8x8_t __c);  
 uint8x8_t vtbx4_u8 (uint8x8_t __a, uint8x8x4_t __b, uint8x8_t __c);  
 poly8x8_t vtbx4_p8 (poly8x8_t __a, poly8x8x4_t __b, uint8x8_t __c);  
 /***************************************Multiply, scalar, lane**************************/  
 /*--1、Vector multiply by scalar: vmul -> ri = ai * b;  
 multiplies each element in a vector by a scalar,  
 and places the results in the destination vector.--*/  
 int16x4_t vmul_n_s16 (int16x4_t __a, int16_t __b);  
 int32x2_t vmul_n_s32 (int32x2_t __a, int32_t __b);  
 float32x2_t vmul_n_f32 (float32x2_t __a, float32_t __b);  
 uint16x4_t vmul_n_u16 (uint16x4_t __a, uint16_t __b);  
 uint32x2_t vmul_n_u32 (uint32x2_t __a, uint32_t __b);  
 int16x8_t vmulq_n_s16 (int16x8_t __a, int16_t __b);  
 int32x4_t vmulq_n_s32 (int32x4_t __a, int32_t __b);  
 float32x4_t vmulq_n_f32 (float32x4_t __a, float32_t __b);  
 uint16x8_t vmulq_n_u16 (uint16x8_t __a, uint16_t __b);  
 uint32x4_t vmulq_n_u32 (uint32x4_t __a, uint32_t __b);  
 /*--2、Vector multiply by scalar: -> ri = ai * b[c];  
 multiplies the first vector by a scalar.  
 The scalar is the element in the second vector with index c.--*/  
 int16x4_t vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c);  
 int32x2_t vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c);  
 float32x2_t vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __c);  
 uint16x4_t vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c);  
 uint32x2_t vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c);  
 int16x8_t vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c);  
 int32x4_t vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c);  
 float32x4_t vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __c);  
 uint16x8_t vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c);  
 uint32x4_t vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c);  
 /*--3、Vector long multiply with scalar: vmull ->  ri = ai * b; 
 multiplies a vector by a scalar.  
 Elements in the result are wider than elements in input vector.--*/  
 int32x4_t vmull_n_s16 (int16x4_t __a, int16_t __b);  
 int64x2_t vmull_n_s32 (int32x2_t __a, int32_t __b);  
 uint32x4_t vmull_n_u16 (uint16x4_t __a, uint16_t __b);  
 uint64x2_t vmull_n_u32 (uint32x2_t __a, uint32_t __b);  
 /*--4、Vector long multiply by scalar: vmull -> ri = ai * b[c]; 
 multiplies the first vector by a scalar.  
 The scalar is the element in the second vector with index c.  
 The elements in the result are wider than the elements in input vector.--*/  
 int32x4_t vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c);  
 int64x2_t vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c);  
 uint32x4_t vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c);  
 uint64x2_t vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c);  
 /*--5、Vector saturating doubling long multiply with scalar: vqdmull -> ri = sat(2 * ai * b); 
 multiplies the elements in the vector by a scalar, and doubles the results.  
 If any of the results overflow, they are saturated and the sticky QC flag is set.--*/  
 int32x4_t vqdmull_n_s16 (int16x4_t __a, int16_t __b);  
 int64x2_t vqdmull_n_s32 (int32x2_t __a, int32_t __b);  
 /*--6、Vector saturating doubling long multiply by scalar: vqdmull -> ri = sat(2 * ai * b[c]); 
 multiplies the elements in the first vector by a scalar, and doubles the results.  
 The scalar has index c in the second vector. If any of the results overflow,  
 they are saturated and the sticky QC flag is set.--*/  
 int32x4_t vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c);  
 int64x2_t vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c);  
 /*--7、Vector saturating doubling multiply high with scalar:  
 vqdmulh -> ri = high half of sat(2 * ai * b); 
 multiplies the elements of the vector by a scalar, and doubles the results. 
 It then returns only the high half of the results. 
 If any of the results overflow, they are saturated and the sticky QC flag is set.--*/  
 int16x4_t vqdmulh_n_s16 (int16x4_t __a, int16_t __b);  
 int32x2_t vqdmulh_n_s32 (int32x2_t __a, int32_t __b);  
 int16x8_t vqdmulhq_n_s16 (int16x8_t __a, int16_t __b);  
 int32x4_t vqdmulhq_n_s32 (int32x4_t __a, int32_t __b);  
 /*--8、Vector saturating doubling multiply high by scalar:  
 vqdmulh -> ri = high half of sat(2 * ai * b[c]); 
 multiplies the elements of the first vector by a scalar, and doubles the results. It then 
 returns only the high half of the results. The scalar has index c in the second vector. 
 If any of the results overflow, they are saturated and the sticky QC flag is set.--*/  
 int16x4_t vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c);  
 int32x2_t vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c);  
 int16x8_t vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c);  
 int32x4_t vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c);  
 /*--9、Vector saturating rounding doubling multiply high with scalar:  
 vqrdmulh -> ri = rounded high half of sat(2 * ai * b); 
 multiplies the elements of the vector by a scalar and doubles the results.  
 It then returns only the high half of the rounded results.  
 If any of the results overflow, they are saturated and the sticky QC flag is set.--*/  
 int16x4_t vqrdmulh_n_s16 (int16x4_t __a, int16_t __b);  
 int32x2_t vqrdmulh_n_s32 (int32x2_t __a, int32_t __b);  
 int16x8_t vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b);  
 int32x4_t vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b);  
 /*--10、Vector rounding saturating doubling multiply high by scalar:  
 vqrdmulh -> ri = rounded high half of sat(2 * ai * b[c]); 
 multiplies the elements of the first vector by a scalar and doubles the results. 
 It then returns only the high half of the rounded results. 
 The scalar has index c in the second vector. If any of the results overflow,  
 they are saturated and the sticky QC flag is set.--*/  
 int16x4_t vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c);  
 int32x2_t vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c);  
 int16x8_t vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c);  
 int32x4_t vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c);  
 /*--11、Vector multiply accumulate with scalar: vmla -> ri = ai + bi * c; 
 multiplies each element in the second vector by a scalar,  
 and adds the results to the corresponding elements of the first vector.--*/  
 int16x4_t vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c);  
 int32x2_t vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c);  
 float32x2_t vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c);  
 uint16x4_t vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c);  
 uint32x2_t vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c);  
 int16x8_t vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c);  
 int32x4_t vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c);  
 float32x4_t vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c);  
 uint16x8_t vmlaq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c);  
 uint32x4_t vmlaq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c);  
 /*--12、Vector multiply accumulate by scalar: vmla -> ri = ai + bi * c[d]; 
 multiplies each element in the second vector by a scalar,  
 and adds the results to the corresponding elements of the first vector.  
 The scalar has index d in the third vector.--*/  
 int16x4_t vmla_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d);  
 int32x2_t vmla_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d);  
 float32x2_t vmla_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c,  
     const int __d);  
 uint16x4_t vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d);  
 uint32x2_t vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d);  
 int16x8_t vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d);  
 int32x4_t vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d);  
 float32x4_t vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c,  
     const int __d);  
 uint16x8_t vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d);  
 uint32x4_t vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d);  
 /*--13、Vector widening multiply accumulate with scalar: vmlal -> ri = ai + bi * c; 
 multiplies each element in the second vector by a scalar, and adds the results into the  
 corresponding elements of the first vector.  
 The scalar is passed directly as the third argument. The elements in the result are wider.--*/  
 int32x4_t vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c);  
 int64x2_t vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c);  
 uint32x4_t vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c);  
 uint64x2_t vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c);  
 /*--14、Vector widening multiply accumulate by scalar: vmlal -> ri = ai + bi * c[d]; 
 multiplies each element in the second vector by a scalar, and adds the results to the  
 corresponding elements of the first vector. The scalar has index d in the third vector. 
 The elements in the result are wider.--*/  
 int32x4_t vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d);  
 int64x2_t vmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d);  
 uint32x4_t vmlal_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d);  
 uint64x2_t vmlal_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d);  
 /*--15、Vector widening saturating doubling multiply accumulate with scalar:  
 vqdmlal -> ri = sat(ai + bi * c); 
 multiplies the elements in the second vector by a scalar, and doubles the results.  
 It then adds the results to the elements in the first vector. 
 If any of the results overflow, they are saturated and the sticky QC flag is set.--*/  
 int32x4_t vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c);  
 int64x2_t vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c);  
 /*--16、Vector widening saturating doubling multiply accumulate by scalar:  
 vqdmlal -> ri = sat(ai + bi * c[d]) 
 multiplies each element in the second vector by a scalar, doubles the results and adds  
 them to the corresponding elements of the first vector. The scalar has index d in the  
 third vector. If any of the results overflow, 
 they are saturated and the sticky QC flag is set.--*/  
 int32x4_t vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d);  
 int64x2_t vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d);  
 /*--17、Vector multiply subtract with scalar: vmls -> ri = ai - bi * c; 
 multiplies each element in a vector by a scalar, subtracts the results from the  
 corresponding elements of the destination vector,  
 and places the final results in the destination vector.--*/  
 int16x4_t vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c);  
 int32x2_t vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c);  
 float32x2_t vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c);  
 uint16x4_t vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c);  
 uint32x2_t vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c);  
 int16x8_t vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c);  
 int32x4_t vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c);  
 float32x4_t vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c);  
 uint16x8_t vmlsq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c);  
 uint32x4_t vmlsq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c);  
 /*--18、Vector multiply subtract by scalar: vmls -> ri = ai - bi * c[d]; 
 multiplies each element in the second vector by a scalar, and subtracts them from the 
 corresponding elements of the first vector. 
 The scalar has index d in the third vector.--*/  
 int16x4_t vmls_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d);  
 int32x2_t vmls_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d);  
 float32x2_t vmls_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c,  
     const int __d);  
 uint16x4_t vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d);  
 uint32x2_t vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d);  
 int16x8_t vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d);  
 int32x4_t vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d);  
 float32x4_t vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c,  
     const int __d);  
 uint16x8_t vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d);  
 uint32x4_t vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d);  
 /*--19、Vector widening multiply subtract with scalar: vmlsl -> ri = ai - bi * c; 
 multiplies the elements in the second vector by a scalar, then subtracts the results from 
 the elements in the first vector. The elements of the result are wider.--*/  
 int32x4_t vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c);  
 int64x2_t vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c);  
 uint32x4_t vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c);  
 uint64x2_t vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c);  
 /*--20、Vector widening multiply subtract by scalar: vmlsl -> ri = ai - bi * c[d]; 
 multiplies each element in the second vector by a scalar,  
 and subtracts them from the corresponding elements of the first vector.  
 The scalar has index d in the third vector. The elements in the result are wider.--*/  
 int32x4_t vmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d);  
 int64x2_t vmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d);  
 uint32x4_t vmlsl_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)  
 uint64x2_t vmlsl_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d);  
 /*--21、Vector widening saturating doubling multiply subtract with scalar:  
 vqdmlsl -> ri = sat(ai - bi * c); 
 multiplies the elements of the second vector with a scalar and doubles the results.  
 It then subtracts the results from the elements in the first vector. 
 If any of the results overflow, they are saturated and the sticky QC flag is set.--*/  
 int32x4_t vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c);  
 int64x2_t vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c);  
 /*--22、Vector widening saturating doubling multiply subtract by scalar: 
 vqdmlsl -> ri = sat(ai - bi * c[d]); 
 multiplies each element in the second vector by a scalar, doubles the results and subtracts 
 them from the corresponding elements of the first vector. The scalar has index d in the  
 third vector. If any of the results overflow,  
 they are saturated and the sticky QC flag is set.--*/  
 int32x4_t vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d);  
 int64x2_t vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d);  
 /*****************************************************Vector extract********************/  
 /*--Vector extract: vext -> extracts c elements from the lower end of the second operand 
 vector and the remaining elements from the higher end of the first, and combines them to 
 form the result vector. The elements from the second operand are placed in the most  
 significant part of the result vector. The elements from the first operand are placed in 
 the least significant part of the result vector. This intrinsic cycles the elements 
 through the lanes if the two input vectors are the same.--*/  
 int8x8_t vext_s8 (int8x8_t __a, int8x8_t __b, const int __c);  
 int16x4_t vext_s16 (int16x4_t __a, int16x4_t __b, const int __c);  
 int32x2_t vext_s32 (int32x2_t __a, int32x2_t __b, const int __c);  
 int64x1_t vext_s64 (int64x1_t __a, int64x1_t __b, const int __c);  
 float32x2_t vext_f32 (float32x2_t __a, float32x2_t __b, const int __c);  
 uint8x8_t vext_u8 (uint8x8_t __a, uint8x8_t __b, const int __c);  
 uint16x4_t vext_u16 (uint16x4_t __a, uint16x4_t __b, const int __c);  
 uint32x2_t vext_u32 (uint32x2_t __a, uint32x2_t __b, const int __c);  
 uint64x1_t vext_u64 (uint64x1_t __a, uint64x1_t __b, const int __c);  
 poly8x8_t vext_p8 (poly8x8_t __a, poly8x8_t __b, const int __c);  
 poly16x4_t vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c);  
 int8x16_t vextq_s8 (int8x16_t __a, int8x16_t __b, const int __c);//_mm_alignr_epi8   
 int16x8_t vextq_s16 (int16x8_t __a, int16x8_t __b, const int __c);//_mm_alignr_epi8   
 int32x4_t vextq_s32 (int32x4_t __a, int32x4_t __b, const int __c);//_mm_alignr_epi8  
 int64x2_t vextq_s64 (int64x2_t __a, int64x2_t __b, const int __c);//_mm_alignr_epi8  
 float32x4_t vextq_f32 (float32x4_t __a, float32x4_t __b, const int __c);//_mm_alignr_epi8  
 uint8x16_t vextq_u8 (uint8x16_t __a, uint8x16_t __b, const int __c);//_mm_alignr_epi8  
 uint16x8_t vextq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c);//_mm_alignr_epi8  
 uint32x4_t vextq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c);//_mm_alignr_epi8  
 uint64x2_t vextq_u64 (uint64x2_t __a, uint64x2_t __b, const int __c);//_mm_alignr_epi8  
 poly8x16_t vextq_p8 (poly8x16_t __a, poly8x16_t __b, const int __c);//_mm_alignr_epi8  
 poly16x8_t vextq_p16 (poly16x8_t __a, poly16x8_t __b, const int __c);//_mm_alignr_epi8  
 /****************************************************Reverse elements*******************/  
 /*--1、Reverse vector elements (swap endianness): vrev64 -> reverses the order of 8-bit,  
 16-bit, or 32-bit elements within each doubleword of the vector,  
 and places the result in the corresponding destination vector.--*/  
 int8x8_t vrev64_s8 (int8x8_t __a);  
 int16x4_t vrev64_s16 (int16x4_t __a);  
 int32x2_t vrev64_s32 (int32x2_t __a);  
 float32x2_t vrev64_f32 (float32x2_t __a);//_mm_shuffle_ps  
 uint8x8_t vrev64_u8 (uint8x8_t __a);  
 uint16x4_t vrev64_u16 (uint16x4_t __a);  
 uint32x2_t vrev64_u32 (uint32x2_t __a);  
 poly8x8_t vrev64_p8 (poly8x8_t __a);  
 poly16x4_t vrev64_p16 (poly16x4_t __a);  
 int8x16_t vrev64q_s8 (int8x16_t __a);  
 int16x8_t vrev64q_s16 (int16x8_t __a);  
 int32x4_t vrev64q_s32 (int32x4_t __a);  
 float32x4_t vrev64q_f32 (float32x4_t __a);//_mm_shuffle_ps  
 uint8x16_t vrev64q_u8 (uint8x16_t __a);  
 uint16x8_t vrev64q_u16 (uint16x8_t __a);  
 uint32x4_t vrev64q_u32 (uint32x4_t __a);  
 poly8x16_t vrev64q_p8 (poly8x16_t __a);  
 poly16x8_t vrev64q_p16 (poly16x8_t __a);  
 /*--2、Reverse vector elements (swap endianness): vrev32 -> reverses the order of 8-bit  
 or 16-bit elements within each word of the vector,  
 and places the result in the corresponding destination vector.--*/  
 int8x8_t vrev32_s8 (int8x8_t __a);  
 int16x4_t vrev32_s16 (int16x4_t __a);  
 uint8x8_t vrev32_u8 (uint8x8_t __a);  
 uint16x4_t vrev32_u16 (uint16x4_t __a);  
 poly8x8_t vrev32_p8 (poly8x8_t __a);  
 poly16x4_t vrev32_p16 (poly16x4_t __a);  
 int8x16_t vrev32q_s8 (int8x16_t __a);  
 int16x8_t vrev32q_s16 (int16x8_t __a);  
 

你可能感兴趣的:(ARM Neon Intrinsics各函数介绍)

UDP通信开发 Charary udp 网络
开发流程：UDP本身不考虑链接，不存在客户和服务器的概念，UDP开发只有三步：创建UDP的套接字socket(AF_INET,SOCK_DGRAM,0)绑定自己的属性bindUDP随意的发送和接收数据sendto/recvfromUDP接口函数：sendto()函数功能：UDP专用的发送函数函数原型：ssize_tsendto(intsockfd,//套接字constvoid*buf,//待发送的
AI 大模型：Intelligent Agent—— 开启智能新纪元 AI-入门人工智能学习产品经理面试 agi
在LLM语境下，Agent理解为在某种能自主理解、规划决策、执行复杂任务的智能体，LLM充当着智能体的“大脑”。从软件工程的角度，智能体是一种基于大语言模型的，具备规划思考能力、记忆能力、使用工具函数的能力，能自主完成给定任务的计算机程序。在基于LLM的智能体中，LLM的充当着智能体的“大脑”的角色，同时还有3个关键部分：规划（Planning）:智能体会把大型任务分解为子任务，并规划执行任务的流
Conda 常用命令全解析 melck conda
在Windows系统中，Conda是一款功能强大的包管理和环境管理工具，尤其对于数据分析、科学计算等场景有着重要的作用。本文将详细介绍Conda在Windows系统中的常用命令，帮助你高效地管理虚拟环境和软件包。一、环境管理命令1.1查看Conda版本conda--version该命令用于确认Conda是否成功安装以及查看其版本号。这对于确保Conda的兼容性和功能性非常重要。1.2创建新环境co
欧*雅WCS项目总结十五001 项目归档后端 java 程序人生
项目介绍使用系统APRISO下发任务与wcs交互，wcs包含与海康agv对接，以及APRISO不纳入管理的库位（包括线边库位、码头库位、暂存区库位、空栈板库位）。wcs的主要定位就是高度定制化贴合生产业务，可以说wcs成为了agv和APRISO之间的桥梁。APRISO下发任务时候，通过生成xml文件实现的，这时候wcs会监听该文件目录新建的xml文件来生成任务。刚开始部署后不到一周出现了监听失效问
JavaScript 闭包与作用域的深度解析小钟H呀 JS知识手册 javascript 开发语言 ecmascript
引言在JavaScript世界里，闭包和作用域是两个核心概念，理解它们对于编写高效、可维护的代码至关重要。本文将深入探讨JavaScript闭包与作用域的原理、应用及注意事项。一、作用域的概念（一）什么是作用域作用域是指变量和函数的可访问范围。在JavaScript中，主要有全局作用域和局部作用域。全局作用域：在代码的任何地方都可以访问到的变量和函数，通常在脚本的最外层或通过全局对象（如windo
如何快速定位并解决 Linux 系统性能瓶颈：终极全攻略 BitTalk 性能优化 linux 服务器 java
在现代IT环境中，Linux系统被广泛应用于服务器、嵌入式设备和超级计算机等各类场景。随着系统负载的增加，性能瓶颈不可避免地会影响系统的可靠性和效率。因此，了解如何有效地诊断和解决Linux系统中的性能问题至关重要。本篇博客将深入探讨Linux性能瓶颈的可能来源，介绍各种性能评估方法和概念，并最终提供使用Linux命令查找性能瓶颈的实用指南。性能瓶颈的可能来源在Linux系统中，性能瓶颈可能出现在
Python wifi 安装手机app yichengace python
目的当测试机数量越来越多时，测试包的安装会成为一个问题，用wifi安装来解决这个问题，并且用脚本语言来批量控制思路思路就是py调用pc端的adb命令，向手机发送请求，无线是因为，如果未来测试机越来越多，一台电脑的usb接口数量肯定不够准备工具python，adb，pycharm，测试用app，这里选择qq（https://qd.myapp.com/myapp/qqteam/AndroidQQ/mo
【人工智能时代】- AI 聚合平台 xiaoli8748_软件开发人工智能时代人工智能
最近听朋友介绍，国内有个团队开发了一个全功能的AI聚合平台，包含主流的GPT和绘画功能，以及一些其他的衍生功能，几乎应有尽有。于是，对AI很感兴趣的我，便也来瞧瞧这是个什么样的存在，以下便是我的真实使用感受。除此以外，作为一个程序员，我还使用了该平台提供的API接口，开发了一个简单的小程序。文章的末尾，我将提供免费的AI机器人，以及小程序体验地址，记得查收哦~官方网站：https://302.ai
在瑞芯微RK3588平台上使用RKNN部署YOLOv8Pose模型的C++实战指南机＿长 YOLO系列模型有效涨点改进深度学习落地实战 YOLO c++开发语言
在人工智能和计算机视觉领域，人体姿态估计是一项极具挑战性的任务，它对于理解人类行为、增强人机交互等方面具有重要意义。YOLOv8Pose作为YOLO系列中的新成员，以其高效和准确性在人体姿态估计任务中脱颖而出。本文将详细介绍如何在瑞芯微RK3588平台上，使用RKNN（RockchipNeuralNetworkToolkit）框架部署YOLOv8Pose模型，并进行C++代码的编译和运行。注本文全
计算机程序制作的小作品,义乌市中小学生电脑作品制作比赛201203 东南前哨计算机程序制作的小作品
《义乌市中小学生电脑作品制作比赛201203》由会员分享，可在线阅读，更多相关《义乌市中小学生电脑作品制作比赛201203(4页珍藏版)》请在人人文库网上搜索。1、浙江省义乌市教育研修院关于举办2012年义乌市中小学生电脑作品制作比赛暨首届青少年网络道德建设专题创作活动的通知各中小学：为进一步推进和加强中小学信息技术教育，普及信息技术知识，培养学生创新精神和实践能力，提高信息技术水平，根据上级文件
【HarmonyOS Next】鸿蒙监听手机按键 GeorgeGcs HarmonyOS 解决方案 OpenHarmony知识体系 harmonyos 华为 onKeyEvent 按键监听事件按下鸿蒙
【HarmonyOSNext】鸿蒙监听手机按键一、前言应用开发中我们会遇到监听用户实体按键，或者扩展按键的需求。亦或者是在某些场景下，禁止用户按下某些按键的业务需求。这两种需求，鸿蒙都提供了对应的监听事件进行处理。onKeyEvent默认的按钮监听事件onKeyPreIme这是优先级最高的监听回调，别上面多了一个return开关，用于告诉系统监听事件是否再向下传递。窗口是第一级接收按钮事件的实体。
麒麟v10安装mysql5.7（ARM架构） qqxinxi arm开发
下载路径：华为云镜像麒麟v10是潮流时代的新时髦的linux操作系统，但随着ARM架构流行，出现了一些卡点，不以为然，没当回事的大吃一惊。经常卡住。例如:在安装mysql5.7（ARM架构）最简单：使用rpmmysql-5.7.27.1.el7.aarch64.rpm文件比较小下载完之后rpm-ivhmysql-5.7.27.1.el7.aarch64.rpm比较简单常用的方法，再不能连接互联网时
YOLOv8 Pose使用RKNN进行推理い不靠譜︶朱Sir 实用项目部署 YOLO 人工智能 python linux pip
关注微信公众号：朱sir的小站，发送202411081即可免费获取源代码下载链接一、简单介绍YOLOv8-Pose是一种基于YOLOv8架构的姿态估计模型，能够识别图像中的关键点位置，这些关键点通常表示人体的关节、特征点或其他显著位置。该模型在COCO关键点数据集上训练，适合多种姿势估计任务。二、ONNX推理1.首先需要先将Pytorch模型转换为Onnx模型，下载pt模型这里给出官方的权重下载地
五大常考SQL面试题 Begin to change MySQL sql 面试
目录一、找出连续7天登陆，连续30天登陆的用户（小红书笔试，电信云面试），最大连续登陆天数的问题--窗口函数二、求连续点击三次的用户数，而且中间不能有别人的点击三、计算除去部门最高工资，和最低工资的平均工资（字节跳动面试）--窗口函数四、留存的计算，和累计求和的计算--窗口函数，自联结（pdd面试）一、找出连续7天登陆，连续30天登陆的用户（小红书笔试，电信云面试），最大连续登陆天数的问题--窗口
PyCharm 集成 DeepSeek：本地运行 or API 直连？打造你的 AI 编程神器！ AI云极【AI智能系列】pycharm 人工智能 ide deepseek
在AI赋能编程的时代，如何让AI辅助写代码，提升开发效率？DeepSeek作为一款开源、强大、免费的AI编程助手，结合PyCharm，能够大幅提升Python编程体验。今天，我们就来详细讲解如何在PyCharm中接入DeepSeek，无论你想使用本地部署的DeepSeek，还是官方API版本，都能轻松实现！为什么选择DeepSeek+PyCharm？DeepSeekR1采用6710亿参数的MoE（
Python3.5源码分析-sys模块及site模块导入小屋子大侠 python Python分析 python源码
Python3源码分析本文环境python3.5.2。参考书籍>python官网Python3的sys模块初始化根据分析完成builtins初始化后，继续分析sys模块的初始化，继续分析_Py_InitializeEx_Private函数的执行，void_Py_InitializeEx_Private(intinstall_sigs,intinstall_importlib){...sysmod=
HarmonyOS进程通信及原理拥有一颗学徒的心 HarmonyOS harmonyos 华为鸿蒙信息与通信分布式
大家好，我是学徒小z，最近在研究鸿蒙中一些偏底层原理的内容，今天分析进程通信给大家，请用餐文章目录进程间通信1.通过公共事件（@ohos.commonEventManager）公共事件的底层原理2.IPCKit能力LiteIPC的归属与特点1.所属内核2.核心思想3.公共事件子系统鸿蒙内核小知识进程间通信1.通过公共事件（@ohos.commonEventManager）公共事件的底层原理公共事件
深入了解 CDN：概念、原理、过程、作用及工作场景羊村懒哥网络网络加速缓存
目录一、CDN的概念二、CDN的工作原理三、CDN的工作过程四、CDN的作用五、CDN可结合使用的技术六、CDN能够解决的网络问题七、CDN的工作场景在当今互联网飞速发展的时代，用户对于网页加载速度和内容获取的时效性要求越来越高。CDN（ContentDeliveryNetwork，⭐内容分发网络）应运而生，它在提升网络性能和用户体验方面发挥着关键作用。本文将详细介绍CDN的概念、工作原理、工作过
OpenLayers总结3 Super毛毛穗 WebGIS开发 OpenLayers GIS WebGIS
一、静态测距1.原理静态测距主要是针对地图上已有的矢量要素（如线要素），利用OpenLayers提供的几何计算函数来获取其长度。在实际操作中，先加载包含几何要素的GeoJSON数据到矢量图层，当鼠标指针移动到要素上时，获取该要素的几何信息，再调用getLength函数计算其长度。2.代码实现步骤及注释//引入必要的模块importVectorLayerfrom"ol/layer/Vector.js
【CUDA】Pytorch_Extensions joker D888 深度学习 pytorch python cuda c++深度学习
【CUDA】Pytorch_Extensions为什么要开发CUDA扩展？当我们在PyTorch中实现自定义算子时，通常有两种选择：使用纯Python实现（简单但效率低）使用C++/CUDA扩展（高效但需要编译）对于计算密集型的操作（如神经网络中的自定义激活函数），使用CUDA扩展可以获得接近硬件极限的性能。本文将以实现一个多项式激活函数x²+x+1为例，展示完整的开发流程。完整CUDA扩展代码解
Labelbox：引领AI与人类协作的未来魏兴雄Milburn
Labelbox：引领AI与人类协作的未来labelbox-pythonLabelboxPythonClient项目地址:https://gitcode.com/gh_mirrors/la/labelbox-python项目介绍Labelbox是一款专为企业和学术研究社区设计的开源工具，旨在简化数据标注、生成高质量的人类反馈数据、评估和提升模型性能，并通过无缝结合AI与人类工作流程来自动化任务。无
探索HeidiSQL：一款强大的数据库管理工具夏庭彭Maxine
探索HeidiSQL：一款强大的数据库管理工具HeidiSQLHeidiSQL:是一个免费且强大的SQL编辑器和数据库管理工具，支持MySQL、PostgreSQL、SQLite等多种数据库。适合数据库管理员和开发者使用HeidiSQL管理数据库和查询数据。项目地址:https://gitcode.com/gh_mirrors/he/HeidiSQL项目介绍HeidiSQL是一款开源的图形化数据库
基于python使用scanpy分析单细胞转录组数据探序基因单细胞分析 python 开发语言
探序基因肿瘤研究院整理相关后缀的格式介绍：.h5ad：是一种用于存储单细胞数据的文件格式，可以通过anndata库在Python中处理.loom：高效的数据存储格式（.loom文件），使得用户可以轻松地存储、查询和分析大规模的单细胞数据集。Loompy的设计目标是提供一个快速、灵活且易于使用的工具，以支持生物信息学家和研究人员在单细胞水平上进行数据分析。python的单细胞转录组数据结构说明：da
【HarmonyOS NEXT】是否有监听键盘显隐的方法 Mayism123 harmonyos
关键字监听/键盘/输入法框架/窗口问题描述是否有监听键盘显隐的方法？解决方案可选择以下任一方案：方案一：通过输入法框架模块（@ohos.inputMethod）来监听软键盘状态。用InputMethodController实例的on('sendKeyboardStatus')方法来监听，直接在inputMethodController.on('sendKeyboardStatus',callbac
HarmonyOS全栈开发指南：从入门到精通，构建万物智联的未来生态（一）林钟雪 Harmonyos harmonyos 华为
一、HarmonyOS基础认知篇1.HarmonyOS发展历程与核心使命内容摘要：HarmonyOS，由华为公司于2019年首次公开发布，标志着华为在操作系统领域的深度布局。从最初的智能物联网设备操作系统定位，到如今面向万物智联时代的分布式全场景操作系统，HarmonyOS经历了多次迭代与升级。发展历程：初期探索：2019年，华为正式推出HarmonyOS，旨在打造一个适用于智能物联网设备的操作系
使用Python和OpenCV实现图像像素压缩与解压东方佑量子变法 python opencv 开发语言
在本文中，我们将探讨如何使用Python和OpenCV库来实现一种简单的图像像素压缩算法。我们将详细讨论代码的工作原理，并提供一个具体的示例来演示该过程。1.引言随着数字媒体的普及，图像处理成为了一个重要的领域。无论是为了减少存储空间还是加快网络传输速度，图像压缩技术都扮演着至关重要的角色。这里，我们提出了一种基于像素重复模式的简单压缩算法，它适用于具有大量连续相同像素值的图像。2.技术栈介绍2.
DeepSeek如何重塑我的编程学习：计算机新生的AI实践 EnigmaCoder DeepSeek 学习人工智能
目录前言邂逅DeepSeek：从困惑到惊喜初学编程的困境DeepSeek的优势️DeepSeek在编程学习中的运用注释算法逐步分析调试帮助跨语言迁移学习AI时代学习方法论革新知识获取方式转变新型学习能力培养反思与展望反思展望总结前言大家好！我是EnigmaCoder，本文我将介绍我的AI编程学习之旅。春节期间，DeepSeek横空出世，迅速登顶热榜。它功能强大，精准答疑、高效创作，瞬间点燃大众热情
安装与部署openeuler 的HA VX-IT BANG 服务器网络 linux
实现原理LinuxHA（HighAvailability，高可用性）是指利用Linux操作系统构建的高可用集群解决方案，旨在确保关键业务服务在面临硬件故障、软件错误、网络中断等各种异常情况时，依然能够持续、稳定地运行，尽量减少服务中断时间，提高系统的可靠性和可用性。以下从几个方面详细介绍：关键组件和技术心跳监测（Heartbeat）这是LinuxHA系统中最基础也是最重要的组件之一。它通过在节点之
如何将Docker容器打包并在其他服务器上运行 IT小辉同学技巧性工具栏分布式云部署搜索引擎 docker 服务器容器
如何将Docker容器打包并在其他服务器上运行我会幻想很多次我们的相遇，你穿着合身的T恤，一个素色的外套，搭配一条蓝色的牛仔裤，干净的像那天空中的云朵，而我，还是一个的傻傻的少年，我们相识而笑，默默不语，如此甚好！Docker容器使得应用程序的部署和管理变得更加简单和高效。有时，我们可能需要将一个运行中的Docker容器打包，并在其他服务器上运行。本文将详细介绍如何实现这一过程。1.提交容器为镜像
【自然语言处理|迁移学习-08】：中文语料完型填空爱学习不掉头发深度学习自然语言处理（NLP）自然语言处理迁移学习人工智能
文章目录1中文语料完型填空任务介绍2数据集加载及处理3定义下游任务模型4模型训练5.模型测试1中文语料完型填空任务介绍任务介绍：完成中文语料完型填空完型填空是一个分类问题，[MASK]单词有21128种可能数据构建实现分析：使用迁移学习方式完成使用预训练模型bert模型提取文特征，后面添加全连接层和softmax进行单标签多分类2数据集加载及处理数据介绍：数据文件有三个train.csv，test
ios内付费 374016526 ios 内付费
近年来写了很多IOS的程序，内付费也用到不少，使用IOS的内付费实现起来比较麻烦，这里我写了一个简单的内付费包，希望对大家有帮助。具体使用如下: 这里的sender其实就是调用者，这里主要是为了回调使用。 [KuroStoreApi kuroStoreProductId:@"产品ID" storeSender:self storeFinishCallBa
20 款优秀的 Linux 终端仿真器 brotherlamp linux linux视频 linux资料 linux自学 linux教程
终端仿真器是一款用其它显示架构重现可视终端的计算机程序。换句话说就是终端仿真器能使哑终端看似像一台连接上了服务器的客户机。终端仿真器允许最终用户用文本用户界面和命令行来访问控制台和应用程序。（LCTT 译注：终端仿真器原意指对大型机-哑终端方式的模拟，不过在当今的 Linux 环境中，常指通过远程或本地方式连接的伪终端，俗称“终端”。）你能从开源世界中找到大量的终端仿真器，它们
Solr Deep Paging(solr 深分页) eksliang solr深分页 solr分页性能问题
转载请出自出处：http://eksliang.iteye.com/blog/2148370 作者：eksliang(ickes) blg:http://eksliang.iteye.com/ 概述长期以来，我们一直有一个深分页问题。如果直接跳到很靠后的页数，查询速度会比较慢。这是因为Solr的需要为查询从开始遍历所有数据。直到Solr的4.7这个问题一直没有一个很好的解决方案。直到solr
数据库面试题 18289753290 面试题数据库
1.union ,union all 网络搜索出的最佳答案： union和union all的区别是,union会自动压缩多个结果集合中的重复结果，而union all则将所有的结果全部显示出来，不管是不是重复。 Union：对两个结果集进行并集操作，不包括重复行，同时进行默认规则的排序； Union All：对两个结果集进行并集操作，包括重复行，不进行排序； 2.索引有哪些分类？作用是
Android TV屏幕适配酷的飞上天空 android
先说下现在市面上TV分辨率的大概情况两种分辨率为主 1.720标清，分辨率为1280x720. 屏幕尺寸以32寸为主，部分电视为42寸 2.1080p全高清，分辨率为1920x1080 屏幕尺寸以42寸为主，此分辨率电视屏幕从32寸到50寸都有适配遇到问题，已1080p尺寸为例：分辨率固定不变，屏幕尺寸变化较大。如：效果图尺寸为1920x1080，如果使用d
Timer定时器与ActionListener联合应用永夜-极光 java
功能:在控制台每秒输出一次代码: package Main; import javax.swing.Timer; import java.awt.event.*; public class T { private static int count = 0; public static void main(String[] args){
Ubuntu14.04系统Tab键不能自动补全问题解决随便小屋 Ubuntu 14.04
Ubuntu 14.04安装之后就在终端中使用Tab键不能自动补全，解决办法如下： 1、利用vi编辑器打开/etc/bash.bashrc文件（需要root权限） sudo vi /etc/bash.bashrc 接下来会提示输入密码 2、找到文件中的下列代码 #enable bash completion in interactive shells #if
学会人际关系三招轻松走职场 aijuans 职场
要想成功，仅有专业能力是不够的，处理好与老板、同事及下属的人际关系也是门大学问。如何才能在职场如鱼得水、游刃有余呢？在此，教您简单实用的三个窍门。　　第一，多汇报最近，管理学又提出了一个新名词“追随力”。它告诉我们，做下属最关键的就是要多请示汇报，让上司随时了解你的工作进度，有了新想法也要及时建议。不知不觉，你就有了“追随力”，上司会越来越了解和信任你。　　第二，勤沟通团队的力
《O2O：移动互联网时代的商业革命》读书笔记 aoyouzi 读书笔记
移动互联网的未来：碎片化内容+碎片化渠道=各式精准、互动的新型社会化营销。 O2O：Online to OffLine 线上线下活动 O2O就是在移动互联网时代，生活消费领域通过线上和线下互动的一种新型商业模式。手机二维码本质：O2O商务行为从线下现实世界到线上虚拟世界的入口。线上虚拟世界创造的本意是打破信息鸿沟，让不同地域、不同需求的人
js实现图片随鼠标滚动的效果百合不是茶 JavaScript 滚动属性的获取图片滚动属性获取页面加载
1,获取样式属性值 top 与顶部的距离 left 与左边的距离 right 与右边的距离 bottom 与下边的距离 zIndex 层叠层次例子:获取左边的宽度,当css写在body标签中时 <div id="adver" style="position:absolute;top:50px;left:1000p
ajax同步异步参数async bijian1013 jquery Ajax async
开发项目开发过程中，需要将ajax的返回值赋到全局变量中，然后在该页面其他地方引用，因为ajax异步的原因一直无法成功，需将async:false，使其变成同步的。格式： $.ajax({ type: 'POST', ur
Webx3框架（1） Bill_chen eclipse spring maven 框架 ibatis
Webx是淘宝开发的一套Web开发框架，Webx3是其第三个升级版本；采用Eclipse的开发环境，现在支持java开发；采用turbine原型的MVC框架，扩展了Spring容器，利用Maven进行项目的构建管理，灵活的ibatis持久层支持，总的来说，还是一套很不错的Web框架。 Webx3遵循turbine风格，velocity的模板被分为layout/screen/control三部
【MongoDB学习笔记五】MongoDB概述 bit1129 mongodb
MongoDB是面向文档的NoSQL数据库，尽管业界还对MongoDB存在一些质疑的声音，比如性能尤其是查询性能、数据一致性的支持没有想象的那么好，但是MongoDB用户群确实已经够多。MongoDB的亮点不在于它的性能，而是它处理非结构化数据的能力以及内置对分布式的支持(复制、分片达到的高可用、高可伸缩)，同时它提供的近似于SQL的查询能力，也是在做NoSQL技术选型时，考虑的一个重要因素。Mo
spring/hibernate/struts2常见异常总结白糖_ Hibernate
Spring ①ClassNotFoundException: org.aspectj.weaver.reflect.ReflectionWorld$ReflectionWorldException 缺少aspectjweaver.jar，该jar包常用于spring aop中 ②java.lang.ClassNotFoundException: org.sprin
jquery easyui表单重置(reset)扩展思路 bozch form jquery easyui reset
在jquery easyui表单中尚未提供表单重置的功能，这就需要自己对其进行扩展。扩展的时候要考虑的控件有： combo,combobox,combogrid,combotree,datebox,datetimebox 需要对其添加reset方法，reset方法就是把初始化的值赋值给当前的组件，这就需要在组件的初始化时将值保存下来。在所有的reset方法添加完毕之后，就需要对fo
编程之美-烙饼排序 bylijinnan 编程之美
package beautyOfCoding; import java.util.Arrays; /* *《编程之美》的思路是：搜索+剪枝。有点像是写下棋程序：当前情况下，把所有可能的下一步都做一遍；在这每一遍操作里面，计算出如果按这一步走的话，能不能赢（得出最优结果）。 *《编程之美》上代码有很多错误，且每个变量的含义令人费解。因此我按我的理解写了以下代码： */
Struts1.X 源码分析之ActionForm赋值原理 chenbowen00 struts
struts1在处理请求参数之前，首先会根据配置文件action节点的name属性创建对应的ActionForm。如果配置了name属性，却找不到对应的ActionForm类也不会报错，只是不会处理本次请求的请求参数。如果找到了对应的ActionForm类，则先判断是否已经存在ActionForm的实例，如果不存在则创建实例，并将其存放在对应的作用域中。作用域由配置文件action节点的s
[空天防御与经济]在获得充足的外部资源之前,太空投资需有限度 comsci 资源
这里有一个常识性的问题: 地球的资源,人类的资金是有限的,而太空是无限的..... 就算全人类联合起来,要在太空中修建大型空间站,也不一定能够成功,因为资源和资金,技术有客观的限制.... &
ORACLE临时表—ON COMMIT PRESERVE ROWS daizj oracle 临时表
ORACLE临时表转临时表：像普通表一样，有结构，但是对数据的管理上不一样，临时表存储事务或会话的中间结果集，临时表中保存的数据只对当前会话可见，所有会话都看不到其他会话的数据，即使其他会话提交了，也看不到。临时表不存在并发行为，因为他们对于当前会话都是独立的。创建临时表时，ORACLE只创建了表的结构（在数据字典中定义），并没有初始化内存空间，当某一会话使用临时表时，ORACLE会
基于Nginx XSendfile+SpringMVC进行文件下载 denger 应用服务器 Web nginx 网络应用 lighttpd
在平常我们实现文件下载通常是通过普通 read-write方式，如下代码所示。 @RequestMapping("/courseware/{id}") public void download(@PathVariable("id") String courseID, HttpServletResp
scanf接受char类型的字符 dcj3sjt126com c
/* 2013年3月11日22:35:54 目的：学习char只接受一个字符 */ # include <stdio.h> int main(void) { int i; char ch; scanf("%d", &i); printf("i = %d\n", i); scanf("%
学编程的价值 dcj3sjt126com 编程
发一个人会编程, 想想以后可以教儿女, 是多么美好的事啊, 不管儿女将来从事什么样的职业, 教一教, 对他思维的开拓大有帮助像这位朋友学习: http://blog.sina.com.cn/s/articlelist_2584320772_0_1.html VirtualGS教程 (By @林泰前): 几十年的老程序员，资深的
二维数组（矩阵）对角线输出飞天奔月二维数组
今天在BBS里面看到这样的面试题目, 1，二维数组（N*N），沿对角线方向，从右上角打印到左下角如N=4： 4*4二维数组 { 1 2 3 4 } { 5 6 7 8 } { 9 10 11 12 } {13 14 15 16 } 打印顺序 4 3 8 2 7 12 1 6 11 16 5 10 15 9 14 13 要
Ehcache（08）——可阻塞的Cache——BlockingCache 234390216 并发 ehcache BlockingCache 阻塞
可阻塞的Cache—BlockingCache 在上一节我们提到了显示使用Ehcache锁的问题，其实我们还可以隐式的来使用Ehcache的锁，那就是通过BlockingCache。BlockingCache是Ehcache的一个封装类，可以让我们对Ehcache进行并发操作。其内部的锁机制是使用的net.
mysqldiff对数据库间进行差异比较 jackyrong mysqld
mysqldiff该工具是官方mysql-utilities工具集的一个脚本，可以用来对比不同数据库之间的表结构，或者同个数据库间的表结构如果在windows下，直接下载mysql-utilities安装就可以了，然后运行后，会跑到命令行下： 1）基本用法 mysqldiff --server1=admin:12345
spring data jpa 方法中可用的关键字 lawrence.li java spring
spring data jpa 支持以方法名进行查询/删除/统计。查询的关键字为find 删除的关键字为delete/remove (>=1.7.x) 统计的关键字为count (>=1.7.x) 修改需要使用@Modifying注解 @Modifying @Query("update User u set u.firstna
Spring的ModelAndView类 nicegege spring
项目中controller的方法跳转的到ModelAndView类，一直很好奇spring怎么实现的？ /* * Copyright 2002-2010 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * yo
搭建 CentOS 6 服务器(13) - rsync、Amanda rensanning centos
（一）rsync Server端 # yum install rsync # vi /etc/xinetd.d/rsync service rsync { disable = no flags = IPv6 socket_type = stream wait
Learn Nodejs 02 toknowme nodejs
（1）npm是什么 npm is the package manager for node 官方网站：https://www.npmjs.com/ npm上有很多优秀的nodejs包，来解决常见的一些问题，比如用node-mysql，就可以方便通过nodejs链接到mysql，进行数据库的操作在开发过程往往会需要用到其他的包，使用npm就可以下载这些包来供程序调用 &nb
Spring MVC 拦截器 xp9802 spring mvc
Controller层的拦截器继承于HandlerInterceptorAdapter HandlerInterceptorAdapter.java 1 public abstract class HandlerInterceptorAdapter implements HandlerIntercep