// NEON-optimized BGRA-to-grayscale conversion; measured at about 3 ms on an iPhone 4s. The code is below; feel free to use it.
#include <stdint.h>
/*
 * Convert one 4-byte pixel to gray using the same fixed-point formula as the
 * NEON loops below: byte 0 is weighted by 14, byte 1 by 76 and byte 2 by 38
 * (the weights sum to 128), then the sum is rounded and shifted right by 7.
 * The largest possible value is (128 * 255 + 64) >> 7 == 255, so the result
 * always fits in a byte without clamping.
 */
static inline uint8_t bgra_pixel_to_gray(const uint8_t *px)
{
    return (uint8_t)((14u * px[0] + 76u * px[1] + 38u * px[2] + 64u) >> 7);
}

/*
 * Convert `numPixels` 4-byte pixels at `src` into one gray byte each at `dest`.
 *
 * Per pixel: gray = (14 * byte0 + 76 * byte1 + 38 * byte2 + 64) >> 7; the
 * fourth byte of every pixel is ignored. Going by the function name the byte
 * order is B, G, R, A.
 *
 * dest      - output buffer, at least numPixels bytes.
 * src       - input buffer, at least 4 * numPixels bytes; must not overlap
 *             dest (both pointers are restrict-qualified).
 * numPixels - number of pixels to convert; values <= 0 are a no-op.
 *
 * Whole groups of 8 pixels go through the NEON loop when NEON is available;
 * the remaining 0..7 pixels (or everything, on targets without NEON) are
 * handled by scalar code that produces the same results. Exactly numPixels
 * output bytes are written and exactly 4 * numPixels input bytes are read.
 */
void neon_asm_convert_BGRA_to_gray(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
    if (numPixels <= 0) {
        /* The assembly loops always execute at least once, so never enter
         * them with nothing to do. */
        return;
    }

    int remaining = numPixels;

#if defined(__arm64__) || defined(__aarch64__)
    /* 64-bit ARM (Apple defines __arm64__, other toolchains __aarch64__). */
    int vecPixels = numPixels & ~7; /* largest multiple of 8 <= numPixels */
    remaining -= vecPixels;
    if (vecPixels > 0) {
        __asm__ __volatile__ (
            "movi v4.8b, #14 \n"                              /* weight for byte 0 */
            "movi v5.8b, #76 \n"                              /* weight for byte 1 */
            "movi v6.8b, #38 \n"                              /* weight for byte 2 */
            "1: \n"
            "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"     /* de-interleave 8 pixels */
            "subs %w2, %w2, #8 \n"
            "umull v16.8h, v0.8b, v4.8b \n"
            "umlal v16.8h, v1.8b, v5.8b \n"
            "umlal v16.8h, v2.8b, v6.8b \n"
            /* The sum is at most 128 * 255 = 32640, so treating it as signed
             * here is harmless; v3 (the 4th channel) is reused for the output. */
            "sqrshrun v3.8b, v16.8h, #7 \n"
            "st1 {v3.8b}, [%0], #8 \n"
            "b.gt 1b \n"
            : "+r"(dest),
              "+r"(src),
              "+r"(vecPixels)
            :
            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16"
        );
    }
#elif defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    /* 32-bit ARM with NEON. */
    int vecPixels = numPixels & ~7; /* largest multiple of 8 <= numPixels */
    remaining -= vecPixels;
    if (vecPixels > 0) {
        __asm__ __volatile__ (
            "vmov.u8 d4, #14 \n"                              /* weight for byte 0 */
            "vmov.u8 d5, #76 \n"                              /* weight for byte 1 */
            "vmov.u8 d6, #38 \n"                              /* weight for byte 2 */
            ".p2align 2 \n"
            "1: \n"
            "vld4.8 {d0, d1, d2, d3}, [%1]! \n"               /* de-interleave 8 pixels */
            "subs %2, %2, #8 \n"
            "vmull.u8 q8, d0, d4 \n"
            "vmlal.u8 q8, d1, d5 \n"
            "vmlal.u8 q8, d2, d6 \n"
            /* d3 (the 4th channel) is reused for the narrowed output. */
            "vqrshrun.s16 d3, q8, #7 \n"
            "vst1.8 {d3}, [%0]! \n"
            "bgt 1b \n"
            : "+r"(dest),
              "+r"(src),
              "+r"(vecPixels)
            :
            : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q8"
        );
    }
#endif

    /* Scalar path: the 0..7 leftover pixels after the NEON loop (which has
     * already advanced dest and src), or every pixel when NEON is unavailable. */
    while (remaining-- > 0) {
        *dest++ = bgra_pixel_to_gray(src);
        src += 4;
    }
}