BGRA转灰度数据

使用neon优化,iPhone4s 上测试时间在3ms,直接上代码,需要的朋友拿去

 
  
#include <stdint.h>
/**
 * Convert interleaved 4-byte pixels (BGRA, per the function name) to 8-bit
 * grayscale.
 *
 * For every pixel the output byte is
 *     (14 * byte0 + 76 * byte1 + 38 * byte2 + 64) >> 7
 * i.e. a 7-bit fixed-point weighted sum with round-to-nearest. The fourth
 * byte of each pixel is ignored. The weights sum to 128, so the result never
 * exceeds 255.
 *
 * @param dest       Output buffer; exactly numPixels bytes are written.
 * @param src        Input buffer; exactly numPixels * 4 bytes are read.
 * @param numPixels  Number of pixels to convert; values <= 0 are a no-op.
 *
 * Whole groups of 8 pixels are converted with NEON when the target provides
 * it. Leftover pixels (and every pixel on targets without NEON) go through a
 * scalar loop that produces bit-identical results, so the function never
 * touches memory beyond the sizes stated above.
 */
void neon_asm_convert_BGRA_to_gray(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
    if (numPixels <= 0) {
        return;
    }

    int remaining = numPixels;

#if defined(__arm64__) || defined(__aarch64__)
    /* The asm loop is do-while shaped and consumes 8 pixels per iteration,
     * so it must only be entered with a positive multiple of 8. */
    int vecPixels = remaining & ~7;
    remaining -= vecPixels;

    if (vecPixels > 0) {
        __asm__ __volatile__ (
            "movi       v4.8b, #14                        \n"
            "movi       v5.8b, #76                        \n"
            "movi       v6.8b, #38                        \n"
            "1:                                           \n"
            /* De-interleave 8 pixels: v0/v1/v2/v3 = channel 0/1/2/3. */
            "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
            "subs       %w2, %w2, #8                      \n"
            "umull      v16.8h, v0.8b, v4.8b              \n"
            "umlal      v16.8h, v1.8b, v5.8b              \n"
            "umlal      v16.8h, v2.8b, v6.8b              \n"
            /* Rounding shift by 7 and narrow to bytes (v3 is reused). */
            "sqrshrun   v3.8b, v16.8h, #7                 \n"
            "st1        {v3.8b}, [%0], #8                 \n"
            "b.gt       1b                                \n"
            : "+r"(dest),
              "+r"(src),
              "+r"(vecPixels)
            :
            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16"
        );
    }
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* Same constraint as above: only whole groups of 8 pixels go to NEON. */
    int vecPixels = remaining & ~7;
    remaining -= vecPixels;

    if (vecPixels > 0) {
        __asm__ __volatile__ (
            "vmov.u8    d4, #14                           \n"
            "vmov.u8    d5, #76                           \n"
            "vmov.u8    d6, #38                           \n"
            ".p2align   2                                 \n"
            "1:                                           \n"
            /* De-interleave 8 pixels: d0/d1/d2/d3 = channel 0/1/2/3. */
            "vld4.8     {d0, d1, d2, d3}, [%1]!           \n"
            "subs       %2, %2, #8                        \n"
            "vmull.u8   q8, d0, d4                        \n"
            "vmlal.u8   q8, d1, d5                        \n"
            "vmlal.u8   q8, d2, d6                        \n"
            /* Rounding shift by 7 and narrow to bytes (d3 is reused). */
            "vqrshrun.s16 d3, q8, #7                      \n"
            "vst1.8     {d3}, [%0]!                       \n"
            "bgt        1b                                \n"
            : "+r"(dest),
              "+r"(src),
              "+r"(vecPixels)
            :
            : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q8"
        );
    }
#endif

    /* Scalar tail / portable fallback. dest and src were advanced by the asm
     * block above (if any), so they already point at the first unconverted
     * pixel. The +64 reproduces the rounding of the NEON narrowing shift. */
    while (remaining > 0) {
        unsigned int sum = 14u * src[0] + 76u * src[1] + 38u * src[2] + 64u;
        *dest = (uint8_t)(sum >> 7);
        dest += 1;
        src += 4;
        remaining -= 1;
    }
}


你可能感兴趣的:(算法)