C:
/*
 * Resize an 8-bit grayscale image with bilinear interpolation (plain C).
 *
 * src        : source pixels, one byte per pixel, rows src_stride bytes apart
 * w, h       : source width and height in pixels
 * src_stride : distance in bytes between the starts of two source rows
 * w2, h2     : destination width and height in pixels
 * dst        : destination buffer, written densely (w2 * h2 bytes, no padding)
 *
 * Nothing is written when a pointer is null or any dimension is not positive.
 * Each output value is the float blend truncated toward zero.
 */
void resizeBilinearGray(unsigned char* src, int w, int h, int src_stride, int w2, int h2,unsigned char* dst)
{
    if (!src || !dst || w <= 0 || h <= 0 || w2 <= 0 || h2 <= 0) {
        return;
    }

    const int st = src_stride;
    /* Map destination coordinates into [0, w-1) x [0, h-1) of the source. */
    const float x_ratio = ((float)(w - 1)) / w2;
    const float y_ratio = ((float)(h - 1)) / h2;
    unsigned char *out = dst;

    for (int i = 0; i < h2; i++) {
        /* Everything that depends only on the row is computed once per row. */
        const float fy = y_ratio * i;
        int y = (int)fy;
        if (y > h - 1) {
            y = h - 1;              /* guards against float rounding on huge images */
        }
        const float y_diff = fy - y;
        /* Clamp the lower neighbour so a 1-pixel-high source is never over-read. */
        const int y1 = (y + 1 < h) ? y + 1 : y;
        const unsigned char *row0 = src + (long long)y * st;
        const unsigned char *row1 = src + (long long)y1 * st;

        for (int j = 0; j < w2; j++) {
            const float fx = x_ratio * j;
            int x = (int)fx;
            if (x > w - 1) {
                x = w - 1;
            }
            const float x_diff = fx - x;
            /* Clamp the right neighbour so a 1-pixel-wide source is never over-read. */
            const int x1 = (x + 1 < w) ? x + 1 : x;

            /* The four taps surrounding the sample position. */
            const int A = row0[x];
            const int B = row0[x1];
            const int C = row1[x];
            const int D = row1[x1];

            /* Y = A(1-w)(1-h) + B(w)(1-h) + C(h)(1-w) + Dwh */
            const int gray = (int)(
                A * (1 - x_diff) * (1 - y_diff) + B * (x_diff) * (1 - y_diff) +
                C * (y_diff) * (1 - x_diff) + D * (x_diff * y_diff)
            );
            *out++ = (unsigned char)gray;
        }
    }
}
NEON 优化:NEON 指令依赖批量加载数据,但双线性插值中每个输出像素所需的源数据位置都要根据像素坐标单独计算,无法直接批量加载。我目前的思路是:先将计算所需的所有数据按顺序存入临时内存,再用 NEON 批量加载并计算。
/*
 * Blend four output pixels at once.
 *
 * a, b, c, d : top-left, top-right, bottom-left and bottom-right taps
 * xd, yd     : horizontal and vertical fractions for each lane
 *
 * Computes a(1-xd)(1-yd) + b*xd*(1-yd) + c*yd*(1-xd) + d*xd*yd per lane in
 * float, converts to unsigned 32-bit and narrows with saturation to 16-bit.
 */
static inline uint16x4_t neon_blend4_gray(uint16x4_t a, uint16x4_t b,
                                          uint16x4_t c, uint16x4_t d,
                                          float32x4_t xd, float32x4_t yd)
{
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t one_x = vsubq_f32(one, xd);   /* 1 - x_diff */
    const float32x4_t one_y = vsubq_f32(one, yd);   /* 1 - y_diff */

    /* Widen u16 -> u32, then convert to float. */
    const float32x4_t fa = vcvtq_f32_u32(vmovl_u16(a));
    const float32x4_t fb = vcvtq_f32_u32(vmovl_u16(b));
    const float32x4_t fc = vcvtq_f32_u32(vmovl_u16(c));
    const float32x4_t fd = vcvtq_f32_u32(vmovl_u16(d));

    float32x4_t sum = vmulq_f32(vmulq_f32(fa, one_x), one_y);
    sum = vaddq_f32(sum, vmulq_f32(vmulq_f32(fb, xd), one_y));
    sum = vaddq_f32(sum, vmulq_f32(vmulq_f32(fc, yd), one_x));
    sum = vaddq_f32(sum, vmulq_f32(vmulq_f32(fd, xd), yd));

    return vqmovn_u32(vcvtq_u32_f32(sum));
}

/*
 * Resize an 8-bit grayscale image with bilinear interpolation, using NEON
 * intrinsics for the arithmetic.
 *
 * src        : source pixels, one byte per pixel, rows src_stride bytes apart
 * w, h       : source width and height in pixels
 * src_stride : distance in bytes between the starts of two source rows
 * w2, h2     : destination width and height in pixels
 * dst        : destination buffer, written densely (w2 * h2 bytes, no padding)
 *
 * The taps of each output pixel live at computed source positions, so they
 * cannot be loaded as one vector. They are gathered eight at a time into
 * small stack arrays and loaded from there; no heap allocation is needed.
 * A final partial batch in a row is stored through a scratch buffer so that
 * nothing is written past the end of the row.
 *
 * Nothing is written when a pointer is null or any dimension is not positive.
 */
void resizeBilinearGray_neon(unsigned char* src, int w, int h, int src_stride, int w2, int h2,unsigned char* dst)
{
    enum { BATCH = 8 };

    if (!src || !dst || w <= 0 || h <= 0 || w2 <= 0 || h2 <= 0) {
        return;
    }

    const int st = src_stride;
    const float x_ratio = ((float)(w - 1)) / w2;
    const float y_ratio = ((float)(h - 1)) / h2;
    unsigned char *out = dst;

    for (int i = 0; i < h2; i++) {
        /* Row-invariant values: source row pair and vertical fraction. */
        const float fy = y_ratio * i;
        int y = (int)fy;
        if (y > h - 1) {
            y = h - 1;              /* guards against float rounding on huge images */
        }
        const float y_diff = fy - y;
        /* Clamp so a 1-pixel-high source is never over-read. */
        const int y1 = (y + 1 < h) ? y + 1 : y;
        const unsigned char *row0 = src + (long long)y * st;
        const unsigned char *row1 = src + (long long)y1 * st;
        const float32x4_t yd = vdupq_n_f32(y_diff);

        for (int j0 = 0; j0 < w2; j0 += BATCH) {
            /* Zero-initialised so unused lanes of a partial batch are defined. */
            uint16_t a[BATCH] = {0};
            uint16_t b[BATCH] = {0};
            uint16_t c[BATCH] = {0};
            uint16_t d[BATCH] = {0};
            float xd[BATCH] = {0};

            int count = w2 - j0;
            if (count > BATCH) {
                count = BATCH;
            }

            /* Gather the four taps and the horizontal fraction per pixel. */
            for (int k = 0; k < count; k++) {
                const float fx = x_ratio * (j0 + k);
                int x = (int)fx;
                if (x > w - 1) {
                    x = w - 1;
                }
                /* Clamp so a 1-pixel-wide source is never over-read. */
                const int x1 = (x + 1 < w) ? x + 1 : x;

                xd[k] = fx - x;
                a[k] = row0[x];
                b[k] = row0[x1];
                c[k] = row1[x];
                d[k] = row1[x1];
            }

            /* Y = A(1-w)(1-h) + B(w)(1-h) + C(h)(1-w) + Dwh, four lanes per call. */
            const uint16x4_t lo = neon_blend4_gray(vld1_u16(a), vld1_u16(b),
                                                   vld1_u16(c), vld1_u16(d),
                                                   vld1q_f32(xd), yd);
            const uint16x4_t hi = neon_blend4_gray(vld1_u16(a + 4), vld1_u16(b + 4),
                                                   vld1_u16(c + 4), vld1_u16(d + 4),
                                                   vld1q_f32(xd + 4), yd);
            const uint8x8_t result = vqmovn_u16(vcombine_u16(lo, hi));

            if (count == BATCH) {
                vst1_u8(out, result);
            } else {
                /* Partial batch: copy only the valid lanes. */
                uint8_t scratch[BATCH];
                vst1_u8(scratch, result);
                for (int k = 0; k < count; k++) {
                    out[k] = scratch[k];
                }
            }
            out += count;
        }
    }
}
测试结果:实际测试时发现,NEON 实现反而比 C 实现慢不少,猜测是因为需要先把数据重新写入临时内存,内存读写过多。
内嵌汇编:采用内嵌汇编进行优化,直接将数据从r寄存器逐个加载进neon寄存器,不先放入临时内存
/* Insert a 32-bit integer into lane `lane` (0..3) of v; lane indices must be
 * compile-time constants for the intrinsic, hence the switch. */
static inline uint32x4_t opt_set_lane_u32(uint32x4_t v, uint32_t s, int lane)
{
    switch (lane) {
    case 0:  return vsetq_lane_u32(s, v, 0);
    case 1:  return vsetq_lane_u32(s, v, 1);
    case 2:  return vsetq_lane_u32(s, v, 2);
    default: return vsetq_lane_u32(s, v, 3);
    }
}

/* Insert a float into lane `lane` (0..3) of v. */
static inline float32x4_t opt_set_lane_f32(float32x4_t v, float s, int lane)
{
    switch (lane) {
    case 0:  return vsetq_lane_f32(s, v, 0);
    case 1:  return vsetq_lane_f32(s, v, 1);
    case 2:  return vsetq_lane_f32(s, v, 2);
    default: return vsetq_lane_f32(s, v, 3);
    }
}

/*
 * Blend four output pixels: a(1-xd)(1-yd) + b*xd*(1-yd) + c*yd*(1-xd) + d*xd*yd
 * per lane, converted to unsigned 32-bit and narrowed with saturation to 16-bit.
 */
static inline uint16x4_t opt_blend4_gray(uint32x4_t a, uint32x4_t b,
                                         uint32x4_t c, uint32x4_t d,
                                         float32x4_t xd, float32x4_t yd)
{
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t one_x = vsubq_f32(one, xd);   /* 1 - x_diff */
    const float32x4_t one_y = vsubq_f32(one, yd);   /* 1 - y_diff */

    /* uint32x4_t -> float32x4_t */
    const float32x4_t fa = vcvtq_f32_u32(a);
    const float32x4_t fb = vcvtq_f32_u32(b);
    const float32x4_t fc = vcvtq_f32_u32(c);
    const float32x4_t fd = vcvtq_f32_u32(d);

    float32x4_t sum = vmulq_f32(vmulq_f32(fa, one_x), one_y);
    sum = vaddq_f32(sum, vmulq_f32(vmulq_f32(fb, xd), one_y));
    sum = vaddq_f32(sum, vmulq_f32(vmulq_f32(fc, yd), one_x));
    sum = vaddq_f32(sum, vmulq_f32(vmulq_f32(fd, xd), yd));

    /* float32x4_t -> uint32x4_t -> uint16x4_t */
    return vqmovn_u32(vcvtq_u32_f32(sum));
}

/*
 * Resize an 8-bit grayscale image with bilinear interpolation, moving each
 * pixel's operands straight from scalar values into NEON lanes instead of
 * staging them in temporary memory.
 *
 * src        : source pixels, one byte per pixel, rows src_stride bytes apart
 * w, h       : source width and height in pixels
 * src_stride : distance in bytes between the starts of two source rows
 * w2, h2     : destination width and height in pixels
 * dst        : destination buffer, written densely (w2 * h2 bytes, no padding)
 *
 * Pixels are processed as one flat stream of w2*h2 values: every four pixels
 * fill one set of vectors and are blended; every two such groups are narrowed
 * to eight bytes and stored at the position of the FIRST pixel of the group.
 * Lane-insert intrinsics are used rather than hand-written asm so the
 * compiler keeps the vector values alive between pixels; separate asm
 * statements give no guarantee that NEON registers survive the C code that
 * runs between them.
 *
 * Nothing is written when a pointer is null or any dimension is not positive.
 */
void resizeBilinearGray_neon_Optimized(unsigned char* src, int w, int h, int src_stride, int w2, int h2,unsigned char* dst)
{
    if (!src || !dst || w <= 0 || h <= 0 || w2 <= 0 || h2 <= 0) {
        return;
    }

    const int st = src_stride;
    const float x_ratio = ((float)(w - 1)) / w2;
    const float y_ratio = ((float)(h - 1)) / h2;

    /* Operand vectors, filled one lane per pixel. */
    uint32x4_t va = vdupq_n_u32(0);
    uint32x4_t vb = vdupq_n_u32(0);
    uint32x4_t vc = vdupq_n_u32(0);
    uint32x4_t vd = vdupq_n_u32(0);
    float32x4_t vx = vdupq_n_f32(0.0f);
    float32x4_t vy = vdupq_n_f32(0.0f);

    uint16x4_t first_half = vdup_n_u16(0);  /* results of pixels 0..3 of the current group of 8 */
    int have_first_half = 0;                /* set once first_half holds valid results */
    int lane = 0;                           /* next lane to fill, 0..3 */
    unsigned char *out = dst;               /* where the next group of results is stored */

    for (int i = 0; i < h2; i++) {
        const float fy = y_ratio * i;
        int y = (int)fy;
        if (y > h - 1) {
            y = h - 1;              /* guards against float rounding on huge images */
        }
        const float y_diff = fy - y;
        /* Clamp so a 1-pixel-high source is never over-read. */
        const int y1 = (y + 1 < h) ? y + 1 : y;
        const unsigned char *row0 = src + (long long)y * st;
        const unsigned char *row1 = src + (long long)y1 * st;

        for (int j = 0; j < w2; j++) {
            const float fx = x_ratio * j;
            int x = (int)fx;
            if (x > w - 1) {
                x = w - 1;
            }
            const float x_diff = fx - x;
            /* Clamp so a 1-pixel-wide source is never over-read. */
            const int x1 = (x + 1 < w) ? x + 1 : x;

            va = opt_set_lane_u32(va, row0[x], lane);
            vb = opt_set_lane_u32(vb, row0[x1], lane);
            vc = opt_set_lane_u32(vc, row1[x], lane);
            vd = opt_set_lane_u32(vd, row1[x1], lane);
            vx = opt_set_lane_f32(vx, x_diff, lane);
            vy = opt_set_lane_f32(vy, y_diff, lane);

            if (++lane < 4) {
                continue;
            }
            lane = 0;

            /* Four lanes are full: blend them. */
            const uint16x4_t blended = opt_blend4_gray(va, vb, vc, vd, vx, vy);
            if (have_first_half) {
                /* uint16x8_t -> uint8x8_t, then store eight pixels. */
                vst1_u8(out, vqmovn_u16(vcombine_u16(first_half, blended)));
                out += 8;
                have_first_half = 0;
            } else {
                first_half = blended;
                have_first_half = 1;
            }
        }
    }

    /* Flush the pixels left over when w2*h2 is not a multiple of 8. */
    const int remaining = (have_first_half ? 4 : 0) + lane;
    if (remaining > 0) {
        /* Lanes >= `lane` hold stale values; their results are discarded below. */
        const uint16x4_t partial = (lane > 0)
            ? opt_blend4_gray(va, vb, vc, vd, vx, vy)
            : vdup_n_u16(0);
        const uint16x8_t both = have_first_half
            ? vcombine_u16(first_half, partial)
            : vcombine_u16(partial, vdup_n_u16(0));
        uint8_t scratch[8];
        vst1_u8(scratch, vqmovn_u16(both));
        for (int k = 0; k < remaining; k++) {
            out[k] = scratch[k];
        }
    }
}
测试:速度与未优化的 C 实现差不多。从测试结果看,灰度图的双线性插值缩放并不太适合用 NEON 优化。虽然代码中还可以做指令调度、直接从内存加载数据到 NEON 寄存器等优化,但预计优化效果有限,性价比不高。Ne10 库中的图像缩放算法同样基于双线性插值(针对 RGBA 图像),其加速比也只有 1.5(相对于它自己的 C 实现)。如果用于 RGBA 格式的图像数据,应该会有一定加速,因为每个像素的数据可以批量加载进寄存器,不需要逐个单独加载。
还有一个用移位优化后的C实现,测试结果最快,该版本的实现尚未尝试用neon优化过,没测过优化效果