NEON优化系列文章:
- NEON优化1:软件性能优化、降功耗怎么搞?link
- NEON优化2:ARM优化高频指令总结, link
- NEON优化3:矩阵转置的指令优化案例,link
- NEON优化4:floor/ceil函数的优化案例,link
- NEON优化5:log10函数的优化案例,link
- NEON优化6:关于交叉存取与反向交叉存取,link
- NEON优化7:性能优化经验总结,link
- NEON优化8:性能优化常见问题QA,link
假设已有float类型的大数组 `x[10000]` 和float类型的 `y[10000]`,将 `x[i]` 以向下取整的方式转为都是整数的float类型数组,结果放在 `y[i]` 数组里。
`ceil`/`floor`/`round`/`int` 傻傻分不清楚?先把浮点取整函数梳理如下表:
函数名 | 符号 | 功能 | 举例 |
---|---|---|---|
ceil | ⌈⌉ | 向上取整 | 正数:(4, 5] -> 5;负数:(-6, -5] -> -5 |
floor | ⌊⌋ | 向下取整 | 正数:[5, 6) -> 5;负数:[-5, -4) -> -5 |
round | ⌊⌉ | 圆整,四舍五入 | 正数:[4.5, 5.5) -> 5;负数: (-5.5, -4.5] -> -5 |
int | NA | 截断 | 正数:[5, 6) -> 5;负数:(-6, -5] -> -5 |
从上表中可以看出,floor函数的特性就是:非负数时等价于截断功能;负数时若不是整数则减一再截断,若是负整数则等于自身。我们容易发现,floor函数处理前后两数间距不可能 `>=1`,这一点可以作为处理特殊情况(负整数)的依据。
于是有以下思路:
- 对所有负数先减1,非负数保持不变;
- 对结果做截断取整;
- 若原始值与取整结果的间距 `>=1`,该值为负整数,需换为初始数据。

注意,该思路同样也可以推广到ceil和round函数的优化。ceil处理前后数据间距小于1,round函数处理后数据间距不超过0.5。
原函数:
#include <math.h>

/*
 * Scalar baseline: floor every element of x into y, one element per
 * iteration. x and y are the 10000-element float arrays described in the
 * text above.
 */
int i;
for (i = 0; i < 10000; i++)
{
    /* floor() yields a double; the (int) cast turns it into an integer,
     * which is converted back to float when stored into y[i]. */
    y[i] = (int)floor(x[i]);
}
优化后:
#include
#include
float32x4_t vf32x4Temp, vf32x4TempV1;
float32x4_t vf32x4Zero, vf32x4One;
int32x4_t vs32x4Temp;
uint32x4_t vu32x4TempV1;
vf32x4Zero = vdupq_n_f32(0.0f);
vf32x4One = vdupq_n_f32(1.0f);
#define STEP_NUM 4
int i;
for (i = 0; i < 10000 - STEP_NUM + 1; i += STEP_NUM) {
vf32x4Temp = vld1q_f32(&x[i]);
vf32x4TempV1 = vf32x4Temp;
vu32x4TempV1 = vcltq_f32(vf32x4Temp, vf32x4Zero); // 判断是否小于0
vf32x4Temp = vbslq_f32(vu32x4TempV1, vsubq_f32(vf32x4Temp, vf32x4One), vf32x4Temp); // 大于0,则截断;否则,就减1
vs32x4Temp = vcvtq_s32_f32(vf32x4Temp); // 截断取整,但存在如果是负数且为整数时,多减了1
vf32x4Temp = vcvtq_f32_s32(vs32x4Temp);
vu32x4TempV1 = vcgeq_f32(vf32x4TempV1, vaddq_f32(vf32x4Temp, vf32x4One)); // a>=b+1 时,多减1的负数整数本身
vf32x4Temp = vbslq_f32(vu32x4TempV1, vf32x4TempV1, vf32x4Temp); // 出现a>=b+1情况,必然是负数整数本身,此时赋值为原初始值
vst1q_f32(&y[i], vf32x4Temp);
}
for (; i < 10000; i++) // 处理数组尾部不足STEP_NUM的数据
{
y[i] = (float)floor(x[i]);
}
功能思路
代码思路
公共部分代码:
#include
#include
/* Number of sample cities used by the demos. */
#define CITY_NUM (7)

/* Sample temperatures of the CITY_NUM cities; the demos round each of them
 * to a whole number. */
float g_fTemp[CITY_NUM] = {
    -4.5f,
    -21.3f,
    -20.0f,
    25.0f,
    26.5f,
    -24.9f,
    21.1f,
};
Ceil优化函数demo:
/*
 * Demo: compute the ceiling of every g_fTemp entry with NEON, four floats at
 * a time, and print each result next to the scalar ceil() value.
 *
 * Per lane:
 *   1. add 1 to non-negative inputs, leave negative inputs alone;
 *   2. truncate toward zero through an int32 conversion;
 *   3. if the original is <= result - 1, the input was already a whole
 *      number that step 1 raised one too far, so restore the original.
 */
void ceil_neon_opt_demo(void)
{
    enum { LANES = 4 }; /* number of float lanes in one float32x4_t */

    int32_t iTemp[CITY_NUM];
    float32x4_t vf32x4fTemp, vf32x4TempOrigin;
    float32x4_t vf32x4Zero, vf32x4One;
    uint32x4_t vu32x4ComparFlag;
    int32x4_t vs32x4Temp;
    int32_t i;

    vf32x4Zero = vdupq_n_f32(0.0f);
    vf32x4One = vdupq_n_f32(1.0f);

    /* Walk backwards from the last element; each pass covers the LANES
     * elements ending at index i. */
    for (i = CITY_NUM - 1; i - (LANES - 1) >= 0; i -= LANES) {
        /* Load LANES floats from memory into a NEON register */
        vf32x4fTemp = vld1q_f32(&g_fTemp[i - (LANES - 1)]);
        vf32x4TempOrigin = vf32x4fTemp; /* keep the untouched input for step 3 */

        /* Lane mask: input < 0 */
        vu32x4ComparFlag = vcltq_f32(vf32x4fTemp, vf32x4Zero);
        /* Negative lanes stay as they are; non-negative lanes become input + 1 */
        vf32x4fTemp = vbslq_f32(vu32x4ComparFlag, vf32x4fTemp, vaddq_f32(vf32x4TempOrigin, vf32x4One));

        /* Truncate toward zero */
        vs32x4Temp = vcvtq_s32_f32(vf32x4fTemp);
        vf32x4fTemp = vcvtq_f32_s32(vs32x4Temp);

        /* Lane mask: original <= result - 1, i.e. the input was a whole number */
        vu32x4ComparFlag = vcleq_f32(vf32x4TempOrigin, vsubq_f32(vf32x4fTemp, vf32x4One));
        /* Those lanes take the original value back */
        vf32x4fTemp = vbslq_f32(vu32x4ComparFlag, vf32x4TempOrigin, vf32x4fTemp);

        /* Convert the whole-number floats to int32 and store them */
        vs32x4Temp = vcvtq_s32_f32(vf32x4fTemp);
        vst1q_s32(&iTemp[i - (LANES - 1)], vs32x4Temp);
    }

    /* Scalar path for the leading elements that do not fill a full vector */
    for (; i >= 0; i--) {
        iTemp[i] = (int32_t)ceil(g_fTemp[i]);
    }

    /* Verify: print the input, the scalar reference and the NEON result */
    for (i = 0; i < CITY_NUM; i++) {
        printf("orig: %f \tceil:%d \tneon: %d\n", g_fTemp[i], (int32_t)ceil(g_fTemp[i]), iTemp[i]);
    }
}
输出结果:
orig: -4.500000 ceil:-4 neon: -4
orig: -21.299999 ceil:-21 neon: -21
orig: -20.000000 ceil:-20 neon: -20
orig: 25.000000 ceil:25 neon: 25
orig: 26.500000 ceil:27 neon: 27
orig: -24.900000 ceil:-24 neon: -24
orig: 21.100000 ceil:22 neon: 22
floor优化函数demo:
void floor_neon_opt_demo(void)
{
float32x4_t vf32x4Temp, vf32x4TempOrigin;
float32x4_t vf32x4Zero, vf32x4One;
int32x4_t vs32x4Temp;
uint32x4_t vu32x4ComparFlag;
vf32x4Zero = vdupq_n_f32(0.0f);
vf32x4One = vdupq_n_f32(1.0f);
#define STEP_NUM 4
float floorTemp[CITY_NUM];
int32_t i;
for (i = 0; i < CITY_NUM - STEP_NUM + 1; i += STEP_NUM) {
vf32x4Temp = vld1q_f32(&fTemp[i]);
vf32x4TempOrigin = vf32x4Temp;
vu32x4ComparFlag = vcltq_f32(vf32x4Temp, vf32x4Zero); // 判断是否小于0
vf32x4Temp = vbslq_f32(vu32x4ComparFlag, vsubq_f32(vf32x4Temp, vf32x4One), vf32x4Temp); // 大于0,则截断;否则,就减1
vs32x4Temp = vcvtq_s32_f32(vf32x4Temp); // 截断取整,但存在如果是负数且为整数时,多减了1
vf32x4Temp = vcvtq_f32_s32(vs32x4Temp);
vu32x4ComparFlag = vcgeq_f32(vf32x4TempOrigin, vaddq_f32(vf32x4Temp, vf32x4One)); // a>=b+1 时,多减1的负数整数本身
vf32x4Temp = vbslq_f32(vu32x4ComparFlag, vf32x4TempOrigin, vf32x4Temp); // 出现a>=b+1情况,必然是负数整数本身,此时赋值为原初始值
vst1q_f32(&floorTemp[i], vf32x4Temp);
}
for (; i < CITY_NUM; i++) // 处理数组尾部不足STEP_NUM的数据
{
floorTemp[i] = (float)floor(fTemp[i]);
}
// verify result
for (i = 0; i < CITY_NUM; i++) {
printf("orig: %f\t floor:%d\t neon: %f\n", fTemp[i], (int32_t)floor(fTemp[i]), floorTemp[i]);
}
}