ARM NEON + Ne10 优化示例

以 Android 平台为例，分别用标准 C、ARM NEON 和 Ne10 计算 float 数组的和，并比较耗时。

标准 C 代码实现

// 标准 C 代码实现
/*
 * Plain C reference implementation: adds the first `size` floats of `data`
 * in index order and returns the total. A size of zero or less yields 0.
 */
static float calc_c(const float* data, int size)
{
    float total = 0.f;
    const float* cursor = data;
    int remaining = size;

    /* Walk the array front to back so the accumulation order is data[0], data[1], ... */
    while (remaining > 0) {
        total += *cursor;
        ++cursor;
        --remaining;
    }

    return total;
}


运用 ARM NEON 优化

// 运用 ARM NEON 优化
#include <arm_neon.h>

/*
 * NEON implementation: sums the first `size` floats of `data`.
 *
 * Whole groups of four floats are accumulated lane-wise in a float32x4_t,
 * the four lanes are then added together, and any remaining 1-3 elements
 * are added with scalar code.
 *
 * Returns 0 when `size` is zero or negative, matching calc_c.
 */
static float calc_neon(const float* data, int size) 
{
    /* Nothing to add. Without this guard a negative size that is not a
     * multiple of 4 makes the tail loop read elements located before `data`. */
    if (size <= 0) {
        return 0.f;
    }

    const int vec_count = size / 4;
    float32x4_t sum_vec = vdupq_n_f32(0);

    /* Lane-wise accumulation: lane k collects data[k], data[k + 4], ... */
    for (int i = 0; i < vec_count; ++i) {
        float32x4_t tmp_vec = vld1q_f32(data + 4 * i);
        sum_vec = vaddq_f32(sum_vec, tmp_vec);
    }

    /* Horizontal reduction of the four partial sums, lane 0 first. */
    float sum = 0.f;
    sum += vgetq_lane_f32(sum_vec, 0);
    sum += vgetq_lane_f32(sum_vec, 1);
    sum += vgetq_lane_f32(sum_vec, 2);
    sum += vgetq_lane_f32(sum_vec, 3);

    /* Scalar tail: the elements that did not fill a whole vector. */
    for (int i = vec_count * 4; i < size; ++i) {
        sum += data[i];
    }

    return sum;
}


运用 Ne10 优化

// 运用 Ne10 优化
#include <NE10.h>
/* Number of floats handed to NE10 per call (also the width of the partial-sum buffer). */
#define ALIGH_UNIT        4

/*
 * Ne10 implementation: sums the first `size` floats of `data`.
 *
 * Whole groups of ALIGH_UNIT floats are accumulated into a small partial-sum
 * buffer via ne10_add_float_neon, the buffer is then reduced to one value,
 * and the leftover elements are added with scalar code.
 *
 * Returns 0 when `size` is zero or negative, matching calc_c.
 */
static float calc_ne10(const float* data, int size) 
{
    /* Nothing to add. Without this guard a negative size that is not a
     * multiple of ALIGH_UNIT makes the tail loop read elements before `data`. */
    if (size <= 0) {
        return 0.f;
    }

    const int chunk_count = size / ALIGH_UNIT;
    float sum_vec[ALIGH_UNIT] = {0};

    for (int i = 0; i < chunk_count; ++i) {
        /* The cast drops const because the NE10 call takes a non-const float*;
         * presumably the source operand is only read - confirm against the NE10 docs. */
        ne10_add_float_neon(sum_vec, sum_vec, (float*)data + ALIGH_UNIT * i, ALIGH_UNIT);
    }

    /* Reduce the partial sums, index 0 first. */
    float sum = 0.f;
    for (int i = 0; i < ALIGH_UNIT; ++i) {
        sum += sum_vec[i];
    }

    /* Scalar tail; starting at chunk_count * ALIGH_UNIT does not require
     * ALIGH_UNIT to be a power of two. */
    for (int i = chunk_count * ALIGH_UNIT; i < size; ++i) {
        sum += data[i];
    }

    return sum;
}

主程序 Main

// 主程序 Main
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <android/log.h>

/* Android log tag and a debug-level printf-style logging shorthand. */
#define  LOG_TAG    "Neon/Pref"
#define  LOGD(...)  __android_log_print(ANDROID_LOG_DEBUG,LOG_TAG,__VA_ARGS__)

/* Number of floats in the benchmark input array. */
#define ARRAY_SIZE        5000

/*
 * Wall-clock timing helpers built on gettimeofday (declared in <sys/time.h>).
 * ELAPSE_BEGIN(a) / ELAPSE_END(a) declare and fill `start<a>` / `end<a>` in
 * the current scope; ELAPSE_COUNT(a) is the difference in microseconds.
 */
#define ELAPSE_BEGIN(a)   struct timeval start##a = {0}; gettimeofday(&start##a, 0);
#define ELAPSE_END(a)     struct timeval end##a = {0};   gettimeofday(&end##a, 0);
#define ELAPSE_COUNT(a)   (1000000 * (end##a.tv_sec - start##a.tv_sec) + (end##a.tv_usec - start##a.tv_usec))

/*
 * Calls fn(args...) once, then logs the function name, the elapsed time in
 * microseconds and the float result. Wrapped in do { } while (0) so the macro
 * acts as a single statement and stays valid inside an unbraced if/else.
 */
#define DO_ELAPSE(fn,...)  \
	do {\
		ELAPSE_BEGIN(_##fn);\
		float sum = fn(__VA_ARGS__);\
		ELAPSE_END(_##fn);\
		LOGD( #fn " : %d, Result: %f", (int)ELAPSE_COUNT(_##fn), sum);\
	} while (0)


/*
 * Benchmark driver: fills an array with pseudo-random values in the range 0..4,
 * then runs each summation variant once through DO_ELAPSE, which logs the
 * elapsed time and the computed sum.
 */
int main(int argc, char** argv) {
    float data[ARRAY_SIZE] = {0};
    int idx = 0;

    /* rand() is left unseeded, so every run sees the same input sequence. */
    while (idx < ARRAY_SIZE) {
        data[idx] = rand() % 5;
        ++idx;
    }

    DO_ELAPSE(calc_c, data, ARRAY_SIZE);
    DO_ELAPSE(calc_neon, data, ARRAY_SIZE);
    DO_ELAPSE(calc_ne10, data, ARRAY_SIZE);

    return 0;
}


[资料文档]
ARM NEON: http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html

NE10 Manual: http://blogs.arm.com/software-enablement/874-ne10-library-getting-started/

[参考资料]
http://hilbert-space.de/?p=22
http://www.crickettechnology.com/blog/?p=691


你可能感兴趣的:(ARM NEON + Ne10 优化示例)