我在C语言里面调用内嵌的neon汇编的时候,算法运行两次后发生stopped错误,最后发现是内嵌的参数属性错误,于是我调换了参数的位置和某个参数的读写属性
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include
#include
#include "opencv2/imgproc/imgproc.hpp"
using namespace cv;
using namespace std;
struct YUVBufferAddr{
unsigned char* Yaddr;
unsigned char* Uaddr;
unsigned char* Vaddr;
};
class q_timer {
public:
void start()
{
m_start = std::chrono::steady_clock::now();
}
double stop()
{
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
return std::chrono::duration_cast
}
void time_display(const char *disp = "", int nr_frame = 1)
{
printf("Running time (%s) is: %5.5f Seconds.\n", disp, stop() / nr_frame);
}
void fps_display(const char *disp = "", int nr_frame = 1)
{
printf("Running time (%s) is: %5.5f frame per second.\n", disp, (double)nr_frame / stop());
}
private:
std::chrono::steady_clock::time_point m_start;
};
void ImgYUV2RGB24_neon(unsigned char *pu8RgbBuffer,
unsigned char *pu8SrcYUV,
int l32Width,
int l32Height,
unsigned char* u ,
unsigned char* v)
{
asm volatile (
"add r4, %2, %2 , lsl #1 \n"
"mul r5, r4, %3 \n"
"sub r5, r5, r4 \n"
"mul r5, %2, %3 \n"
"add %4, %1, r5 \n"
"add %5, %4, r5, lsr #2 \n"
"mov r8, %2, lsr #3 \n"
"mov r11, %3, lsr #1 \n"
"add %3, %1, %2 \n"
"mov r5, %0 \n"
"add %0, r5, r4 \n"
"mov r9, #16 \n"
"vdup.8 d8, r9 \n"
"mov r10, #128 \n"
"vdup.8 d9, r10 \n"
"mov r9, #75 \n"
"vdup.16 q5, r9 \n"
"mov r10, #102 \n"
"vdup.16 q6, r10 \n"
"mov r9, #25 \n"
"vdup.16 q7, r9 \n"
"mov r10, #52 \n"
"vdup.16 q8, r10 \n"
"mov r9, #129 \n"
"vdup.16 q9, r9 \n"
"2: \n"
"1: \n"
"subs r8, r8, #1 \n"
"vld1.u8 d0, [%1]! \n"
"vld1.u8 d2, [%3]! \n"
"vld1.32 {d4[0]}, [%4]! \n"
"vld1.32 {d4[1]}, [%5]! \n"
"vsubl.u8 q0, d0, d8 \n"
"vsubl.u8 q1, d2, d8 \n"
"vsubl.u8 q2, d4, d9 \n"
"vmov q3, q2 \n"
"vzip.s16 q2, q3 \n"
"vmul.s16 q10, q3, q8 \n"
"vmla.s16 q10, q2, q7 \n"
"vmul.s16 q11, q2, q9 \n"
"vmul.s16 q12, q3, q6 \n"
"vmul.s16 q0, q0, q5 \n"
"vmul.s16 q1, q1, q5 \n"
"vqsub.s16 q13, q0, q10 \n"
"vqsub.s16 q14, q1, q10 \n"
"vqrshrun.s16 d27, q13, #6 \n"
"vqrshrun.s16 d30, q14, #6 \n"
"vqadd.s16 q10, q0, q11 \n"
"vqadd.s16 q11, q1, q11 \n"
"vqrshrun.s16 d26, q10, #6 \n"
"vqrshrun.s16 d29, q11, #6 \n"
"vqadd.s16 q11, q0, q12 \n"
"vqadd.s16 q12, q1, q12 \n"
"vqrshrun.s16 d28, q11, #6 \n"
"vqrshrun.s16 d31, q12, #6 \n"
"vst3.8 {d26, d27, d28}, [%0]! \n"
"vst3.8 {d29, d30, d31}, [r5]! \n"
"bgt 1b \n"
"subs r11, r11, #1 \n"
"mov r5,%0 \n"
"add %0,r5,r4 \n"
"add %1, %1, %2 \n"
"add %3, %3, %2 \n"
"mov r8, %2, lsr #3 \n"
"bgt 2b \n"
: "+r"(pu8RgbBuffer), // %0 output readwrite
"+r"(pu8SrcYUV), // %1
"+r"(l32Width),// %2 readonly
"+r"(l32Height), // %3
"+r"(u), // %4
"+r"(v) // %5
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15","r4","r5","r8","r9","r10","r11"
);
}
void ImgYUV2RGB24_neoncopy(unsigned char *pu8RgbBuffer,
unsigned char *pu8SrcYUV,
unsigned char* u ,
unsigned char* v,
int l32Height,
int l32Width)
{
asm volatile (
"add r4, %5, %5 , lsl #1 \n"
"mul r5, r4, %4 \n"
"sub r5, r5, r4 \n"
"mul r5, %5, %4 \n"
"add %2, %1, r5 \n"
"add %3, %2, r5, lsr #2 \n"
"mov r8, %5, lsr #3 \n"
"mov r6, %4, lsr #1 \n"
"add %4, %1, %5 \n"
"mov r5, %0 \n"
"add %0, r5, r4 \n"
"mov r9, #16 \n"
"vdup.8 d8, r9 \n"
"mov r10, #128 \n"
"vdup.8 d9, r10 \n"
"mov r9, #75 \n"
"vdup.16 q5, r9 \n"
"mov r10, #102 \n"
"vdup.16 q6, r10 \n"
"mov r9, #25 \n"
"vdup.16 q7, r9 \n"
"mov r10, #52 \n"
"vdup.16 q8, r10 \n"
"mov r9, #129 \n"
"vdup.16 q9, r9 \n"
"2: \n"
"1: \n"
"subs r8, r8, #1 \n"
"vld1.u8 d0, [%1]! \n"
"vld1.u8 d2, [%4]! \n"
"vld1.32 {d4[0]}, [%2]! \n"
"vld1.32 {d4[1]}, [%3]! \n"
"vsubl.u8 q0, d0, d8 \n"
"vsubl.u8 q1, d2, d8 \n"
"vsubl.u8 q2, d4, d9 \n"
"vmov q3, q2 \n"
"vzip.s16 q2, q3 \n"
"vmul.s16 q10, q3, q8 \n"
"vmla.s16 q10, q2, q7 \n"
"vmul.s16 q11, q2, q9 \n"
"vmul.s16 q12, q3, q6 \n"
"vmul.s16 q0, q0, q5 \n"
"vmul.s16 q1, q1, q5 \n"
"vqsub.s16 q13, q0, q10 \n"
"vqsub.s16 q14, q1, q10 \n"
"vqrshrun.s16 d27, q13, #6 \n"
"vqrshrun.s16 d30, q14, #6 \n"
"vqadd.s16 q10, q0, q11 \n"
"vqadd.s16 q11, q1, q11 \n"
"vqrshrun.s16 d26, q10, #6 \n"
"vqrshrun.s16 d29, q11, #6 \n"
"vqadd.s16 q11, q0, q12 \n"
"vqadd.s16 q12, q1, q12 \n"
"vqrshrun.s16 d28, q11, #6 \n"
"vqrshrun.s16 d31, q12, #6 \n"
"vst3.8 {d26, d27, d28}, [%0]! \n"
"vst3.8 {d29, d30, d31}, [r5]! \n"
"bgt 1b \n"
"subs r6, r6, #1 \n"
"mov r5,%0 \n"
"add %0,r5,r4 \n"
"add %1, %1, %5 \n"
"add %4, %4, %5 \n"
"mov r8, %5, lsr #3 \n"
"bgt 2b \n"
: "+r"(pu8RgbBuffer), // %0 output readwrite
"+r"(pu8SrcYUV), // %1
"+r"(u), // %2
"+r"(v), // %3
"+r"(l32Height) // %4
: "r"(l32Width) // %5 readonly
: "cc", "memory","r4","r5","r8","r9","r10","r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
int main()
{
int width;
int height;
unsigned char *main_camera = NULL;
unsigned char *dest = NULL;
std::FILE *f_left = NULL;
f_left = std::fopen("/data/MV_F_Cap1_3000.yuv", "rb");
if (NULL == f_left )
return -1;
width = 4208;
height = 3120;
main_camera = new unsigned char[width*height*3/2];
std::fread(&main_camera[0], sizeof(char), height*width*3/2, f_left);
fclose(f_left); f_left = NULL;
YUVBufferAddr main_addr;
main_addr.Yaddr = main_camera;
main_addr.Uaddr = &main_camera[height*width];
main_addr.Vaddr = &main_camera[height*width + height*width / 4];
dest = new unsigned char[4208*3120*3];
q_timer time;
cv::Mat dst = cv::Mat(height,width, CV_8UC3, dest);
time.start();
ImgYUV2RGB24_neon(dest, main_addr.Yaddr, width , height , main_addr.Uaddr, main_addr.Vaddr);
//ImgYUV2RGB24_neoncopy(dest, main_addr.Yaddr, main_addr.Uaddr, main_addr.Vaddr, height ,width );
time.time_display("yuv2rgb_asm");
time.start();
ImgYUV2RGB24_neon(dest, main_addr.Yaddr, width , height , main_addr.Uaddr, main_addr.Vaddr);
//ImgYUV2RGB24_neoncopy(dest, main_addr.Yaddr, main_addr.Uaddr, main_addr.Vaddr, height ,width );
time.time_display("yuv2rgb_asm");
cv::imwrite("rgb23.png", dst);
return 0;
}
代码下载链接:yuv2rgb_stopped.tar.gz