目录
1. 源码下载
2. DSP库源码简介
3.基于库的移植(DSP库的使用)
3.1 实验1
3.2 实验2
4. 使用V6版本的编译器进行编译
上一篇:STM32F407-Discovery的硬件FPU-CSDN博客
Github地址:GitHub - ARM-software/CMSIS_5: CMSIS Version 5 Development Repository
最新版本是5.9.0,也可以使用HAL库里自带的,本文基于STM32Cube_FW_F4_V1.27.0里自带的DSP版本
目录结构如下:\Drivers\CMSIS\DSP
Include:公共头文件夹目录,其中比较重要的是arm_math.h
Projects:官方自带的工程示例
Source:DSP的源码实现,是重点目录
Source目录下各个文件夹实现功能简介如下表:
| 文件夹 | 实现的功能(API) |
| --- | --- |
| BasicMathFunctions | 实现基本数学函数,有浮点/定点/向量等基本运算 |
| CommonTables | 一些公用的参数表 |
| ComplexMathFunctions | 复数的计算:加减乘除、取模等 |
| ControllerFunctions | 一些控制功能函数:比如PID控制算法 |
| FastMathFunctions | 纯数学理论实现的一些快速计算算法:求正余弦/快速开方 |
| FilteringFunctions | 滤波功能的实现:IIR/FIR/LMS/求卷积等 |
| MatrixFunctions | 矩阵运算相关API:加减法、转置、求逆等 |
| StatisticsFunctions | 常用的统计学方法:求均值/方差/标准差/均方根等 |
| SupportFunctions | 功能性函数:数据拷贝(连续的一大块)/定点浮点之间的转换 |
| TransformFunctions | 变换函数实现:复数/实数的FFT/IFFT以及离散余弦变换DCT |
对应的DSP LIB库:\Drivers\CMSIS\Lib\ARM:
STM32F4是M4内核,FPU支持单精度浮点数据运算,小端模式,所以:arm_cortexM4lf_math.lib是重点文件。
仿照源代码库文件所在的目录结构,新建文件夹:Drivers\CMSIS\Lib\ARM
然后直接拷贝arm_cortexM4lf_math.lib到ARM目录下。
仿照源代码库的目录结构,新建文件夹:Drivers\CMSIS\DSP\Include
同样将源码目录中的三个头文件拷贝过来:
将库文件添加到Keil工程,并且添加头文件路径:
到此,库的移植已经完毕,接下来是预处理定义一些宏。
(1) 首先是硬件FPU要开启:取决于__FPU_PRESENT和__FPU_USED
详见上一篇:STM32F407-Discovery的硬件FPU-CSDN博客
(2) 使用DSP库中的基本数学运算实现,比如sin()/cos():ARM_MATH_DSP
(3) 如果使用矩阵运算,则矩阵大小是个很值得注意的问题,运算前要对输入矩阵的大小进行检查:ARM_MATH_MATRIX_CHECK
(4) 浮点数转 Q31/Q15/Q7 时,进行四舍五入处理,最大限度减少精度损失: ARM_MATH_ROUNDING
(5) 批量处理数据时,加快执行速度,比如批量求绝对值: ARM_MATH_LOOPUNROLL
(6) 最后是CM4内核的一个宏:ARM_MATH_CM4
把这些添加到全局的宏定义中:
USE_HAL_DRIVER,STM32F407xx,USE_STM32F4_DISCO,ARM_MATH_MATRIX_CHECK,ARM_MATH_ROUNDING,ARM_MATH_LOOPUNROLL,ARM_MATH_CM4 |
对比MDK标准库函数和DSP库函数的计算速度。
Main.c
/* Includes ------------------------------------------------------------------*/
#include "main.h"
#define DELTA 0.0001f /* tolerance: largest accepted |sin^2 + cos^2 - 1| in sin_cos_test() */
extern TIM_HandleTypeDef g_timx_handle; /* timer handle, defined outside this listing */
uint8_t g_timeout; /* scaled by 65536 and added to the timer count when elapsed time is computed; presumably bumped by the timer overflow ISR - confirm */
/**
 * @brief  Sweep a range of angles and check sin^2(x) + cos^2(x) == 1.
 *
 *         Starting from @p angle and stepping by 0.001f per iteration, each
 *         pass evaluates cosine and sine, and stops at the first iteration
 *         whose result deviates from 1 by more than DELTA.
 *
 * @param  angle : starting angle, handed unchanged to the sine/cosine routines
 * @param  times : number of iterations to run
 * @param  mode  : selects the implementation under test
 *   @arg  0     : cosf() / sinf()
 *   @arg  other : arm_cos_f32() / arm_sin_f32()
 *
 * @retval 0     every iteration stayed within DELTA
 * @retval 0xFF  an iteration exceeded DELTA
 */
uint8_t sin_cos_test(float angle, uint32_t times, uint8_t mode)
{
    uint32_t remaining = times;
    float c, s;
    float sum_sq;
    float err;

    if (mode != 0)
    {
        /* CMSIS-DSP sine/cosine */
        while (remaining != 0)
        {
            c = arm_cos_f32(angle);
            s = arm_sin_f32(angle);
            sum_sq = s * s + c * c;        /* expected to be 1 */
            err = fabsf(sum_sq - 1.0f);    /* distance from 1 */
            if (err > DELTA)
            {
                return 0XFF;               /* identity violated */
            }
            angle += 0.001f;               /* next angle */
            remaining--;
        }
        return 0;
    }

    /* sinf()/cosf() from the C math library */
    while (remaining != 0)
    {
        c = cosf(angle);
        s = sinf(angle);
        sum_sq = s * s + c * c;            /* expected to be 1 */
        err = fabsf(sum_sq - 1.0f);        /* distance from 1 */
        if (err > DELTA)
        {
            return 0XFF;                   /* identity violated */
        }
        angle += 0.001f;                   /* next angle */
        remaining--;
    }

    return 0;                              /* all iterations passed */
}
/**
 * @brief  Entry point of the sin/cos benchmark listing.
 *
 *         Initializes the HAL, the system clock and UART2, then spins forever.
 *
 *         NOTE(review): this listing is damaged. The text that followed the
 *         commented-out block below is missing, and only the tail of a
 *         printf() of SCB->CPACR survives, glued onto the last commented
 *         line. The timed sin_cos_test() calls and the other diagnostic
 *         prints were presumably in the missing part - restore them from
 *         the original project.
 *
 *         The "#if 1" had no matching "#endif", which made the listing
 *         uncompilable; the conditional is now closed right after the
 *         commented-out block.
 *
 * @retval int  never reached in practice: the function ends in an endless
 *              loop and the trailing return only satisfies the signature.
 */
int main(void)
{
    float time;    /* not used by what is left of this listing */
    uint8_t res;   /* not used by what is left of this listing */

    /* STM32F4xx HAL library initialization:
         - Configure the Flash prefetch, instruction and Data caches
         - Configure the Systick to generate an interrupt each 1 msec
         - Set NVIC Group Priority to 4
         - Global MSP (MCU Support Package) initialization
       */
    HAL_Init();

    /* Configure the system clock to 168 MHz */
    SystemClock_Config();

    /* UART2 init: only the TX direction is used (per the original note) */
    if (uart2_init(9600))
    {
        Error_Handler();
    }

#if 1
//    BSP_LED_Off(LED6);
//    HAL_Delay(200);
//
//    BSP_LED_On(LED6);
//    //HAL_Delay(1000);
//    //test_fpu_tmp1();
//    fft_test();
//    BSP_LED_Off(LED6);
//    for (int i=0; i CPACR:0x%x\n", SCB->CPACR);
#endif

    while (1)
    {
        ;
    }

    return 0;
}
/**
 * @brief  Switch the system clock to the HSE-driven PLL.
 *
 *         Values programmed by this function:
 *            System clock source            = PLL (HSE)
 *            AHB prescaler                  = 1
 *            APB1 prescaler                 = 4
 *            APB2 prescaler                 = 2
 *            PLL_M / PLL_N / PLL_P / PLL_Q  = 8 / 336 / 2 / 7
 *            Main regulator output voltage  = Scale1 mode
 *            Flash latency (WS)             = 5
 *
 *         With the 8 MHz HSE and 3.3 V VDD stated by the original header,
 *         these give SYSCLK = HCLK = 168 MHz.
 *
 *         Any HAL failure is routed to Error_Handler().
 *
 * @param  None
 * @retval None
 */
static void SystemClock_Config(void)
{
    RCC_OscInitTypeDef osc_cfg;
    RCC_ClkInitTypeDef clk_cfg;

    /* Power controller clock must be running before the regulator is touched */
    __HAL_RCC_PWR_CLK_ENABLE();

    /* Voltage scaling trades power for maximum frequency; see the product
       datasheet for the value that matches the target system frequency. */
    __HAL_PWR_VOLTAGESCALING_CONFIG(PWR_REGULATOR_VOLTAGE_SCALE1);

    /* Oscillator: HSE on, PLL on and sourced from HSE */
    osc_cfg.OscillatorType = RCC_OSCILLATORTYPE_HSE;
    osc_cfg.HSEState       = RCC_HSE_ON;
    osc_cfg.PLL.PLLSource  = RCC_PLLSOURCE_HSE;
    osc_cfg.PLL.PLLState   = RCC_PLL_ON;
    osc_cfg.PLL.PLLM       = 8;
    osc_cfg.PLL.PLLN       = 336;
    osc_cfg.PLL.PLLP       = RCC_PLLP_DIV2;
    osc_cfg.PLL.PLLQ       = 7;

    if (HAL_RCC_OscConfig(&osc_cfg) != HAL_OK)
    {
        Error_Handler();
    }

    /* Bus clocks: PLL as SYSCLK, then the HCLK / PCLK1 / PCLK2 dividers */
    clk_cfg.ClockType      = (RCC_CLOCKTYPE_SYSCLK | RCC_CLOCKTYPE_HCLK |
                              RCC_CLOCKTYPE_PCLK1 | RCC_CLOCKTYPE_PCLK2);
    clk_cfg.SYSCLKSource   = RCC_SYSCLKSOURCE_PLLCLK;
    clk_cfg.AHBCLKDivider  = RCC_SYSCLK_DIV1;
    clk_cfg.APB1CLKDivider = RCC_HCLK_DIV4;
    clk_cfg.APB2CLKDivider = RCC_HCLK_DIV2;

    if (HAL_RCC_ClockConfig(&clk_cfg, FLASH_LATENCY_5) != HAL_OK)
    {
        Error_Handler();
    }

    /* STM32F405x/407x/415x/417x Revision Z devices: prefetch is supported */
    if (HAL_GetREVID() == 0x1001)
    {
        __HAL_FLASH_PREFETCH_BUFFER_ENABLE();
    }
}
运行结果如下:
[22:54:44.691] 315.0ms //使用库函数
[22:54:44.851] 153.9ms //使用DSP
[22:54:44.851] __CC_ARM:1
[22:54:44.867] __FPU_PRESENT:1
[22:54:44.883] __FPU_USED:1
[22:54:44.898] SCB->CPACR:0xf00000
测量计算1024个点的复数FFT运行时长
main.c
/* FFT length in complex points; per the original note it defaults to 1024
 * when not specified otherwise.
 * Selectable lengths (per the original note): 16, 64, 256, 1024.
 */
#define FFT_LENGTH 1024
float fft_inputbuf[FFT_LENGTH * 2]; /* FFT input, interleaved: [2*i] real part, [2*i + 1] imaginary part */
float fft_outputbuf[FFT_LENGTH]; /* FFT output: receives the magnitudes from arm_cmplx_mag_f32() */
uint8_t g_timeout; /* scaled by 65536 and added to the timer count when elapsed time is computed; presumably bumped by the timer overflow ISR - confirm */
extern TIM_HandleTypeDef g_timx_handle; /* timer handle, defined outside this listing */
/**
 * @brief  Time one FFT_LENGTH-point complex FFT and report it over UART.
 *
 *         Sequence: HAL / clock / UART2 bring-up, FFT instance setup, test
 *         signal generation, timed radix-4 FFT, magnitude computation, then
 *         a few build/FPU diagnostics before spinning forever.
 *
 *         Changes against the original listing:
 *          - the __CC_ARM print is guarded, because that macro is only
 *            defined by Arm Compiler 5; with Arm Compiler 6 the unguarded
 *            reference does not build;
 *          - SCB->CPACR is cast to unsigned int to match its "%x" conversion;
 *          - the unused local "res" is removed.
 *         Output under Arm Compiler 5 is unchanged.
 *
 * @retval int  never reached in practice: the function ends in an endless
 *              loop and the trailing return only satisfies the signature.
 */
int main(void)
{
    float time;
    int i;
    arm_cfft_radix4_instance_f32 scfft;

    /* STM32F4xx HAL library initialization:
         - Configure the Flash prefetch, instruction and Data caches
         - Configure the Systick to generate an interrupt each 1 msec
         - Set NVIC Group Priority to 4
         - Global MSP (MCU Support Package) initialization
       */
    HAL_Init();

    /* Configure the system clock to 168 MHz */
    SystemClock_Config();

    /* UART2 init: only the TX direction is used (per the original note) */
    if (uart2_init(9600))
    {
        Error_Handler();
    }

    /* Set up the FFT instance: forward transform (ifftFlag = 0) with
       bit reversal enabled (bitReverseFlag = 1) */
    arm_cfft_radix4_init_f32(&scfft, FFT_LENGTH, 0, 1);

#if 1
    /* Build the input sequence: a constant 100 plus three tones */
    for (i = 0; i < FFT_LENGTH; i++)
    {
        fft_inputbuf[2 * i] = 100 +
                              10 * arm_sin_f32(2 * PI * i / FFT_LENGTH) +
                              30 * arm_sin_f32(2 * PI * i * 4 / FFT_LENGTH) +
                              50 * arm_cos_f32(2 * PI * i * 8 / FFT_LENGTH); /* real part */
        fft_inputbuf[2 * i + 1] = 0;                                         /* imaginary part: always 0 */
    }

    /* Start the timing base and clear both the counter and the software
       overflow count, so the measurement starts from zero */
    btim_timx_int_init(65535, 8400 - 1);
    __HAL_TIM_SET_COUNTER(&g_timx_handle, 0);   /* TIM6 per the original note */
    g_timeout = 0;

    arm_cfft_radix4_f32(&scfft, fft_inputbuf);  /* radix-4 FFT on fft_inputbuf */

    /* Elapsed ticks = current count + 65536 per recorded overflow.
       The prints divide by 10 to get ms, i.e. they treat one tick as 0.1 ms
       - presumably an 84 MHz timer clock / 8400; confirm against the timer setup. */
    time = __HAL_TIM_GET_COUNTER(&g_timx_handle) + (uint32_t)g_timeout * 65536;
    printf("%0.1fms\r\n", time / 10);

    arm_cmplx_mag_f32(fft_inputbuf, fft_outputbuf, FFT_LENGTH);  /* magnitude of each bin */

    printf("\r\n%d point FFT runtime:%0.1fms\r\n", FFT_LENGTH, time / 10);
//    printf("FFT Result:\r\n");
//    for (i = 0; i < FFT_LENGTH; i++)
//    {
//        printf("fft_outputbuf[%d]:%f\r\n", i, fft_outputbuf[i]);
//    }
#endif

#if defined(__CC_ARM)
    /* Only Arm Compiler 5 (armcc) defines __CC_ARM */
    printf("__CC_ARM:%d\n", __CC_ARM);
#endif
    printf("__FPU_PRESENT:%d\n", __FPU_PRESENT);
    printf("__FPU_USED:%d\n", __FPU_USED);
    /* %x expects unsigned int; the register is read as a 32-bit value */
    printf("SCB->CPACR:0x%x\n", (unsigned int)SCB->CPACR);

    while (1)
    {
        ;
    }

    return 0;
}
打印:大概是0.6ms完成一个1024点的复数FFT运算
[23:39:30.141] 0.6ms
[23:39:30.141]
[23:39:30.141] 1024 point FFT runtime:0.0ms
[23:39:30.172] __CC_ARM:1
[23:39:30.188] __FPU_PRESENT:1
[23:39:30.204] __FPU_USED:1
[23:39:30.220] SCB->CPACR:0xf00000
编译出现一堆错:ArmClang: error: unsupported option '--C99'
将--C99改为-xc -std=c99即可:
更改如下:
注:
(1)如果要用V5编译器,则该选项要改回--C99
(2)V6版本的编译器对浮点运算做了优化,将优化等级配置为fast mode后,速度相对V5版本有明显提升,如下:
AC6版本编译器配置
基于上面的实验2:运行结果用了0.5ms
[10:47:32.200] test_fft()->times: 0.500000ms
[10:47:32.232] __FPU_PRESENT:1
[10:47:32.248] __FPU_USED:1
[10:47:32.264] SCB->CPACR:0xf00000
关于MDK5的AC5,AC6编译器对比,可以参考这个论坛:
https://www.armbbs.cn/forum.php?mod=viewthread&tid=95455
测试代码:main.c
/**
******************************************************************************
* @file UART/UART_TwoBoards_ComPolling/Src/main.c
* @author MCD Application Team
* @brief This sample code shows how to use STM32F4xx UART HAL API to transmit
* and receive a data buffer with a communication process based on
* polling transfer.
* The communication is done using 2 Boards.
******************************************************************************
* @attention
*
* Copyright (c) 2017 STMicroelectronics.
* All rights reserved.
*
* This software is licensed under terms that can be found in the LICENSE file
* in the root directory of this software component.
* If no LICENSE file comes with this software, it is provided AS-IS.
*
******************************************************************************
*/
/* Includes ------------------------------------------------------------------*/
#include "main.h"
/** @addtogroup STM32F4xx_HAL_Examples
* @{
*/
/** @addtogroup UART_TwoBoards_ComPolling
* @{
*/
/* Private typedef -----------------------------------------------------------*/
/* Private define ------------------------------------------------------------*/
/* NOTE(review): TRANSMITTER_BOARD is not referenced by any code visible in
   this listing; it looks like a leftover from the UART example named in the
   file header - confirm before removing. */
#define TRANSMITTER_BOARD
/* Private function prototypes -----------------------------------------------*/
static void SystemClock_Config(void);
static void Error_Handler(void);
/* Private functions ---------------------------------------------------------*/
/* Timer handle used for the elapsed-time measurements; defined outside this listing. */
extern TIM_HandleTypeDef g_timx_handle;
/* Scaled by 65536 and added to the timer count when elapsed time is computed;
   presumably bumped by the timer overflow ISR - confirm. */
uint8_t g_timeout;
/* Running sum accumulated and printed by test_fpu_tmp1(). */
float re_dat;
/* Operands multiplied together in test_fpu_tmp1(); each grows by 0.1f per step. */
float a=0.14f;
float b=0.26f;
/**
 * @brief  Time a short float multiply-accumulate workload and print the result.
 *
 *         Runs 10000 x 2 steps; each step adds a * b to re_dat and then
 *         advances both a and b by 0.1f. The elapsed timer count and the
 *         final re_dat are printed.
 *
 * @retval None
 */
void test_fpu_tmp1(void)
{
    long outer, inner;
    float product;
    float elapsed;

    /* Start the timing base (TIM6 per the original note) and clear both the
       counter and the software overflow count */
    btim_timx_int_init(65535, 8400 - 1);
    __HAL_TIM_SET_COUNTER(&g_timx_handle, 0);
    g_timeout = 0;

    for (outer = 0; outer < 10000; ++outer)
    {
        for (inner = 0; inner < 2; ++inner)
        {
            product = a * b;
            re_dat += product;
            a += 0.1f;
            b += 0.1f;
        }
    }

    /* Elapsed ticks = current count + 65536 per recorded overflow;
       the print divides by 10 and labels the value as ms */
    elapsed = __HAL_TIM_GET_COUNTER(&g_timx_handle) + (uint32_t)g_timeout * 65536;
    printf("%s()->times: %fms\r\n", __func__, elapsed / 10);

    btim_timx_int_deinit(65535, 8400 - 1);

    printf("re_dat:%f\n", re_dat);
}
/* NOTE(review): this span is damaged and does not compile as written.
 * fft_priv_test() starts below, but the line beginning "for (j=0; j" runs
 * straight into the tail of a timing printf, and the "#else"/"#endif" that
 * follow have no visible "#if". After that, the remaining lines are the end
 * of a sin_cos_test()-style routine (they use angle, times, cosx, sinx and
 * result, none of which is declared in the visible text). The missing parts
 * must be restored from the original project; nothing here was reconstructed.
 */
complex INPUT_SEQ[FFT_LEN], RES_SEQ[FFT_LEN], OUTPUT_SEQ[FFT_LEN];
float SEQ_DAT[FFT_LEN], dataR[FFT_LEN], dataI[FFT_LEN];
int fft_priv_test(void)
{
int i, j;
float time;
// build the real-valued sequence
for (i=0; i < FFT_LEN; i++) {
SEQ_DAT[i]=i+0.0f;
}
// build the imaginary-valued sequence
for (j=0; jtimes: %fms\r\n", __func__, time / 10);
btim_timx_int_deinit(65535, 8400 - 1);
#else
HAL_Delay(1000);
#endif
// for (i=0; i DELTA)return 0XFF; /* check failed */
angle += 0.001f; /* advance the angle */
}
}
else
{
printf("use DSP\n");
for (i = 0; i < times; i++)
{
cosx = arm_cos_f32(angle); /* DSP-library sin/cos functions */
sinx = arm_sin_f32(angle);
result = sinx * sinx + cosx * cosx; /* result should equal 1 */
result = fabsf(result - 1.0f); /* difference from 1 */
if (result > DELTA)return 0XFF; /* check failed */
angle += 0.001f; /* advance the angle */
}
}
/* compute the elapsed time */
time =__HAL_TIM_GET_COUNTER(&g_timx_handle) + (uint32_t)g_timeout * 65536;
printf("%s()->times: %fms\r\n", __func__, time / 10);
btim_timx_int_deinit(65535, 8400 - 1);
return 0; /* done */
}
/******************************************************************/
/* FFT length in complex points; per the original note it defaults to 1024
 * when not specified otherwise.
 * Selectable lengths (per the original note): 16, 64, 256, 1024.
 */
#define FFT_LENGTH 1024
float fft_inputbuf[FFT_LENGTH * 2]; /* FFT input, interleaved: [2*i] real part, [2*i + 1] imaginary part */
float fft_outputbuf[FFT_LENGTH]; /* FFT output buffer; only referenced by the commented-out magnitude/dump code in test_fft() */
/**
 * @brief  Time one FFT_LENGTH-point radix-4 complex FFT and print the duration.
 *
 *         Fills fft_inputbuf with a test signal (a constant 100 plus three
 *         tones, imaginary parts zero), runs the transform on that buffer
 *         between two timer readings, and prints the elapsed count.
 *
 * @retval None
 */
void test_fft(void)
{
    arm_cfft_radix4_instance_f32 fft;
    float ticks;
    int k;

    /* Set up the FFT instance: forward transform (ifftFlag = 0) with
       bit reversal enabled (bitReverseFlag = 1) */
    arm_cfft_radix4_init_f32(&fft, FFT_LENGTH, 0, 1);

    /* Build the input sequence */
    for (k = 0; k < FFT_LENGTH; ++k)
    {
        float tone1 = arm_sin_f32(2 * PI * k / FFT_LENGTH);
        float tone4 = arm_sin_f32(2 * PI * k * 4 / FFT_LENGTH);
        float tone8 = arm_cos_f32(2 * PI * k * 8 / FFT_LENGTH);

        fft_inputbuf[2 * k]     = 100 + 10 * tone1 + 30 * tone4 + 50 * tone8; /* real part */
        fft_inputbuf[2 * k + 1] = 0;                                          /* imaginary part */
    }

    /* Start the timing base (TIM6 per the original note) and clear both the
       counter and the software overflow count */
    btim_timx_int_init(65535, 8400 - 1);
    __HAL_TIM_SET_COUNTER(&g_timx_handle, 0);
    g_timeout = 0;

    /* The measured section: radix-4 FFT on fft_inputbuf */
    arm_cfft_radix4_f32(&fft, fft_inputbuf);

    /* Elapsed ticks = current count + 65536 per recorded overflow;
       the print divides by 10 and labels the value as ms */
    ticks = __HAL_TIM_GET_COUNTER(&g_timx_handle) + (uint32_t)g_timeout * 65536;
    printf("%s()->times: %fms\r\n", __func__, ticks / 10);

    /* Disabled: magnitude computation and per-bin dump */
//    arm_cmplx_mag_f32(fft_inputbuf, fft_outputbuf, FFT_LENGTH);
//
//    printf("\r\n%d point FFT runtime:%0.1fms\r\n", FFT_LENGTH, time / 10);
//    printf("FFT Result:\r\n");
//
//    for (i = 0; i < FFT_LENGTH; i++)
//    {
//        printf("fft_outputbuf[%d]:%f\r\n", i, fft_outputbuf[i]);
//    }

    btim_timx_int_deinit(65535, 8400 - 1);
}
/******************************************************************/
/**
 * @brief  Entry point of the test listing: bring-up, run the selected test,
 *         print FPU diagnostics, then idle.
 *
 *         The test to run is chosen by (un)commenting calls inside the
 *         "#if 1" section; only test_fft() is active.
 *
 * @retval int  never reached in practice: the function ends in an endless
 *              loop and the trailing return only satisfies the signature.
 */
int main(void)
{
    uint8_t res;  /* only used by the commented-out test calls below */

    /* STM32F4xx HAL library initialization:
         - Configure the Flash prefetch, instruction and Data caches
         - Configure the Systick to generate an interrupt each 1 msec
         - Set NVIC Group Priority to 4
         - Global MSP (MCU Support Package) initialization
       */
    HAL_Init();

    /* Configure the system clock to 168 MHz */
    SystemClock_Config();

    /* UART2 init: only the TX direction is used (per the original note) */
    if (uart2_init(9600) != 0)
    {
        Error_Handler();
    }

#if 1
    test_fft();
//    res=sin_cos_test(PI / 6, 200000, 0);
//    res=sin_cos_test(PI / 6, 200000, 1);
//    res=fft_priv_test();
//    test_fpu_tmp1();
#endif

    /* The __CC_ARM print is left disabled in this listing */
    //printf("__CC_ARM:%d\n", __CC_ARM);
    printf("__FPU_PRESENT:%d\n", __FPU_PRESENT);
    printf("__FPU_USED:%d\n", __FPU_USED);
    printf("SCB->CPACR:0x%x\n", SCB->CPACR);

    for (;;)
    {
        ;
    }

    return 0;
}
/**
 * @brief  Switch the system clock to the HSE-driven PLL.
 *
 *         Values programmed by this function:
 *            System clock source            = PLL (HSE)
 *            AHB prescaler                  = 1
 *            APB1 prescaler                 = 4
 *            APB2 prescaler                 = 2
 *            PLL_M / PLL_N / PLL_P / PLL_Q  = 8 / 336 / 2 / 7
 *            Main regulator output voltage  = Scale1 mode
 *            Flash latency (WS)             = 5
 *
 *         With the 8 MHz HSE and 3.3 V VDD stated by the original header,
 *         these give SYSCLK = HCLK = 168 MHz.
 *
 *         Any HAL failure is routed to Error_Handler().
 *
 * @param  None
 * @retval None
 */
static void SystemClock_Config(void)
{
    RCC_OscInitTypeDef osc_cfg;
    RCC_ClkInitTypeDef clk_cfg;

    /* Power controller clock must be running before the regulator is touched */
    __HAL_RCC_PWR_CLK_ENABLE();

    /* Voltage scaling trades power for maximum frequency; see the product
       datasheet for the value that matches the target system frequency. */
    __HAL_PWR_VOLTAGESCALING_CONFIG(PWR_REGULATOR_VOLTAGE_SCALE1);

    /* Oscillator: HSE on, PLL on and sourced from HSE */
    osc_cfg.OscillatorType = RCC_OSCILLATORTYPE_HSE;
    osc_cfg.HSEState       = RCC_HSE_ON;
    osc_cfg.PLL.PLLSource  = RCC_PLLSOURCE_HSE;
    osc_cfg.PLL.PLLState   = RCC_PLL_ON;
    osc_cfg.PLL.PLLM       = 8;
    osc_cfg.PLL.PLLN       = 336;
    osc_cfg.PLL.PLLP       = RCC_PLLP_DIV2;
    osc_cfg.PLL.PLLQ       = 7;

    if (HAL_RCC_OscConfig(&osc_cfg) != HAL_OK)
    {
        Error_Handler();
    }

    /* Bus clocks: PLL as SYSCLK, then the HCLK / PCLK1 / PCLK2 dividers */
    clk_cfg.ClockType      = (RCC_CLOCKTYPE_SYSCLK | RCC_CLOCKTYPE_HCLK |
                              RCC_CLOCKTYPE_PCLK1 | RCC_CLOCKTYPE_PCLK2);
    clk_cfg.SYSCLKSource   = RCC_SYSCLKSOURCE_PLLCLK;
    clk_cfg.AHBCLKDivider  = RCC_SYSCLK_DIV1;
    clk_cfg.APB1CLKDivider = RCC_HCLK_DIV4;
    clk_cfg.APB2CLKDivider = RCC_HCLK_DIV2;

    if (HAL_RCC_ClockConfig(&clk_cfg, FLASH_LATENCY_5) != HAL_OK)
    {
        Error_Handler();
    }

    /* STM32F405x/407x/415x/417x Revision Z devices: prefetch is supported */
    if (HAL_GetREVID() == 0x1001)
    {
        __HAL_FLASH_PREFETCH_BUFFER_ENABLE();
    }
}
/**
 * @brief  UART error callback: signal a transfer error by lighting LED3.
 * @param  UartHandle: UART handle (not used by this implementation)
 * @note   Minimal error report; replace with an application-specific
 *         implementation if more is needed.
 * @retval None
 */
void HAL_UART_ErrorCallback(UART_HandleTypeDef *UartHandle)
{
    (void)UartHandle;

    /* LED3 on: error in the reception/transmission process */
    BSP_LED_On(LED3);
}
/**
 * @brief  Fatal-error trap: light LED5 and stay here forever.
 * @param  None
 * @retval None (does not return)
 */
static void Error_Handler(void)
{
    /* LED5 on marks the error state */
    BSP_LED_On(LED5);

    for (;;)
    {
    }
}
#ifdef USE_FULL_ASSERT
/**
 * @brief  Called when an assert_param check fails; traps in an endless loop.
 * @param  file: pointer to the name of the source file where the check failed
 * @param  line: line number of the failed check
 * @retval None (does not return)
 */
void assert_failed(uint8_t* file, uint32_t line)
{
    /* Neither argument is reported here. A user implementation could do so,
       ex: printf("Wrong parameters value: file %s on line %d\r\n", file, line) */
    (void)file;
    (void)line;

    /* Infinite loop */
    for (;;)
    {
    }
}
#endif
/**
* @}
*/
/**
* @}
*/
下一篇: