pcen论文《Trainable Frontend For Robust and Far-Field Keyword Spotting》:https://arxiv.org/pdf/1607.05666.pdf
Mel频率倒谱系数(Mel-Frequency Cepstral Coefficients,缩写为MFCC)是一种在自动语音识别和说话人识别中广泛使用的特征。
用录音设备录制一段模拟语音信号后,经由自定的取样频率(如8000 Hz、16000 Hz等)采样后转换(A/D)为数字语音信号。由于在时域(time domain)上语音信号的波形变化相当快速、不易观察,因此一般都会在频域(frequency domain)上来观察,其频谱是随着时间而缓慢变化的,因此通常可以假设在一较短时间中,其语音信号的特性是稳定的,通常我们定义这个较短时间为一帧(frame),根据人的语音的音调周期值的变化,一般取10~20ms。
基于:Ubuntu 16.04LTS,Core-i7 8700,PyCharm
# Audio front-end settings. The extraction functions read values with these
# names through `hp.<name>` -- presumably this block is that module; confirm.

# Sample rate in Hz, used for the keda / thchs30 / aishell corpora
# (22050 was the earlier value).
sr = 16000

# Framing: shift and length are specified in seconds, then converted to samples.
frame_shift = 0.05
frame_length = 0.1
hop_length = int(sr * frame_shift)
win_length = int(sr * frame_length)
n_fft = 2048  # FFT points (samples)

# Mel filterbank.
n_mels = 80  # Number of Mel banks to generate

# Spectrogram inversion.
power = 1.2  # Exponent for amplifying the predicted magnitude
n_iter = 50  # Number of inversion iterations

# Pre-emphasis coefficient (or None).
preemphasis = 0.97

# Decibel normalization bounds.
max_db = 100
ref_db = 20
# Log-mel feature extraction
def get_spectrograms(fpath, use_path=True):
    '''Return normalized log-mel and log-magnitude spectrograms.

    Args:
      fpath: Full path of a sound file when `use_path` is true; otherwise
        the waveform itself (a 1-D array, taken to be sampled at `hp.sr`).
      use_path: Whether `fpath` must be loaded from disk with librosa.

    Returns:
      mel: A 2d float32 array of shape (T // hp.r, hp.r * hp.n_mels).
      mag: A 2d float32 array of shape (T // hp.r * hp.r, 1 + hp.n_fft // 2).
    '''
    # Loading sound file. Without the `else`, the loaded waveform would be
    # overwritten by the path string.
    if use_path:
        y, sr = librosa.load(fpath, sr=hp.sr)
    else:
        y, sr = fpath, hp.sr
    print("y.shape: ", y.shape)
    print("sr: ", sr)
    time1 = time.time()

    # Silence trimming (librosa.effects.trim) is not applied.

    # Pre-emphasis: first-order high-pass, keeping the first sample as is.
    y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1])

    # STFT
    linear = librosa.stft(y=y,
                          n_fft=hp.n_fft,
                          hop_length=hp.hop_length,
                          win_length=hp.win_length)

    # Magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)

    # Mel spectrogram. Keyword arguments work on both old and new librosa;
    # the positional form is rejected from librosa 0.10 on.
    mel_basis = librosa.filters.mel(sr=hp.sr, n_fft=hp.n_fft,
                                    n_mels=hp.n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, T)

    # To decibel, with a 1e-5 floor to avoid log(0)
    mel = 20 * np.log10(np.maximum(1e-5, mel))
    mag = 20 * np.log10(np.maximum(1e-5, mag))

    # Normalize into [1e-8, 1]
    mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
    mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)

    # Transpose
    mel = mel.T.astype(np.float32)  # (T, n_mels)
    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)

    # Drop trailing frames so T is a multiple of hp.r, then stack every
    # hp.r consecutive mel frames into one row.
    n_groups = len(mel) // hp.r
    mel = mel[:n_groups * hp.r].reshape([n_groups, hp.r * hp.n_mels])
    mag = mag[:len(mag) // hp.r * hp.r]

    time2 = time.time()
    print("cost time:", time2 - time1)
    return mel, mag
# PCEN-mel feature extraction
def get_pcen(fpath, use_path=True):
    '''Return the PCEN-scaled mel spectrogram of a sound.

    Args:
      fpath: Full path of a sound file when `use_path` is true; otherwise
        the waveform itself (a 1-D array, taken to be sampled at `hp.sr`).
      use_path: Whether `fpath` must be loaded from disk with librosa.

    Returns:
      The result of `librosa.pcen` on the magnitude (power=1) mel
      spectrogram, transposed so that frames come first.
    '''
    # Loading sound file. Without the `else`, the loaded waveform would be
    # overwritten by the path string.
    if use_path:
        y, sr = librosa.load(fpath, sr=hp.sr)
    else:
        y, sr = fpath, hp.sr

    # `y` is passed by keyword: librosa >= 0.10 no longer accepts it
    # positionally. win_length is not given, so librosa's default applies.
    S = librosa.feature.melspectrogram(y=y, sr=sr, power=1, n_fft=hp.n_fft,
                                       hop_length=hp.hop_length,
                                       n_mels=hp.n_mels)

    # librosa.pcen is called with its default sr / hop_length / gain / bias /
    # power on purpose of parity with the existing behaviour.
    return librosa.pcen(S).T
4.本例中的FFT运算非常慢(约160ms),后来使用的是GitHub上找到的(约6ms): https://github.com/HiFi-LoFi/AudioFFT
基于:Ubuntu 16.04LTS,Core-i7 8700,Clion
// Created by toson on 19-7-17.
// 1.基于该博客程序进行修改:https://blog.csdn.net/LiuPeiP_VIPL/article/details/81742392
// 2.根据Python平台librosa库的运算逻辑进行移植
// 3.使用NumCpp来实现Python平台的NumPy:https://github.com/dpilger26/NumCpp
// 4.本例中的FFT运算非常慢(约160ms),后来使用的是GitHub上找到的(约6ms): https://github.com/HiFi-LoFi/AudioFFT
// 5.后来经过验证发现NumCpp效率比较低,于是使用opencv来实现矩阵运算。
// 6.后来优化使用pcen来实现mel特征提取,其中使用了IIR滤波器:https://blog.csdn.net/liyuanbhu/article/details/38849897
#pragma once
#include "utils/AudioFFT.hpp"
#include "opencv2/opencv.hpp"
#include "iir_filter.hpp"
#include "sas_util.h"
// Front-end configuration and lazily-initialised caches shared by the functions below.
int nSamplesPerSec = 16000; // Sample rate in samples per second (keda, thchs30, aishell)
int length_DFT = 2048; // Number of FFT points (samples)
int hop_length = int(0.05 * nSamplesPerSec); // Hop size: offset of the next frame relative to the current one
int win_length = int(0.1 * nSamplesPerSec); // Frame length: 0.1 s worth of samples at the sample rate above
int number_filterbanks = 80; // Number of Mel banks to generate
float preemphasis = 0.97; // Pre-emphasis (high-pass filter) coefficient
int max_db = 100;
int ref_db = 20;
int r = 1; // r = 1 in the Python (librosa-based) version; its effect has not been analysed in depth yet
double pi = 3.14159265358979323846;
// NOTE(review): the element types of the three declarations below are missing in this copy.
// Usage elsewhere in the file suggests double for mel_basis and float for hannWindow;
// the class held by `filter` comes from iir_filter.hpp -- confirm all three.
// Mel filterbank; filled by log_mel()/pcen() on first use when empty.
cv::Mat_ mel_basis;
// Analysis window; filled by MagnitudeSpectrogram() on first use when empty.
cv::Mat_ hannWindow;
// IIR smoother; created by pcen() on first use.
std::shared_ptr filter;
// Convert a frequency in Hz to mels.
//
// frequencies: frequency in Hz (scalar).
// htk:         when true, use the HTK formula 2595 * log10(1 + f / 700);
//              otherwise use the piecewise scale ported from librosa:
//              linear below 1000 Hz, logarithmic from 1000 Hz upwards.
// Returns the mel value.
double hz_to_mel(double frequencies, bool htk = false) {
    if (htk) {
        return 2595.0 * log10(1.0 + frequencies / 700.0);
    }

    // Linear part
    const double f_min = 0.0;
    const double f_sp = 200.0 / 3;

    // Log-scale part
    const double min_log_hz = 1000.0;                        // beginning of log region (Hz)
    const double min_log_mel = (min_log_hz - f_min) / f_sp;  // same (Mels)
    const double logstep = log(6.4) / 27.0;                  // step size for log region

    if (frequencies >= min_log_hz) {
        return min_log_mel + log(frequencies / min_log_hz) / logstep;
    }
    return (frequencies - f_min) / f_sp;
}
// Convert mel bin numbers to frequencies in Hz (inverse of hz_to_mel).
//
// mels: matrix of mel values; every element is converted.
// htk:  when true, use the HTK formula 700 * (10^(m / 2595) - 1);
//       otherwise use the piecewise scale ported from librosa.
// Returns a matrix of the same shape holding frequencies in Hz.
cv::Mat_<double> mel_to_hz(cv::Mat_<double> mels, bool htk = false) {
    // Linear scale
    const double f_min = 0.0;
    const double f_sp = 200.0 / 3;

    // Nonlinear scale
    const double min_log_hz = 1000.0;                        // beginning of log region (Hz)
    const double min_log_mel = (min_log_hz - f_min) / f_sp;  // same (Mels)
    const double logstep = log(6.4) / 27.0;                  // step size for log region

    cv::Mat_<double> freqs(mels.rows, mels.cols);
    for (int row = 0; row < mels.rows; row++) {
        for (int col = 0; col < mels.cols; col++) {
            const double m = mels(row, col);
            if (htk) {
                freqs(row, col) = 700.0 * (pow(10.0, m / 2595.0) - 1.0);
            } else if (m >= min_log_mel) {
                freqs(row, col) = cv::exp((m - min_log_mel) * logstep) * min_log_hz;
            } else {
                freqs(row, col) = m * f_sp + f_min;
            }
        }
    }
    return freqs;
}
// Build an arithmetic sequence as a 1 x length row vector, like np.linspace
// with endpoint=True.
//
// min_, max_: first and last value of the sequence.
// length:     number of samples. length <= 0 yields an empty matrix and
//             length == 1 yields {min_} (avoids dividing by length - 1 == 0).
cv::Mat_<double> cvlinspace(double min_, double max_, int length) {
    if (length <= 0) {
        return cv::Mat_<double>();
    }
    cv::Mat_<double> sequence(1, length);
    if (length == 1) {
        sequence(0, 0) = min_;
        return sequence;
    }
    const double step = (max_ - min_) / (length - 1);
    for (int i = 0; i < length; i++) {
        sequence(0, i) = step * i + min_;
    }
    return sequence;
}
// Create a filterbank matrix that combines FFT bins into mel-frequency bins
// (port of the Slaney-style, non-HTK path of librosa.filters.mel).
//
// nps:    sample rate in Hz; the bank spans 0 .. nps / 2.
// n_fft:  number of FFT points.
// n_mels: number of mel bands.
// Returns an n_mels x (1 + n_fft / 2) matrix of triangular filter weights.
cv::Mat_<double> mel_spectrogram_create(int nps, int n_fft, int n_mels) {
    const double f_min = 0;
    const double f_max = nps / 2.0;
    const int n_fft_2 = 1 + n_fft / 2;

    // Initialize the weights
    cv::Mat_<double> weights(n_mels, n_fft_2, 0.0);

    // Center freqs of each FFT bin
    cv::Mat_<double> fftfreqs = cvlinspace(f_min, f_max, n_fft_2);

    // 'Center freqs' of mel bands - uniformly spaced (in mels) between limits
    const double min_mel = hz_to_mel(f_min, false);
    const double max_mel = hz_to_mel(f_max, false);
    cv::Mat_<double> mel_f = mel_to_hz(cvlinspace(min_mel, max_mel, n_mels + 2), false);

    for (int i = 0; i < n_mels; i++) {
        // Band i is a triangle over [left, right] peaking at center.
        const double left = mel_f(0, i);
        const double center = mel_f(0, i + 1);
        const double right = mel_f(0, i + 2);
        const double rise = center - left;   // fdiff[i]
        const double fall = right - center;  // fdiff[i + 1]
        // Slaney-style mel is scaled to be approx constant energy per channel
        const double enorm = 2.0 / (right - left);

        for (int j = 0; j < n_fft_2; j++) {
            const double f = fftfreqs(0, j);
            // lower and upper slopes ...
            const double lower = (f - left) / rise;
            const double upper = (right - f) / fall;
            // .. then intersect them with each other and zero
            double w = lower < upper ? lower : upper;
            if (w < 0.0) {
                w = 0.0;
            }
            weights(i, j) = w * enorm;
        }
    }
    return weights;
}
// Magnitude of the short-time Fourier transform (STFT), following librosa's
// defaults center=True, pad_mode='reflect' and a Hann-shaped window.
//
// emphasis_data: 1 x N row vector of float samples.
// n_fft:         FFT size; AudioFFT expects a power of two.
// hop_length:    samples between successive frames; 0 means win_length / 4.
// win_length:    window length, at most n_fft; 0 means n_fft. A shorter
//                window is centred inside the n_fft-long frame.
// Returns a (n_fft / 2 + 1) x frames matrix of magnitudes (double), or an
// empty matrix when the arguments are invalid.
cv::Mat_<double> MagnitudeSpectrogram(const cv::Mat_<float> *emphasis_data, int n_fft = 2048, int hop_length = 0,
                                      int win_length = 0) {
    if (win_length == 0) {
        win_length = n_fft;
    }
    if (hop_length == 0) {
        hop_length = win_length / 4;
    }

    // Validate before touching the cached window, so that a failed call can
    // never leave a half-built (all-zero) window behind for later calls.
    if (emphasis_data == nullptr || emphasis_data->empty() || emphasis_data->rows != 1) {
        std::cout << "\temphasis_data must be a non-empty 1 x N row vector" << std::endl;
        return cv::Mat_<double>();
    }
    if (n_fft <= 0 || win_length <= 0 || hop_length <= 0) {
        std::cout << "\tn_fft, win_length and hop_length must be positive" << std::endl;
        return cv::Mat_<double>();
    }
    if (win_length > n_fft) {
        std::cout << "\twin_length:" << win_length << " > n_fft:" << n_fft << std::endl;
        return cv::Mat_<double>();
    }

    // Reflect padding of n_fft / 2 samples on both sides, done with
    // copyMakeBorder.
    const int pad_length = n_fft / 2;
    cv::Mat_<float> cv_padbuffer;
    cv::copyMakeBorder(*emphasis_data, cv_padbuffer, 0, 0, pad_length, pad_length, cv::BORDER_REFLECT_101);

    // Windowing: each frame is multiplied by a Hann-shaped window to improve
    // continuity at the frame edges. The window is cached globally and rebuilt
    // whenever the requested sizes differ from the cached ones.
    // NOTE(review): this formula samples a symmetric Hann without its zero
    // endpoints; librosa's default window is the periodic Hann
    // 0.5 - 0.5 * cos(2 * pi * n / N). Kept unchanged to preserve output.
    static int cached_win_length = 0;
    if (hannWindow.empty() || hannWindow.cols != n_fft || cached_win_length != win_length) {
        cv::Mat_<float> window(1, n_fft, 0.0f);
        const int insert_cnt = (n_fft - win_length) / 2;
        for (int k = 1; k <= win_length; k++) {
            window(0, k - 1 + insert_cnt) = float(0.5 * (1 - cos(2 * pi * k / (win_length + 1))));
        }
        hannWindow = window;
        cached_win_length = win_length;
    }

    const int size = cv_padbuffer.rows * cv_padbuffer.cols;
    const int number_feature_vectors = (size - n_fft) / hop_length + 1;
    const int number_coefficients = n_fft / 2 + 1;
    cv::Mat_<float> feature_vector(number_feature_vectors, number_coefficients, 0.0f);

    // Set up the FFT once, outside the loop, for best speed.
    audiofft::AudioFFT fft;
    fft.init(size_t(n_fft));

    // Real (Xrf) and imaginary (Xif) parts of one frame's spectrum.
    cv::Mat_<float> Xrf(1, number_coefficients);
    cv::Mat_<float> Xif(1, number_coefficients);

    for (int frame = 0; frame < number_feature_vectors; frame++) {
        const int start = frame * hop_length;
        // Take one frame and apply the window (the product is a fresh buffer).
        cv::Mat_<float> framef = cv_padbuffer.colRange(start, start + n_fft).mul(hannWindow);

        fft.fft((float *) (framef.data), (float *) (Xrf.data), (float *) (Xif.data));

        // Modulus sqrt(re^2 + im^2), written straight into this frame's row.
        cv::Mat_<float> cv_feature = feature_vector.row(frame);
        cv::magnitude(Xrf, Xif, cv_feature);
    }

    // frames x bins -> bins x frames, then widen to double.
    cv::Mat_<float> cv_mag;
    cv::transpose(feature_vector, cv_mag);
    cv::Mat_<double> mag;
    cv_mag.convertTo(mag, CV_64FC1);
    return mag;
}
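/**
 * Name:    log_mel
 * Purpose: takes raw audio data and returns the features extracted with the log-mel method.
 * Params:  @ifile_data        input audio data
 *          @nSamples_per_sec  audio sample rate
 * Returns: cv::Mat_ feature data
 */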
// Extracts normalized log-mel features from raw audio, mirroring the Python get_spectrograms().
// NOTE(review): the element types of std::vector / cv::Mat_ are missing in this copy, and the
// closing braces of the blocks below are not present either -- restore before compiling.
cv::Mat_ log_mel(std::vector &ifile_data, int nSamples_per_sec) {
// Only the globally configured sample rate is accepted.
if (nSamples_per_sec != nSamplesPerSec) {
std::cout << R"(the "nSamples_per_sec" is not 16000.)" << std::endl;
return cv::Mat_(nullptr);
// NOTE(review): size() / 4 together with the (float *) casts below suggests that ifile_data is a
// byte buffer holding 32-bit floats -- confirm against the caller.
int ifile_length = int(ifile_data.size() / 4);
// Pre-emphasis (high-pass filtering): out[i] = x[i] - preemphasis * x[i - 1].
// d1 views samples 1..N-1 and d2 views samples 0..N-2 of the input buffer without copying.
// NOTE(review): fewer than 2 samples gives a non-positive column count here.
cv::Mat_ d1(1, ifile_length - 1, (float *) (ifile_data.data()) + 1);
cv::Mat_ d2(1, ifile_length - 1, (float *) (ifile_data.data()));
cv::Mat_ cv_emphasis_data;
// NOTE(review): the first output sample is 0 here, whereas the Python version keeps y[0].
cv::hconcat(cv::Mat_::zeros(1, 1), d1 - d2 * preemphasis, cv_emphasis_data);
// Magnitude spectrogram
auto mag = MagnitudeSpectrogram(&cv_emphasis_data, length_DFT, hop_length, win_length);
mag = cv::abs(mag);
// Build the mel filterbank once and cache it in the global mel_basis (about 3 ms per the author).
if (mel_basis.empty()) {
mel_basis = mel_spectrogram_create(nSamplesPerSec, length_DFT, number_filterbanks);
// Matrix product of the filterbank and the magnitude spectrogram
cv::Mat cv_mel = mel_basis * mag;
// to decibel
//mel = 20 * np.log10(np.maximum(1e-5, mel))
//mag = 20 * np.log10(np.maximum(1e-5, mag))
// Implemented with OpenCV: floor at 1e-5, then natural log
cv::log(cv::max(cv_mel, 1e-5), cv_mel);
// OpenCV has no log10(), so log(x) / log(10) is used instead.
cv_mel = cv_mel / 2.3025850929940459 * 20; // 2.3025850929940459=log(10)
// normalize
//mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
//mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
//cv::normalize(cv_mel, cv_mel, 1e-8, 1.0, cv::NORM_MINMAX); // cv::normalize cannot express this clip
cv_mel = (cv_mel - ref_db + max_db) / max_db;
cv_mel = cv::max(cv::min(cv_mel, 1.0), 1e-8);
// Transpose
//mel = mel.T.astype(np.float32)
//mag = mag.T.astype(np.float32)
// Done with OpenCV's transpose, then converted to 32-bit float
cv::Mat cv_mel_r;
cv::transpose(cv_mel, cv_mel_r);
cv_mel_r.convertTo(cv_mel_r, CV_32FC1);
if (r == 1) {
// The original formula is:
// mel = mel[:len(mel) // hp.r * hp.r].reshape([len(mel) // hp.r, hp.r * hp.n_mels])
// With r = 1 it leaves every value unchanged, so nothing is done here.
} else {
// Any other r is only reported; no regrouping of frames is performed.
std::cout << R"(the "r" is not 1.)" << std::endl;
// Return the mel feature matrix
return cv_mel_r;
/**--------------------------------- 以下是pcen运算方法 ---------------------------------**/
// Intended port of scipy.signal.lfilter_zi().
// NOTE(review): unfinished. The visible code only checks, normalizes and zero-pads the coefficient
// vectors, then returns a matrix built from nullptr; no initial filter state is computed. Its only
// call site in this file is commented out inside pcen(). Element types and closing braces are
// missing in this copy.
cv::Mat_ cvlfilter_zi(cv::Mat_ b, cv::Mat_ a) {
// Both coefficient sets must be row vectors; a violation is only reported.
if ((b.rows != 1) || (a.rows != 1)) {
std::cout << "Numerator b and Denominator a must be 1-D." << std::endl;
if (a(0, 0) != 1) {
// Normalize the coefficients so a[0] == 1.
b = b / a(0, 0);
// NOTE(review): a is divided by a(0, 0) only after b has been scaled -- keep this order.
a = a / a(0, 0);
int len_a = a.cols * a.rows;
int len_b = b.cols * b.rows;
// n is the longer of the two lengths; the shorter vector is right-padded with zeros to n.
int n = len_a > len_b ? len_a : len_b;
if (len_a < n) {
cv::hconcat(a, cv::Mat_::zeros(1, n - len_a), a);
} else if (len_b < n) {
cv::hconcat(b, cv::Mat_::zeros(1, n - len_b), b);
return cv::Mat_(nullptr);
// Intended port of scipy.signal.lfilter():
// Filter data along one-dimension with an IIR or FIR filter.
// NOTE(review): unfinished stub. Both branches contain only comments and no return statement is
// visible, although the function is declared to return a matrix. Its only call site in this file
// is commented out inside pcen(). Element types and closing braces are missing in this copy.
cv::Mat_ cvlfilter(cv::Mat_ &b, cv::Mat_ &a, cv::Mat_ &x,
cv::Mat_ &zi, int axis = -1) {
// Single-coefficient denominator.
if (a.rows * a.cols == 1) {
// This path only supports types fdgFDGO to mirror _linear_filter below.
// Any of b, a, x, or zi can set the dtype, but there is no default
// casting of other types; instead a NotImplementedError is raised.
// TODO: fill in later if it turns out to be needed
} else {
// return sigtools._linear_filter(b, a, x, axis, zi)
// sigtools._linear_filter()
// (y,Vf) = _linear_filter(b,a,X,Dim=-1,Vi=None) implemented using Direct Form II transposed flow diagram.
// If Vi is not given, Vf is not returned.
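/**
 * Name:    pcen
 * Purpose: takes raw audio data and returns the features extracted with the PCEN method.
 * Params:  @ifile_data        input audio data
 *          @nSamples_per_sec  audio sample rate
 * Returns: cv::Mat_ feature data
 */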
// Extracts PCEN (per-channel energy normalization) mel features from raw audio, mirroring the
// Python get_pcen(). No pre-emphasis is applied on this path.
// NOTE(review): the element types of std::vector / cv::Mat_ / std::make_shared, the closing braces
// and the #endif matching "#if 1" are missing in this copy -- restore before compiling.
cv::Mat_ pcen(std::vector &ifile_data, int nSamples_per_sec) {
// Only the globally configured sample rate is accepted.
if (nSamples_per_sec != nSamplesPerSec) {
std::cout << R"(the "nSamples_per_sec" is not 16000.)" << std::endl;
return cv::Mat_(nullptr);
// NOTE(review): size() / 4 together with the (float *) cast below suggests that ifile_data is a
// byte buffer holding 32-bit floats -- confirm against the caller.
int ifile_length = int(ifile_data.size() / 4);
// Wraps the caller's buffer as a 1 x N row vector without copying.
cv::Mat_ cv_emphasis_data(1, ifile_length, (float *) (ifile_data.data()));
// Magnitude spectrogram
auto mag = MagnitudeSpectrogram(&cv_emphasis_data, length_DFT, hop_length, win_length);
mag = cv::abs(mag);
// Build the mel filterbank once and cache it in the global mel_basis (about 3 ms per the author).
if (mel_basis.empty()) {
mel_basis = mel_spectrogram_create(nSamplesPerSec, length_DFT, number_filterbanks);
// Matrix product of the filterbank and the magnitude spectrogram
cv::Mat_ mel = mel_basis * mag;
// Compute the PCEN features.
// Derivation of the smoothing coefficient, kept for reference:
// double time_constant = 0.400;
// int sr = 22050;
// int hop_length = 512;
// double t_frames = time_constant * sr / double(hop_length);
// double b = (sqrt(1 + 4 * t_frames * t_frames) - 1) / (2 * t_frames * t_frames);
// cv::Mat_ zi = (cv::Mat_(1, 1) << 0.94361056);
// cv::Mat_ in_b = (cv::Mat_(1, 1) << b);
// cv::Mat_ in_a = (cv::Mat_(1, 2) << 1, b - 1);
// cv::Mat_ zi = cvlfilter_zi(in_b, in_a);
// Second formula (temporal smoothing)
// cv::Mat_ S_smooth = cvlfilter(in_b, in_a, mel, zi);
#if 1 // IIR filter
// Create and configure the shared smoother on first use.
if (!filter) {
filter = std::make_shared();
// NOTE(review): these constants equal b and b - 1 from the derivation above with sr = 22050 and
// hop_length = 512, not this file's 16000 / 800 -- confirm that this is intended.
double iir_b[1] = {0.05638943879134889};
double iir_a[2] = {1.0, -0.9436105612086512};
filter->setPara(iir_b, 1, iir_a, 2);
// Smooth every mel band (row) along time.
// NOTE(review): one filter object is reused for all rows and across calls; whether its internal
// state is reset between uses depends on iir_filter.hpp -- confirm.
cv::Mat_ S_smooth = cv::Mat_(mel.rows, mel.cols);
for (int i = 0; i < mel.rows; i++) {
filter->filter(mel[i], S_smooth[i], mel.cols);
// First formula (gain control and root compression)
double gain = 0.98;
double bias = 2.0;
double power = 0.5;
double eps = 1e-6;
//python: smooth = np.exp(-gain * (np.log(eps) + np.log1p(S_smooth / eps)))
cv::Mat_ S_smooth_log1p;
cv::log(S_smooth / eps + 1, S_smooth_log1p);
cv::Mat_ smooth;
cv::exp((S_smooth_log1p + cv::log(eps)) * (-gain), smooth);
//python: S_out = (bias ** power) * np.expm1(power * np.log1p(ref * smooth / bias))
cv::Mat_ smooth_log1p;
cv::Mat_ smooth_log1p_exp;
cv::log(mel.mul(smooth) / bias + 1, smooth_log1p);
cv::exp(power * smooth_log1p, smooth_log1p_exp);
cv::Mat_ S_out = (smooth_log1p_exp - 1) * pow(bias, power);
// Transpose so that frames come first
cv::Mat_ pcen;
cv::transpose(S_out, pcen);
return pcen;
import os.path
import sys
import librosa
import numpy as np
import struct
def find_files(path):
    """Return the names of the entries directly inside a directory.

    :param path: directory to list
    :return: list of entry names (not full paths), as given by os.listdir
    """
    return os.listdir(path)
def audio48kHz_to_bin16kHz_and_save(files, in_path, out_path):
    """Resample audio files to 16 kHz and dump them as raw float32 samples.

    Each input `<in_path>/<name>` is loaded with librosa at 16000 Hz and
    written to `<out_path>/<name>.bin` as consecutive 32-bit floats in
    native byte order, with no header.

    :param files: file names, relative to in_path
    :param in_path: directory holding the input audio files
    :param out_path: existing directory that receives the .bin files
    """
    for file in files:
        in_file = os.path.join(in_path, file)
        # `sr` is passed by keyword: librosa >= 0.10 rejects it positionally.
        y, _ = librosa.load(in_file, sr=16000)  # keda, thchs30, aishell
        out_file = os.path.join(out_path, file + ".bin")
        with open(out_file, 'wb') as fp:
            # Same bytes as struct.pack("f", sample) for every sample, written
            # in one call. The packed samples must actually reach the file.
            fp.write(np.asarray(y, dtype=np.float32).tobytes())
if __name__ == '__main__':
    print("Example: $ python 48k_to_16k.py /home/toson/Downloads/sounds /home/toson/Downloads/sounds_out")
    # Both directories are required; stop with a usage message instead of an
    # IndexError when they are missing.
    if len(sys.argv) < 3:
        sys.exit("usage: {} <input_dir> <output_dir>".format(sys.argv[0]))
    in_dir, out_dir = sys.argv[1], sys.argv[2]
    print(sys.argv[0])  # like $0 in a shell: the path the script was invoked with
    print(in_dir)  # the first command-line argument: the input directory
    files = find_files(in_dir)
    audio48kHz_to_bin16kHz_and_save(files, in_dir, out_dir)