代码依赖
python3
librosa
如果需要观察特征频谱,请确保已安装 matplotlib 依赖,并取消代码中相关绘图语句的注释
注:不要修改文件默认输出test.fbank test.mfcc的文件名
mfcc.py 作业代码
test.wav 测试音频
Readme.md 说明文件
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName :MFCC_test.py
# @Time :2022/2/16 12:10
# @Author :PangXZ
import librosa
import numpy as np
from scipy.fftpack import dct
# Draw spectrogram pictures
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def plot_spectrogram(spec, note, file_name):
    """
    Render a 2-D feature matrix as a heatmap and save it to an image file.

    :param spec: 2-D array to draw; rows run along the y axis and columns
                 along the x axis, so pass a feature_dim by num_frames array
                 to get time on the horizontal axis
    :param note: text used as the y-axis label (e.g. the feature name)
    :param file_name: path of the image file to write
    :return: None
    """
    fig, ax = plt.subplots(figsize=(20, 5))
    try:
        heatmap = ax.pcolor(spec)
        fig.colorbar(mappable=heatmap, ax=ax)
        # pcolor is called without coordinates, so the x axis counts columns
        # (frames), not seconds.
        ax.set_xlabel('Time (frame index)')
        ax.set_ylabel(note)
        fig.tight_layout()
        fig.savefig(file_name)
    finally:
        # Close the figure so repeated calls do not pile up open figures.
        plt.close(fig)
def preemphasis(signal, coeff=0.97):
    """
    Apply pre-emphasis, a first-order high-pass filter:
    y[0] = x[0], y[n] = x[n] - coeff * x[n - 1].

    :param signal: speech samples (any array-like, e.g. ndarray or list)
    :param coeff: pre-emphasis coefficient; 0 leaves the signal unchanged,
                  default 0.97
    :return: flat array of emphasized samples; an empty input yields an
             empty array
    """
    signal = np.asarray(signal)
    # Slice with [:1] rather than index with [0] so an empty signal does not
    # raise IndexError.
    head = signal[:1]
    tail = signal[1:] - coeff * signal[:-1]
    # Flatten both parts before joining so the result is always 1-D.
    return np.concatenate((head.ravel(), tail.ravel()))
def enframe(signal, frame_len=400, frame_shift=160, win=None):
    """
    Split a signal into overlapping frames and apply a window to each frame.

    :param signal: 1-D speech signal (array-like)
    :param frame_len: frame length in samples
    :param frame_shift: hop between the starts of consecutive frames, in samples
    :param win: window of length frame_len; when None, a Hamming window of
                length frame_len is used
    :return: num_frames by frame_len float array of windowed frames; trailing
             samples that do not fill a whole frame are dropped, and a signal
             shorter than one frame yields an array with zero rows
    """
    signal = np.asarray(signal)
    if win is None:
        # Build the default window here so it always matches frame_len.
        win = np.hamming(frame_len)

    # Clamp at zero: a signal shorter than one frame would otherwise give a
    # negative frame count.
    num_frames = max(0, (signal.size - frame_len) // frame_shift + 1)

    # Row i of sample_idx holds the sample positions of frame i.
    starts = frame_shift * np.arange(num_frames)
    sample_idx = starts[:, np.newaxis] + np.arange(frame_len)

    frames = signal[sample_idx].astype(np.float64)
    return frames * win
def get_spectrum(frames, fft_len=512):
    """
    Compute the magnitude spectrum of each frame with an FFT.

    :param frames: framed signal, a num_frames by frame_len array
    :param fft_len: FFT length, default 512
    :return: magnitude spectrum, a num_frames by fft_len/2+1 real array
    """
    full_fft = np.fft.fft(frames, n=fft_len)
    # Keep only bins 0 .. fft_len/2 of each row.
    num_bins = fft_len // 2 + 1
    return np.abs(full_fft[:, :num_bins])
def fbank(spectrum, num_filter=23, fft_len=512, sample_size=16000):
    """
    Compute log mel filter-bank (fbank) features from a spectrum.

    :param spectrum: spectrum, a num_frames by fft_len/2+1 array
    :param num_filter: number of mel filters, default 23
    :param fft_len: FFT length the spectrum was computed with, default 512
    :param sample_size: value whose half is used as the top frequency of the
                        filter bank (i.e. the sampling rate in Hz), default 16000
    :return: fbank features, a num_frames by num_filter array holding
             20 * log10 of each filter output
    """
    half_rate = sample_size / 2
    num_bins = int(np.floor(fft_len / 2 + 1))

    # 1. Build the triangular mel filters: take num_filter + 2 edge points
    #    evenly spaced on the mel scale, convert them back to Hz, then map
    #    them to FFT bin indices.
    mel_top = 2595 * np.log10(1 + half_rate / 700)
    mel_edges = np.linspace(0, mel_top, num_filter + 2)
    hz_edges = 700 * (np.power(10., (mel_edges / 2595)) - 1)
    # NOTE(review): the scale factor is fft_len / 2 + 1, so the top edge maps
    # to index num_bins (one past the last column); the half-open slices below
    # never touch it. TODO confirm whether fft_len / 2 was intended.
    bin_edges = np.floor(hz_edges / half_rate * (fft_len / 2 + 1))

    weights = np.zeros((num_filter, num_bins))
    for idx in range(num_filter):
        left, center, right = (int(edge) for edge in bin_edges[idx:idx + 3])
        if center > left:
            # Rising slope: 0 at the left edge, approaching 1 at the center.
            rising = np.arange(left, center)
            weights[idx, left:center] = (rising - left) / (center - left)
        if right > center:
            # Falling slope: 1 at the center, approaching 0 at the right edge.
            falling = np.arange(center, right)
            weights[idx, center:right] = (right - falling) / (right - center)

    # 2. Apply every filter to every frame with a single matrix product.
    energies = np.dot(spectrum, weights.T)
    # Swap exact zeros for the float machine epsilon so log10 stays finite.
    energies = np.where(energies == 0, np.finfo(float).eps, energies)

    # 3. Take the logarithm.
    return 20 * np.log10(energies)
def mfcc(fbank, num_mfcc=22):
    """
    Derive MFCC features from fbank features with a discrete cosine transform.

    :param fbank: fbank features, one frame per row
    :param num_mfcc: number of MFCC coefficients to keep
    :return: MFCC features, one frame per row
    """
    # Orthonormal type-2 DCT along each row (i.e. per frame).
    cepstra = dct(fbank, type=2, axis=1, norm='ortho')
    # Skip coefficient 0 and keep the following num_mfcc coefficients.
    kept = slice(1, num_mfcc + 1)
    return cepstra[:, kept]
def write_file(feats, file_name):
    """
    Save a 2-D feature matrix to a text file, one bracketed row per line.

    Each row is written as "[v1 v2 ... vN ]": every value is followed by a
    single space and the line ends with "]" and a newline.

    :param feats: 2-D feature array
    :param file_name: path of the output file (overwritten if it exists)
    :raises ValueError: if feats.shape does not unpack into exactly two
                        dimensions
    """
    # Unpack the shape first so bad input fails before the file is truncated.
    num_rows, num_cols = feats.shape
    # The context manager closes the file even if a write fails part-way.
    with open(file_name, 'w') as out:
        for i in range(num_rows):
            values = ''.join(str(feats[i, j]) + ' ' for j in range(num_cols))
            out.write('[' + values + ']\n')
if __name__ == "__main__":
    # Pre-emphasis coefficient
    alpha = 0.97

    # Framing setup
    sample_size = 16000  # sampling rate fs = 16 kHz
    frame_len = 400  # samples per frame: 25 ms * 16 kHz = 400
    frame_shift = 160  # frame shift: 10 ms * 16 kHz = 160
    fft_len = 512  # FFT length, 512 = 2 ** 9

    # Mel filter-bank setup
    num_filter = 23  # number of mel filters
    num_mfcc = 12  # number of cepstral coefficients

    # Read the WAV file at the configured rate, so the frame sizes and the mel
    # filter bank above (which assume 16 kHz) always match the loaded audio.
    wav, fs = librosa.load('test.wav', sr=sample_size)

    signal = preemphasis(wav, coeff=alpha)
    frames = enframe(signal, frame_len=frame_len, frame_shift=frame_shift, win=np.hamming(M=frame_len))
    spectrum = get_spectrum(frames, fft_len=fft_len)
    fbank_feats = fbank(spectrum, num_filter=num_filter, fft_len=fft_len, sample_size=sample_size)
    mfcc_feats = mfcc(fbank_feats, num_mfcc=num_mfcc)

    # Features are num_frames by feature_dim; plot_spectrogram expects
    # feature_dim by num_frames, so transpose to put time on the x axis.
    plot_spectrogram(fbank_feats.T, 'Filter Bank', 'fbank.png')
    write_file(fbank_feats, 'test.fbank')
    plot_spectrogram(mfcc_feats.T, 'MFCC', 'mfcc.png')
    write_file(mfcc_feats, 'test.mfcc')