import os
import wavio
import numpy as np
import math
from matplotlib import pyplot as plt
from scipy.fftpack import dct
from python_speech_features import mfcc, delta, logfbank
# Feature-extraction demo: for the first file in `wavs`, compute a log-magnitude
# spectrogram, two mel filter-bank (Fbank) variants and DCT-based cepstral
# coefficients frame by frame, then plot the cepstral matrix.
# NOTE(review): `wavs` and `data_dir` are not defined in the visible part of this
# file, and the helpers used here (frames_crop, choose_windows, log_data,
# mel_filters, mel_filters_2) are defined further down -- this loop has to run
# after those definitions.
for wav in wavs:
    wav_dir = os.path.join(data_dir, wav)
    wav_data = wavio.read(wav_dir)
    data = wav_data.data
    sample_rate = wav_data.rate  # 16k according to the original author's note
    sampwidth = wav_data.sampwidth

    # Peak normalisation. Only single-channel audio is supported: the framing
    # below treats the samples as one flat signal.
    if data.ndim > 1 and data.shape[1] > 1:
        raise ValueError(
            "expected single-channel audio, got %d channels" % data.shape[1])
    # Cast to float before abs(): for fixed-width integer samples, abs() of the
    # most negative value overflows. np.max also avoids a slow Python-level scan.
    samples = data.astype(np.float64)
    norm_data = samples / np.max(np.abs(samples))

    # Split the signal into overlapping frames (one row per frame).
    frames = frames_crop(norm_data, sample_rate)

    # Window function. Its length follows the actual frame length so that the
    # element-wise product below also works for sample rates other than 16 kHz
    # (at 16 kHz this is the previously hard-coded 160).
    win = frames.shape[1]
    windows = choose_windows(name="Hamming", N=win)

    # Parameters
    N = 2048  # NFFT
    M = 40  # number of mel filters
    num_ceps = 24  # number of cepstral coefficients kept
    half = N // 2  # number of FFT bins kept per frame

    # Per-frame outputs
    spe_freqs = np.zeros((frames.shape[0], half))  # spectrogram
    fbank_feature = np.zeros((frames.shape[0], M))  # Fbank
    fbank_feature_2 = np.zeros((frames.shape[0], M))  # Fbank, second version
    mfcc_dct = np.zeros((frames.shape[0], num_ceps))  # DCT coefficients

    for i in range(frames.shape[0]):
        frames_fft = np.fft.fft(windows * frames[i], N)
        # Magnitude of the first half of the spectrum, computed once and shared
        # by the spectrogram and both filter banks.
        magnitude = np.abs(frames_fft[:half])
        spe_freqs[i][:] = log_data(magnitude)

        filter_banks = mel_filters(sample_rate=sample_rate, NFFT=N,
                                   pow_frames=magnitude, nfilt=M)
        fbank_feature[i][:] = log_data(filter_banks)

        filter_banks_2, w2 = mel_filters_2(M=M, N=N, fs=sample_rate, l=0, h=0.5,
                                           pow_frames=magnitude)
        fbank_feature_2[i][:] = log_data(filter_banks_2)

        # NOTE(review): the DCT is taken on the linear filter-bank output, not on
        # its log (fbank_feature_2[i]); conventional MFCCs use log energies --
        # confirm this is intended. Coefficient 0 is dropped by the slice.
        D = dct(filter_banks_2, type=2, norm='ortho')[1:(num_ceps + 1)]
        mfcc_dct[i][:] = D

    # Delta features, computed with python_speech_features.delta.
    spe_freqs_delta = delta(spe_freqs, 1)  # first-order difference of the spectrogram
    spe_freqs_delta_delta = delta(spe_freqs_delta, 2)  # second-order difference

    plt.pcolor(mfcc_dct.T, cmap='jet')
    plt.show()
    break  # only the first file in `wavs` is processed
def frames_crop(x1, sample_rate, frame_size=0.01, frame_stride=0.0075):
    """Split a signal into overlapping, zero-padded frames.

    Args:
        x1: Input signal (array-like). Its length is taken with ``len``; the
            samples are flattened when the padding is appended.
        sample_rate: Sampling rate in Hz.
        frame_size: Frame length in seconds (default 0.01).
        frame_stride: Hop between consecutive frame starts in seconds
            (default 0.0075).

    Returns:
        2-D array of shape ``(num_frames, frame_length)``; the tail of the last
        frame is zero-padded when the signal does not fill it.

    Raises:
        ValueError: If the frame length or hop rounds to less than one sample.
    """
    signal = x1
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    if frame_length <= 0 or frame_step <= 0:
        raise ValueError("frame length and frame step must be at least one sample")

    signal_length = len(signal)
    # A signal shorter than one frame still yields exactly one (padded) frame.
    # Taking the absolute value of the difference here would add extra frames
    # made only of padding zeros.
    remaining = max(signal_length - frame_length, 0)
    num_frames = int(np.ceil(remaining / frame_step)) + 1

    # Pad with zeros so that the last frame is complete.
    pad_signal_length = (num_frames - 1) * frame_step + frame_length
    pad_signal = np.append(signal, np.zeros(pad_signal_length - signal_length))

    # Row r of `indices` holds the sample positions of frame r.
    offsets = np.arange(frame_length).reshape(1, -1)
    starts = (frame_step * np.arange(num_frames)).reshape(-1, 1)
    indices = offsets + starts
    return pad_signal[indices]
def choose_windows(name, N):
    """Return a window function of length ``N``.

    Args:
        name: One of ``'Hamming'``, ``'Hanning'`` or ``'Rect'``.
        N: Number of window samples.

    Returns:
        1-D float array with ``N`` window coefficients.

    Raises:
        ValueError: If ``name`` is not one of the supported window names.
    """
    if name == 'Rect':
        return np.ones(N)
    if name == 'Hamming':
        offset, amplitude = 0.54, 0.46
    elif name == 'Hanning':
        offset, amplitude = 0.5, 0.5
    else:
        raise ValueError(
            "unknown window name %r; expected 'Hamming', 'Hanning' or 'Rect'" % (name,))

    if N == 1:
        # The cosine formula divides by N - 1; a one-sample window is a unit tap.
        return np.ones(1)
    n = np.arange(N)
    return offset - amplitude * np.cos(2 * np.pi * n / (N - 1))
def log_data(arr):
    """Convert values to a decibel-style log scale, ``20 * log10(x)``.

    Values are clamped from below to machine epsilon before the logarithm, so
    zero bins (e.g. from silent, zero-padded frames) give a large negative
    finite number instead of raising a math domain error.

    Args:
        arr: Array-like of non-negative values, of any shape.

    Returns:
        Float array with the same shape as ``arr``.
    """
    values = np.asarray(arr, dtype=float)
    floor = np.finfo(float).eps  # same stabilising constant as the filter banks use
    return 20 * np.log10(np.maximum(values, floor))
def mel_filters(sample_rate, NFFT, pow_frames, nfilt=40):
    """Apply a bank of triangular mel filters to a spectrum.

    The filter edges are spaced evenly on the mel scale between 0 Hz and
    ``sample_rate / 2`` and mapped to FFT bin numbers.

    Args:
        sample_rate: Sampling rate in Hz.
        NFFT: FFT size; the bank has ``floor(NFFT / 2)`` columns.
        pow_frames: Spectrum whose last axis has ``floor(NFFT / 2)`` bins.
        nfilt: Number of triangular filters.

    Returns:
        Filter-bank outputs (``pow_frames`` dotted with the transposed bank);
        exact zeros are replaced by machine epsilon.
    """
    mel_low = 0
    mel_high = 2595 * np.log10(1 + (sample_rate / 2) / 700)  # Hz -> mel
    mel_edges = np.linspace(mel_low, mel_high, nfilt + 2)  # evenly spaced in mel
    hz_edges = 700 * (10 ** (mel_edges / 2595) - 1)  # mel -> Hz
    edge_bins = np.floor((NFFT + 1) * hz_edges / sample_rate)

    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2))))
    for row in range(nfilt):
        left = edge_bins[row]
        centre = edge_bins[row + 1]
        right = edge_bins[row + 2]
        # Rising slope covers bins [left, centre), falling slope [centre, right).
        rising = np.arange(int(left), int(centre))
        falling = np.arange(int(centre), int(right))
        fbank[row, rising] = (rising - left) / (centre - left)
        fbank[row, falling] = (right - falling) / (right - centre)

    energies = np.dot(pow_frames, fbank.T)
    # Numerical stability: avoid exact zeros in the output.
    return np.where(energies == 0, np.finfo(float).eps, energies)
def mel_filters_2(M, N, fs, l, h, pow_frames):
    """Apply a bank of triangular mel filters (second variant) to a spectrum.

    Args:
        M (int): Number of filters.
        N (int): Number of FFT points; the bank has ``int(N / 2)`` columns.
        fs (int): Sampling frequency in Hz.
        l (float): Low-frequency coefficient; the band starts at ``fs * l``.
        h (float): High-frequency coefficient; the band ends at ``fs * h``.
        pow_frames: Spectrum whose last axis has ``int(N / 2)`` bins.

    Returns:
        tuple: ``(filter_banks, w2)`` where ``filter_banks`` is ``pow_frames``
        dotted with the transposed filter bank (exact zeros replaced by machine
        epsilon) and ``w2`` is ``int(N / 2)``.
    """
    fl = fs * l  # lowest frequency of the filter range
    fh = fs * h  # highest frequency of the filter range
    bl = 1125 * np.log(1 + fl / 700)  # Hz -> mel
    bh = 1125 * np.log(1 + fh / 700)
    # Evenly spaced mel points over [bl, bh]. Starting the grid at bl (not 0)
    # keeps the band inside [fl, fh] when l > 0; for l == 0 the grid is unchanged.
    y = np.linspace(bl, bh, M + 2)
    Fb = 700 * (np.exp(y / 1125) - 1)  # mel -> Hz

    w2 = int(N / 2)
    df = fs / N  # frequency resolution of one FFT bin
    melbank = np.zeros((M, w2))
    for k in range(1, M + 1):
        n1 = np.floor(Fb[k - 1] / df)  # left edge bin
        n0 = np.floor(Fb[k] / df)  # centre bin
        n2 = np.floor(Fb[k + 1] / df)  # right edge bin
        # Bin 0 is never written and indices stay below w2. Slopes of zero
        # width are skipped so they cannot produce 0/0 = NaN weights.
        if n0 > n1:
            rising = np.arange(max(int(n1), 1), min(int(n0), w2 - 1) + 1)
            melbank[k - 1, rising] = (rising - n1) / (n0 - n1)
        if n2 > n0:
            falling = np.arange(max(int(n0), 1), min(int(n2), w2 - 1) + 1)
            melbank[k - 1, falling] = (n2 - falling) / (n2 - n0)

    filter_banks = np.dot(pow_frames, melbank.T)
    # Numerical stability: avoid exact zeros in the output.
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
    return filter_banks, w2
# The features computed above can be saved to text (.txt) files. Next, openSMILE will be used to extract features from the speech data.