语音特征提取(语谱图Spectrogram,Fbank, MFCC, 及其delta-一阶差分)——python代码

导入相关包

import os
import wavio
import numpy as np
import math
from matplotlib import pyplot as plt
from scipy.fftpack import dct
from python_speech_features import mfcc, delta, logfbank

读取语音数据及主函数


# Main script: for each wav file compute the log spectrogram, two Mel
# filter-bank (Fbank) variants, MFCCs and spectrogram deltas, then plot the
# MFCCs of the first file.
# NOTE(review): `wavs` (iterable of file names) and `data_dir` are not defined
# in the visible source and must exist before this loop runs; the helper
# functions called below must also be defined before this loop executes.
for wav in wavs:
    wav_dir = os.path.join(data_dir, wav)
    wav_data = wavio.read(wav_dir)
    data = wav_data.data
    sample_rate = wav_data.rate  # 16k
    sampwidth = wav_data.sampwidth

    # Peak normalisation. Convert to float first so that abs() of the most
    # negative integer sample cannot overflow, and use NumPy reductions so the
    # peak is a scalar regardless of the array's shape.
    samples = np.asarray(data, dtype=np.float64)
    peak = np.max(np.abs(samples))
    norm_data = samples / peak if peak > 0 else samples

    # Split the signal into overlapping frames.
    frames = frames_crop(norm_data, sample_rate)

    # Window function; its length must equal the frame length
    # (160 samples at 16 kHz), so derive it from the framed data.
    win = frames.shape[1]
    windows = choose_windows(name="Hamming", N=win)

    # Parameters
    N = 2048  # NFFT
    M = 40  # number of Mel filters
    num_ceps = 24  # number of cepstral coefficients kept

    # Per-frame feature buffers
    spe_freqs = np.zeros((frames.shape[0], int(N / 2)))  # spectrogram
    fbank_feature = np.zeros((frames.shape[0], M))  # Fbank
    fbank_feature_2 = np.zeros((frames.shape[0], M))  # Fbank, second version

    mfcc_dct = np.zeros((frames.shape[0], num_ceps))  # MFCC (DCT output)

    for i in range(frames.shape[0]):
        frames_fft = np.fft.fft(windows * frames[i], N)
        magnitude = np.abs(frames_fft[:int(N / 2)])
        spe_freqs[i][:] = log_data(magnitude)

        filter_banks = mel_filters(sample_rate=sample_rate, NFFT=N, pow_frames=magnitude, nfilt=M)
        fbank_feature[i][:] = log_data(filter_banks)

        filter_banks_2, w2 = mel_filters_2(M=M, N=N, fs=sample_rate, l=0, h=0.5, pow_frames=magnitude)
        fbank_feature_2[i][:] = log_data(filter_banks_2)

        # MFCCs are the DCT of the *log* Mel energies, so transform the log
        # filter-bank feature rather than the linear energies. Coefficient 0
        # is dropped and the next `num_ceps` coefficients are kept.
        D = dct(fbank_feature_2[i], type=2, norm='ortho')[1:(num_ceps + 1)]
        mfcc_dct[i][:] = D

    # Deltas via python_speech_features: the second argument is the number of
    # neighbouring frames used on each side.
    spe_freqs_delta = delta(spe_freqs, 1)  # first-order difference of the spectrogram
    spe_freqs_delta_delta = delta(spe_freqs_delta, 2)  # second-order difference of the spectrogram

    # print(mfcc_dct.shape)

    plt.pcolor(mfcc_dct.T, cmap='jet')
    plt.show()
    break  # only the first file is processed and plotted

对语音数据进行分帧

def frames_crop(x1, sample_rate, frame_size=0.01, frame_stride=0.0075):
    """Split a signal into overlapping, zero-padded frames.

    Args:
        x1: Sequence of samples. Its length is taken with ``len`` and the data
            is flattened by ``np.append``, so a 1-D array is expected
            (NOTE(review): an ``(n, 1)`` mono array also works; multi-channel
            input is not handled).
        sample_rate: Sampling rate in Hz.
        frame_size: Frame length in seconds (default 0.01).
        frame_stride: Frame shift in seconds (default 0.0075).

    Returns:
        2-D array of shape ``(num_frames, frame_length)``. The last frame is
        padded with zeros when the signal does not fill it completely.

    Raises:
        ValueError: If the frame length or frame step rounds to zero samples.
    """
    signal = x1
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    if frame_length <= 0 or frame_step <= 0:
        raise ValueError("frame_size and frame_stride must span at least one sample")

    signal_length = len(signal)
    # Always produce at least one frame. Clamping at zero (instead of taking
    # the absolute value) avoids extra all-zero frames when the signal is
    # shorter than a single frame.
    remaining = max(signal_length - frame_length, 0)
    num_frames = int(np.ceil(remaining / frame_step)) + 1

    # Zero-pad so that every frame has the same number of samples.
    pad_signal_length = (num_frames - 1) * frame_step + frame_length
    pad_signal = np.append(signal, np.zeros(pad_signal_length - signal_length))

    # indices[f, s] = f * frame_step + s, built by broadcasting.
    offsets = np.arange(frame_length)[np.newaxis, :]
    starts = (np.arange(num_frames) * frame_step)[:, np.newaxis]
    return pad_signal[starts + offsets]

选择窗函数

def choose_windows(name, N):
    """Return a window function of length ``N``.

    Args:
        name: One of ``'Hamming'``, ``'Hanning'`` or ``'Rect'``.
        N: Number of points in the window.

    Returns:
        1-D float array of length ``N``.

    Raises:
        ValueError: If ``name`` is not a supported window type.
    """
    if name not in ('Hamming', 'Hanning', 'Rect'):
        raise ValueError(
            "unknown window %r; expected 'Hamming', 'Hanning' or 'Rect'" % (name,)
        )
    if name == 'Rect':
        return np.ones(N)
    if N == 1:
        # The cosine formula divides by N - 1; a one-point window is just 1.
        return np.ones(1)

    n = np.arange(N)
    cosine = np.cos(2 * np.pi * n / (N - 1))
    if name == 'Hamming':
        return 0.54 - 0.46 * cosine
    return 0.5 - 0.5 * cosine

对数据进行求Log

def log_data(arr):
    """Convert values to a decibel-style log scale, ``20 * log10(x)``.

    Values below machine epsilon (including exact zeros) are raised to
    epsilon before the logarithm so that a silent bin yields a large negative
    number instead of raising a math domain error.

    Args:
        arr: Array of non-negative values (e.g. spectral magnitudes or
            filter-bank energies).

    Returns:
        Float array with the same shape as ``arr``.
    """
    values = np.asarray(arr, dtype=float)
    floor = np.finfo(float).eps
    return 20 * np.log10(np.maximum(values, floor))

两种梅尔滤波器

第一种

def mel_filters(sample_rate, NFFT, pow_frames, nfilt=40):
    """Apply a triangular Mel filter bank (HTK-style 2595*log10 Mel scale).

    Args:
        sample_rate: Sampling rate in Hz; filters span 0 Hz to ``sample_rate / 2``.
        NFFT: FFT size; the bank has ``floor(NFFT / 2)`` frequency bins.
        pow_frames: Spectrum of one frame (or a stack of frames) whose last
            axis has ``floor(NFFT / 2)`` bins.
        nfilt: Number of Mel filters.

    Returns:
        Filter-bank outputs, with exact zeros replaced by machine epsilon.
    """
    n_bins = int(np.floor(NFFT / 2))

    # Equally spaced points on the Mel axis, converted back to Hz.
    mel_top = 2595 * np.log10(1 + (sample_rate / 2) / 700)
    mel_grid = np.linspace(0, mel_top, nfilt + 2)
    hz_grid = 700 * (10 ** (mel_grid / 2595) - 1)

    # FFT-bin position of every filter edge/centre.
    edges = np.floor((NFFT + 1) * hz_grid / sample_rate)

    weights = np.zeros((nfilt, n_bins))
    for row in range(nfilt):
        left, centre, right = edges[row], edges[row + 1], edges[row + 2]

        # Rising slope: left edge (inclusive) up to the centre (exclusive).
        rising = np.arange(int(left), int(centre))
        weights[row, rising] = (rising - left) / (centre - left)

        # Falling slope: centre (inclusive) down to the right edge (exclusive).
        falling = np.arange(int(centre), int(right))
        weights[row, falling] = (right - falling) / (right - centre)

    energies = np.dot(pow_frames, weights.T)
    # Numerical stability: avoid exact zeros before a later logarithm.
    return np.where(energies == 0, np.finfo(float).eps, energies)

第二种

def mel_filters_2(M, N, fs, l, h, pow_frames):
    """Apply a triangular Mel filter bank (1125*ln Mel scale).

    Args:
        M (int): Number of filters.
        N (int): Number of FFT points; the bank has ``int(N / 2)`` bins.
        fs (int): Sampling frequency in Hz.
        l (float): Low-frequency coefficient; lowest frequency is ``fs * l``.
        h (float): High-frequency coefficient; highest frequency is ``fs * h``.
        pow_frames: Spectrum of one frame (or a stack of frames) whose last
            axis has ``int(N / 2)`` bins.

    Returns:
        tuple: ``(filter_banks, w2)`` where ``filter_banks`` holds the
        filter outputs (exact zeros replaced by machine epsilon) and ``w2``
        is the number of frequency bins, ``int(N / 2)``.
    """
    fl = fs * l  # lowest frequency covered by the bank
    fh = fs * h  # highest frequency covered by the bank
    bl = 1125 * np.log(1 + fl / 700)  # Hz -> Mel
    bh = 1125 * np.log(1 + fh / 700)

    # Equally spaced Mel points from bl to bh. Starting at bl (not 0) makes
    # the bank honour a non-zero low-frequency coefficient.
    mel_points = np.linspace(bl, bh, M + 2)
    Fb = 700 * (np.exp(mel_points / 1125) - 1)  # Mel -> Hz

    w2 = int(N / 2)
    df = fs / N  # frequency resolution of one FFT bin
    edges = np.floor(Fb / df).astype(int)

    melbank = np.zeros((M, w2))
    last_bin = w2 - 1
    for k in range(1, M + 1):
        n1, n0, n2 = edges[k - 1], edges[k], edges[k + 1]
        row = melbank[k - 1]

        # Bin 0 is never written, and bins past the end of the spectrum are
        # clipped. Degenerate slopes (adjacent edges in the same bin) are
        # skipped so that no 0/0 division can put NaN into the bank.
        if n0 > n1:
            rising = np.arange(max(n1, 1), min(n0, last_bin) + 1)
            row[rising] = (rising - n1) / (n0 - n1)
        if n2 > n0:
            falling = np.arange(max(n0, 1), min(n2, last_bin) + 1)
            row[falling] = (n2 - falling) / (n2 - n0)
        if 1 <= n0 <= last_bin:
            row[n0] = 1.0  # triangle peak, also valid for degenerate slopes

    filter_banks = np.dot(pow_frames, melbank.T)
    # Numerical stability: avoid exact zeros before a later logarithm.
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
    return filter_banks, w2

可将以上特征保存为txt格式的文件。接下来将使用openSMILE提取语音数据特征。

你可能感兴趣的:(python,语音识别,深度学习)