zero crossing rate
The number of times the signal crosses zero within each frame; it reflects the frequency content of the signal.
$Z_n = \frac{1}{2}\sum_{m=0}^{N-1}\left|\mathrm{sgn}[x_n(m)]-\mathrm{sgn}[x_n(m-1)]\right|$
import numpy as np

def stZCR(frame):
    # zero crossing rate: half the number of sign changes, normalized by frame length
    count = len(frame)
    count_z = np.sum(np.abs(np.diff(np.sign(frame)))) / 2
    return np.float64(count_z) / np.float64(count - 1)
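As a quick sanity check (a minimal sketch; the 1 kHz tone and 16 kHz sampling rate are arbitrary choices), a pure tone of frequency $f$ crosses zero about $2f$ times per second, so the per-sample rate returned by stZCR should be close to $2f/f_s$:

```python
fs = 16000                     # assumed sampling rate
t = np.arange(fs) / fs         # one second of samples
tone = np.sin(2 * np.pi * 1000 * t)
print(stZCR(tone))             # expected: about 2 * 1000 / fs = 0.125
```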
energy
Short-time energy: the sum of squared samples in each frame; it reflects how strong the signal is.
$E_n = \sum_{m=0}^{N-1}x_n^2(m)$
import numpy as np

def stEnergy(frame):
    # short-time energy, normalized here by the frame length
    return np.sum(frame ** 2) / np.float64(len(frame))
shimmer in dB
$\mathrm{Shimmer(dB)} = \frac{1}{N-1}\sum_{i=1}^{N-1}\left|20\log_{10}(A_{i+1}/A_i)\right|$
import numpy as np

eps = np.finfo(np.float64).eps  # small constant used throughout to avoid division by zero

def stShimmerDB(frame):
    '''
    Amplitude shimmer: the variability of the peak-to-peak amplitude,
    expressed in decibels [3].
    Note: the A_i in the formula are per-period amplitude peaks, so the
    caller should pass the sequence of period amplitudes, not raw samples.
    '''
    count = len(frame)
    sigma = 0
    for i in range(count - 1):
        sigma += np.abs(20 * np.log10(np.abs(frame[i + 1] / (frame[i] + eps))))
    return np.float64(sigma) / np.float64(count - 1)
$\mathrm{Shimmer(relative)} = \frac{\frac{1}{N-1}\sum_{i=1}^{N-1}|A_i-A_{i+1}|}{\frac{1}{N}\sum_{i=1}^{N}A_i}$
def stShimmerRelative(frame):
    '''
    Relative shimmer: the average absolute difference between the amplitudes
    of consecutive periods, divided by the average amplitude [3].
    (The value returned here is a fraction; multiply by 100 for a percentage.)
    '''
    count = len(frame)
    sigma_diff = 0
    sigma_sum = 0
    for i in range(count):
        if i < count - 1:
            sigma_diff += np.abs(np.abs(frame[i]) - np.abs(frame[i + 1]))
        sigma_sum += np.abs(frame[i])
    return np.float64(sigma_diff / (count - 1)) / np.float64(sigma_sum / count + eps)
intensity / loudness
###################
##
## from openSMILE
##
###################
def stIntensity(frame):
    '''
    Intensity and loudness, after openSMILE.
    (Essentially a Hamming-windowed variant of the short-time energy.)
    '''
    fn = len(frame)
    hamWin = np.hamming(fn)
    winSum = np.sum(hamWin)
    if winSum <= 0.0:
        winSum = 1.0
    I0 = 0.000001                         # reference intensity
    Im = 0
    for i in range(fn):
        Im += hamWin[i] * frame[i] ** 2   # accumulate windowed energy
    intensity = Im / winSum
    loudness = (Im / I0) ** .3
    return intensity, loudness
Methods for computing the fundamental frequency include the cepstrum method, the short-time autocorrelation method, and linear prediction. This article uses short-time autocorrelation.
1) Pitch-detection preprocessing: endpoint detection
Because the beginning and end of an utterance are not periodic, endpoint detection is applied first to improve the accuracy of pitch detection; the spectral-entropy method is used here.
Let the time-domain waveform be $x(i)$. After windowing and framing, the $n$-th frame is $x_n(m)$ and its FFT is $X_n(k)$, where $k$ indexes the spectral lines. The short-time energy of the frame in the frequency domain is
$E_n = \sum_{k=0}^{N/2}X_n(k)X_n^*(k)$
where $N$ is the FFT length and only the positive frequencies are kept.
The energy spectrum at spectral line $k$ is $Y_n(k) = X_n(k)X_n^*(k)$, so the normalized spectral probability density of each frequency component is $p_n(k)=\frac{Y_n(k)}{E_n}=\frac{Y_n(k)}{\sum_{l=0}^{N/2}Y_n(l)}$
The short-time spectral entropy of the frame is then defined as
$H_n = -\sum_{k=0}^{N/2}p_n(k)\ln p_n(k)$
$H_n$ is the spectral entropy of frame $n$ and describes how the spectral energy is distributed. Spectral entropy is fairly robust across different noise levels, but the amplitude of each spectral bin is easily corrupted by noise, which degrades endpoint detection.
The spectral entropy within speech regions is smaller than within noise regions, so the ratio of energy to spectral entropy is larger in speech than in noise. This ratio is called the energy-entropy ratio $EEF_n$:
$EEF_n = \sqrt{1+|E_n/H_n|}$
A threshold $T_1$ is set, and frames with $EEF_n \ge T_1$ are classified as speech.
2) Pitch detection by autocorrelation
To reduce interference from the formants, pitch detection is usually restricted to 60 Hz~500 Hz, corresponding to lags between $f_s/500$ and $f_s/60$ samples.
When a frame is autocorrelated, the autocorrelation is maximal when the lag equals the pitch period. Pitch detection therefore amounts to autocorrelating the frame and finding the lag of the maximum within the allowed range.
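The code below depends on a framing helper `enframe` that is not shown in this article; the following is a minimal sketch of what it is assumed to do (split the signal into frames of length `win` with hop `step`, one frame per row):

```python
import numpy as np

def enframe(x, win, step):
    # split x into overlapping frames of length `win` with hop `step`,
    # one frame per row
    n_frames = 1 + (len(x) - win) // step
    idx = np.arange(win)[None, :] + step * np.arange(n_frames)[:, None]
    return np.asarray(x)[idx]
```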
####################################
##
## calculate pitch with correlation
## calPitch() JitterAbsolute() JitterRelative()
##
####################################
class voiceSegment:
    def __init__(self, in1=0, in2=0, duration=0):
        self.begin = in1
        self.end = in2
        self.duration = duration
def pitch_vad(x, win, step, T1, miniL):
    # endpoint detection (VAD) based on the energy-entropy ratio
    y = enframe(x, win, step).T
    fn = len(y[0, :])
    Esum = []                        # energy of each frame
    H = []                           # spectral entropy of each frame
    for i in range(fn):
        Sp = np.abs(np.fft.fft(y[:, i]))
        Sp = Sp[:int(win / 2)]       # keep positive frequencies
        Esum.append(np.sum(Sp ** 2))                 # energy
        prob = Sp / np.sum(Sp)                       # normalized spectral probability
        H.append(-np.sum(prob * np.log(prob + eps)))
    H = np.array(H)
    hindex = np.where(H < 0.1)
    H[hindex] = np.max(H)
    Ef = np.sqrt(1 + np.abs(np.array(Esum) / H))     # energy-entropy ratio
    Ef = Ef / np.max(Ef)
    zindex = np.where(Ef >= T1)      # frames whose Ef exceeds the threshold T1
    zseg = findSegment(zindex)       # merge frame indices into contiguous segments
    zsl = len(zseg)                  # number of segments
    SF = np.zeros(fn)
    voiceseg = []
    for k in range(zsl):
        if zseg[k].duration >= miniL:
            in1 = zseg[k].begin
            in2 = zseg[k].end
            voiceseg.append(zseg[k])
            SF[in1:in2] = 1
    vosl = len(voiceseg)             # number of voiced segments
    return voiceseg, vosl, SF, Ef
def findSegment(express):
    # convert a tuple/array of frame indices into a list of contiguous segments
    voiceIndex = np.array(express).flatten()
    soundSegment = []
    k = 0
    soundSegment.append(voiceSegment(voiceIndex[0]))
    for i in range(len(voiceIndex) - 1):
        if voiceIndex[i + 1] - voiceIndex[i] > 1:
            soundSegment[k].end = voiceIndex[i]
            soundSegment.append(voiceSegment(voiceIndex[i + 1]))
            k = k + 1
    soundSegment[k].end = voiceIndex[-1]
    for i in range(k + 1):
        soundSegment[i].duration = soundSegment[i].end - soundSegment[i].begin + 1
    return soundSegment
def pitch_Corr(x, win, step, T1, sr, miniL=10):
    win = int(win)
    step = int(step)
    vseg, vsl, SF, Ef = pitch_vad(x, win, step, T1, miniL)
    y = enframe(x, win, step).T
    fn = len(SF)
    lmin = int(sr / 500)    # shortest allowed pitch period (500 Hz)
    lmax = int(sr / 27.5)   # longest allowed pitch period (27.5 Hz, wider than the 60 Hz bound above)
    period = np.zeros(fn)
    for i in range(vsl):
        ixb = vseg[i].begin
        ixd = vseg[i].duration
        for k in range(ixd):
            u = y[:, k + ixb]
            ru = np.correlate(u, u, mode='full')
            ru = ru[win - 1:]                  # keep non-negative lags
            tloc = np.argmax(ru[lmin:lmax])    # offset of the autocorrelation maximum
            period[k + ixb] = lmin + tloc
    return vseg, vsl, SF, Ef, period
def calPitch(y, win, step, sr):
    '''
    Calculate pitch.
    :param y: samples of the wav file
    :param win: frame length
    :param step: hop size
    :param sr: sampling rate of the wav file
    :return: pitch in Hz, pitch period in samples
    '''
    T1 = 0.05
    voiceseg, vosl, SF, Ef, period = pitch_Corr(y, win, step, T1, sr)
    pitch = sr / (period + eps)      # period (samples) -> frequency (Hz)
    pindex = np.where(pitch > 5000)  # unvoiced frames have period 0, giving a huge "pitch"
    pitch[pindex] = 0
    return pitch, period
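A minimal usage sketch (the file name speech.wav and the 25 ms / 10 ms framing are placeholder choices):

```python
import scipy.io.wavfile as wavfile

sr, data = wavfile.read('speech.wav')   # placeholder file name
data = data / np.max(np.abs(data))      # normalize to [-1, 1]
pitch, period = calPitch(data, win=int(0.025 * sr), step=int(0.010 * sr), sr=sr)
print(pitch[pitch > 0][:10])            # pitch of the first few voiced frames
```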
jitter
$\mathrm{Jitter(absolute)}=\frac{1}{N-1}\sum_{i=1}^{N-1}|T_i-T_{i+1}|$
def JitterAbsolute(pitch):
    # absolute jitter: mean absolute difference between consecutive pitch periods
    period = 1 / (pitch + eps)
    pindex = np.where(period > 5000)   # unvoiced frames (pitch 0) give huge periods
    period[pindex] = 0
    n = len(period)
    sigma = 0
    for i in range(n - 1):
        sigma += np.abs(period[i] - period[i + 1])
    jitter_absolute = sigma / (n - 1)
    return jitter_absolute
jitter
$\mathrm{Jitter(relative)}=\frac{\frac{1}{N-1}\sum_{i=1}^{N-1}|T_i-T_{i+1}|}{\frac{1}{N}\sum_{i=1}^{N}T_i}$
def JitterRelative(pitch):
    # relative jitter: absolute jitter divided by the mean pitch period
    period = 1 / (pitch + eps)
    pindex = np.where(period > 5000)
    period[pindex] = 0
    n = len(period)
    jitter_relative = JitterAbsolute(pitch) / (np.sum(period) / n)
    return jitter_relative
harmonic to noise ratio
$HNR=10\log_{10}\left(\frac{ACF(T_0)}{ACF(0)-ACF(T_0)}\right)$
where $T_0$ is the pitch period (in samples) and $ACF$ is the autocorrelation function.
def stHNR(frame, period):
    '''
    Harmonics-to-noise ratio:
    HNR = 10 * log10( ACF(T0) / (ACF(0) - ACF(T0)) )
    frame: one frame of the signal
    period: pitch period of the frame, in samples
    return: HNR in dB
    '''
    period = int(period)
    if period == 0:
        return 0                 # unvoiced frame: no meaningful HNR
    ru = np.correlate(frame, frame, mode='full')
    win = len(frame)
    ru = ru[win - 1:]            # keep non-negative lags
    HNR = 10 * np.log10(ru[period] / (ru[0] - ru[period] + eps))
    return HNR
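A short usage sketch, continuing the calPitch example above (the framing must match the one used inside pitch_Corr so that frames and periods line up):

```python
frames = enframe(data, int(0.025 * sr), int(0.010 * sr))
hnr = np.array([stHNR(frames[i], period[i]) for i in range(len(period))])
print(np.mean(hnr[period > 0]))   # mean HNR over voiced frames
```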
formant
Formants can be estimated with the cepstrum method or with LPC. The LPC approach uses the FFT to evaluate the magnitude of the model's power spectrum at arbitrary frequencies and locates the formants from that response; the two common variants are parabolic interpolation and solving for the complex roots of the linear-prediction polynomial. This article uses parabolic interpolation.
The vocal tract can be viewed as an acoustic tube of non-uniform cross-section that acts as a resonator during speech. A quasi-periodic excitation entering the vocal tract excites its resonances, producing a set of resonance frequencies known as the formant frequencies.
The speech production model lumps the effects of radiation, the vocal tract, and the glottal excitation into a single time-varying digital filter with transfer function $H(z)=\frac{S(z)}{U(z)}=\frac{G}{1-\sum_{i=1}^{p}a_iz^{-i}}$
This is the $p$-th order linear prediction model, an all-pole model.
Substituting $z^{-1}=\exp(-j2\pi f/f_s)$, the power spectrum $P(f)$ becomes $P(f)=|H(f)|^2=\frac{G^2}{|1-\sum_{i=1}^{p}a_i\exp(-j2\pi if/f_s)|^2}$
The FFT evaluates this magnitude response at the desired frequencies, and the formants are found at its peaks.
For the parabolic interpolation method, see《语音信号处理实验教程》pp. 105-107.
#############################################
##
## calculate formant frequency and bandwidth
##
#############################################
import numpy as np
from scipy import signal
from audiolazy import lpc   # assumed source of lpc.autocor (ZFilter with .numdict)

def stFormant(u, sr, p=12):
    '''
    F: formant frequencies
    Bw: formant bandwidths
    u: one frame of the signal
    p: LPC order
    sr: sampling rate
    return:
    [1] array of formant frequencies
    [2] array of formant bandwidths
    '''
    # --- compute LPC coefficients ---
    a_filter = lpc.autocor(u, p)
    a_filter_num = a_filter.numdict
    i = 0
    a = []
    for k, v in a_filter_num.items():
        if i != k:
            while i != k:
                a.append(0)
                i = i + 1
        a.append(v)
        i = i + 1
    a = np.array(a)
    # --- locate formants on the LPC spectral envelope ---
    U = lpcar2pf(a, 255)                # spectral envelope from the LPC coefficients
    df = sr / 512                       # frequency resolution
    Loc, Mdict = signal.find_peaks(U)   # peak locations in U
    nFormant = len(Loc)
    F = np.zeros(nFormant)              # formant frequencies
    Bw = np.zeros(nFormant)             # formant bandwidths
    # parabolic interpolation around each peak
    i = 0
    for m in Loc:
        m1 = m - 1
        m2 = m + 1
        pk = U[m]
        p1 = U[m1]
        p2 = U[m2]
        aa = (p1 + p2) / 2 - pk
        bb = (p2 - p1) / 2
        cc = pk
        dm = -bb / (2 * aa)                                           # offset of the interpolated maximum
        pp = -bb ** 2 / (4 * aa) + cc                                 # power at the interpolated peak
        bf = -np.sqrt(bb ** 2 - 4 * aa * (cc - 0.5 * pp)) / (2 * aa)  # half-power half-width (in bins)
        F[i] = (m + dm) * df
        Bw[i] = 2 * bf * df
        i = i + 1
    return F, Bw
def lpcar2pf(ar, npoints):
    '''
    ar: LPC coefficients
    npoints: the envelope is evaluated at 2*npoints + 2 FFT points
    return: power spectral envelope
    '''
    return np.abs(np.fft.rfft(ar, 2 * npoints + 2)) ** (-2)
def pre_emphasis(y, coefficient=0.99):
    '''
    y: original signal
    coefficient: pre-emphasis coefficient
    '''
    return np.append(y[0], y[1:] - coefficient * y[:-1])
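A minimal usage sketch (continuing the variables from the pitch example; the frame position and the Hamming window are arbitrary choices, and the first three peaks are commonly read as F1-F3):

```python
u = pre_emphasis(data)[1000:1000 + 512]   # one 512-sample frame, offset arbitrary
u = u * np.hamming(len(u))                # window the frame before LPC
F, Bw = stFormant(u, sr, p=12)
print(F[:3], Bw[:3])                      # candidate F1-F3 and their bandwidths
```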
energy entropy
Somewhat similar to spectral entropy, but it describes the time-domain distribution of the signal and reflects its continuity; it is likewise a short-time feature. Within frame $i$, the signal is divided into $K$ sub-blocks.
The ratio of the energy of sub-block $j$ to the total energy of the frame is $e_j =\frac{E_{subFrame_j}}{E_{shortFrame_i}}$, where $E_{shortFrame_i}=\sum_{k=1}^{K}E_{subFrame_k}$ is the total energy over all sub-blocks.
The energy entropy of the frame is then $H_i=-\sum_{j=1}^{K} e_j \log_2(e_j)$
import numpy

def stEnergyEntropy(frame, n_short_blocks=10):
    """Computes the entropy of energy of a frame"""
    Eol = numpy.sum(frame ** 2)    # total frame energy
    L = len(frame)
    sub_win_len = int(numpy.floor(L / n_short_blocks))
    if L != sub_win_len * n_short_blocks:
        frame = frame[0:sub_win_len * n_short_blocks]
    # sub_wins is of size [sub_win_len x n_short_blocks]
    sub_wins = frame.reshape(sub_win_len, n_short_blocks, order='F').copy()
    # compute normalized sub-frame energies:
    s = numpy.sum(sub_wins ** 2, axis=0) / (Eol + eps)
    # compute entropy of the normalized sub-frame energies:
    Entropy = -numpy.sum(s * numpy.log2(s + eps))
    return Entropy
spectral centroid
Also called the first moment of the spectrum. The smaller the spectral centroid, the more of the spectral energy is concentrated at low frequencies; voice, for example, usually has a lower spectral centroid than music. The spectral centroid $C_i$ of frame $i$ is:
$C_i=\frac{\sum_{k=1}^{f_n}kX_i(k)}{\sum_{k=1}^{f_n}X_i(k)}$
where $X_i(k)$ is the $k$-th spectral line of frame $i$ and $f_n$ is the frame length.
def stSpectralCentroidAndSpread(X, fs):
    """Computes spectral centroid and spread of a frame (given abs(FFT))"""
    ind = (numpy.arange(1, len(X) + 1)) * (fs / (2.0 * len(X)))
    Xt = X.copy()
    Xt = Xt / Xt.max()
    NUM = numpy.sum(ind * Xt)
    DEN = numpy.sum(Xt) + eps
    # centroid:
    C = NUM / DEN
    # spread:
    S = numpy.sqrt(numpy.sum(((ind - C) ** 2) * Xt) / DEN)
    # normalize by the Nyquist frequency:
    C = C / (fs / 2.0)
    S = S / (fs / 2.0)
    return C, S
spectral spread
Also called the second central moment of the spectrum; it describes how the spectrum is distributed around the spectral centroid. The spectral spread $S_i$ of frame $i$ is
$S_i=\sqrt{\frac{\sum_{k=1}^{f_n}(k-C_i)^2X_i(k)}{\sum_{k=1}^{f_n}X_i(k)}}$
The implementation is shared with the spectral-centroid code above (stSpectralCentroidAndSpread).
spectral entropy
From the properties of entropy, the more uniform a distribution, the larger its entropy, so the entropy reflects how uniform each frame is: a speaker's spectrum is uneven because of the formants, while the spectrum of white noise is much flatter, and exploiting this difference for VAD is one application.
As with the energy entropy, the FFT spectrum is divided into $K$ sub-blocks, the fraction of the total spectral energy in each sub-block is computed, and the entropy of these fractions is the spectral entropy of frame $i$.
Fraction of sub-block $k$: $n_k = \frac{E_k}{\sum_{l=1}^{K}E_l}$
Spectral entropy of frame $i$:
$H_i=-\sum_{k=1}^{K}n_k\log_2(n_k)$
def stSpectralEntropy(X, n_short_blocks=10):
    """Computes the spectral entropy of a frame"""
    L = len(X)                          # number of spectral samples
    Eol = numpy.sum(X ** 2)             # total spectral energy
    sub_win_len = int(numpy.floor(L / n_short_blocks))  # length of each sub-block
    if L != sub_win_len * n_short_blocks:
        X = X[0:sub_win_len * n_short_blocks]
    sub_wins = X.reshape(sub_win_len, n_short_blocks, order='F').copy()  # sub-blocks via reshape
    s = numpy.sum(sub_wins ** 2, axis=0) / (Eol + eps)  # normalized sub-block energies
    En = -numpy.sum(s * numpy.log2(s + eps))            # spectral entropy
    return En
spectral flux
Describes how the spectrum changes between adjacent frames: after normalizing the two spectra, it is the sum of the squared differences between them.
$Fl_{(i,i-1)}=\sum_{k=1}^{f_n}(EN_i(k)-EN_{i-1}(k))^2$
where $EN_i(k)=\frac{X_i(k)}{\sum_{l=1}^{f_n}X_i(l)}$ and $X_i(k)$ is the $k$-th spectral line.
def stSpectralFlux(X, X_prev):
    """
    Computes the spectral flux feature of the current frame
    ARGUMENTS:
        X:      the abs(fft) of the current frame
        X_prev: the abs(fft) of the previous frame
    """
    # compute the spectral flux as the sum of square distances:
    sumX = numpy.sum(X + eps)
    sumPrevX = numpy.sum(X_prev + eps)
    F = numpy.sum((X / sumX - X_prev / sumPrevX) ** 2)
    return F
spectral rolloff
The spectral energy is concentrated within a limited frequency range. The rolloff is the DFT bin $m$ below which a fixed fraction $C$ of the total spectral energy (typically around 90%) is contained; the bin index is then normalized by the FFT length:
$\sum_{k=1}^{m}X_i(k)=C\sum_{k=1}^{f_n}X_i(k)$
where $m$ is the rolloff bin, $f_n$ is the FFT length, and $X_i(k)$ is the $k$-th spectral line.
def stSpectralRollOff(X, c, fs):
    """Computes the spectral roll-off"""
    totalEnergy = numpy.sum(X ** 2)
    fftLength = len(X)
    Thres = c * totalEnergy
    # find the spectral rolloff as the frequency position
    # where the cumulative spectral energy reaches c*totalEnergy
    CumSum = numpy.cumsum(X ** 2) + eps
    [a, ] = numpy.nonzero(CumSum > Thres)
    if len(a) > 0:
        mC = numpy.float64(a[0]) / float(fftLength)
    else:
        mC = 0.0
    return mC
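A minimal sketch showing how these spectral features fit together (continuing the earlier variables; the 512/256 framing and the 0.90 rolloff fraction are arbitrary choices):

```python
frames = enframe(data, 512, 256)
X_prev = None
for frame in frames[:5]:
    X = numpy.abs(numpy.fft.fft(frame))[:256]   # magnitude spectrum, positive half
    C, S = stSpectralCentroidAndSpread(X, sr)
    H = stSpectralEntropy(X)
    R = stSpectralRollOff(X, 0.90, sr)
    Fl = stSpectralFlux(X, X_prev) if X_prev is not None else 0.0
    X_prev = X
    print(C, S, H, R, Fl)
```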
MFCC
Mel-frequency cepstral coefficients are based on the characteristics of human hearing, i.e. the speech spectrum is analyzed according to the results of auditory experiments. The mel scale relates to physical frequency by $F_{mel}(f)=1125\ln(1+f/700)$
def mfccInitFilterBanks(fs, nfft):
    """
    Computes the triangular filterbank for MFCC computation
    (used in the stFeatureExtraction function before the stMFCC function call)
    This function is taken from the scikits.talkbox library (MIT Licence):
    https://pypi.python.org/pypi/scikits.talkbox
    """
    # filter bank params:
    lowfreq = 133.33
    linsc = 200 / 3.
    logsc = 1.0711703
    numLinFiltTotal = 13
    numLogFilt = 27
    if fs < 8000:
        numLogFilt = 5    # fewer log-spaced filters at low sampling rates
    # total number of filters
    nFiltTotal = numLinFiltTotal + numLogFilt
    # compute frequency points of the triangles:
    freqs = numpy.zeros(nFiltTotal + 2)
    freqs[:numLinFiltTotal] = lowfreq + numpy.arange(numLinFiltTotal) * linsc
    freqs[numLinFiltTotal:] = freqs[numLinFiltTotal - 1] * logsc ** numpy.arange(1, numLogFilt + 3)
    heights = 2. / (freqs[2:] - freqs[0:-2])
    # compute filterbank coefficients (in fft domain, in bins)
    fbank = numpy.zeros((nFiltTotal, nfft))
    nfreqs = numpy.arange(nfft) / (1. * nfft) * fs
    for i in range(nFiltTotal):
        lowTrFreq = freqs[i]
        cenTrFreq = freqs[i + 1]
        highTrFreq = freqs[i + 2]
        lid = numpy.arange(numpy.floor(lowTrFreq * nfft / fs) + 1,
                           numpy.floor(cenTrFreq * nfft / fs) + 1,
                           dtype=int)
        lslope = heights[i] / (cenTrFreq - lowTrFreq)
        rid = numpy.arange(numpy.floor(cenTrFreq * nfft / fs) + 1,
                           numpy.floor(highTrFreq * nfft / fs) + 1,
                           dtype=int)
        rslope = heights[i] / (highTrFreq - cenTrFreq)
        fbank[i][lid] = lslope * (nfreqs[lid] - lowTrFreq)
        fbank[i][rid] = rslope * (highTrFreq - nfreqs[rid])
    return fbank, freqs
from scipy.fftpack import dct

def stMFCC(X, fbank, n_mfcc_feats):
    """
    Computes the MFCCs of a frame, given the fft magnitude
    ARGUMENTS:
        X:     fft magnitude abs(FFT)
        fbank: filter bank (see mfccInitFilterBanks)
    RETURN
        ceps:  MFCCs (13 element vector)
    Note: MFCC calculation is, in general, taken from the
    scikits.talkbox library (MIT Licence),
    with a small number of modifications to make it more
    compact and suitable for the pyAudioAnalysis Lib
    """
    mspec = numpy.log10(numpy.dot(X, fbank.T) + eps)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:n_mfcc_feats]
    return ceps
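A minimal usage sketch (nfft must match the length of the magnitude spectrum passed to stMFCC; the 256-point setting is an arbitrary choice):

```python
nfft = 256
fbank, freqs = mfccInitFilterBanks(sr, nfft)
frame = data[:2 * nfft]                        # one 512-sample frame
X = numpy.abs(numpy.fft.fft(frame))[:nfft]     # positive half of the spectrum
ceps = stMFCC(X, fbank, n_mfcc_feats=13)
print(ceps)
```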