import librosa
import os
from random import shuffle
import numpy as np
from sklearn import svm
import joblib
import sklearn
# Notes on the SVC hyper-parameters used further down:
# C: penalty parameter of the error term, i.e. how much error is tolerated.
#    The larger C is, the less error is tolerated.
# gamma: with the RBF function chosen as kernel, a larger value gives fewer
#    support vectors and a smaller value gives more.
# kernel: linear, poly, rbf, sigmoid, precomputed
# decision_function_shape: ovo, ovr(default)
#
# #
# Root directory of the dataset that is walked when collecting wav files.
path = './casia'
# Maps an emotion directory name to its class id. The ids are stored as
# digit strings and are converted with int() where the labels are built.
EMOTION_LABEL = {
    'angry': '1',
    'fear': '2',
    'happy': '3',
    'neutral': '4',
    'sad': '5',
    'surprise': '6'
}
def getFeature(path, mfcc_feature_num=16):
    """Extract a fixed-layout feature vector from one audio file.

    The vector consists of the first ``mfcc_feature_num`` values of the
    frame-major flattened MFCC matrix (16 coefficients per frame), followed
    by the mean zero-crossing rate and the mean RMS energy. The RMS mean is
    emitted twice (historically named "energy" and "rms") so the vector
    layout stays the same as before.

    Args:
        path: Path of the audio file; it is read with ``librosa.load``.
        mfcc_feature_num: Number of flattened MFCC values to keep.

    Returns:
        1-D numpy array with ``mfcc_feature_num + 3`` values, or fewer when
        the audio has too few frames to supply that many MFCC values.
    """
    y, sr = librosa.load(path)
    # Pass the signal and sample rate by keyword: librosa >= 0.10 made these
    # arguments keyword-only, and older releases accept keywords as well.
    mfcc_feature = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=16)
    zcr_feature = librosa.feature.zero_crossing_rate(y=y)
    # "energy" and "rms" were both computed with librosa.feature.rms on the
    # same signal, so one call is enough; its mean is reused for both slots.
    rms_frames = librosa.feature.rms(y=y)

    # Transpose first so flattening walks frame by frame, then truncate.
    mfcc_feature = mfcc_feature.T.flatten()[:mfcc_feature_num]
    zcr_feature = np.array([np.mean(zcr_feature)])
    energy_feature = np.array([np.mean(rms_frames)])
    rms_feature = energy_feature.copy()

    return np.concatenate(
        (mfcc_feature, zcr_feature, energy_feature, rms_feature))
def getData(mfcc_feature_num=16):
    """Collect features and integer emotion labels for every wav file.

    Walks ``path/<person>/<emotion>/*.wav``, shuffles the files, extracts a
    feature vector for each one with ``getFeature`` and looks the label up
    in ``EMOTION_LABEL`` using the emotion directory name.

    Args:
        mfcc_feature_num: Forwarded to ``getFeature``.

    Returns:
        Tuple ``(features, labels)`` of numpy arrays with one entry per wav
        file, in the same shuffled order.

    Raises:
        KeyError: If an emotion directory name is not in ``EMOTION_LABEL``.
    """
    # Keep the emotion directory name next to each file so the label does
    # not have to be parsed back out of the path string. Splitting the path
    # on '\\' only worked with Windows separators.
    wav_files = []
    for person in os.listdir(path):
        if person.endswith('txt'):
            continue
        person_dir = os.path.join(path, person)
        if not os.path.isdir(person_dir):
            # A stray file next to the speaker folders would break listdir.
            continue
        for emotion_dir in os.listdir(person_dir):
            if emotion_dir.endswith('.ini'):
                continue
            emotion_path = os.path.join(person_dir, emotion_dir)
            if not os.path.isdir(emotion_path):
                continue
            for file_name in os.listdir(emotion_path):
                if file_name.endswith('wav'):
                    wav_files.append(
                        (os.path.join(emotion_path, file_name), emotion_dir))

    # Put the audio files in random order.
    shuffle(wav_files)

    data_feature = []
    data_labels = []
    for wav_file, emotion in wav_files:
        data_feature.append(getFeature(wav_file, mfcc_feature_num))
        data_labels.append(int(EMOTION_LABEL[emotion]))
    return np.array(data_feature), np.array(data_labels)
def train():
    """Grid-search ``C`` and the MFCC feature count for an RBF-kernel SVM.

    For every ``C`` in 13..19 and every ``mfcc_feature_num`` in 40..54 the
    data set is split into the first 200 shuffled samples (training) and
    the remainder (test). An ``svm.SVC`` is fitted, its test accuracy is
    measured, and the fitted model is written to
    ``Models/C_<C>_mfccNum_<n>.m``. The best accuracy and its parameters
    are printed whenever they improve and once more at the end.

    Raises:
        ValueError: If the data set has 200 samples or fewer, which would
            leave nothing to test on.
    """
    # Imported here because a bare `import sklearn` does not guarantee that
    # the `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import accuracy_score

    split_num = 200
    # joblib.dump does not create missing directories.
    os.makedirs('Models', exist_ok=True)

    best_acc = 0
    best_mfcc_feature_num = 0
    best_C = 0
    # The features depend only on the MFCC count, not on C. Extracting them
    # once per count avoids re-reading every audio file for each C and makes
    # all C values compete on the same shuffled split.
    dataset_cache = {}
    for C in range(13, 20):
        for i in range(40, 55):
            if i not in dataset_cache:
                dataset_cache[i] = getData(i)
            data_feature, data_labels = dataset_cache[i]
            if len(data_feature) <= split_num:
                raise ValueError(
                    'need more than %d samples to leave a test set, got %d'
                    % (split_num, len(data_feature)))

            train_data = data_feature[:split_num, :]
            train_label = data_labels[:split_num]
            test_data = data_feature[split_num:, :]
            test_label = data_labels[split_num:]

            clf = svm.SVC(
                decision_function_shape='ovo',
                kernel='rbf',
                C=C,
                gamma=0.0001,
                probability=True)
            print("train start")
            clf.fit(train_data, train_label)
            print("train over")
            print(C, i)

            acc = accuracy_score(test_label, clf.predict(test_data))
            if acc > best_acc:
                best_acc = acc
                best_C = C
                best_mfcc_feature_num = i
                print('best_acc', best_acc)
                print('best_C', best_C)
                print('best_mfcc_feature_num', best_mfcc_feature_num)
                print()

            # Save the model fitted for this parameter combination.
            joblib.dump(clf,
                        'Models/C_' + str(C) + '_mfccNum_' + str(i) + '.m')

    print('best_acc', best_acc)
    print('best_C', best_C)
    print('best_mfcc_feature_num', best_mfcc_feature_num)
# Script entry point: run the SVM parameter search. train() prints the best
# accuracy together with the matching C and mfcc_feature_num.
if __name__ == "__main__":
    train()
下面是LSTM版本。EMOTION_LABEL字典、get_feature函数和get_data函数的作用与上面SVM版本中的对应部分基本一致(标签改为从0开始的整数,特征改为MFCC序列)。build_model函数用于构建LSTM模型,这里只给出一个简单的框架,可根据需要自行调整。
import os
import librosa
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
# Maps each emotion directory name to a zero-based integer class id.
EMOTION_LABEL = {
    'angry': 0,
    'fear': 1,
    'happy': 2,
    'neutral': 3,
    'sad': 4,
    'surprise': 5
}
def get_feature(path, n_mfcc=13, max_len=80):
    """Return a standardized MFCC matrix with a fixed number of frames.

    Args:
        path: Audio file to read with ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.
        max_len: Number of frames (columns) in the returned matrix.

    Returns:
        Array whose second axis has exactly ``max_len`` entries: shorter
        clips are zero-padded on the right, longer ones are cut off.
    """
    signal, sample_rate = librosa.load(path)
    coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)

    # The scaler is fitted on the transposed matrix, so every coefficient is
    # standardized over the frames of this one file.
    coeffs = StandardScaler().fit_transform(coeffs.T).T

    frame_count = coeffs.shape[1]
    if frame_count >= max_len:
        # Enough frames: keep the first max_len of them.
        return coeffs[:, :max_len]

    # Too few frames: append zero columns up to max_len.
    missing = max_len - frame_count
    return np.pad(coeffs, pad_width=((0, 0), (0, missing)), mode='constant')
def get_data(n_mfcc=13, max_len=80):
    """Load MFCC matrices and one-hot emotion labels for every wav file.

    Walks ``./casia/<person>/<emotion>/*.wav``, shuffles the files, extracts
    one matrix per file with ``get_feature`` and one-hot encodes the label
    looked up in ``EMOTION_LABEL`` from the emotion directory name.

    Args:
        n_mfcc: Forwarded to ``get_feature``.
        max_len: Forwarded to ``get_feature``.

    Returns:
        Tuple ``(features, labels)``: the stacked per-file matrices and a
        one-hot label array with ``len(EMOTION_LABEL)`` columns.

    Raises:
        KeyError: If an emotion directory name is not in ``EMOTION_LABEL``.
    """
    data_root = './casia'
    # Remember the emotion directory with each file so the label does not
    # have to be parsed from the path. Splitting on '\\' only worked with
    # Windows path separators.
    wav_files = []
    for person in os.listdir(data_root):
        if person.endswith('txt'):
            continue
        person_dir = os.path.join(data_root, person)
        if not os.path.isdir(person_dir):
            # A stray file next to the speaker folders would break listdir.
            continue
        for emotion_dir in os.listdir(person_dir):
            if emotion_dir.endswith('.ini'):
                continue
            emotion_path = os.path.join(person_dir, emotion_dir)
            if not os.path.isdir(emotion_path):
                continue
            for file_name in os.listdir(emotion_path):
                if file_name.endswith('wav'):
                    wav_files.append(
                        (os.path.join(emotion_path, file_name), emotion_dir))

    # Put the audio files in random order.
    np.random.shuffle(wav_files)

    data_feature = []
    data_labels = []
    for wav_file, emotion in wav_files:
        data_feature.append(get_feature(wav_file, n_mfcc, max_len))
        data_labels.append(EMOTION_LABEL[emotion])

    # Fix the one-hot width explicitly. Otherwise it is inferred from the
    # largest label present and shrinks if the last class has no samples.
    data_labels = to_categorical(data_labels, num_classes=len(EMOTION_LABEL))
    return np.array(data_feature), np.array(data_labels)
def build_model(n_mfcc=13, max_len=80, n_classes=6):
    """Build and compile the two-layer LSTM emotion classifier.

    The network is LSTM(128) -> Dropout(0.5) -> LSTM(64) -> Dropout(0.5)
    -> softmax Dense layer, compiled with categorical cross-entropy, the
    Adam optimizer and accuracy as metric.

    Args:
        n_mfcc: First dimension of each input sample.
        max_len: Second dimension of each input sample.
        n_classes: Number of output classes.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # NOTE(review): Keras reads input_shape as (timesteps, features), so the
    # LSTM steps over the n_mfcc axis here - confirm this order is intended.
    network = Sequential([
        LSTM(128, input_shape=(n_mfcc, max_len), return_sequences=True),
        Dropout(0.5),
        LSTM(64, return_sequences=False),
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),
    ])
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network
# Script entry point: train the LSTM on the dataset, report the held-out
# loss and accuracy, and save the trained model.
if __name__ == '__main__':
    # Load the features and one-hot labels for the whole dataset.
    X, y = get_data()
    # Hold out 20% of the samples; the fixed seed makes the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Build the model.
    model = build_model()
    # Train the model; the held-out split doubles as validation data.
    model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=50)
    # Evaluate on the held-out split: score[0] is the loss, score[1] the accuracy.
    score = model.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    # Save the trained model.
    model.save('emotion_recognition_model.h5')
import joblib
import numpy as np
import wave
import librosa
# Dataset root directory.
path = './casia'
# Maps an emotion name to its class id, stored as a digit string.
EMOTION_LABEL = {
    'angry': '1',
    'fear': '2',
    'happy': '3',
    'neutral': '4',
    'sad': '5',
    'surprise': '6'
}
def getFeature(path, mfcc_feature_num=16):
    """Extract a fixed-layout feature vector from one audio file.

    The vector consists of the first ``mfcc_feature_num`` values of the
    frame-major flattened MFCC matrix (16 coefficients per frame), followed
    by the mean zero-crossing rate and the mean RMS energy. The RMS mean is
    emitted twice (historically named "energy" and "rms") so the vector
    layout stays the same as before.

    Args:
        path: Path of the audio file; it is read with ``librosa.load``.
        mfcc_feature_num: Number of flattened MFCC values to keep.

    Returns:
        1-D numpy array with ``mfcc_feature_num + 3`` values, or fewer when
        the audio has too few frames to supply that many MFCC values.
    """
    y, sr = librosa.load(path)
    # Pass the signal and sample rate by keyword: librosa >= 0.10 made these
    # arguments keyword-only, and older releases accept keywords as well.
    mfcc_feature = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=16)
    zcr_feature = librosa.feature.zero_crossing_rate(y=y)
    # "energy" and "rms" were both computed with librosa.feature.rms on the
    # same signal, so one call is enough; its mean is reused for both slots.
    rms_frames = librosa.feature.rms(y=y)

    # Transpose first so flattening walks frame by frame, then truncate.
    mfcc_feature = mfcc_feature.T.flatten()[:mfcc_feature_num]
    zcr_feature = np.array([np.mean(zcr_feature)])
    energy_feature = np.array([np.mean(rms_frames)])
    rms_feature = energy_feature.copy()

    return np.concatenate(
        (mfcc_feature, zcr_feature, energy_feature, rms_feature))
# Demo: classify a few wav files with a previously trained classifier and
# print, for each file, the most likely emotion and its probability.
wav_paths = ['test1.wav','test2.wav','test3.wav','test4.wav']
# NOTE: joblib.load unpickles the file, so only load model files you trust.
model = joblib.load("E:/speech/media/weights/classfier.m")
labels = np.array(['angry', 'fear', 'happy', 'neutral', 'sad', 'surprise'])

# predict_proba columns follow model.classes_, which need not line up with
# `labels` if a class was absent from the training data. When every class id
# of the model is a known EMOTION_LABEL id, translate through the ids;
# otherwise keep the positional lookup.
id_to_emotion = {class_id: emotion for emotion, class_id in EMOTION_LABEL.items()}
model_class_ids = [str(class_id) for class_id in getattr(model, 'classes_', [])]
use_class_ids = bool(model_class_ids) and all(
    class_id in id_to_emotion for class_id in model_class_ids)

emotion_label_list = []
emotion_value_list = []
for wav_path in wav_paths:
    print(wav_path)
    # Opening with the stdlib reader makes a missing or malformed wav file
    # fail early; the context manager closes the handle for every file.
    with wave.open(wav_path, 'rb'):
        pass
    data_feature = getFeature(wav_path, 48)
    probability_data = model.predict_proba([data_feature])[0]  # per-class probabilities
    max_probability_index = np.argmax(probability_data)  # column of the best class
    max_probability = probability_data[max_probability_index]  # its probability
    if use_class_ids:
        emotion_label = id_to_emotion[model_class_ids[max_probability_index]]
    else:
        emotion_label = labels[max_probability_index]
    emotion_label_list.append(emotion_label)
    emotion_value_list.append(max_probability)

# Pair each predicted emotion with its probability once all files are done.
combined_list = [[emotion, value] for emotion, value in zip(emotion_label_list, emotion_value_list)]
print(combined_list)
输出:
[['neutral', 0.28888379468448255], ['happy', 0.4550522457604587], ['happy', 0.5324629391829391], ['happy', 0.4118509132866488]]
简单的一个demo。
个人邮箱:[email protected]