Python speech emotion classification (training + prediction)

1. SVM

import librosa
import os
from random import shuffle
import numpy as np
from sklearn import svm
import joblib
import sklearn.metrics

# C: penalty parameter of the error term, i.e. how much misclassification
#    is tolerated; larger C tolerates fewer errors.
# gamma: RBF kernel coefficient; larger gamma means fewer support vectors,
#    smaller gamma means more support vectors.
# kernel: linear, poly, rbf, sigmoid, precomputed
# decision_function_shape: ovo, ovr (default)

path = './casia'
# Dictionary mapping emotion names to integer label strings.
EMOTION_LABEL = {
    'angry': '1',
    'fear': '2',
    'happy': '3',
    'neutral': '4',
    'sad': '5',
    'surprise': '6'
}


'''
getFeature extracts features from a single audio file.
It loads the audio, computes MFCC (Mel-frequency cepstral coefficients),
zero-crossing rate, energy, and RMS features, and concatenates them
into one feature vector.
'''
def getFeature(path, mfcc_feature_num=16):
    y, sr = librosa.load(path)

    # Extract the MFCC features for this audio file.
    # y: audio time series; n_mfcc: number of MFCCs to return.
    # Recent librosa versions require keyword arguments here.
    mfcc_feature = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=16)
    zcr_feature = librosa.feature.zero_crossing_rate(y)
    # librosa.feature.rmse was renamed to librosa.feature.rms,
    # so energy and RMS are computed the same way here.
    energy_feature = librosa.feature.rms(y=y)
    rms_feature = librosa.feature.rms(y=y)

    # Keep the first mfcc_feature_num values of the frame-major flattened MFCC matrix.
    mfcc_feature = mfcc_feature.T.flatten()[:mfcc_feature_num]
    zcr_feature = zcr_feature.flatten()
    energy_feature = energy_feature.flatten()
    rms_feature = rms_feature.flatten()

    zcr_feature = np.array([np.mean(zcr_feature)])
    energy_feature = np.array([np.mean(energy_feature)])
    rms_feature = np.array([np.mean(rms_feature)])

    data_feature = np.concatenate((mfcc_feature, zcr_feature, energy_feature,
                                   rms_feature))
    return data_feature
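
# Example usage (the path is hypothetical; the code below expects CASIA laid
# out as casia/<speaker>/<emotion>/<file>.wav):
#   feat = getFeature('./casia/someone/angry/201.wav', mfcc_feature_num=48)
#   feat.shape  # -> (51,): 48 MFCC values + ZCR mean + energy mean + RMS mean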


'''
getData collects the features and emotion labels of every speech file in
the dataset. It walks all audio files under the dataset directory, shuffles
them, extracts features via getFeature(), and pairs each feature vector
with its emotion label to form the training data.
'''
def getData(mfcc_feature_num=16):
    """找到数据集中的所有语音文件的特征以及语音的情感标签"""
    wav_file_path = []
    person_dirs = os.listdir(path)
    for person in person_dirs:
        if person.endswith('txt'):
            continue
        emotion_dir_path = os.path.join(path, person)
        emotion_dirs = os.listdir(emotion_dir_path)
        for emotion_dir in emotion_dirs:
            if emotion_dir.endswith('.ini'):
                continue
            emotion_file_path = os.path.join(emotion_dir_path, emotion_dir)
            emotion_files = os.listdir(emotion_file_path)
            for file in emotion_files:
                if not file.endswith('wav'):
                    continue
                wav_path = os.path.join(emotion_file_path, file)
                wav_file_path.append(wav_path)

    # Shuffle the audio files randomly.
    shuffle(wav_file_path)
    data_feature = []
    data_labels = []

    for wav_file in wav_file_path:
        data_feature.append(getFeature(wav_file, mfcc_feature_num))
        # The emotion is the name of the file's parent directory; os.path
        # avoids hard-coding the Windows path separator.
        emotion = os.path.basename(os.path.dirname(wav_file))
        data_labels.append(int(EMOTION_LABEL[emotion]))

    return np.array(data_feature), np.array(data_labels)


'''
train trains SVM models and keeps the best one. It loops over combinations
of C and mfcc_feature_num, extracts features from the dataset, splits them
into training and test sets, fits an SVM classifier, and measures accuracy,
recording the parameter combination with the best accuracy.
'''
def train():
    # Search SVM hyperparameters and keep track of the best result.
    best_acc = 0
    best_mfcc_feature_num = 0
    best_C = 0

    for C in range(13, 20):
        for i in range(40, 55):
            data_feature, data_labels = getData(i)
            split_num = 200  # first 200 shuffled samples train, the rest test
            train_data = data_feature[:split_num, :]
            train_label = data_labels[:split_num]
            test_data = data_feature[split_num:, :]
            test_label = data_labels[split_num:]
            clf = svm.SVC(
                decision_function_shape='ovo',
                kernel='rbf',
                C=C,
                gamma=0.0001,
                probability=True)
            print("train start")
            clf.fit(train_data, train_label)
            print("train over")
            print(C, i)
            acc = sklearn.metrics.accuracy_score(
                test_label, clf.predict(test_data))
            if acc > best_acc:
                best_acc = acc
                best_C = C
                best_mfcc_feature_num = i
                print('best_acc', best_acc)
                print('best_C', best_C)
                print('best_mfcc_feature_num', best_mfcc_feature_num)
                print()

            # Save the model for this parameter combination.
            os.makedirs('Models', exist_ok=True)
            joblib.dump(clf,
                        'Models/C_' + str(C) + '_mfccNum_' + str(i) + '.m')

    print('best_acc', best_acc)
    print('best_C', best_C)
    print('best_mfcc_feature_num', best_mfcc_feature_num)

'''
Call train() to run the search and print the best accuracy together with
its corresponding C and mfcc_feature_num.
'''
if __name__ == "__main__":
    train()
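
The nested loop above searches C and mfcc_feature_num by hand on a single fixed split. For the SVM hyperparameters alone, the same search can be written with scikit-learn's GridSearchCV, which also cross-validates. A minimal sketch, assuming getData from the listing above is importable (the module name train_svm is an assumption; mfcc_feature_num still has to be looped over outside, since it changes the data itself):

from sklearn import svm
from sklearn.model_selection import GridSearchCV

from train_svm import getData  # hypothetical module name for the script above

data_feature, data_labels = getData(mfcc_feature_num=48)

param_grid = {'C': list(range(13, 20)), 'gamma': [1e-4, 1e-3]}
grid = GridSearchCV(
    svm.SVC(kernel='rbf', decision_function_shape='ovo'),
    param_grid,
    cv=5,  # 5-fold cross-validation instead of one fixed 200-sample split
    scoring='accuracy')
grid.fit(data_feature, data_labels)

print('best params:', grid.best_params_)
print('best CV accuracy:', grid.best_score_)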

2. TensorFlow LSTM

The EMOTION_LABEL dictionary, get_feature, and get_data functions mirror the SVM version. build_model builds the LSTM model; only a simple skeleton is given here, so adjust it to your needs.

import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical




# Map emotion names to integer class indices.
EMOTION_LABEL = {
    'angry': 0,
    'fear': 1,
    'happy': 2,
    'neutral': 3,
    'sad': 4,
    'surprise': 5
}


def get_feature(path, n_mfcc=13, max_len=80):
    # Extract MFCC features from the audio file.
    y, sr = librosa.load(path)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Scale the MFCC features per file with StandardScaler.
    scaler = StandardScaler()
    mfcc = scaler.fit_transform(mfcc.T).T

    # Zero-pad or truncate along the time axis to a fixed length.
    if max_len > mfcc.shape[1]:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return mfcc  # shape: (n_mfcc, max_len)


def get_data(n_mfcc=13, max_len=80):
    # Collect the MFCC features and emotion labels of all audio files,
    # as fixed-length time series.
    wav_file_path = []
    person_dirs = os.listdir('./casia')
    for person in person_dirs:
        if person.endswith('txt'):
            continue
        emotion_dir_path = os.path.join('./casia', person)
        emotion_dirs = os.listdir(emotion_dir_path)
        for emotion_dir in emotion_dirs:
            if emotion_dir.endswith('.ini'):
                continue
            emotion_file_path = os.path.join(emotion_dir_path, emotion_dir)
            emotion_files = os.listdir(emotion_file_path)
            for file in emotion_files:
                if not file.endswith('wav'):
                    continue
                wav_path = os.path.join(emotion_file_path, file)
                wav_file_path.append(wav_path)

    # Shuffle the audio files.
    np.random.shuffle(wav_file_path)

    data_feature = []
    data_labels = []

    for wav_file in wav_file_path:
        # Extract the fixed-length MFCC time series for this file.
        mfcc = get_feature(wav_file, n_mfcc, max_len)
        data_feature.append(mfcc)
        # The emotion is the name of the file's parent directory.
        data_labels.append(EMOTION_LABEL[os.path.basename(os.path.dirname(wav_file))])

    # One-hot encode the emotion labels.
    data_labels = to_categorical(data_labels)
    return np.array(data_feature), np.array(data_labels)
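
# Note: get_data() returns X shaped (num_files, n_mfcc, max_len) and one-hot
# y shaped (num_files, n_classes); build_model() below expects exactly this
# layout, so each MFCC coefficient row acts as one LSTM timestep.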


def build_model(n_mfcc=13, max_len=80, n_classes=6):
    # Build the LSTM model. input_shape=(n_mfcc, max_len) matches the layout
    # produced by get_feature, so each MFCC coefficient row is one timestep;
    # transpose the features if frames should be the timesteps instead.
    model = Sequential()
    model.add(LSTM(128, input_shape=(n_mfcc, max_len), return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model




if __name__ == '__main__':
    # Load the dataset.
    X, y = get_data()
    # Split into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Build the model.
    model = build_model()
    # Train the model.
    model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=50)
    # Evaluate the model.
    score = model.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    # Save the model.
    model.save('emotion_recognition_model.h5')
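
To reuse the trained network later without retraining, the saved emotion_recognition_model.h5 can be reloaded and applied to a single file. A minimal sketch, assuming get_feature from the listing above is importable (the module name train_lstm is an assumption):

import numpy as np
from tensorflow.keras.models import load_model

from train_lstm import get_feature  # hypothetical module name for the script above

LABELS = ['angry', 'fear', 'happy', 'neutral', 'sad', 'surprise']

model = load_model('emotion_recognition_model.h5')

# One sample with the same (n_mfcc, max_len) layout used in training.
mfcc = get_feature('test1.wav')[np.newaxis, ...]

probs = model.predict(mfcc)[0]  # softmax probabilities over the 6 classes
print(LABELS[int(np.argmax(probs))], float(np.max(probs)))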

3. Testing

import joblib
import numpy as np
import librosa


path = './casia'
EMOTION_LABEL = {
    'angry': '1',
    'fear': '2',
    'happy': '3',
    'neutral': '4',
    'sad': '5',
    'surprise': '6'
}


def getFeature(path, mfcc_feature_num=16):
    y, sr = librosa.load(path)

    # Extract the MFCC features for this audio file.
    # y: audio time series; n_mfcc: number of MFCCs to return.
    # Recent librosa versions require keyword arguments here.
    mfcc_feature = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=16)
    zcr_feature = librosa.feature.zero_crossing_rate(y)
    # librosa.feature.rmse was renamed to librosa.feature.rms.
    energy_feature = librosa.feature.rms(y=y)
    rms_feature = librosa.feature.rms(y=y)

    mfcc_feature = mfcc_feature.T.flatten()[:mfcc_feature_num]
    zcr_feature = zcr_feature.flatten()
    energy_feature = energy_feature.flatten()
    rms_feature = rms_feature.flatten()

    zcr_feature = np.array([np.mean(zcr_feature)])
    energy_feature = np.array([np.mean(energy_feature)])
    rms_feature = np.array([np.mean(rms_feature)])

    data_feature = np.concatenate((mfcc_feature, zcr_feature, energy_feature,
                                   rms_feature))
    return data_feature

wav_paths = ['test1.wav','test2.wav','test3.wav','test4.wav']
model = joblib.load("E:/speech/media/weights/classfier.m")

labels = np.array(['angry', 'fear', 'happy', 'neutral', 'sad', 'surprise'])
emotion_label_list = []
emotion_value_list = []

for wav_path in wav_paths:
    print(wav_path)
    data_feature = getFeature(wav_path, 48)

    probability_data = model.predict_proba([data_feature])[0]  # class probabilities
    max_probability_index = np.argmax(probability_data)  # index of the most likely class
    max_probability = probability_data[max_probability_index]  # its probability value
    emotion_label = labels[max_probability_index]  # predicted emotion
    emotion_label_list.append(emotion_label)
    emotion_value_list.append(max_probability)

combined_list = [[emotion, value] for emotion, value in zip(emotion_label_list, emotion_value_list)]

print(combined_list)

Output:
[['neutral', 0.28888379468448255], ['happy', 0.4550522457604587], ['happy', 0.5324629391829391], ['happy', 0.4118509132866488]]

A simple demo.
Contact email: [email protected]
