信号处理课程结束了,没有像期望的那样学到很多东西,买的书(Discrete-Time)也没怎么看,作业算是完成了,但是要达到该坑的国际领先水平,距离有些遥远。我只是抚摸了一下信号处理的皮毛,我只是用了一下支持向量机。然而,我对语音情感的实际使用价值却没一个月之前那么看好了,或许面部表情和生理信号更加靠谱,也或许……
在人类的面对面交流场景中,语音所传达的信息量占有很大的比重,仅次于面部表情。基于比较成熟的信号处理技术、认知心理学模型以及计算机技术,语音情感计算模型在情感计算领域已经发展了十多年。它聚焦于语音数字信号的特征提取与分析、情感语音库的建立和语音信号的情感分类。本文的目的是给出语音情感计算的概况以及最新进展,探讨语音情感计算的整个过程。首先,介绍了语音情感计算的定义以及它的交叉学科的本质。接下来,从数字信号处理的角度研究了基于最常用的数字语音信号特征——MFCC的语音情感检测的全过程。然后,利用柏林语音情感数据库分析了在提取MFCC的过程中一些具体的参数的选择对检测结果准确率的影响。最后,阐述了语音情感计算领域面临的挑战以及今后的可能发展趋势。
用的是柏林语音情感数据库,最终的识别率勉强接近50%,存在的问题我也思考了,但是一时解决不了。
% process
% Speech-emotion recognition experiment: extract MFCC statistics from every
% recording, split each emotion class into a training part and a testing
% part, train an SVM on the training part and tally a confusion matrix on
% the testing part.
%
% Functions called below that are not defined in this script:
% mfcc_extract_func, scaleForSVM, pcaForSVM, svmtrain, svmpredict.
% NOTE(review): mfcc_extract_func must be reachable after the first cd
% (on the MATLAB path or inside the wav folder) - confirm on your setup.
clear;  % plain 'clear' removes the workspace variables; 'clear all' is not needed
%cd D:\Program Files\MATLAB\R2014a\toolbox\libsvm-3.21\matlab
cd C:\Users\mayax\Desktop\现代信号处理【报告】——参考文献\samples\data\wav
file = dir('*.wav');
%load rawdata.mat;

% Emotion names in label order. This has to be a cell array: square
% brackets would concatenate the names into one long char vector.
class_key = {'anger','boredom','disgust','fear','happiness','neutral','sadness'};
class_label = containers.Map(class_key, {1,2,3,4,5,6,7});
num_class = numel(class_key);

% Number of recordings per emotion. The listing returned by dir is cut into
% consecutive runs of these lengths (1:127, 128:208, ..., 474:535), so the
% files must be ordered emotion by emotion in exactly this sequence.
class_count = [127,81,46,69,71,79,62];
class_last  = cumsum(class_count);
class_first = class_last - class_count + 1;
if numel(file) < class_last(end)
    error('process:notEnoughFiles', ...
        'Expected at least %d wav files in the data folder, found %d.', ...
        class_last(end), numel(file));
end

% Number of leading recordings of each emotion used for training; the
% remaining recordings of that emotion are used for testing.
train_matrix_size = [42,27,15,23,24,26,21];

% Read every recording and compute its feature row, one emotion at a time.
feature_by_class     = cell(num_class,1);  % all feature rows of each emotion
training_matrix_part = cell(num_class,1);
training_label_part  = cell(num_class,1);
testing_matrix_part  = cell(num_class,1);
testing_label_part   = cell(num_class,1);
for c = 1:num_class
    label = class_label(class_key{c});
    class_file = file(class_first(c):class_last(c));

    class_feature = [];
    for i = 1:numel(class_file)
        [y, fs] = audioread(class_file(i).name);
        feature_row = mfcc_extract_func( y,fs );
        if i == 1
            % Preallocate once the feature length is known.
            class_feature = zeros(numel(class_file), numel(feature_row));
        end
        class_feature(i,:) = feature_row;
    end
    feature_by_class{c} = class_feature;

    % Split into training / testing rows and build the matching label columns.
    num_train = train_matrix_size(label);
    training_matrix_part{c} = class_feature(1:num_train,:);
    testing_matrix_part{c}  = class_feature(num_train+1:end,:);
    training_label_part{c}  = label*ones(size(training_matrix_part{c},1),1);
    testing_label_part{c}   = label*ones(size(testing_matrix_part{c},1),1);
end

% Fuse the per-emotion parts (same emotion order as class_key).
training_matrix = vertcat(training_matrix_part{:});
training_label  = vertcat(training_label_part{:});
testing_matrix  = vertcat(testing_matrix_part{:});
testing_label   = vertcat(testing_label_part{:});

% Train the SVM model.
% Changing into the LIBSVM folder makes the functions stored there take
% precedence over same-named functions elsewhere on the path.
cd 'D:\Program Files\MATLAB\R2014a\toolbox\libsvm-3.21\matlab'
% NOTE(review): the testing set is passed (and returned) first in both
% helper calls below; confirm against the scaleForSVM / pcaForSVM
% signatures that this order is intended.
[testing_matrix_scale, training_matrix_scale] = scaleForSVM(testing_matrix, training_matrix,-1,1);
[testing_matrix_pca,training_matrix_pca] = pcaForSVM(testing_matrix_scale, training_matrix_scale, 90);
model = svmtrain(training_label, training_matrix_pca);
[predicted_label] = svmpredict(testing_label, testing_matrix_pca, model);

% Confusion matrix: acc(t,p) counts the test recordings with true label t
% that were predicted as label p.
acc = accumarray([testing_label(:), predicted_label(:)], 1, [num_class, num_class]);
% Recognition rate of each emotion: correct predictions / test recordings.
acc_rate = diag(acc) ./ sum(acc,2);
mfcc_extract_func:
function [ features ] = mfcc_extract_func( y,fs )
%MFCC_EXTRACT_FUNC Utterance-level MFCC statistics of a speech signal.
%   features = mfcc_extract_func(y, fs) returns a 1x48 row vector that
%   holds, in this order, the mean, maximum, minimum and variance over all
%   frames of the cepstral coefficients c1..c12.
%
%   y   sampled signal. A vector is used as is; a matrix is treated as one
%       column per channel and averaged to a single channel.
%   fs  sampling frequency in Hz (forwarded to melbankm).
%
%   Raises mfcc_extract_func:signalTooShort when y does not fill a single
%   512-sample frame.
%
%   Calls enframe and melbankm (VOICEBOX) and hamming.

frame_len  = 512;   % samples per frame, also the FFT length
frame_inc  = 285;   % hop between successive frames, in samples
num_filter = 26;    % number of mel filters
num_ceps   = 12;    % cepstral coefficients kept (c1..c12)

% Work on a single-channel column vector.
if isvector(y)
    y = y(:);
else
    y = mean(y,2);
end

% Pre-emphasis.
yy = filter([1 -0.97],1,y);

% Framing: one frame per row.
frame_yy = enframe(yy,frame_len,frame_inc);
frame_count = size(frame_yy,1);
if frame_count == 0
    error('mfcc_extract_func:signalTooShort', ...
        'Signal has %d samples, at least %d are required.', ...
        numel(yy), frame_len);
end

% Apply a Hamming window to every frame (bsxfun expands the window row
% over all frames).
win_frame_yy = bsxfun(@times, frame_yy, hamming(frame_len)');

% Power spectrum of every frame; only the first frame_len/2+1 = 257 bins
% are kept.
frame_YY = fft(win_frame_yy, frame_len, 2);
pow_frame_YY = abs(frame_YY).^2;
newpow_frame_YY = pow_frame_YY(:,1:frame_len/2+1);

% Mel filter bank energies, one column per frame.
bank = melbankm(num_filter,frame_len,fs,0,0.5,'t');
mel_energy = full(bank*newpow_frame_YY');
% A frame with zero energy in some filter would give log10(0) = -Inf and
% then NaN after the cosine transform, so floor the energy at the smallest
% positive normalised double before taking the logarithm.
log_mel_energy = log10(max(mel_energy, realmin));

% Cosine transform of the log energies. Row k of the basis yields
% coefficient c_k for k = 1..num_ceps; c0 is not computed at all, so no
% row has to be discarded afterwards.
n = 0:num_filter-1;
k = (1:num_ceps)';
dctcoef = cos(k*(2*n+1)*pi/(2*num_filter));
mfcc = dctcoef*log_mel_energy;

% Statistics across frames for every coefficient.
mean_mfcc = mean(mfcc,2);
max_mfcc = max(mfcc,[],2);
min_mfcc = min(mfcc,[],2);
var_mfcc = var(mfcc,[],2);

% Fuse into one feature row.
features = [mean_mfcc',max_mfcc',min_mfcc',var_mfcc'];

% Any value that is still NaN/Inf (e.g. the input itself contained NaN) is
% set to zero deterministically instead of being replaced by a random
% number, so repeated runs give identical features.
bad = ~isfinite(features);
if any(bad)
    warning('mfcc_extract_func:nonFiniteFeature', ...
        '%d non-finite feature value(s) replaced by 0.', nnz(bad));
    features(bad) = 0;
end
end
报告和ppt链接: http://pan.baidu.com/s/1o8Uxmwa 密码: diwc
参考资料:
[1] MATLAB添加VOICEBOX工具箱
[2] 语音特征参数MFCC提取过程详解
[3] MFCC详解
[4] 支持向量机的MATLAB工具箱
[5] 顶尖水平(并没有多顶尖,只是数据比较漂亮)——Speech Emotion Recognition Using Fourier Parameters