1. First, reading the speech signal:
Speech data commonly comes in two forms: *.txt text files and *.wav audio files.
Why would there be *.txt files?
This is easy to understand: a *.wav file can be viewed as a one-dimensional time signal; after sampling, it becomes a sequence of discrete points, and a *.txt file simply stores those numbers. Next, let us look at the two ways of reading a speech signal (note: the code here is MATLAB):
Example 1: (*.txt, i.e., read the sample points directly)
fid=fopen('happy.txt','rt');      % open the text file for reading
x=fscanf(fid,'%f'); fclose(fid);  % read all samples into a column vector, then close the file
Example 2: (*.wav)
[y,fs,nbits]=wavread('happy.wav');
Note: wavread() applies to older MATLAB releases such as R2010; later releases must use audioread() instead. (Around October 2017 a large number of outdated MATLAB versions stopped being usable, so all later work is done with audioread().) This function, however, returns only two outputs; see Example 3.
Example 3: (*.wav)
[y, fs]=audioread('happy.wav');
Note: the bit depth (the nbits above) can be found under the detailed properties of the *.wav file. Evidently, MATLAB's continual updates aim to make such tasks simpler and faster.
Among the outputs, y holds the sample points and fs is the sampling rate.
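For a quick sanity check (a minimal sketch, reusing the example file 'happy.wav' from above), you can read the file and then inspect its duration and waveform:
[y, fs]=audioread('happy.wav');   % samples and sampling rate
t=(0:length(y)-1)/fs;             % time axis in seconds
plot(t, y); xlabel('Time (s)'); ylabel('Amplitude');
fprintf('fs = %d Hz, duration = %.2f s\n', fs, length(y)/fs);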
2. Next comes preprocessing (endpoint detection - pre-emphasis - windowed framing)
Two key functions do the heavy lifting here:
① epdByVolZcr.m
function [epInSampleIndex, epInFrameIndex, soundSegment, zeroOneVec, volume] = epdByVolZcr(y, fs, nbits, epdParam, plotOpt)
% epdByVolZcr: EPD (endpoint detection) based on volume and zero-crossing rate
% Usage: [epInSampleIndex, epInFrameIndex, soundSegment, zeroOneVec, volume] = epdByVolZcr(y, fs, nbits, epdParam, plotOpt)
% epInSampleIndex: two-element end-points in sample index
% epInFrameIndex: two-element end-points in frame index
% soundSegment: resulting sound segments
% zeroOneVec: zero-one vector for each frame
% volume: volume
% y: input audio signals
% fs: sampling rate
% epdParam: parameters for EPD
% plotOpt: 0 for silent operation, 1 for plotting
%
% Example:
% waveFile='SingaporeIsAFinePlace.wav';
% [y, fs, nbits]=wavReadInt(waveFile);
% epdParam=epdParamSet(fs);
% plotOpt=1;
% out=epdByVolZcr(y, fs, nbits, epdParam, plotOpt);
% Roger Jang, 20040413, 20070320
if nargin<1, selfdemo; return; end
if nargin<2, fs=16000; end
if nargin<3, nbits=16; end
if nargin<4 || isempty(epdParam), epdParam=epdParamSet(fs); end
if nargin<5, plotOpt=0; end
if size(y, 2)~=1, error('Wave is not mono!'); end
frameSize=epdParam.frameSize;
overlap=epdParam.overlap;
minSegment=round(epdParam.minSegment*fs/(frameSize-overlap));
maxSilBetweenWord=round(epdParam.maxSilBetweenWord*fs/(frameSize-overlap));
%minLastWordDuration=round(epdParam.minLastWordDuration*fs/(frameSize-overlap));
y = double(y); % convert to double data type
frameMat=buffer2(y, frameSize, overlap); % frame blocking
frameMat=frameZeroMean(frameMat, 2);
frameNum=size(frameMat, 2); % no. of frames
volume=frame2volume(frameMat, 1); % compute volume
temp=sort(volume);
index=round(frameNum/32); if index==0, index=1; end
volMin=temp(index);
volMax=temp(frameNum-index+1); % robust max: skip the top frames to avoid outliers
volTh1=(volMax-volMin)/epdParam.volRatio+volMin; % compute volume threshold
volTh2=(volMax-volMin)/epdParam.volRatio2+volMin; % compute volume threshold
% ====== Identify voiced parts whose volume is larger than volTh1
soundSegment=findSegment(volume>volTh1);
% ====== Compute ZCR
[minVol, index]=min(volume);
shiftAmount=epdParam.zcrShiftGain*max(abs(frameMat(:,index))); % zcrShiftGain times the max. abs. sample within the min.-volume frame
shiftAmount=max(shiftAmount, 2);
zcr=frame2zcr(frameMat, 1, shiftAmount);
zcrTh=max(zcr)*epdParam.zcrRatio;
% ====== Expansion 1: Expand end points to volume level1 (lower level)
for i=1:length(soundSegment),
head = soundSegment(i).begin;
while (head-1)>=1 && volume(head-1)>=volTh1,
head=head-1;
end
soundSegment(i).begin = head;
tail = soundSegment(i).end;
while (tail+1)<=length(volume) && volume(tail+1)>=volTh1,
tail=tail+1;
end
soundSegment(i).end = tail;
end
% ====== Expansion 2: Expand end points to include high zcr region
for i=1:length(soundSegment),
head = soundSegment(i).begin;
while (head-1)>=1 && zcr(head-1)>zcrTh % Extend at beginning
head=head-1;
end
soundSegment(i).begin = head;
tail = soundSegment(i).end;
while (tail+1)<=length(zcr) && zcr(tail+1)>zcrTh % Extend at ending
tail=tail+1;
end
soundSegment(i).end = tail;
end
% ====== Delete repeated sound segments
index = [];
for i=1:length(soundSegment)-1,
if soundSegment(i).begin==soundSegment(i+1).begin && soundSegment(i).end==soundSegment(i+1).end,
index=[index, i];
end
end
soundSegment(index) = [];
% ====== Delete short sound clips
index = [];
for i=1:length(soundSegment),
if soundSegment(i).duration<=minSegment
index = [index, i];
end
end
soundSegment(index) = [];
zeroOneVec=0*volume;
for i=1:length(soundSegment)
for j=soundSegment(i).begin:soundSegment(i).end
zeroOneVec(j)=1;
end
end
if isempty(soundSegment)
epInSampleIndex=[];
epInFrameIndex=[];
fprintf('Warning: No sound segment found in %s.m.\n', mfilename);
else
epInFrameIndex=[soundSegment(1).begin, soundSegment(end).end];
epInSampleIndex=frame2sampleIndex(epInFrameIndex, frameSize, overlap); % conversion from frame index to sample index
for i=1:length(soundSegment),
soundSegment(i).beginSample = frame2sampleIndex(soundSegment(i).begin, frameSize, overlap);
soundSegment(i).endSample = min(length(y), frame2sampleIndex(soundSegment(i).end, frameSize, overlap));
soundSegment(i).beginFrame = soundSegment(i).begin;
soundSegment(i).endFrame = soundSegment(i).end;
end
soundSegment=rmfield(soundSegment, 'begin');
soundSegment=rmfield(soundSegment, 'end');
soundSegment=rmfield(soundSegment, 'duration');
end
% Plotting...
if plotOpt,
axes1H=subplot(4,1,1);
time=(1:length(y))/fs;
plot(time, y);
axisLimit=[min(time) max(time) -2^nbits/2, 2^nbits/2];
if -1<=min(y) && max(y)<=1
axisLimit=[min(time) max(time) -1, 1];
end
axis(axisLimit);
ylabel('Amplitude'); title('Waveform'); grid on
% Plot end points
yBound=axisLimit(3:4);
for i=1:length(soundSegment),
line(frame2sampleIndex(soundSegment(i).beginFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'm');
line(frame2sampleIndex( soundSegment(i).endFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'g');
end
axes2H=subplot(4,1,2);
frameTime = frame2sampleIndex(1:frameNum, frameSize, overlap)/fs;
plot(frameTime, volume, '.-');
line([min(frameTime), max(frameTime)], volTh1*[1 1], 'color', 'r');
line([min(frameTime), max(frameTime)], volTh2*[1 1], 'color', 'r');
axis tight
ylabel('Volume'); title('Volume'); grid on
% Plot end points
yBound = [min(volume) max(volume)];
for i=1:length(soundSegment),
line(frame2sampleIndex(soundSegment(i).beginFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'm');
line(frame2sampleIndex( soundSegment(i).endFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'g');
end
axes3H=subplot(4,1,3);
plot(frameTime, zcr, '.-');
line([min(frameTime), max(frameTime)], zcrTh*[1 1], 'color', 'c');
axis([min(frameTime), max(frameTime), 0, max(zcr)]);
ylabel('ZCR'); title('Zero crossing rate'); grid on
% Plot end points
yBound = [0 max(zcr)];
for i=1:length(soundSegment),
line(frame2sampleIndex(soundSegment(i).beginFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'm');
line(frame2sampleIndex( soundSegment(i).endFrame, frameSize, overlap)/fs*[1,1], yBound, 'color', 'g');
end
%axes4H=subplot(4,1,4);
%voicedIndex=epInSampleIndex(1):epInSampleIndex(2);
%voicedTime=time(voicedIndex);
%voicedY=y(voicedIndex);
%voicedH=plot(voicedTime, voicedY);
%axis([time(epInSampleIndex(1)), time(epInSampleIndex(2)), -2^nbits/2, 2^nbits/2]);
%ylabel('Amplitude'); title('Voiced waveform'); grid on
%U.y=y; U.fs=fs; U.nbits=nbits;
%U.axes1H=axes1H; U.axes2H=axes2H; U.axes3H=axes3H; U.axes4H=axes4H;
%U.voicedIndex=voicedIndex; U.voicedH=voicedH;
%U.voicedY=voicedY; U.voicedTime=voicedTime;
%set(gcf, 'userData', U);
%uicontrol('string', 'Play all', 'callback', 'U=get(gcf, ''userData''); sound(U.y/(2^U.nbits/2), U.fs);');
%uicontrol('string', 'Play voiced', 'callback', 'U=get(gcf, ''userData''); sound(U.voicedY/(2^U.nbits/2), U.fs);', 'position', [100, 20, 60, 20]);
% Play the segmented sound
% head = soundSegment(1).beginFrame*(frameSize-overlap);
% tail = min(length(y), soundSegment(end).endFrame*(frameSize-overlap));
% thisY = y(head:tail);
% fprintf('Hit return to hear the segmented sound %g:', i);
% pause;
% fprintf('\n');
% wavplay(thisY, fs, 'sync');
% fprintf('\n');
end
% ====== Self demo
function selfdemo
waveFile='SingaporeIsAFinePlace.wav';
[y, fs, nbits]=wavReadInt(waveFile);
epdParam=epdParamSet(fs);
plotOpt=1;
out=feval(mfilename, y, fs, nbits, epdParam, plotOpt);
② buffer2.m
function out = buffer2(y, frameSize, overlap)
% buffer2: Frame blocking
% Usage: out = buffer2(y, frameSize, overlap)
% This is almost the same as "buffer" except that there are no leading/trailing zeros
% Roger Jang, 20010908
if nargin<3, overlap=0; end
if nargin<2, frameSize=256; end
y = y(:);
step = frameSize-overlap;
frameCount = floor((length(y)-overlap)/step);
out = zeros(frameSize, frameCount);
for i=1:frameCount,
startIndex = (i-1)*step+1;
out(:, i) = y(startIndex:(startIndex+frameSize-1));
end
Typical settings: frame length 256 and frame shift 128 (the frame shift is generally between 0 and 1/2 of the frame length). A minimal call with these settings is sketched below.
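Here is that minimal call to buffer2 (a sketch; y is the signal read in step 1):
frameSize=256;                            % frame length in samples
overlap=128;                              % so the frame shift is frameSize-overlap = 128
frameMat=buffer2(y, frameSize, overlap);  % one column per frame
size(frameMat)                            % [256, number of frames]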
The endpoint-detected signal y1 is then obtained as follows:
plotOpt1=0;
[endPoint, epInFrameIndex, soundSegment, zeroOneVec, volume]=epdByVolZcr(y, fs, nbits, [], plotOpt1);
y1=y(endPoint(1):endPoint(2));
Note: the value of plotOpt1 determines whether the figures are drawn; anyone who has read the epdByVolZcr() function carefully will understand this.
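The other two preprocessing steps named in step 2, pre-emphasis and windowed framing, can then be applied to y1 along these lines (a minimal sketch; the coefficient 0.97 and the Hamming window are common choices, not prescribed here):
y2=filter([1 -0.97], 1, y1);                             % pre-emphasis: s(n) - 0.97*s(n-1)
frameMat=buffer2(y2, 256, 128);                          % frame blocking with buffer2 above
win=hamming(256);                                        % Hamming window, one frame long
frameMat=frameMat .* repmat(win, 1, size(frameMat, 2));  % window every frame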
3. Next comes feature extraction, which you program according to the extraction method you choose.
The books by Professor Zhang Xueying recommended in 《情感语音信号入门解析》 are a good reference for extracting the common traditional acoustic features.
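As a simple starting point before turning to those references, the short-time energy and zero-crossing rate of each frame can be computed directly from the frameMat obtained above (a sketch, not a method taken from the book):
energy=sum(frameMat.^2, 1);                        % short-time energy per frame
zcr=sum(abs(diff(sign(frameMat), 1, 1)), 1)/2;     % zero-crossing count per frame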