Python对语音进行分帧,过零率、短时能量、FFT计算,端点检测

import wave
import numpy as np
import matplotlib.pyplot as plt


def get_audio_data(file_path):
    """Load a WAV file and peak-normalize its samples.

    Args:
        file_path: Path (or file object) accepted by ``wave.open``.

    Returns:
        A ``(time, audio_data)`` tuple. ``time`` holds one timestamp in
        seconds per frame (``nframes`` entries spaced ``1 / framerate``
        apart). ``audio_data`` holds the samples as floats scaled so that the
        largest magnitude is 1.0; an all-zero or empty recording is returned
        unscaled instead of producing NaNs or raising.

    Note:
        The raw bytes are always decoded as ``np.short`` samples, regardless
        of the sample width stored in the file, and the decoded array is not
        split per channel, so for multi-channel files ``audio_data`` can be
        longer than ``time``.
    """
    # The context manager closes the file even if reading fails.
    with wave.open(file_path, "rb") as f:
        nframes = f.getnframes()
        framerate = f.getframerate()
        str_data = f.readframes(nframes)

    # Convert to float before taking abs(): abs(-32768) overflows in int16
    # and would yield a wrong (negative) peak.
    audio_data = np.frombuffer(str_data, dtype=np.short).astype(np.float64)
    peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
    if peak > 0:
        audio_data = audio_data / peak

    time = np.arange(0, nframes) * (1.0 / framerate)
    return time, audio_data


def cut_signal(sig, nw=512, inc=128):
    """Slice a 1-D signal into overlapping, Hann-windowed frames.

    Args:
        sig: Sequence of samples.
        nw: Frame length in samples.
        inc: Hop size in samples between the starts of consecutive frames.

    Returns:
        A 2-D array of shape ``(frame_count, nw)``. Each row is one frame
        multiplied element-wise by ``np.hanning(nw)``. The signal is
        zero-padded at the end so that the last frame is complete.
    """
    total = len(sig)
    if total <= nw:
        # A signal no longer than one frame still yields one (padded) frame.
        frame_count = 1
    else:
        frame_count = int(np.ceil((1.0 * total - nw + inc) / inc))

    # Zero-pad the tail so that the final frame contains nw samples.
    padded_len = int((frame_count - 1) * inc + nw)
    padded = np.pad(sig, (0, padded_len - total), 'constant')

    # Row r of the index grid selects samples r*inc ... r*inc + nw - 1.
    offsets = np.arange(0, nw)
    starts = np.arange(0, frame_count * inc, inc)
    grid = (starts[:, np.newaxis] + offsets[np.newaxis, :]).astype(np.int32)

    # The window broadcasts across all rows of the frame matrix.
    return padded[grid] * np.hanning(nw)


def fft(audio_data):
    """Compute the one-sided FFT of every frame.

    Args:
        audio_data: 2-D array-like of shape ``(frame_count, frame_length)``.

    Returns:
        Complex array of shape ``(frame_count, frame_length // 2 + 1)``; row
        ``i`` is ``np.fft.rfft`` of frame ``i``.
    """
    # A single call transforms every row; unlike a per-row Python loop it
    # also keeps the 2-D shape when there are zero frames.
    return np.fft.rfft(np.asarray(audio_data), axis=1)


def compute_zcr(cut_sig):
    """Count thresholded zero crossings in each frame.

    A crossing is counted between two neighbouring samples when their product
    is negative and their difference exceeds 0.02 in magnitude.

    Args:
        cut_sig: 2-D array of shape ``(frame_count, frame_length)``.

    Returns:
        Float array of shape ``(frame_count, 1)`` holding the crossing count
        of every frame.
    """
    earlier = cut_sig[:, :-1]
    later = cut_sig[:, 1:]

    sign_flip = earlier * later < 0
    # Noise also wobbles around zero, but with much smaller steps than
    # speech, so tiny sign changes are ignored.
    big_step = np.abs(earlier - later) > 0.02

    counts = np.sum(sign_flip & big_step, axis=1, keepdims=True)
    return counts.astype(np.float64)


def compute_amp(cut_sig):
    """Return the summed absolute sample value of every frame.

    This is used as the short-time energy measure; note that it sums
    magnitudes, not squared values.

    Args:
        cut_sig: 2-D array of shape ``(frame_count, frame_length)``.

    Returns:
        1-D array with one value per frame.
    """
    magnitudes = np.abs(cut_sig)
    return magnitudes.sum(axis=1)


def point_check(cut_sig):
    """Detect speech segments with a double-threshold (amplitude + ZCR) scan.

    Per-frame amplitude and zero-crossing counts are compared against a low
    and a high threshold derived from the signal itself. A frame above a high
    threshold starts a segment; the segment continues while frames stay above
    a low threshold and ends once enough below-threshold frames accumulate.

    Args:
        cut_sig: 2-D array of frames, shape ``(frame_count, frame_length)``.

    Returns:
        Flat list of frame indices ``[start_0, end_0, start_1, end_1, ...]``.
        Candidate segments shorter than ``min_audio`` frames are discarded.
        Empty if there are no frames.
    """
    zcr = np.ravel(compute_zcr(cut_sig))  # one scalar per frame
    amp = compute_amp(cut_sig)
    sig_length = len(zcr)
    if sig_length == 0:
        # No frames: thresholds cannot be derived and nothing can be found.
        return []

    zcr_low = max(np.round(np.mean(zcr) * 0.1), 3)  # low ZCR threshold
    zcr_high = max(np.round(np.max(zcr) * 0.1), 5)  # high ZCR threshold
    amp_candidates = [np.min(amp) * 10, np.mean(amp) * 0.2, np.max(amp) * 0.1]
    amp_low = min(amp_candidates)  # low amplitude threshold
    amp_high = max(amp_candidates)  # high amplitude threshold

    max_slice = 8   # below-threshold frames tolerated before a segment ends
    min_audio = 16  # minimum number of voiced frames for a valid segment
    status = 0      # 0: silence, 1: transition, 2: speech
    hold_time = 0   # frames counted towards the current candidate segment
    slice_time = 0  # below-threshold frames seen inside the segment
    start_point = 0
    points = []
    for i in range(sig_length):
        if status == 0 or status == 1:
            if amp[i] > amp_high or zcr[i] > zcr_high:
                # Definitely speech: the segment starts where the
                # transition frames began.
                start_point = i - hold_time
                status = 2
                hold_time += 1
                slice_time = 0
            elif amp[i] > amp_low or zcr[i] > zcr_low:
                status = 1
                hold_time += 1
            else:
                status = 0
                hold_time = 0
        elif status == 2:
            if amp[i] > amp_low or zcr[i] > zcr_low:
                hold_time += 1
            else:
                # NOTE(review): slice_time is not reset when speech resumes,
                # so it counts all gap frames in the segment, not only
                # consecutive ones - confirm this is intended.
                slice_time += 1
                if slice_time < max_slice and i < sig_length - 1:
                    hold_time += 1
                elif (hold_time - slice_time) < min_audio:
                    # Too short to be speech: treat it as noise.
                    status = 0
                    hold_time = 0
                    slice_time = 0
                else:
                    points.append(start_point)
                    points.append(start_point + hold_time - slice_time)
                    # Reset the counters so the next segment's start is not
                    # computed from this segment's stale hold_time.
                    status = 0
                    hold_time = 0
                    slice_time = 0

    # A segment still open when the input ends (the last frame was voiced)
    # would otherwise be dropped; close it with the same length rule.
    if status == 2 and (hold_time - slice_time) >= min_audio:
        points.append(start_point)
        points.append(start_point + hold_time - slice_time)

    return points


def plt_audio(time, y, points=None, inc=128, framerate=16000):
    """Plot a waveform and mark endpoints with red vertical lines.

    Args:
        time: X-axis values for the samples in ``y``.
        y: Sample values to plot.
        points: Optional iterable of frame indices to mark. ``None`` (the
            default) draws no markers.
        inc: Hop size in samples used to convert a frame index to a sample
            offset. Defaults to 128.
        framerate: Sampling rate in Hz used to convert a sample offset to
            seconds. Defaults to 16000.
    """
    if points is None:
        # Avoid a shared mutable default argument.
        points = []
    # NOTE(review): plt.flag() only switches the default colormap to "flag";
    # plt.figure() may have been intended - confirm.
    plt.flag()
    plt.plot(time, y)
    for p in points:
        # Frame index -> seconds on the time axis.
        plt.axvline(p * inc / framerate, c='r')
    plt.show()
```

你可能感兴趣的:(Python对语音进行分帧,过零率、短时能量、FFT计算,端点检测)