尝试搭建本地语音转文字系统[2]

完成上面的步骤之后,我用格式工厂从一个约2小时的视频中导出音频并对其进行语音识别,结果报错,提示需要先转换为wav格式才能处理。这里我直接用格式工厂对音频做了格式转换,先完成核心代码的编写。

转换为wav格式后,再次运行仍然报错:
尝试搭建本地语音转文字系统[2]_第1张图片
这个错误很容易解决:将音频分片即可,这里参考了一篇博客。

代码参考

from paddlespeech.cli.asr.infer import ASRExecutor
import csv
# import moviepy.editor as mp
import auditok
import os
import paddle
from paddlespeech.cli.text.infer import TextExecutor

import soundfile
import librosa
import warnings

warnings.filterwarnings('ignore')

def qiefen(path, ty='audio', mmin_dur=1, mmax_dur=100000, mmax_silence=1, menergy_threshold=55):
    """Split an audio file into voiced regions and save each region as a WAV file.

    Regions are detected with ``auditok.split`` and written to
    ``change/<ty>/<stem>/`` (relative to the current working directory),
    where ``<stem>`` is the input file name without directory and extension.
    Each file is named ``<index>-<start>-<end>.wav``; the index is
    zero-padded to at least three digits.

    Args:
        path: Path of the audio file to split.
        ty: Name of the category sub-directory under ``change``.
        mmin_dur: Minimum duration of a valid audio event, in seconds.
        mmax_dur: Maximum duration of an event.
        mmax_silence: Maximum duration of tolerated continuous silence
            within an event.
        menergy_threshold: Energy threshold of detection.

    Returns:
        The output directory as a ``/``-separated string. The directory
        exists on return even when no region was detected.
    """
    # Use os.path to derive the stem: splitting on the first '.' returned an
    # empty or truncated name for inputs such as './a.wav' or 'rec.v2.wav'.
    file_pre = os.path.splitext(os.path.basename(path))[0]
    out_dir = '/'.join(('change', ty, file_pre))

    # The file is handed straight to auditok; it is no longer read a second
    # time into an unused in-memory array first.
    audio_regions = auditok.split(
        path,
        min_dur=mmin_dur,
        max_dur=mmax_dur,
        max_silence=mmax_silence,
        energy_threshold=menergy_threshold,
    )

    # Created once, before the loop, so the returned path is always valid
    # (previously an input without any region ended in an unbound-name error).
    os.makedirs(out_dir, exist_ok=True)

    for i, r in enumerate(audio_regions):
        # Regions returned by `split` have 'start' and 'end' metadata fields.
        print(
            "Region {i}: {r.meta.start:.3f}s -- {r.meta.end:.3f}s".format(i=i, r=r))

        # Pad the index but never truncate it: keeping only the last three
        # digits made region 1000 reuse the prefix '000'.
        # The '{meta...}' placeholders are left in the template on purpose;
        # they are passed through to r.save().
        file_save = out_dir + '/' + '{:03d}'.format(i) + \
            '-' + '{meta.start:.3f}-{meta.end:.3f}' + '.wav'
        filename = r.save(file_save)
        print("region saved as: {}".format(filename))
    return out_dir
# Speech-to-text.
# A single ASRExecutor is created at import time and reused for every segment.
asr_executor = ASRExecutor()


def _segment_index(filename):
    """Return the leading decimal number of *filename* as an int.

    Raises ValueError when the name does not start with a digit.
    """
    digits = ''
    for ch in filename:
        if ch not in '0123456789':
            break
        digits += ch
    return int(digits)


def audio2txt(path):
    """Run speech recognition on every file in *path*, in segment order.

    Files are processed in ascending order of the numeric prefix of their
    name (``000-...``, ``001-...``, ...). The whole prefix is parsed, not
    just its first three characters, so indices of 1000 and above keep
    their position.

    ``force_yes=True`` is passed to the executor; per the original note this
    forces the sample-rate conversion (PaddleSpeech works at 16000 Hz)
    instead of asking for manual confirmation.

    Args:
        path: Directory containing the audio segments.

    Returns:
        A list with one recognised text per file, in processing order.

    Raises:
        ValueError: If a file name in *path* has no numeric prefix.
    """
    print(f"path: {path}")
    # os.listdir gives no ordering guarantee, so sort by segment index.
    filelist = sorted(os.listdir(path), key=_segment_index)
    words = []
    for file in filelist:
        audio_path = path + '/' + file
        print(audio_path)
        text = asr_executor(
            audio_file=audio_path,
            device=paddle.get_device(), force_yes=True)
        words.append(text)
    return words

def txt2csv(txt_all):
    """Save the recognised texts to ``result.csv``, one text per row.

    The file is created in the current working directory (overwriting any
    previous one) and is encoded as UTF-8.

    Args:
        txt_all: Iterable of strings; each becomes a single-column row.
    """
    # newline='' lets the csv module write its own line endings; without it
    # every row is followed by an extra blank line on Windows.
    with open('result.csv', 'w', encoding='utf-8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows([row] for row in txt_all)


def add_punctuation(source_path='result.csv'):
    """Restore punctuation for the recognised text and write it to ``Final.txt``.

    All lines of *source_path* are concatenated (line breaks removed) and
    sent to ``TextExecutor`` in ONE call with ``task='punc'``. The raw text
    and the punctuated result are printed, and the result is written to
    ``Final.txt`` in the current working directory.

    Args:
        source_path: UTF-8 text/CSV file holding one recognised text per line.
    """
    # txt2csv writes result.csv as UTF-8, so read it back with that encoding
    # rather than the platform default.
    with open(source_path, 'r', encoding='utf-8') as f:
        texts = ''.join(line.replace('\n', '') for line in f)
    print(texts)

    if texts:
        text_executor = TextExecutor()
        result = text_executor(
            text=texts,
            task='punc',
            model='ernie_linear_p3_wudao',
            device=paddle.get_device(),
            # force_yes=True
        )
    else:
        # Nothing to punctuate: produce an empty output file instead of
        # failing on an unbound `result`.
        result = ''
    print(result)

    # The with-blocks close both files; no explicit close() is needed.
    with open("Final.txt", 'w', encoding='utf-8') as f_:
        f_.write(result)



if __name__ == '__main__':
    # Step 1: cut the recording into voiced segments.
    segment_dir = qiefen(
        path='E:/FFOutput/音频1.wav',
        ty='audio',
        mmin_dur=0.5,
        mmax_dur=50,
        mmax_silence=0.5,
        menergy_threshold=55,
    )
    # Step 2: speech-to-text on every segment (a GPU is recommended),
    # then store the texts in result.csv.
    txt2csv(audio2txt(segment_dir))
    # Step 3: restore punctuation and write Final.txt.
    add_punctuation()

期间在运行添加标点这一步时报错:

OSError: (External) CUBLAS error(1).
  [Hint: 'CUBLAS_STATUS_NOT_INITIALIZED'.  The cuBLAS library was not initialized. This is usually caused by the lack of a prior cublasCreate() call, an error in the CUDA Runtime API called by the cuBLAS routine, or an error in the hardware setup.  To correct: call cublasCreate() prior to the function call; and check that the hardware, an appropriate version of the driver, and the cuBLAS library are correctly installed.  ] (at ..\paddle\phi\backends\gpu\gpu_resources.cc:140)
  [operator < matmul_v2 > error]

查阅资料并多次尝试后,确认我遇到的这个问题是一次性送入的数据量太大导致的,因此对代码进行修改,改为分批读取、分批处理。最终代码如下:

from paddlespeech.cli.asr.infer import ASRExecutor
import csv
# import moviepy.editor as mp
import auditok
import os
import paddle
from paddlespeech.cli.text.infer import TextExecutor

import soundfile
import librosa
import warnings
import time
warnings.filterwarnings('ignore')

def qiefen(path, ty='audio', mmin_dur=1, mmax_dur=100000, mmax_silence=1, menergy_threshold=55):
    """Split an audio file into voiced regions and save each region as a WAV file.

    Regions are detected with ``auditok.split`` and written to
    ``change/<ty>/<stem>/`` (relative to the current working directory),
    where ``<stem>`` is the input file name without directory and extension.
    Each file is named ``<index>-<start>-<end>.wav``; the index is
    zero-padded to at least three digits.

    Args:
        path: Path of the audio file to split.
        ty: Name of the category sub-directory under ``change``.
        mmin_dur: Minimum duration of a valid audio event, in seconds.
        mmax_dur: Maximum duration of an event.
        mmax_silence: Maximum duration of tolerated continuous silence
            within an event.
        menergy_threshold: Energy threshold of detection.

    Returns:
        The output directory as a ``/``-separated string. The directory
        exists on return even when no region was detected.
    """
    # Use os.path to derive the stem: splitting on the first '.' returned an
    # empty or truncated name for inputs such as './a.wav' or 'rec.v2.wav'.
    file_pre = os.path.splitext(os.path.basename(path))[0]
    out_dir = '/'.join(('change', ty, file_pre))

    # The file is handed straight to auditok; it is no longer read a second
    # time into an unused in-memory array first.
    audio_regions = auditok.split(
        path,
        min_dur=mmin_dur,
        max_dur=mmax_dur,
        max_silence=mmax_silence,
        energy_threshold=menergy_threshold,
    )

    # Created once, before the loop, so the returned path is always valid
    # (previously an input without any region ended in an unbound-name error).
    os.makedirs(out_dir, exist_ok=True)

    for i, r in enumerate(audio_regions):
        # Regions returned by `split` have 'start' and 'end' metadata fields.
        print(
            "Region {i}: {r.meta.start:.3f}s -- {r.meta.end:.3f}s".format(i=i, r=r))

        # Pad the index but never truncate it: keeping only the last three
        # digits made region 1000 reuse the prefix '000'.
        # The '{meta...}' placeholders are left in the template on purpose;
        # they are passed through to r.save().
        file_save = out_dir + '/' + '{:03d}'.format(i) + \
            '-' + '{meta.start:.3f}-{meta.end:.3f}' + '.wav'
        filename = r.save(file_save)
        print("region saved as: {}".format(filename))
    return out_dir
# Speech-to-text.
# A single ASRExecutor is created at import time and reused for every segment.
asr_executor = ASRExecutor()


def _segment_index(filename):
    """Return the leading decimal number of *filename* as an int.

    Raises ValueError when the name does not start with a digit.
    """
    digits = ''
    for ch in filename:
        if ch not in '0123456789':
            break
        digits += ch
    return int(digits)


def audio2txt(path):
    """Run speech recognition on every file in *path*, in segment order.

    Files are processed in ascending order of the numeric prefix of their
    name (``000-...``, ``001-...``, ...). The whole prefix is parsed, not
    just its first three characters, so indices of 1000 and above keep
    their position.

    ``force_yes=True`` is passed to the executor; per the original note this
    forces the sample-rate conversion (PaddleSpeech works at 16000 Hz)
    instead of asking for manual confirmation.

    Args:
        path: Directory containing the audio segments.

    Returns:
        A list with one recognised text per file, in processing order.

    Raises:
        ValueError: If a file name in *path* has no numeric prefix.
    """
    print(f"path: {path}")
    # os.listdir gives no ordering guarantee, so sort by segment index.
    filelist = sorted(os.listdir(path), key=_segment_index)
    words = []
    for file in filelist:
        audio_path = path + '/' + file
        print(audio_path)
        text = asr_executor(
            audio_file=audio_path,
            device=paddle.get_device(), force_yes=True)
        words.append(text)
    return words

def txt2csv(txt_all):
    """Save the recognised texts to ``result.csv``, one text per row.

    The file is created in the current working directory (overwriting any
    previous one) and is encoded as UTF-8.

    Args:
        txt_all: Iterable of strings; each becomes a single-column row.
    """
    # newline='' lets the csv module write its own line endings; without it
    # every row is followed by an extra blank line on Windows.
    with open('result.csv', 'w', encoding='utf-8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows([row] for row in txt_all)


def add_punctuation(source_path='result.csv', batch_size=5):
    """Restore punctuation batch by batch and write the result to ``Final.txt``.

    Non-empty lines of *source_path* are concatenated in groups of
    *batch_size*; each group is sent to ``TextExecutor`` with
    ``task='punc'`` separately, so no single call has to process the whole
    transcript. Every full batch is printed before it is processed and is
    written to ``Final.txt`` followed by a newline; a trailing partial
    batch is written without one.

    Args:
        source_path: UTF-8 text/CSV file holding one recognised text per line.
        batch_size: Number of non-empty lines per punctuation call
            (default 5, the value that was previously hard-coded).

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # txt2csv writes result.csv as UTF-8, so read it back with that encoding
    # rather than the platform default.
    with open(source_path, 'r', encoding='utf-8') as f:
        lines = [line.replace('\n', '') for line in f]

    text_executor = TextExecutor()

    def punctuate(chunk):
        # Single place for the model call shared by full and partial batches.
        return text_executor(
            text=chunk,
            task='punc',
            model='ernie_linear_p3_wudao',
            device=paddle.get_device(),
            # force_yes=True
        )

    # The with-block closes the output file even if a model call raises.
    with open("Final.txt", "w", encoding='utf-8') as f_:
        texts = ''
        count = 0
        for line in lines:
            if not line:
                # Blank lines carry no text and do not count toward a batch.
                continue
            texts += line
            count += 1
            if count >= batch_size:
                print(texts)
                f_.write(punctuate(texts) + '\n')
                texts = ''
                count = 0
        if texts:
            # Leftover lines that did not fill a complete batch.
            f_.write(punctuate(texts))



if __name__ == '__main__':
    started = time.time()
    # Step 1: cut the recording into voiced segments.
    segment_dir = qiefen(
        path='E:/FFOutput/音频1.wav',
        ty='audio',
        mmin_dur=0.5,
        mmax_dur=50,
        mmax_silence=0.5,
        menergy_threshold=55,
    )
    # Step 2: speech-to-text on every segment (a GPU is recommended),
    # then store the texts in result.csv.
    txt2csv(audio2txt(segment_dir))
    # Step 3: restore punctuation and write Final.txt.
    add_punctuation()
    # Report the wall-clock time of the whole pipeline.
    elapsed = time.time() - started
    border = "#" * 10
    print(border + "Cost total time:{}s".format(elapsed) + border)

使用显卡
尝试搭建本地语音转文字系统[2]_第2张图片
处理1.48G,2h左右的录音文件,从开始到结束总共花费2703s
尝试搭建本地语音转文字系统[2]_第3张图片(在这里插入图片描述)
耗时还是比较久的,如果使用CPU的话会更慢。
但是优点在于

  1. 搭建好后在本地进行运行,保密性和可控性较好。
  2. 免费开源,不需要进行上传给语音识别平台,也不会受到平台的次数和时间限制。
  3. 效果不错,经检验准确率较高,质量有保障,比自己搭建的二流子项目稳定、质量高。
    至此,核心功能基本实现,工作量并不大。后续将学习Python项目封装,做一个在本地使用的GUI界面,或者在内网搭建一台服务器来运行服务,采用前后端分离的形式使用。
    未完待续
    挖个坑

你可能感兴趣的:(语音识别,语音识别,人工智能)