实现完上面的内容之后,我用格式工厂从一个时长约 2 小时的视频中导出音频,并对其进行语音识别。程序报错,提示需要先将音频转换为 wav 格式才能处理。这里我直接使用格式工厂完成格式转换,先把核心代码写完。
转换为 wav 格式后再次运行,仍然报错。
这个错误很容易解决:将长音频切分成若干片段,再逐段识别即可,这里参考了一篇博客。
参考代码如下:
from paddlespeech.cli.asr.infer import ASRExecutor
import csv
# import moviepy.editor as mp
import auditok
import os
import paddle
from paddlespeech.cli.text.infer import TextExecutor
import soundfile
import librosa
import warnings
warnings.filterwarnings('ignore')
'''
音频切分
'''
# 输入类别为audio
def qiefen(path, ty='audio', mmin_dur=1, mmax_dur=100000, mmax_silence=1, menergy_threshold=55):
    """Split an audio file into voiced regions and save each one as a WAV file.

    Regions are detected with ``auditok.split`` and written to
    ``change/<ty>/<source file stem>/`` as ``<index>-<start>-<end>.wav``.
    ``<index>`` is the region number zero-padded to at least three digits;
    ``<start>`` and ``<end>`` are the region bounds with three decimals.

    Args:
        path: Path of the audio file to split.
        ty: Name of the sub-directory created under ``change``.
        mmin_dur: Minimum duration of a valid audio event, in seconds.
        mmax_dur: Maximum duration of an event.
        mmax_silence: Maximum duration of tolerated continuous silence
            within an event.
        menergy_threshold: Detection threshold passed to auditok.

    Returns:
        The output directory as a ``/``-separated string. The directory
        exists on return even when no region was detected.
    """
    # The source file's stem names the output folder. os.path copes with
    # dots in directory or file names, which a plain split('.') does not.
    file_pre = os.path.splitext(os.path.basename(path))[0]
    out_dir = 'change' + '/' + ty + '/' + file_pre
    # Created once, before the loop, so the returned path is always valid.
    os.makedirs(out_dir, exist_ok=True)

    audio_regions = auditok.split(
        path,
        min_dur=mmin_dur,
        max_dur=mmax_dur,
        max_silence=mmax_silence,
        energy_threshold=menergy_threshold,
    )
    for i, r in enumerate(audio_regions):
        # Regions returned by `split` have 'start' and 'end' metadata fields.
        print(
            "Region {i}: {r.meta.start:.3f}s -- {r.meta.end:.3f}s".format(i=i, r=r))
        # The {meta.*} placeholders are left for r.save() to fill in.
        # The index is padded, never truncated, so region 1000 is not
        # mislabelled as 000.
        file_save = (out_dir + '/' + '{:03d}'.format(i)
                     + '-' + '{meta.start:.3f}-{meta.end:.3f}' + '.wav')
        filename = r.save(file_save)
        print("region saved as: {}".format(filename))
    return out_dir
'''
语音转文本
直接调用ASRExecutor进行语音到文本转换。
需要注意的是,此处 force_yes=True, 即强行进行音频频率转换,PaddleSpeech使用16000hz频率。如force_yes=False,则需要手动确认
'''
# Module-level ASRExecutor instance, created once and reused by audio2txt
# for every audio chunk.
asr_executor = ASRExecutor()
def audio2txt(path):
    """Transcribe every WAV chunk in a directory, in playback order.

    Args:
        path: Directory holding the chunk files, e.g. the directory
            returned by ``qiefen``.

    Returns:
        A list with one recognition result per chunk, in processing order.
    """
    print(f"path: {path}")
    # Only WAV chunks are transcribed; any other file in the directory
    # would otherwise break the numeric sort key below.
    filelist = [name for name in os.listdir(path)
                if name.lower().endswith('.wav')]

    def start_time(name):
        # Chunk names look like '<index>-<start>-<end>.wav'.
        # Return <start> as a float, or None for any other name.
        fields = os.path.splitext(name)[0].split('-')
        if len(fields) < 3:
            return None
        try:
            return float(fields[1])
        except ValueError:
            return None

    starts = {name: start_time(name) for name in filelist}
    if all(value is not None for value in starts.values()):
        # Chronological order stays correct even when there are more
        # chunks than a three-digit index can distinguish.
        filelist.sort(key=starts.__getitem__)
    else:
        # Fallback: order by the three-digit numeric prefix.
        filelist.sort(key=lambda x: int(os.path.splitext(x)[0][:3]))

    device = paddle.get_device()
    words = []
    for file in filelist:
        audio_path = path + '/' + file
        print(audio_path)
        # force_yes=True accepts the sample-rate conversion without an
        # interactive confirmation (see the note above this function).
        text = asr_executor(
            audio_file=audio_path, device=device, force_yes=True)
        words.append(text)
    return words
'''
保存
'''
def txt2csv(txt_all):
    """Write the recognised texts to ``result.csv``, one text per row.

    Args:
        txt_all: Iterable of texts; each becomes a single-column CSV row.
    """
    # newline='' leaves line endings to the csv module. Without it, Windows
    # text mode turns the writer's '\r\n' into '\r\r\n', which shows up as a
    # blank line after every row.
    with open('result.csv', 'w', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows([row] for row in txt_all)
#增加标点
#
# 拿到新生成的音频的路径
def add_punctuation(source_path='result.csv'):
    """Restore punctuation for the recognised text and write ``Final.txt``.

    All CSV rows are concatenated and sent to the punctuation model in a
    single call. ``Final.txt`` is written as UTF-8 and is empty when the
    CSV holds no text.

    Args:
        source_path: CSV file with one recognised text per row, as written
            by ``txt2csv``.
    """
    # Read with the encoding the CSV was written in (UTF-8), and through
    # the csv module so quoted fields are unquoted and blank rows skipped.
    with open(source_path, 'r', encoding='utf-8', newline='') as f:
        texts = ''.join(row[0] for row in csv.reader(f) if row)
    print(texts)

    result = ''
    if texts:
        # The model is only loaded when there is something to punctuate.
        text_executor = TextExecutor()
        result = text_executor(
            text=texts,
            task='punc',
            model='ernie_linear_p3_wudao',
            device=paddle.get_device(),
        )
    print(result)
    with open("Final.txt", 'w', encoding='utf-8') as f_:
        f_.write(result)
if __name__ == '__main__':
    # Pipeline: split the recording, transcribe each chunk, store the
    # texts as CSV, then add punctuation.
    source_path = 'E:/FFOutput/音频1.wav'
    split_options = dict(
        mmin_dur=0.5, mmax_dur=50, mmax_silence=0.5, menergy_threshold=55)
    chunk_dir = qiefen(path=source_path, ty='audio', **split_options)
    # Speech-to-text; the original note says this step needs a GPU.
    transcripts = audio2txt(chunk_dir)
    txt2csv(transcripts)
    add_punctuation()
期间遇到运行添加标点时出错:
OSError: (External) CUBLAS error(1).
[Hint: 'CUBLAS_STATUS_NOT_INITIALIZED'. The cuBLAS library was not initialized. This is usually caused by the lack of a prior cublasCreate() call, an error in the CUDA Runtime API called by the cuBLAS routine, or an error in the hardware setup. To correct: call cublasCreate() prior to the function call; and check that the hardware, an appropriate version of the driver, and the cuBLAS library are correctly installed. ] (at ..\paddle\phi\backends\gpu\gpu_resources.cc:140)
[operator < matmul_v2 > error]
查阅资料并多次尝试后,确认我遇到的这个问题是由于一次性送入标点模型的文本量太大导致的。因此对代码进行修改:将识别结果分批读取(每 5 行一批),再逐批添加标点。最终代码如下:
from paddlespeech.cli.asr.infer import ASRExecutor
import csv
# import moviepy.editor as mp
import auditok
import os
import paddle
from paddlespeech.cli.text.infer import TextExecutor
import soundfile
import librosa
import warnings
import time
warnings.filterwarnings('ignore')
'''
音频切分
'''
# 输入类别为audio
def qiefen(path, ty='audio', mmin_dur=1, mmax_dur=100000, mmax_silence=1, menergy_threshold=55):
    """Split an audio file into voiced regions and save each one as a WAV file.

    Regions are detected with ``auditok.split`` and written to
    ``change/<ty>/<source file stem>/`` as ``<index>-<start>-<end>.wav``.
    ``<index>`` is the region number zero-padded to at least three digits;
    ``<start>`` and ``<end>`` are the region bounds with three decimals.

    Args:
        path: Path of the audio file to split.
        ty: Name of the sub-directory created under ``change``.
        mmin_dur: Minimum duration of a valid audio event, in seconds.
        mmax_dur: Maximum duration of an event.
        mmax_silence: Maximum duration of tolerated continuous silence
            within an event.
        menergy_threshold: Detection threshold passed to auditok.

    Returns:
        The output directory as a ``/``-separated string. The directory
        exists on return even when no region was detected.
    """
    # The source file's stem names the output folder. os.path copes with
    # dots in directory or file names, which a plain split('.') does not.
    file_pre = os.path.splitext(os.path.basename(path))[0]
    out_dir = 'change' + '/' + ty + '/' + file_pre
    # Created once, before the loop, so the returned path is always valid.
    os.makedirs(out_dir, exist_ok=True)

    audio_regions = auditok.split(
        path,
        min_dur=mmin_dur,
        max_dur=mmax_dur,
        max_silence=mmax_silence,
        energy_threshold=menergy_threshold,
    )
    for i, r in enumerate(audio_regions):
        # Regions returned by `split` have 'start' and 'end' metadata fields.
        print(
            "Region {i}: {r.meta.start:.3f}s -- {r.meta.end:.3f}s".format(i=i, r=r))
        # The {meta.*} placeholders are left for r.save() to fill in.
        # The index is padded, never truncated, so region 1000 is not
        # mislabelled as 000.
        file_save = (out_dir + '/' + '{:03d}'.format(i)
                     + '-' + '{meta.start:.3f}-{meta.end:.3f}' + '.wav')
        filename = r.save(file_save)
        print("region saved as: {}".format(filename))
    return out_dir
'''
语音转文本
直接调用ASRExecutor进行语音到文本转换。
需要注意的是,此处 force_yes=True, 即强行进行音频频率转换,PaddleSpeech使用16000hz频率。如force_yes=False,则需要手动确认
'''
# Module-level ASRExecutor instance, created once and reused by audio2txt
# for every audio chunk.
asr_executor = ASRExecutor()
def audio2txt(path):
    """Transcribe every WAV chunk in a directory, in playback order.

    Args:
        path: Directory holding the chunk files, e.g. the directory
            returned by ``qiefen``.

    Returns:
        A list with one recognition result per chunk, in processing order.
    """
    print(f"path: {path}")
    # Only WAV chunks are transcribed; any other file in the directory
    # would otherwise break the numeric sort key below.
    filelist = [name for name in os.listdir(path)
                if name.lower().endswith('.wav')]

    def start_time(name):
        # Chunk names look like '<index>-<start>-<end>.wav'.
        # Return <start> as a float, or None for any other name.
        fields = os.path.splitext(name)[0].split('-')
        if len(fields) < 3:
            return None
        try:
            return float(fields[1])
        except ValueError:
            return None

    starts = {name: start_time(name) for name in filelist}
    if all(value is not None for value in starts.values()):
        # Chronological order stays correct even when there are more
        # chunks than a three-digit index can distinguish.
        filelist.sort(key=starts.__getitem__)
    else:
        # Fallback: order by the three-digit numeric prefix.
        filelist.sort(key=lambda x: int(os.path.splitext(x)[0][:3]))

    device = paddle.get_device()
    words = []
    for file in filelist:
        audio_path = path + '/' + file
        print(audio_path)
        # force_yes=True accepts the sample-rate conversion without an
        # interactive confirmation (see the note above this function).
        text = asr_executor(
            audio_file=audio_path, device=device, force_yes=True)
        words.append(text)
    return words
'''
保存
'''
def txt2csv(txt_all):
    """Write the recognised texts to ``result.csv``, one text per row.

    Args:
        txt_all: Iterable of texts; each becomes a single-column CSV row.
    """
    # newline='' leaves line endings to the csv module. Without it, Windows
    # text mode turns the writer's '\r\n' into '\r\r\n', which shows up as a
    # blank line after every row.
    with open('result.csv', 'w', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows([row] for row in txt_all)
#增加标点
#
# 拿到新生成的音频的路径
def add_punctuation(source_path='result.csv', batch_size=5):
    """Restore punctuation batch by batch and write the result to ``Final.txt``.

    Non-empty CSV rows are concatenated ``batch_size`` at a time and each
    batch is punctuated on its own, so no single model call receives the
    whole transcript. Every full batch is written followed by a newline; a
    trailing partial batch is written without one. ``Final.txt`` is UTF-8.

    Args:
        source_path: CSV file with one recognised text per row, as written
            by ``txt2csv``.
        batch_size: Number of non-empty rows joined into one model call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Read with the encoding the CSV was written in (UTF-8), and through
    # the csv module so quoted fields are unquoted and blank rows skipped.
    with open(source_path, 'r', encoding='utf-8', newline='') as f:
        lines = [row[0] for row in csv.reader(f) if row and row[0]]

    text_executor = TextExecutor()
    device = paddle.get_device()
    # The context manager closes Final.txt even if a model call raises.
    with open("Final.txt", "w", encoding='utf-8') as f_:
        for begin in range(0, len(lines), batch_size):
            batch = lines[begin:begin + batch_size]
            texts = ''.join(batch)
            is_full_batch = len(batch) == batch_size
            if is_full_batch:
                print(texts)
            result = text_executor(
                text=texts,
                task='punc',
                model='ernie_linear_p3_wudao',
                device=device,
            )
            f_.write(result + '\n' if is_full_batch else result)
if __name__ == '__main__':
    # Pipeline: split the recording, transcribe each chunk, store the
    # texts as CSV, add punctuation, then report the wall-clock time.
    started_at = time.time()
    source_path = 'E:/FFOutput/音频1.wav'
    split_options = dict(
        mmin_dur=0.5, mmax_dur=50, mmax_silence=0.5, menergy_threshold=55)
    chunk_dir = qiefen(path=source_path, ty='audio', **split_options)
    # Speech-to-text; the original note says this step needs a GPU.
    transcripts = audio2txt(chunk_dir)
    txt2csv(transcripts)
    add_punctuation()
    elapsed = time.time() - started_at
    banner = "#" * 10
    print(banner + "Cost total time:{}s".format(elapsed) + banner)
使用显卡(GPU)运行:
处理一个 1.48 GB、时长约 2 小时的录音文件,从开始到结束总共耗时 2703 秒(约 45 分钟)。
耗时还是比较久的,如果使用CPU的话会更慢。
但是优点在于