python -m pip install --upgrade pip
# ssh登录系统
# 切换到root用户
mkdir /opt/tools/
cd /opt/tools/
# 安装miniconda
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod +x Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh
#按提示操作,安装目录建议选择/opt/miniconda3
#创建软链接
ln -s /opt/miniconda3/bin/conda /usr/local/bin/conda
#退出shell并重新登录,然后再进行后续操作
#创建环境
conda create -n whisper python=3.9
conda activate whisper
pip install -U openai-whisper
或者
pip install git+https://github.com/openai/whisper.git
或者
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple openai-whisper
pip install tiktoken
pip install setuptools-rust
#在conda whisper环境外执行,安装ffmpeg
sudo apt update && sudo apt install ffmpeg
whisper audio.mp3 --model medium --language Chinese
代码调用
import whisper
import arrow
def excute(model_name, file_path, start_time):
    """Transcribe an audio file and print each segment with wall-clock times.

    Args:
        model_name: Model name passed to ``whisper.load_model``.
        file_path: Path of the audio file to transcribe.
        start_time: Time at which the recording started, in a form accepted
            by ``arrow.get``; each segment's offset in seconds is added to it.
    """
    model = whisper.load_model(model_name)
    result = model.transcribe(file_path)
    # The recording start is the same for every segment, so parse it once.
    recording_start = arrow.get(start_time)
    time_format = "YYYY-MM-DD HH:mm:ss"
    for segment in result["segments"]:
        start = recording_start.shift(seconds=segment["start"]).format(time_format)
        end = recording_start.shift(seconds=segment["end"]).format(time_format)
        print(f"【{start}->{end}】:{segment['text']}")
if __name__ == '__main__':
    # Example run: "base" model, file 1001.mp3, recording start time
    # 2022-10-24 16:23:00.
    excute("base", "1001.mp3", "2022-10-24 16:23:00")
import os
import whisper
from pyannote.audio import Pipeline
from pyannote_whisper.utils import diarize_text
import concurrent.futures
# Read the Hugging Face access token from the HF_TOKEN environment variable
# rather than committing a secret to the file; None is passed when it is unset.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
# Directory that receives the generated transcript files.
output_dir = '/root/autodl-tmp/pyannote-whisper'
def process_audio(file_path):
    """Transcribe one audio file, attach speaker labels, and save the result.

    Transcribes ``file_path`` with the Whisper "large" model, runs the
    module-level diarization ``pipeline`` on the same file, combines both
    results with ``diarize_text`` and writes one
    ``start end speaker sentence`` line per entry to
    ``<output_dir>/<audio base name>.txt``.

    Args:
        file_path: Path of the audio file to process.
    """
    # NOTE(review): the model is loaded on every call; consider loading it
    # once and sharing it if start-up time or memory becomes a problem.
    model = whisper.load_model("large")
    asr_result = model.transcribe(file_path, initial_prompt="语音转换")
    diarization_result = pipeline(file_path)
    final_result = diarize_text(asr_result, diarization_result)
    # splitext removes the actual extension instead of assuming 3 characters.
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_dir, stem + '.txt')
    # Explicit UTF-8 so non-ASCII transcripts do not depend on the locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        for seg, spk, sent in final_result:
            f.write(f'{seg.start:.2f} {seg.end:.2f} {spk} {sent}\n')
# Create the output directory when it does not exist yet.
os.makedirs(output_dir, exist_ok=True)
wave_dir = '/root/autodl-tmp/pyannote-whisper'
# Collect the paths of all .wav files located directly in wave_dir.
wav_files = [os.path.join(wave_dir, name) for name in os.listdir(wave_dir) if name.endswith('.wav')]
# Process the files on a pool of three worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    futures = {executor.submit(process_audio, path): path for path in wav_files}
    for future in concurrent.futures.as_completed(futures):
        # executor.map() hides worker exceptions unless its results are
        # consumed, so report every failure explicitly and keep going.
        error = future.exception()
        if error is not None:
            print(f'Failed to process {futures[future]}: {error!r}')
print('处理完成!')
报错ModuleNotFoundError: No module named 'pyannote'
pip install pyannote.audio
报错No module named 'pyannote_whisper'
如果你使用AutoDL平台,你可以使用学术代理
加速
source /etc/network_turbo
git clone https://github.com/yinruiqing/pyannote-whisper.git
在项目里面写代码就可以了,或者复制代码里面的pyannote_whisper.utils模块代码
官网
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
sh Miniconda3-latest-Linux-x86_64.sh
source ~/.bashrc
conda create -n funasr python=3.8
conda activate funasr
pip3 install torch torchaudio
如果您的环境中存在CUDA,您应该安装与CUDA匹配的版本的pytorch。匹配列表可以在docs中找到。
从 pip 安装
pip3 install -U funasr
# 对于中国的用户,您可以使用以下命令进行安装:
# pip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
或者从源码安装funASR
git clone https://github.com/alibaba/FunASR.git && cd FunASR
pip3 install -e ./
如果您想使用 ModelScope 中的预训练模型,您应该安装 modelscope:
pip3 install -U modelscope
# 对于中国的用户,您可以使用以下命令进行安装:
# pip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple
通过 modelscope-sdk 将模型下载到本地目录
from modelscope.hub.snapshot_download import snapshot_download

# Directory used as the download cache for the model snapshot.
local_dir_root = "./models_from_modelscope"
# snapshot_download returns a value that is kept as the local model directory.
model_dir = snapshot_download(
    'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    cache_dir=local_dir_root,
)
或者通过 git lfs 将模型下载到本地目录
git lfs install
# git clone https://www.modelscope.cn/<namespace>/<model-name>.git
git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git
使用本地模型路径进行推断
# Imported here so the snippet runs on its own: without them `Tasks` is
# undefined and `pipeline` may still refer to an unrelated earlier object.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Local directory of the model used for inference.
local_dir_root = "./models_from_modelscope/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model=local_dir_root,
)
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
# Build the speech-recognition pipeline from a model id, pinned to a revision.
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    model_revision="v1.2.4",
)
rec_result = inference_pipeline(audio_in='1001.wav')
print(rec_result['sentences'])
# Save the full result to result.txt, then echo it to stdout as well.
with open('result.txt', 'w', encoding='utf-8') as f:
    print(rec_result, file=f)
print(rec_result)
pip install pyannote.audio
https://huggingface.co/settings/tokens
import os

from pyannote.audio import Pipeline

# Read the Hugging Face access token from the HF_TOKEN environment variable
# rather than committing a secret to the file; None is passed when it is unset.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
# send pipeline to GPU (when available)
import torch
if torch.cuda.is_available():
    pipeline.to(torch.device("cuda"))
# apply pretrained pipeline
diarization = pipeline("1002.wav")
print(diarization)
# print the result
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")
# start=0.2s stop=1.5s speaker_0
# start=1.8s stop=3.9s speaker_1
# start=4.2s stop=5.7s speaker_0
# ...
from pyannote.core import Segment, Annotation, Timeline
def get_text_with_timestamp(transcribe_res):
    """Pair every transcription segment's text with its time span.

    Args:
        transcribe_res: Mapping with a ``'segments'`` list whose items provide
            ``'start'``, ``'end'`` and ``'text'`` keys.

    Returns:
        A list of ``(Segment(start, end), text)`` tuples in input order.
    """
    timestamp_texts = [
        (Segment(item['start'], item['end']), item['text'])
        for item in transcribe_res['segments']
    ]
    # Debug output kept from the original implementation.
    print(timestamp_texts)
    return timestamp_texts
def get_text_with_timestampFun(transcribe_res):
    """Pair every recognised sentence's text with its time span.

    The ``'start'`` and ``'end'`` values are divided by 1000 before the
    ``Segment`` is built (presumably milliseconds to seconds — confirm
    against the recogniser's output format).

    Args:
        transcribe_res: Mapping with a ``'sentences'`` list whose items
            provide ``'start'``, ``'end'`` and ``'text'`` keys.

    Returns:
        A list of ``(Segment(start, end), text)`` tuples in input order.
    """
    # Debug output kept from the original implementation.
    print(transcribe_res['sentences'])
    return [
        (Segment(item['start'] / 1000.0, item['end'] / 1000.0), item['text'])
        for item in transcribe_res['sentences']
    ]
def add_speaker_info_to_text(timestamp_texts, ann):
    """Attach a speaker label to every ``(segment, text)`` pair.

    For each segment the label is ``ann.crop(seg).argmax()``; per the
    original note this is the speaker that dominates the segment in the
    diarization result.

    Args:
        timestamp_texts: Iterable of ``(segment, text)`` pairs.
        ann: Diarization result exposing ``crop(segment)``, whose return
            value exposes ``argmax()``.

    Returns:
        A list of ``(segment, speaker, text)`` tuples in input order.
    """
    return [(seg, ann.crop(seg).argmax(), text) for seg, text in timestamp_texts]
def merge_cache(text_cache):
    """Collapse cached ``(segment, speaker, text)`` entries into one entry.

    Args:
        text_cache: Non-empty sequence of ``(segment, speaker, text)`` tuples.

    Returns:
        ``(Segment(start, end), speaker, sentence)`` where ``start`` comes
        from the first entry, ``end`` from the last entry, ``speaker`` from
        the first entry and ``sentence`` is all texts concatenated.
    """
    first_entry = text_cache[0]
    last_entry = text_cache[-1]
    sentence = ''.join(item[-1] for item in text_cache)
    return Segment(first_entry[0].start, last_entry[0].end), first_entry[1], sentence
PUNC_SENT_END = ['.', '?', '!', '。', '?', '!']
def merge_sentence(spk_text):
merged_spk_text = []
pre_spk = None
text_cache = []
for seg, spk, text in spk_text:
if spk != pre_spk and pre_spk is not None and len(text_cache) > 0:
merged_spk_text.append(merge_cache(text_cache))
text_cache = [(seg, spk, text)]
pre_spk = spk
elif text[-1] in PUNC_SENT_END:
text_cache.append((seg, spk, text))
merged_spk_text.append(merge_cache(text_cache))
text_cache = []
pre_spk = spk
else:
text_cache.append((seg, spk, text))
pre_spk = spk
if len(text_cache) > 0:
merged_spk_text.append(merge_cache(text_cache))
return merged_spk_text
def diarize_text(transcribe_res, diarization_result):
    """Combine a transcription result with a diarization result.

    Extracts timed texts with ``get_text_with_timestampFun``, labels them
    with ``add_speaker_info_to_text`` and merges them with ``merge_sentence``.

    Args:
        transcribe_res: Recognition result containing a ``'sentences'`` list.
        diarization_result: Diarization result used to pick the speaker for
            each timed text.

    Returns:
        The list returned by ``merge_sentence``.
    """
    timestamp_texts = get_text_with_timestampFun(transcribe_res)
    spk_text = add_speaker_info_to_text(timestamp_texts, diarization_result)
    return merge_sentence(spk_text)
def write_to_txt(spk_sent, file):
    """Write speaker-labelled sentences to a text file, one per line.

    Each line has the form ``start end speaker sentence`` with ``start`` and
    ``end`` formatted to two decimals.

    Args:
        spk_sent: Iterable of ``(segment, speaker, sentence)`` tuples whose
            segment exposes ``start`` and ``end``.
        file: Path of the file to write; an existing file is overwritten.
    """
    # Explicit UTF-8 so non-ASCII sentences do not depend on the locale.
    with open(file, 'w', encoding='utf-8') as fp:
        for seg, spk, sentence in spk_sent:
            fp.write(f'{seg.start:.2f} {seg.end:.2f} {spk} {sentence}\n')
import os
import whisper
from pyannote.audio import Pipeline
from pyannote_funasr.utils import diarize_text
import concurrent.futures
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
# Output location
output_dir = '/root/autodl-tmp/pyannote-whisper'
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
# Speech-to-text model, built from a model id pinned to a revision.
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    model_revision="v1.2.4",
)
# rec_result = inference_pipeline(audio_in='1002.wav')
# with open('result.txt', 'w', encoding='utf-8') as f:
# print(rec_result, file=f)
# # print(rec_result)
def process_audio(file_path):
    """Recognise speech in one audio file, attach speakers, save the result.

    Runs the module-level ``inference_pipeline`` on ``file_path``, runs a
    pyannote speaker-diarization pipeline (fixed to two speakers) on the same
    file, combines both results with ``diarize_text`` and writes one
    ``start end speaker sentence`` line per entry to
    ``<output_dir>/<audio base name>.txt``. Progress markers and every
    written line are also printed.

    Args:
        file_path: Path of the audio file to process.
    """
    print("----------1")
    asr_result = inference_pipeline(audio_in=file_path)
    print("-----------2.2")
    # Speaker-diarization pipeline. The local name differs from `pipeline`
    # so it does not shadow the ModelScope factory of that name. The token
    # comes from the HF_TOKEN environment variable instead of being committed.
    # NOTE(review): the pipeline is rebuilt on every call; consider creating
    # it once and sharing it.
    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization",
        use_auth_token=os.environ.get("HF_TOKEN"),
    )
    # Use the GPU only when one is actually available.
    import torch
    if torch.cuda.is_available():
        diarization_pipeline.to(torch.device("cuda"))
    # num_speakers is the number of speakers; it may be omitted.
    diarization_result = diarization_pipeline(file_path, num_speakers=2)
    # Diarization result
    print(diarization_result)
    # Assign speakers to the recognised text.
    final_result = diarize_text(asr_result, diarization_result)
    print("-----------5")
    # Write the result; splitext removes the actual extension instead of
    # assuming it is three characters long.
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_dir, stem + '.txt')
    # Explicit UTF-8 so non-ASCII transcripts do not depend on the locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        for seg, spk, sent in final_result:
            line = f'{seg.start:.2f} {seg.end:.2f} {spk} {sent}\n'
            f.write(line)
            print(line)
# Create the output directory when it does not exist yet.
os.makedirs(output_dir, exist_ok=True)
wave_dir = '/root/autodl-tmp/pyannote-whisper'
# Collect the paths of all .wav files located directly in wave_dir.
wav_files = [os.path.join(wave_dir, name) for name in os.listdir(wave_dir) if name.endswith('.wav')]
# Process the files on a thread pool with the default number of workers.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(process_audio, path): path for path in wav_files}
    for future in concurrent.futures.as_completed(futures):
        # executor.map() hides worker exceptions unless its results are
        # consumed, so report every failure explicitly and keep going.
        error = future.exception()
        if error is not None:
            print(f'Failed to process {futures[future]}: {error!r}')
print('处理完成!')
微调.py
import os
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.msdatasets.audio.asr_dataset import ASRDataset
def modelscope_finetune(params):
    """Fine-tune an ASR model with the ModelScope trainer.

    Creates ``params.output_dir`` if needed, loads the dataset named by
    ``params.data_path`` and trains with ``Trainers.speech_asr_trainer``.

    Args:
        params: Object providing ``model``, ``data_path``, ``dataset_type``,
            ``output_dir``, ``batch_bins``, ``max_epoch`` and ``lr``.
    """
    # exist_ok=True already covers an existing directory, so no prior check.
    os.makedirs(params.output_dir, exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = ASRDataset.load(params.data_path, namespace='speech_asr')
    kwargs = dict(
        model=params.model,
        data_dir=ds_dict,
        dataset_type=params.dataset_type,
        work_dir=params.output_dir,
        batch_bins=params.batch_bins,
        max_epoch=params.max_epoch,
        lr=params.lr,
    )
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()
if __name__ == '__main__':
    from funasr.utils.modelscope_param import modelscope_args
    params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
    # Where the model is saved.
    params.output_dir = "./checkpoint"
    # Data path: either a dataset uploaded to ModelScope or local data.
    params.data_path = "speech_asr_aishell1_trainsets"
    # Use "small" for small datasets; use "large" above 1000 hours of data.
    params.dataset_type = "small"
    # Batch size: in fbank feature frames when dataset_type="small",
    # in milliseconds when dataset_type="large".
    params.batch_bins = 2000
    # Maximum number of training epochs.
    params.max_epoch = 50
    # Learning rate.
    params.lr = 0.00005
    modelscope_finetune(params)