git clone https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
cd vits-uma-genshin-honkai
conda create -n vits python=3.8
conda activate vits
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt
requirements.txt 中指定的 gradio 版本可能存在兼容性问题;如果后续运行时报错,可以改为安装旧版本的 gradio:
pip install gradio==3.17.0
python app.py --device cuda
import os
import gradio as gr
import utils
import argparse
import commons
from models import SynthesizerTrn
from text import text_to_sequence
import torch
from torch import no_grad, LongTensor
import soundfile as sf
import json
import tqdm
def get_text(text, hps):
    """Turn ``text`` into a ``LongTensor`` of symbol ids plus its cleaned form.

    Args:
        text: The string to convert.
        hps: Hyper-parameter object; ``hps.symbols``,
            ``hps.data.text_cleaners`` and ``hps.data.add_blank`` are read.

    Returns:
        A ``(text_norm, clean_text)`` pair: the id sequence wrapped in a
        ``LongTensor`` and the second value returned by ``text_to_sequence``.
    """
    sequence, cleaned = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Presumably interleaves the blank id 0 between symbols -- see
        # commons.intersperse for the exact layout.
        sequence = commons.intersperse(sequence, 0)
    return LongTensor(sequence), cleaned
def vits(text, language, speaker_id, noise_scale, noise_scale_w, length_scale, hps_ms, device, speakers, net_g_ms):
    """Synthesize speech for ``text`` and return ``(sample_rate, audio)``.

    Args:
        text: Text to synthesize. Newlines, carriage returns and spaces are
            stripped before synthesis.
        language: ``0`` wraps the text in ``[ZH]`` tags, ``1`` in ``[JA]``
            tags; any other value leaves the text untagged.
        speaker_id: Speaker index; wrapped in a ``LongTensor`` and passed to
            the model as ``sid``.
        noise_scale: Forwarded to ``net_g_ms.infer``.
        noise_scale_w: Forwarded to ``net_g_ms.infer``.
        length_scale: Forwarded to ``net_g_ms.infer``.
        hps_ms: Hyper-parameters handed to ``get_text``.
        device: Device the input tensors are moved to.
        speakers: Unused; kept so existing callers keep working.
        net_g_ms: Model object exposing ``infer``.

    Returns:
        ``(22050, audio)`` where ``audio`` is the ``[0][0, 0]`` slice of the
        model output converted to a float NumPy array. The sample rate is
        hard-coded.

    Raises:
        ValueError: If ``text`` is empty, or if it is longer than 100
            characters while the ``SYSTEM`` environment variable equals
            ``"spaces"``.
    """
    import time

    # The failure paths used to return a 3-tuple while success returns a
    # 2-tuple, so callers unpacking two values crashed with an unrelated
    # message. Raise with the real reason instead.
    if not text:
        raise ValueError("输入文本不能为空!")

    text = text.replace('\n', ' ').replace('\r', '').replace(" ", "")

    # `limitation` was previously an undefined name in this scope (NameError
    # for any text over 100 characters); derive it here from the environment.
    limitation = os.getenv("SYSTEM") == "spaces"
    if limitation and len(text) > 100:
        raise ValueError(f"输入文字过长!{len(text)}>100")

    if language == 0:
        text = f"[ZH]{text}[ZH]"
    elif language == 1:
        text = f"[JA]{text}[JA]"

    started = time.perf_counter()
    stn_tst, _clean_text = get_text(text, hps_ms)
    with no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
        sid = LongTensor([speaker_id]).to(device)
        output = net_g_ms.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )
        audio = output[0][0, 0].data.cpu().float().numpy()
    print(f"cost: {time.perf_counter() - started} s")
    return 22050, audio
def tts_model_init(model_dir="./model", device='cuda', checkpoint_name='G_953000.pth'):
    """Build the synthesizer, load its checkpoint and return what ``vits`` needs.

    Args:
        model_dir: Directory containing ``config.json`` and the checkpoint.
        device: Device string passed to ``torch.device``.
        checkpoint_name: File name of the generator checkpoint inside
            ``model_dir``. Defaults to the previously hard-coded name.

    Returns:
        ``(hps_ms, device, speakers, net_g_ms)``: the loaded hyper-parameters,
        the ``torch.device``, ``hps_ms.speakers`` and the model in eval mode
        on ``device``.
    """
    device = torch.device(device)
    hps_ms = utils.get_hparams_from_file(os.path.join(model_dir, 'config.json'))
    net_g_ms = SynthesizerTrn(
        len(hps_ms.symbols),
        hps_ms.data.filter_length // 2 + 1,
        hps_ms.train.segment_size // hps_ms.data.hop_length,
        n_speakers=hps_ms.data.n_speakers,
        **hps_ms.model)
    net_g_ms.eval().to(device)
    speakers = hps_ms.speakers
    # Only the side effect on net_g_ms matters here; the values returned by
    # load_checkpoint are not used by this script.
    utils.load_checkpoint(os.path.join(model_dir, checkpoint_name), net_g_ms, None)
    return hps_ms, device, speakers, net_g_ms
def main():
    """Parse ``--device``, synthesize one sample sentence and save it as WAV."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--device', type=str, default='cuda')
    cli_args = arg_parser.parse_args()

    hps_ms, device, speakers, net_g_ms = tts_model_init(device=cli_args.device)

    # Positional arguments after the text: language 0, speaker id 103,
    # noise_scale 0.1, noise_scale_w 0.668, length_scale 1.2.
    sample_rate, waveform = vits(
        '可莉不知道哦,但是可莉真的很需要你。',
        0,
        torch.tensor([103]),
        0.1,
        0.668,
        1.2,
        hps_ms,
        device,
        speakers,
        net_g_ms,
    )
    sf.write('output_file.wav', waveform, samplerate=sample_rate)


if __name__ == '__main__':
    main()
参考文章: