VITS: Deploying the VITS Speech Model

Set Up the Local Environment

git clone https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
cd vits-uma-genshin-honkai

  • Create a conda environment and activate it:
conda create -n vits python=3.8
conda activate vits
  • Install PyTorch (for other versions, see https://pytorch.org/get-started/previous-versions; a quick sanity check follows this list):
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
  • Install the remaining dependencies:
pip install -r requirements.txt
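
Before moving on, it is worth confirming that PyTorch was installed with working CUDA support. A minimal check (generic PyTorch, nothing specific to this repo):

import torch

print(torch.__version__)          # should report a cu118 build
print(torch.cuda.is_available())  # False means the cu118 wheel does not match your driver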

The gradio version pulled in by requirements.txt is known to be problematic; if you run into errors later when running the app, install an older version of gradio instead:

pip install gradio==3.17.0
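
If you are unsure which gradio version ended up in the environment, you can check it directly (a generic check, not specific to this project):

import gradio
print(gradio.__version__)  # expect 3.17.0 after the downgrade above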

Run

python app.py --device cuda

Code

import os
import time
import argparse

import torch
from torch import no_grad, LongTensor
import soundfile as sf

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence

# On HuggingFace Spaces this flag limits input length; locally it is False
limitation = os.getenv("SYSTEM") == "spaces"


def get_text(text, hps):
    # Convert raw text into the symbol-id sequence the model expects,
    # using the text cleaners declared in config.json
    text_norm, clean_text = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave a blank token (id 0) between symbols, matching training
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm, clean_text


def vits(text, language, speaker_id, noise_scale, noise_scale_w, length_scale, hps_ms, device, speakers, net_g_ms):
    if not len(text):
        raise ValueError("Input text must not be empty!")
    text = text.replace('\n', ' ').replace('\r', '').replace(" ", "")
    if len(text) > 100 and limitation:
        raise ValueError(f"Input text too long! {len(text)}>100")
    # Tag the text so the multilingual cleaner knows which language to process
    if language == 0:
        text = f"[ZH]{text}[ZH]"
    elif language == 1:
        text = f"[JA]{text}[JA]"
    else:
        text = f"{text}"
    t1 = time.time()
    stn_tst, clean_text = get_text(text, hps_ms)
    with no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
        sid = LongTensor([speaker_id]).to(device)
        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                               length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
    print(f"cost: {time.time() - t1} s")
    # The bundled checkpoints generate audio at 22050 Hz
    return 22050, audio


def tts_model_init(model_dir="./model", device='cuda'):
    device = torch.device(device)

    # Hyperparameters come from the config.json shipped with the model
    hps_ms = utils.get_hparams_from_file(os.path.join(model_dir, 'config.json'))
    net_g_ms = SynthesizerTrn(
        len(hps_ms.symbols),
        hps_ms.data.filter_length // 2 + 1,
        hps_ms.train.segment_size // hps_ms.data.hop_length,
        n_speakers=hps_ms.data.n_speakers,
        **hps_ms.model)
    _ = net_g_ms.eval().to(device)
    speakers = hps_ms.speakers

    # Load the generator weights; optimizer state is not needed for inference
    _ = utils.load_checkpoint(os.path.join(model_dir, 'G_953000.pth'), net_g_ms, None)

    return hps_ms, device, speakers, net_g_ms

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cuda')
    args = parser.parse_args()

    hps_ms, device, speakers, net_g_ms = tts_model_init(device=args.device)
    # language=0 selects Chinese; 103 is one of the speaker ids in the bundled speaker list
    sr, audio = vits('可莉不知道哦,但是可莉真的很需要你。', 0, 103, 0.1, 0.668, 1.2, hps_ms, device, speakers, net_g_ms)
    sf.write('output_file.wav', audio, samplerate=sr)


if __name__ == '__main__':
    main()
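
Because tts_model_init() loads the model once while vits() only runs inference, the pair is easy to reuse for batch synthesis; the __main__ guard above also makes the script safe to import. A minimal sketch, assuming the script above is saved as vits_infer.py (that file name and the input lines below are illustrative):

import soundfile as sf
from vits_infer import tts_model_init, vits  # assumed file name for the script above

lines = [
    '可莉不知道哦,但是可莉真的很需要你。',
    '今天天气真好。',
]

hps_ms, device, speakers, net_g_ms = tts_model_init(device='cuda')
for i, line in enumerate(lines):
    # language=0 selects Chinese; speaker id 103 matches main() above
    sr, audio = vits(line, 0, 103, 0.1, 0.668, 1.2, hps_ms, device, speakers, net_g_ms)
    sf.write(f'output_{i}.wav', audio, samplerate=sr)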
