在Unity中通过WebAPI实现语音合成(TTS)

在Unity中使用科大讯飞语音合成WebAPI实现语音合成

直接上代码!

using System.Collections;
using System.Collections.Generic;
using System.Text;
using UnityEngine;
using System.Net;
using System.Net.WebSockets;
using System.Net.Sockets;
using System;
using System.Security.Cryptography;
using System.Threading;
using UnityEngine.UI;
using System.Threading.Tasks;

namespace Webiat
{
    /// <summary>
    /// Unity component that converts text to speech through the iFlytek (Xunfei) streaming
    /// text-to-speech WebSocket API and stores the result as the clip of an <see cref="AudioSource"/>.
    /// </summary>
    /// <remarks>
    /// Playback is not started here: await <see cref="StartTTS"/> and then call
    /// <c>AudioSource.Play()</c> on the source that was passed in.
    /// NOTE(review): the clip is created after an await; presumably these methods are started from
    /// Unity's main thread so the continuation returns there - confirm for other call sites.
    /// </remarks>
    public class TTSCore : MonoBehaviour
    {
        // Placeholder credentials: replace them with the values issued for your own application.
        // NOTE(review): credentials compiled into a client build can be extracted; consider loading them from configuration.
        private static string app_id = "APPID";
        private static string api_secret = "APISECRET";
        private static string api_key = "APIKEY";

        /// <summary>WebSocket endpoint of the synthesis service, without authentication parameters.</summary>
        public string Uri = "wss://tts-api.xfyun.cn/v2/tts";

        // Value app_id holds until it has been configured. While it is unchanged, the default baked
        // into TTSPara.common is sent, which keeps projects that only edited TTSPara working.
        private const string AppIdPlaceholder = "APPID";

        // Sample rate of the returned PCM; must agree with TTSPara.business.auf ("audio/L16;rate=16000").
        private const int SampleRate = 16000;

        // Size of the buffer handed to each ReceiveAsync call; one message may span several reads.
        private const int ReceiveBufferSize = 4096;

        // Response data.status value that marks the last audio chunk.
        private const int FinalStatus = 2;

        /// <summary>Socket of the most recently started synthesis session.</summary>
        [HideInInspector]
        public ClientWebSocket ttsWebSocket;

        #region ---------------------- Request authentication ----------------------
        /// <summary>
        /// Builds the authenticated request URL: signs the host, the current UTC date and the
        /// request line with HMAC-SHA256 and appends the result as query parameters.
        /// </summary>
        /// <param name="uriStr">Endpoint to connect to, e.g. <c>wss://tts-api.xfyun.cn/v2/tts</c>.</param>
        /// <returns>The endpoint followed by URL-encoded <c>authorization</c>, <c>date</c> and <c>host</c> parameters.</returns>
        public string GetUrl(string uriStr)
        {
            // System.Uri is spelled out because the simple name "Uri" resolves to the string field above in expressions.
            System.Uri uri = new System.Uri(uriStr);
            string host = uri.Host;
            string date = DateTime.UtcNow.ToString("R", System.Globalization.CultureInfo.InvariantCulture);

            // The request line is derived from the endpoint so the signature always matches the URL that is returned.
            string signature_origin = $"host: {host}\ndate: {date}\nGET {uri.AbsolutePath} HTTP/1.1";
            string signature = hmacsha256(signature_origin, api_secret);

            string authorization_origin = $"api_key=\"{api_key}\", algorithm=\"hmac-sha256\", headers=\"host date request-line\", signature=\"{signature}\"";
            string authorization = Convert.ToBase64String(Encoding.UTF8.GetBytes(authorization_origin));

            // Base64 may contain '+', '/' and '=', and the date contains spaces, commas and colons:
            // every value has to be percent-encoded or the server can read a different value than was signed.
            return $"{uri.GetLeftPart(UriPartial.Path)}"
                + $"?authorization={System.Uri.EscapeDataString(authorization)}"
                + $"&date={System.Uri.EscapeDataString(date)}"
                + $"&host={System.Uri.EscapeDataString(host)}";
        }

        /// <summary>
        /// Computes the Base64-encoded HMAC-SHA256 of a UTF-8 string.
        /// </summary>
        /// <param name="signature_origion">Text to sign.</param>
        /// <param name="secrect">Secret key.</param>
        /// <returns>Base64 representation of the digest.</returns>
        private string hmacsha256(string signature_origion, string secrect)
        {
            using (HMACSHA256 mac = new HMACSHA256(Encoding.UTF8.GetBytes(secrect)))
            {
                return Convert.ToBase64String(mac.ComputeHash(Encoding.UTF8.GetBytes(signature_origion)));
            }
        }
        #endregion

        #region ---------------------- WebSocket session ----------------------

        /// <summary>
        /// Starts a synthesis session, aborting the previous one if its socket is still around.
        /// </summary>
        /// <param name="text">Text to synthesize.</param>
        /// <param name="audioSource">Audio source that receives the generated clip.</param>
        /// <returns>A task that completes once the clip has been assigned or the session has ended.</returns>
        /// <remarks>
        /// Returns a <see cref="Task"/> so callers can await completion before playing the clip.
        /// Failures are logged here as well, so fire-and-forget callers still see them in the console.
        /// </remarks>
        public async Task StartTTS(string text, AudioSource audioSource)
        {
            if (ttsWebSocket != null)
            {
                // Only one session at a time: cancel whatever is still in flight.
                ttsWebSocket.Abort();
            }

            try
            {
                await ConnectTTSWebSocket(text, audioSource);
            }
            catch (Exception ex)
            {
                Debug.LogException(ex, this);
                throw;
            }
        }

        /// <summary>
        /// Runs one synthesis session: connects, sends the request, collects the streamed audio and
        /// assigns it to <paramref name="audioSource"/> as a mono clip when the final chunk arrives.
        /// </summary>
        /// <param name="text">Text to synthesize.</param>
        /// <param name="audioSource">Audio source that receives the generated clip.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> or <paramref name="audioSource"/> is null.</exception>
        public async Task ConnectTTSWebSocket(string text, AudioSource audioSource)
        {
            if (text == null)
            {
                throw new ArgumentNullException(nameof(text));
            }
            if (audioSource == null)
            {
                throw new ArgumentNullException(nameof(audioSource));
            }

            CancellationToken ct = CancellationToken.None;

            // Samples are kept per session so an aborted session cannot leak audio into the next one.
            List<float> samples = new List<float>();

            // The session works on its own local socket: the public field may be replaced by a newer session.
            using (ClientWebSocket socket = new ClientWebSocket())
            {
                ttsWebSocket = socket;
                await socket.ConnectAsync(new System.Uri(GetUrl(Uri)), ct);

                // Sent as a single binary frame, as this client has always done.
                byte[] request = Encoding.UTF8.GetBytes(BuildRequestJson(text));
                await socket.SendAsync(new ArraySegment<byte>(request), WebSocketMessageType.Binary, true, ct);

                byte[] buffer = new byte[ReceiveBufferSize];
                using (System.IO.MemoryStream message = new System.IO.MemoryStream())
                {
                    while (socket.State == WebSocketState.Open)
                    {
                        WebSocketReceiveResult received = await socket.ReceiveAsync(new ArraySegment<byte>(buffer), ct);
                        if (received.MessageType == WebSocketMessageType.Close)
                        {
                            break;
                        }

                        // Accumulate raw bytes and decode only complete messages, so a multi-byte
                        // UTF-8 sequence split across two reads is not corrupted.
                        message.Write(buffer, 0, received.Count);
                        if (!received.EndOfMessage)
                        {
                            continue;
                        }

                        string json = Encoding.UTF8.GetString(message.GetBuffer(), 0, (int)message.Length);
                        message.SetLength(0);

                        CompositionData response = JsonUtility.FromJson<CompositionData>(json);
                        if (response == null)
                        {
                            continue;
                        }
                        if (response.code != 0)
                        {
                            Debug.LogError("TTS request failed: code=" + response.code + ", message=" + response.message);
                            break;
                        }

                        CompositionData.Data chunk = response.data;
                        if (chunk == null)
                        {
                            continue;
                        }

                        Debug.Log("返回的数据内容data:" + JsonUtility.ToJson(chunk));
                        if (!string.IsNullOrEmpty(chunk.audio))
                        {
                            samples.AddRange(bytesToFloat(Convert.FromBase64String(chunk.audio)));
                        }

                        if (chunk.status == FinalStatus)
                        {
                            ApplyClip(audioSource, samples);
                            break;
                        }
                    }
                }
            }
        }

        /// <summary>Serializes the request for <paramref name="text"/>; the text is sent Base64-encoded.</summary>
        private static string BuildRequestJson(string text)
        {
            TTSPara.common common = new TTSPara.common();
            if (!string.IsNullOrEmpty(app_id) && app_id != AppIdPlaceholder)
            {
                // Use the id configured on this component rather than the default in TTSPara.common.
                common.app_id = app_id;
            }

            TTSPara.business business = new TTSPara.business();
            TTSPara.data data = new TTSPara.data(Convert.ToBase64String(Encoding.UTF8.GetBytes(text)));
            return JsonUtility.ToJson(new TTSPara.TTSParameter(common, business, data));
        }

        /// <summary>Creates a mono clip from the collected samples and assigns it to the audio source.</summary>
        private static void ApplyClip(AudioSource audioSource, List<float> samples)
        {
            if (samples.Count == 0)
            {
                Debug.LogWarning("TTS finished without returning any audio; the AudioSource clip was left unchanged.");
                return;
            }

            // With a single channel the number of samples equals the number of collected floats.
            AudioClip clip = AudioClip.Create("tts", samples.Count, 1, SampleRate, false);
            clip.SetData(samples.ToArray(), 0);
            audioSource.clip = clip;
        }

        #region ---------------------- Data definitions and conversion ----------------------
        /// <summary>
        /// Converts 16-bit little-endian PCM bytes into the float samples expected by <c>AudioClip.SetData</c>.
        /// </summary>
        /// <param name="byteArray">Raw PCM bytes; a trailing odd byte is ignored.</param>
        /// <returns>One float in the range [-1, 1) for every pair of bytes.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="byteArray"/> is null.</exception>
        public static float[] bytesToFloat(byte[] byteArray)
        {
            if (byteArray == null)
            {
                throw new ArgumentNullException(nameof(byteArray));
            }

            float[] sounddata = new float[byteArray.Length / 2];
            for (int i = 0; i < sounddata.Length; i++)
            {
                sounddata[i] = bytesToFloat(byteArray[i * 2], byteArray[i * 2 + 1]);
            }
            return sounddata;
        }

        /// <summary>Combines two bytes (low byte first) into one sample in the range [-1, 1).</summary>
        static float bytesToFloat(byte firstByte, byte secondByte)
        {
            // The byte order is a property of the stream, not of the host CPU, and shifting is
            // independent of host endianness, so the low byte always comes first.
            short s = (short)((secondByte << 8) | firstByte);
            return s / 32768.0F;
        }

        /// <summary>Response message of the synthesis service, as read by <c>JsonUtility</c>.</summary>
        [Serializable]
        public class CompositionData
        {
            /// <summary>Audio payload of one response message.</summary>
            [Serializable]
            public class Data
            {
                /// <summary>Progress marker; 2 marks the final chunk.</summary>
                public int status;

                /// <summary>Base64-encoded audio bytes of this chunk.</summary>
                public string audio;
            }

            /// <summary>Result code reported by the service; 0 means success.</summary>
            public int code;

            /// <summary>Human-readable description that accompanies <see cref="code"/>.</summary>
            public string message;

            public Data data;
        }
        #endregion

        #endregion
    }
}


using System;
namespace TTSPara
{
    /// <summary>
    /// Root object of a synthesis request; groups the three sections that are serialized together.
    /// </summary>
    [Serializable]
    public class TTSParameter
    {
        public common common;
        public business business;
        public data data;

        /// <summary>Creates a request made of default sections and a placeholder text.</summary>
        public TTSParameter()
            : this(new common(), new business(), new data("这里是默认参数"))
        {
        }

        /// <summary>Creates a request from the given sections.</summary>
        public TTSParameter(common common, business business, data data)
        {
            this.common = common;
            this.business = business;
            this.data = data;
        }
    }

    /// <summary>Section that identifies the calling application.</summary>
    [Serializable]
    public class common
    {
        public string app_id;

        public common()
        {
            app_id = "de01ae2d";
        }
    }

    /// <summary>Section with the synthesis options.</summary>
    [Serializable]
    public class business
    {
        public string aue;
        public string auf;
        public string vcn;
        public int speed;
        public int volume;

        // Further optional fields that were left disabled in the original:
        // sfl (int), pitch (int), bgs (int), tte (string), reg (string), rdn (string).

        /// <summary>Fills in the option values this client sends by default.</summary>
        public business()
        {
            aue = "raw";
            auf = "audio/L16;rate=16000";
            vcn = "catherine";
            speed = 25;
            volume = 50;
        }
    }

    /// <summary>Section that carries the text to synthesize.</summary>
    [Serializable]
    public class data
    {
        public string text;
        public int status;

        /// <summary>Creates the section without a text.</summary>
        public data()
            : this(null)
        {
        }

        /// <summary>Creates the section for the given text; status is always 2.</summary>
        public data(string text)
        {
            this.text = text;
            status = 2;
        }
    }
}

最重要的就是这两个脚本了,可以直接复制使用!至于如何控制播放,大家可以自行决定;不过调用StartTTS时建议使用await等待合成完成,否则音频尚未生成就开始播放会出错。下面我再给大家一个调用示例吧!

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System;
using Webiat;

public TTSCore TTSCore;// Attach the TTSCore script above to an object in the scene and assign it here
public AudioSource _ttsTalkAudioSource;// Must be assigned as well


/// <summary>
/// Synthesizes <c>resTxt</c> into <c>_ttsTalkAudioSource</c> and starts playback once the awaited call has finished.
/// </summary>
/// <remarks>
/// NOTE(review): <c>resTxt</c> is not declared in this snippet; presumably a string member of the enclosing class.
/// NOTE(review): the await only compiles if <c>TTSCore.StartTTS</c> returns an awaitable such as <c>Task</c> - confirm its declaration.
/// </remarks>
public async void TTSPlay()
{
	await TTSCore.StartTTS(resTxt, _ttsTalkAudioSource);
	_ttsTalkAudioSource.Play();
}
/// <summary>
/// Variant that additionally runs a callback once the synthesized clip has finished playing.
/// </summary>
/// <param name="action">Callback invoked after the clip's duration has elapsed; pass null to skip it.</param>
/// <remarks>
/// NOTE(review): <c>resTxt</c> is not declared in this snippet; presumably a string member of the enclosing class.
/// NOTE(review): the await only compiles if <c>TTSCore.StartTTS</c> returns an awaitable such as <c>Task</c> - confirm its declaration.
/// </remarks>
public async void TTSPlay(Action action=null)
{
	await TTSCore.StartTTS(resTxt, _ttsTalkAudioSource);
	_ttsTalkAudioSource.Play();
	if (action == null)
	{
		return;
	}

	AudioClip clip = _ttsTalkAudioSource.clip;
	if (clip == null)
	{
		// No audio was produced, so there is nothing to wait for; run the callback right away
		// so callers that depend on it are not left hanging.
		Debug.LogWarning("TTSPlay: no clip is assigned, invoking the callback immediately.");
		action();
		return;
	}

	float clipLength = clip.length;
	Debug.Log("合成音频时长:"+clipLength);

	// Either way of delaying works. Option 1 takes seconds, option 2 a TimeSpan.

	// 1. Coroutine: invoke the callback after the clip's duration.
	StartCoroutine(DelayToInvoke.DelayToInvokeDo(action, clipLength));

	// 2. Asynchronous wait (needs "using System.Threading.Tasks;"):
	// await Task.Delay(TimeSpan.FromSeconds(clipLength));
	// action();
}
using UnityEngine;
using System.Collections;
using System;
/// <summary>
/// Helper that provides a coroutine for running a callback after a delay.
/// </summary>
public class DelayToInvoke : MonoBehaviour
{
    /// <summary>
    /// Coroutine body: waits <paramref name="delaySeconds"/> seconds and then invokes <paramref name="action"/>.
    /// </summary>
    /// <param name="action">Callback to run after the delay; a null callback is ignored.</param>
    /// <param name="delaySeconds">Delay in seconds, passed to <c>WaitForSeconds</c>.</param>
    /// <returns>An enumerator to pass to <c>StartCoroutine</c>.</returns>
    public static IEnumerator DelayToInvokeDo(Action action, float delaySeconds)
    {
        yield return new WaitForSeconds(delaySeconds);

        // Null-safe: a missing callback must not throw inside the coroutine.
        action?.Invoke();
    }
}

好了,如果对你有帮助的话,不要忘记点赞+收藏!Thanks~

你可能感兴趣的:(unity,游戏引擎,c#)