Unity 讯飞实时语音转写(一)—— 使用WebSocket连接讯飞语音服务器
Unity 讯飞实时语音转写(二)—— 接收转写结果
Unity 讯飞实时语音转写(三)—— 分析转写结果
一、官网示例
在官方文档中,有一段说了转写结果的json字符串大概长什么样子
{
"action":"result",
"code":"0",
"data":"{\"cn\":{\"st\":{\"bg\":\"820\",\"ed\":\"0\",\"rt\":[{\"ws\":[{\"cw\":[{\"w\":\"啊\",\"wp\":\"n\"}],\"wb\":0,\"we\":0},{\"cw\":[{\"w\":\"喂\",\"wp\":\"n\"}],\"wb\":0,\"we\":0},{\"cw\":[{\"w\":\"!\",\"wp\":\"p\"}],\"wb\":0,\"we\":0},{\"cw\":[{\"w\":\"你好\",\"wp\":\"n\"}],\"wb\":0,\"we\":0},{\"cw\":[{\"w\":\"!\",\"wp\":\"p\"}],\"wb\":0,\"we\":0},{\"cw\":[{\"w\":\"我\",\"wp\":\"n\"}],\"wb\":0,\"we\":0},{\"cw\":[{\"w\":\"是\",\"wp\":\"n\"}],\"wb\":0,\"we\":0},{\"cw\":[{\"w\":\"上\",\"wp\":\"n\"}],\"wb\":0,\"we\":0}]}],\"type\":\"1\"}},\"seg_id\":5}\n",
"desc":"success",
"sid":"rta0000000e@ch312c0e3f6bcc9f0900"
}
经过测试,我自己转写的结果
接收消息:{
"action":"result",
"code":"0",
"data":"{
\"seg_id\":7,
\"cn\":{
\"st\":{
\"rt\":[
{\"ws\":[
{\"cw\":[{\"w\":\"我们\",\"wp\":\"n\"}],\"wb\":23,\"we\":70},
{\"cw\":[{\"w\":\"生活\",\"wp\":\"n\"}],\"wb\":71,\"we\":118},
{\"cw\":[{\"w\":\"的\",\"wp\":\"n\"}],\"wb\":119,\"we\":130},
{\"cw\":[{\"w\":\"世界\",\"wp\":\"n\"}],\"wb\":131,\"we\":172},
{\"cw\":[{\"w\":\"里\",\"wp\":\"n\"}],\"wb\":173,\"we\":201},
{\"cw\":[{\"w\":\"有\",\"wp\":\"n\"}],\"wb\":202,\"we\":226},
{\"cw\":[{\"w\":\"两\",\"wp\":\"n\"}],\"wb\":227,\"we\":249},
{\"cw\":[{\"w\":\"个\",\"wp\":\"n\"}],\"wb\":250,\"we\":263},
{\"cw\":[{\"w\":\"世界\",\"wp\":\"n\"}
],
\"wb\":264,
\"we\":320}
]}
],
\"bg\":\"5120\",
\"type\":\"0\",
\"ed\":\"8520\"
}
},
\"ls\":false
}",
"desc":"success",
"sid":"xxxxxxxxxxxxxxxxxxxxxxxxxx"
}
发现实际转写结果有一些细微差别,例如:增加了一个字段“ls”,尚不知道是干嘛的。不过,既然接收到了该字段,就说明以后可能会用到,先在代码中添加上就好了。
二、关键数据结构
/// <summary>
/// Top-level envelope of one server message; the wire format is JSON.
/// </summary>
[Serializable]
public struct JsonData
{
    /// <summary>
    /// Message type: "started" = handshake OK, "result" = transcription result,
    /// "error" = failure.
    /// </summary>
    public string action;
    /// <summary>
    /// Result code ("0" on success; see the vendor's error-code table otherwise).
    /// </summary>
    public string code;
    /// <summary>
    /// Transcription payload.
    /// NOTE(review): per the official API, "data" arrives as a JSON-encoded
    /// *string*; mapping it straight onto a struct relies on the parser
    /// unwrapping it — confirm this round-trips with JsonUtility.
    /// </summary>
    public Data data;
    /// <summary>
    /// Human-readable description of the result.
    /// </summary>
    public string desc;
    /// <summary>
    /// Session ID, mainly for debugging; provide it to vendor support to
    /// trace a failing session.
    /// </summary>
    public string sid;
}
结果格式为json,字段说明如下:
参数 | 类型 | 说明 |
---|---|---|
action | string | 结果标识,started:握手,result:结果,error:异常 |
code | string | 结果码(具体见错误码) |
data | string | 转写结果 |
desc | string | 描述 |
sid | string | 会话ID |
/// <summary>
/// Transcription payload of one "result" message.
/// </summary>
[Serializable]
public struct Data
{
    /// <summary>
    /// Sequence number of this result segment, starting from 0.
    /// </summary>
    public string seg_id;
    [Serializable]
    public struct CN
    {
        [Serializable]
        public struct ST
        {
            [Serializable]
            public struct RT
            {
                [Serializable]
                public class WS
                {
                    [Serializable]
                    public class CW
                    {
                        /// <summary>
                        /// Recognized word.
                        /// </summary>
                        public string w;
                        /// <summary>
                        /// Word kind: n = normal word, s = filler word, p = punctuation.
                        /// </summary>
                        public string wp;
                    }
                    // Candidate words for this slot.
                    public CW[] cw;
                    /// <summary>
                    /// Word start time within the sentence, in frames (1 frame = 10 ms),
                    /// so the absolute start is (bg + wb*10) ms. 0 for intermediate results.
                    /// NOTE(review): the sample JSON carries wb/we as numbers (e.g. 23),
                    /// not strings — confirm the parser fills these string fields.
                    /// </summary>
                    public string wb;
                    /// <summary>
                    /// Word end time within the sentence, in frames (1 frame = 10 ms),
                    /// so the absolute end is (bg + we*10) ms. 0 for intermediate results.
                    /// </summary>
                    public string we;
                }
                public WS[] ws;
            }
            // NOTE(review): in the sample JSON "rt" is an array ("rt":[...]);
            // declaring it as a single RT may silently drop data — verify.
            public RT rt;
            /// <summary>
            /// Sentence start time within the whole audio, in milliseconds.
            /// Accurate even for intermediate results.
            /// </summary>
            public string bg;
            /// <summary>
            /// Result kind: 0 = final result, 1 = intermediate result.
            /// </summary>
            public string type;
            /// <summary>
            /// Sentence end time within the whole audio, in milliseconds.
            /// 0 for intermediate results.
            /// </summary>
            public string ed;
        }
        public ST st;
    }
    public CN cn;
    /// <summary>
    /// Not documented by the vendor; presumably a "last segment" flag
    /// (the sample shows "ls":false, a boolean) — confirm against the API docs.
    /// </summary>
    public string ls;
}
转写结果data字段说明如下:
字段 | 含义 | 描述 |
---|---|---|
seg_id | 转写结果序号 | 从0开始 |
w | 词识别结果 | |
wp | 词标识 | n-普通词;s-顺滑词(语气词);p-标点 |
wb | 词在本句中的开始时间,单位是帧,1帧=10ms 即词在整段语音中的开始时间为(bg+wb*10)ms | 中间结果的 wb 为 0 |
we | 词在本句中的结束时间,单位是帧,1帧=10ms 即词在整段语音中的结束时间为(bg+we*10)ms | 中间结果的 we 为 0 |
bg | 句子在整段语音中的开始时间,单位毫秒(ms) | 中间结果的bg为准确值 |
type | 结果类型标识 | 0-最终结果;1-中间结果 |
ed | 句子在整段语音中的结束时间,单位毫秒(ms) | 中间结果的ed为0 |
ls | 官网未说明 | 官网未说明 |
三、该篇全部代码
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using UnityEngine;
using System.Net.WebSockets;
using System.Security.Cryptography;
using System.Text;
using System.Threading;
using Newtonsoft.Json;
/// <summary>
/// Top-level envelope of one server message; the wire format is JSON.
/// </summary>
[Serializable]
public struct JsonData
{
    /// <summary>
    /// Message type: "started" = handshake OK, "result" = transcription result,
    /// "error" = failure.
    /// </summary>
    public string action;
    /// <summary>
    /// Result code ("0" on success; see the vendor's error-code table otherwise).
    /// </summary>
    public string code;
    /// <summary>
    /// Transcription payload.
    /// NOTE(review): per the official API, "data" arrives as a JSON-encoded
    /// *string*; mapping it straight onto a struct relies on the parser
    /// unwrapping it — confirm this round-trips with JsonUtility.
    /// </summary>
    public Data data;
    /// <summary>
    /// Human-readable description of the result.
    /// </summary>
    public string desc;
    /// <summary>
    /// Session ID, mainly for debugging; provide it to vendor support to
    /// trace a failing session.
    /// </summary>
    public string sid;
}
/// <summary>
/// Transcription payload of one "result" message.
/// </summary>
[Serializable]
public struct Data
{
    /// <summary>
    /// Sequence number of this result segment, starting from 0.
    /// </summary>
    public string seg_id;
    [Serializable]
    public struct CN
    {
        [Serializable]
        public struct ST
        {
            [Serializable]
            public struct RT
            {
                [Serializable]
                public class WS
                {
                    [Serializable]
                    public class CW
                    {
                        /// <summary>
                        /// Recognized word.
                        /// </summary>
                        public string w;
                        /// <summary>
                        /// Word kind: n = normal word, s = filler word, p = punctuation.
                        /// </summary>
                        public string wp;
                    }
                    // Candidate words for this slot.
                    public CW[] cw;
                    /// <summary>
                    /// Word start time within the sentence, in frames (1 frame = 10 ms),
                    /// so the absolute start is (bg + wb*10) ms. 0 for intermediate results.
                    /// NOTE(review): the sample JSON carries wb/we as numbers (e.g. 23),
                    /// not strings — confirm the parser fills these string fields.
                    /// </summary>
                    public string wb;
                    /// <summary>
                    /// Word end time within the sentence, in frames (1 frame = 10 ms),
                    /// so the absolute end is (bg + we*10) ms. 0 for intermediate results.
                    /// </summary>
                    public string we;
                }
                public WS[] ws;
            }
            // NOTE(review): in the sample JSON "rt" is an array ("rt":[...]);
            // declaring it as a single RT may silently drop data — verify.
            public RT rt;
            /// <summary>
            /// Sentence start time within the whole audio, in milliseconds.
            /// Accurate even for intermediate results.
            /// </summary>
            public string bg;
            /// <summary>
            /// Result kind: 0 = final result, 1 = intermediate result.
            /// </summary>
            public string type;
            /// <summary>
            /// Sentence end time within the whole audio, in milliseconds.
            /// 0 for intermediate results.
            /// </summary>
            public string ed;
        }
        public ST st;
    }
    public CN cn;
    /// <summary>
    /// Not documented by the vendor; presumably a "last segment" flag
    /// (the sample shows "ls":false, a boolean) — confirm against the API docs.
    /// </summary>
    public string ls;
}
/// <summary>
/// Streams microphone audio to the iFlytek real-time ASR WebSocket endpoint
/// and raises <see cref="asrCallback"/> with the recognized text when the
/// session ends.
/// </summary>
public class VisualCommunication : MonoBehaviour
{
    // iFlytek console credentials (placeholders — fill in your own).
    private string appid = "xxxxx";
    private string appkey = "xxxxxxxxxxxxxxxxxxxxxxxxx";
    // Pieces of the request signature, kept as fields so they can be
    // inspected while debugging.
    private string timeStamp;
    private string baseString;
    private string toMd5;
    private string signa;
    // Clip the microphone records into.
    public AudioClip RecordedClip;
    ClientWebSocket ws;
    CancellationToken ct;
    // Maximum recording length in seconds (just under one hour).
    private int MAX_RECORD_LENGTH = 3599;
    // Handle of the running upload coroutine so StopASR() can stop the actual
    // instance. The previous code called StopCoroutine(SendAudioClip()), which
    // creates a brand-new enumerator and "stops" that, leaving the real
    // coroutine running.
    private Coroutine sendAudioCoroutine;

    /// <summary>
    /// Raised with the full recognized sentence once the session ends.
    /// </summary>
    public event Action<string> asrCallback;

    void Start()
    {
        asrCallback += Output;
    }

    /// <summary>Default listener: logs the recognized text.</summary>
    void Output(string str)
    {
        Debug.Log("语音识别结果:" + str);
    }

    /// <summary>
    /// Opens the WebSocket session and starts recording from the default
    /// microphone. Does nothing if a session is already open or no
    /// microphone is present.
    /// </summary>
    public void StartASR()
    {
        if (ws != null && ws.State == WebSocketState.Open)
        {
            Debug.LogWarning("开始语音识别失败!,等待上次识别连接结束");
            return;
        }
        if (Microphone.devices.Length == 0)
        {
            Debug.LogError("未检测到可用的麦克风");
            return;
        }
        ConnectASR_Aysnc();
        // 16 kHz mono PCM is what the rtasr endpoint expects.
        RecordedClip = Microphone.Start(null, false, MAX_RECORD_LENGTH, 16000);
    }

    /// <summary>
    /// Stops recording and tells the server the audio stream is finished.
    /// </summary>
    public void StopASR()
    {
        if (ws != null)
        {
            // Stop the running upload coroutine, if any.
            if (sendAudioCoroutine != null)
            {
                StopCoroutine(sendAudioCoroutine);
                sendAudioCoroutine = null;
            }
            // Per the protocol, after the last audio chunk the client sends
            // {"end": true} as the end-of-stream marker.
            ws.SendAsync(new ArraySegment<byte>(Encoding.UTF8.GetBytes("{\"end\": true}")), WebSocketMessageType.Binary,
                true, new CancellationToken());
            Microphone.End(null);
            StartCoroutine(StopRecord());
        }
    }

    // Waits until the server closes the connection, then logs completion.
    private IEnumerator StopRecord()
    {
        yield return new WaitUntil(() => ws.State != WebSocketState.Open);
        Debug.Log("识别结束,停止录音");
    }

    /// <summary>
    /// Connects to the server, starts the upload coroutine, then loops
    /// receiving and parsing result frames until the socket leaves the
    /// Open state.
    /// </summary>
    async void ConnectASR_Aysnc()
    {
        ws = new ClientWebSocket();
        ct = new CancellationToken();
        Uri url = GetUri();
        await ws.ConnectAsync(url, ct);
        sendAudioCoroutine = StartCoroutine(SendAudioClip());
        StringBuilder stringBuilder = new StringBuilder();
        while (ws.State == WebSocketState.Open)
        {
            var result = new byte[4096];
            await ws.ReceiveAsync(new ArraySegment<byte>(result), ct); // receive one frame
            // Trim the unused zero tail of the fixed-size buffer. The
            // Count > 0 guard avoids indexing -1 on an all-zero buffer.
            // NOTE(review): a message larger than 4096 bytes would be
            // truncated/split here — consider looping on EndOfMessage.
            List<byte> list = new List<byte>(result);
            while (list.Count > 0 && list[list.Count - 1] == 0x00) list.RemoveAt(list.Count - 1);
            string str = Encoding.UTF8.GetString(list.ToArray());
            Debug.Log("接收消息:" + str);
            if (string.IsNullOrEmpty(str))
            {
                return;
            }
            JsonData jsonData = JsonUtility.FromJson<JsonData>(str);
            if (jsonData.action.Equals("started"))
            {
                Debug.Log("握手成功!");
            }
            else if (jsonData.action.Equals("result"))
            {
                stringBuilder.Append(AnalysisResult(jsonData));
            }
            else if (jsonData.action.Equals("error"))
            {
                Debug.Log("Error: " + jsonData.desc);
                ws.Abort();
            }
        }
        Debug.LogWarning("断开连接");
        string s = stringBuilder.ToString();
        if (!string.IsNullOrEmpty(s))
        {
            asrCallback?.Invoke(s);
            Debug.LogWarning("识别到声音:" + s);
        }
    }

    /// <summary>
    /// Coroutine that uploads the recorded audio in small chunks:
    /// at most 1280 bytes every 40 ms, as the rtasr protocol requires.
    /// </summary>
    IEnumerator SendAudioClip()
    {
        // Wait until the microphone actually starts producing samples.
        yield return new WaitWhile(() => Microphone.GetPosition(null) <= 0);
        float t = 0;
        int position = Microphone.GetPosition(null);
        const float waitTime = 0.04f; // send interval: 40 ms
        const int Maxlength = 1280;   // at most 1280 bytes per packet
        int lastPosition = 0;
        while (position < RecordedClip.samples && ws.State == WebSocketState.Open)
        {
            t += waitTime;
            if (t >= MAX_RECORD_LENGTH)
            {
                Debug.Log("录音时长已用尽,结束语音识别!");
                break;
            }
            yield return new WaitForSecondsRealtime(waitTime);
            if (Microphone.IsRecording(null))
            {
                position = Microphone.GetPosition(null);
            }
            if (position <= lastPosition)
            {
                // The read head may not have advanced this frame; a zero-length
                // AudioClip.GetData(float[], int) call would throw.
                continue;
            }
            int length = position - lastPosition > Maxlength ? Maxlength : position - lastPosition;
            byte[] data = GetAudioClip(lastPosition, length, RecordedClip);
            // NOTE(review): SendAsync is not awaited; overlapping sends on a
            // ClientWebSocket are unsupported — consider awaiting or queueing.
            ws.SendAsync(new ArraySegment<byte>(data), WebSocketMessageType.Binary, true,
                new CancellationToken());
            lastPosition = lastPosition + length;
        }
    }

    private void OnApplicationQuit()
    {
        StopASR();
    }

    /// <summary>
    /// Concatenates every recognized word carried by one result message.
    /// </summary>
    /// <param name="jsonData">Parsed "result" message.</param>
    /// <returns>The words of this segment joined into one string, or "" if empty.</returns>
    string AnalysisResult(JsonData jsonData)
    {
        StringBuilder stringBuilder = new StringBuilder();
        var ws = jsonData.data.cn.st.rt.ws;
        if (ws != null)
        {
            foreach (var item in ws)
            {
                var cw = item.cw;
                foreach (var w in cw)
                {
                    stringBuilder.Append(w.w);
                }
            }
            return stringBuilder.ToString();
        }
        return "";
    }

    /// <summary>
    /// Converts a slice of the recorded clip to 16-bit little-endian PCM.
    /// </summary>
    /// <param name="start">First sample to read.</param>
    /// <param name="length">Number of samples to read.</param>
    /// <param name="recordedClip">Source clip.</param>
    /// <returns>length * 2 bytes of PCM16 audio.</returns>
    public static byte[] GetAudioClip(int start, int length, AudioClip recordedClip)
    {
        float[] soundata = new float[length];
        recordedClip.GetData(soundata, start);
        int rescaleFactor = 32767; // map float [-1, 1] onto short range
        byte[] outData = new byte[soundata.Length * 2];
        for (int i = 0; i < soundata.Length; i++)
        {
            short temshort = (short)(soundata[i] * rescaleFactor);
            byte[] temdata = BitConverter.GetBytes(temshort);
            outData[i * 2] = temdata[0];
            outData[i * 2 + 1] = temdata[1];
        }
        return outData;
    }

    /// <summary>
    /// Builds the signed request URI per the rtasr handshake:
    /// signa = Base64(HmacSHA1(MD5(appid + ts), apiKey)).
    /// </summary>
    /// <returns>The wss:// URI to connect to.</returns>
    private Uri GetUri()
    {
        // Timestamp with one-second precision.
        timeStamp = GetTimeStamp();
        // baseString is appid concatenated with the current timestamp ts.
        baseString = appid + timeStamp;
        // MD5 the baseString...
        toMd5 = ToMD5(baseString);
        // ...then HmacSHA1 it with apiKey and Base64-encode the digest.
        signa = ToHmacSHA1(toMd5, appkey);
        string requestUrl = string.Format("wss://rtasr.xfyun.cn/v1/ws?appid={0}&ts={1}&signa={2}&pd=tech", appid,
            timeStamp, UrlEncode(signa));
        Debug.Log("requestUrl: " + requestUrl);
        return new Uri(requestUrl);
    }

    /// <summary>
    /// Percent-encodes every byte of the UTF-8 form of <paramref name="str"/>.
    /// </summary>
    /// <param name="str">String to encode.</param>
    /// <returns>Percent-encoded string.</returns>
    public static string UrlEncode(string str)
    {
        StringBuilder sb = new StringBuilder();
        byte[] byStr = System.Text.Encoding.UTF8.GetBytes(str);
        for (int i = 0; i < byStr.Length; i++)
        {
            // "x2" keeps a fixed two hex digits; the previous
            // Convert.ToString(b, 16) dropped the leading zero for
            // bytes < 0x10, producing malformed escapes like "%a".
            sb.Append('%').Append(byStr[i].ToString("x2"));
        }
        return sb.ToString();
    }

    /// <summary>
    /// Gets the current Unix timestamp.
    /// </summary>
    /// <returns>Seconds since 1970-01-01 UTC, as a string.</returns>
    public static string GetTimeStamp()
    {
        TimeSpan ts = DateTime.UtcNow - new DateTime(1970, 1, 1, 0, 0, 0, 0);
        return Convert.ToInt64(ts.TotalSeconds).ToString();
    }

    /// <summary>
    /// MD5-hashes a string and returns the lowercase hex digest.
    /// </summary>
    /// <param name="txt">String to hash.</param>
    /// <returns>32-character lowercase hex digest.</returns>
    public static string ToMD5(string txt)
    {
        using (MD5 mi = MD5.Create())
        {
            // UTF8 instead of the platform-dependent Encoding.Default;
            // identical output for the ASCII appid+ts input, but deterministic
            // across platforms.
            byte[] buffer = Encoding.UTF8.GetBytes(txt);
            byte[] newBuffer = mi.ComputeHash(buffer);
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < newBuffer.Length; i++)
            {
                sb.Append(newBuffer[i].ToString("x2"));
            }
            return sb.ToString();
        }
    }

    /// <summary>
    /// HMAC-SHA1 signs <paramref name="text"/> with <paramref name="key"/>
    /// and returns the Base64-encoded digest.
    /// </summary>
    /// <param name="text">Data to sign.</param>
    /// <param name="key">Secret key.</param>
    /// <returns>Base64 signature.</returns>
    public static string ToHmacSHA1(string text, string key)
    {
        // HMACSHA1 is IDisposable — dispose it instead of leaking the handle.
        using (HMACSHA1 hmacsha1 = new HMACSHA1(System.Text.Encoding.UTF8.GetBytes(key)))
        {
            byte[] dataBuffer = System.Text.Encoding.UTF8.GetBytes(text);
            byte[] hashBytes = hmacsha1.ComputeHash(dataBuffer);
            return Convert.ToBase64String(hashBytes);
        }
    }
}
四、如何测试
五、说明
在调用StopASR()方法后,223行会报一个错误“WebSocketException: The remote party closed the WebSocket connection without completing the close handshake.”,尚未解决
六、该篇参考资料
官方文档:https://www.xfyun.cn/doc/asr/rtasr/API.html
代码实现:https://blog.csdn.net/chunyu90225/article/details/106172895
转载注明出处!