百度AI的语音识别与语音合成

由于机器人语音问答的需要,这里调用百度AI的语音识别与语音合成接口。

这里的思路很简单,就是用百度的API,初始化客户端,然后输入参数进行调用。

代码

import wave
import pyaudio
from aip import AipSpeech,AipNlp
from playsound import playsound

""" 你的 APPID AK SK """
# Baidu AI credentials passed to the AipSpeech / AipNlp clients below.
# The '****' values are placeholders: replace them with your own application's
# APP ID, API key and secret key.
APP_ID = '****'
API_KEY = '****'
SECRET_KEY = '****'

# 读取文件
def get_file_content(filePath):
    """Return the entire content of the file at ``filePath`` as bytes."""
    with open(filePath, mode='rb') as source:
        payload = source.read()
    return payload
    
    
# 录音功能
def record_content(record_seconds=3, output_filename="audio.wav"):
    """Record audio from a PyAudio input stream and save it as a WAV file.

    The recording is mono, 16000 Hz, with ``pyaudio.paInt16`` samples.

    Args:
        record_seconds: Length of the recording in seconds (default 3).
        output_filename: Path of the WAV file to write (default "audio.wav").

    Returns:
        The path of the WAV file that was written (``output_filename``).
    """
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000

    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = p.get_sample_size(FORMAT)
        stream = p.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("* recording")
            # Read whole chunks only; a trailing partial chunk is dropped.
            frames = [stream.read(CHUNK)
                      for _ in range(int(RATE / CHUNK * record_seconds))]
            print("* done recording")
        finally:
            # Release the audio device even if reading fails midway.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # The context manager closes the file even if writing raises.
    with wave.open(output_filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    print("done ------------------------------   ")
    return output_filename


# Create the speech client (used for both recognition and synthesis)
client_audio = AipSpeech(APP_ID, API_KEY, SECRET_KEY)

# Record the spoken question to a WAV file
filePath = record_content()

# Speech recognition: send the 16000 Hz WAV recording to the ASR service.
# dev_pid selects the recognition model (see the Baidu ASR docs for 1536).
result_audio = client_audio.asr(get_file_content(filePath), 'wav', 16000, {
    'dev_pid': 1536,
})
# Check for a usable 'result' list before indexing, so a failed request
# reports the service response instead of a bare KeyError/IndexError.
if not result_audio.get('result'):
    raise RuntimeError('speech recognition failed: {}'.format(result_audio))
content_audio = result_audio['result'][0]
print(content_audio)


# Natural language processing client
client_nlp = AipNlp(APP_ID, API_KEY, SECRET_KEY)

# Example input: text = "百度是一家高科技公司"
text = content_audio

# Run lexical analysis on the recognized text
xx = client_nlp.lexer(text)
# Check for a usable 'items' list before indexing, so an error response or an
# empty analysis reports the service response instead of a bare
# KeyError/IndexError.
if not xx.get('items'):
    raise RuntimeError('lexical analysis failed: {}'.format(xx))
# The first item of the analysis becomes the text to speak back
content_answer = xx['items'][0]['item']

# Speech synthesis. Pre-set the result so that a failed call, whose exception
# is only printed, cannot leave the name undefined for the checks below.
result_answer = None
try:
    result_answer = client_audio.synthesis(content_answer, 'zh', 1, {
        'vol': 5,
    })
except Exception as e:
    print(e)

if isinstance(result_answer, dict):
    # A dict result is not audio data; report it and skip playback.
    print(result_answer)
elif result_answer is not None:
    # Write the synthesized audio to disk
    with open('audio.mp3', 'wb') as f:
        f.write(result_answer)

    # Play it back only when a fresh file was actually written
    playsound('audio.mp3')

 

录音

首先将对方的语音录下,存为 *.wav 音频文件。其中原始 PCM 的录音参数必须符合 16k 采样率、16bit 位深、单声道;支持的格式有:pcm(不压缩)、wav(不压缩,pcm编码)、amr(压缩格式)。

# 录音功能
def record_content(record_seconds=3, output_filename="audio.wav"):
    """Record audio from a PyAudio input stream and save it as a WAV file.

    The recording is mono, 16000 Hz, with ``pyaudio.paInt16`` samples.

    Args:
        record_seconds: Length of the recording in seconds (default 3).
        output_filename: Path of the WAV file to write (default "audio.wav").

    Returns:
        The path of the WAV file that was written (``output_filename``).
    """
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000

    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = p.get_sample_size(FORMAT)
        stream = p.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("* recording")
            # Read whole chunks only; a trailing partial chunk is dropped.
            frames = [stream.read(CHUNK)
                      for _ in range(int(RATE / CHUNK * record_seconds))]
            print("* done recording")
        finally:
            # Release the audio device even if reading fails midway.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # The context manager closes the file even if writing raises.
    with wave.open(output_filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    print("done ------------------------------   ")
    return output_filename

 

识别

然后将录音文件进行识别

代码如下

# Speech recognition: send the 16000 Hz WAV recording to the ASR service.
# dev_pid selects the recognition model (see the Baidu ASR docs for 1536).
result_audio = client_audio.asr(get_file_content(filePath), 'wav', 16000, {
    'dev_pid': 1536,
})
# Check for a usable 'result' list before indexing, so a failed request
# reports the service response instead of a bare KeyError/IndexError.
if not result_audio.get('result'):
    raise RuntimeError('speech recognition failed: {}'.format(result_audio))
content_audio = result_audio['result'][0]
print(content_audio)

处理

# Natural language processing client
client_nlp = AipNlp(APP_ID, API_KEY, SECRET_KEY)

# Example input: text = "百度是一家高科技公司"
text = content_audio

# Run lexical analysis on the recognized text
xx = client_nlp.lexer(text)
# Check for a usable 'items' list before indexing, so an error response or an
# empty analysis reports the service response instead of a bare
# KeyError/IndexError.
if not xx.get('items'):
    raise RuntimeError('lexical analysis failed: {}'.format(xx))
# The first item of the analysis becomes the text to speak back
content_answer = xx['items'][0]['item']

 

回答

语音合成结束之后,将合成的音频写入到本地文件,并进行播放(python几种播放方法)

# Speech synthesis. Pre-set the result so that a failed call, whose exception
# is only printed, cannot leave the name undefined for the checks below.
result_answer = None
try:
    result_answer = client_audio.synthesis(content_answer, 'zh', 1, {
        'vol': 5,
    })
except Exception as e:
    print(e)

if isinstance(result_answer, dict):
    # A dict result is not audio data; report it and skip playback.
    print(result_answer)
elif result_answer is not None:
    # Write the synthesized audio to disk
    with open('audio.mp3', 'wb') as f:
        f.write(result_answer)

    # Play it back only when a fresh file was actually written
    playsound('audio.mp3')

 

 

需要解决的问题(有建议请评论告知,感谢!):

1.不定长语音文件的判定(音频文件时长不固定,根据说话时长来确定)

2.一群人中确定一个说话人接收指令

你可能感兴趣的:(无人车,Python)