出于机器人语音问答的需要,这里调用百度 AI 的语音识别接口。
思路很简单:使用百度提供的 API,先初始化客户端,然后传入参数进行调用。
import wave
import pyaudio
from aip import AipSpeech,AipNlp
from playsound import playsound
""" 你的 APPID AK SK """
# Baidu AI Open Platform credentials; replace each placeholder with your own value.
APP_ID = "****"      # application ID (APPID)
API_KEY = "****"     # API key (AK)
SECRET_KEY = "****"  # secret key (SK)
# Read a file
def get_file_content(filePath):
    """Return the full contents of the file at ``filePath`` as bytes.

    The file is opened in binary mode, so the data is returned exactly as
    stored on disk (no text decoding).

    Args:
        filePath: Path of the file to read.

    Returns:
        The file contents as a ``bytes`` object.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filePath, 'rb') as fp:
        return fp.read()
# Recording
def record_content(record_seconds=3, output_filename="audio.wav"):
    """Record audio through PyAudio and save it as a WAV file.

    The capture uses one channel, 16-bit samples (``pyaudio.paInt16``) and a
    16000 Hz sample rate, read in chunks of 1024 frames.

    Args:
        record_seconds: How long to record, in seconds. Defaults to 3.
        output_filename: Path of the WAV file to write. Defaults to
            "audio.wav".

    Returns:
        The path of the WAV file that was written (``output_filename``).
    """
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000

    p = pyaudio.PyAudio()
    try:
        # Look up the sample width before the PyAudio instance is terminated.
        sample_width = p.get_sample_size(FORMAT)
        stream = p.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("* recording")
            frames = [stream.read(CHUNK)
                      for _ in range(int(RATE / CHUNK * record_seconds))]
            print("* done recording")
        finally:
            # Release the stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # The context manager closes the file even if writing raises.
    with wave.open(output_filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    print("done ------------------------------ ")
    return output_filename
# Speech client (used for both recognition and synthesis)
client_audio = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
# Record the speaker's voice
filePath = record_content()
# Speech recognition
result_audio = client_audio.asr(get_file_content(filePath), 'wav', 16000, {
    'dev_pid': 1536,
})
# A failed request carries no usable 'result' list; stop with the raw
# response instead of failing on a KeyError/IndexError below.
if not result_audio.get('result'):
    raise SystemExit('speech recognition failed: %s' % (result_audio,))
content_audio = result_audio['result'][0]
print(content_audio)
# Natural language processing client
client_nlp = AipNlp(APP_ID, API_KEY, SECRET_KEY)
# The recognised sentence is the text to analyse.
text = content_audio
# Lexical analysis
xx = client_nlp.lexer(text)
if not xx.get('items'):
    raise SystemExit('lexical analysis returned no items: %s' % (xx,))
# The answer that gets spoken back is the first token of the analysis.
content_answer = xx['items'][0]['item']
# Speech synthesis
try:
    result_answer = client_audio.synthesis(content_answer, 'zh', 1, {
        'vol': 5,
    })
except Exception as e:
    # Without a synthesis result there is nothing to write or play, so stop
    # here rather than hitting a NameError on result_answer afterwards.
    raise SystemExit('speech synthesis failed: %s' % (e,))
# A dict result is treated as an error report; anything else is audio data.
if isinstance(result_answer, dict):
    raise SystemExit('speech synthesis failed: %s' % (result_answer,))
# Write the synthesised audio
with open('audio.mp3', 'wb') as f:
    f.write(result_answer)
# Play the synthesised audio
playsound('audio.mp3')
首先将对方的语音录下,存为 *.wav 音频文件。原始 PCM 的录音参数必须符合 16k 采样率、16bit 位深、单声道;支持的格式有:pcm(不压缩)、wav(不压缩,pcm 编码)、amr(压缩格式)。
# Recording
def record_content(record_seconds=3, output_filename="audio.wav"):
    """Record audio through PyAudio and save it as a WAV file.

    The capture uses one channel, 16-bit samples (``pyaudio.paInt16``) and a
    16000 Hz sample rate, read in chunks of 1024 frames.

    Args:
        record_seconds: How long to record, in seconds. Defaults to 3.
        output_filename: Path of the WAV file to write. Defaults to
            "audio.wav".

    Returns:
        The path of the WAV file that was written (``output_filename``).
    """
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000

    p = pyaudio.PyAudio()
    try:
        # Look up the sample width before the PyAudio instance is terminated.
        sample_width = p.get_sample_size(FORMAT)
        stream = p.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("* recording")
            frames = [stream.read(CHUNK)
                      for _ in range(int(RATE / CHUNK * record_seconds))]
            print("* done recording")
        finally:
            # Release the stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # The context manager closes the file even if writing raises.
    with wave.open(output_filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    print("done ------------------------------ ")
    return output_filename
然后对录音文件进行语音识别,代码如下:
# Speech recognition
result_audio = client_audio.asr(get_file_content(filePath), 'wav', 16000, {
    'dev_pid': 1536,
})
# A failed request carries no usable 'result' list; stop with the raw
# response instead of failing on a KeyError/IndexError below.
if not result_audio.get('result'):
    raise SystemExit('speech recognition failed: %s' % (result_audio,))
content_audio = result_audio['result'][0]
print(content_audio)
# Natural language processing client
client_nlp = AipNlp(APP_ID, API_KEY, SECRET_KEY)
# The recognised sentence is the text to analyse.
text = content_audio
# Lexical analysis
xx = client_nlp.lexer(text)
if not xx.get('items'):
    raise SystemExit('lexical analysis returned no items: %s' % (xx,))
# The answer that gets spoken back is the first token of the analysis.
content_answer = xx['items'][0]['item']
语音识别结束之后,将回答内容合成为语音,写入本地文件并进行播放(Python 有几种播放方法,这里使用 playsound):
# Speech synthesis
try:
    result_answer = client_audio.synthesis(content_answer, 'zh', 1, {
        'vol': 5,
    })
except Exception as e:
    # Without a synthesis result there is nothing to write or play, so stop
    # here rather than hitting a NameError on result_answer afterwards.
    raise SystemExit('speech synthesis failed: %s' % (e,))
# A dict result is treated as an error report; anything else is audio data.
if isinstance(result_answer, dict):
    raise SystemExit('speech synthesis failed: %s' % (result_answer,))
# Write the synthesised audio
with open('audio.mp3', 'wb') as f:
    f.write(result_answer)
# Play the synthesised audio
playsound('audio.mp3')
需要解决的问题(有建议请评论告知,感谢!):
1.不定长语音文件的判定(音频文件时长不固定,根据说话时长来确定)
2.一群人中确定一个说话人接收指令