Speech recognition can first be approached with deep learning: train a model yourself and then run recognition on the audio. That route needs a machine with plenty of memory and long training times, so I will come back to it later when time allows.
The second option is offline recognition, using Python speech packages; the accuracy on simple audio is acceptable.
The third is online recognition: call the APIs of services that have already built mature speech recognition. Baidu and iFlytek (讯飞) are the common choices; Data-Baker (标贝科技), used here as well, also works well, and the detailed parameters are documented on its official site.
A lot of the code found in other write-ups does not actually run, so only tested code is given below. Everything is Python; look up any remaining background yourself.
Supported audio file types:
- WAV: must be PCM/LPCM format
- AIFF, AIFF-C
- FLAC: must be native FLAC format; OGG-FLAC is not supported
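sr.AudioFile cannot open MP3 directly, so anything outside this list has to be converted first. A minimal conversion sketch, assuming pydub (with ffmpeg) is installed; the desktop paths are just the example paths used later:

from pydub import AudioSegment  # assumes pydub + ffmpeg are available

# convert an MP3 into a PCM WAV file that sr.AudioFile can read
sound = AudioSegment.from_mp3('C:/Users/263000/Desktop/test.mp3')
sound.export('C:/Users/263000/Desktop/test.wav', format='wav')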
# Install from the terminal
pip3 install SpeechRecognition
pip3 install pocketsphinx
# PyAudio is required if you want to capture audio from the microphone
pip3 install PyAudio
# Recognizing Chinese below also needs a separate Chinese model for PocketSphinx; download it yourself (it typically goes under speech_recognition/pocketsphinx-data/zh-CN/)
# Offline recognition (from an audio file)
# -*- coding: utf-8 -*-
import speech_recognition as sr

# sr.AudioFile only reads WAV/AIFF/FLAC, so convert MP3 sources first (see the pydub sketch above)
audio_file = 'C:/Users/263000/Desktop/test.wav'

r = sr.Recognizer()
with sr.AudioFile(audio_file) as source:
    audio = r.record(source)  # read the whole file into an AudioData object

try:
    # Chinese requires the zh-CN PocketSphinx model mentioned above
    print("Recognized audio: " + r.recognize_sphinx(audio, language='zh-CN'))
except sr.UnknownValueError:
    print('Sphinx could not understand audio')
except sr.RequestError as e:
    print('Sphinx error; {0}'.format(e))

# print('Recognized text: ', r.recognize_sphinx(audio, language='zh-CN'))  # Chinese
# print('Recognized text: ', r.recognize_sphinx(audio))                    # English
# Recognition from the microphone (live input)
# -*- coding: utf-8 -*-
import speech_recognition as sr

# obtain audio from the microphone
r = sr.Recognizer()
with sr.Microphone() as source:
    r.adjust_for_ambient_noise(source)  # listen for 1 second to calibrate the energy threshold for ambient noise
    print('say something')
    audio = r.listen(source)

# recognize speech using Sphinx
try:
    print("Sphinx thinks you said " + r.recognize_sphinx(audio))
except sr.UnknownValueError:
    print("Sphinx could not understand audio")
except sr.RequestError as e:
    print("Sphinx error; {0}".format(e))
# Some refinements
# offset skips the start of the source, duration limits how much is captured
audio = r.record(source, offset=4.7, duration=2.8)
# adjust_for_ambient_noise() compensates for background noise before recording
r.adjust_for_ambient_noise(source)
audio = r.record(source)
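For reference, here is how the offset/duration parameters fit into a complete call against a file source; a minimal sketch that reuses the example path and timing values from above:

import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile('C:/Users/263000/Desktop/test.wav') as source:
    # skip the first 4.7 seconds, then capture the next 2.8 seconds
    audio = r.record(source, offset=4.7, duration=2.8)
print(r.recognize_sphinx(audio, language='zh-CN'))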
Baidu's speech recognition API only accepts pcm/wav/amr files, and the sample rate must be the fixed value 16000 Hz.
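If a recording was made at another rate (44.1 kHz is common), downsample it to 16 kHz mono before uploading. A sketch with pydub again, under the same pydub/ffmpeg assumption; input.wav is a placeholder for your own file:

from pydub import AudioSegment  # assumes pydub + ffmpeg are available

# force 16 kHz, mono, 16-bit samples as the API expects
sound = AudioSegment.from_file('input.wav')  # placeholder input file
sound = sound.set_frame_rate(16000).set_channels(1).set_sample_width(2)
sound.export('test.wav', format='wav')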
# coding=utf-8
import json
import time
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import URLError
from urllib.parse import urlencode

timer = time.time

API_KEY = 'YOUR_API_KEY'        # your own API key
SECRET_KEY = 'YOUR_SECRET_KEY'  # your own secret key

# the file to recognize
AUDIO_FILE = './test.wav'  # only pcm/wav/amr are supported; the fast (极速) edition additionally supports m4a
# file format, taken from the extension
FORMAT = AUDIO_FILE[-3:]

CUID = '123456PYTHON'  # arbitrary device identifier

# sample rate
RATE = 16000  # fixed value

# standard edition
DEV_PID = 1537  # 1537 = Mandarin, input-method model; pick the PID for your language/model from the docs
ASR_URL = 'http://vop.baidu.com/server_api'
SCOPE = 'audio_voice_assistant_get'  # this scope indicates ASR capability; tick it in the console if missing (very old apps may not have it)


class DemoError(Exception):
    """Raised when the token or ASR request is invalid."""
    pass


""" TOKEN start """
TOKEN_URL = 'http://openapi.baidu.com/oauth/2.0/token'


def fetch_token():
    params = {'grant_type': 'client_credentials',
              'client_id': API_KEY,
              'client_secret': SECRET_KEY}
    post_data = urlencode(params).encode('utf-8')
    req = Request(TOKEN_URL, post_data)
    try:
        f = urlopen(req)
        result_str = f.read()
    except URLError as err:
        print('token http response http code : ' + str(err.code))
        result_str = err.read()
    result_str = result_str.decode()

    result = json.loads(result_str)
    if 'access_token' in result.keys() and 'scope' in result.keys():
        if SCOPE and (SCOPE not in result['scope'].split(' ')):  # with SCOPE = False this check is skipped
            raise DemoError('scope is not correct')
        return result['access_token']
    else:
        raise DemoError('MAYBE API_KEY or SECRET_KEY not correct: access_token or scope not found in token response')


""" TOKEN end """

if __name__ == '__main__':
    token = fetch_token()

    # read the raw audio bytes
    with open(AUDIO_FILE, 'rb') as speech_file:
        speech_data = speech_file.read()

    length = len(speech_data)
    if length == 0:
        raise DemoError('file %s length read 0 bytes' % AUDIO_FILE)

    params = {'cuid': CUID, 'token': token, 'dev_pid': DEV_PID}
    params_query = urlencode(params)

    headers = {
        'Content-Type': 'audio/' + FORMAT + '; rate=' + str(RATE),
        'Content-Length': length
    }

    req = Request(ASR_URL + "?" + params_query, speech_data, headers)
    try:
        begin = timer()
        f = urlopen(req)
        result_str = f.read()
        print("Request time cost %f" % (timer() - begin))
    except URLError as err:
        print('asr http response http code : ' + str(err.code))
        result_str = err.read()

    result_str = str(result_str, 'utf-8')
    print(result_str)
    with open("result.txt", "w") as of:
        of.write(result_str)
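The response body is JSON. Judging from Baidu's sample output the useful fields are err_no, err_msg and result, but treat the exact names as an assumption to verify against the docs; a quick success check then looks like this:

# parse the JSON reply; field names assumed from Baidu's sample output
result = json.loads(result_str)
if result.get('err_no') == 0:
    print('Recognized text:', result['result'][0])  # result holds the candidate transcripts
else:
    print('Recognition failed:', result.get('err_msg'))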
As with Baidu, you first have to register on the official site, create a speech recognition application, and obtain your own API key and SECRET; the free quota is limited to a fixed number of calls per day.
#!/usr/bin/env python
# coding: utf-8
import requests
import json
import argparse


# obtain an access_token for authentication
def get_access_token(client_secret, client_id):
    grant_type = "client_credentials"
    url = "https://openapi.data-baker.com/oauth/2.0/token?grant_type={}&client_secret={}&client_id={}" \
        .format(grant_type, client_secret, client_id)
    try:
        response = requests.post(url)
        response.raise_for_status()
    except Exception as e:
        print(e)
        return
    else:
        access_token = json.loads(response.text).get('access_token')
        return access_token


# send the audio and return the recognized text
def get_text(file, headers):
    url = "https://asr.data-baker.com/asr/api?"
    response = requests.post(url, data=file, headers=headers)
    code = json.loads(response.text).get("code")
    text = json.loads(response.text).get("text")
    if code != 20000:
        print(response.text)
    return text


# parse command-line arguments
def get_args():
    parser = argparse.ArgumentParser(description='ASR')
    parser.add_argument('-client_secret', type=str, required=True)
    parser.add_argument('-client_id', type=str, required=True)
    parser.add_argument('-file_path', type=str, required=True)
    parser.add_argument('--audio_format', type=str, default='wav')
    parser.add_argument('--sample_rate', type=str, default='16000')
    parser.add_argument('--add_pct', type=str, default='true')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = get_args()

    # obtain the access_token
    client_secret = args.client_secret
    client_id = args.client_id
    access_token = get_access_token(client_secret, client_id)

    # read the audio file
    with open(args.file_path, 'rb') as f:
        file = f.read()

    # fill in the request headers
    audio_format = args.audio_format
    sample_rate = args.sample_rate
    add_pct = args.add_pct
    headers = {'access_token': access_token, 'audio_format': audio_format, 'sample_rate': sample_rate,
               'add_pct': add_pct}

    text = get_text(file, headers)
    print(text)
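Save the script under any name, for example baker_asr.py (the filename is just an illustration), and run it from the command line with your own credentials:

python baker_asr.py -client_secret YOUR_SECRET -client_id YOUR_ID -file_path test.wav --audio_format wav --sample_rate 16000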