python语音识别音频文件的方法

问题描述

语音识别问题首先可以通过深度学习方法,训练语言模型后进行音频识别,但该方法要求设备内存足够大,训练时间通常较久,后期有时间的话会再学习使用;

另外就是离线识别方法——使用 Python 的第三方语音识别库(SpeechRecognition 搭配 PocketSphinx),对于简单音频的识别率还可以;

联网识别方法——调用各大已经实现语音识别功能网站的接口,目前人家做得已经很成熟了,比较常见的有百度、讯飞,这里使用的标贝科技也不错,具体参数可以去官网查询使用。

查阅很多资料,有些代码是不能用的,这里就直接给出测试过的代码了,都是python语言,别的知识自己去查吧


SpeechRecognition

支持音频文件类型:

  • WAV: 必须是 PCM/LPCM 格式
  • AIFF
  • AIFF-C
  • FLAC: 必须是原生(native)FLAC 格式;OGG-FLAC 格式不可用
# 终端安装
pip3 install SpeechRecognition
pip3 install pocketsphinx

# 若要访问麦克风则必须安装 PyAudio 软件包
pip3 install PyAudio
# 以下识别中文需要添加中文语音包,可以去查询获取
# 离线识别
# -*- coding: utf-8 -*-
import speech_recognition as sr
from os import path

# sr.AudioFile can only read PCM WAV, AIFF/AIFF-C and native FLAC files, so an
# .mp3 recording has to be converted (e.g. to 16-bit PCM WAV) beforehand.
audio_file = path.join('C:/Users/263000/Desktop', 'test.wav')

r = sr.Recognizer()
with sr.AudioFile(audio_file) as source:
    # Read the entire file into an AudioData object.
    audio = r.record(source)

try:
    # language='zh-CN' needs the Chinese PocketSphinx model data to be
    # installed; drop the argument to use the default English model.
    print(" 音频内容为: " + r.recognize_sphinx(audio, language='zh-CN'))
except sr.UnknownValueError:
    print('Sphinx could not understand audio')
except sr.RequestError as e:
    print('Sphinx error; {0}'.format(e))

# print('文本内容: ', r.recognize_sphinx(audio,language='zh-CN'))  #汉语
# print('文本内容: ', r.recognize_sphinx(audio))  # 英语


# 麦克风实时识别(识别引擎仍是离线的 Sphinx)
# -*- coding: utf-8 -*-
import speech_recognition as sr

# obtain audio from the microphone
r = sr.Recognizer()
with sr.Microphone() as source:
    # Listen briefly first so the energy threshold is calibrated against the
    # ambient noise level (about one second, per the original note).
    r.adjust_for_ambient_noise(source)
    print('say something')

    # Capture a single phrase from the microphone.
    audio = r.listen(source)

# Recognize the captured speech with Sphinx; the result is printed only when
# recognition succeeded.
try:
    heard = r.recognize_sphinx(audio)
except sr.UnknownValueError:
    print("Sphinx could not understand audio")
except sr.RequestError as e:
    print("Sphinx error; {0}".format(e))
else:
    print("Sphinx thinks you said " + heard)

# Some refinements. An audio source may only be read while its `with` block
# is open, so each variant opens the file itself.

# Variant 1: `offset` sets where recording starts (seconds into the file) and
# `duration` sets how many seconds are captured.
with sr.AudioFile(audio_file) as source:
    audio = r.record(source, offset=4.7, duration=2.8)

# Variant 2: call adjust_for_ambient_noise() first to reduce the effect of
# background noise, then record the rest of the file.
with sr.AudioFile(audio_file) as source:
    r.adjust_for_ambient_noise(source)
    audio = r.record(source)

百度

只支持 pcm/wav/amr 格式,采样率为固定值16000

# coding=utf-8

import json
import time
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import URLError
from urllib.parse import urlencode

timer = time.time


class DemoError(Exception):
    """Error raised by this demo for bad token responses or unusable input."""


API_KEY = '你自己的'
SECRET_KEY = '你自己的'

# Audio file to recognize. Only pcm/wav/amr are supported; the "fast" API
# additionally accepts m4a.
AUDIO_FILE = './test.wav'
# Audio format, taken from the three-character file extension.
FORMAT = AUDIO_FILE[-3:]

CUID = '123456PYTHON'
# Sample rate in Hz (fixed value).
RATE = 16000

# Standard edition.
# 1537 selects Mandarin with the input-method model; pick the PID for the
# desired language/model from the service documentation.
DEV_PID = 1537
ASR_URL = 'http://vop.baidu.com/server_api'
# Having this scope means the application has ASR permission. If it is missing,
# enable it in the web console; very old applications may not have it.
SCOPE = 'audio_voice_assistant_get'

"""  TOKEN start """

TOKEN_URL = 'http://openapi.baidu.com/oauth/2.0/token'


def fetch_token():
    """Request an OAuth access token using API_KEY and SECRET_KEY.

    Returns:
        The ``access_token`` string from the token endpoint's JSON response.

    Raises:
        DemoError: If the response lacks ``access_token`` or ``scope``, or if
            SCOPE is set and is not among the granted scopes.
        urllib.error.URLError: If the endpoint cannot be reached at all
            (no HTTP response was received).
    """
    # Imported locally: only HTTPError (a URLError subclass) carries a status
    # code and a readable body; a plain URLError has neither.
    from urllib.error import HTTPError

    params = {'grant_type': 'client_credentials',
              'client_id': API_KEY,
              'client_secret': SECRET_KEY}
    post_data = urlencode(params).encode('utf-8')
    req = Request(TOKEN_URL, post_data)
    try:
        # The context manager closes the HTTP response once it has been read.
        with urlopen(req) as f:
            result_str = f.read()
    except HTTPError as err:
        # The server answered with an error status; its body is still parsed
        # below so the checks can report what is missing.
        print('token http response http code : ' + str(err.code))
        result_str = err.read()
    result = json.loads(result_str.decode())
    if 'access_token' in result and 'scope' in result:
        # A falsy SCOPE disables the scope check.
        if SCOPE and (SCOPE not in result['scope'].split(' ')):
            raise DemoError('scope is not correct')
        return result['access_token']
    raise DemoError('MAYBE API_KEY or SECRET_KEY not correct: access_token or scope not found in token response')


"""  TOKEN end """

if __name__ == '__main__':
    # Script entry point: upload AUDIO_FILE to the ASR endpoint, print the raw
    # JSON response and save it to result.txt.

    # Only HTTPError (a URLError subclass) exposes .code and .read().
    from urllib.error import HTTPError

    token = fetch_token()
    with open(AUDIO_FILE, 'rb') as speech_file:
        speech_data = speech_file.read()
    length = len(speech_data)
    if length == 0:
        raise DemoError('file %s length read 0 bytes' % AUDIO_FILE)

    params = {'cuid': CUID, 'token': token, 'dev_pid': DEV_PID}
    params_query = urlencode(params)

    # The raw audio bytes are the request body; format and sample rate are
    # passed through the Content-Type header.
    headers = {
        'Content-Type': 'audio/' + FORMAT + '; rate=' + str(RATE),
        'Content-Length': length
    }
    req = Request(ASR_URL + "?" + params_query, speech_data, headers)
    try:
        begin = timer()
        # The context manager closes the HTTP response once it has been read.
        with urlopen(req) as f:
            result_str = f.read()
        print("Request time cost %f" % (timer() - begin))
    except HTTPError as err:
        # The server answered with an error status; keep its body as the
        # result so it is still printed and saved.
        print('asr http response http code : ' + str(err.code))
        result_str = err.read()

    result_str = str(result_str, 'utf-8')
    print(result_str)
    # Write as UTF-8 explicitly: the result contains Chinese text and the
    # platform default encoding may not be able to represent it.
    with open("result.txt", "w", encoding="utf-8") as of:
        of.write(result_str)

标贝

与百度一样,需要先去官网认证登录,创建语音识别的应用,获取自己的API和SECRET,每天只有有限次数的免费使用额度

#!/usr/bin/env python
# coding: utf-8

import requests
import json
import argparse


# Obtain the access_token used for authentication.
def get_access_token(client_secret, client_id):
    """Request an access token from the Data-Baker OAuth endpoint.

    Args:
        client_secret: Secret of the application created on the vendor site.
        client_id: Id (API key) of the same application.

    Returns:
        The ``access_token`` value from the JSON response, or ``None`` when
        the request fails or the response has no such key. Request errors are
        printed rather than raised.
    """
    url = "https://openapi.data-baker.com/oauth/2.0/token"
    # Passing the values through `params` lets requests URL-encode them, so
    # credentials containing characters such as '&' or '+' are sent intact.
    params = {
        "grant_type": "client_credentials",
        "client_secret": client_secret,
        "client_id": client_id,
    }

    try:
        # A timeout keeps the script from hanging forever on a dead connection.
        response = requests.post(url, params=params, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return None

    return json.loads(response.text).get('access_token')


# Fetch the recognized text.
def get_text(file, headers):
    """POST audio bytes to the ASR endpoint and return the recognized text.

    Args:
        file: Audio content sent as the request body.
        headers: Request headers (the caller puts the token and audio
            parameters here).

    Returns:
        The ``text`` field of the JSON response (``None`` if absent).
        When the response ``code`` is not 20000 the raw response body is
        printed as well.
    """
    endpoint = "https://asr.data-baker.com/asr/api?"
    reply = requests.post(endpoint, data=file, headers=headers)

    # Decode the JSON body once and read both fields from it.
    payload = json.loads(reply.text)
    status = payload.get("code")
    recognized = payload.get("text")

    if status != 20000:
        print(reply.text)
    return recognized


# Parse the command-line arguments.
def get_args():
    """Build the ASR command-line parser and parse ``sys.argv``.

    Required options: ``-client_secret``, ``-client_id``, ``-file_path``.
    Optional options (all strings): ``--audio_format`` (default ``wav``),
    ``--sample_rate`` (default ``16000``), ``--add_pct`` (default ``true``).

    Returns:
        The ``argparse.Namespace`` holding the parsed values.
    """
    cli = argparse.ArgumentParser(description='ASR')

    # Mandatory options.
    for flag in ('-client_secret', '-client_id', '-file_path'):
        cli.add_argument(flag, type=str, required=True)

    # Optional options with their default values.
    optional_flags = (
        ('--audio_format', 'wav'),
        ('--sample_rate', '16000'),
        ('--add_pct', 'true'),
    )
    for flag, fallback in optional_flags:
        cli.add_argument(flag, type=str, default=fallback)

    return cli.parse_args()


if __name__ == '__main__':
    # Script entry point: authenticate, upload the audio file given on the
    # command line, and print the recognized text.
    args = get_args()

    # Obtain the access_token.
    client_secret = args.client_secret
    client_id = args.client_id
    access_token = get_access_token(client_secret, client_id)
    if not access_token:
        # get_access_token() returns None on failure; stop here instead of
        # sending an unauthenticated recognition request.
        raise SystemExit('failed to obtain access_token; check client_id and client_secret')

    # Read the audio file as raw bytes.
    with open(args.file_path, 'rb') as f:
        file = f.read()

    # Fill in the request headers: the token and audio parameters travel here.
    audio_format = args.audio_format
    sample_rate = args.sample_rate
    add_pct = args.add_pct
    headers = {'access_token': access_token, 'audio_format': audio_format, 'sample_rate': sample_rate,
               'add_pct': add_pct}
    text = get_text(file, headers)
    print(text)

你可能感兴趣的:(树莓派使用中的问题,语音识别,人工智能,python)