# Speech recognition in Python by calling the iFlytek C interface
# (voice dictation Linux SDK) through ctypes.
#
# iFlytek API documentation:
# https://www.xfyun.cn/doc/asr/voicedictation/Linux-SDK.html#_2、sdk集成指南

from ctypes import *
import time
import threading

# Load the iFlytek MSC shared library. The path is relative, so it is
# resolved against the current working directory of the process.
dll = cdll.LoadLibrary("../Linux_iat1226_xxxxxxx/libs/x64/libmsc.so")
# Login parameters; the appid must match the SDK package you downloaded.
login_params = b"appid = xxxxxx, work_dir = ."

FRAME_LEN = 640  # bytes per audio frame (20 ms per the author's notes)

MSP_SUCCESS = 0
# Status values exchanged with the SDK: the MSP_AUDIO_SAMPLE_* markers are
# passed as the audio-status argument of QISRAudioWrite (first / continuing /
# last chunk); MSP_REC_STATUS_COMPLETE is the recognition status value that
# signals that recognition has finished.
MSP_AUDIO_SAMPLE_FIRST = c_int(1)
MSP_AUDIO_SAMPLE_CONTINUE = c_int(2)
MSP_AUDIO_SAMPLE_LAST = c_int(4)
MSP_REC_STATUS_COMPLETE = c_int(5)
# Path of the audio file to recognize when the module is run as a script.
filename = "./f1.wav"


class Msp:
    """Thin wrapper around the iFlytek MSC dictation C API loaded in ``dll``.

    Typical use: ``login()``, one or more ``isr()`` calls, then ``logout()``.

    Attributes:
        recogStatus: ``c_int`` filled by the SDK with the recognition status.
        epStatus: ``c_int`` filled by the SDK with the end-point detection
            status (``None`` until ``isr`` has been called).
        sessionID: bytes handle of the open session, ``None`` when no
            session is open.
        laststr: text recognized so far by the current/last ``isr`` call.
        count: number of audio chunks written by the current/last ``isr``.
        counter: unused; kept so existing attribute access keeps working.
    """

    # Bytes sent per QISRAudioWrite call. Assuming 16 kHz, 16-bit mono PCM
    # (32000 bytes/s) this is roughly 102 ms of audio, which matches the
    # 0.1 s pacing between writes.
    _CHUNK_BYTES = 1638 * 2
    # Recognition status: a partial result can be fetched
    # (MSP_REC_STATUS_SUCCESS in the SDK headers).
    _REC_RESULT_READY = 0
    # Recognition status: recognition finished (MSP_REC_STATUS_COMPLETE).
    _REC_COMPLETE = 5
    # End-point status: the SDK detected the end of speech
    # (MSP_EP_AFTER_SPEECH in the SDK headers).
    _EP_AFTER_SPEECH = 3

    def __init__(self):
        self.recogStatus = c_int(8)
        self.counter = 0
        self.laststr = ''
        self.sessionID = None
        self.epStatus = None
        self.count = 0

    @staticmethod
    def _check(code, api_name):
        """Raise ``RuntimeError`` when an SDK call reports a non-zero code."""
        if code != 0:
            raise RuntimeError('%s failed, error code: %d' % (api_name, code))

    def login(self):
        """Log in to the SDK with the module-level ``login_params``.

        Returns:
            The SDK return code (always 0 on return).

        Raises:
            RuntimeError: if ``MSPLogin`` reports an error.
        """
        ret = dll.MSPLogin(None, None, login_params)
        self._check(ret, 'MSPLogin')
        return ret

    def logout(self):
        """Log out of the SDK and return the SDK return code.

        Logout is best-effort: a non-zero code is returned, not raised.
        """
        return dll.MSPLogout()

    def isr(self, audiofile, session_begin_params):
        """Recognize the speech in ``audiofile`` and return the text.

        Args:
            audiofile: path of the audio file; its bytes are streamed to the
                SDK unchanged.
            session_begin_params: bytes parameter string passed to
                ``QISRSessionBegin``.

        Returns:
            The recognized text (also available as ``self.laststr``).

        Raises:
            RuntimeError: if any SDK call reports an error.
            OSError: if the audio file cannot be opened or read.
        """
        # Reset per-call state so one instance can process several files:
        # a stale ``count`` would make the first chunk go out without the
        # FIRST marker, and a stale ``laststr`` would mix results.
        self.count = 0
        self.laststr = ''
        self.epStatus = c_int(0)
        self.recogStatus = c_int(0)

        err = c_int(0)
        # Without restype ctypes would truncate the returned pointer to int.
        dll.QISRSessionBegin.restype = c_char_p
        self.sessionID = dll.QISRSessionBegin(None, session_begin_params, byref(err))
        print('QISRSessionBegin => self.sessionID:', self.sessionID, 'ret:', err.value)

        try:
            if err.value != 0 or not self.sessionID:
                raise RuntimeError(
                    'QISRSessionBegin failed, error code: %d' % err.value)
            self._send_audio(audiofile)
            print("所有待识别音频已全部发送完毕")
            self._wait_for_completion()
        finally:
            # No-op when get_result() already closed the session; otherwise
            # guarantees the session is released on every error path.
            self._end_session()
        return self.laststr

    def _send_audio(self, audiofile):
        """Stream the file to the SDK in chunks, then send the LAST marker."""
        with open(audiofile, 'rb') as wav_file:
            while True:
                chunk = wav_file.read(self._CHUNK_BYTES)
                if not chunk:
                    break
                if self.count == 0:
                    aud_stat = MSP_AUDIO_SAMPLE_FIRST
                else:
                    aud_stat = MSP_AUDIO_SAMPLE_CONTINUE
                ret = dll.QISRAudioWrite(self.sessionID, chunk, len(chunk), aud_stat,
                                         byref(self.epStatus),
                                         byref(self.recogStatus))
                self._check(ret, 'QISRAudioWrite')
                self.count += 1
                if self.recogStatus.value == self._REC_RESULT_READY:
                    self.get_result()
                # Stop feeding audio once the session has completed or the
                # SDK has detected the end of speech.
                if self.sessionID is None or self.epStatus.value == self._EP_AFTER_SPEECH:
                    break
                # Pace the upload at roughly real-time speed.
                time.sleep(0.1)

        if self.sessionID is not None:
            # Tell the SDK that no more audio will follow.
            ret = dll.QISRAudioWrite(self.sessionID, None, 0, MSP_AUDIO_SAMPLE_LAST,
                                     byref(self.epStatus), byref(self.recogStatus))
            self._check(ret, 'QISRAudioWrite')

    def _wait_for_completion(self):
        """Poll for results until get_result() has closed the session."""
        while self.sessionID is not None:
            self.get_result()
            if self.sessionID is not None:
                time.sleep(1)

    def _end_session(self):
        """End the open session, if any, and forget its handle."""
        if self.sessionID:
            # The hint must be bytes: ctypes passes ``str`` as wchar_t*, but
            # the C API expects a const char*.
            dll.QISRSessionEnd(self.sessionID, b'end')
        self.sessionID = None

    def get_result(self):
        """Fetch any pending recognition text and append it to ``laststr``.

        Ends the session once the SDK reports that recognition is complete.

        Returns:
            The text accumulated so far.

        Raises:
            RuntimeError: if ``QISRGetResult`` reports an error.
        """
        if self.sessionID is None:
            return self.laststr

        err = c_int(0)
        dll.QISRGetResult.restype = c_char_p
        part = dll.QISRGetResult(self.sessionID, byref(self.recogStatus), 0, byref(err))
        # An ignored error here would leave recogStatus unchanged and make
        # the caller poll forever.
        self._check(err.value, 'QISRGetResult')

        if part is not None:
            self.laststr += part.decode('utf-8')
            print(self.laststr)

        if self.recogStatus.value == self._REC_COMPLETE:
            self._end_session()
            print('语音识别结束')
        return self.laststr

        


def XF_text(filepath, audiorate):
    """Recognize the speech in an audio file with the iFlytek SDK.

    Args:
        filepath: path of the audio file to recognize.
        audiorate: sample rate of the audio; 16000 selects an explicit
            16 kHz Mandarin parameter set, any other value uses the
            generic dictation parameters.

    Returns:
        The recognized text.
    """
    msp = Msp()
    print("登录科大讯飞")
    msp.login()
    print("科大讯飞登录成功")
    if audiorate == 16000:
        session_begin_params = b"sub = iat, domain = iat, language = zh_cn, accent = mandarin, sample_rate = 16000, result_type = plain, result_encoding = utf8,vad_enable=0"
    else:
        session_begin_params = b"sub = iat, ptt = 0, result_encoding = utf8, result_type = plain, domain = iat"
    try:
        msp.isr(filepath, session_begin_params)
    finally:
        # Always release the SDK login, even when recognition fails.
        msp.logout()
    # Read the accumulated text from the instance rather than relying on
    # the return value of isr().
    return msp.laststr





# Script entry point: recognize the default audio file using the 16 kHz
# parameter set. The returned text is stored in ``res`` but not printed here.
if __name__ == '__main__':
    res = XF_text(filename, 16000)

# (Blog footer) You may also be interested in: (python)