The Current State of HTML5 Speech API Standards

Specification Status

Voice search in the browser is nothing new. A quick Google search for speech API material turns up three different standard specifications:

HTML Speech Web API

The HTML Speech Web API was proposed by contributors from Microsoft, Google, Mozilla, and other companies; the most recent version is dated October 29, 2011.

The specification introduces a reco element for speech recognition. It represents a speech input control in the user interface, and another form element can be associated with it so that recognition results are presented to the user. The specification also provides a tts element for speech synthesis. (A usage sketch follows the two IDL listings below.)

[NamedConstructor=Reco(),
  NamedConstructor=Reco(in DOMString for)]
    interface HTMLRecoElement : HTMLElement {
        // Attributes
        readonly attribute HTMLFormElement? form;
        attribute DOMString htmlFor;
        readonly attribute HTMLElement? control;
        attribute SpeechInputRequest request;

        attribute DOMString grammar;

        // From the SpeechInputRequest
        integer maxNBest;
        DOMString language;
        boolean saveForRereco;
        boolean endpointDetection;
        boolean finalizeBeforeEnd;
        integer interimResults;
        float confidenceThreshold;
        float sensitivity;
        float speedVsAccuracy;
        integer completeTimeout;
        integer incompleteTimeout;
        integer maxSpeechTimeout;
        DOMString inputWaveformURI;
        attribute DOMString serviceURI;
        attribute boolean continuous;

        // event handlers
        attribute Function onaudiostart;
        attribute Function onsoundstart;
        attribute Function onspeechstart;
        attribute Function onspeechend;
        attribute Function onsoundend;
        attribute Function onaudioend;
        attribute Function onresult;
        attribute Function onnomatch;
        attribute Function onerror;
        attribute Function onauthorizationchange;
        attribute Function onopen;
        attribute Function onstart;
    };

The TTS element is defined as follows:

  [NamedConstructor=TTS(),
  NamedConstructor=TTS(in DOMString src)]
    interface HTMLTTSElement : HTMLMediaElement {
        attribute DOMString serviceURI;
        attribute DOMString lastMark;
    };
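
To make the markup model concrete, here is a minimal, hypothetical sketch of wiring a reco element to a form field and playing a spoken confirmation through a tts element. It is based only on the IDL above; as far as I know no browser ever implemented reco or tts, and the markup, ids, and file name here are invented:

    // Hypothetical sketch only: the reco/tts elements were never shipped in any browser.
    // Assumed markup: <input id="city" type="text"> <reco for="city"></reco> <tts src="confirm.ssml"></tts>
    var reco = document.querySelector('reco');       // would be an HTMLRecoElement bound to #city via its for attribute
    reco.onresult = function (event) {
        // per the spec, the associated form element receives the recognition result;
        // here we additionally play a spoken confirmation via the tts element
        document.querySelector('tts').play();         // HTMLTTSElement inherits play() from HTMLMediaElement
    };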

On the JS side, the specification defines a SpeechInputRequest interface that can initiate speech recognition on its own and return the results to the page through callbacks. (A usage sketch follows the IDL listing below.)

 [Constructor]
    interface SpeechInputRequest {
        // recognition parameters
        SpeechGrammars[] grammars;

        // misc parameter attributes
        integer maxNBest;
        DOMString language;
        boolean saveForRereco;
        boolean endpointDetection;
        boolean finalizeBeforeEnd;
        integer interimResults;
        float confidenceThreshold;
        float sensitivity;
        float speedVsAccuracy;
        integer completeTimeout;
        integer incompleteTimeout;
        integer maxSpeechTimeout;
        DOMString inputWaveformURI;

        // the generic set of parameters
        SpeechParameter[] parameters;

        // other attributes
        attribute DOMString serviceURI;
        attribute MediaStream input;
        const unsigned short SPEECH_AUTHORIZATION_UNKNOWN = 0;
        const unsigned short SPEECH_AUTHORIZATION_AUTHORIZED = 1;
        const unsigned short SPEECH_AUTHORIZATION_NOT_AUTHORIZED = 2;
        readonly attribute unsigned short authorizationState;
        attribute boolean continuous;

        // the generic send info method
        void sendInfo(in DOMString type, in DOMString value);

        // Default markup binding methods
        void addGrammarFrom(in Element inputElement, optional float weight, optional boolean modal);
        void outputToElement(in Element outputElement);

        // methods to drive the speech interaction
        void open();
        void start();
        void stop();
        void abort();
        void interpret(in DOMString text);

        // event methods
        attribute Function onaudiostart;
        attribute Function onsoundstart;
        attribute Function onspeechstart;
        attribute Function onspeechend;
        attribute Function onsoundend;
        attribute Function onaudioend;
        attribute Function onresult;
        attribute Function onnomatch;
        attribute Function onerror;
        attribute Function onauthorizationchange;
        attribute Function onopen;
        attribute Function onstart;
        attribute Function onend;
    };

    SpeechInputRequest implements EventTarget;

    interface SpeechInputNomatchEvent : Event {
        readonly attribute SpeechInputResult result;
    };

    interface SpeechInputErrorEvent : Event {
        readonly attribute SpeechInputError error;
    };

    interface SpeechInputError {
        const unsigned short SPEECH_INPUT_ERR_OTHER = 0;
        const unsigned short SPEECH_INPUT_ERR_NO_SPEECH = 1;
        const unsigned short SPEECH_INPUT_ERR_ABORTED = 2;
        const unsigned short SPEECH_INPUT_ERR_AUDIO_CAPTURE = 3;
        const unsigned short SPEECH_INPUT_ERR_NETWORK = 4;
        const unsigned short SPEECH_INPUT_ERR_NOT_ALLOWED = 5;
        const unsigned short SPEECH_INPUT_ERR_SERVICE_NOT_ALLOWED = 6;
        const unsigned short SPEECH_INPUT_ERR_BAD_GRAMMAR = 7;
        const unsigned short SPEECH_INPUT_ERR_LANGUAGE_NOT_SUPPORTED = 8;

        readonly attribute unsigned short code;
        readonly attribute DOMString message;
    };

    // Item in N-best list
    interface SpeechInputAlternative {
        readonly attribute DOMString utterance;
        readonly attribute float confidence;
        readonly attribute any interpretation;
    };

    // A complete one-shot simple response
    interface SpeechInputResult {
        readonly attribute Document resultEMMAXML;
        readonly attribute DOMString resultEMMAText;
        readonly attribute unsigned long length;
        getter SpeechInputAlternative item(in unsigned long index);
        readonly attribute boolean final;
    };

    // A full response, which could be interim or final, part of a continuous response or not
    interface SpeechInputResultEvent : Event {
        readonly attribute SpeechInputResult result;
        readonly attribute short resultIndex;
        readonly attribute SpeechInputResult[] results;
        readonly attribute DOMString sessionId;
    };

    // The object representing a speech grammar
    [Constructor]
    interface SpeechGrammar {
        attribute DOMString src;
        attribute float weight;
        attribute boolean modal;
    };

    // The object representing a speech parameter
    [Constructor]
    interface SpeechParameter {
        attribute DOMString name;
        attribute DOMString value;
    };

Speech Input API Specification

The Speech Input API Specification was submitted by two Google engineers and is the oldest of the three, dated October 18, 2010. It is unclear whether it is still being maintained; even the Latest Editor's Draft link is no longer available. Its approach to speech input is to add a speech attribute to the existing input element, plus a set of events on the input element for tracking the state of speech input. The proposed extension looks like this:

interface HTMLInputElement : HTMLElement {
    ...

    // speech input attributes
    attribute boolean speech;
    attribute DOMString grammar;
    attribute short maxresults;
    attribute long nospeechtimeout;

    // speech input event handler IDL attributes
    attribute Function oncapturestart();
    attribute Function onspeechstart();
    attribute Function onspeechchange(in SpeechInputEvent event);
    attribute Function onspeechend();
    attribute Function onspeecherror(in SpeechInputError error);

    // speech input methods
    void startSpeechInput();
    void stopSpeechInput();
    void cancelSpeechInput();
  };

The specification also states that only one speech input session may exist at any given time.
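
A minimal, hypothetical sketch of this input-element extension, using only the attribute, event, and method names quoted above (the id "q" and the markup are invented, and this draft was never implemented as written):

    // Assumed markup: <input type="search" id="q" speech maxresults="3">
    var q = document.getElementById('q');
    q.onspeechchange = function (event) {              // event would be a SpeechInputEvent carrying the results
        console.log('speech input changed');
    };
    q.onspeecherror = function (error) {               // error would be a SpeechInputError
        console.log('speech input error');
    };
    q.startSpeechInput();                              // begin capturing speech
    // later: q.stopSpeechInput() to finish, or q.cancelSpeechInput() to discard the session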

Speech JavaScript API Specification

The latest version of this specification is dated December 22, 2011. It was also submitted by Google engineers and is currently an unofficial proposal. It is best read together with the Speech Input API Specification (largely the same group of engineers): one extends the input element, the other extends the JS layer.

This specification covers only the JS API and provides two interfaces: SpeechReco and TTS. SpeechReco is similar in function to SpeechInputRequest in the HTML Speech Web API: it can initiate speech recognition and deliver results through events. (A usage sketch follows the IDL below.)

 [Constructor]
    interface SpeechReco {
        // recognition parameters
        SpeechGrammarList grammars;

        DOMString lang;
        attribute boolean continuous;

        // methods to drive the speech interaction
        void start();
        void stop();
        void abort();

        // event methods
        attribute Function onaudiostart;
        attribute Function onsoundstart;
        attribute Function onspeechstart;
        attribute Function onspeechend;
        attribute Function onsoundend;
        attribute Function onaudioend;
        attribute Function onresult;
        attribute Function onnomatch;
        attribute Function onresultdeleted;
        attribute Function onerror;
        attribute Function onstart;
        attribute Function onend;
    };

    SpeechReco implements EventTarget;

    interface SpeechInputError {
        const unsigned short OTHER = 0;
        const unsigned short NO_SPEECH = 1;
        const unsigned short ABORTED = 2;
        const unsigned short AUDIO_CAPTURE = 3;
        const unsigned short NETWORK = 4;
        const unsigned short NOT_ALLOWED = 5;
        const unsigned short SERVICE_NOT_ALLOWED = 6;
        const unsigned short BAD_GRAMMAR = 7;
        const unsigned short LANGUAGE_NOT_SUPPORTED = 8;

        readonly attribute unsigned short code;
        readonly attribute DOMString message;
    };

    // Item in N-best list
    interface SpeechInputAlternative {
        readonly attribute DOMString transcript;
        readonly attribute float confidence;
        readonly attribute any interpretation;
    };

    // A complete one-shot simple response
    interface SpeechInputResult {
        readonly attribute unsigned long length;
        getter SpeechInputAlternative item(in unsigned long index);
        readonly attribute boolean final;
    };

    // A collection of responses (used in continuous mode)
    interface SpeechInputResultList {
        readonly attribute unsigned long length;
        getter SpeechInputResult item(in unsigned long index);
    };

    // A full response, which could be interim or final, part of a continuous response or not
    interface SpeechInputResultEvent : Event {
        readonly attribute SpeechInputResult result;
        readonly attribute SpeechInputError error;
        readonly attribute short resultIndex;
        readonly attribute SpeechInputResultList resultHistory;
    };

    // The object representing a speech grammar
    [Constructor]
    interface SpeechGrammar {
        attribute DOMString src;
        attribute float weight;
    };

    // The object representing a speech grammar collection
    [Constructor]
    interface SpeechGrammarList {
        readonly attribute unsigned long length;
        getter SpeechGrammar item(in unsigned long index);
        void addFromUri(in DOMString src,
                        optional float weight);
        void addFromString(in DOMString string,
                        optional float weight);
    };

    [Constructor]
      interface TTS {
          attribute DOMString text;
          attribute DOMString lang;
          readonly attribute boolean paused;
          readonly attribute boolean ended;

          // methods to drive the speech interaction
          void play();
          void pause();
          void stop();

          attribute Function onstart;
          attribute Function onend;
    };
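
Putting the two interfaces together, a minimal, hypothetical sketch might look like the following. It relies only on the names in the IDL above; this draft API was never shipped in exactly this form, so the code is illustrative rather than runnable:

    // Hypothetical: recognize an utterance with SpeechReco, then echo it back with TTS
    var reco = new SpeechReco();
    reco.lang = 'en-US';
    reco.onresult = function (event) {
        var transcript = event.result.item(0).transcript;  // top alternative from the result
        var tts = new TTS();
        tts.text = 'You said: ' + transcript;
        tts.lang = 'en-US';
        tts.play();
    };
    reco.start();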

Summary

The HTML Speech Web API adds a reco element to provide the speech input UI and a tts element for text-to-speech, while the SpeechInputRequest JS interface can be called on its own to obtain recognition results. The Speech Input API Specification and the Speech JavaScript API Specification are best read as a pair: their approach extends the existing input element to provide a speech input entry point in the UI, and provides the SpeechReco JS interface to initiate recognition independently and retrieve the results.
