Microsoft Speech Object Library

Abstract

This article is about Text-to-Speech (TTS) using Microsoft Word and the Microsoft Speech Object Library (SpVoice, SAPI 5.4).

Description

Findings

XML markup seems to work only when the text is spoken synchronously, i.e. when the SVSFlagsAsync flag is not passed to Speak.

Sample Code (VBA)

Voices Overview

001

002

003

004

005

006

007

008

009

010

011

012

013

014

015

 

Sub listAvailableVoices()
    ' Prints the description of every voice token returned by
    ' SpVoice.GetVoices to the Immediate window, one per line.

    Dim speaker As SpVoice
    Dim token As Variant

    Set speaker = New SpVoice

    ' GetVoices is evaluated once; For Each walks the returned collection.
    For Each token In speaker.GetVoices
        Debug.Print token.GetDescription
    Next token

    Set speaker = Nothing

End Sub

Speak Text

001

002

003

004

005

006

007

008

009

010

011

012

013

014

015

016

017

018

 

Sub speakXMLText(ByVal strText As String)
    ' Speaks strText on the current audio output, passing SVSFIsXML so
    ' that the text is treated as XML markup.
    '
    ' strText - the text (optionally containing SAPI XML tags) to speak.

    Dim speaker As SpVoice

    Set speaker = New SpVoice

    With speaker
        ' Use the first voice in the collection; on the author's machine
        ' this was "CereVoice Isabella - English (East Coast America)".
        Set .Voice = .GetVoices.Item(0)
        .Rate = -1 ' speed
        .Speak strText, SVSFIsXML
    End With

    Set speaker = Nothing

End Sub

Speak Text (output to file)

001

002

003

004

005

006

007

008

009

010

011

012

013

014

015

016

017

018

019

020

021

022

023

024

025

026

 

Sub speakXMLTextToFile(ByVal strText As String, ByVal strFilePath As String)
    ' Renders strText (treated as XML markup via SVSFIsXML) into the audio
    ' file strFilePath instead of playing it.
    '
    ' strText     - the text (optionally containing SAPI XML tags) to speak.
    ' strFilePath - path of the output file; opened with SSFMCreateForWrite.
    '
    ' Any error raised while configuring the voice, opening the file or
    ' speaking is re-raised to the caller, but only after the file stream
    ' has been closed, so a failure does not leave the output file open.

    Dim tts As SpVoice
    Dim FileStream As SpFileStream
    Dim streamOpen As Boolean
    Dim errNumber As Long
    Dim errSource As String
    Dim errDescription As String

    Set tts = New SpVoice
    Set FileStream = New SpFileStream

    On Error GoTo Fail

    ' setup
    With tts
        ' Use the first voice in the collection; on the author's machine
        ' this was "CereVoice Isabella - English (East Coast America)".
        Set .Voice = .GetVoices.Item(0)
        .Rate = -1 ' speed
        .Volume = 100
    End With

    FileStream.Open strFilePath, SSFMCreateForWrite, True
    streamOpen = True

    Set tts.AudioOutputStream = FileStream

    ' Speak synchronously: the stream is closed right after this call, so
    ' the audio has to be completely written by the time Speak returns.
    tts.Speak strText, SVSFIsXML

    ' Normal path: a failing Close is still reported to the caller.
    FileStream.Close
    streamOpen = False

CleanUp:
    ' Best-effort close when we arrive here because of an error.
    On Error Resume Next
    If streamOpen Then FileStream.Close
    Set tts = Nothing
    Set FileStream = Nothing
    On Error GoTo 0

    If errNumber <> 0 Then Err.Raise errNumber, errSource, errDescription
    Exit Sub

Fail:
    ' Remember the error first: Resume and On Error both clear Err.
    errNumber = Err.Number
    errSource = Err.Source
    errDescription = Err.Description
    Resume CleanUp

End Sub

Speak Text Sentence by Sentence (with given Silence)

001

002

003

004

005

006

007

008

009

010

011

012

013

014

015

016

017

018

019

020

021

022

 

Sub speakDocumentSentenceBySentence(Optional ByVal intSilence As Integer = 1000)
    ' Speaks every sentence of ThisDocument in turn, following each
    ' sentence with a <silence> tag of intSilence milliseconds.
    '
    ' intSilence - length of the pause after each sentence in msec
    '              (default 1000).

    Dim speaker As SpVoice
    Dim sentence As Range
    Dim pauseMarkup As String

    Set speaker = New SpVoice

    ' setup
    With speaker
        ' Use the first voice in the collection; on the author's machine
        ' this was "CereVoice Isabella - English (East Coast America)".
        Set .Voice = .GetVoices.Item(0)
        .Rate = -1 ' speed
    End With

    ' The pause markup does not change between sentences, so build it once.
    pauseMarkup = "<silence msec='" & intSilence & "'/>"

    For Each sentence In ThisDocument.Sentences
        speaker.Speak sentence.Text, SVSFIsXML
        speaker.Speak pauseMarkup, SVSFIsXML
    Next sentence

    Set speaker = Nothing

End Sub

Speak Text Sentence by Sentence (output to file with given silence)

001

002

003

004

005

006

007

008

009

010

011

012

013

014

015

016

017

018

019

020

021

022

023

024

025

026

027

028

029

030

 

Sub speakDocumentSentenceBySentenceToFile(ByVal strFilePath As String, Optional ByVal intSilence As Integer = 500)
    ' Renders every sentence of ThisDocument into the audio file
    ' strFilePath, following each sentence with a <silence> tag of
    ' intSilence milliseconds.
    '
    ' strFilePath - path of the output file; opened with SSFMCreateForWrite.
    ' intSilence  - length of the pause after each sentence in msec
    '               (default 500).
    '
    ' Any error raised while configuring the voice, opening the file or
    ' speaking a sentence is re-raised to the caller, but only after the
    ' file stream has been closed, so a failure part-way through the
    ' document does not leave the output file open.

    Dim tts As SpVoice
    Dim FileStream As SpFileStream
    Dim st As Variant
    Dim streamOpen As Boolean
    Dim errNumber As Long
    Dim errSource As String
    Dim errDescription As String

    Set tts = New SpVoice
    Set FileStream = New SpFileStream

    On Error GoTo Fail

    ' setup
    With tts
        ' Use the first voice in the collection; on the author's machine
        ' this was "CereVoice Isabella - English (East Coast America)".
        Set .Voice = .GetVoices.Item(0)
        .Rate = -1 ' speed
    End With

    FileStream.Open strFilePath, SSFMCreateForWrite, True
    streamOpen = True

    Set tts.AudioOutputStream = FileStream

    ' Synchronous Speak calls: the stream is closed right after the loop,
    ' so each call has to finish writing before the next statement runs.
    For Each st In ThisDocument.Sentences
        tts.Speak st, SVSFIsXML
        tts.Speak "<silence msec='" & intSilence & "'/>", SVSFIsXML
    Next st

    ' Normal path: a failing Close is still reported to the caller.
    FileStream.Close
    streamOpen = False

CleanUp:
    ' Best-effort close when we arrive here because of an error.
    On Error Resume Next
    If streamOpen Then FileStream.Close
    Set tts = Nothing
    Set FileStream = Nothing
    On Error GoTo 0

    If errNumber <> 0 Then Err.Raise errNumber, errSource, errDescription
    Exit Sub

Fail:
    ' Remember the error first: Resume and On Error both clear Err.
    errNumber = Err.Number
    errSource = Err.Source
    errDescription = Err.Description
    Resume CleanUp

End Sub

 


Source Text

Source text can be enriched with XML tags, for example to change the volume or to spell out parts of the given text.

 

<voice emotion='happy'>the sun is shining.</voice>

<voice emotion='sad'>the sun is shining.</voice>

 

<volume level="20">

This text should be spoken at volume level twenty.

 

   <volume level="100">

      This text should be spoken at volume level one hundred.

   </volume>

 

</volume>

 

<volume level="80"/>

All text which follows should be spoken at volume level eighty.

 

<rate absspeed="5">

   This text should be spoken at rate five.

   <rate absspeed="-5">

      This text should be spoken at rate negative five.

   </rate>

</rate>

<rate absspeed="10"/>

 

<rate speed="5">

   This text should be spoken at rate five.

      <rate speed="-5">

         This text should be spoken at rate zero.

      </rate>

</rate>

 

<pitch absmiddle="5">

This text should be spoken at pitch five.

   <pitch absmiddle="-5">

      This text should be spoken at pitch negative five.

   </pitch>

</pitch>

<pitch absmiddle="10"/>

 

<pitch middle="5">

This text should be spoken at pitch five.

   <pitch middle="-5">

      This text should be spoken at pitch zero.

   </pitch>

</pitch>

 

<emph> boo </emph>!

 

<spell>

These words should be spelled out.

</spell>

These words should not be spelled out.

 

Five hundred milliseconds of silence <silence msec="500"/> just occurred.

 

<pron sym="h eh 1 l ow & w er 1 l d "/>

<pron sym="h eh 1 l ow & w er 1 l d"> hello world </pron>

 

<context id="date_mdy"> 03/04/01 </context> should be March fourth, two thousand one.

<context id="date_dmy"> 03/04/01 </context> should be April third, two thousand one.

<context id="date_ymd"> 03/04/01 </context> should be April first, two thousand three.

 

<voice required="Gender=Female;Age!=Child">

A female non-child should speak this sentence, if one exists.

 

<voice required="Age=Teen">

   A teen should speak this sentence - if a female, non-child teen is present, she will be selected over a male teen, for example.

   </voice>

</voice>

 

<voice required="Language=409">

A U.S. English voice should speak this.

</voice>

<lang langid="409">

   A U.S. English voice should speak this.

</lang>

 

<P DISP="disp" PRON="pron">word</P>

 

<P>/disp/word/pron;</P>

 

<P DISP="greeting" PRON="ah">hello</P>

 

<P>/greeting/hello/ah;</P>

 


Dieter Neumann