Examples

Converting Speech to Text

In our first example, we create streams for audio input, create a Server object to handle communication with the speech recognition server, direct the server to decode the streams, and print out the utterances collected in a SpeechData object for each stream.

Like the following example, most speech-to-text applications will follow some number of these general steps:

  1. Make streams describing audio sources:

    To make streams for all wave files in a directory, use createdirstreams():

    from client.stream import createdirstreams
    streams = createdirstreams(<wavdir>, "wav", activitylevel=0)

    To construct a Stream from a socket, use the following syntax:

    s = Stream(("mymachine", <port>))
  2. Connect to the speech recognition server:

    from client.server import Server
    server = Server("host")
    
  3. Launch decoding of streams:

    server.decodestreams(streams)
    
  4. Collect and process utterances for each stream:

    from client.speechdata import SpeechData
    for s in streams:
      sd = SpeechData(s)
      sd.join() # wait for stream to finish
      for utt in sd.utterances: print utt
    
  5. Delete streams from the server:

    server.deletestreams(streams)
    

This example program provides one speech recognition parameter, --nosegment, which turns off utterance segmentation by setting each stream’s “activitylevel” parameter to 0. “activitylevel” defines the voice activity level below which sound is considered silence. When it is set to zero, no sound is considered silence, so the audio input is not segmented at periods of silence:

nosegment = "--nosegment" in argv

tags = { "activitylevel" : 0 } if nosegment else {}

The complete program follows:

#!/usr/bin/env python
# Copyright 2010 Silicon Vox Corp.  All rights reserved.  Contains confidential company information.

# insert parent directory of this file into system path
if __name__ == '__main__':
    import sys, os, os.path as osp
    fdir = osp.dirname(__file__)
    if fdir:
        # __file__ carries a directory part: take that directory's parent,
        # or the working directory when the parent comes out empty
        pdir = osp.dirname(fdir) or os.getcwd()
    else:
        # __file__ is a bare file name: take the parent of the working directory
        pdir = osp.dirname(os.getcwd())
    if pdir: sys.path.insert(0, pdir)
    del sys, os, osp

from sys import argv, stdout, exit
from os.path import isfile, join, split, splitext, abspath
from datetime import datetime
from client.stream import Stream, createdirstreams
from client.server import Server
from client.speechdata import SpeechData
from client.miscutils import isoutputdir, timedeltaseconds

def usage():
    print "Usage:", argv[0], "[-o outputdir/--nosegment] -s <server> <wavfiles or dirs>"
    exit(1)
    
def processcmdln():
    if len(argv) < 3 or any(x in argv for x in ("--help", "-h")): usage()

    try:
        i = argv.index("-s")
        host = argv[i+1]
        del argv[i:i+2]
    except: exit("Must specify Silicon Vox server using form '-s <server>'")

    try:
        i = argv.index("-o")
        odir = argv[i+1]
        del argv[i:i+2]
    except:
        odir = None
        print "output directory: <wav file directory>"
    else:
        if isoutputdir(odir): print "output directory:", abspath(odir)
        else: exit("Cannot write to output directory: " + odir)

    nosegment = "--nosegment" in argv
    if nosegment: argv.remove("--nosegment")

    return host, odir, nosegment, argv[1:]
    
if __name__ == '__main__':
    # Script entry point: stream the requested wav files to the server, write
    # one .txt transcript per stream and report the overall decoding speed.
    host, odir, nosegment, wavfiles = processcmdln()

    try: server = Server(host)
    except Exception as e: exit("Cannot connect to server: " + str(e))
    
    # One Stream per wav file; an argument that is not an existing file is
    # handed to createdirstreams() to collect the "wav" files it contains.
    streams = []
    # "activitylevel" 0 is the setting behind --nosegment
    tags = { "activitylevel" : 0 } if nosegment else {}
    for s in wavfiles:
        if isfile(s): streams.append(Stream(s, **tags))
        else: streams += createdirstreams(s, "wav", **tags)

    if not streams: exit("No wav files found")
            
    # wall-clock start, used for the speed report at the end
    decodestart = datetime.now()

    print "streaming files to server '%s'" % host
    server.decodestreams(streams)

    # Status line layout: "[cnt/total] <message>", at most maxlnlen columns.
    maxlnlen = 70
    cntw = len(str(len(streams)))  # digits needed to print the stream count
    # counter is zero-padded to cntw digits, e.g. "[%02d/12] " for 12 streams
    statmsgprefix = "[%%0%dd/%d] " % (cntw, len(streams))
    # columns left for the message once the prefix has been printed
    maxmsglen = maxlnlen - len(statmsgprefix % 1)

    cnt = 0
    for s in streams:
        sd = SpeechData(s)
        sd.join()  # wait for the stream to finish

        # transcript is named after the source file with a .txt extension and
        # is written to the output directory, or next to the source without -o
        fdir, fname = split(s.source)
        if odir: fdir = odir
        fname = join(fdir, splitext(fname)[0] + ".txt")

        # one utterance text per line
        with open(fname, "w") as f:
            for u in sd.utterances: print >> f, u.text()

        cnt += 1

        # print status message
        # A name longer than maxmsglen is cut to its last maxmsglen characters
        # and so overwrites the whole line; a shorter one needs the line
        # blanked first.  The trailing commas keep print from ending the line,
        # so "\r" rewrites the same terminal line on every iteration.
        if len(fname) > maxmsglen: msg = fname[-maxmsglen:]
        else:
            print "\r" + " " * maxlnlen, # clear line
            msg = fname
        print "\r" + (statmsgprefix % cnt) + msg,
        stdout.flush()
        
    # elapsed time from the start of streaming until the last transcript
    totsecs = timedeltaseconds(datetime.now() - decodestart)

    # refresh stream attributes so sourceseconds is available
    server.refreshstreams(streams)

    # total audio duration; a falsy sourceseconds() result counts as 0
    sourcesecs = 0
    for s in streams: sourcesecs += s.sourceseconds() or 0

    # delete streams from server
    server.deletestreams(streams)

    # final summary overwrites the status line: audio seconds per elapsed second
    msg = "files transcribed at %dX [%.1fs in %.1fs]" % (round(sourcesecs/totsecs),
                                                         sourcesecs, totsecs)
    print "\r" + (statmsgprefix % cnt) + msg.ljust(maxmsglen)

Communication with Other Applications

In this example, the Silicon Vox client acts as a “middle man” server, passing requests and data between the Silicon Vox speech recognition server and another application. This example uses a simple TCP socket interface, enabling Silicon Vox to connect to any client that can use sockets. These clients can be written in any language (C/C++, Java, etc.), which is especially important for tightly constrained platforms, such as mobile devices, that might not have a Python implementation.

This socket interface passes packets back and forth between the middleman server and socket clients. For each packet, first the size of the message is sent/received, then the message, either a string or a wave file, is sent/received. Functions include recvsize(), sendsize(), recvstring(), sendstring(), and recvtempfile().

  1. The first message is the command from the client, received via recvstring(). In this simple example, decode is the only command available.

    # Receive one size-prefixed string from conn.  When maxsize is given and
    # the announced size exceeds it, raise before reading the payload.
    def recvstring(conn, maxsize=None):
        sz = recvsize(conn)
        if maxsize and sz > maxsize:
            raise Exception("Command string size too large: %d" % sz)
        return recvall(conn, sz)
    
    cmd = recvstring(conn, 80)
    if cmd == "decode":
  2. Then the client sends the size of the file, and the raw file data, received via recvtempfile().

    # Receive one size-prefixed wave file from conn into a temporary .wav file
    # under /dev/shm/ and return the temporary file object.
    def recvtempfile(conn):
        sz = recvsize(conn)
        if sz <= 0: raise Exception("Bad decode datasize: " + str(sz))
        tmpf = TempFile(suffix=".wav", dir="/dev/shm/")
        # grow the file to the announced size, then map it so that recvall()
        # can be given the mapping as its third argument
        ftruncate(tmpf.file.fileno(), sz)
        mmf = mmap(tmpf.file.fileno(), 0)
        recvall(conn, sz, mmf)
        return tmpf
    
    tmpf = recvtempfile(conn)
    
  3. A Stream is created for the file and sent to the Silicon Vox speech recognition server for decoding. A SpeechData object is used to collect the results and get the text of the resulting utterance:

    s = Stream(tmpf.name)
    server.decodestreams(s)
    sd = SpeechData(s)
    sd.join()
    utt = sd.flat_utterance().text()
    
  4. Finally the text of the utterance is sent from the middleman to the client via sendstring().

    # Send string to conn as one packet: its size first, then the string itself.
    def sendstring(conn, string):
       sendsize(conn, string)
       conn.sendall(string)
    
    sendstring(conn, utt)
    

The complete program follows:

#!/usr/bin/env python
# Copyright 2010 Silicon Vox Corp.  All rights reserved.  Contains confidential company information.

# insert parent directory of this file into system path
if __name__ == '__main__':
    import sys, os, os.path as osp
    fdir = osp.dirname(__file__)
    if fdir:
        # __file__ carries a directory part: take that directory's parent,
        # or the working directory when the parent comes out empty
        pdir = osp.dirname(fdir) or os.getcwd()
    else:
        # __file__ is a bare file name: take the parent of the working directory
        pdir = osp.dirname(os.getcwd())
    if pdir: sys.path.insert(0, pdir)
    del sys, os, osp
    
from sys import argv, exit
from tempfile import NamedTemporaryFile as TempFile
from struct import pack, unpack, calcsize
from os import ftruncate
from mmap import mmap
from client.socketutil import serversocket, sockport, recvall
from client.threadutil import newthread
from client.stream import Stream
from client.server import Server
from client.speechdata import SpeechData

def usage():
    print "Usage:", argv[0], "-s <server> [-p <localport>] [--debug]"
    exit(1)
    
def processcmdln():
    """Parse the command line and return (host, port).

    host -- value of the required "-s <server>" pair
    port -- integer value of the optional "-p <localport>" pair; 0 when no
            usable "-p" pair is present

    Recognized options are removed from argv in place and the module-level
    _debug flag is set when "--debug" is given.  Calls usage() for a bad
    argument count or --help/-h, and exits with a message when the server is
    missing or the local port is not an integer.
    """
    if not 3 <= len(argv) < 7 or any(x in argv for x in ("--help", "-h")): usage()

    # index() raises ValueError when the flag is absent and argv[i+1] raises
    # IndexError when the flag is the last argument.
    try:
        i = argv.index("-s")
        host = argv[i+1]
    except (ValueError, IndexError):
        exit("Must specify Silicon Vox server using form '-s <server>'")
    del argv[i:i+2]

    try:
        i = argv.index("-p")
        port = argv[i+1]
    except (ValueError, IndexError):
        port = 0
    else:
        del argv[i:i+2]
        # The conversion is checked outside the lookup's try block: exit()
        # raises SystemExit, which a surrounding catch-all handler would
        # swallow and silently turn into port 0.
        try: port = int(port)
        except ValueError: exit("Error: localport (%s) must be an integer" % port)

    global _debug
    _debug = "--debug" in argv
    if _debug: argv.remove("--debug")

    return host, port
    
_debug = False
def debug(*msg):
    if _debug: print " ".join(map(str, msg))
    
# Every packet on the client socket starts with its payload size, packed with
# this struct format ("<" = little endian, standard sizes; "L" = unsigned long).
_SZFMT = "<L" # 4 byte unsigned long in little endian
_SZLEN = calcsize(_SZFMT) # number of bytes occupied by the packed size

def recvsize(conn):
    """Read the _SZLEN-byte size header from conn and return it as an integer."""
    (size,) = unpack(_SZFMT, recvall(conn, _SZLEN))
    return size

def sendsize(conn, string):
    """Send len(string), packed with _SZFMT, to conn as a size header."""
    header = pack(_SZFMT, len(string))
    conn.sendall(header)

def recvstring(conn, maxsize=None):
    """Receive one size-prefixed string from conn and return it.

    When maxsize is given (and non-zero), an announced size above it raises
    an Exception before any of the payload is read.
    """
    sz = recvsize(conn)
    if not maxsize or sz <= maxsize:
        return recvall(conn, sz)
    raise Exception("Command string size too large: %d" % sz)

def sendstring(conn, string):
    """Send string to conn as one packet: its size first, then the string."""
    sendsize(conn, string)
    conn.sendall(string)

def recvtempfile(conn):
    """Receive one size-prefixed wave file from conn into a temporary file.

    Returns the NamedTemporaryFile object holding the data; it lives under
    /dev/shm/ with a ".wav" suffix and is deleted when the caller closes it.
    Raises Exception when the announced size is not positive.  If receiving
    the data fails, the temporary file is closed (and thereby deleted) before
    the error propagates.
    """
    sz = recvsize(conn)
    debug("recv'd size", sz)
    # NOTE(review): only the lower bound is checked; a client may announce up
    # to 4 GB, which is then allocated under /dev/shm/.
    if sz <= 0: raise Exception("Bad decode datasize: " + str(sz))
    tmpf = TempFile(suffix=".wav", dir="/dev/shm/")
    try:
        fd = tmpf.file.fileno()
        # grow the file to its final size so that the whole of it can be mapped
        ftruncate(fd, sz)
        mmf = mmap(fd, 0)
        # release the mapping explicitly instead of waiting for garbage
        # collection; the received bytes stay in the underlying file
        try: recvall(conn, sz, mmf)
        finally: mmf.close()
    except BaseException:
        tmpf.close() # also deletes temp file
        raise
    debug("recv'd file")
    return tmpf

def handleconn(conn, server):
    """Serve one client connection until it fails or is closed.

    conn   -- connected client socket
    server -- Server object used to decode the received wave files

    Each request starts with a command string of at most 80 bytes.  "decode"
    is followed by a wave file; the reply is the text of the flattened
    utterance, or "BAD WAV FILE" when no Stream can be built from the data.
    Any other command is answered with "UNKNOWN COMMAND".  Any exception ends
    the loop, and the connection is always closed on the way out.
    """
    try:
        while True:
            cmd = recvstring(conn, 80)
            if cmd != "decode":
                debug("received unknown command:", cmd)
                sendstring(conn, "UNKNOWN COMMAND")
                continue

            tmpf = recvtempfile(conn)
            decoding = False  # True once the stream was handed to the server
            try:
                try: s = Stream(tmpf.name)
                except Exception as e:
                    debug("Stream exception:", e)
                    sendstring(conn, "BAD WAV FILE")
                    continue
                server.decodestreams(s)
                decoding = True
                sd = SpeechData(s)
                sd.join() # wait for stream to finish
                utt = sd.flat_utterance().text()
                debug("sending utt:", utt)
                sendstring(conn, utt)
            finally:
                # Runs on every path out of the request, including the
                # "BAD WAV FILE" continue and errors, so the file in /dev/shm
                # and the server-side stream never outlive their request.
                tmpf.close() # also deletes temp file
                if decoding: server.deletestreams(s)
    except Exception as e:
        debug("closing connection:", e)
    finally:
        conn.close()
    
if __name__ == '__main__':
    # Script entry point: connect to the speech recognition server, open the
    # local server socket and hand every accepted connection to handleconn()
    # through newthread().
    host, port = processcmdln()

    try:
        server = Server(host)
    except Exception as e:
        exit("Error: cannot connect to server: " + str(e))

    try:
        ss = serversocket(port)
    except Exception as e:
        exit("Error: could not start local server: " + str(e))

    debug("Local server on port:", sockport(ss))

    try:
        while True:
            conn, addr = ss.accept()
            newthread(handleconn, (conn, server))
    except:
        # catch-all: anything raised here, Ctrl-C included, ends the accept loop
        debug("local server exiting")

Module Quick Links

Table Of Contents

Previous topic

Silicon Vox: Accelerated Speech Recognition

Next topic

The server Module

This Page