In our first example, we create a stream for audio input, start a server to handle communication with the speech recognition server, direct the server to decode the stream, and print out the utterances stored in the speechdata.
Like the following example, most speech-to-text applications will follow some number of these general steps:
Make streams describing audio sources:
To make streams for all wave files in a directory, use createdirstreams():
from client.stream import createdirstreams
streams = createdirstreams(<wavdir>, "wav", activitylevel=0)
To construct a Stream from a socket, use the following syntax:
s = Stream(("mymachine", <port>))
Connect to the speech recognition server:
from client.server import Server
server = Server("host")
Launch decoding of streams:
server.decodestreams(streams)
Collect and process utterances for each stream:
from client.speechdata import SpeechData
for s in streams:
sd = SpeechData(s)
sd.join() # wait for stream to finish
for utt in sd.utterances: print utt
Delete streams from the server:
server.deletestreams(streams)
This example program provides one speech recognition parameter, --nosegment, which turns off utterance segmentation by setting each stream’s “activitylevel” param to 0. “activitylevel” defines the voice activity level below which sound is considered silence. Setting it to zero means that no sound is considered silence, so the audio input is not segmented at periods of silence:
nosegment = "--nosegment" in argv
tags = { "activitylevel" : 0 } if nosegment else {}
The complete program follows:
#!/usr/bin/env python
# Copyright 2010 Silicon Vox Corp. All rights reserved. Contains confidential company information.
# insert parent directory of this file into system path
if __name__ == '__main__':
    # When run as a script, put the parent of this file's directory at the
    # front of sys.path (presumably so the "client" package imported below
    # can be found -- confirm against the project layout).
    import sys, os, os.path as osp
    fdir = osp.dirname(__file__)
    if fdir:
        # dirname() of a bare relative directory name is "", in which case
        # the parent is the current working directory
        pdir = osp.dirname(fdir) or os.getcwd()
    else:
        # __file__ has no directory part: the script lives in the current
        # working directory, so the parent is one level above it
        pdir = osp.dirname(os.getcwd())
    if pdir:
        sys.path.insert(0, pdir)
    # do not leave the helper modules bound in the script's namespace
    del sys, os, osp
from sys import argv, stdout, exit
from os.path import isfile, join, split, splitext, abspath
from datetime import datetime
from client.stream import Stream, createdirstreams
from client.server import Server
from client.speechdata import SpeechData
from client.miscutils import isoutputdir, timedeltaseconds
def usage():
print "Usage:", argv[0], "[-o outputdir/--nosegment] -s <server> <wavfiles or dirs>"
exit(1)
def processcmdln():
if len(argv) < 3 or any(x in argv for x in ("--help", "-h")): usage()
try:
i = argv.index("-s")
host = argv[i+1]
del argv[i:i+2]
except: exit("Must specify Silicon Vox server using form '-s <server>'")
try:
i = argv.index("-o")
odir = argv[i+1]
del argv[i:i+2]
except:
odir = None
print "output directory: <wav file directory>"
else:
if isoutputdir(odir): print "output directory:", abspath(odir)
else: exit("Cannot write to output directory: " + odir)
nosegment = "--nosegment" in argv
if nosegment: argv.remove("--nosegment")
return host, odir, nosegment, argv[1:]
if __name__ == '__main__':
host, odir, nosegment, wavfiles = processcmdln()
try: server = Server(host)
except Exception as e: exit("Cannot connect to server: " + str(e))
streams = []
tags = { "activitylevel" : 0 } if nosegment else {}
for s in wavfiles:
if isfile(s): streams.append(Stream(s, **tags))
else: streams += createdirstreams(s, "wav", **tags)
if not streams: exit("No wav files found")
decodestart = datetime.now()
print "streaming files to server '%s'" % host
server.decodestreams(streams)
maxlnlen = 70
cntw = len(str(len(streams)))
statmsgprefix = "[%%0%dd/%d] " % (cntw, len(streams))
maxmsglen = maxlnlen - len(statmsgprefix % 1)
cnt = 0
for s in streams:
sd = SpeechData(s)
sd.join()
fdir, fname = split(s.source)
if odir: fdir = odir
fname = join(fdir, splitext(fname)[0] + ".txt")
with open(fname, "w") as f:
for u in sd.utterances: print >> f, u.text()
cnt += 1
# print status message
if len(fname) > maxmsglen: msg = fname[-maxmsglen:]
else:
print "\r" + " " * maxlnlen, # clear line
msg = fname
print "\r" + (statmsgprefix % cnt) + msg,
stdout.flush()
totsecs = timedeltaseconds(datetime.now() - decodestart)
# refresh stream attributes so sourceseconds is available
server.refreshstreams(streams)
sourcesecs = 0
for s in streams: sourcesecs += s.sourceseconds() or 0
# delete streams from server
server.deletestreams(streams)
msg = "files transcribed at %dX [%.1fs in %.1fs]" % (round(sourcesecs/totsecs),
sourcesecs, totsecs)
print "\r" + (statmsgprefix % cnt) + msg.ljust(maxmsglen)
In this example, the Silicon Vox client acts as a “middle man” server, passing requests and data between the Silicon Vox speech recognition server and another application. This example uses a simple TCP socket interface, enabling Silicon Vox to connect to any client that can use sockets. These clients can be written in any language (C/C++, Java, etc.), which is especially important for tightly constrained platforms that might not have a Python implementation, such as mobile devices.
This socket interface passes packets back and forth between the middleman server and socket clients. For each packet, first the size of the message is sent/received, then the message, either a string or a wave file, is sent/received. Functions include recvsize(), sendsize(), recvstring(), sendstring(), and recvtempfile().
The first message is the command from the client, received via recvstring(). In this simple example, decode is the only command available.
def recvstring(conn, maxsize=None):
    """Receive one size-prefixed string from conn.

    Raises Exception when maxsize is given (and non-zero) and the
    announced size exceeds it; the payload is not read in that case.
    """
    size = recvsize(conn)
    if maxsize and size > maxsize:
        raise Exception("Command string size too large: %d" % size)
    return recvall(conn, size)
cmd = recvstring(conn, 80)
if cmd == "decode":
Then the client sends the size of the file, and the raw file data, received via recvtempfile().
def recvtempfile(conn):
    """Receive one size-prefixed file from conn into a temporary .wav file.

    The file is created under /dev/shm/, grown to the announced size and
    filled through a memory map. The open NamedTemporaryFile is returned;
    it is removed from disk when the caller closes it.

    Raises Exception when the announced size is not positive. If the
    transfer fails, the temporary file is closed before the error is
    re-raised.
    """
    sz = recvsize(conn)
    if sz <= 0: raise Exception("Bad decode datasize: " + str(sz))
    # NOTE(review): sz comes straight from the client and has no upper bound
    tmpf = TempFile(suffix=".wav", dir="/dev/shm/")
    try:
        fd = tmpf.file.fileno()
        ftruncate(fd, sz)
        mmf = mmap(fd, 0)
        # release the mapping as soon as the transfer is over, whether or
        # not it succeeded
        try: recvall(conn, sz, mmf)
        finally: mmf.close()
    except BaseException:
        # do not leave a half-written file behind in /dev/shm
        tmpf.close()
        raise
    return tmpf
tmpf = recvtempfile(conn)
A Stream is created for the file and sent to the Silicon Vox speech recognition server for decoding. A SpeechData object is used to collect the results and get the resulting Utterance text:
s = Stream(tmpf.name)
server.decodestreams(s)
sd = SpeechData(s)
sd.join()
utt = sd.flat_utterance().text()
Finally, the text of the utterance is sent from the middleman to the client via sendstring().
def sendstring(conn, string):
    """Send string over conn as one packet: its size first, then the data."""
    sendsize(conn, string)   # size header
    conn.sendall(string)     # payload
sendstring(conn, utt)
The complete program follows:
#!/usr/bin/env python
# Copyright 2010 Silicon Vox Corp. All rights reserved. Contains confidential company information.
# insert parent directory of this file into system path
if __name__ == '__main__':
    # When run as a script, put the parent of this file's directory at the
    # front of sys.path (presumably so the "client" package imported below
    # can be found -- confirm against the project layout).
    import sys, os, os.path as osp
    fdir = osp.dirname(__file__)
    if fdir:
        # dirname() of a bare relative directory name is "", in which case
        # the parent is the current working directory
        pdir = osp.dirname(fdir) or os.getcwd()
    else:
        # __file__ has no directory part: the script lives in the current
        # working directory, so the parent is one level above it
        pdir = osp.dirname(os.getcwd())
    if pdir:
        sys.path.insert(0, pdir)
    # do not leave the helper modules bound in the script's namespace
    del sys, os, osp
from sys import argv, exit
from tempfile import NamedTemporaryFile as TempFile
from struct import pack, unpack, calcsize
from os import ftruncate
from mmap import mmap
from client.socketutil import serversocket, sockport, recvall
from client.threadutil import newthread
from client.stream import Stream
from client.server import Server
from client.speechdata import SpeechData
def usage():
print "Usage:", argv[0], "-s <server> [-p <localport>] [--debug]"
exit(1)
def processcmdln():
    """Parse the command line of the middleman server example.

    Recognized options are removed from argv as they are consumed. As a
    side effect the module-level _debug flag is set from --debug.

    Returns:
        (host, port) where host is the value given with -s and port is
        the integer given with -p, or 0 when -p is absent or has no value.

    Exits through usage() on a bad argument count or --help/-h, and with
    an error message when -s is missing or the -p value is not an integer.
    """
    global _debug
    if not 3 <= len(argv) < 7 or any(x in argv for x in ("--help", "-h")): usage()
    try:
        i = argv.index("-s")
        host = argv[i+1]
        del argv[i:i+2]
    except (ValueError, IndexError):
        # ValueError: "-s" is absent; IndexError: "-s" was the last argument
        exit("Must specify Silicon Vox server using form '-s <server>'")
    try:
        i = argv.index("-p")
        port = argv[i+1]
        del argv[i:i+2]
    except (ValueError, IndexError):
        port = 0
    else:
        # Validate outside the try above: exit() raises SystemExit, which a
        # surrounding catch-all would swallow and turn into port 0.
        try: port = int(port)
        except ValueError: exit("Error: localport (%s) must be an integer" % port)
    _debug = "--debug" in argv
    if _debug: argv.remove("--debug")
    return host, port
_debug = False
def debug(*msg):
if _debug: print " ".join(map(str, msg))
_SZFMT = "<L" # 4 byte unsigned long in little endian
_SZLEN = calcsize(_SZFMT)

def recvsize(conn):
    """Read one size header (_SZLEN bytes) from conn and return its value."""
    (size,) = unpack(_SZFMT, recvall(conn, _SZLEN))
    return size

def sendsize(conn, string):
    """Send len(string) over conn, packed as a size header."""
    header = pack(_SZFMT, len(string))
    conn.sendall(header)
def recvstring(conn, maxsize=None):
    """Receive one size-prefixed string from conn.

    Raises Exception when maxsize is given (and non-zero) and the
    announced size exceeds it; the payload is not read in that case.
    """
    size = recvsize(conn)
    if maxsize and size > maxsize:
        raise Exception("Command string size too large: %d" % size)
    return recvall(conn, size)
def sendstring(conn, string):
    """Send string over conn as one packet: its size first, then the data."""
    sendsize(conn, string)   # size header
    conn.sendall(string)     # payload
def recvtempfile(conn):
    """Receive one size-prefixed file from conn into a temporary .wav file.

    The file is created under /dev/shm/, grown to the announced size and
    filled through a memory map. The open NamedTemporaryFile is returned;
    it is removed from disk when the caller closes it.

    Raises Exception when the announced size is not positive. If the
    transfer fails, the temporary file is closed before the error is
    re-raised.
    """
    sz = recvsize(conn)
    debug("recv'd size", sz)
    if sz <= 0: raise Exception("Bad decode datasize: " + str(sz))
    # NOTE(review): sz comes straight from the client and has no upper bound
    tmpf = TempFile(suffix=".wav", dir="/dev/shm/")
    try:
        fd = tmpf.file.fileno()
        ftruncate(fd, sz)
        mmf = mmap(fd, 0)
        # release the mapping as soon as the transfer is over, whether or
        # not it succeeded
        try: recvall(conn, sz, mmf)
        finally: mmf.close()
    except BaseException:
        # do not leave a half-written file behind in /dev/shm
        tmpf.close()
        raise
    debug("recv'd file")
    return tmpf
def handleconn(conn, server):
    """Serve one client connection until an error ends it.

    Repeatedly reads a command string (at most 80 bytes). For "decode"
    the client's wav file is received into a temporary file, decoded
    through server, and the flattened utterance text is sent back; a file
    that Stream() rejects is answered with "BAD WAV FILE". Any other
    command is answered with "UNKNOWN COMMAND".

    Any exception ends the loop. The connection is always closed, the
    temporary file is always closed (which also deletes it), and a stream
    that was submitted for decoding is always deleted from the server.
    """
    try:
        while True:
            cmd = recvstring(conn, 80)
            if cmd != "decode":
                debug("received unknown command:", cmd)
                sendstring(conn, "UNKNOWN COMMAND")
                continue
            tmpf = recvtempfile(conn)
            try:
                try: s = Stream(tmpf.name)
                except Exception as e:
                    debug("Stream exception:", e)
                    sendstring(conn, "BAD WAV FILE")
                    continue
                server.decodestreams(s)
                try:
                    sd = SpeechData(s)
                    sd.join()
                    utt = sd.flat_utterance().text()
                    debug("sending utt:", utt)
                    sendstring(conn, utt)
                finally:
                    # same order as on the success path: drop the file,
                    # then drop the stream from the server
                    tmpf.close() # also deletes temp file
                    server.deletestreams(s)
            finally:
                # covers the bad-wav and decodestreams-failure paths;
                # a second close() on the temp file is harmless
                tmpf.close()
    except Exception as e:
        debug("closing connection:", e)
    finally:
        conn.close()
if __name__ == '__main__':
    host, port = processcmdln()

    # connect to the speech recognition server first, then open the local
    # listening socket; either failure ends the program with a message
    try:
        server = Server(host)
    except Exception as e:
        exit("Error: cannot connect to server: " + str(e))
    try:
        ss = serversocket(port)
    except Exception as e:
        exit("Error: could not start local server: " + str(e))
    debug("Local server on port:", sockport(ss))

    # hand every accepted connection to handleconn via newthread()
    # (presumably one thread per client -- confirm in client.threadutil)
    try:
        while True:
            conn, addr = ss.accept()
            newthread(handleconn, (conn,server))
    except:
        # bare except: also reached on KeyboardInterrupt, i.e. Ctrl-C
        debug("local server exiting")