#!/usr/bin/perl -w # # voiceServer.pl # Voice server for Zaza's new face and voice. # # Copyright (C) 2002 Brian Rudy (brudyNO@SPAMpraecogito.com) # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # # Revision history # # 0.03 5-21-2002 # Updated face client outbox shm to use 'stack' of n last cues (specified by # $cuestacksize). This should allow slower async clients to keep current, and # prevent the loss of rapidly-sent cues. Added emotion support. # # 0.02 5-20-2002 # Added audio post-processing option to clean up output on Zaza's hardware. # Fixed shm init to properly auto-generate 'voiceServer initialized' message # if not already cached. # # 0.01 2-13-2002 # First functional version. Basic speech request support (no SABLE, JSML, # VXML or other markup supported). No 'emotion' support. # # # Bugs: # Sending a request for the same speech cue works, but behaves a little # differently than other requests. # # # Notes: # # Persistent server application with CGI front-end for speech requests. # IPC::Shareable for client comm. # # Speech rendering requests will be sent to a persistant festival_server # instance specified by $festival_server and $festival_port. # # Speech 'cache' used for speed. Pre-rendered scripts and wavefiles stored # on disk, and re-indexed each time voiceServer is run. # # Since this approach only speaks one sentance at a time, might want to # split multi-sentance speech request text into one or more speech # requests. (might be tricky w/ current implimentation) # # Hooks for CGI face client programs # Seperate CGI clients for Java face interface and speech cue input handling. # # # Requires: # String::CRC32 # IPC::Shareable # Festival::Client::Async use String::CRC32; use IPC::Shareable; use Festival::Client::Async qw(parse_lisp); ## Config info ## my $festival_server = "localhost"; my $festival_port = 1314; # Where on local filesystem phones and waveforms are stored. my $speech_data_dir = "/home/brudy/public_html/zaza/speechdata/"; # Post-process the audio after rendering. If no post-processing # is desired, just leave this blank. my $ppopts = "| sox -twav - -twav - pitch 550 35 c c "; # Number of cues to keep in shm outbox 'stack' my $cuestacksize = 10; # Clean up on untrapped signal error $SIG{INT} = \&catch_int; sub catch_int { print "Got SIGHUP, shutting down.\n"; exit; } ## main # setup shared memory my $cueglue = 'cuedata'; my $faceglue = 'facedata'; my %options = ( 'create' => 1, 'exclusive' => 0, 'mode' => 0666, 'destroy' => 1, ); my %faceoptions = ( 'create' => 1, 'exclusive' => 0, 'mode' => 0666, 'destroy' => 1, ); ## If this code dies or is killed unexpectedly the semaphore arrays ## will be unwritable. They must be identified with ipcs, and deleted with ## ipcrm manually! catch_int should resolve this problem now. # This is our speech cue input shm tie(%cueinfo, IPC::Shareable, $cueglue, { %options }) or die "voiceServer: tie failed\n"; # This is our face client output shm tie(@faceinfo, IPC::Shareable, $faceglue, { %faceoptions }) or die "voiceServer: tie failed\n"; #%faceinfo = ( 'timestamp' => time(), # 'stringfinger' => "1234567890", # ); # index pre-rendered speech directory, store in local array. print "Indexing speech cache.\n"; opendir(DIR, $speech_data_dir) || die "Can't open speech data directory!"; @filenames = readdir(DIR); closedir(DIR); my @stringlist; my $fileindex = 0; my $renderedspeech = "nothing"; my $rendering = 0; my $donerendering; my $currenttime; my $match; foreach $file (@filenames) { if (($file ne "..") && ($file ne ".")) { # we are only counting the wavs if ($file =~ m/(\d+).wav/) { print "Loading " . $1 . "\n"; $stringlist[$fileindex] = $1; $fileindex++; } } } # connect to festival server $fest = Festival::Client::Async->new($festival_server, $festival_port) or die "couldn't connect to Festival server: $!"; # send setup info to festival (voice, Wavefiletype, phoneme ouput, etc) my @lispsetup = ( "(Parameter.set \'Wavefiletype \'wav)", "(voice_us1_mbrola)", "(tts_return_to_client)", "(load \"/home/brudy/voiceServer/zaza-mode.scm\")" ); for $index (0 .. $#lispsetup) { print "Sending:" . $lispsetup[$index] . "\n"; $fest->server_eval_sync($lispsetup[$index], { LP => sub { my $lisp = shift; print "lisp response=" . $lisp; }, WV => sub { my $wave = shift; print "I got wave data.\n"; } }) or die "error from Festival server"; } # Set some sane initial values in both shm mailboxes $faceinfo[0]{timestamp} = 0; $faceinfo[0]{stringfinger} = sprintf "%s", "0"; $faceinfo[0]{emotion} = "happy"; # We might end up in a race condition with a cue client if the # client trys to write the shm before we do. $cueinfo{timestamp} = time(); $cueinfo{cuestring} = "voiceServer initialized"; $currenttime = $faceinfo[0]{timestamp}; $cueinfo{emotion} = "happy"; #main loop while (1) { if ($cueinfo{timestamp} != $currenttime) { # Do we need to do something, or just wait for renderspeech to finish? if (!$rendering && ((crc32($cueinfo{cuestring}) != crc32($renderedspeech)) || ($cueinfo{cuestring} eq '0'))) { if ($cueinfo{cuestring} eq '0') { # Client only requested emotion change print "Met first emotion only change condition.\n"; $match = 1; } else { print "Got a speech request for: " . $cueinfo{cuestring} . ", fingerprint:" . (crc32($cueinfo{cuestring})) . "\n"; $match = 0; for $stringindex (0 .. $#stringlist) { if($stringlist[$stringindex] == (crc32($cueinfo{cuestring}))) { $match = 1; } } } if ($match) { if ($cueinfo{cuestring} eq '0') { print "Emotion only change.\n"; } else { print "Returning cached request.\n"; } $renderedspeech = $cueinfo{cuestring}; $donerendering = 1; } else { $donerendering = 0; renderspeech($cueinfo{cuestring}); } } if ($donerendering) { $currenttime = $cueinfo{timestamp}; if ($cueinfo{cuestring} eq '0') { push @faceinfo, {timestamp => $currenttime, stringfinger => '0', emotion => $cueinfo{emotion}}; } else { push @faceinfo, {timestamp => $currenttime, stringfinger => (sprintf "%s", crc32($renderedspeech)), emotion => $cueinfo{emotion}}; } if (($#faceinfo > $cuestacksize) || ($faceinfo[0]{timestamp} eq '0')) { shift @faceinfo; } print "Done handling request for: $renderedspeech\n"; print "There are now " . $#faceinfo . " elements on the stack.\n"; print "Timestamp\tStringfinger\tEmotion\n"; for $stackelement (0 .. $#faceinfo) { print $faceinfo[$stackelement]{timestamp} . "\t" . $faceinfo[$stackelement]{stringfinger} . "\t" . $faceinfo[$stackelement]{emotion} . "\n"; } } } # We need to sleep for a little while, or IPC::Sharable misbehaves # sleep for 250ms select(undef, undef, undef, 0.25); } sub renderspeech { ($speechstring) = @_; $rendering = 1; my $stringfinger = crc32($speechstring); my $outfile = $speech_data_dir . $stringfinger . "\.wav"; print "Creating $outfile.\n"; open(AUDIO, "$ppopts>$outfile") || die "Cannot create $outfile!\n"; # Render and save waveform to local filesystem my $mrstring = "(tts_text \"$speechstring\" \'zaza)"; print "Sending $mrstring\n"; $fest->server_eval_sync($mrstring, { LP => sub { my $lisp = shift; #my $lisp = @_; print "lisp response=" . $lisp; }, WV => sub { my $wave = shift; #print "I got wave data.\n"; print AUDIO $wave; } }) or die "error from Festival server"; close(AUDIO); print "Waveform written.\n"; # Extract phones from previous request my $concatresponse; $fest->server_eval_sync("(print_string myphones)", { LP => sub { my $lisp = shift; # We need to remove the escape characters, # leading and trailing quotes and # concatinate before running parse_lisp() # on the response. $lisp =~ s/\\(.)/$1/g; $lisp =~ s/^\"//; $lisp =~ s/\"$//; $concatresponse = $concatresponse . $lisp; } }) or die "error from Festival server"; #print "Done: $concatresponse"; my @names = $concatresponse =~ m/name \"(\w+)\"/g; #print "There are " . scalar @names . " names in the response.\n"; my @ends = $concatresponse =~ m/end ([0-9]+[.]?[0-9]*|[.][0-9]+)/g; #print "There are " . scalar @ends . " ends in the response.\n"; my $phoneout = $speech_data_dir . $stringfinger . "\.script"; open(PHONES, ">$phoneout") || die "Cannot create $phoneout!\n"; print "Creating $phoneout\n"; for $phonename (0 .. $#names) { printf PHONES "%.4f %s\n", $ends[$phonename], $names[$phonename]; } close(PHONES); print "Phoneme script written.\n"; # push newly rendered onto end of @stringlist push @stringlist, $stringfinger; print "There are now " . $#stringlist . " cues in the cache.\n"; for $theindex (0 .. $#stringlist) { print $stringlist[$theindex] . "\n"; } $renderedspeech = $speechstring; $rendering = 0; $donerendering = 1; }