#!/usr/bin/perl -w # # voiceServer.pl # Voice server for Zaza's new face and voice. # # Copyright (C) 2002-2003 Brian Rudy (brudyNO@SPAMpraecogito.com) # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # # Revision history # # 0.11 1-30-2004 # Added support for cmu_us_slt_arctic_hts voice. Added voice variable # to make changing the voice a little easier. # # 0.10 11-14-2003 # Bug fix in regex matching for Lisp responses caused name/end mismatch # for phonemes with additional info included (such as allophone_name) # # 0.09 11-22-2002 (yes two versions in one day ;) # Added support for Sable audio source 'padding' of face cues with open # mouth (work-around) # # 0.08 11-22-2002 # Fixed another bug in renderspeech() dealing with returning to # normal mode after Sable mode. No more Multiple Personality Disorder ;) # # 0.07 11-15-2002 # Fixed bug in cue cache handling routine preventing normal cached # cues from being handled properly after a Sable cue. # # 0.06 11-8-2002 # Reduced loop delay from 250ms to 50ms with new version of # IPC::Shareable # # 0.05 9-10-2002 # Initial Sable text mode support. # # 0.04 8-14-2002 # Quick update to support OGI 'tll' voice. Just un-comment the old # @lispsetup array to use the old MBROLA voice. Regex improvements to # support non 'word' characters in the phonemes. # # 0.03 5-21-2002 # Updated face client outbox shm to use 'stack' of n last cues (specified by # $cuestacksize). This should allow slower async clients to keep current, and # prevent the loss of rapidly-sent cues. Added emotion support. # # 0.02 5-20-2002 # Added audio post-processing option to clean up output on Zaza's hardware. # Fixed shm init to properly auto-generate 'voiceServer initialized' message # if not already cached. # # 0.01 2-13-2002 # First functional version. Basic speech request support (no SABLE, JSML, # VXML or other markup supported). No 'emotion' support. # # # Bugs: # Sending a request for the same speech cue works, but behaves a little # differently than other requests. # # # Notes: # # Persistent server application with CGI front-end for speech requests. # IPC::Shareable for client comm. # # Speech rendering requests will be sent to a persistant festival_server # instance specified by $festival_server and $festival_port. # # Speech 'cache' used for speed. Pre-rendered scripts and wavefiles stored # on disk, and re-indexed each time voiceServer is run. # # Since this approach only speaks one sentance at a time, might want to # split multi-sentance speech request text into one or more speech # requests. (might be tricky w/ current implimentation) # # Hooks for CGI face client programs # Seperate CGI clients for Java face interface and speech cue input handling. # # # Requires: # String::CRC32 # IPC::Shareable # Festival::Client::Async # Audio::SoundFile use String::CRC32; use IPC::Shareable; use Festival::Client::Async qw(parse_lisp); use Audio::SoundFile; ## Config info ## my $festival_server = "localhost"; my $festival_port = 1314; # Where on local filesystem phones and waveforms are stored. my $speech_data_dir = "/home/brudy/public_html/zaza/speechdata/"; # Post-process the audio after rendering. If no post-processing # is desired, just leave this blank. #my $ppopts = "| sox -twav - -twav - pitch 550 35 c c "; #my $ppopts = "| sox -twav - -twav - pitch 550 35 c c highpass 1000 vol 5.0 "; #my $ppopts = "| sox -twav - -twav /var/tmp/tmpout.wav pitch 750 55 c c; sox /var/tmp/tmpout.wav -twav - highp 4000 vol 6.0 "; #$ppopts = "| sox -twav - -twav /var/tmp/tmpin.wav vol 2.0 ; #sox -v -1.0 /var/tmp/tmpin.wav -twav - pick 2 ; #my $ppopts = "| sox -twav - -c2 -twav - vol 2.0 "; #my $ppopts = "| sox -twav - -traw -w -s -r16000 - | sox -traw -w -r16000 -s - -twav - mask vol 2.0 "; my $ppopts = "| sox -twav - -twav - mask vol 2.0 "; #my $ppopts = ""; # Number of cues to keep in shm outbox 'stack' my $cuestacksize = 10; # voice to use #my $voice = "tll_diphone"; my $voice = "cmu_us_slt_arctic_hts"; # Clean up on untrapped signal error $SIG{INT} = \&catch_int; sub catch_int { print "Got SIGHUP, shutting down.\n"; exit; } ## main # setup shared memory my $cueglue = 'cuedata'; my $faceglue = 'facedata'; my %options = ( 'create' => 1, 'exclusive' => 0, 'mode' => 0666, 'destroy' => 1, ); my %faceoptions = ( 'create' => 1, 'exclusive' => 0, 'mode' => 0666, 'destroy' => 1, ); ## If this code dies or is killed unexpectedly the semaphore arrays ## will be unwritable. They must be identified with ipcs, and deleted with ## ipcrm manually! catch_int should resolve this problem now. # This is our speech cue input shm tie(%cueinfo, IPC::Shareable, $cueglue, { %options }) or die "voiceServer: tie failed\n"; # This is our face client output shm tie(@faceinfo, IPC::Shareable, $faceglue, { %faceoptions }) or die "voiceServer: tie failed\n"; #%faceinfo = ( 'timestamp' => time(), # 'stringfinger' => "1234567890", # ); # index pre-rendered speech directory, store in local array. print "Indexing speech cache.\n"; opendir(DIR, $speech_data_dir) || die "Can't open speech data directory!"; @filenames = readdir(DIR); closedir(DIR); my @stringlist; my $fileindex = 0; my $renderedspeech = "nothing"; my $rendering = 0; my $donerendering; my $currenttime; my $match; my $textmode = "normal"; foreach $file (@filenames) { if (($file ne "..") && ($file ne ".")) { # we are only counting the wavs if ($file =~ m/(\d+).wav/) { print "Loading " . $1 . "\n"; $stringlist[$fileindex] = $1; $fileindex++; } } } # connect to festival server $fest = Festival::Client::Async->new($festival_server, $festival_port) or die "couldn't connect to Festival server: $!"; # send setup info to festival (voice, Wavefiletype, phoneme ouput, etc) # To use Sable markup, just: # (set! after_analysis_hooks (list phone_output_info)) # to dump the talking head info while using the sable-mode #my @lispsetup = ( # "(Parameter.set \'Wavefiletype \'wav)", # "(voice_us1_mbrola)", # "(tts_return_to_client)", # "(load \"/home/brudy/voiceServer/zaza-mode.scm\")" # ); #my @lispsetup = ( # "(Parameter.set \'Wavefiletype \'wav)", # "(voice_" . $voice . ")", # "(tts_return_to_client)", # "(load \"/home/brudy/voiceServer/zaza-mode.scm\")", # "(set! after_analysis_hooks phone_output_info)" # ); my @lispsetup = ( "(Parameter.set \'Wavefiletype \'wav)", "(voice_" . $voice . ")", "(tts_return_to_client)", "(load \"/home/brudy/voiceServer/zaza-mode.scm\")", "(set! hts_uv_threshold 0.8)", "(set! hts_use_phone_align 1)", "(set! after_analysis_hooks phone_output_info)" ); for $index (0 .. $#lispsetup) { print "Sending:" . $lispsetup[$index] . "\n"; $fest->server_eval_sync($lispsetup[$index], { LP => sub { my $lisp = shift; print "lisp response=" . $lisp; }, WV => sub { my $wave = shift; print "I got wave data.\n"; } }) or die "error from Festival server"; } # Set some sane initial values in both shm mailboxes $faceinfo[0]{timestamp} = 0; $faceinfo[0]{stringfinger} = sprintf "%s", "0"; $faceinfo[0]{emotion} = "happy"; # We might end up in a race condition with a cue client if the # client trys to write the shm before we do. $cueinfo{timestamp} = time(); $cueinfo{cuestring} = "voice server initialized"; $currenttime = $faceinfo[0]{timestamp}; $cueinfo{emotion} = "happy"; #main loop while (1) { if ($cueinfo{timestamp} != $currenttime) { # Do we need to do something, or just wait for renderspeech to finish? if (!$rendering && ((crc32($cueinfo{cuestring}) != crc32($renderedspeech)) || ($cueinfo{cuestring} eq '0'))) { if ($cueinfo{cuestring} eq '0') { # Client only requested emotion change print "Met first emotion only change condition.\n"; $match = 1; } else { print "Got a speech request for: " . $cueinfo{cuestring} . ", fingerprint:" . (crc32($cueinfo{cuestring})) . "\n"; $match = 0; for $stringindex (0 .. $#stringlist) { if($stringlist[$stringindex] == (crc32($cueinfo{cuestring}))) { $match = 1; } } } if ($match) { if ($cueinfo{cuestring} eq '0') { print "Emotion only change.\n"; } else { print "Returning cached request.\n"; } $renderedspeech = $cueinfo{cuestring}; $donerendering = 1; } else { $donerendering = 0; renderspeech($cueinfo{cuestring}); } } if ($donerendering) { $currenttime = $cueinfo{timestamp}; if ($cueinfo{cuestring} eq '0') { push @faceinfo, {timestamp => $currenttime, stringfinger => '0', emotion => $cueinfo{emotion}}; } else { if ($renderedspeech =~ m/SABLE/) { print "Pushing Sable-mode text stringfinger to stack.\n"; # We need to unescape the string to get the correct stringfinger $renderedspeech =~ s/\\\"/\"/go; push @faceinfo, {timestamp => $currenttime, stringfinger => (sprintf "%s", crc32($renderedspeech)), emotion => $cueinfo{emotion}}; } else { push @faceinfo, {timestamp => $currenttime, stringfinger => (sprintf "%s", crc32($renderedspeech)), emotion => $cueinfo{emotion}}; } } if (($#faceinfo > $cuestacksize) || ($faceinfo[0]{timestamp} eq '0')) { shift @faceinfo; } print "Done handling request for: $renderedspeech\n"; print "There are now " . $#faceinfo . " elements on the stack.\n"; print "Timestamp\tStringfinger\tEmotion\n"; for $stackelement (0 .. $#faceinfo) { print $faceinfo[$stackelement]{timestamp} . "\t" . $faceinfo[$stackelement]{stringfinger} . "\t" . $faceinfo[$stackelement]{emotion} . "\n"; } } } # We need to sleep for a little while, or IPC::Sharable misbehaves # sleep for 250ms select(undef, undef, undef, 0.05); } sub renderspeech { ($speechstring) = @_; #use URI::Escape; my $mrstring; $rendering = 1; my $stringfinger = crc32($speechstring); my $outfile = $speech_data_dir . $stringfinger . "\.wav"; print "Creating $outfile.\n"; open(AUDIO, "$ppopts>$outfile") || die "Cannot create $outfile!\n"; # Render and save waveform to local filesystem if ($speechstring =~ m/SABLE/) { print "Sable mode enabled.\n"; #$speechstring = uri_escape($speechstring); $speechstring =~ s/\"/\\"/go; $mrstring = "(tts_textall \"$speechstring\" \'sable)"; $textmode = "sable"; } else { $mrstring = "(tts_textall \"$speechstring\" nil)"; $textmode = "normal"; } my @lisprequest = ( "(voice_" . $voice . ")", $mrstring ); for $index (0 .. $#lisprequest) { print "Sending $lisprequest[$index]\n"; $fest->server_eval_sync($lisprequest[$index], { LP => sub { my $lisp = shift; #my $lisp = @_; print "lisp response=" . $lisp; }, WV => sub { my $wave = shift; #print "I got wave data.\n"; print AUDIO $wave; } }) or die "error from Festival server"; } close(AUDIO); print "Waveform written.\n"; if ($speechstring =~ m/AUDIO.SRC/) { # Special case, pad phone script with open mouth for the length of # the wav file. # Get the length of the wav file my $reader = new Audio::SoundFile::Reader($outfile, \$header); $reader->close; my $wavlength = $header->get("samples")/$header->get("samplerate"); my $phoneout = $speech_data_dir . $stringfinger . "\.script"; open(PHONES, ">$phoneout") || die "Cannot create $phoneout!\n"; print "Creating padded $phoneout\n"; print PHONES "0.2000 pau\n"; printf PHONES "%.4f A\n", $wavlength-0.100; printf PHONES "%.4f pau\n", $wavlength; close(PHONES); print "Padded phoneme script written.\n"; } else { # Extract phones from previous request my $concatresponse; $fest->server_eval_sync("(print_string myphones)", { LP => sub { my $lisp = shift; # We need to remove the escape characters, # leading and trailing quotes and # concatinate before running parse_lisp() # on the response. $lisp =~ s/\\(.)/$1/g; $lisp =~ s/^\"//; $lisp =~ s/\"$//; $concatresponse = $concatresponse . $lisp; } }) or die "error from Festival server"; #print "Done: $concatresponse"; #my @names = $concatresponse =~ m/name \"(\S+)\"/g; my @names = $concatresponse =~ m/\(name \"(\S+)\"\)/g; print "There are " . scalar @names . " names in the response.\n"; #my @ends = $concatresponse =~ m/end ([0-9]+[.]?[0-9]*|[.][0-9]+)/g; my @ends = $concatresponse =~ m/end ([0-9]+[.]?[0-9]*|[.][0-9]+)/g; print "There are " . scalar @ends . " ends in the response.\n"; my $phoneout = $speech_data_dir . $stringfinger . "\.script"; open(PHONES, ">$phoneout") || die "Cannot create $phoneout!\n"; print "Creating $phoneout\n"; for $phonename (0 .. $#names) { printf PHONES "%.4f %s\n", $ends[$phonename], $names[$phonename]; } close(PHONES); print "Phoneme script written.\n"; } # push newly rendered onto end of @stringlist push @stringlist, $stringfinger; print "There are now " . $#stringlist . " cues in the cache.\n"; for $theindex (0 .. $#stringlist) { print $stringlist[$theindex] . "\n"; } $renderedspeech = $speechstring; $rendering = 0; $donerendering = 1; }