#!/usr/bin/perl -w # POEvoiceServer.pl # Voice server for Zaza's face and voice. # # Copyright (C) 2002-2004 Brian Rudy (brudyNO@SPAMpraecogito.com) # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # # Revision history # # 1.00 2-20-2004 # Complete re-write using POE and POE::Component::Festival. This version # buffers input cue requests and processes them in a FIFO manner. Unlike # previous versions, it does not drop cue requests if they are submitted # more rapidly than can be handled in real-time. There is no longer a # dependence on zaza-mode.scm, as utterances are handled at a lower level # than the previous versions using the simplified Festival interface. This # version adds support for command line switches for selecting the voice, # sampling rate, festival server and port. We have also switched to using # Audio::Wav in place of the seemly abandoned Audio::SoundFile which does # not compile on modern Linux/Perl versions. # # 0.11 1-30-2004 # Added support for cmu_us_slt_arctic_hts voice. Added voice variable # to make changing the voice a little easier. # # 0.10 11-14-2003 # Bug fix in regex matching for Lisp responses caused name/end mismatch # for phonemes with additional info included (such as allophone_name) # # 0.09 11-22-2002 (yes two versions in one day ;) # Added support for Sable audio source 'padding' of face cues with open # mouth (work-around) # # 0.08 11-22-2002 # Fixed another bug in renderspeech() dealing with returning to # normal mode after Sable mode. No more Multiple Personality Disorder ;) # # 0.07 11-15-2002 # Fixed bug in cue cache handling routine preventing normal cached # cues from being handled properly after a Sable cue. # # 0.06 11-8-2002 # Reduced loop delay from 250ms to 50ms with new version of # IPC::Shareable # # 0.05 9-10-2002 # Initial Sable text mode support. # # 0.04 8-14-2002 # Quick update to support OGI 'tll' voice. Just un-comment the old # @lispsetup array to use the old MBROLA voice. Regex improvements to # support non 'word' characters in the phonemes. # # 0.03 5-21-2002 # Updated face client outbox shm to use 'stack' of n last cues (specified by # $cuestacksize). This should allow slower async clients to keep current, and # prevent the loss of rapidly-sent cues. Added emotion support. # # 0.02 5-20-2002 # Added audio post-processing option to clean up output on Zaza's hardware. # Fixed shm init to properly auto-generate 'voiceServer initialized' message # if not already cached. # # 0.01 2-13-2002 # First functional version. Basic speech request support (no SABLE, JSML, # VXML or other markup supported). No 'emotion' support. # # # Requires: # String::CRC32 # IPC::Shareable # Audio::Wav # POE # POE::Component::Festival # Using strict causes warnings w/ the stock version on PoCo Festival #use strict; use constant DEBUG => 0; # Defines the level of debugging verbosity use constant AUDIODEBUG => 1; # Set this to get audio feedback of cue requests use POE; use POE::Component::Festival; use Getopt::Std; use String::CRC32; use IPC::Shareable; use Audio::Wav; my %opts; my $res = getopts("s:v:d:f:p:?h", \%opts); if (not defined($res) or $opts{'?'} or $opts{'h'}) { print STDERR < 1, 'exclusive' => 0, 'mode' => 0666, 'destroy' => 1, ); my %faceoptions = ( 'create' => 1, 'exclusive' => 0, 'mode' => 0666, 'destroy' => 1, ); # This is our speech cue input shm tie(%cueinfo, IPC::Shareable, $cueglue, { %options }) or die "voiceServer: tie failed\n"; # This is our face client output shm tie(@faceinfo, IPC::Shareable, $faceglue, { %faceoptions }) or die "voiceServer: tie failed\n"; # Set some sane initial values in both shm mailboxes $faceinfo[0]{timestamp} = 0; $faceinfo[0]{stringfinger} = sprintf "%s", "0"; $faceinfo[0]{emotion} = "happy"; $cueinfo{timestamp} = time(); $cueinfo{cuestring} = "voice server initialized"; $currenttime = $faceinfo[0]{timestamp}; $cueinfo{emotion} = "happy"; local *AUDIO; POE::Session->create ( args => [ @opts{'v', 'd'} ], # options => { trace => 1 }, heap => { sps => $sps, soxopts => $soxopts, uttq => [], cuebuffer => [], watermark => 0, stringlist => [], currenttime => 0, cuestacksize => $cuestacksize, }, inline_states => { _start => sub { my ($voice, $dir) = @_[ARG0, ARG1]; my @hook= (q/(require 'tts)/); if (defined($voice)) { if (defined($dir)) { push @hook, qq{(cd "$dir")}; push @hook, qq{(load "festvox/$voice.scm")} if -f "$dir/festvox/$voice.scm"; } } else { push @hook, qq{(voice_$voice)}; } POE::Component::Festival->start('synthesizer', { startup_hook => \@hook, host => $host, port => $port}); $_[KERNEL]->post(synthesizer => 'attach'); # index pre-rendered speech directory, store in local array. print "Indexing speech cache.\n"; opendir(DIR, $speech_data_dir) || die "Can't open speech data directory!"; my @filenames = readdir(DIR); closedir(DIR); my $fileindex = 0; foreach $file (@filenames) { if (($file ne "..") && ($file ne ".")) { # we are only counting the wavs if ($file =~ m/(\d+).wav/) { print "Loading " . $1 . "\n"; push @{$_[HEAP]{stringlist}}, $1; } } } $_[KERNEL]->yield('check_inbox'); }, quit => sub { print "Quitting\n"; $_[KERNEL]->post(synthesizer => 'quit'); }, check_inbox => sub { my $cuetime = $cueinfo{timestamp}; my $cuestring = $cueinfo{cuestring}; my $cueemotion = $cueinfo{emotion}; # check the shared memory inbox for a new cue if ($cuetime != $_[HEAP]{currenttime}) { # Set currenttime to the timestamp we are working on $_[HEAP]{currenttime} = $cuetime; print "Received request for cue: $cuestring\n"; $_[KERNEL]->yield('process_this' => $cuetime, $cuestring, $cueemotion); } # We need to sleep for a little while, or IPC::Sharable misbehaves # sleep for 10ms select(undef, undef, undef, 0.01); $_[KERNEL]->yield('check_inbox'); }, process_this => sub { my ($cuetime, $cuestring, $cueemotion) = @_[ARG0, ARG1, ARG2]; my $skipcheck = 0; if ((defined $cuetime) && (defined $cuestring) && (defined $cueemotion)) { push @{$_[HEAP]{cuebuffer}}, {'cuetime' => $cuetime, 'cuestring' => $cuestring, 'cueemotion' => $cueemotion }; } else { $cuetime = $_[HEAP]{cuebuffer}[0]->{cuetime}; $cuestring = $_[HEAP]{cuebuffer}[0]->{cuestring}; $cueemotion = $_[HEAP]{cuebuffer}[0]->{cueemotion}; $skipcheck = 1; } if (($#{$_[HEAP]{cuebuffer}} <= 0) || ($skipcheck)) { # if the cue has already been rendered, just update # the ouput stack. if ($cuestring eq '0') { # Client only requested emotion change print "Met first emotion only change condition.\n"; $match = 1; } else { print "Processing request for: " . $cuestring . ", fingerprint:" . (crc32($cuestring)) . "\n"; $match = 0; for $stringindex (0 .. $#{$_[HEAP]{stringlist}}) { print "$_[HEAP]{stringlist}[$stringindex]\n"; if($_[HEAP]{stringlist}[$stringindex] == (crc32($cuestring))) { $match = 1; last; } } } # If the cue isn't in the cache, render it if ($match) { if ($cuestring eq '0') { print "Emotion only change.\n"; } else { print "Returning cached request.\n"; } # just update the output stack $_[KERNEL]->yield('update_outbox' => $cuetime, $cuestring, $cueemotion); } else { # render the request print "Rendering \'$cuestring\'\n"; $_[KERNEL]->yield('render_text' => $cuetime, $cuestring, $cueemotion); } } else { print "Not ready for cue, buffering...\n"; } }, update_outbox => sub { my ($cuetime, $cuestring, $cueemotion) = @_[ARG0, ARG1, ARG2]; #print "Updating outbox for cuetime=$cuetime, cuestring=$cuestring, cueemotion=$cueemotion\n"; if ($cuestring eq '0') { push @faceinfo, {timestamp => $cuetime, stringfinger => '0', emotion => $cueemotion}; } else { if ($cuestring =~ m/SABLE/) { print "Pushing Sable-mode text stringfinger to stack.\n"; # We need to unescape the string to get the correct stringfinger $cuestring =~ s/\\\"/\"/go; push @faceinfo, {timestamp => $cuetime, stringfinger => (sprintf "%s", crc32($cuestring)), emotion => $cueemotion}; } else { push @faceinfo, {timestamp => $cuetime, stringfinger => (sprintf "%s", crc32($cuestring)), emotion => $cueemotion}; } } if (($#faceinfo > $_[HEAP]{cuestacksize}) || ($faceinfo[0]{timestamp} eq '0')) { shift @faceinfo; } print "Done handling request for: $cuestring\n"; print "There are now " . (scalar @faceinfo) . " elements on the outbox stack.\n"; print "Timestamp\tStringfinger\tEmotion\n"; for $stackelement (0 .. $#faceinfo) { print $faceinfo[$stackelement]{timestamp} . "\t" . $faceinfo[$stackelement]{stringfinger} . "\t" . $faceinfo[$stackelement]{emotion} . "\n"; } #print "There are " . @{$_[HEAP]{stringlist}} . " elements in the stringlist\n"; my $match = 0; foreach (@{$_[HEAP]{stringlist}}) { my ($stringindex) = $_; #print "Checking match for " . $stringindex . "\n"; if($stringindex == crc32($cuestring)) { $match = 1; last; } } if (!$match) { # push requested new stringfinger onto end of stringlist push @{$_[HEAP]{stringlist}}, (sprintf "%s", crc32($cuestring)); print "There are now " . (scalar @{$_[HEAP]{stringlist}}) . " cues in the cache.\n"; for $theindex (0 .. $#{$_[HEAP]{stringlist}}) { print $_[HEAP]{stringlist}[$theindex] . "\n"; } } # Handy for debugging w/o needing a faceclient system("esdplay " . $speech_data_dir . crc32($_[HEAP]{cuebuffer}[0]->{cuestring}) . "\.wav &") if AUDIODEBUG > 0; shift @{$_[HEAP]{cuebuffer}}; if (defined $_[HEAP]{cuebuffer}[0]) { print "There is something in the buffer, processing it...\n"; $_[KERNEL]->yield('process_this'); } }, render_text => sub { my ($cuetime, $cuestring, $cueemotion) = @_[ARG0, ARG1, ARG2]; #print "Rending request for cuetime=$cuetime, cuestring=$cuestring, cueemotion=$cueemotion\n"; push @{$_[HEAP]{uttq}}, $cuestring; #if (defined $_[HEAP]{uttq}[0]) { #} if ($_[HEAP]{watermark} > 1) { print "Pipeline full $_[HEAP]{watermark}, throttling input\n" if DEBUG > 0; return; } if ($cuestring =~ m/SABLE/) { print "Sable mode enabled.\n"; $cuestring =~ s/\"/\\"/go; # we may need to do something different to render this if (defined(my $utt = shift @{$_[HEAP]{uttq}})) { $_[KERNEL]->post(synthesizer => evaluate => <<"EOS" => 'wave'); (begin (Parameter.set 'Wavefiletype 'wav) (tts_return_to_client) (tts_textall "$cuestring" 'sable)) EOS } $_[HEAP]{watermark}++; print "Pipeline in: $_[HEAP]{watermark}\n" if DEBUG > 0; } else { if (defined(my $utt = shift @{$_[HEAP]{uttq}})) { $_[KERNEL]->post(synthesizer => evaluate => <<"EOL" => 'synth'); (begin (voice_$voice) (set! hts_uv_threshold 0.8) (set! hts_use_phone_align 1) (set! u (Utterance Text "$utt")) (Parameter.set 'Wavefiletype 'wav) (utt.synth u) (utt.wave.resample u $_[HEAP]{sps})) EOL $_[KERNEL]->post(synthesizer => evaluate => q((print_string (utt.relation_tree u 'Segment))) => 'phonemes'); $_[KERNEL]->post(synthesizer => evaluate => q/(utt.send.wave.client u)/ => 'wave'); } $_[HEAP]{watermark}++; print "Pipeline in: $_[HEAP]{watermark}\n" if DEBUG > 0; } }, lisp_data => sub { chomp(my $lisp = $_[ARG0]); print "lisp data $_[ARG1]:\n$lisp\n" if DEBUG > 0; unless ($_[ARG1] eq 'phonemes') { return; } my $stringfinger = crc32($_[HEAP]{cuebuffer}[0]->{cuestring}); my @names = $lisp =~ m/\(name \\\"(\S+)\\\"\)/g; print "There are " . scalar @names . " names in the response for $stringfinger.\n"; my @ends = $lisp =~ m/\(end ([0-9]+[.]?[0-9]*|[.][0-9]+)\)/g; print "There are " . scalar @ends . " ends in the response for $stringfinger.\n"; my $phoneout = $speech_data_dir . $stringfinger . "\.script"; open(PHONES, ">$phoneout") || die "Cannot create $phoneout!\n"; print "Creating $phoneout\n"; for $phonename (0 .. $#names) { printf PHONES "%.4f %s\n", $ends[$phonename], $names[$phonename]; } close(PHONES); print "Phoneme script written.\n"; }, waveform_data => sub { my $rbuf = $_[ARG0]; my $burf = length($$rbuf); # if the file hasn't been opened/created then do it. my $stringfinger = crc32($_[HEAP]{cuebuffer}[0]->{cuestring}); my $outfile = $speech_tmp_dir . $stringfinger . "\.wav"; if (! (defined *AUDIO)) { print "Creating $outfile\n"; open(AUDIO, ">$outfile") || die "Cannot create $outfile!\n"; } print "waveform data: $burf bytes\n" if DEBUG > 1; if ($_[HEAP]{watermark} == 0) { print "WARNING! Got waveform data with watermark = 0, discarding\n"; return; } if (!(defined *AUDIO)) { print "Something is wrong, we can't write to an unopened filehandle\n"; } # write the waveform print AUDIO $$rbuf; }, eval_done => sub { print "eval_done for $_[ARG1]\n" if DEBUG > 0; return unless $_[ARG1] eq 'wave'; if (!(defined *AUDIO)) { print "Something is wrong, we can't close an unopened filehandle\n"; } close AUDIO; # close the open wav file my $stringfinger = crc32($_[HEAP]{cuebuffer}[0]->{cuestring}); my $wavtmpfile = $speech_tmp_dir . $stringfinger . "\.wav"; my $wavfile = $speech_data_dir . $stringfinger . "\.wav"; print "Running: sox " . $wavtmpfile . " " . $wavfile . $_[HEAP]{soxopts} . "\n" if DEBUG > 1; system("sox " . $wavtmpfile . " " . $wavfile . $_[HEAP]{soxopts}); if ($_[HEAP]{cuebuffer}[0]->{cuestring} =~ m/SABLE/) { # Special case, pad phone script with open mouth for the length of # the wav file. # Get the length of the wav file my $wav = new Audio::Wav; my $readwav = $wav->read($wavfile); my $wavlength = $readwav->length_seconds(); my $phoneout = $speech_data_dir . $stringfinger . "\.script"; open(PHONES, ">$phoneout") || die "Cannot create $phoneout!\n"; print "Creating padded $phoneout\n"; print PHONES "0.2000 pau\n"; printf PHONES "%.4f A\n", $wavlength-0.100; printf PHONES "%.4f pau\n", $wavlength; close(PHONES); print "Padded phoneme script written.\n"; } #print "Time=" . $_[HEAP]{cuebuffer}[0]->{cuetime} . # ", String=" . $_[HEAP]{cuebuffer}[0]->{cuestring} . # ", Emotion=" . $_[HEAP]{cuebuffer}[0]->{cueemotion} . "\n"; $_[HEAP]{watermark}--; print "Pipeline out: $_[HEAP]{watermark}\n" if DEBUG > 0; $_[KERNEL]->yield('update_outbox' => $_[HEAP]{cuebuffer}[0]->{cuetime}, $_[HEAP]{cuebuffer}[0]->{cuestring}, $_[HEAP]{cuebuffer}[0]->{cueemotion} ); }, } ); $poe_kernel->run(); __END__ =head1 NAME POEvoiceServer - renders voice cue requests for use with talking head applications =head1 SYNOPSIS C [B<-s> I] [B<-v> I] [B<-d> I] [B<-f> I] [B<-p> I] [B<-?> | B<-h>] =head1 DESCRIPTION C Creates shared memory segments for both input and output. When a new text cue is pushed into the input segment by an external client (such as the voiceClient CGI), the voiceServer checks the cue against a list of pre-rendered cues currently cached on disk, identified by a CRC32 of the string. If the cue has not been rendered, it sends the request to a local or remote Festival server. Both the waveform and phoneme timing info are extracted from the processed utterance and saved locally for future requests. Post-processing is performed on the waveform to maximise the volume. A timestamp, CRC32 of the string and the requested emotion are pushed onto the outbox stack for use by an external server application (such as POEfaceClient) that manages remote talking head clients (such as zazaface2). Because there is no support for specifying a host and port of the Festival server in the stock 0.03 version of PoCoFestival a patch must be applied to Festival.pm to enable this. Most of the options are self-explanatory. The C<-d> option is provided so that you can test voices built with Festvox without having to install them in your Festival library directory. So, for instance, if you had built a voice called 'net_foobar_quux_ldom' in the directory F, you would run it like this: POEvoiceServer -v net_foobar_quux_ldom -d /home/quux/foobar =head1 BUGS Some of the waveforms returned by Festival have inacurate timing info which may cause sync problems with the face application. Some combinations of POE and Perl 5.8.0 needlessly complain about unopened filehandles, it's safe to ignore these warnings. =head1 AUTHOR Brian Rudy =head1 SEE ALSO L, L =cut