#!/usr/bin/perl -w
#

# voiceServer.pl
# Voice server for Zaza's new face and voice.
#
#  Copyright (C) 2002-2003 Brian Rudy (brudyNO@SPAMpraecogito.com)
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#
# Revision history
#
# 0.11 1-30-2004
# Added support for cmu_us_slt_arctic_hts voice. Added voice variable
# to make changing the voice a little easier.
#
# 0.10 11-14-2003 
# Bug fix in regex matching for Lisp responses caused name/end mismatch 
# for phonemes with additional info included (such as allophone_name)
#
# 0.09 11-22-2002 (yes two versions in one day ;)
# Added support for Sable audio source 'padding' of face cues with open
# mouth (work-around)
#
# 0.08 11-22-2002 
# Fixed another bug in renderspeech() dealing with returning to 
# normal mode after Sable mode. No more Multiple Personality Disorder ;) 
#
# 0.07 11-15-2002
# Fixed bug in cue cache handling routine preventing normal cached 
# cues from being handled properly after a Sable cue.
#
# 0.06 11-8-2002
# Reduced loop delay from 250ms to 50ms with new version of 
# IPC::Shareable
#
# 0.05 9-10-2002
# Initial Sable text mode support.
#
# 0.04 8-14-2002
# Quick update to support OGI 'tll' voice. Just un-comment the old 
# @lispsetup array to use the old MBROLA voice. Regex improvements to 
# support non 'word' characters in the phonemes.
#
# 0.03 5-21-2002
# Updated face client outbox shm to use 'stack' of n last cues (specified by 
# $cuestacksize). This should allow slower async clients to keep current, and
# prevent the loss of rapidly-sent cues. Added emotion support.
#
# 0.02 5-20-2002
# Added audio post-processing option to clean up output on Zaza's hardware. 
# Fixed shm init to properly auto-generate 'voiceServer initialized' message
# if not already cached.
#
# 0.01 2-13-2002 
# First functional version. Basic speech request support (no SABLE, JSML, 
# VXML or other markup supported). No 'emotion' support.
#
#
# Bugs: 
# Sending a request for the same speech cue works, but behaves a little 
# differently than other requests. 
#
#
# Notes:
#
# Persistent server application with CGI front-end for speech requests.
# IPC::Shareable for client comm.
#
# Speech rendering requests will be sent to a persistant festival_server
# instance specified by $festival_server and $festival_port.
#
# Speech 'cache' used for speed. Pre-rendered scripts and wavefiles stored 
# on disk, and re-indexed each time voiceServer is run.
#
# Since this approach only speaks one sentance at a time, might want to 
# split multi-sentance speech request text into one or more speech 
# requests. (might be tricky w/ current implimentation)
#
# Hooks for CGI face client programs
# Seperate CGI clients for Java face interface and speech cue input handling.
#
#
# Requires:
#  String::CRC32
#  IPC::Shareable
#  Festival::Client::Async
#  Audio::SoundFile

use String::CRC32;
use IPC::Shareable;
use Festival::Client::Async qw(parse_lisp);
use Audio::SoundFile;
   
## Config info 
##
my $festival_server = "localhost";
my $festival_port   = 1314;

# Where on local filesystem phones and waveforms are stored.
my $speech_data_dir = "/home/brudy/public_html/zaza/speechdata/";

# Post-process the audio after rendering. If no post-processing
# is desired, just leave this blank.
#my $ppopts = "| sox -twav - -twav - pitch 550 35 c c ";
#my $ppopts = "| sox -twav - -twav - pitch 550 35 c c highpass 1000 vol 5.0 ";
#my $ppopts = "| sox -twav - -twav /var/tmp/tmpout.wav pitch 750 55 c c; sox /var/tmp/tmpout.wav -twav - highp 4000 vol 6.0 ";
#$ppopts = "| sox -twav - -twav /var/tmp/tmpin.wav vol 2.0 ; 
#sox -v -1.0 /var/tmp/tmpin.wav -twav - pick 2 ;
#my $ppopts = "| sox -twav - -c2 -twav - vol 2.0 ";
#my $ppopts = "| sox -twav - -traw -w -s -r16000 - | sox -traw -w -r16000 -s - -twav - mask vol 2.0 ";
my $ppopts = "| sox -twav - -twav - mask vol 2.0 ";
#my $ppopts = "";


# Number of cues to keep in shm outbox 'stack'
my $cuestacksize = 10;

# voice to use
#my $voice = "tll_diphone";
my $voice = "cmu_us_slt_arctic_hts";

# Clean up on untrapped signal error 
$SIG{INT} = \&catch_int;
sub catch_int {
  print "Got SIGHUP, shutting down.\n";
  exit;
}


## main

# setup shared memory

my $cueglue = 'cuedata';
my $faceglue = 'facedata';
my %options = ( 'create'    => 1,
                'exclusive' => 0,
                'mode'      => 0666,
                'destroy'   => 1,
              );

my %faceoptions = ( 'create'    => 1,
                'exclusive' => 0,
                'mode'      => 0666,
                'destroy'   => 1,
              );

## If this code dies or is killed unexpectedly the semaphore arrays
## will be unwritable. They must be identified with ipcs, and deleted with 
## ipcrm manually! catch_int should resolve this problem now.

# This is our speech cue input shm
tie(%cueinfo, IPC::Shareable, $cueglue, { %options }) or
        die "voiceServer: tie failed\n";

# This is our face client output shm
tie(@faceinfo, IPC::Shareable, $faceglue, { %faceoptions }) or
        die "voiceServer: tie failed\n";

#%faceinfo = ( 'timestamp'    => time(),
#              'stringfinger' => "1234567890",
#            );


# index pre-rendered speech directory, store in local array.

print "Indexing speech cache.\n";
opendir(DIR, $speech_data_dir) || die "Can't open speech data directory!";
@filenames = readdir(DIR);
closedir(DIR);

my @stringlist;
my $fileindex = 0;
my $renderedspeech = "nothing";
my $rendering = 0;
my $donerendering;
my $currenttime;
my $match;
my $textmode = "normal";

foreach $file (@filenames) {
  if (($file ne "..") && ($file ne ".")) {
    # we are only counting the wavs
    if ($file =~ m/(\d+).wav/) {
      print "Loading " . $1 . "\n";
      $stringlist[$fileindex] = $1;
      $fileindex++;
    }
  }
}


# connect to festival server
$fest = Festival::Client::Async->new($festival_server, $festival_port)
      or die "couldn't connect to Festival server: $!";

# send setup info to festival (voice, Wavefiletype, phoneme ouput, etc)

# To use Sable markup, just:
# (set! after_analysis_hooks (list phone_output_info))
# to dump the talking head info while using the sable-mode

#my @lispsetup = ( 
#                  "(Parameter.set \'Wavefiletype \'wav)",
#                  "(voice_us1_mbrola)",
#                  "(tts_return_to_client)",
#                  "(load \"/home/brudy/voiceServer/zaza-mode.scm\")"
#                );
#my @lispsetup = ( 
#                  "(Parameter.set \'Wavefiletype \'wav)",
#                  "(voice_" . $voice . ")",
#                  "(tts_return_to_client)",
#                  "(load \"/home/brudy/voiceServer/zaza-mode.scm\")",
#                  "(set! after_analysis_hooks phone_output_info)"
#                );
my @lispsetup = (
                  "(Parameter.set \'Wavefiletype \'wav)",
                  "(voice_" . $voice . ")",
                  "(tts_return_to_client)",
                  "(load \"/home/brudy/voiceServer/zaza-mode.scm\")",
                  "(set! hts_uv_threshold 0.8)",
                  "(set! hts_use_phone_align 1)",
                  "(set! after_analysis_hooks phone_output_info)"
                );


for $index (0 .. $#lispsetup) {
  print "Sending:" . $lispsetup[$index] . "\n";
  $fest->server_eval_sync($lispsetup[$index], 
                          {
			    LP => sub {
				     my $lisp = shift;
                                     print "lisp response=" . $lisp;
			    },
			    WV => sub {
				     my $wave = shift;
                                     print "I got wave data.\n";
			    }
                          }) or die "error from Festival server";
}

# Set some sane initial values in both shm mailboxes
$faceinfo[0]{timestamp} = 0;
$faceinfo[0]{stringfinger} = sprintf "%s", "0";
$faceinfo[0]{emotion} = "happy";

# We might end up in a race condition with a cue client if the 
# client trys to write the shm before we do.
$cueinfo{timestamp} = time(); 
$cueinfo{cuestring} = "voice server initialized";
$currenttime = $faceinfo[0]{timestamp};
$cueinfo{emotion} = "happy";


#main loop
while (1) {
  if ($cueinfo{timestamp} != $currenttime) {
    # Do we need to do something, or just wait for renderspeech to finish?
    if (!$rendering && ((crc32($cueinfo{cuestring}) != crc32($renderedspeech)) || ($cueinfo{cuestring} eq '0'))) {
      if ($cueinfo{cuestring} eq '0') {
        # Client only requested emotion change
        print "Met first emotion only change condition.\n";
        $match = 1;
      }
      else {
        print "Got a speech request for: " . $cueinfo{cuestring} . 
          ", fingerprint:" . (crc32($cueinfo{cuestring})) . "\n";
        $match = 0;
        for $stringindex (0 .. $#stringlist) {
          if($stringlist[$stringindex] == (crc32($cueinfo{cuestring}))) {
            $match = 1;
          }
        }
      }
      if ($match) {
        if ($cueinfo{cuestring} eq '0') {
          print "Emotion only change.\n";
        }
        else {
          print "Returning cached request.\n";
        }
        $renderedspeech = $cueinfo{cuestring}; 
        $donerendering = 1;
      }
      else {
        $donerendering = 0;
        renderspeech($cueinfo{cuestring});
      }
    }
    if ($donerendering) {
      $currenttime = $cueinfo{timestamp};
      if ($cueinfo{cuestring} eq '0') {
        push @faceinfo, {timestamp => $currenttime, stringfinger => '0', emotion => $cueinfo{emotion}};
      }
      else {
        if ($renderedspeech =~ m/SABLE/) {
          print "Pushing Sable-mode text stringfinger to stack.\n";
          # We need to unescape the string to get the correct stringfinger
          $renderedspeech =~ s/\\\"/\"/go;
          push @faceinfo, {timestamp => $currenttime, stringfinger => (sprintf "%s", crc32($renderedspeech)), emotion => $cueinfo{emotion}};
        } 
        else {
          push @faceinfo, {timestamp => $currenttime, stringfinger => (sprintf "%s", crc32($renderedspeech)), emotion => $cueinfo{emotion}};
        }
      }
      if (($#faceinfo > $cuestacksize) || ($faceinfo[0]{timestamp} eq '0')) {
        shift @faceinfo;
      }
      print "Done handling request for: $renderedspeech\n";
      print "There are now " . $#faceinfo . " elements on the stack.\n";
      print "Timestamp\tStringfinger\tEmotion\n";
      for $stackelement (0 .. $#faceinfo) {
        print $faceinfo[$stackelement]{timestamp} . "\t" . $faceinfo[$stackelement]{stringfinger} . "\t" . $faceinfo[$stackelement]{emotion} . "\n";
      }
    }
  }
  # We need to sleep for a little while, or IPC::Sharable misbehaves
  # sleep for 250ms
  select(undef, undef, undef, 0.05);
}


sub renderspeech {
  ($speechstring) = @_;

  #use URI::Escape;
  my $mrstring;
  $rendering = 1;
  my $stringfinger = crc32($speechstring); 
  my $outfile = $speech_data_dir . $stringfinger . "\.wav";
  print "Creating $outfile.\n";
  open(AUDIO, "$ppopts>$outfile") || die "Cannot create $outfile!\n";
  # Render and save waveform to local filesystem
  if ($speechstring =~ m/SABLE/) {
    print "Sable mode enabled.\n";
    #$speechstring = uri_escape($speechstring);
    $speechstring =~ s/\"/\\"/go;
    $mrstring = "(tts_textall \"$speechstring\" \'sable)";
    $textmode = "sable";
  } 
  else {
    $mrstring = "(tts_textall \"$speechstring\" nil)";
    $textmode = "normal";
  }
  my @lisprequest = (
                    "(voice_" . $voice . ")",
                    $mrstring
                    );
  for $index (0 .. $#lisprequest) {
    print "Sending $lisprequest[$index]\n";
    $fest->server_eval_sync($lisprequest[$index],
                          {
                            LP => sub {
                                     my $lisp = shift;
                                     #my $lisp = @_;
                                     print "lisp response=" . $lisp;
                            },
                            WV => sub {
                                     my $wave = shift; 
                                     #print "I got wave data.\n";
                                     print AUDIO $wave;
                            }
                          }) or die "error from Festival server";
  }
  close(AUDIO);
  print "Waveform written.\n";

  if ($speechstring =~ m/AUDIO.SRC/) {
    # Special case, pad phone script with open mouth for the length of
    # the wav file.

    # Get the length of the wav file
    my $reader = new Audio::SoundFile::Reader($outfile, \$header);
    $reader->close;
    my $wavlength = $header->get("samples")/$header->get("samplerate");

    my $phoneout = $speech_data_dir . $stringfinger . "\.script";
    open(PHONES, ">$phoneout") || die "Cannot create $phoneout!\n";
    print "Creating padded $phoneout\n";
    print PHONES "0.2000 pau\n";
    printf PHONES "%.4f A\n", $wavlength-0.100;
    printf PHONES "%.4f pau\n", $wavlength;
    close(PHONES);
    print "Padded phoneme script written.\n";
  }
  else {
    # Extract phones from previous request
    my $concatresponse;
    $fest->server_eval_sync("(print_string myphones)",
                          { 
                            LP => sub {
                                     my $lisp = shift;
                                     # We need to remove the escape characters,
                                     # leading and trailing quotes and 
                                     # concatinate before running parse_lisp() 
                                     # on the response.
                                     $lisp =~  s/\\(.)/$1/g;
                                     $lisp =~ s/^\"//;
                                     $lisp =~ s/\"$//;
                                     $concatresponse = $concatresponse . $lisp;
                            }
                          }) or die "error from Festival server";

    #print "Done: $concatresponse";

    #my @names = $concatresponse =~ m/name \"(\S+)\"/g;
    my @names = $concatresponse =~ m/\(name \"(\S+)\"\)/g;
    print "There are " . scalar @names . " names in the response.\n";
  
    #my @ends = $concatresponse =~ m/end ([0-9]+[.]?[0-9]*|[.][0-9]+)/g;
    my @ends = $concatresponse =~ m/end ([0-9]+[.]?[0-9]*|[.][0-9]+)/g;
    print "There are " . scalar @ends . " ends in the response.\n";
 

    my $phoneout = $speech_data_dir . $stringfinger . "\.script";
    open(PHONES, ">$phoneout") || die "Cannot create $phoneout!\n";

    print "Creating $phoneout\n";
    for $phonename (0 .. $#names) {
      printf PHONES "%.4f %s\n", $ends[$phonename], $names[$phonename];
    }
    close(PHONES);  
    print "Phoneme script written.\n";
  }
  # push newly rendered onto end of @stringlist
  push @stringlist, $stringfinger;

  print "There are now " . $#stringlist . " cues in the cache.\n";
  for $theindex (0 .. $#stringlist) {
    print $stringlist[$theindex] . "\n";
  }
  
  $renderedspeech = $speechstring;
  $rendering = 0;
  $donerendering = 1;
}