#!/usr/bin/perl

# Copyright (c) 2006 Carnegie Mellon University.  All rights
# reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#
# This work was supported in part by funding from the Defense Advanced
# Research Projects Agency and the National Science Foundation of the
# United States of America, and the CMU Sphinx Speech Consortium.
#
# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use File::Basename qw(dirname);
use lib dirname($0);
use Text::CMU::NGramModel::ARPA;
use Text::CMU::Vocabulary;
use Getopt::Long;
use IO::File;
use Pod::Usage;
use File::Temp qw(tempdir);
use File::Spec::Functions qw(catfile catdir updir);

binmode STDOUT, ':utf8';

Getopt::Long::Configure('bundling', 'no_getopt_compat');

my %opts = (
	   );
GetOptions(\%opts,
	   'indict|i=s',
	   'indef|d=s',
	   'outdict|o=s',
	   'invocab|v=s@',
	   'inwfreq|w=s@',
	   'festival=s',
	   'tempdir|t=s',
	   'lexfile=s',
	   'lexicon=s',
	   'phoneset=s',
	   'bindir=s',
	   'allwords',
	   'mandarin',
	   'upper',
	   'lower',
	   'help|h')
    or pod2usage(-verbose => 1, -exitval => 1);
pod2usage(-verbose => 1) if $opts{help};

# Exit cleanly on Ctrl-C so that temporary dirs get cleaned up
$SIG{INT} = sub { exit 1 };

$opts{tempdir} = tempdir(CLEANUP => 1) unless defined $opts{tempdir};
$opts{festival} = 'festival' unless defined $opts{festival};

# Get 1-grams
my %unigram;
foreach my $lmfile (@ARGV) {
    my $lm = Text::CMU::NGramModel::ARPA->new($lmfile, 1);
    foreach my $ngram ($lm->ngrams(1)) {
	my ($word) = $ngram->words();
	# Use undef to signify that this word exists and yet has no
	# valid pronunciation
	$unigram{$word} = undef;
    }
}

# Get additional 1-grams from vocabulary files if given
if ($opts{invocab}) {
    foreach my $v (@{$opts{invocab}}) {
	my $vocab = Text::CMU::Vocabulary->new();
	$vocab->load_words($v);
	foreach my $word ($vocab->words()) {
	    $unigram{$word} = undef;
	}
    }
}

# Get additional 1-grams from word frequency files if given
if ($opts{inwfreq}) {
    foreach my $w (@{$opts{inwfreq}}) {
	my $vocab = Text::CMU::Vocabulary->new();
	$vocab->load($w);
	foreach my $word ($vocab->words()) {
	    $unigram{$word} = undef;
	}
    }
}

# Get additional 1-grams from a class definition file if given
if ($opts{indef}) {
    open DEF, "<$opts{indef}" or die "Failed to open $opts{indef}: $!";
    while (<DEF>) {
	next if /^$/;
	next if /^LMCLASS \[/;
	next if /^END \[/;
	my ($word) = split;
	$unigram{$word} = undef;
    }
    close DEF;
}

# Get input dictionary
# FIXME: We assume here that it is case insensitive (and UTF-8)
my %indict;
sub read_dict {
    my ($dict, $dictfn, $allwords, $keepnums) = @_;
    my $fh = IO::File->new($dictfn, "<:utf8")
	or die "Failed to open $dictfn: $!";
    while (<$fh>) {
	chomp;
	my ($word, @phones) = split;
	# Remove class tags
	$word =~ s/:[^\(]+//;
	if ($word =~ s/\((\d+)\)$//) {
	    unless (ref $dict->{uc$word}) {
		$dict->{uc$word} = [ $dict->{uc$word} ];
	    }
	    if ($keepnums) {
		$dict->{uc$word}[$1-1] = "@phones";
	    }
	    else {
		my $dup;
		foreach my $p (@{$dict->{uc$word}}) {
		    $dup = ($p eq "@phones");
		    last if $dup;
		}
		push @{$dict->{uc$word}}, "@phones" unless $dup;
	    }
	}
	else {
	    if ($keepnums) {
		$dict->{uc$word} = "@phones";
	    }
	    else {
		if (not exists $dict->{uc$word}) {
		    $dict->{uc$word} = "@phones";
		}
		else {
		    unless (ref $dict->{uc$word}) {
			$dict->{uc$word} = [ $dict->{uc$word} ];
		    }
		    my $dup;
		    foreach my $p (@{$dict->{uc$word}}) {
			$dup = ($p eq "@phones");
			last if $dup;
		    }
		    push @{$dict->{uc$word}}, "@phones" unless $dup;
		}
	    }
	}
	# Add this to the 1-grams if --allwords
	if ($allwords) {
	    if ($opts{upper}) {
		$word = uc $word;
	    }
	    elsif ($opts{lower}) {
		$word = lc $word;
	    }
	    $unigram{$word} = undef;
	}
    }
}
if (defined($opts{indict})) {
    read_dict(\%indict, $opts{indict}, $opts{allwords}, 1);
}
my %unihan;
if ($opts{mandarin}) {
    read_dict(\%unihan, catfile(dirname($0), "unihan.dic"));
}

# Expand all COMPOUND_WORDS in %unigram, because sphinx3_align wants
# the constituent parts to exist for some reason
foreach (sort keys %unigram) {
    next if /^\+\+.*\+\+$/; # Don't do this with filler words!
    my @parts = split /_/;
    foreach my $part (@parts) {
	unless (exists($unigram{$part})) {
	    $unigram{$part} = undef;
	}
    }
}

# Try to find things that are in the input dictionary
# Update %unigram with these pronunciations
my @unknown;
foreach (sort keys %unigram) {
    my $orig = $_;

    # Sometimes these are in the input, sometimes not
    next if /^<.*>$/; # Skip <UNK>, <s>, etc
    next if /^\+\+.*\+\+$/; # Skip filler words

    # Skip class tags
    next if /^\[.*\]$/;

    # Remove class tags
    s/^([^:]+):.*/$1/;

    s/^[[:punct:]]+//; # Remove leading punctuation

    # Try to find this in the input dictionary
    if (exists $indict{uc$_}) {
	$unigram{$orig} = $indict{uc$_};
	next;
    }
    # Try Unihan too
    elsif (exists $unihan{$_}) {
	$unigram{$orig} = $unihan{$_};
	next;
    }

    # If Mandarin, assume any capitalized sequence is an acronym
    if ($opts{mandarin} and /^[A-Z]+$/) {
	print STDERR "Assuming $_ is an acronym\n";
	$_ = join '_', split "", $_;
    }

    # Deal with acronyms
    if (/^((?:[A-Za-z](?:_|$)){2,})(.*)$/) {
	my $expand = $1;
	my $rest = $2;
	$expand =~ s/_/. /g;
	$expand =~ s/([A-Za-z])$/$1./g;
	$_ = $expand . lc$rest;
    }
    else {
	# Lowercase it (best guess at true case)
	$_ = lc $_
    }
    # Replace all remaining underscores with spaces
    tr/_-/ /;
    s/ $//;
    s/^ //;

    # And continue if there is nothing left
    next if /^\s*$/;

    # Try to find the resulting parts in the input dictionary and
    # concatenate them if found
    my @parts;
    # Split Hanzi into individual characters
    if (/\p{Han}/) {
      @parts = split "";
    }
    # Otherwise use whitespace
    else {
      @parts = split;
    }
    my @concat;
    # FIXME: For Mandarin this isn't optimal, actually we want to try
    # to find a segmentation of the compound in terms of existing
    # words, not characters.
    while (@parts) {
	last unless exists $indict{uc$parts[0]} or exists $unihan{$parts[0]};
	my $part = shift @parts;
	if (ref $indict{uc$part}) {
	    # If there are alternates take (1)
	    # FIXME: hope it exists!
	    push @concat, $indict{uc$part}[0];
	}
	elsif (exists $indict{uc$part}) {
	    push @concat, $indict{uc$part};
	}
	elsif (ref $unihan{$part}) {
	    # If there are alternates take (1), as that is the most common
	    push @concat, $unihan{$part}[0];
	}
	elsif (exists $unihan{$part}) {
	    push @concat, $unihan{$part};
	}
    }
    # Some parts left, so they weren't found
    if (@parts) {
	# Not found (compound as a whole)
	s/\"/\\"/g; # Escape quotes!
	push @unknown, [$orig => $_];
	print uc("$_\n") foreach split;
    }
    else {
	# Whitespace and concatenate them
	$unigram{$orig} = "@concat";
	warn "Synthesizing compound for $orig: $_ => @concat";
    }
}

# Get Festival to do text-processing and synthesis on them
my $tmpfile = catfile($opts{tempdir}, "words.in");
my $fh = IO::File->new($tmpfile, ">:utf8")
    or die "Failed to open $tmpfile: $!";
foreach (@unknown) {
    print $fh qq{"$$_[1]"\n};
}
$fh->close();
my $lispfile = catfile($opts{tempdir}, "pronounce.scm");
$fh = IO::File->new($lispfile, ">:utf8");
print $fh qq((load "$opts{lexfile}")\n) if defined $opts{lexfile};
print $fh qq((lex.select '$opts{lexicon})\n) if defined $opts{lexicon};
if (defined($opts{phoneset})) {
    print $fh qq((require '$opts{phoneset}_phones)\n);
    print $fh qq((PhoneSet.select '$opts{phoneset})\n);
}
my $outfile = catfile($opts{tempdir}, "words.out");
print $fh <<"EOL";
(setq words (load "$tmpfile" t))
(setq fp (fopen "$outfile" "w"))
(while words
 (let ((u (eval (list 'Utterance 'Text (car words)))))
  (Text u)
  (Token u)
  (Word u)
  (PostLex u)
  (format fp "%l\\n" (mapcar item.name (utt.relation.items u 'Segment))))
 (setq words (cdr words)))
(fclose fp)
EOL

# We have to use this bogus way to circumvent a weird bug in Festival
open FESTIVAL, "| $opts{festival} --pipe" or die "Failed to popen festival: $!";
print FESTIVAL qq{(load "$lispfile")};
close FESTIVAL or die "Festival failed with code $?: $!";

# Now read them in and merge them with the existing dictionary
$fh = IO::File->new($outfile, "<:utf8")
    or die "Failed to open $outfile: $!";
my $i = 0;
while (<$fh>) {
    chomp;
    # Just strip out parens and quotes
    tr/()"//d;

    unless (defined($opts{lexicon})) {
	# Do some hackish post-processing on them for default CMUdict (argh)
	$_ = uc $_;
	s/\bAX\b/AH/g;
    }

    unless ($_ eq 'NIL' or /^\s*$/) {
	# Get original orthography
	my $orig = $unknown[$i++][0];
	if ($opts{mandarin}) {
	    # Find an appropriate transliteration
	    # FIXME: Actually, don't, for the time being...
	}
	else {
	    $unigram{$orig} = $_;
	}
    }
}

# Write out the merged dictionary
if (defined($opts{outdict})) {
    $fh = IO::File->new($opts{outdict}, ">:utf8")
	or die "Failed to open $opts{outdict}: $!";
}
else {
    $fh = \*STDOUT;
}
foreach my $word (sort keys %unigram) {
    unless (defined $unigram{$word} and $unigram{$word} =~ /\S/) {
	warn "No pronunciation for $word, skipping";
	next;
    }
    if (ref $unigram{$word}) {
	my $i = 0;
	foreach (@{$unigram{$word}}) {
	    print $fh "$word\t$_\n" if $i == 0;
	    print $fh "$word(",$i+1,")\t$_\n" if $i != 0;
	    ++$i;
	}
    }
    else {
	print $fh "$word\t$unigram{$word}\n"
    }
}
$fh->close();

__END__

=head1 NAME

ngram_pronounce - Generate pronunciation dictionary from one or more language models

=head1 SYNOPSIS

B<ngram_pronounce> [I<OPTIONS>] I<ARPALM> [I<ARPALM>...]

=head1 OPTIONS

=over 4

=item B<--indict> | B<-i> I<DICTFILE>

File to take default pronunciations from.  New pronunciations will
only be generated if not present in this dictionary.  If B<--allwords>
is given, then all words from this dictionary will be included in the
output.  Otherwise, only words actually present in the vocabulary will
be included.

=item B<--outdict> | B<-o> I<DICTFILE>

File to save the resulting pronunciation dictionary in.

=item [B<--invocab> | B<-v> I<VOCABFILE> ...]

CMU language modeling toolkit-style vocabulary file containing
additional words to be added to the lexicon (format is one word per
line).  If this option is specified multiple times, all I<VOCABFILE>
arguments will be added.

=item [B<--inwfreq> | B<-w> I<WFREQFILE> ...]

CMU language modeling toolkit-style word frequency file containing
additional words to be added to the lexicon.  If this option is
specified multiple times, all I<WFREQFILE> arguments will be added.

=item B<--indef> | B<-d> I<DEFFILE>

Class-based language model definition file containing additional words
to be added to the lexicon.

=item B<--festival> I<FESTIVAL>

The path to the Festival binary to use.  If not specified, $PATH will
be searched.

=item B<--tempdir> | B<-t> I<TEMPDIR>

Directory to use for storing temporary files.  If not specified, a
random directory will be generated in $TMPDIR and removed at program
exit.

=item B<--lexfile> I<LEXICON.SCM>

Scheme file containing a lexicon to use for pronunciation.

=item B<--lexicon> I<LEXNAME>

The name of the (predefined) Festival lexicon to use for pronunciation.

=item B<--phoneset> I<PHONESET>

The name of the (predefined) Festival phoneset to use for
pronunciation.  This may be required if you use --lexicon.

=item B<--allwords>

Include all words from the input dictionary (specified with
B<--indict>) in the output.

=item B<--upper>

Force words from the input dictionary (specified with
B<--indict>) to be upper-case in the output.

=item B<--lower>

Force words from the input dictionary (specified with
B<--indict>) to be lower-case in the output.

=back

=head1 DESCRIPTION

B<ngram_pronounce> takes a set of ARPA-format language models and
generates a pronunciation dictionary from the union of their
vocabularies.  If an input dictionary is specified, it will attempt to
take pronunciations from the input dictionary.  Otherwise, it will use
the Festival speech-synthesis system to generate pronunciations.  A
specific lexicon file, lexicon name, and phoneset can be passed to
Festival.  Otherwise it will default to the CMU dictionary's version
of the ARPABet for English.

=head1 BUGS

Ideally we would train letter-to-sound rules from the input dictionary
and use these to generate output pronunciations (or generate them in a
form that they could be loaded into a future version of Sphinx).
Currently it's necessary to have a Festival version of the lexicon,
phoneset, and LTS rules you wish to use for pronunciation modeling.

=head1 SEE ALSO

L<ngram_train>, L<ngram_test>, L<ngram_interp>

=head1 AUTHOR

David Huggins-Daines E<lt>dhuggins@cs.cmu.eduE<gt>

=head1 COPYRIGHT

Copyright (c) 2006 Carnegie Mellon University.  All rights reserved.
This is free software; see source code for copying conditions.

=cut
