#!/usr/bin/perl

# Copyright (c) 2006 Carnegie Mellon University.  All rights
# reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#
# This work was supported in part by funding from the Defense Advanced
# Research Projects Agency and the National Science Foundation of the
# United States of America, and the CMU Sphinx Speech Consortium.
#
# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use File::Basename qw(dirname);
use File::Temp qw(tempdir);
use File::Spec::Functions;
use File::Path;
use lib dirname($0);
use Text::CMU::LMTraining;
use Getopt::Long;
use Pod::Usage;

# Option parsing setup: '+' is not accepted as an option prefix, and
# single-letter options may be bundled together (e.g. "-vI").
Getopt::Long::Configure(qw(no_getopt_compat bundling));

# Option specifications in Getopt::Long syntax.  A trailing "=s" marks
# an option taking a mandatory string argument; the others are flags.
my @option_specs = (
    'help|h',
    'verbose|v',
    'inputfilter|i=s',
    'fillermap|f=s',
    'filesfrom=s',
    'sentencebreaks=s',
    'split',
    'partials',
    'falsestarts',
    'uttered',
    'filledpauses',
    'tempdir|t=s',
    'interpolate|I',
    'outtrans|O=s',
    'bindir=s',
    'contextfile|c=s',
);

# Parsed options are collected here, keyed by the long option name.
my %opts;
GetOptions(\%opts, @option_specs)
    or pod2usage(-verbose => 1, -exitval => 1);

# --help: show the usage summary instead of running.
if ($opts{help}) {
    pod2usage(-verbose => 1);
}

# At least one positional argument must remain after option parsing.
if (!@ARGV) {
    pod2usage(
        -verbose => 0,
        -exitval => 1,
        -msg     => 'Please specify an ARPA language model file and test set.'
                  . "\n(use -h for a summary of usage)",
    );
}

# Expand the short filter name given with --inputfilter into the full
# package name inside the Text::CMU::InputFilter:: namespace.
$opts{inputfilter} = "Text::CMU::InputFilter::$opts{inputfilter}"
    if defined $opts{inputfilter};

# The first positional argument is the language model (or, with
# --interpolate, a colon-separated list of them); anything left in
# @ARGV afterwards is treated as the evaluation file list.
my $ngramfile = shift;

# --filesfrom: append the file names listed in the given file (one per
# line) to the evaluation file list.
if (defined($opts{filesfrom})) {
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as part of the open mode, and no global bareword
    # handle is left behind.
    open my $list_fh, '<', $opts{filesfrom}
	or die "Failed to open file listing $opts{filesfrom}: $!";
    while (my $listed_file = <$list_fh>) {
	chomp $listed_file;
	# A blank line (typically at the end of the listing) is not a
	# file name; skip it rather than queueing an empty path.
	next if $listed_file eq '';
	push @ARGV, $listed_file;
    }
    close $list_fh;
}

# Turn Ctrl-C into an ordinary exit so that temporary directories are
# still cleaned up on the way out.
$SIG{INT} = sub { exit 1 };

# With --interpolate the model argument names several models; that
# case is handled entirely by test_interpolated().
if ($opts{interpolate}) {
    test_interpolated($ngramfile, @ARGV);
    exit 0;
}

# Single-model case: load the model and evaluate it on the test set.
my $model = Text::CMU::NGramModel->new(%opts);
$model->load($ngramfile);
my $scores = $model->evaluate([@ARGV]);

printf "Perplexity %.2f OOV rate %s\n",
    $scores->perplexity(), $scores->oov();

# Report hit counts for orders 1 through 10, stopping at the first
# order for which the results object returns no value.
my $order = 0;
while (++$order <= 10) {
    my $hits = $scores->ngram_hits($order);
    last if !defined $hits;
    print "$order-gram hits: $hits\n";
}

# Evaluate several language models on the same test set and hand the
# resulting probability streams to the external "interpolate" program.
#
# Arguments:
#   $ngramfile - colon-separated list of language model files
#   @files     - evaluation files passed to each model's evaluate()
#
# For every model this prints one "<file>: Perplexity ... OOV rate ..."
# line.  "interpolate" is then run with a "+ <fprobs file>" argument
# pair per model.  If --bindir was given and holds an executable
# "interpolate", that one is used; otherwise it is looked up in $PATH.
#
# Dies if "interpolate" cannot be started, is killed by a signal, or
# exits with a non-zero status.
sub test_interpolated {
    my ($ngramfile, @files) = @_;
    my @lm_files = split /:/, $ngramfile;

    # Get probability stream and report individual perplexity.  The
    # model objects are kept in @models until the end of the sub, so
    # they are still alive when "interpolate" reads their fprobs files.
    my @models;
    my $index = 0;
    foreach my $lm_file (@lm_files) {
	# Force each LM to use a separate tempdir
	my @tempdir_override;
	if (defined($opts{tempdir})) {
	    my $subdir = catdir($opts{tempdir}, $index++);
	    mkpath($subdir);
	    @tempdir_override = (tempdir => $subdir);
	}
	my $model = Text::CMU::NGramModel->new(%opts, @tempdir_override);
	$model->load($lm_file);

	# NOTE(review): the trailing 1 is passed through unchanged from
	# the original call; its meaning is defined by evaluate() in
	# Text::CMU::NGramModel - confirm there.
	my $results = $model->evaluate(\@files, $model->tempfile("fprobs"), 1);
	printf "%s: Perplexity %.2f OOV rate %s\n",
	    $lm_file, $results->perplexity(), $results->oov();
	push @models, $model;
    }

    # Now run interpolate to get the final values
    my @args = map { ('+', $_->tempfile("fprobs")) } @models;

    # Honour --bindir when it actually contains the binary; fall back
    # to a $PATH lookup otherwise so existing setups keep working.
    my $interpolate = 'interpolate';
    if (defined($opts{bindir})) {
	my $candidate = catfile($opts{bindir}, 'interpolate');
	$interpolate = $candidate if -x $candidate;
    }

    # List-form system() bypasses the shell.  Check the status so that
    # a missing or failing binary is not reported as success.
    my $status = system($interpolate, @args);
    if ($status == -1) {
	die "Failed to run $interpolate: $!\n";
    }
    elsif ($status & 127) {
	die sprintf("%s died with signal %d\n", $interpolate, $status & 127);
    }
    elsif ($status != 0) {
	die sprintf("%s exited with status %d\n", $interpolate, $status >> 8);
    }
    return;
}

1;

__END__

=head1 NAME

ngram_test - Test an N-gram language model

=head1 SYNOPSIS

B<ngram_test> [I<OPTIONS>] I<ARPABO> I<TRANSCRIPTS>

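=head1 DESCRIPTION

B<ngram_test> loads the language model I<ARPABO>, evaluates it on the
test set given by I<TRANSCRIPTS>, and prints the perplexity, the
out-of-vocabulary (OOV) rate, and the number of hits for each N-gram
order.  With B<--interpolate>, several language models are evaluated
individually and their probability streams are then passed to the
C<interpolate> program.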

=head1 OPTIONS

=over 4

=item B<--help> | B<-h>

Print a short help message and exit.

=item B<--verbose> | B<-v>

Show intimate details of what is being done.

=item B<--bindir> I<DIR>

Look for CMU LM toolkit binaries (idngram2lm and friends) in I<DIR>
(the default is just to use $PATH).

=item B<--interpolate> | B<-I>

If this option is given, I<ARPABO> will be treated as a
colon-separated list of language models to be interpolated together
for testing.

=item B<--filesfrom> I<LISTING>

Take the list of input files from I<LISTING> instead of the command-line.

=item B<--tempdir> | B<-t> I<TEMPDIR>

Directory to use for storing temporary files.  If not specified, a
random directory will be generated in $TMPDIR and removed at program
exit.

=item B<--outtrans> | B<-O> I<OUTPUT>

Dump the (filtered and pre-processed) evaluation transcript to
I<OUTPUT>.

=item B<--contextfile> | B<-c> I<CONTEXT>

File listing context cues to be removed from the text in evaluation.
Otherwise, "<s>" will be the only context cue used.

=item B<--inputfilter> | B<-i> I<FILTER>

Use the input filter specified for I<TRANSCRIPTS>.  Currently defined
input filters are B<CMU>, B<ICSI>, B<ISL>, B<NIST>, and B<SWB>.  These
are implemented by Perl modules in the C<Text::CMU::InputFilter::> namespace.
See L<Text::CMU::InputFilter> for more information.

=item B<--fillermap> | B<-f> I<FILLERMAP>

Pass the filler map file given to the input filter.  This file maps
"filler words" (i.e. vocal and non-vocal noises, disfluencies) to
words or nothing.  The implementation is somewhat dependent on the
input filter, and only B<CMU>, B<ICSI>, and B<ISL> actually support it
at the moment.

=item B<--sentencebreaks> I<PUNCTUATION>

The set of punctuations used to break sentences in the input.
Implementation depends on the input filter and only the B<ISL> filter
actually supports it at the moment.

=item B<--split>

Split compound words.

=item B<--partials>

Include partial words.

=item B<--falsestarts>

Include false starts.

=item B<--uttered>

Use "uttered" rather than "intended" text for mispronunciations.

=item B<--filledpauses>

Include filled pauses (UH, UM, ER, AH, etc) in the text.

=back

=head1 SEE ALSO

L<ngram_train>, L<ngram_interpolate>, L<Text::CMU::NGramModel>, L<Text::CMU::InputFilter>

=head1 AUTHOR

David Huggins-Daines E<lt>dhuggins@cs.cmu.eduE<gt>

=head1 COPYRIGHT

Copyright (c) 2006 Carnegie Mellon University.  All rights reserved.
This is free software; see source code for copying conditions.

=cut
