#!/usr/bin/perl

# Copyright (c) 2006 Carnegie Mellon University.  All rights
# reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#
# This work was supported in part by funding from the Defense Advanced
# Research Projects Agency and the National Science Foundation of the
# United States of America, and the CMU Sphinx Speech Consortium.
#
# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use File::Basename qw(dirname);
use File::Spec::Functions qw(catdir updir);
use lib dirname($0);
use Text::CMU::NGramFactory;
use Getopt::Long;
use Pod::Usage;

# ---------------------------------------------------------------------
# Command-line handling
# ---------------------------------------------------------------------

# Allow single-letter switches to be bundled (e.g. "-vr") and do not
# treat a leading "+" as an option introducer.
Getopt::Long::Configure(qw(no_getopt_compat bundling));

# Parsed options, keyed by their long names.  The whole hash is later
# handed to the factory constructor unchanged.
my %opts;

# Getopt::Long specifications: "long|short" with "=s" marking options
# that take a string argument.
my @option_specs = (
    "help|h",
    "verbose|v",
    "logfile|l=s",
    "reusefiles|r",
    "bindir=s",
    "output|o=s",
    "builddir|b=s",
    "testset|t=s",
);

# Bad options: print the usage/options summary and fail.
unless (GetOptions(\%opts, @option_specs)) {
    pod2usage(-verbose => 1, -exitval => 1);
}

# Explicit request for help: same summary, but a successful exit.
if ($opts{help}) {
    pod2usage(-verbose => 1, -exitval => 0);
}

# The specification file is the one mandatory positional argument.
if (!@ARGV) {
    pod2usage(
        -verbose => 0,
        -exitval => 1,
        -msg     => 'Please specify an XML specification file (use -h for a summary of usage).',
    );
}

# Turn Ctrl-C into an ordinary exit(1) instead of the default signal
# death, so that temporary directories get cleaned up.
# NOTE(review): the cleanup itself presumably lives in END blocks or
# destructors of the Text::CMU modules -- not visible from this file.
$SIG{INT} = sub { exit 1 };

# ---------------------------------------------------------------------
# Train, then optionally evaluate and save
# ---------------------------------------------------------------------

# First remaining argument is the XML specification file.
my $spec_path = shift @ARGV;

my $factory = Text::CMU::NGramFactory->new(%opts);
my $lm      = $factory->train($spec_path);

# Evaluation only happens when a test transcript was supplied.
if (defined $opts{testset}) {
    $lm->evaluate($opts{testset});
}

# The model is only written out when an output file was supplied.
if (defined $opts{output}) {
    $lm->save($opts{output});
}

__END__

=head1 NAME

lm_train - Train a language model from an XML specification file

=head1 SYNOPSIS

 lm_train -o ARPAFILE [ OPTIONS ] SPECFILE.xml

=head1 OPTIONS

=over 4

=item B<--help> | B<-h>

Prints a short help message and exits.

=item B<--verbose> | B<-v>

Show progress messages while running.

=item B<--bindir> I<DIR>

Look for CMU LM toolkit binaries (idngram2lm and friends) in I<DIR>
(the default is just to use $PATH).

=item B<--logfile> | B<-l> I<LOGFILE>

Write logging information to I<LOGFILE>.

=item B<--output> | B<-o> I<ARPAFILE>

Write the completed language model to I<ARPAFILE>.

=item B<--builddir> | B<-b> I<DIRECTORY>

Use I<DIRECTORY> for storing intermediate files.  Each object in the
specification file will have its own directory, which may or may not
end up being populated by the program.  One top-level directory is
created for each type of object, e.g. C<Text::CMU::NGramModel>,
C<Text::CMU::Vocabulary>, C<Text::CMU::Transcripts>,
C<Text::CMU::InputFilter>.  Within these directories, the objects'
directories are named according to the names given to them in the
specification file, or autogenerated names (e.g. "g0006") if none were
specified.

=item B<--reusefiles> | B<-r>

Reuse intermediate files in the directory specified by B<--builddir>,
if they exist.  Currently this is limited to transcript files.

=item B<--testset> | B<-t> I<TRANSCRIPT>

Use I<TRANSCRIPT> to evaluate the completed language model.

=back

=head1 DESCRIPTION

This program reads a specification file and trains a language model
based on it.  The full format of the specification file is described
below, and in several other relevant manual pages.

=head1 SPECIFICATION FILE

The specification file is a simple XML file.  The top-level element of
the file currently must be C<E<lt>NGramModelE<gt>>.  Within this
element, you may define various sections describing the components of
this language model.  This includes vocabularies
(C<E<lt>VocabularyE<gt>>), sets of training transcripts
(C<E<lt>TranscriptsE<gt>>), and interpolations of multiple language
models (C<E<lt>InterpolationE<gt>>).

=head1 SEE ALSO

L<ngram_train>, L<ngram_test>, L<ngram_interp>, L<build_vocab>,
L<ngram_pronounce>, L<Text::CMU::NGramFactory>,
L<Text::CMU::NGramModel>, L<Text::CMU::Vocabulary>,
L<Text::CMU::InputFilter>

=head1 AUTHOR

David Huggins-Daines E<lt>dhuggins@cs.cmu.eduE<gt>

=head1 COPYRIGHT

Copyright (c) 2006 Carnegie Mellon University.  All rights reserved.
This is free software; see source code for copying conditions.

=cut
