#!/usr/bin/perl

# Copyright (c) 2006 Carnegie Mellon University.  All rights
# reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#
# This work was supported in part by funding from the Defense Advanced
# Research Projects Agency and the National Science Foundation of the
# United States of America, and the CMU Sphinx Speech Consortium.
#
# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use File::Basename qw(dirname);
use File::Spec::Functions qw(catdir updir);
use lib dirname($0);
use Text::CMU::LMTraining;
use Getopt::Long;
use Pod::Usage;

# Parse the command line into %opts.  "bundling" lets single-letter
# options be combined, and "no_getopt_compat" stops a leading "+" from
# being treated as an option introducer.
Getopt::Long::Configure("no_getopt_compat", "bundling");

# Every recognised option ends up in this hash, keyed by its primary
# (long) name; the whole hash is later handed to the model constructor.
my %opts;
GetOptions(\%opts,
	   "help|h",
	   "verbose",
	   "vocabfile|v=s",
	   "inputfilter|i=s",
	   "fillermap|f=s",
	   "replacelist|r=s",
	   "filesfrom=s",
	   "sentencebreaks=s",
	   "split",
	   "partials",
	   "falsestarts",
	   "uttered",
	   "filledpauses",
	   "tempdir|t=s",
	   "bindir=s",
	   "n=i",
	   "open",
	   "output|o=s",
	   "lmctl=s",
	   "probdef=s",
	   "smoothing=s",
	   "cutoff=i",
	   "topn=i",
	   "cutoffs=s",
	   "oov_fraction=f",
	   "zeroton_fraction=f",
	   "min_unicount=f",
	   # Documented as a comma-separated list (like --cutoffs), so it has
	   # to be taken as a string: a numeric type makes Getopt::Long
	   # reject values such as "0,7,7" before we ever see them.
	   "disc_ranges=s",
	   "outtrans|O=s",
	   "contextfile|c=s")
    or pod2usage(-verbose => 1, -exitval => 1);

# Explicit request for help: print the usage summary and option list.
pod2usage(-verbose => 1) if $opts{help};

# Training data can come from the command line, from --filesfrom, or
# both; having neither is an error.
pod2usage(-verbose => 0, -exitval => 1,
	  -msg => 'Please specify a training set (use -h for a summary of usage).')
    unless @ARGV or $opts{filesfrom};

# Fall back to trigram models when no order was requested.
if (!defined $opts{n}) {
    $opts{n} = 3;
}

# The filter and smoothing options are given as short names on the
# command line; expand each one to a full package name.
if (defined $opts{inputfilter}) {
    $opts{inputfilter} = "Text::CMU::InputFilter::$opts{inputfilter}";
}
if (defined $opts{smoothing}) {
    $opts{smoothing} = "Text::CMU::Smoothing::$opts{smoothing}";
}

# When a vocabulary file was named, build the vocabulary object from it
# and pass it along under the "vocabulary" key.
if (defined $opts{vocabfile}) {
    $opts{vocabulary}
	= Text::CMU::Vocabulary->new(vocabfile => $opts{vocabfile});
}

# Turn Ctrl-C into an ordinary exit so that temporary directories still
# get cleaned up on the way out.
$SIG{INT} = sub {
    exit(1);
};

# Build the model object from the parsed options, feed it every
# transcript, estimate the model, then write whichever outputs were
# requested.
my $ngram = Text::CMU::NGramModel->new(%opts);

# Transcripts named in the --filesfrom listing, one file name per line.
if (defined($opts{filesfrom})) {
    # Three-argument open with a lexical handle: the listing name is
    # taken literally, so characters such as '>' or '|' in it cannot
    # alter the open mode.
    open(my $listing, '<', $opts{filesfrom})
	or die "Failed to open file listing $opts{filesfrom}: $!";
    # A named variable (rather than $_) keeps the current line safe from
    # anything add_transcript() might do to $_.
    while (my $transcript = <$listing>) {
	chomp $transcript;
	$ngram->add_transcript($transcript);
    }
    close($listing);
}

# Transcripts named directly on the command line.
foreach my $transcript (@ARGV) {
    $ngram->add_transcript($transcript);
}

$ngram->estimate();

# Each output is optional and only written when its option was given.
$ngram->save($opts{output}) if defined($opts{output});
$ngram->save_transcripts($opts{outtrans}) if defined($opts{outtrans});
$ngram->save_probdef($opts{probdef}) if defined($opts{probdef});

# The control file refers to both the language model and the probability
# definition file, so it can only be written when all three were named.
if (defined($opts{lmctl})) {
    if (defined($opts{output}) and defined($opts{probdef})) {
	$ngram->save_lmctl($opts{lmctl}, $opts{output}, $opts{probdef});
    }
    else {
	# Previously this case was skipped silently; tell the user why
	# the requested file did not appear.
	warn "--lmctl requires both --output and --probdef; "
	    . "not writing $opts{lmctl}\n";
    }
}

1;

__END__

=head1 NAME

ngram_train - Train an N-gram language model

=head1 SYNOPSIS

B<ngram_train> [I<OPTIONS>] I<TRANSCRIPTS>

=head1 OPTIONS

=over 4

=item B<--help> | B<-h>

Print a short help message and exit.

=item B<--verbose>

Show intimate details of what is being done.

=item B<--bindir> I<DIR>

Look for CMU LM toolkit binaries (idngram2lm and friends) in I<DIR>
(the default is just to use $PATH).

=item B<--filesfrom> I<LISTING>

Take the list of input files from I<LISTING>, one file name per line.
Any files named on the command line are processed as well.

=item B<--output> | B<-o> I<ARPA_LM>

Output an ARPA-format language model to I<ARPA_LM>.  Otherwise no
output will be created!

=item B<--lmctl> I<LMCTL>

Output a Sphinx-format language model control file to I<LMCTL>.  This
is only useful in the case where there are class tags in the input.
The control file is only written when B<--output> and B<--probdef> are
also given.

=item B<--probdef> I<PROBDEF>

Output a Sphinx-format class probability definition file to
I<PROBDEF>.  Probabilities are estimated with maximum likelihood.
This is only useful in the case where there are class tags in the
input.

=item B<-n> I<N>

Number of words of context to use in building a language model, i.e.,
the N in the N-gram model.  Default value is 3.

=item B<--open>

Build an open-vocabulary model.  The default is to build a
closed-vocabulary model.  Note that with an open-vocabulary model you
may wish to set the discounting range for unigrams to be non-zero.
For typical speech recognizers, open-vocabulary models are defective,
since <UNK> cannot be recognized, and discounting unigrams in
particular makes no sense at all.

=item B<--tempdir> | B<-t> I<TEMPDIR>

Directory to use for storing temporary files.  If not specified, a
random directory will be generated in $TMPDIR and removed at program
exit.

=item B<--outtrans> | B<-O> I<OUTPUT>

Dump the (filtered and pre-processed) training transcripts to
I<OUTPUT>.

=item B<--contextfile> | B<-c> I<CONTEXT>

File listing context cues to be removed from the text in training.
Otherwise, "<s>" will be the only context cue used.

=item B<--vocabfile> | B<-v> I<VOCABFILE>

File explicitly specifying the vocabulary to use when training the
language model.  This is useful if you wish to use a common vocabulary
between a set of language models to be interpolated.  Also, all words
from this vocabulary will be included in the unigrams, and thus could
conceivably be recognized, therefore allowing you to do closure on the
vocabulary from the training data.

=item B<--cutoff> I<CUTOFF>

Exclude all words from the (automatically generated) vocabulary
occurring fewer than I<CUTOFF> times.  This option is exclusive with
B<--vocabfile>.

=item B<--topn> I<TOPN>

Use the I<TOPN> most frequent words when automatically generating a
vocabulary.  This option is exclusive with B<--vocabfile>.

=item B<--smoothing> I<METHOD>

Use I<METHOD> for smoothing probabilities.  Currently defined methods
are B<GoodTuring> (the default), B<Linear>, B<Absolute>, and
B<WittenBell>.

=item B<--cutoffs> I<1GRAMS,2GRAMS,3GRAMS...>

This option is followed by a comma-separated list of counts.  All
N-grams occurring less frequently than the count given will be excluded
from the model training.

=item B<--disc_ranges> I<N,N,N...>

This option is followed by a comma-separated list of integers.  These
are the discounting ranges to use for Good-Turing smoothing.  If
building an open-vocabulary model it's advisable to set the first one
to non-zero, in order to reserve some probability mass for unknown
unigrams.

=item B<--oov_fraction> I<OOV_FRACTION>

=item B<--min_unicount> I<MIN_UNICOUNT>

=item B<--inputfilter> | B<-i> I<FILTER>

Use the input filter specified for I<TRANSCRIPTS>.  Currently defined
input filters are B<CMU>, B<ICSI>, B<ISL>, B<NIST>, and B<SWB>.  These
are implemented by Perl modules in the C<Text::CMU::InputFilter::> namespace.
See L<Text::CMU::InputFilter> for more information.

=item B<--fillermap> | B<-f> I<FILLERMAP>

Pass the filler map file given to the input filter.  This file maps
"filler words" (i.e. vocal and non-vocal noises, disfluencies) to
words or nothing.  The implementation is somewhat dependent on the
input filter, and only B<CMU>, B<ICSI>, and B<ISL> actually support it
at the moment.

=item B<--replacelist> | B<-r> I<REPLACELIST>

Pass the replacement map file given to the input filter.  This file is
a list of words and their replacements, separated by ':'.  You can
also replace regular expression patterns in the text by prefixing the
source with a '+'.  This doesn't always exactly do what you want
because it applies to the output of the input filter.  That's a bug.

=item B<--sentencebreaks> I<PUNCTUATION>

The set of punctuations used to break sentences in the input.
Implementation depends on the input filter and only the B<ISL> filter
actually supports it at the moment.

=item B<--split>

Split compound words.

=item B<--partials>

Include partial words.

=item B<--falsestarts>

Include false starts.

=item B<--uttered>

Use "uttered" rather than "intended" text for mispronunciations.

=item B<--filledpauses>

Include filled pauses (UH, UM, ER, AH, etc) in the text.

=back

=head1 DESCRIPTION

This program reads one or more transcriptions and builds an N-gram
language model from them in ARPA format.  It allows for a variety of
input formats, and new ones can be defined by writing Perl modules
using a simple API (see L<Text::CMU::InputFilter>).  A number of options are
available for varying the text normalization for experimental
purposes.

=head2 Input Formats

By default, the transcriptions are expected to be in the standard
format used by SphinxTrain, which is similar to the NIST "trn" format,
namely:

 <s> WORD1 WORD2 WORD3 WORD4 ... </s> (uttid)

As an extension, class tags are also allowed, which are expected to be
in the form used by the CMU Communicator class tagger:

 <ClassName>WORD</ClassName>

If class tags are present, a Sphinx language model control file
(specified with B<--lmctl>) and probability definition file (specified
with B<--probdef>) can also be generated.

The other formats supported (using the B<--inputfilter> option) are:

=over 4

=item CMU

This is the format used for CMU Communicator data.  See
L<Text::CMU::InputFilter::CMU> for more information.

=item ISL

This is the format used in the ISL Meeting Room transcription project.
See L<Text::CMU::InputFilter::ISL> for more information.

=item NIST

This is the format used in the NIST Pilot Meeting corpus.  See
L<Text::CMU::InputFilter::NIST> for more information.

=item SWB

This is the format used in the Switchboard Transcription Project.  See
L<Text::CMU::InputFilter::SWB> for more information.

=item ICSI

This is the format used in the ICSI Meeting corpus.  See
L<Text::CMU::InputFilter::ICSI> for more information.

=back

=head1 SEE ALSO

L<ngram_test>, L<ngram_interp>, L<Text::CMU::InputFilter>, L<Text::CMU::NGramModel>,
L<Text::CMU::Smoothing>, L<Text::CMU::Vocabulary>

=head1 AUTHOR

David Huggins-Daines E<lt>dhuggins@cs.cmu.eduE<gt>

=head1 COPYRIGHT

Copyright (c) 2006 Carnegie Mellon University.  All rights reserved.
This is free software; see source code for copying conditions.

=cut

