=head1 NAME

iPE::Model::Emission - The base class for all emission models in iParameterEstimation.

=head1 DESCRIPTION

This serves to initialize the common variables in emission models and serves as a skeleton to all emission models.

=cut

package iPE::Model::Emission;

use base ("iPE::Model");
use POSIX;
use iPE;
use iPE::Globals;
use iPE::Util::Interval;
use iPE::Model::Emission::CDS;
use iPE::Model::Emission::ISO;
use iPE::Model::Emission::LUT;
use iPE::Model::Emission::SDT;
use iPE::Model::Emission::WAM;
use iPE::Model::Emission::WMM;
use iPE::Model::Emission::WWAM;
#use iPE::Model::Emission::ASM; #akshat AAA
use strict;

=head1 CONSTANTS

=over 8

=item MODEL (), DEFAULT_MODEL (), FIXED_MODEL (), SUBMODEL (), FIXED_SUBMODEL ()

These functions act as static constants to the class.  One of these is assigned to the type member variable and indicates what kind of model this is.

=cut
# Model-type codes stored in the type_ member.  These stay ordinary
# prototype-free subs on purpose: adding a () prototype would change how
# bareword uses of them elsewhere in this file are parsed.
sub MODEL           { return 0; }
sub DEFAULT_MODEL   { return 1; }
sub FIXED_MODEL     { return 2; }
sub SUBMODEL        { return 3; }
sub FIXED_SUBMODEL  { return 4; }

=item LITERAL (), LEXICAL (), PENALTY ()

These functions define how to treat a wildcard (see the wildcard member).  If it is LITERAL, the wildcard is to be scored as another character in the alphabet.  If it is LEXICAL, it is to be treated as all possible letters in the alphabet (besides itself).  If it is PENALTY, then the wildcard is to be penalized wherever it is seen.

=back

=cut
# Wildcard-treatment codes stored in the wildcard_ member.  Kept as plain
# subs without prototypes so existing bareword call sites parse unchanged.
sub LITERAL         { return 1; }
sub LEXICAL         { return 2; }
sub PENALTY         { return 3; }



=head1 FUNCTIONS

=over 8

=item getClassname(modelName)

This function gets the classname for a model type after checking if it exists.  If the class does not exist, it dies.  This is not an object method.

=cut
# Resolve a short model name (e.g. "WAM") to its class name underneath this
# package, loading the class on demand.  This is a plain function, not a
# method.
#
#   model   - short model name; must be a plain (optionally ::-separated)
#             identifier.
#
# Returns the fully qualified class name.  Dies if no name is given, if the
# name is not an identifier, or if the class cannot be loaded.
sub getClassname {
    my ($model) = @_;

    die __PACKAGE__.": No model name was given.\n" unless(defined($model));

    # The name is spliced into a string eval below, so anything that is not a
    # simple identifier is rejected instead of being executed as Perl code.
    die __PACKAGE__.": The model $model does not exist in iPE.\n"
        unless($model =~ m/\A\w+(?:::\w+)*\z/);

    my $classname = __PACKAGE__."::$model";

    # Test the eval's own return value rather than $@, which can be clobbered
    # between the eval and the check.
    my $loaded = eval "require $classname; 1";
    die __PACKAGE__.": The model $model does not exist in iPE.\n"
        unless($loaded);

    return $classname;
}

=item new (tag, attributes, data, element)

The constructor here initializes based on the attributes.  If you are interested in further initialization based on the XML attributes, your new code should call this superclass constructor as follows:

    sub new {
        my $class = shift;
        my ($tag, $att, $data, $element) = @_;
        my $this = $class->SUPER::new(@_);

        #initialization here.
    }

=cut
# Constructor.  Called with the element's tag name, its attribute hash
# reference, its character data and the element object; all four are passed
# on unchanged to the superclass constructor.
#
# Dies when the source alphabet is unknown, the ordinal is not a non-negative
# integer, the symbols attribute is missing or contains no digit, or the
# wildcard attribute is not "literal", "lexical" or an integer.
sub new {
    my $class = shift;
    my ($tag, $att, $data, $element) = @_;
    my $this = $class->SUPER::new(@_);

    # Map the XML tag onto a model-type constant.  An unrecognized tag leaves
    # type_ untouched.
    my %typeOf = (
        string_model          => MODEL(),
        default_string_model  => DEFAULT_MODEL(),
        string_submodel       => SUBMODEL(),
        fixed_string_model    => FIXED_MODEL(),
        fixed_string_submodel => FIXED_SUBMODEL(),
    );
    $this->{type_} = $typeOf{$tag} if(defined($tag) && exists($typeOf{$tag}));

    # Direct method call: this package defines its own new(), which makes the
    # indirect-object form "new iPE::Globals()" depend on parser heuristics.
    my $g = iPE::Globals->new();
    $this->{name_} = $att->{name};
    $this->{source_} = $att->{source};
    die __PACKAGE__." new: No $att->{source} alphabet known.\n"
        unless($g->seqtype_exists($att->{source}));
    $this->{seqClass_} = $g->seqclass($att->{source});
    # The result of this load attempt is not checked here.
    eval "require $this->{seqClass_}";
    $this->{model_} = $att->{model};
    $this->{ordinal_} = defined($att->{ordinal}) ? $att->{ordinal} : 0;
    die __PACKAGE__.": Ordinal must be 0 or greater for model $this->{name_}.\n"
        if($this->{ordinal_} =~ m/[^\d]/ || $this->{ordinal_} < 0);

    $this->{samplingRate_} =
        defined($att->{sampling_rate}) ? $att->{sampling_rate} : 1;

    if(defined($att->{begin})) {
        $this->{interval_} =
            iPE::Util::Interval->new({low => $att->{begin},
                                      high => $att->{end},
                                      letter => "L"});
        $this->{length_} = $this->{interval_}->length;
    }
    else {
        # No explicit bounds: "." is used for both ends and length_ stays
        # unset.
        $this->{interval_} =
            iPE::Util::Interval->new({low => ".", high => ".", letter => "L"});
    }

    # Zoe output variables--these are generally just there for the purpose of
    # outputting zoe formatted parameter files, and not of functional use to
    # iParameterEstimation, with the exception of submodels.
    $this->{zName_} = $att->{zoe_name};
    $this->{zModel_} = $att->{zoe_model};
    $this->{zLength_} = $att->{length};
    $this->{zFocus_} = $att->{focus};
    $this->{zNSubmodels_} = $att->{submodels};

    # The wildcard is in use when the symbols attribute is exactly one more
    # than the size of the alphabet.
    my $alphaSize = scalar(@{$this->{seqClass_}->getAlphabet});
    $this->{ambiguate_} = 0;
    die "Require a number in the symbols attribute for $this->{name_}\n"
        unless(defined($att->{symbols}) && $att->{symbols} =~ m/[\d]/);
    if($alphaSize == $att->{symbols}-1) {
        $this->{ambiguate_} = 1;
    }
    elsif(defined($att->{wildcard})) {
        warn "Defined wildcard attribute without including enough symbols for ".
            "including a wildcard character\nin the alphabet for model ".
            "$this->{name_}.  Use ".($alphaSize+1)." symbols for wildcard.\n";
    }

    $this->{zSymbols_} = $att->{symbols};

    # Translate the wildcard attribute into a treatment constant.  A missing
    # attribute means LITERAL; an integer selects PENALTY with that value.
    $this->{wildcardPenalty_} = 0;
    my $wildcard = $att->{wildcard};
    if(!defined($wildcard) || $wildcard =~ m/^literal$/) {
        $this->{wildcard_} = LITERAL();
    }
    elsif($wildcard =~ m/^lexical$/) {
        $this->{wildcard_} = LEXICAL();
    }
    elsif($wildcard =~ m/^-?[0-9]+$/) {
        # Only a real (optionally negative) integer is accepted; strings such
        # as "1-2" or "--" are reported as illegal below.
        $this->{wildcard_} = PENALTY();
        $this->{wildcardPenalty_} = $wildcard;
    }
    else {
        die "Illegal value $att->{wildcard} for wildcard attribute of ".
            "$this->{name_}.\nMust be literal, lexical or a number.\n";
    }

    $this->{settings_} = {};
    $this->_parseData($att->{data});

    $this->{nullModel_} = $att->{null_model};
    $this->{nullParams_} =
        (defined($att->{null_params}) && $att->{null_params} == 1) ? 1 : 0;

    # Fixed models and null-parameter models are never counted.
    $this->{countable_} =
        ($this->type != FIXED_MODEL() && $this->type != FIXED_SUBMODEL() &&
        !$this->{nullParams_});

    # these are the actual submodels in an array reference.
    $this->{submodels_} = [];

    #these are non-XML pre-defined attributes
    if(defined($data)) {
        # if this is a fixed model, then we have data in this field.
        $this->{fixedData_} = $data;
        $this->setParamString($data);
    }

    $this->handle_submodels($element);
    if(scalar(@{$this->{submodels_}})) {
        $this->{submodels_} =
            [ sort {$a->ordinal <=> $b->ordinal} @{$this->submodels} ];
    }
    $this->init;
    $this->clear if($this->{countable_} || $this->{nullParams_});

    # Replace the XML-supplied submodel count with the computed one.
    $this->{zNSubmodels_} = $this->numZoeSubmodels;

    return $this;
}
=item name (), source (), seqClass (), interval (), zModel (), zLength (), zFocus (), zSymbols (), ambiguate (), data (), type (), fixedData (), ordinal ()

These are accessor methods to the class.  None are valid lvalues.

name, source and model indicate the name of the region, for example "Acceptor," the source of the model, e.g. "dna" or "cons", and the type of model, respectively.  These are used for outputting routines, except the model which is also used to indicate the kind of model upon instantiation.  seqClass is the class name of the sequence type that this model will be predicted on.

interval is an iPE::Util::Interval defining the boundaries within the parent region where the model will be estimated.  (A parent region is just the interval itself if the model is not a submodel, otherwise it is the region one generation up.)  The coordinates are given relative to the beginning of the parent region.  length is the length of the model defined by this interval.

zName, zModel, zLength, zFocus and zSymbols are present entirely for the sake of outputting to Zoe formatted parameter files.  zModel is only present if the model name outputted in zoe format differs from the name given in XML format.

ambiguate indicates whether the ambiguous wildcard character should be used in the model.  wildcard will be set to one of three things, LEXICAL, LITERAL, or PENALTY.  If it is LEXICAL, that indicates that the wildcard could be treated as all four possible characters, and the counts for all characters in each wildcard position should be added together.  This can be accomplished with the included function lexicalAmbiguateMarkovChain.  If it is LITERAL, all the counts for the wildcard character, as observed, should be computed to probabilities, and nothing needs to be done.  If it is PENALTY, then the function penalizeAmbiguousNmers should be used to change the scores when scoring.

nullModel refers to whether an analogous null model is automatically added in as a log-odds ratio.  If it is not, then the countNullRegion () method will never be called (see below), and no null counts will be found.  This will affect the way a region deals with the null region.  If nullParams is true, then the model acts as a null region, and all its parameters are set to 0 (log(Pr(+)/Pr(-)) = 0).

data is a model-specific optional member variable which indicates something about the model which is meaningful to the model.

type indicates the type of model (see the constant definitions above).

fixedData is an optional member variable which is present if the model is a fixed model and is not estimated.

ordinal refers to submodel ordinality.  If a parent model requires that the submodels have an order, the ordinal refers to its position in the ordered group of submodels.

=cut
# Read-only accessors.  Each one returns the member of the invocant hash that
# carries the accessor's name followed by an underscore.
sub name            { return $_[0]{name_};            }
sub source          { return $_[0]{source_};          }
sub seqClass        { return $_[0]{seqClass_};        }
sub model           { return $_[0]{model_};           }
sub interval        { return $_[0]{interval_};        }
sub regLength       { return $_[0]{regLength_};       }
sub length          { return $_[0]{length_};          }
sub zName           { return $_[0]{zName_};           }
sub zModel          { return $_[0]{zModel_};          }
sub zLength         { return $_[0]{zLength_};         }
sub zFocus          { return $_[0]{zFocus_};          }
sub zSymbols        { return $_[0]{zSymbols_};        }
sub zNSubmodels     { return $_[0]{zNSubmodels_};     }
sub ambiguate       { return $_[0]{ambiguate_};       }
sub wildcard        { return $_[0]{wildcard_};        }
sub wildcardPenalty { return $_[0]{wildcardPenalty_}; }
sub data            { return $_[0]{data_};            }
sub nullModel       { return $_[0]{nullModel_};       }
sub nullParams      { return $_[0]{nullParams_};      }
sub type            { return $_[0]{type_};            }
sub fixedData       { return $_[0]{fixedData_};       }
sub ordinal         { return $_[0]{ordinal_};         }
sub samplingRate    { return $_[0]{samplingRate_};    }

=item countable ()

Indicates if the model is suitable for counting.  A model that shouldn't be counted might be a FIXED_MODEL.

=cut
# Returns the countable_ flag of the invocant.
sub countable { return $_[0]{countable_}; }


=item submodels ()

Accessor function for the array of submodels.  lvalue is returned.

=cut
# Lvalue accessor for the submodels_ member (an array reference), so callers
# may also assign through it.
sub submodels :lvalue { $_[0]{submodels_} }


=item handle_submodel (tag, att, data, element)

Callback function for start tags within an (fixed_)?string_(sub)?model.  This would indicate that the model is a submodel.

=cut
# Builds one submodel from a nested element and appends it to this model's
# submodel list.  The attribute hash is modified in place before it is handed
# to the submodel's constructor.  Dies on a tag that is not a submodel tag.
sub handle_submodel {
    my ($this, @elementArgs) = @_;
    my ($tag, $att) = @elementArgs;

    # settings the submodel inherits from its parent
    $att->{null_model}  = $this->nullModel;
    $att->{null_params} = $this->nullParams;

    my %submodelType = (
        string_submodel       => SUBMODEL(),
        fixed_string_submodel => FIXED_SUBMODEL(),
    );
    die "unexpected submodel tag: $tag\n" unless(exists($submodelType{$tag}));
    $att->{type} = $submodelType{$tag};

    # the submodel always reads the same source as its parent
    $att->{source} = $this->source;

    my $submodelClass = getClassname($att->{model});
    my $submodel = $submodelClass->new(@elementArgs);
    push(@{$this->submodels}, $submodel);
}

# Cache of the two global options used by logScore.  They are filled in on
# the first call so the Globals lookup happens only once.
our $negInf = undef;
our $scale = undef;

=item logScore (pos [, null] )

This function will return the log-odds score of a ratio of frequencies.  This is intended as a utility function for all models.  If there is no null model, a log score is returned instead.

=cut
# Returns the scaled base-2 log score, rounded half away from zero.
#   pos   - positive-model frequency; a false value scores as $negInf.
#   null  - optional null-model frequency; when it is defined but zero the
#           score is -$negInf.
sub logScore {
    my ($this, $pos, $null) = @_;

    if(!defined($negInf)) {
        # Direct method call: this package defines its own new(), which makes
        # the indirect-object form "new iPE::Globals()" depend on parser
        # heuristics.
        my $g = iPE::Globals->new();
        $negInf = $g->options->sequenceNegInf;
        $scale = $g->options->scaleFactor;
    }

    my $score;
    if(!$pos) {
        $score = $negInf;
    }
    elsif(!defined($null)) {
        $score = $scale*log($pos)/log(2);
    }
    elsif($null) {
        $score = $scale*log($pos/$null)/log(2);
    }
    else {
        #here we accommodate for the chance that we get no null example
        #which is unlikely, but should be present.
        $score = -$negInf;
    }

    # this is sam's method for rounding
    #(chauchun's method would be: $score = POSIX::floor($score);)
    return ($score < 0) ? int($score - 0.5) : int($score + 0.5);
}

=item lexicalAmbiguateMarkovChain (href, order)

This function will take a hash ref and add the counts of all contexts with wildcard characters (e.g. "N") in them to the counts.

=cut
# For every context of the given order that contains the wildcard, replaces
# the counts of that context followed by each alphabet letter with the sum of
# the counts over all contexts the wildcard context expands to.
#
#   href   - hash reference of counts keyed by context.letter; updated in
#            place.
#   order  - context length passed to getAllSequences.
#
# Does nothing when the sequence class reports no wildcard.
sub lexicalAmbiguateMarkovChain {
    my ($this, $href, $order) = @_;

    my $seqClass = $this->seqClass;
    my $wc = $seqClass->getWildCard();
    return if(!defined($wc) || $wc eq '');

    my @alphabet = @{$seqClass->getAlphabet()};

    for my $nmer ($seqClass->getAllSequences($order, 1)) {
        # Literal substring test: interpolating the wildcard into a regex
        # would treat a character such as "." or "*" as a metacharacter and
        # wipe the counts of ordinary contexts.
        next if(index($nmer, $wc) < 0);

        for my $letter (@alphabet) {
            $href->{$nmer.$letter} = 0;
        }
        for my $ctx ($seqClass->expandAmbigSeq($nmer)) {
            for my $letter (@alphabet) {
                # a context that was never observed contributes nothing
                $href->{$nmer.$letter} += ($href->{$ctx.$letter} || 0);
            }
        }
    }

    return;
}

=item penalizeAmbiguousNmers (href, order)

This function will take a hash ref and put the wildcard penalty (stored in wildcardPenalty) in each nmer (n=order+1) which includes the wildcard character.

=cut
# Stores the wildcard penalty for every n-mer of length order+1 that contains
# the wildcard.
#
#   href   - hash reference keyed by n-mer; updated in place.
#   order  - model order; n-mers of length order+1 are examined.
#
# Does nothing when the sequence class reports no wildcard.
sub penalizeAmbiguousNmers {
    my ($this, $href, $order) = @_;

    my $seqClass = $this->seqClass;
    my $wc = $seqClass->getWildCard();
    return if(!defined($wc) || $wc eq '');

    my $penalty = $this->wildcardPenalty;

    for my $nmer ($seqClass->getAllSequences($order+1, 1)) {
        # Literal substring test, so a wildcard that happens to be a regex
        # metacharacter cannot match (and penalize) every n-mer.
        $href->{$nmer} = $penalty if(index($nmer, $wc) >= 0);
    }

    return;
}
=back

=head1 STUB FUNCTIONS

The following functions are partially or not implemented in the base class and are necessary for constructing a new model.

=over 8

=item numZoeSubmodels ()

This function simply indicates how many zoe-type submodels your model will output.  The normal version returns the number of submodels.  In general this does not have to be overridden.  The SDT and CDS model are special cases where one model implies several models.

=cut
# Returns the number of entries in this model's submodel list.
sub numZoeSubmodels {
    my ($this) = @_;
    my $subs = $this->submodels;
    return scalar(@{$subs});
}

=item init ()

This function is actually defined in the XML::Object superclass.  Its meaning in the context of the model is to initialize any variables which are not initialized by default.  Generally these fall under the "data" field, which are specific to the model.

=item clear ()

This function is intended to clear all values of all parameters to 0.  It is called immediately after initialization.  Pseudocounting should not be done here, as it is handled by the object's smoother.

=cut
# Base-class stub: does nothing and returns nothing.
sub clear { return; }

=item countRegion (region)

This is completely unimplemented in the base class.  When a model is asked to count a region, it is expected to either put it in the counts for the appropriate class or delegate the region to the appropriate submodel depending on the region that it is counting.  See iPE::Region for more information about regions.

=cut

# Base-class stub: does nothing and returns nothing.
sub countRegion { return; }

=item countNullRegion (region)

Null regions are added to the transcripts as a post-processing step to the annotation.  When counting, the Estimator finds these null regions and counts all the models against this region.  When you count this region, you are expected to count exactly the same as you would the positive region, but in separate arrays.  When you score, you will take the log of all the normalized values and use them for an odds ratio.

=cut

# Base-class stub: does nothing and returns nothing.
sub countNullRegion { return; }

=item smooth ()

Here you should use your Smoother class (via $this->smoother from iPE::Model) to smooth the counts of your distribution.

=cut
# Base implementation: forwards smooth to each countable submodel, in list
# order.  Submodels that are not countable are skipped.
sub smooth {
    my $this = shift;

    foreach my $submodel (@{ $this->submodels }) {
        $submodel->smooth if($submodel->countable);
    }
}

=item normalize ()

Take all the parameters you have counted and normalize them.  

=cut

# Base implementation: forwards normalize to each countable submodel, in list
# order.  Submodels that are not countable are skipped.
sub normalize {
    my $this = shift;

    foreach my $submodel (@{ $this->submodels }) {
        $submodel->normalize if($submodel->countable);
    }
}

=item score ()

Take all the normalized scores and take the log-odds ratio of the positive model over the null model.

=cut

# Base implementation: forwards score to each countable submodel, in list
# order.  Submodels that are not countable are skipped.
sub score {
    my $this = shift;

    foreach my $submodel (@{ $this->submodels }) {
        $submodel->score if($submodel->countable);
    }
}

=item getZoeHeader ()

This function returns the generic Zoe-format header for the output string.  If additional things need to be printed out for the zoe header, you may override the getZoeHeaderEnd () function.  This function need not be overridden.

=cut
# Builds the one-line Zoe header: name, model, zoe sequence type, length,
# focus, symbols, submodel count and the subclass-supplied header end, each
# separated by a single space and terminated by a newline.  The zoe-specific
# name and model are used when they are defined, otherwise the XML ones.
sub getZoeHeader {
    my ($this) = @_;

    my $model = defined($this->zModel) ? $this->zModel : $this->model;
    my $name  = defined($this->zName)  ? $this->zName  : $this->name;

    # Direct method call: this package defines its own new(), which makes the
    # indirect-object form "new iPE::Globals()" depend on parser heuristics.
    my $g = iPE::Globals->new();

    # Plain concatenation keeps every call in scalar context.
    my $head = $name." ".$model." ".$g->zoe_seqtype($this->source)." ";
    $head .= $this->zLength." ";
    $head .= $this->zFocus." ";
    $head .= $this->zSymbols." ";
    $head .= $this->zNSubmodels." ";
    $head .= $this->getZoeHeaderEnd."\n";

    return $head;
}

=item getSSHeader (counts, order, numContexts)

Retrieves a default significant statistics header for this model.

You need to pass the total counts observed in this model in counts, the order of the model, and the total number of distinct contexts observed in the estimation.

=cut
# Builds the default sufficient-statistics header text.
#   counts       - total counts observed (written as LENGTH).
#   order        - model order (TUPLE_SIZE is order+1).
#   numContexts  - number of distinct contexts (written as NTUPLES).
# Sequence names come from the globals object and the alphabet from this
# model's sequence class.  The text ends with a blank line.
sub getSSHeader {
    my ($this, $counts, $order, $numContexts) = @_;

    # Direct method call: this package defines its own new(), which makes the
    # indirect-object form "new iPE::Globals()" depend on parser heuristics.
    my $g = iPE::Globals->new();
    my @seqNames = @{$g->ssSeqNames()};
    my @alphabet = @{$this->seqClass->getAlphabet};

    my $ssHeader = "NSEQS = ".scalar(@seqNames)."\n";
    $ssHeader .= "LENGTH = $counts\n";
    $ssHeader .= "TUPLE_SIZE = ".($order+1)."\n";
    $ssHeader .= "NTUPLES = $numContexts\n";
    $ssHeader .= "NAMES = ".join(",", @seqNames)."\n";
    $ssHeader .= "ALPHABET = ".join("", @alphabet)."\n";
    $ssHeader .= "NCATS = -1\n\n";

    return $ssHeader;
}

=item getZoeHeaderEnd ()

This function should be overridden if there is additional data about the model in order for it to be interpreted by the Zoe input.  For example, the WAM header requires the order be printed at the end of the line.  This should be done here.

=cut
# Base implementation: no extra header text.
sub getZoeHeaderEnd { return ""; }

=item hasSettings ()

If true (1), the model has the new settings=value formatted data attribute.  This is deprecated on arrival, but present temporarily while transitioning to the new format.

=cut
# Returns the hasSettings_ flag of the invocant.
sub hasSettings { return $_[0]{hasSettings_}; }

=item settings ()

Returns a hash of settings and their values.

=cut
# Returns the settings_ member (a hash reference of setting => value).
sub settings { return $_[0]{settings_}; }

=item outputPrepare (out, mode)

This function is called before outputting occurs.  You should set the parameter string here.  You can set the parameter string with the setParamString () method.  It is necessary to do it this way in order to be able to output your model in XML format.  Use the out object (iPE::Util::Output) with the 'floatf' functions and the 'intf' functions.

The mode string will indicate one of the "count", "prob", or "score" modes.  Depending on which it is, your parameter string should be prepared as such.

=cut
# Base implementation: forwards outputPrepare, with the same output object
# and mode, to every submodel that is not a fixed model or fixed submodel.
sub outputPrepare {
    my ($this, $out, $mode) = @_;

    foreach my $submodel (@{ $this->submodels }) {
        # fixed (sub)models are left alone
        next if($submodel->type == FIXED_MODEL()
             || $submodel->type == FIXED_SUBMODEL());
        $submodel->outputPrepare($out, $mode);
    }
}


=item outputZoe (out, mode)

This method should print out the parameters that were estimated.  Use the getParamString () method of this class to get the parameter string and print it.

If you wish not to construct the header to your model, you may use the one in iPE::Model::Emission by calling getZoeHeader ().

You must put a newline character in before your parameters are output.

The mode string will indicate one of the "count", "prob", or "score" modes.  

=cut
# Prints this model in Zoe format: the indented header first, then, one
# indent level deeper, either every submodel (recursively) or, when there are
# no submodels, this model's own parameter string.
sub outputZoe {
    my ($this, $out, $mode) = @_;

    my $headerLine = $out->indent.$this->getZoeHeader;
    $out->print($headerLine);

    $out->increaseIndent;
    my @children = @{ $this->submodels };
    if(@children) {
        foreach my $child (@children) {
            $child->outputZoe($out, $mode);
        }
    }
    else {
        $out->printPCData($this->getParamString());
    }
    $out->decreaseIndent;
}

# Stores the data attribute (minus leading whitespace and one trailing
# newline) in data_ and, when it uses the setting=value format, fills
# settings_ from its whitespace-separated pairs.
#
# hasSettings_ is set to 1 for setting=value data and to 0 for other
# non-blank data; it is left untouched when the data is missing or blank.
# Dies on a pair that is not of the form setting=value.
sub _parseData {
    my ($this, $data) = @_;

    $data = "" if(!defined($data));
    chomp($data);
    $data =~ s/^\s*//;
    $this->{data_} = $data;

    return if($data !~ m/\S/);

    if($data !~ m/=/) {
        # old format: free-form data, no settings
        $this->{hasSettings_} = 0;
    }
    else {
        # new format
        $this->{hasSettings_} = 1;
        foreach my $pair (split(' ', $data)) {
            my ($key, $value) = $pair =~ m/(\S+)=(\S+)/
                or die("Improperly formatted setting $pair.  Use the ".
                       "setting=value format\nwithout any spaces around ".
                       "the =. separate setting/value pairs with spaces.");
            $this->{settings_}->{$key} = $value;
        }
    }
}

=back

=head1 SEE ALSO 

L<iPE::Model>, L<iPE::XML::Object>, L<iPE::gHMM>

=head1 AUTHOR

Bob Zimmermann (rpz@cse.wustl.edu).

=cut

1;
