=head1 NAME

iPE::Model::Emission::SDT - Sequence Decision Tree model.

=head1 DESCRIPTION

This model requires submodels.  It takes in features, and based on the sequence of the feature, decides which submodel to pass the feature on to.  The model uses the ambiguity codes (and canonical DNA codes) of the submodels in order to determine which model the submodel belongs to.

=head1 FUNCTIONS

=cut

package iPE::Model::Emission::SDT;
use iPE;
use iPE::Globals;
use iPE::Model;
use iPE::Model::Emission;
use base("iPE::Model::Emission");
use bytes;
use strict;

sub init {
    my ($this) = @_;

    die "SDT models will only work with dna sequences.\n"
        if($this->{source_} ne "dna");

    die "SDT models require at least one submodel.\n" 
    if(scalar(@{$this->submodels}) < 1);

    for my $model (@{$this->submodels}) {
        die "Missing required ordinal for SDT in @{[$this->name]} ".
            "submodel @{[$model->name]}.\n"
            if (!defined $model->ordinal && 
                    ($model->type == iPE::Model::Emission::SUBMODEL() ||
                     $model->type == iPE::Model::Emission::FIXED_SUBMODEL()));
        die "All ordinals must be of positive value.\n"
            if($model->ordinal < 0);
    }

    my $prev_ord = -1;
    $this->{orderedRegExps_} = [];
    $this->{submodelHash_} = {};
    $this->{posCountsHash_} = {};
    $this->{nullCountsHash_} = {};
    $this->{posProbHash_} = {};
    $this->{nullProbHash_} = {};
    for my $model (@{$this->submodels}) {
        next if($model->type != iPE::Model::Emission::SUBMODEL());

        die "SDT model ordinals must be of unequal value.\n"
            if($model->ordinal == $prev_ord);
        $prev_ord = $model->ordinal;

        #create a regular expression that represents the ambiguity code
        #given for the name of the SDT submodel
        die "SDT ".$this->name." submodel name \"".$model->name."\" length ".
            "is incorrect.  \nAll SDT submodels must have names with strings ".
            "equal to the length of the model.\n"
            if(length($model->name) != $this->length);
        my $re = $this->seqClass->ambigSeqToRegExp($model->name);

        push @{$this->{orderedRegExps_}}, $re;
        $this->{submodelHash_}->{$re} = $model;
        $this->{posCountsHash_}->{$re} = 0;
        $this->{nullCountsHash_}->{$re} = 0;
        $this->{posProbHash_}->{$re} = 0;
        $this->{nullProbHash_}->{$re} = 0;
    }
}

#The number of submodels in an SDT according to ZOE is the expansion of the
#consensus sequences in ambiguity codes.  The consensus sequences are in the
#model names, and can be expanded with the sequence class.
sub numZoeSubmodels {
    my ($this) = @_;
    my $n = 0;
    for my $model (@{$this->submodels}) {
        my @subseqs = $this->seqClass->expandAmbigSeq(
            $model->name, [ $this->seqClass->getWildCard() ]);
        $n += scalar(@subseqs);
    }
    return $n;
}

sub submodelHash { shift->{submodelHash_} }

sub outputZoe {
    my ($this, $out, $mode) = @_;

    $out->print($out->indent.$this->getZoeHeader);
    $out->increaseIndent;
    for my $model (@{$this->submodels}) {
        if ($model->type == iPE::Model::Emission::FIXED_MODEL() || 
            $model->type == iPE::Model::Emission::FIXED_SUBMODEL()) {
            $model->outputZoe($out, $mode);
        }
        else {
            my @subseqs = $this->seqClass->expandAmbigSeq(
                $model->name, [ $this->seqClass->getWildCard() ]);
            my $old_name = $model->name;
            for my $subseq (@subseqs) {
                $model->{name_} = $subseq;
                $model->outputZoe($out, $mode);
            }
            $model->{name_} = $old_name;
        }
    }
    $out->decreaseIndent;
}

sub _count {
    my ($this, $region, $null) = @_;

    #In this function we construct a substring and using the regular expressions
    #defined for the SDT, determine the positions sites for counting.

    #The SDT will have to match a regular expression to the whole
    #length of the site.  Since the model could have been cut up by 
    #segmentation, it is necessary to create a string which represents the
    #entire sequence where the model region begins and ends.
    my $model_start = $region->parentStart;

    #If the null region is being counted, we are given a region bigger than
    #the actual length of the model.  If we are couting the positive model, 
    #we will be given a region that is at most as long as the site length.
    #The length of the string will be the maximum of the length or the
    my $str_len = $region->parentEnd-$region->parentStart+1;
    return if($this->interval->length > $str_len);

    #now $str will represent our search space for sites.
    my $str;
    if($region->seq->loaded) {
        $str = substr ${$region->strRef}, $model_start, $str_len;
    }
    else {
        $str = $region->seq->getContext($region->strand, 
            $region->parentEnd, $str_len-1);
    }

    #declare these early for less garbage collection.
    my ($match_start, $match_end);

    if($null) {
        msg("$this->{name_} sub region (".$region->start.", ".$region->end.
            ") $str_len\n");
    }

    #optimization
    my $length = $this->interval->length;
    my $start = $region->start;

    for my $re (@{$this->{orderedRegExps_}}) {
        my $submodel = $this->{submodelHash_}->{$re};
        while($str =~ m/$re/g) {
            #We have found a match in the smaller string, and now we need
            #to map the coordinates of the match back onto the larger sequence.
            #The smaller sequence begins at $model_start, and the match
            #ends at pos($str)-1 within the smaller string.
            $match_start = $model_start+pos($str)-$length;
            $match_end   = $model_start+pos($str)-1;
            next if ($match_end < $start);

            my @translatedCoords = $submodel->interval->translate($match_start, 
                $match_start+$length);
            my $subregion = $region->subregion(@translatedCoords);
            next if (!defined $subregion);

            if($null) { 
                $submodel->countNullRegion($subregion);
                $this->{nullCountsHash_}->{$re} += $region->weight
            }
            else { 
                $submodel->countRegion($subregion);
                $this->{posCountsHash_}->{$re} += $region->weight
            }

            #In order to prevent progressive matching leapfrogging any sites
            #that may be found later, we reset the position to 1 after the match
            pos $str -= $length-1;

            # only count one sample for the positive model, we only iterate
            # through the entire sequence if we are counting pseudosites
            last if(!$null);
        }
    }
}

sub countRegion     { shift->_count(@_, 0)  }
sub countNullRegion { shift->_count(@_, 1)  }

sub normalize {
    my $this = shift;
    $this->SUPER::normalize(@_);

    #normalize all the probabilities of seeing each leaf node on the 
    #decision tree.
    my $totalPos = 0;
    my $totalNull = 0;
    for my $re (keys %{$this->{submodelHash_}}) {
        $totalPos  += $this->{posCountsHash_}->{$re};
        $totalNull += $this->{nullCountsHash_}->{$re};
    }

    for my $re (keys %{$this->{submodelHash_}}) {
        $this->{posProbHash_}->{$re}  = 
            $this->{posCountsHash_}->{$re}/$totalPos if($totalPos);
        $this->{nullProbHash_}->{$re} = 
            $this->{nullCountsHash_}->{$re}/$totalNull if($totalNull);
    }

    return if($this->source ne "dna");

    my $g = new iPE::Globals();

    #set all the given letters (where the probability of the letter being
    #there is 1, e.g. letters in the start codon), to the prior probability
    #of seeing the letter in the sequence.
    my $re = $this->seqClass->ambigSeqToRegExp($this->seqClass->getWildCard);
    for my $model (@{$this->submodels}) {
        next if(!$model->countable);
        my $name = $model->name;
        while($name =~ m/($re)/g) {
            my $l = $1;
            $model->setNullProb(pos($name)-1, $l, $g->levels->cumulative->{$l});
        }
    }

    #adjust the probability of the leaf nodes to be the probability of seeing
    #being that leaf node.
    # this is equivalent to treating each leaf node as a state, and having
    # independent emission probabilities associated with it.
    #$re = "[".(join("", (keys(%{$this->seqClass->getAmbigCodes()}), 
        #@{$this->seqClass->getAlphabet})))."]";
    #my $wc = $this->seqClass->getWildCard;
    #$re =~ s/$wc//;
    for my $modelRe (@{$this->{orderedRegExps_}}) {
        my $model = $this->{submodelHash_}->{$modelRe};
        my $name = $model->name;
        my $ambigLetter = substr($model->name, 0, 1);
        msg("P+($name) = $this->{posProbHash_}->{$modelRe}\nP-($name)".
            " = $this->{nullProbHash_}->{$modelRe}\n");
        for my $letter ($this->seqClass->expandAmbigSeq($ambigLetter)) {
            $model->adjustPosProb(0, $letter,
                $this->{posProbHash_}->{$modelRe});
        }
    }

}

sub score {
    my $this = shift;
    $this->SUPER::score(@_);
}

=head1 SEE ALSO

L<iPE::Model::Emission::SDT>

=head1 AUTHOR

Bob Zimmermann (rpz@cse.wustl.edu)

=cut

1;
