#!/usr/bin/perl -w

use strict;
use Data::Dumper;
use Getopt::Std;
use POSIX;
use FindBin qw($Bin);  #$Bin is the directory with the script
use lib $Bin;        #add bin to the library path
use shared;

my $USAGE = "Usage: $0 [options] advisor_file gtf_file

Count for each exon, gene and optionally intron, how many exon
positions are supported and in conflict with the evidence as
summarized in the advisor file. Resulting gtf is written on the
standard output.

Exon or intron position is supported, if the advisor value for its
label is maximum among all labels for that position and is greater
than the given threshold. Position is in conflict, if its
value is less than one. Transcript support is computed only from 
exon statistics (not introns).

Consider rows with feature CDS unless option -e is specified, in
which case consider rows with feature exon. If frame is not specified,
consider the best of all frames for each position.

Options:
-e             Consider feature exon instead of CDS
-i             Score introns as well as exons
-t threshold   Use given threshold to count position as supported in exon
               (default 1)
-T threshold   Use given threshold to count position as supported in intron
               (default 1)
";

##
# Absolute frame of an exon: What label would be in pos. 0 of the sequence
# if the exon extends that far?
#
# Labels from our labeling in different frames:
#
#            Position of the sequence (counting from 0)
#Abs. frame      0  1  2  3  4  5 
# ---------------------------------
#       0        0  1  2  0  1  2
#       1        1  2  0  1  2  0
#       2        2  0  1  2  0  1
#       3        3  5  4  3  5  4
#       4        4  3  5  4  3  5
#       5        5  4  3  5  4  3

my %Options;
getopts('eit:T:', \%Options);
if(!exists $Options{'t'}) {
    $Options{'t'} = 1;
}
if(!exists $Options{'T'}) {
    $Options{'T'} = 1;
}

if(scalar @ARGV != 2) {
    die $USAGE;
}
my ($advisor_file, $gtf_file) = @ARGV;
my $exon_feature = (exists $Options{'e'}) ? 'exon' : 'CDS';
my %score_features;  #which features (rows of gtf) to score
$score_features{$exon_feature} = 1;
if(exists $Options{'i'}) { $score_features{'intron'} = 1; }


my $LAB = "x012beiad345BEIADyY";

local *GTF_FILE;
open GTF_FILE, "<$gtf_file" or die "Cannot open $gtf_file";

local *ADVISOR_FILE;
open ADVISOR_FILE, "<$advisor_file" or die "Cannot open $advisor_file";

while(1) {    #while not eof
    
    #read advisor and process to binary strings
    my ($name, $advisor) = read_advisor(*ADVISOR_FILE, $Options{'t'}, $Options{'T'});
    last if(!defined($name));

    $name = normalize_fasta_name($name);
    print STDERR "$name\n";

    my $rows = read_gtf(*GTF_FILE, $name);

    score_rows($rows, $advisor);

    print_gtf(*STDOUT, $rows);
}

my $line = <GTF_FILE>;
die "GTF file too long" if defined($line);

close GTF_FILE or die;
close ADVISOR_FILE or die;

############################
sub print_gtf
{
    my ($out, $rows) = @_;
    
    foreach my $row (@$rows) {
	print $out $row, "\n";
    }
}

############################
sub score_str {
    my ($good, $bad, $chr) = @_;

    $good = $good || 0;   #replace undefined values
    $bad = $bad || 0;
    my $all = $good+$bad;
    $all = $all || 1;     #do not allow zero

    my $ratio = $good/$all;
    my $num = POSIX::floor($ratio*10);
    die "$chr $good $bad" unless $num>=0 && $num<=10;

    return ($chr x $num) . ('-' x (10-$num));
}

############################
sub parse_row {
    my ($row, $check) = @_;

    my ($seq, $source, $feature, $from, $to, $score, $strand, $frame, $rest) 
	= split "\t", $row;

    return unless defined($rest);
    return unless !$check 
	|| exists $score_features{$feature} && $from <= $to && $strand =~ /^[+-.]$/ 
	&& $frame =~ /^[012.]$/;

    my $transcript;
    foreach my $part (split ';', $rest) {
	if($part =~ /^\s*transcript_id\s*\"(.*)\"\s*$/) {
	    $transcript = $1;
	}
    }

    return unless defined($transcript);

    return ($feature, $from, $to, $strand, $frame, $transcript);
}



############################
sub score_rows {
    my ($rows, $advisor) = @_;

    my %transcripts;    #$transcripts{$id}{good/bad}


    #for each sequence position %3 and gtf frame give absolute frame for advisor 
    my @frame = ([0,2,1,3,5,4], [2,1,0,4,3,5], [1,0,2,5,4,3]);

    my $len = length($advisor->{'0'});

    #first score all exons
    foreach my $row (@$rows) {

	my ($feature, $from, $to, $strand, $frame, $transcript) = parse_row($row, 1);

	#skip bad rows, rows that are not scored
	next if(!defined($feature));

	#coordinates in gtf are from 1
	$from--;
	$to--;

	#skip regions outside of sequence
	next if($from<0 || $to>=$len);

	#skip one nucl. from each side to avoid other labels
        # (a,d, e, b)
	my $from_i = $from+1;
	my $to_i = $to-1;

	my @labels;   #which labels from $advisor to use to compute result
	if($feature eq $exon_feature) {

	    #if frame known, determine absolute frame and look up string
	    if($frame ne '.' && $strand ne '.') {
	
		my $where = $from;    #which end of exon the gtf frame refers to
		if($strand eq '-') {
		    $frame+=3;
		    $where = $to;
		}
		@labels = ($frame[$where%3][$frame]);
	    }
	    elsif($strand eq '+') { @labels = (0,1,2); }
	    elsif($strand eq '-') { @labels = (3,4,5); }
	    else { @labels = (0,1,2,3,4,5); }
	}
	else {
	    #intron
	    if($strand eq '+') { @labels = ('i'); }
	    elsif($strand eq '-') { @labels = ('I'); }
	    else { @labels = ('i', 'I'); }
	}

	#now count good, bad, and unknown positions
	my ($good, $bad, $zero) = (0,0,0);
	
	if($from_i<=$to_i) {
	    #uniquely determined label
	    if(@labels==1) {
		#cut out the exon from a corresponding frame
		my $adv = substr($advisor->{$labels[0]}, $from_i, $to_i-$from_i+1);
	    
		#count 1's and 0's
		$good  = ($adv =~ tr/+/+/);
		$bad  = ($adv =~ tr/-/-/);
		$zero = ($adv =~ tr/0/0/);
	    }
	    else {
		#for each position take the best of the labels
		my %seen;
		for(my $pos = $from_i; $pos<=$to_i; $pos++) {
		    undef %seen;
		    foreach my $label (@labels) {
			$seen{substr($advisor->{$label}, $pos, 1)} = 1;
		    }
		    if (exists $seen{'+'}) { $good++; }
		    elsif (exists $seen{'0'}) { $zero++; }
		    else { $bad++; }
		}
	    }
	}
	
	#we should get all positions but the two that were skipped
	die unless $good+$bad+$zero+2 == $to-$from+1 || $to==$from;
	
	#skipped positions at the boundaries count as zero
	$zero += ($to-$from+1) - ($good+$bad+$zero);
	die "$from $to $good $bad $zero '$row'" unless $zero>=0;

	if($feature eq $exon_feature) {
	    $row .= " CDS_support_abs \"$good\";";
	    $row .= " CDS_conflict_abs \"$bad\";";
	    
	    $transcripts{$transcript}{'good'} += $good;
	    $transcripts{$transcript}{'bad'} += $bad;
	    $transcripts{$transcript}{'zero'} += $zero;
	}
	else {
	    #intron
	    $row .= " intron_support_abs \"$good\";";
	    $row .= " intron_conflict_abs \"$bad\";";
	}
    }

    #now score transcripts
    foreach my $row (@$rows) {
	my ($feature, $from, $to, $strand, $frame, $transcript) = parse_row($row, 0);

	#skip bad rows, rows that are not exons
	next if(!defined($feature));

	my $good = $transcripts{$transcript}{'good'};
	my $bad = $transcripts{$transcript}{'bad'};
	my $zero = $transcripts{$transcript}{'zero'};
	my $all = $good+$bad+$zero;

	my $score_good = score_str($good, $bad + $zero, 'X');
	my $score_bad = score_str($bad, $good + $zero, 'Y');

	$row .= " transcript_support \"$score_good\";";
	$row .= " transcript_conflict \"$score_bad\";";
	$row .= " transcript_support_abs \"$good\";";
	$row .= " transcript_conflict_abs \"$bad\";";
	$row .= " transcript_length \"$all\";";
    }
}



############################
sub read_advisor
{
    my ($in, $threshold_exon, $threshold_intron) = @_;

    my @label_pos;      #$labels[profile_num]{label} = position within profile
                        #       (undefined for non-complete partitions)

    my @profile_size;   #profile_size[profile_num] = number of explicit sets

    my @profile_set_size;  #profile_set_size[profile_num][set_num] = 
                           #  number of labels in set set_num of the profile

    my %result;         #for each reading frame a string denoting support
                        #  +/-/0 is support, conflict, no info
                        #reading frame is the label at position 0 of the seq.
                        #frame 'i'/'I' are introns on forward/reverse

    my @all_labs = (0..5);
    if(exists $Options{'i'}) {
	push @all_labs, 'i', 'I';
    }
    my %threshold;
    @threshold{(0..5)}=($threshold_exon) x 6;
    @threshold{'i','I'}=($threshold_intron) x 2;

    foreach my $lab (@all_labs) {
	$result{$lab} = '';
    }
    

    #for each positon in sequence %3, which frame shouch each label go
    my @where = ([0,1,2,3,4,5], [2,0,1,4,5,3], [1,2,0,5,3,4]);

    # read sequence name
    my $name = <$in>;
    return unless defined($name);   #end of file
    die unless $name =~ /^>/;

    # read header with partition specs
    my $line = <$in>;
    die unless defined($line) && $line eq "profiles:\n";

    while(1) {
        $line = <$in>;
        last unless defined($line)
            && $line =~ /^\s*(\d+):\s+(.*)\s*:end\s*$/;
        my $profile = $1;
        my @partition = split ' ', $2;
	$profile_size[$profile] = scalar @partition;

	my $curr_label_pos = {};
	foreach my $lab (split '', $LAB) {
	    foreach my $i (0..scalar(@partition)-1) {
		my $set = $partition[$i];
		if(index($set,$lab)>=0) {  #is label in the label set?
		    die unless !exists $curr_label_pos->{$lab};
		    $curr_label_pos->{$lab} = $i;
		}
	    }
	    #label not found!
	    if(!exists $curr_label_pos->{$lab}) {
		die "Non-complete profile $line\n";
	    }
	}
	$label_pos[$profile] = $curr_label_pos;

	foreach my $i (0..scalar(@partition)-1) {
	    $profile_set_size[$profile][$i] = length($partition[$i]);	    

	    if($profile_set_size[$profile][$i]>1) {
		print STDERR "profile $profile set $i size " .
		    $profile_set_size[$profile][$i] . "\n";
	    }
	}
    }
    die "'$line'" unless defined($line) && $line eq ":end\n";

    $line = <$in>;
    die unless defined($line) && $line eq "nodefault:\n";

    #read one line per position
    my $seq_pos = 0;
    while(1) {
	my $file_pos = tell $in;   # remember where the line starts

	$line = <$in>;
	last unless defined($line); # end of file

	if($line =~ /^\>/) {         #new sequence
            seek $in, $file_pos, SEEK_SET;   #return to the start of line
            last;
        }

	die "'$line'\n " unless $line =~ /^\s*(\d+)\s+(.*)$/;
        my @values = split ' ', $2;
        my $profile = $1;
        die unless scalar(@values) == $profile_size[$profile];

	#find maximum value
	my $max = 0;
	for(my $i=0; $i<@values; $i++) {
	    my $value =  $values[$i]/$profile_set_size[$profile][$i];
	    if($value > $max) { $max = $value; }
	}
	
	my $all_res = '';
	foreach my $lab (@all_labs) {
	    die unless exists $label_pos[$profile]{$lab};

	    my $set = $label_pos[$profile]{$lab};

	    my $value = $values[$set]
		/ $profile_set_size[$profile][$set];
	    die unless defined $value;

	    my $lab_idx = $lab;
	    if(index('012345',$lab)>=0) {
		$lab_idx = $where[$seq_pos %3][$lab];
	    }

	    if($value >= $max && $value > $threshold{$lab}) {
		$result{$lab_idx} .= "+";
	    }
	    elsif($value < 1) {
		$result{$lab_idx} .= "-";
	    }
	    else {
		$result{$lab_idx} .= "0";
	    }
	}

	$seq_pos++;
    }

    return ($name, \%result);
}

#############################
sub HELP_MESSAGE {
    print $USAGE;
    exit 1;
}
