#! /usr/bin/perl -w

use strict;
use Bio::SeqIO;
use List::Util qw[min max];
use Data::Dumper;

# ---------------------------------------------------------------------------
# Configuration and shared state
# ---------------------------------------------------------------------------

# A gene is written to the output only when at least this many CDS features
# with that (canonical) gene name were processed over all input files.
my $MIN_GENE_COUNT = 4;
# Codon table id handed to Bio::Seq->translate (-codontable_id).
my $YEAST_MIT_CODE =  3;
# Number of bases by which a gene window is widened on each side.
my $GENE_EXTENSION = 200;


# GTF strand symbols indexed by (BioPerl strand + 1): 0 => '-', 1 => '.', 2 => '+'.
my @strand = ('-','.','+');


# Input directory with precomputed data and the fixed output file names.
my $prepared_data_directory="prepared_data";
my $file_out_all = 'output/all_genes.fasta';
my $file_out_all_gtf = 'output/all_genes.gtf';
my $exonerate_fasta_file = 'output/exonerate.fasta';
my $exonerate_gtf_file = 'output/exonerate.gtf';


# NOTE(review): $transcript_id is not referenced anywhere else in the visible
# file. The file-level $organism only serves as the loop variable of the
# protein-model section; the per-file loop declares its own $organism.
my $transcript_id = 0;
my $organism = "";

# Upper-case gene name => canonical gene name. Seeded with known aliases and
# extended while the 'gene' features are read.
my %gene_synonym_name = (
	'CYTB' => 'COB',
	'CYB' => 'COB',
	'OLI1' => 'ATP9',
	'ND1' => 'NAD1',
	'ND2' => 'NAD2',
	'ND3' => 'NAD3',
	'ND4' => 'NAD4',
	'ND4L' => 'NAD4L',
	'ND5' => 'NAD5',
	'ND6' => 'NAD6',
);

# "<organism>_<gene>" => GeneObject holding the gene window.
my %organism_gene;
# "<organism>_<gene>" => 1 once a CDS for that id has been processed.
my %processed_cds;
# Canonical gene name => number of processed CDS features.
my %gene_counts;
# GeneObjects with a processed CDS, in processing order.
my @processed_genes;


# Upper-case gene names whose 'gene' features are ignored.
my %forbidden_genes = (
	'ORF1' => '1',
	'ORF2' => '1',
	'COX1-I1' => '1',
);

# Organism codes tried, in this order, when picking a protein model per gene.
my @model_organisms = ('Sacce','Canal');

# Feature switches; every flag is only tested for truth.
# When true, the gene window extension is not clipped at neighbouring elements.
my $allow_coding_boundaries = "0";
# Write one protein model FASTA per gene into output/protein_models.
my $print_protein_models = "0";
# Write protein FASTA files with intron headers into output/prot and align them.
my $malin = "0";
# Write CDS FASTA files into output/cds and align them with muscle.
my $print_cds_alignments = "0";
# Write CDS FASTA files into output/cds and run the codon alignment scripts.
my $print_codon_alignments = "1";
# Write gene window FASTA files into output/seq and align them with muscle.
my $print_seq_alignments = "0";
# Register the gene_synonym annotations of each gene as additional aliases.
my $synonym_creation = "0";
# Run the exonerate extraction script and pad its FASTA output.
my $print_exonerate_alignments = "0";
# Write the ids of all emitted genes to output/gene_names.txt.
my $print_gene_names = "0";
# Convert the rnaweasel output into output/introns.fasta and output/introns.gtf.
my $print_weasel = "0";
# Combine the protein alignments and build a tree with phyml.
my $print_tree = "0";

# Open the combined FASTA and GTF outputs. Both files live in output/, so the
# directory has to exist before they are opened; a failure to create the
# directory or to open a file stops the script instead of silently producing
# no output.
unless (-d 'output') {
	mkdir('output') or die "Cannot create directory 'output': $!\n";
}
open(ALLFASTA, '>', $file_out_all) or die "Cannot open $file_out_all for writing: $!\n";
open(ALLGTF, '>', $file_out_all_gtf) or die "Cannot open $file_out_all_gtf for writing: $!\n";

foreach my $filename (glob("selection/*.gb")) {

	# Pass 1: record the start and end coordinate of every gene, tRNA and rRNA
	# feature of this file. The sorted lists are used afterwards to find the
	# closest neighbouring element on either side of a gene.
	my @elements_starts;
	my @elements_ends;
	my %is_element = map { $_ => 1 } ('gene', 'tRNA', 'rRNA');

	my $stream = Bio::SeqIO->new(-file => $filename,
			      -format => 'GenBank');
	while ( my $seq = $stream->next_seq() ) {
		foreach my $feature ($seq->get_SeqFeatures()) {
			next unless($is_element{$feature->primary_tag()});

			my $location = $feature->location;
			my ($start, $end);
			if ( $location->isa('Bio::Location::SplitLocationI') ) {
				# Split location: start of the first listed part, end of the
				# last listed part.
				my @subLoc = $location->sub_Location;
				$start = $subLoc[0]->start;
				$end = $subLoc[-1]->end;
			} else {
				$start = $location->start;
				$end = $location->end;
			}

			push(@elements_starts, $start);
			push(@elements_ends, $end);
		}
	}

	# Starts ascending, ends descending: the first hit of a linear scan is
	# then the nearest element.
	@elements_starts = sort {$a <=> $b} (@elements_starts);
	@elements_ends = sort {$b <=> $a} (@elements_ends);

	# Pass 2: derive the organism code from the 'source' feature and build a
	# GeneObject (window coordinates plus window sequence) for every usable
	# 'gene' feature. Results go into %organism_gene under
	# "<organism>_<canonical gene name>".
	$stream = Bio::SeqIO->new(-file => $filename,
			      -format => 'GenBank');
	my $organism;

	while ( my $seq = $stream->next_seq() ) {
		my $seq_length = $seq->length();

		foreach my $feature ($seq->get_SeqFeatures()) {
			my $ac = $feature->annotation();
			my $tag = $feature->primary_tag();

			if($tag eq 'source') {
				# Organism code: first three letters of the first word plus
				# the first two letters of the second word. The strain
				# annotation is not part of the code.
				my $organism_name = get_value($ac, 'organism');
				if(defined $organism_name && $organism_name =~ /^(\S+) (\S+)/) {
					$organism = substr($1,0,3).substr($2,0,2);
				} else {
					# Without a two-word organism name no code can be built;
					# using the capture variables anyway would give an empty
					# or wrong code and let different organisms collide.
					warn "$filename: cannot derive an organism code from the 'source' feature; genes of this record are skipped\n";
					$organism = undef;
				}
			}

			next unless($tag eq 'gene');

			my $gene_value = get_value($ac, 'gene');
			next unless($gene_value);
			next if ( $feature->location->isa('Bio::Location::SplitLocationI')); #avoid cyclic genes

			my $gene = uc $gene_value;
			next if ($forbidden_genes{$gene});

			# Canonical name: a known alias of the gene itself wins; otherwise
			# the first annotated synonym with a known alias is used, unless
			# the gene name contains non-word characters or an underscore.
			my @synonyms = $ac->get_Annotations('gene_synonym');
			my $gene_name = $gene_synonym_name{$gene};
			if(!$gene_name && $gene !~ /[\W_]/) {
				foreach my $synonym (@synonyms){
					next unless($gene_synonym_name{uc $synonym});
					$gene_name = $gene_synonym_name{uc $synonym};
					last;
				}
			}
			$gene_name = $gene unless($gene_name);

			# Remember the mapping so that the CDS features can be resolved,
			# and optionally register the annotated synonyms as aliases.
			$gene_synonym_name{$gene} = $gene_name;
			if($synonym_creation && $gene_name !~ /[\W_]/) {
				foreach my $synonym (@synonyms){
					next if($synonym =~ /[\W_]/); #forbid strange synonyms
					$gene_synonym_name{uc $synonym} = $gene_name;
				}
			}

			# No organism code, no usable id for this gene.
			next unless(defined $organism);

			my $start_gene = $feature->location->start;
			my $end_gene = $feature->location->end;

			# Nearest element end before the gene (list is sorted descending).
			my $last_end = 0;
			foreach my $element_end (@elements_ends) {
				next unless($element_end < $start_gene);
				$last_end = $element_end;
				last;
			}

			# Nearest element start after the gene (list is sorted ascending).
			my $last_start = $seq_length + 1;
			foreach my $element_start (@elements_starts) {
				next unless($element_start > $end_gene);
				$last_start = $element_start;
				last;
			}

			# Widen the gene by $GENE_EXTENSION on both sides, staying inside
			# the sequence and, unless coding boundaries are allowed, outside
			# the neighbouring elements.
			my @lower_bounds = (1, $start_gene - $GENE_EXTENSION);
			my @upper_bounds = ($seq_length, $end_gene + $GENE_EXTENSION);
			unless($allow_coding_boundaries) {
				push(@lower_bounds, $last_end + 1);
				push(@upper_bounds, $last_start - 1);
			}
			$start_gene = max(@lower_bounds);
			$end_gene = min(@upper_bounds);

			my $geneObj = GeneObject->new();
			$geneObj->start($start_gene);
			$geneObj->end($end_gene);
			$geneObj->sequence($seq->subseq($start_gene,$end_gene));

			$organism_gene{"$organism\_$gene_name"} = $geneObj;
			$gene_counts{$gene_name} = 0 unless(defined $gene_counts{$gene_name});
		}
	}

	#print Dumper(\%gene_synonym_name);

	# Pass 3: for every CDS feature whose gene has a window in %organism_gene,
	# compute the CDS coordinates relative to that window, collect the GTF
	# lines, the coding sequence (stop codon excluded), its translation and the
	# intron positions, and append the GeneObject to @processed_genes.
	# $organism is the code set from the 'source' feature earlier in this loop
	# iteration.
	$stream = Bio::SeqIO->new(-file => $filename,
			      -format => 'GenBank');
    while ( my $seq = $stream->next_seq() ) {
		my @ann = $seq->get_SeqFeatures();
	
		#print Dumper(\@ann);
		foreach my $feature (@ann) {
			my $ac = $feature->annotation();
			if($feature->primary_tag() eq 'CDS') {
				# Resolve the CDS to its canonical gene name and window; skip
				# the CDS when either is unknown.
				next unless(get_value($ac, 'gene'));
				my $gene = uc get_value($ac, 'gene');
				my $gene_name = $gene_synonym_name{$gene};
				next unless($gene_name);
				my $cds_id = "$organism\_$gene_name";
				# Only the first CDS per organism/gene id is processed.
				next if($processed_cds{$cds_id});
				my $geneObj = $organism_gene{$cds_id};
				next unless($geneObj);

				my $start_gene = $geneObj->start();
				my $end_gene = $geneObj->end();
				my $s = $geneObj->sequence();

				my $start_cds = 0;
				my $end_cds = 0;

				my $stop_codon = "";

				# Stop codon coordinates; only the commented-out stop_codon
				# GTF line further down reads them.
				my $start_stop_codon = -1;
				my $end_stop_codon = -1;
				# Strand shifted to 0 (minus), 1 (unknown) or 2 (plus) so that
				# it can index @strand.
				my $str = $feature->location->strand +1;

				$geneObj->strand($str-1);
				$geneObj->gene_name($gene_name);
				$geneObj->organism($organism);
				# Stores the sequence that was just read from the object again.
				$geneObj->sequence($s);
				$geneObj->transcript_id($cds_id);

				# 1-based positions in the concatenated coding sequence at
				# which a new CDS part begins.
				my @malin_introns;
				my $coding_sequence = "";

				if ( $feature->location->isa('Bio::Location::SplitLocationI'))  {
					my $location_no = 1;
					my $frame = 0;
					# Parts are visited in ascending order of their end
					# coordinate, for both strands.
					for my $location ( sort {$a->end <=> $b->end} ($feature->location->sub_Location) ) {
						# Coordinates relative to the gene window (1-based).
						$start_cds = $location->start - $start_gene +1;
						$end_cds = $location->end - $start_gene +1;
						
						# Plus strand: the stop codon is the last three bases of
						# the last part.
						# NOTE(review): the comparison relies on sub_Location
						# yielding the number of parts in numeric context -
						# confirm against the BioPerl version in use.
						if($str == 2 && $location_no == $feature->location->sub_Location){
							$end_cds   -= 3;
							$start_stop_codon = $end_cds + 1;
							$end_stop_codon = $end_cds + 3;
							$geneObj->stop_codon(substr($s,$start_stop_codon - 1, 3));
						}
						
						# Minus strand: the stop codon is the first three bases
						# of the first part; it is stored reverse-complemented.
						if($str == 0 && $location_no == 1){
							$start_cds += 3;
							$start_stop_codon = $start_cds - 3;
							$end_stop_codon = $start_cds - 1;
							$geneObj->stop_codon(Bio::Seq->new(-seq => substr($s,$start_stop_codon - 1, 3))->revcom()->seq());
						}
						
						
						push(@malin_introns, (length($coding_sequence)+1)) if($location_no>1);
						$coding_sequence .= substr($s,$start_cds - 1, $end_cds - $start_cds + 1) ; 
						

						# Minus strand: the frame is updated from this part's
						# length before its GTF line is written.
						# NOTE(review): parts are visited in ascending coordinate
						# order here - confirm this gives the intended frame on
						# the minus strand.
						$frame = ($end_cds - $start_cds + 1 - (3-$frame) % 3) % 3 	if($str == 0);

						$geneObj->add_gtf_line("$cds_id\t.\tCDS\t$start_cds\t$end_cds\t.\t$strand[$str]\t$frame\tgene_id \"$cds_id\"; transcript_id \"$cds_id\"; filename \"$filename\"; gene \"$gene_name\"; original_gene \"$gene\";\n");
						
						# Plus strand: the frame for the next part is computed
						# after the GTF line of this part was written.
						$frame = (3 - ($end_cds - $start_cds + 1 - $frame) % 3) % 3 if($str == 2);
						 
						$location_no++;
					}
				} else {
					$start_cds = $feature->location->start - $start_gene +1;
					# exclude stop codon
					$end_cds = $feature->location->end - $start_gene +1;
					

					# Plus strand drops the last three bases; every other strand
					# value is handled like the minus strand and drops the first
					# three bases.
					if($str == 2){
						$end_cds -= 3;
						$start_stop_codon = $end_cds + 1;
						$end_stop_codon = $end_cds + 3;
						$geneObj->stop_codon(substr($s,$start_stop_codon - 1, 3));
					} else {
						$start_cds += 3;
						$start_stop_codon = $start_cds - 3;
						$end_stop_codon = $start_cds - 1;
						$geneObj->stop_codon(Bio::Seq->new(-seq => substr($s,$start_stop_codon - 1, 3))->revcom()->seq());
					}
					# Skip a CDS that starts or ends before the gene window.
					next if($start_cds < 1);
					next if($end_cds < 1);
					$coding_sequence = substr($s,$start_cds - 1, $end_cds - $start_cds + 1); 
					
					$geneObj->add_gtf_line("$cds_id\t.\tCDS\t$start_cds\t$end_cds\t.\t$strand[$str]\t0\tgene_id \"$cds_id\"; transcript_id \"$cds_id\"; filename \"$filename\"; gene \"$gene_name\"; original_gene \"$gene\";\n");
					
				}
				
				# Comma-separated intron positions for the header. On the plus
				# strand they are used as collected; otherwise each position is
				# mirrored (length - position + 2) and prepended, which reverses
				# the order. chop removes the trailing comma.
				my $malin_header = "";
				if($str == 2){
					foreach my $malin_intron( @malin_introns ){
						$malin_header .= "$malin_intron,";
					}
					chop($malin_header);
				} else {
					foreach my $malin_intron( @malin_introns ){
						$malin_header = ((length($coding_sequence) - $malin_intron) + 2).",$malin_header";
					}
					chop($malin_header);
				}

				$geneObj->malin_header(" \/organism=$organism \{i $malin_header i\}");
				# Coding sequence and translation are stored for the plus strand
				# as is and for the minus strand reverse-complemented; for an
				# unknown strand neither is set.
				$geneObj->coding_sequence($coding_sequence) if($str == 2);
				$geneObj->coding_sequence(Bio::Seq->new(-seq => $coding_sequence)->revcom()->seq()) if($str == 0);
				$geneObj->translation(Bio::Seq->new(-seq => $coding_sequence)->translate(-codontable_id => $YEAST_MIT_CODE)->seq()) 			if($str == 2);
				$geneObj->translation(Bio::Seq->new(-seq => $coding_sequence)->revcom()->translate(-codontable_id => $YEAST_MIT_CODE)->seq()) 	if($str == 0);
				$gene_counts{$gene_name} = $gene_counts{$gene_name} + 1;
				#print ALLGTF "$cds_id\t.\tstop_codon\t$start_stop_codon\t$end_stop_codon\t.\t$strand[$str]\t.\tgene_id \"$cds_id\"; transcript_id \"$cds_id\"; filename \"$filename\"; gene \"$gene_name\";\n";
				$processed_cds{$cds_id} = 1;
				push(@processed_genes, $geneObj);
			}
		}
	}
}

############Directory creation
# Creates the output directories needed by the enabled features and removes
# result files of earlier runs (the per-gene FASTA files are written in append
# mode later on). Uses Perl built-ins instead of shelling out to mkdir/rm, so
# re-runs do not print "File exists" / "No such file" errors.

# Create $dir unless it already exists; dies when it cannot be created.
sub _ensure_directory {
	my ($dir) = @_;
	return 1 if(-d $dir);
	mkdir($dir) or die "Cannot create directory '$dir': $!\n";
	return 1;
}

# Delete the given files; directories and missing files are left alone.
sub _remove_files {
	my (@files) = @_;
	foreach my $file (@files) {
		next unless(-e $file || -l $file);
		next if(-d $file && !-l $file);
		unlink($file) or warn "Cannot remove '$file': $!\n";
	}
	return;
}

_ensure_directory('output');

if($malin) {
	_ensure_directory('output/prot');
	_ensure_directory('output/prot/algn');
	_remove_files(glob('output/prot/algn/*.faa'), glob('output/prot/*.fasta'));
}

if($print_seq_alignments) {
	_ensure_directory('output/seq');
	_ensure_directory('output/seq/algn');
	_remove_files(glob('output/seq/algn/*.faa'), glob('output/seq/*.fasta'));
}

if($print_cds_alignments) {
	_ensure_directory('output/cds');
	_ensure_directory('output/cds/algn');
	_remove_files(glob('output/cds/algn/*.faa'), glob('output/cds/*.fasta'));
}

if($print_codon_alignments) {
	_ensure_directory('output/cds');
	_ensure_directory('output/cds/codon_aln');
	_remove_files(glob('output/cds/codon_aln/*'), glob('output/cds/*.fasta'));
}

if($print_protein_models) {
	_ensure_directory('output/protein_models');
}

if($print_exonerate_alignments) {
	_remove_files($exonerate_fasta_file, $exonerate_gtf_file);
}
#############

if($print_gene_names) {
	open(NAMES, '>', 'output/gene_names.txt') or die "Cannot open output/gene_names.txt for writing: $!\n";
}

# Append one FASTA record ("$header\n$sequence\n") to $file.
sub _append_fasta_record {
	my ($file, $header, $sequence) = @_;
	open(my $fh, '>>', $file) or die "Cannot open $file for appending: $!\n";
	print $fh "$header\n$sequence\n";
	close($fh) or die "Cannot close $file: $!\n";
	return;
}

# Write every processed gene whose gene name was seen at least
# $MIN_GENE_COUNT times: always to the combined FASTA/GTF files, and to the
# per-gene files of the enabled features.
foreach my $geneObj (@processed_genes) {
	my $gene_name = $geneObj->gene_name();
	next unless($gene_counts{$gene_name} >= $MIN_GENE_COUNT);

	my $gene_id = $geneObj->organism()."_".$gene_name;
	print ALLFASTA ">$gene_id\n".$geneObj->sequence()."\n";
	foreach my $gtf_line ($geneObj->gtf_lines()) {
		print ALLGTF $gtf_line;
	}

	if($print_seq_alignments) {
		_append_fasta_record("output/seq/$gene_name.fasta", ">$gene_id", $geneObj->sequence());
	}

	# The CDS and the codon alignments read the same output/cds/<gene>.fasta,
	# so the record is written once even when both features are enabled.
	# The first three bases of the coding sequence are left out.
	if($print_cds_alignments || $print_codon_alignments) {
		_append_fasta_record("output/cds/$gene_name.fasta", ">$gene_id", substr($geneObj->coding_sequence(),3));
	}

	if($malin) {
		#Malin output
		my $malin_header = '>'.($geneObj->organism()).($geneObj->malin_header());
		_append_fasta_record("output/prot/$gene_name.fasta", $malin_header, $geneObj->translation());
	}

	if($print_gene_names) {
		print NAMES "$gene_id;";
	}
}

# Nothing below writes to these handles; close them so the data is flushed
# before the external tools run.
close(ALLFASTA) or warn "Cannot close combined FASTA output: $!\n";
close(ALLGTF) or warn "Cannot close combined GTF output: $!\n";
if($print_gene_names) {
	close(NAMES) or warn "Cannot close output/gene_names.txt: $!\n";
}

# For every sufficiently frequent gene write one protein model: the
# translation from the first organism in @model_organisms that has one.
if($print_protein_models) {
	foreach my $gene_name (keys(%gene_counts)) {
		next unless($gene_counts{$gene_name} >= $MIN_GENE_COUNT);

		foreach my $model_organism (@model_organisms) {
			my $geneObj = $organism_gene{"$model_organism\_$gene_name"};
			next unless($geneObj);

			# A gene object without a processed CDS has no translation; try
			# the next model organism instead of writing an empty model.
			my $translation = $geneObj->translation();
			next unless(defined $translation);

			my $model_file = "output/protein_models/$gene_name.fasta";
			open(my $model_fh, '>', $model_file) or die "Cannot open $model_file for writing: $!\n";
			print $model_fh ">$model_organism\_$gene_name\n".$translation."\n";
			close($model_fh) or die "Cannot close $model_file: $!\n";
			last;
		}
	}
}

# Align <directory>/<gene>.fasta into <directory>/algn/<gene>.faa with muscle
# for every gene seen at least $MIN_GENE_COUNT times. muscle is started
# without a shell, so gene names containing shell metacharacters cannot break
# or alter the command.
sub _align_with_muscle {
	my ($directory) = @_;
	foreach my $gene_name (keys(%gene_counts)) {
		next unless($gene_counts{$gene_name} >= $MIN_GENE_COUNT);
		print "MUSCLE:$gene_name\n";
		my $status = system('muscle',
			'-in', "$directory/$gene_name.fasta",
			'-out', "$directory/algn/$gene_name.faa");
		warn "muscle failed for $directory/$gene_name.fasta (status $status)\n" if($status != 0);
	}
	return;
}

_align_with_muscle('output/cds') if($print_cds_alignments);
_align_with_muscle('output/seq') if($print_seq_alignments);
_align_with_muscle('output/prot') if($malin);

# Run the exonerate extraction script for every prepared CDS file, then add a
# placeholder record (a run of zeros as long as the gene window) to the
# exonerate FASTA file for each emitted gene that has no record there yet.
if($print_exonerate_alignments) {
	foreach my $filename (glob("$prepared_data_directory/cds/*.fasta")) {
		# The gene name is the file name without directory and extension.
		# The match is checked so that a stale capture is never reused.
		unless($filename =~ m{/([^/]+)\.fasta\z}) {
			warn "Skipping $filename: cannot derive a gene name\n";
			next;
		}
		my $gene_name = $1;
		my $status = system('./extract_exonerate_output.pl',
			$exonerate_fasta_file,
			$exonerate_gtf_file,
			"$prepared_data_directory/protein_models/$gene_name.fasta",
			$filename);
		warn "extract_exonerate_output.pl failed for $filename (status $status)\n" if($status != 0);
	}

	# Ids that already have a record in the exonerate FASTA file. The file
	# may be missing when no extraction produced output.
	my %exonerate_hits;
	if(open(my $in_fh, '<', $exonerate_fasta_file)) {
		while(my $line = <$in_fh>) {
			$exonerate_hits{$1} = 1 if($line =~ /^>(\w+)/);
		}
		close($in_fh);
	} else {
		warn "Cannot read $exonerate_fasta_file ($!); assuming no exonerate hits\n";
	}

	open(my $out_fh, '>>', $exonerate_fasta_file) or die "Cannot open $exonerate_fasta_file for appending: $!\n";
	foreach my $geneObj (@processed_genes) {
		next unless($gene_counts{$geneObj->gene_name()} >= $MIN_GENE_COUNT);
		my $gene_id = $geneObj->organism()."_".$geneObj->gene_name();
		next if($exonerate_hits{$gene_id});
		print $out_fh ">$gene_id\n".('0' x length($geneObj->sequence()))."\n";
	}
	close($out_fh) or die "Cannot close $exonerate_fasta_file: $!\n";
}

# Convert the rnaweasel output into an intron GTF file and a FASTA file in
# which every emitted gene window is a string of '0' with the intron hits
# marked as '1' (FW) or '2' (RC).
if($print_weasel) {
	my $weasel_file = "$prepared_data_directory/rnaweasel/rnaweasel.output";
	open(my $weasel_fh, '<', $weasel_file) or die "Cannot open $weasel_file: $!\n";
	open(my $fasta_fh, '>', 'output/introns.fasta') or die "Cannot open output/introns.fasta for writing: $!\n";
	open(my $gtf_fh, '>', 'output/introns.gtf') or die "Cannot open output/introns.gtf for writing: $!\n";

	# One all-zero track per emitted gene, keyed by "<organism>_<gene>".
	my %weasel_hits;
	foreach my $geneObj (@processed_genes) {
		next unless($gene_counts{$geneObj->gene_name()} >= $MIN_GENE_COUNT);
		$weasel_hits{$geneObj->organism()."_".$geneObj->gene_name()} = '0' x length($geneObj->sequence());
	}

	my %strand_symbol_for = ('FW' => '+', 'RC' => '-');

	while(my $line = <$weasel_fh>){
		next unless($line =~ /^(\w+) +(\S+) +(\d+)\.\.(\d+) +(\w+)/);
		my ($seq_id, $intron_type, $start, $end, $direction) = ($1, $2, $3, $4, $5);

		my $str = exists $strand_symbol_for{$direction} ? $strand_symbol_for{$direction} : "";

		# Only sequences that were emitted have a track; mapping a hit onto
		# an unknown id would add an undefined entry that ends up as an empty
		# FASTA record.
		if(defined $weasel_hits{$seq_id}) {
			$weasel_hits{$seq_id} = map_weasel_hits($weasel_hits{$seq_id},$start,$end,$str,$seq_id);
		} else {
			warn "rnaweasel hit for unknown sequence '$seq_id' is not mapped\n";
		}
		print $gtf_fh "$seq_id\trnaweasel\tintron\t$start\t$end\t.\t$str\t.\tgene_id \"$seq_id\"; transcript_id \"$seq_id\"; intron_type \"$intron_type\";\n";
	}

	foreach my $gene_name (keys(%weasel_hits)) {
		print $fasta_fh ">$gene_name\n$weasel_hits{$gene_name}\n";
	}

	close($gtf_fh) or die "Cannot close output/introns.gtf: $!\n";
	close($fasta_fh) or die "Cannot close output/introns.fasta: $!\n";
	close($weasel_fh);
}

# Tree building: combine the prepared protein alignments, convert the result
# with readseq and run phyml on it. The commands go through the shell because
# the first one needs globbing and output redirection.
if($print_tree) {
	my @tree_commands = (
		"./combine_alignments.pl $prepared_data_directory/malin_prot/algn/* > output/aln.fasta",
		"java -jar ../programs/readseq/readseq.jar -f 12 -o output/aln.newick output/aln.fasta",
		"phyml output/aln.newick 1 i 1 5 JTT 0.0 4 1.0 BIONJ y y",
	);
	foreach my $tree_command (@tree_commands) {
		system $tree_command;
	}
}

# Codon alignments: align every per-gene CDS file with align_coding.pl, then
# combine the COX1 and COX2 alignments and convert the result with readseq.
if($print_codon_alignments) {

	foreach my $cds_file_name (glob("output/cds/*.fasta")){
		# The gene name is the file name without directory and extension; it
		# may contain characters such as '-'. The match is checked so that a
		# stale capture never ends up in the output path.
		unless($cds_file_name =~ m{output/cds/([^/]+)\.fasta\z}) {
			warn "Skipping $cds_file_name: cannot derive a gene name\n";
			next;
		}
		my $gene_name = $1;
		# Started without a shell so that the gene name is passed verbatim.
		my $status = system('./align_coding.pl', $cds_file_name, 'zoznam9.csv', 1, "output/cds/codon_aln/$gene_name");
		warn "align_coding.pl failed for $cds_file_name (status $status)\n" if($status != 0);
	}
	system "./combine_alignments.pl output/cds/codon_aln/COX1.dna_aln output/cds/codon_aln/COX2.dna_aln > output/cds/codon_aln/codons.dna_aln";
	system "java -jar ../programs/readseq/readseq.jar -f 12 -o output/cds/codon_aln/codons.dna_phy output/cds/codon_aln/codons.dna_aln -a";
}

#print Dumper(\%gene_synonym_name);


# Marks the 1-based, inclusive range $start..$end in the track $hit_seq and
# returns the resulting track.
#   $hit_seq   - string with one character per base, '0' meaning "no hit yet"
#   $str       - '+' marks the range with '1', '-' marks it with '2'; any
#                other value leaves the track unchanged
#   $gene_name - only used in the message for a problematic hit
# A range that is not entirely '0' is reported on STDOUT and left untouched.
sub map_weasel_hits {
    my ($hit_seq, $start, $end, $str, $gene_name) = @_;

    my $offset = $start - 1;
    my $span   = $end - $start + 1;

    # The range must still be unmarked, otherwise the hit is only reported.
    unless ( substr($hit_seq, $offset, $span) eq ('0' x $span) ) {
        print "Problematic rnaweasel hit found:$gene_name\n";
        return $hit_seq;
    }

    if ($str eq '+') {
        substr($hit_seq, $offset, $span, '1' x $span);
    }
    if ($str eq '-') {
        substr($hit_seq, $offset, $span, '2' x $span);
    }

    return $hit_seq;
}

# Returns the value of annotation $key from $collection when exactly one
# annotation is stored under that key. With no annotation or with several of
# them nothing is returned (undef in scalar context, empty list in list
# context).
sub get_value {
    my ($collection, $key) = @_;

    my @matches = $collection->get_Annotations($key);
    if (@matches == 1) {
        return $matches[0]->value();
    }
    return;
}

##############

package GeneObject;

# Plain record for one gene window and the data derived from its CDS.
# Every scalar attribute has a combined getter/setter: called with a defined
# argument it stores the value, and it always returns the stored value. An
# undefined argument never overwrites a stored value. The GTF lines are kept
# as a list that can only be appended to.

# Creates an object with all scalar attributes undefined and no GTF lines.
# Works as a class method and as an instance method.
sub new {
    my ($invocant) = @_;
    my $class = ref($invocant) || $invocant;

    my %fields = map { $_ => undef } (
        'gene_name', 'organism', 'sequence', 'strand', 'start', 'end',
        'stop_codon', 'coding_sequence', 'translation', 'transcript_id',
        'malin_header',
    );
    $fields{'gtf_lines'} = [];

    return bless \%fields, $class;
}

# Shared implementation of the accessors: store $value in $field when it is
# defined, then return the content of $field.
sub _attr {
    my ($self, $field, $value) = @_;
    $self->{$field} = $value if defined $value;
    return $self->{$field};
}

sub organism {
    my ($self, $value) = @_;
    return $self->_attr('organism', $value);
}

sub strand {
    my ($self, $value) = @_;
    return $self->_attr('strand', $value);
}

sub stop_codon {
    my ($self, $value) = @_;
    return $self->_attr('stop_codon', $value);
}

sub coding_sequence {
    my ($self, $value) = @_;
    return $self->_attr('coding_sequence', $value);
}

sub translation {
    my ($self, $value) = @_;
    return $self->_attr('translation', $value);
}

sub gene_name {
    my ($self, $value) = @_;
    return $self->_attr('gene_name', $value);
}

sub start {
    my ($self, $value) = @_;
    return $self->_attr('start', $value);
}

sub end {
    my ($self, $value) = @_;
    return $self->_attr('end', $value);
}

sub sequence {
    my ($self, $value) = @_;
    return $self->_attr('sequence', $value);
}

sub transcript_id {
    my ($self, $value) = @_;
    return $self->_attr('transcript_id', $value);
}

sub malin_header {
    my ($self, $value) = @_;
    return $self->_attr('malin_header', $value);
}

# Appends one GTF line to the object.
sub add_gtf_line {
    my ($self, $gtf_line) = @_;
    push(@{ $self->{'gtf_lines'} }, $gtf_line);
}

# Returns the GTF lines in the order in which they were added.
sub gtf_lines {
    my ($self) = @_;
    return @{ $self->{'gtf_lines'} };
}

##############



 
