#! /usr/bin/perl -w

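# Extract genes from the GenBank files in the selection/ directory and write
# each gene (padded with up to 200 bases of flanking sequence on both sides)
# to FASTA files in the output/ directory. The CDS locations of each gene are
# written to GTF files, with coordinates relative to the extracted FASTA
# sequence.
# Argument 0: name of the gene whose records are written to their own files
# (<gene>_prot.fasta / <gene>_prot.gtf). All other genes go to
# <gene>_ex_prot.fasta / <gene>_ex_prot.gtf, and every gene is additionally
# written to all_prot.fasta / all_prot.gtf.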

use strict;
use Bio::SeqIO;
use Data::Dumper;

# Name of the gene of interest (first command-line argument). Fall back to an
# empty string so that a missing argument simply yields "output/_prot.*" file
# names instead of "uninitialized value" warnings.
my $gene_arg = defined $ARGV[0] ? $ARGV[0] : '';

# FASTA/GTF pair for the gene of interest, FASTA/GTF pair for every other
# gene ("_ex_"), and a FASTA/GTF pair that receives all genes.
my $file_out_fasta = 'output/'.$gene_arg.'_prot.fasta';
my $file_out_gtf = 'output/'.$gene_arg.'_prot.gtf';
my $file_out_ex_fasta = 'output/'.$gene_arg.'_ex_prot.fasta';
my $file_out_ex_gtf = 'output/'.$gene_arg.'_ex_prot.gtf';
my $file_out_all = 'output/all_prot.fasta';
my $file_out_all_gtf = 'output/all_prot.gtf';

# GTF strand symbols indexed by (BioPerl strand + 1): -1 => '-', 0 => '.', 1 => '+'.
my @strand = ('-','.','+');

# Running counter, incremented for every 'gene' feature and written as the
# transcript_id attribute of the GTF lines.
my $transcript_id = 0;
# NOTE(review): the main loop declares its own lexical $organism_gene, so this
# file-level variable appears to be unused -- confirm before removing.
my $organism_gene = "";

# Three-argument open keeps the (user-influenced) file name from being parsed
# as part of the mode; every open is checked so that e.g. a missing output/
# directory stops the script instead of silently producing nothing.
# The bareword handle names are kept because the rest of the script prints to
# them by name.
open(INFO,  '>', $file_out_fasta)    or die "Cannot open '$file_out_fasta' for writing: $!\n";
open(INFO2, '>', $file_out_gtf)      or die "Cannot open '$file_out_gtf' for writing: $!\n";
open(INFO3, '>', $file_out_ex_fasta) or die "Cannot open '$file_out_ex_fasta' for writing: $!\n";
open(INFO4, '>', $file_out_ex_gtf)   or die "Cannot open '$file_out_ex_gtf' for writing: $!\n";
open(INFO5, '>', $file_out_all)      or die "Cannot open '$file_out_all' for writing: $!\n";
open(INFO6, '>', $file_out_all_gtf)  or die "Cannot open '$file_out_all_gtf' for writing: $!\n";

# Main extraction loop.
#
# For every sequence in every selection/*.gb file the features are walked in
# order:
#   * 'source' : derive a short organism code (first 3 letters of the first
#                word + first 2 letters of the second word of 'organism').
#   * 'gene'   : remember the gene name and extract the gene with up to 200
#                flanking bases on each side (clamped to the sequence bounds).
#                A gene that starts before the end of the previously accepted
#                gene is skipped.
#   * 'CDS'    : only the first CDS following an accepted gene is used. The
#                gene sequence is written as FASTA and the CDS segments are
#                written as GTF lines whose coordinates are relative to the
#                extracted sequence, with the stop codon excluded.
#
# Output goes to the bareword handles opened earlier in the script:
# INFO/INFO2 (FASTA/GTF of the gene named in $ARGV[0]), INFO3/INFO4 (FASTA/GTF
# of all other genes) and INFO5/INFO6 (FASTA/GTF of every gene).

# Upper-cased once; an absent argument becomes '' which never equals an
# accepted (non-empty) gene name.
my $target_gene = defined $ARGV[0] ? uc $ARGV[0] : '';

foreach my $filename (glob("selection/*.gb")) {
    my $stream = Bio::SeqIO->new(-file   => $filename,
                                 -format => 'GenBank');
    while ( my $seq = $stream->next_seq() ) {

        my $organism_gene = "";   # organism code used as prefix of record ids
        my $gene          = "";   # current gene name; "" means "no usable gene"
        my $cds_written   = 0;    # true once a CDS was emitted for $gene
        my $s             = "";   # extracted (padded) gene sequence
        my $start_gene    = 0;    # 1-based start of the extracted window
        my $prev_gene_end = 0;    # unpadded end of the last accepted gene

        foreach my $feature ($seq->get_SeqFeatures()) {
            my $tag = $feature->primary_tag();
            my $ac  = $feature->annotation();

            if ($tag eq 'source') {
                my $organism = get_value($ac, 'organism');
                # Only use the captures when the match actually succeeded;
                # otherwise $1/$2 would be undefined (or stale).
                if (defined $organism && $organism =~ /^(\S+) (\S+)/) {
                    $organism_gene = substr($1, 0, 3) . substr($2, 0, 2);
                }
                else {
                    warn "Warning: $filename: no two-word 'organism' annotation "
                       . "on source feature; using empty organism code\n";
                    $organism_gene = "";
                }
                # A suffix built from the 'strain' annotation could be
                # appended here; that option is disabled.
            }
            elsif ($tag eq 'gene') {
                my $gene_name = get_value($ac, 'gene');
                $gene = defined $gene_name ? uc $gene_name : "";
                $transcript_id++;
                $cds_written = 0;

                # Skip genes that start before the previous accepted gene
                # ended. The comparison uses the real (unpadded, unclamped)
                # end, so it stays correct even when the previous window was
                # clamped to the sequence length.
                my $gene_start = $feature->location->start;
                if ($gene_start < $prev_gene_end) {
                    $gene = "";
                    next;
                }
                my $gene_end = $feature->location->end;
                $prev_gene_end = $gene_end;

                # Pad by 200 bases on both sides, staying inside the sequence.
                $start_gene = $gene_start - 200;
                $start_gene = 1 if $start_gene < 1;
                my $end_gene = $gene_end + 200;
                $end_gene = $seq->length() if $end_gene > $seq->length();

                $s = $seq->subseq($start_gene, $end_gene);
            }
            elsif ($tag eq 'CDS') {
                # Use only the first CDS of an accepted gene; it carries all
                # exon locations.
                next if $cds_written || $gene eq "";
                $cds_written = 1;

                my $record_id = $organism_gene . "_" . $gene;
                my ($fasta_out, $gtf_out) = ($gene eq $target_gene)
                                          ? (\*INFO,  \*INFO2)
                                          : (\*INFO3, \*INFO4);

                # A CDS exists, so this is not a tRNA gene: write the gene
                # sequence to the specific and to the "all" FASTA file.
                print {$fasta_out} ">$record_id\n$s\n";
                print INFO5 ">$record_id\n$s\n";

                # Writes one GTF CDS line to the specific and the "all" GTF.
                my $emit_cds = sub {
                    my ($start_cds, $end_cds, $str, $frame) = @_;
                    my $line = join("\t",
                        $record_id, '.', 'CDS', $start_cds, $end_cds, '.',
                        $strand[$str], $frame,
                        "gene_id \"$organism_gene\"; transcript_id \"$transcript_id\"; filename \"$filename\"; gene \"$gene\";",
                    ) . "\n";
                    print {$gtf_out} $line;
                    print INFO6 $line;
                };

                my $location = $feature->location;
                # Index into @strand: 0 = minus, 1 = unknown, 2 = plus.
                my $str = $location->strand + 1;

                if ($location->isa('Bio::Location::SplitLocationI')) {
                    # NOTE(review): the stop-codon trimming and the minus-strand
                    # frame computation assume sub_Location returns the
                    # segments in ascending coordinate order -- confirm for
                    # the installed BioPerl version.
                    my @parts   = $location->sub_Location;
                    my $frame   = 0;
                    my $part_no = 0;
                    foreach my $part (@parts) {
                        $part_no++;
                        my $start_cds = $part->start - $start_gene + 1;
                        my $end_cds   = $part->end   - $start_gene + 1;

                        # Exclude the stop codon: end of the last segment on
                        # the plus strand, start of the first segment on the
                        # minus strand.
                        if ($str == 2 && $part_no == scalar @parts) {
                            $end_cds -= 3;
                        }
                        if ($str == 0 && $part_no == 1) {
                            $start_cds += 3;
                        }

                        # Minus strand: the frame of this segment depends on
                        # the segments already seen, so update before printing.
                        if ($str == 0) {
                            $frame = ($end_cds - $start_cds + 1 - (3 - $frame) % 3) % 3;
                        }

                        $emit_cds->($start_cds, $end_cds, $str, $frame);

                        # Plus strand: this segment's length determines the
                        # frame of the next one, so update after printing.
                        if ($str == 2) {
                            $frame = (3 - ($end_cds - $start_cds + 1 - $frame) % 3) % 3;
                        }
                    }
                }
                else {
                    my $start_cds = $location->start - $start_gene + 1;
                    my $end_cds   = $location->end   - $start_gene + 1;
                    # Exclude the stop codon. Any strand other than plus is
                    # handled like the minus strand here.
                    if ($str == 2) {
                        $end_cds -= 3;
                    }
                    else {
                        $start_cds += 3;
                    }
                    $emit_cds->($start_cds, $end_cds, $str, 0);
                }
            }
        }
    }
}

# Close every output handle, not just the first one, and check the result:
# buffered write errors (for example a full disk) only surface on close.
my @close_errors;
foreach my $output (
    [ \*INFO,  $file_out_fasta ],
    [ \*INFO2, $file_out_gtf ],
    [ \*INFO3, $file_out_ex_fasta ],
    [ \*INFO4, $file_out_ex_gtf ],
    [ \*INFO5, $file_out_all ],
    [ \*INFO6, $file_out_all_gtf ],
) {
    my ($handle, $path) = @{$output};
    next if close($handle);
    push @close_errors, "Error closing '$path': $!";
}
# Report all failures together so none is hidden by an earlier one.
die join("\n", @close_errors) . "\n" if @close_errors;

# Return the value of the annotation stored under $key in $collection.
#
# $collection must provide get_Annotations($key); each returned element must
# provide value(). When the key does not resolve to exactly one annotation
# (none, or more than one), nothing is returned: undef in scalar context and
# an empty list in list context.
sub get_value {
    my ($collection, $key) = @_;

    my @matches = $collection->get_Annotations($key);
    if (scalar(@matches) != 1) {
        return;
    }

    my ($annotation) = @matches;
    return $annotation->value();
}

 
