#! /usr/bin/perl -w
#extract the protein sequence of the specified gene; if no translation is present, translate the DNA sequence; annotate intron positions
#argument 0: name of gene to extract protein from

use strict;
use Bio::SeqIO;
use Data::Dumper;

# Genetic code table id passed to translate() when a CDS carries no
# 'translation' annotation and must be translated from its DNA
# (id 3 is the yeast mitochondrial code in the NCBI numbering).
my $code = 3;

# The gene name is mandatory: it selects the CDS features and names the output.
die "usage: $0 GENE_NAME\n" unless defined $ARGV[0] && length $ARGV[0];

my $file = 'cds/'.$ARGV[0].'_prot.fa';

# The 'gene' annotation is upper-cased before it is compared, so upper-case
# the requested name too; otherwise a lower-case argument could never match.
my $wanted_gene = uc $ARGV[0];

# Three-argument open on a lexical handle, and fail loudly (e.g. when the
# cds/ directory does not exist) instead of silently writing nothing.
open(my $out, '>', $file) or die "cannot open '$file' for writing: $!\n";

foreach my $filename (glob("genbank/*.gb")) {
    my $stream = Bio::SeqIO->new(-file   => $filename,
                                 -format => 'GenBank');

    # Header label: the file name with '/', ' ', '.' (and literal '[' / ']',
    # which the original transliteration list also contained) turned into '_'.
    (my $label = $filename) =~ tr/[\/ \.]/_/;

    while ( my $seq = $stream->next_seq() ) {

        my $organism = 'unknown';   # used when no usable 'organism' value is found
        my $protein;                # stays undef until a matching CDS is found
        my $introns  = "";

        foreach my $feature ($seq->get_SeqFeatures()) {
            my $tag = $feature->primary_tag();

            if ($tag eq 'source') {
                my $source_ac = $feature->annotation();

                # Abbreviate "Genus species ..." to the first 3 letters of the
                # genus plus the first 2 of the species. Splitting (instead of
                # reading $1/$2 after an unchecked match) avoids re-using stale
                # captures when the organism is missing or a single word.
                my $name = get_value($source_ac, 'organism');
                if (defined $name) {
                    my ($genus, $species) = split ' ', $name;
                    if (defined $genus) {
                        $organism = substr($genus, 0, 3);
                        $organism .= substr($species, 0, 2) if defined $species;
                    }
                }

                # Append the strain, stripped of non-word characters, in upper case.
                my $strain = get_value($source_ac, 'strain');
                if (defined $strain) {
                    $strain =~ s/\W+//g;
                    $organism .= uc $strain;
                }
                next;
            }

            next unless $tag eq 'CDS';

            # Only the first matching CDS of each record is reported.
            next if defined $protein;

            my $ac   = $feature->annotation();
            my $gene = get_value($ac, 'gene');
            next unless defined $gene && uc($gene) eq $wanted_gene;

            # Prefer the annotated translation; translate the DNA only when
            # the record does not provide one.
            my $translation      = get_value($ac, 'translation');
            my $have_translation = defined $translation;
            my $location         = $feature->location;

            if ($location->isa('Bio::Location::SplitLocationI')) {
                my $start_gene   = $feature->start();
                my $total_length = 0;    # summed length of the sub-locations so far
                my $cds_dna      = "";
                my @intron_positions;

                for my $exon ($location->sub_Location) {
                    # Every sub-location after the first is preceded by an
                    # intron; record the 1-based offset (within the joined
                    # coding sequence) at which this sub-location begins.
                    push @intron_positions, $total_length + 1 if $total_length > 0 || @intron_positions;

                    $total_length += $exon->end - $exon->start + 1;

                    if (!$have_translation) {
                        # Coordinates relative to the feature's own sequence.
                        my $start_cds = $exon->start - $start_gene + 1;
                        my $end_cds   = $exon->end   - $start_gene + 1;
                        $cds_dna .= $feature->seq()->trunc($start_cds, $end_cds)->seq();
                    }
                }
                $introns = join(',', @intron_positions);

                $protein = $have_translation
                    ? $translation
                    : Bio::Seq->new(-seq => $cds_dna)->translate(-codontable_id => $code)->seq();
            }
            else {
                $introns = "";
                $protein = $have_translation
                    ? $translation
                    : $feature->seq()->translate(-codontable_id => $code, -frame => 0)->seq();
            }

            #remove stop codon
            $protein =~ s/\*$//;
        }

        if (defined $protein) {
            print {$out} ">$label /organism=$organism {i $introns i}\n$protein\n"
                or die "cannot write to '$file': $!\n";
        }
    }
}

# Buffered write errors only surface at close, so check it.
close($out) or die "cannot close '$file': $!\n";

# Look up the annotation stored under $key in $collection.
#
#   $collection - object providing get_Annotations($key)
#   $key        - annotation key to look up
#
# Returns the result of value() on the annotation when exactly one annotation
# is stored under $key. With zero or several annotations it returns nothing
# (undef in scalar context, the empty list in list context).
sub get_value {
    my ($collection, $key) = @_;

    my @found = $collection->get_Annotations($key);
    if (scalar(@found) != 1) {
        return;
    }

    my ($only) = @found;
    return $only->value();
}

