#! /usr/bin/perl -w

use strict;


my $USAGE = "
$0 <fasta> <gap threshold> > gtf-out

Will create a gtf file showing coordinates of stretches of
N's long at least threshold. Feature will be called gap.
";

die $USAGE unless @ARGV == 2;
my ($fasta, $threshold) = @ARGV;

open FASTA, "<$fasta" or die "Cannot open $fasta";
while(1) {
    my ($name, $seq) = read_fasta(\*FASTA);
    last unless defined $name;
    $name = normalize_fasta_name($name);

    my $n = length($$seq);
    my $last = -1;
    
    for(my $i=0; $i<=$n; $i++) {
	my $c;
	if($i<$n) { $c = uc substr($$seq, $i, 1); }
	else { $c = 'A'; }

	if($c ne 'N') {
	    if($i - $last - 1 >= $threshold) {
		#gap $last+1..$i-1, change coordinates to start from 1
		my ($start, $end) = ($last+2, $i);

		printf "%s\t%s\t%s\t%d\t%d\t.\t.\t.\n",
		    $name, 'gap', 'gap', $start, $end;
	    }
	    $last = $i;
	}
    }
}

close FASTA or die;

############################
sub read_fasta {
    # Read one fasta sequence from the fasta file
    # Return undef, if no sequence found;
    #        name and reference to sequence otherwise.

    my ($input) = @_;

    # read name of fasta sequence
    my $name;
    while(1) {
        my $line = <$input>;

        # end of file
        return unless defined $line;

        # skip empty lines
        next if $line =~ /^\s*$/;

        # parse the name
        $line =~ s/\s+$//;
        if($line =~ /^>/) {
            $name = $line;
            last;
        }
        else { die "Incorrect fasta file '$line'."; }
    }

    # read fasta sequence
    my $sequence = "";
    while(1) {
        my $file_pos = tell($input);
        my $line = <$input>;

        # end of file
        last unless defined $line;

        # if there is a beginning of a new sequence
        if($line =~ /^>/) {

	    seek($input, $file_pos, 0);
	    last;
	}

	# remove all whitespaces from line and append it to the sequence
	$line =~ s/\s//g;
	$sequence .= $line;
    }
    
    return ($name, \$sequence);
}

############################
sub normalize_fasta_name
{
    my ($name) = @_;

    $name =~ s/\s+$//;          # remove trailing white space
    $name =~ s/^>\s*//;         # remove > at the beginning of line
    $name =~ s/^(\S+)\s.*$/$1/; # remove everything after first space

    #if it has gi, take only number
    if ( $name =~ /gi\|(.*)\|/ ) {
        $name = $1;
    }

    return $name;
}


