#! /usr/bin/perl -w
use strict;

# Usage text; printed via die when the script is started without arguments.
my $USAGE = "$0 <file> <which_element1> <which_element2> ... \n

Parse RNABob output file stored in <file> to fasta format
containing the found regions.
If run without any <which_elements> then it outputs all elements.
Otherwise, it selects only those elements specified on the input
(elements are numbered 1,2,... in the order in which they appear
on the first line of the descriptor).
";

# Command line: the input file name comes first, every further argument
# is the 1-based number of an element to keep.
die $USAGE if @ARGV < 1;

my $filename = $ARGV[0];
my @which    = @ARGV[1 .. $#ARGV];

# Reject anything that is not a positive decimal integer; the digits-only
# test runs first so the numeric comparison never sees a non-number.
for my $num (@which) {
    next if $num =~ /^[0-9]+$/ && $num > 0;
    die "Wrong which_element '$num'";
}

# Open the RNABob report for reading.  The three-argument form keeps the
# mode separate from the name, so a file name with leading/trailing
# whitespace or mode characters is opened literally instead of being
# interpreted by open().  $! is included so the user sees why it failed.
open my $file, '<', $filename
    or die "Cannot open $filename: $!";

# Scan the report and print one FASTA record per reported occurrence.
#
# The loop is a two-state machine driven by $active:
#   0 - skipping text until a table title line starting with " seq-f",
#       which must be followed by a line of dashes/whitespace;
#   1 - inside a table, where every occurrence takes two lines: a line
#       that make_header() turns into the FASTA header, followed by a
#       line holding the matched elements separated by '|' (the text
#       before the first '|' must be empty).
# An empty line ends the table and switches back to state 0.
#
# Dies with "Wrong format" on malformed lines, "Unexpected end of file"
# when the input stops in the middle of a table header or a record, and
# "Part N not found" when a requested element is missing.
my $active = 0;
while (my $line = <$file>) {
    chomp $line;

    if (!$active) {
        # start of occurrences, end of file header
        next unless $line =~ /^ seq-f/;

        # next line should contain the dashes on top of the table
        my $rule = <$file>;
        die "Unexpected end of file" unless defined $rule;
        die "Wrong format" unless $rule =~ /^[- \t]*\s*$/;
        $active = 1;
        next;
    }

    # An empty line closes the table.  Compare against "" instead of
    # testing truth so that a line consisting of "0" is not mistaken
    # for an empty one.
    if ($line eq "") {
        $active = 0;
        next;
    }

    # Parse the coordinate line first; the record is printed only after
    # both of its lines have been validated, so a malformed record never
    # leaves a dangling header in the output.
    my $header = make_header($line);

    # read another line with the sequence
    my $content = <$file>;
    die "Unexpected end of file" unless defined $content;
    chomp $content;

    my @parts = split /\|/, $content;

    # first part should be empty (split yields no fields at all for an
    # empty line, which is let through as before)
    die "Wrong format" if @parts && $parts[0] ne "";

    # select parts of the output of interest
    my @subparts;
    if (@which) {
        for my $col (@which) {
            die "Part $col not found" unless exists $parts[$col];
            push @subparts, $parts[$col];
        }
    }
    else {
        # everything except the empty leading field
        @subparts = @parts[1 .. $#parts];
    }

    # print the header and the selected parts joined into one sequence
    print ">", $header, "\n";
    print join("", @subparts), "\n";
}

close $file;

# Build the FASTA header text (without the leading '>') for one
# occurrence line.
#
# The line is split on whitespace; the first two fields are used as the
# ends of the region and the third as the sequence name.  Any further
# fields are ignored.  The result has the form "name:first-second".
#
# If the name looks like "gi|...|...|ID|...", only the fourth
# pipe-delimited field (ID) is used as the name.
# NOTE(review): in NCBI-style identifiers that field is presumably the
# accession rather than the gi number -- confirm.
#
# Dies with "Wrong format" when the line has fewer than three fields.
sub make_header {
    my ($line) = @_;

    # split " " skips leading whitespace and drops trailing empty fields
    my @fields = split " ", $line;
    die "Wrong format" if @fields < 3;

    my ($first, $second, $name) = @fields;

    # shorten gi-style identifiers to their fourth field
    my $label = ($name =~ /^gi\|[^\|]*\|[^\|]*\|([^\|]*)\|/) ? $1 : $name;

    return sprintf("%s:%s-%s", $label, $first, $second);
}
