#! /usr/bin/perl -w

use strict;
use Data::Dumper;
use Getopt::Long;
#use File::Temp;
use File::Path;
use File::Copy;

use FindBin qw($Bin);   #add directory with the script to the library path
use lib "$Bin";
use shared;


# Usage message. It is printed (via die) when option parsing fails or when
# fewer than four positional arguments remain on the command line.
# $0 is interpolated, so the message shows the name the script was run as.
my $USAGE = "
$0 [<options>] <task> <configuration file> <output prefix> 
   <query fasta> <database file>*

<task> is one of protein, est, genome, repeat, etc.
<database file> is typically fasta file with sequences; 
    there can be several or none.
<query fasta> should be typically masked, unless the task is repeat

Options:
--set <variable>=<value> 
           will override value of option <variable>
           from the configuration file (can be used multiple times).
--debug    will not delete temporary files.
--restart <directory>
           will restart the process after a failure,
           given a temporary directory used by the external program.
           If directory is an empty string, tries to find one that is unique match.
--continue Similar as restart with empty param, 
           but if there is no matching directory, start from scratch.
";

#GLOBAL VARIABLES
# These file-scope lexicals are read directly by the subroutines below.
my $Debug = 0;      #keep temporary files?
my $Restart;        #restart after a failure? (directory name, or '' to search)
my $Continue;       #continue computation?
my $Task;           #task to be done
my $Output_prefix;  #prefix of output files
my $Query_filename; #name of the fasta file
my @Databases;      #list of databases with evidence
my %Options;        #options obtained by a combination of config file
                    #and command-line options

my $Temp_dir;       #name of temporary directory
my $Tool;           #name of the tool to use for current task

# parse options and remove them from @ARGV
my @rewrite_options;   #raw <variable>=<value> strings given through --set
my $ret = GetOptions("set=s" => \@rewrite_options,
		     "debug" => \$Debug, 
		     "restart=s" => \$Restart, "continue" => \$Continue);

# at least four parameters should remain in @ARGV
if(!$ret || scalar(@ARGV)<4) { die $USAGE; }
my $config_filename;
# everything after the fourth positional argument is a database file
($Task, $config_filename, $Output_prefix, $Query_filename, @Databases) = @ARGV;

my @config_options;  #lines of the config file
@config_options = read_config($config_filename);
# config file first, then --set values; presumably the later call overrides
# the earlier one (as the usage text promises) -- confirm in parse_options
parse_options(\%Options, \@config_options);
parse_options(\%Options, \@rewrite_options);

# log the start time
my $now1 = localtime;
print STDERR "Start $now1\n";

# Choose (and if necessary create) the directory for temporary files.
# It will be deleted at the end of a successful run unless $Debug is true.
#
# The directory name is "temp-<tail of sanitized output prefix>-<number>".
# Three cases are handled:
#  - --restart '' or --continue: look for a unique existing directory
#    with the expected name prefix; --restart dies if none exists,
#    --continue falls through to creating a new one;
#  - --restart <dir>: use the directory given on the command line;
#  - otherwise: create a new directory with a random numeric suffix.
my $tmp_name = $Output_prefix;
$tmp_name =~ s/\W+/-/g;
$tmp_name = "temp-" . substr($tmp_name, -20) . "-";
my $create = 1;  #should we create a new temp dir?
if((defined $Restart && $Restart eq '') || defined $Continue) {
    #if restarting, look for a temporary directory
    my @dirs = glob("${tmp_name}[0-9]*");
    die "Multiple candidate temporary directories '"
        . join("' '", @dirs) . "'"
        unless @dirs<=1;
    if(@dirs>0) {
        $Temp_dir = $dirs[0];
        $create = 0;
    }
    elsif(defined $Restart) {
        die "Cannot find temporary directory starting with $tmp_name";
    }
}
if(defined($Restart) && $Restart ne '') {
    #temporary directory given on command line
    $Temp_dir = $Restart;
    $create = 0;
}
if(!defined $Temp_dir) {
    # temporary directory not found or given, make a new one
    $Temp_dir = $tmp_name . int(rand(999999));
}

# The directory (found, given or generated) must match the expected naming
# scheme. \Q..\E makes sure the prefix is matched literally.
die "Wrong temporary directory name '$Temp_dir'"
    unless defined $Temp_dir && $Temp_dir =~ /^\Q$tmp_name\E\d+$/;

if($create) {
    #create a new temporary directory; report the OS error on failure
    mkdir($Temp_dir)
        or die "Cannot create temporary directory $Temp_dir: $!";
    print STDERR "Created temporary directory $Temp_dir\n";
}
die "'$Temp_dir' not a directory" unless -d $Temp_dir;
#from now on restart and continue work the same
if(defined $Continue) { $Restart = ''; }

#determine the tool to use: it is stored in option PROGRAM_<TASK>
$Tool = get_option("PROGRAM_" . uc $Task);

#tools handled by chunk_and_make:
#tool name => [makefile writer, output parser]
my %chunked_tool_handlers = (
    "repeatmasker" => [\&write_make_repeatmasker, \&parse_repeatmasker],
    "blastx"       => [\&write_make_blastx,       \&parse_blastx],
    "rpsblast"     => [\&write_make_rpsblast,     \&parse_rpsblast],
    "blat"         => [\&write_make_blat,         \&parse_blat],
    "blatp"        => [\&write_make_blatp,        \&parse_blatp],
    "sim4"         => [\&write_make_sim4,         \&parse_sim4],
    "phsim4"       => [\&write_make_phsim4,       \&parse_sim4],
    "wublastsim4"  => [\&write_make_wublastsim4,  \&parse_sim4],
    "ph"           => [\&write_make_ph,           \&parse_ph],
    "wublast"      => [\&write_make_wublast,      \&parse_wublast],
    "exonerate"    => [\&write_make_exonerate,    \&parse_exonerate],
);

if($Tool eq "none") {
    tool_none();
}
elsif($Tool eq "find_gaps") {
    run_find_gaps();
}
elsif(exists $chunked_tool_handlers{$Tool}) {
    my ($write_make, $parse) = @{$chunked_tool_handlers{$Tool}};
    chunk_and_make($write_make, $parse);

    #repeatmasker additionally produces the masked fasta file
    copy_masked() if $Tool eq "repeatmasker";
}
else {
    die "Tool $Tool not supported";
}

#we only get here if nothing above died: remove the temporary
#directory, unless the user asked to keep it with --debug
unless($Debug) {
    print STDERR "Deleting the temporary directory.\n";
    File::Path::rmtree($Temp_dir);
}

#log the finish time and report success
my $now2 = localtime;
print STDERR "Done $now2\n";
exit 0;


############################
# Handle the pseudo-tool 'none', which runs no external program.
#  - no database:  nothing is done; a warning is printed if
#                  $Output_prefix.gtf is not readable;
#  - one database: it is taken to be a gtf file and is copied to
#                  $Output_prefix.gtf;
#  - more than one database: dies.
sub tool_none {
    die "Program 'none' can support at most one database.\n"
        if @Databases > 1;

    my $gtf_name = "$Output_prefix.gtf";

    if(!@Databases) {
        print STDERR "Do nothing: tool is 'none', no database given.\n";
        if(!-r $gtf_name) {
            print STDERR "Warning: $gtf_name does not exist.\n";
        }
        return;
    }

    #exactly one database: the file is copied to the output gtf
    #(not to the temporary directory)
    print STDERR "Copy one gtf file ($Databases[0]) to $gtf_name.\n";
    copy($Databases[0], $gtf_name)
        or die "Copy of $Databases[0] to $gtf_name failed: $!";
}

############################
# Run the find_gaps.pl script (located next to this script) on the query
# fasta file with the threshold from task option FIND_GAPS_THRESHOLD;
# its standard output is redirected to $Output_prefix.gtf.
sub run_find_gaps {
    my $gap_threshold = get_task_option("FIND_GAPS_THRESHOLD");

    my $command = sprintf("%s/find_gaps.pl %s %s >%s.gtf",
                          $Bin, $Query_filename, $gap_threshold,
                          $Output_prefix);
    my_run($command);
}

############################
# Write the makefile rules that run RepeatMasker on every chunk.
#
# Parameter: $chunk_num - number of chunks, passed on to init_makefile
#            (which presumably opens the makefile on the MAKEFILE handle
#            and writes the common part -- see shared module).
#
# The repeat library is either the single database given on the command
# line (-lib) or the species from option REPEATMASKER_SPECIES (-species);
# specifying both, or more than one database, is an error.
sub write_make_repeatmasker {
    my ($chunk_num) = @_;

    local *MAKEFILE;
    init_makefile($chunk_num, 1, 1, \*MAKEFILE);

    my $options = get_task_option("REPEATMASKER_OPTIONS");
    my $species = get_task_option("REPEATMASKER_SPECIES");
    my $prog = get_option("REPEATMASKER");

    my $library;
    if(@Databases) {
        die "RepeatMasker can support at most one database at a time"
            unless @Databases == 1;

        die "You cannot specify both species and database in RepeatMasker"
            unless $species eq '';

        $library = '-lib ' . adjust_relpath($Databases[0]);
    }
    else {
        $library = '-species ' . $species;
    }

    #rule for running repeatmasker (goal: done1)
    print MAKEFILE "seq%-1.done1 : seq%.fasta\n";

    #plain print, not printf: the command line contains interpolated
    #option values and paths, which must not be treated as a format
    print MAKEFILE "\t$prog -pa 1 $library $options -nocut -dir . "
        . " seq\$*.fasta >seq\$*.log 2>seq\$*.log2\n";
    print MAKEFILE "\techo '' >seq\$*-1.done1\n\n";

    close MAKEFILE or die "Cannot close makefile: $!";
}

############################
# Assemble the masked query sequences produced by RepeatMasker into
# $Output_prefix.fasta.
#
# Each chunk file seq<chunk>.fasta.masked in the temporary directory must
# contain exactly one (concatenated) sequence. It is cut back into the
# original sequences using the coordinates read from "$Temp_dir/list".
#
# Does nothing unless the output prefix is exactly 'repeat'.
# Returns 0 after writing the file; dies on any inconsistency.
sub copy_masked {

    #for any other output prefix create only gtf file
    return unless $Output_prefix eq 'repeat';

    #read sequence coordinates
    #usage: $chunk_coordinates->[$chunk][$record]{start/end/name}
    my $chunk_coordinates = read_chunk_coordinates("$Temp_dir/list");

    #localize both handles so that handles of callers are not clobbered
    local *OUT;
    local *IN;

    my $out_name = "$Output_prefix.fasta";
    open OUT, ">", $out_name
        or die "Cannot open $out_name for writing: $!";

    #for each chunk parse result, split to orig. sequences and print
    #(chunks are numbered from 1, index 0 is skipped)
    foreach my $chunk (1..scalar(@$chunk_coordinates)-1) {

        #read masked sequence (concatenated)
        my $filename = "$Temp_dir/seq$chunk.fasta.masked";
        open IN, "<", $filename or die "Cannot open $filename: $!";
        my ($name, $masked) = read_fasta(\*IN);
        die "Empty masked sequence in $filename" unless defined $name;

        #check that nothing else in the file
        my ($name2) = read_fasta(\*IN);
        die "Strange masked file $filename" unless !defined $name2;

        close IN or die "Cannot close $filename: $!";

        #write fasta seq for all pieces in this chunk
        foreach my $seq_rec (@{$chunk_coordinates->[$chunk]}) {
            my ($start, $end) = ($seq_rec->{'start'}, $seq_rec->{'end'});

            die "Masked sequence in $filename is shorter than $end"
                unless length($$masked)>=$end;

            #coordinates are 1-based and inclusive
            my $piece = substr($$masked, $start-1, $end-$start+1);
            write_fasta(\*OUT, $seq_rec->{'name'}, \$piece);
        }
    }

    close OUT or die "Cannot close $out_name: $!";
    return 0;
}

############################
# Parse the RepeatMasker output (seq<chunk>.fasta.out in the temporary
# directory) of one chunk.
#
# Parameter: $chunk - chunk number.
# Returns a reference to an array of records
#   { start, end, gtf1, gtf2 }
# where gtf1 is "<tool>\t<class>" and gtf2 holds score, strand (always
# '+'), frame and attributes. Repeats are classified as
#   ignore (score -1):       low complexity regions and simple repeats
#                            whose unit length is a multiple of 3,
#   simple_repeat (score 0): other simple repeats,
#   satellite (score 1),
#   other_repeat (score 2).
# Dies if the file cannot be opened or has an unexpected format.
sub parse_repeatmasker {
    my ($chunk) = @_;

    my @result;

    local *IN;
    my $filename = "$Temp_dir/seq$chunk.fasta.out";
    open IN, "<", $filename
        or die "Cannot open repeatmasker output '$filename'";

    #Look at the first line. It is either a "There were no ..." message,
    #the start of the three-line header, or (in a file without a header)
    #already the first record. In the last case the line is kept
    #for the main loop instead of being thrown away.
    my @pending;
    my $first_line = <IN>;
    if(defined $first_line && $first_line !~ /^There were no/) {
        if($first_line =~ /^\s*SW/) {
            #file has a header with column meanings explained - skip it
            my $header = <IN>;
            die "Bad format of repeatmasker output"
                unless defined $header && $header =~ /^\s*score/;

            $header = <IN>;
            die "Bad format of repeatmasker output"
                unless defined $header && $header =~ /^\s*$/;
        }
        else {
            push @pending, $first_line;
        }
    }

    #read repeatmasker output
    while(defined(my $line = @pending ? shift(@pending) : <IN>)) {
        my @parts = split ' ', $line;
        die "Bad format of repeatmasker output" unless scalar(@parts)>=14;

        #columns used: start and end in query, name of matching repeat,
        #repeat class/family
        my ($start, $end, $sub, $type) = @parts[5,6,9,10];
        my ($class, $score);

        #ignore low complexity regions
        if($type eq 'Low_complexity') {
            $class = 'ignore'; $score = -1;
        }
        elsif($type eq 'Simple_repeat') {
            #the repeated unit is given as "(unit)n"
            my $pattern;
            if ($sub =~ /^\((\S+)\)n$/) {
                $pattern = $1;
            }
            else {
                warn "Unknown simple repeat '$sub'";
            }
            #units whose length is a multiple of 3 are ignored as well
            if(defined $pattern && length($pattern)%3 == 0) {
                $class = 'ignore'; $score = -1;
            }
            else {
                $class = 'simple_repeat'; $score = 0;
            }
        }
        elsif(index($type, 'Satellite')>=0) {
            $class = 'satellite'; $score = 1;
        }
        else {
            $class = 'other_repeat'; $score = 2;
        }

        my $gtf1 = "$Tool\t$class";
        my $gtf2 = "$score\t+\t.\tfamily \"$type\"; info_name \"$sub\"";

        push @result, {'start' => $start, 'end' => $end,
                       'gtf1' => $gtf1, 'gtf2' => $gtf2 };
    }
    close IN;
    return \@result;
}


############################
# Write makefile rules running ph on every chunk against every database.
# Parameter: $chunk_num - number of chunks, passed on to init_makefile.
# For database number N the rule creates seq<chunk>-N.ph (plus logs)
# and the marker file seq<chunk>-N.done1.
sub write_make_ph {
    my ($chunk_num) = @_;

    my $db_count = scalar @Databases;

    local *MAKEFILE;
    init_makefile($chunk_num, 1, $db_count, \*MAKEFILE);

    #in the recipes the make variable $* stands for the chunk number
    my $stem = "seq\$*";

    for my $db (1 .. $db_count) {
        #target of the rule (goal: done1)
        print MAKEFILE "seq%-${db}.done1 : seq%.fasta\n";

        #command running ph
        my @fields = (
            get_option("PH"), get_task_option("PH_OPTIONS"),
            "${stem}.fasta", adjust_relpath($Databases[$db-1]),
            "${stem}-${db}.ph", "${stem}-${db}.log", "${stem}-${db}.log2",
        );
        printf MAKEFILE "\t%s %s -b -i %s -j %s -o %s >%s 2>%s\n", @fields;

        #command creating the marker file
        print MAKEFILE "\techo '' >${stem}-${db}.done1\n\n";
    }

    close MAKEFILE or die;
}

############################
# Make sure that every database is formatted for wublast.
# A database counts as formatted if its .xnd, .xns and .xnt files are all
# readable; otherwise the program from option XDFORMAT is run on it,
# with its output going to formatdb.log/.log2 in the temporary directory.
sub format_wublast {

    for my $db_file (@Databases) {
        my @missing = grep { !-r "$db_file.$_" } qw(xnd xns xnt);

        unless(@missing) {
            print STDERR "DB $db_file already formatted\n";
            next;
        }

        my $xdformat = get_option("XDFORMAT");
        my_run("$xdformat -n $db_file >$Temp_dir/formatdb.log"
               . " 2>$Temp_dir/formatdb.log2");
    }
}

############################
# Write makefile rules running wublast on every chunk against every
# database (the databases are formatted first if necessary).
# Parameter: $chunk_num - number of chunks, passed on to init_makefile.
# For database number N the rule creates seq<chunk>-N.wublast (plus logs)
# and the marker file seq<chunk>-N.done1.
sub write_make_wublast {
    my ($chunk_num) = @_;

    format_wublast();
    my $db_count = scalar @Databases;

    local *MAKEFILE;
    init_makefile($chunk_num, 1, $db_count, \*MAKEFILE);

    #in the recipes the make variable $* stands for the chunk number
    my $stem = "seq\$*";

    for my $db (1 .. $db_count) {
        #target of the rule (goal: done1)
        print MAKEFILE "seq%-${db}.done1 : seq%.fasta\n";

        #command running wublast with tabular output (mformat=3)
        my @fields = (
            get_option("WUBLASTN"),
            adjust_relpath($Databases[$db-1]),
            "${stem}.fasta",
            get_task_option("WUBLAST_OPTIONS"),
            "${stem}-${db}.wublast", "${stem}-${db}.log",
            "${stem}-${db}.log2",
        );
        printf MAKEFILE
            "\t%s %s %s %s mformat=3 shortqueryok -O %s >%s 2>%s\n",
            @fields;

        #command creating the marker file
        print MAKEFILE "\techo '' >${stem}-${db}.done1\n\n";
    }

    close MAKEFILE or die;
}


############################
# Parse tabular wublast output (seq<chunk>-<db>.wublast in the temporary
# directory) for each database and return gtf records.
#
# Parameter: $chunk - chunk number.
# Returns a reference to an array of records { start, end, gtf1, gtf2 }.
# Comment lines (starting with #) and lines with fewer than 22
# tab-separated fields are skipped.
#
# Columns of the tabular format (numbered from 1):
#  1  qid     query sequence
#  2  sid     subject (database) sequence
#  3  E       the expectation or E-value
#  4  N       num. of scores considered jointly in computing E
#  5  Sprime  the normalized alignment score in bits
#  6  S       the raw alignment score
#  7  alignlen len. of the alignment including gaps
#  8  nident  the number of identical letter pairs
#  9  npos    the number of letter pairs with a positive score
#  10 nmism   the number of mismatched letter pairs
#  11 pcident % identity (fraction of alignlen)
#  12 pcpos   % positive letter pairs (fraction of alignlen)
#  13 qgaps   number of gaps in the query sequence
#  14 qgaplen total length of all gaps in the query sequence
#  15 sgaps   number of gaps in the subject sequence
#  16 sgaplen total length of all gaps in the subject sequence
#  17 qframe  the reading frame in the query sequence
#  18 qstart  the starting coordinate in the query sequence
#  19 qend    the ending coordinate in the query sequence
#  20 sframe  the reading frame in the subject sequence
#  21 sstart  the starting coordinate in the subject sequence
#  22 send    the ending coordinate in the subject sequence
sub parse_wublast {
    my ($chunk) = @_;

    my @result;

    my $db_num = scalar @Databases;
    foreach my $db (1..$db_num) {

        local *IN;
        my $filename = "$Temp_dir/seq$chunk-$db.wublast";
        open IN, "<", $filename
            or die "Cannot open wublast output '$filename'";

        while(my $line = <IN>) {

            next if($line =~ /^\#/);
            $line =~ s/\s+$//;
            my @parts = split "\t", $line;
            next unless @parts >= 22;

            #array indices are 0-based: sid, S, qstart, qend, sstart, send
            my ($info_name, $score, $start, $end, $info_start, $info_end)
                = @parts[1, 5, 17, 18, 20, 21];

            my ($strand, $info_strand) = ('+', '+');

            #start > end marks the reverse strand; gtf needs start <= end
            if($start > $end) {
                $strand = '-';
                ($start, $end) = ($end, $start);
            }

            if($info_start > $info_end) {
                $info_strand = '-';
                ($info_start, $info_end) = ($info_end, $info_start);
            }

            my $gtf1 = "wublast\texon";
            my $gtf2 = "$score\t$strand\t.\tinfo_name \"$info_name\"; "
                . "info_start \"$info_start\"; info_end \"$info_end\"; "
                . "info_strand \"$info_strand\";";

            push @result, {'start' => $start, 'end' => $end,
                           'gtf1' => $gtf1, 'gtf2' => $gtf2 };

        }
        close IN or die;
    }
    return \@result;
}


############################
sub parse_ph {
    my ($chunk) = @_;

    # Parse ph output from files for each database, 
    # and return gtf records.
    #
    # Parameter: $chunk - chunk number; the files read are
    #            $Temp_dir/seq<chunk>-<db>.ph for db = 1..number of databases.
    # Returns a reference to an array of hashes with keys
    # start, end, gtf1, gtf2.

    my @result;

    my $db_num = scalar @Databases;
    foreach my $db (1..$db_num) {

	local *IN;
	my $filename = "$Temp_dir/seq$chunk-$db.ph";
	open IN, "<$filename" or die "Cannot open ph output '$filename'";

	# name and length of the database sequence the following
	# alignment lines refer to
	my $info_name;
	my $info_len;

	while(my $line = <IN>) {

	    if($line =~ /^>\s*(\S+)/) {
		#line announcing new matching sequence 
                #- use only part until the first whitespace
		
		$info_name = $1;

		#read the next line with the length of the sequence
		#(lines before the "Length = ..." line are skipped, but
		#a new '>' line or an alignment line there is an error)
		$line = <IN>;
		while(defined $line 
		      && !($line =~ /^\s*Length\s*=\s*(\S+)\s*$/)) {
		    die if($line =~ /^>/ || $line =~ /^[0-9]+:\(/);
		    $line = <IN>;
		}

		#end of file reached without finding the length
		die unless defined $line;
		#match once more to capture the length in $1
		die $line . " " unless $line =~ /^\s*Length\s*=\s*(\S+)\s*$/;
		$info_len = $1;
	    }
	    elsif($line =~ / ^ [0-9]+ : \( ([-0-9]+) , ([-0-9]+) \)
		  < ([0-9]+) , ([0-9]+) >
		  ([-0-9]+) \s+ E=\S+ \s* $/x) {

	      #Line contains coordinates of alignment;
	      #Format:
	      #number of alignment:(start query, start subject)
	      #<length query, length subject>score E=score
	      #
	      #coordinates start from 0
	    	    
	      my ($start, $info_start, $query_a_len, $info_a_len, $score) 
		  = ($1, $2, $3, $4, $5);
	      
	      my $strand = '+';
	      my $info_strand = '+';

	      #if reverse strand 
              #then $start is -(the rightmost part of the interval)
	      if($start<0) {
		  $start = -$start-$query_a_len;
		  $strand = '-';
	      }
	      if($info_start<0) {
		  $info_start = -$info_start-$info_a_len;
		  $info_strand = '-';
	      }
	      #number coordinates from 1
	      $start++;
	      $info_start++;

	      #inclusive end coordinates
	      my $end = $start + $query_a_len - 1;
	      my $info_end = $info_start + $info_a_len - 1;

	      my $gtf1 = "ph\texon";
	      my $gtf2 = "$score\t$strand\t.\tinfo_name \"$info_name\"; " 
		  . "info_start \"$info_start\"; info_end \"$info_end\"; "
		  . "info_strand \"$info_strand\"; info_length \"$info_len\";";
	    
	      push @result, {'start' => $start, 'end' => $end,
			     'gtf1' => $gtf1, 'gtf2' => $gtf2 };

	    }
	    #all other lines are ignored

	}
	close IN;
    }
    return \@result;
}

############################
# Parse blat output (psl files without header) for each database and
# return gtf records, one per block or group of joined blocks.
#
# Parameter: $chunk - chunk number; the files read are
#            $Temp_dir/seq<chunk>-<db>.psl.
# Returns a reference to an array of hashes { start, end, gtf1, gtf2 }.
#
# Format spec.: http://genome.ucsc.edu/FAQ/FAQformat#format2
# Important facts about the psl format:
# - first base is numbered zero
# - when representing range, the end coord is not included in it
#   example: first 100 bases are 0-100, second 100 are 100-200
# - reverse strand: "In the qStart/qEnd (12th)fields the coordinates
#   are where it matches from the point of the forward strand
#   (even when the match is on the reverse strand). However on
#   the qStarts (20th item)  list, the coordinates are reversed."
sub parse_blat {
    my ($chunk) = @_;

    my @result;

    #counts psl lines over all databases; also used in error messages
    my $transcript_id = 0;

    for my $db (1 .. scalar @Databases) {
        local *IN;
        my $filename = "$Temp_dir/seq$chunk-$db.psl";
        open IN, "<$filename" or die "Cannot open blat output '$filename'";

        while(my $line = <IN>) {
            # there may be multiple records (blocks) on one line
            chomp $line;
            $transcript_id++;
            my @fields = split(/\t/, $line);
            die "Wrong PSL format ($filename, line $transcript_id)"
                if @fields != 21;

            #blocks closer to each other than this are joined
            my $max_gap = get_task_option("BLAT_GAPSIZE");
            $max_gap = 10 unless defined $max_gap;

            # common values for records on one line
            my ($matches, $mismatches, $rep_matches) = @fields[0, 1, 2];
            my $score = int(($matches + $rep_matches)
                            / ($matches+$mismatches+$rep_matches)*100);
            my ($strand, $est_id, $est_len) = @fields[8, 9, 10];

            my @sizes = split(/,/, $fields[18]);
            my @est_starts = split(/,/, $fields[19]);
            my @genome_starts = split(/,/, $fields[20]);

            # there is always at least one record (block)
            do {
                my $start = shift @genome_starts;
                my $length = shift @sizes;
                my $end = $start + $length;
                my ($est_start, $est_end);

                if ($strand eq '+') {
                    $est_start = shift @est_starts;
                    $est_end = $est_start + $length;
                }
                elsif ($strand eq '-') {
                    # target sequence (genome) remains the same,
                    # query (est) is reversed
                    $est_end = $est_len - (shift @est_starts);
                    $est_start = $est_end - $length;
                }
                else {
                    die "Wrong PSL format ($filename, line $transcript_id)";
                }

                # join blocks while gaps are small enough
                while (@genome_starts > 0
                       && $genome_starts[0] - $end < $max_gap) {
                    my $next_start = shift @genome_starts;
                    my $next_length = shift @sizes;
                    my $next_est = shift @est_starts;

                    $end = $next_start + $next_length;
                    if ($strand eq '+') {
                        $est_end = $next_est + $next_length;
                    }
                    elsif ($strand eq '-') {
                        $est_start = $est_len - $next_est - $next_length;
                    }
                    else {
                        die "Wrong PSL format ($filename, line $transcript_id)";
                    }
                }

                # because of different indexing (psl is 0-based)
                $start++;
                $est_start++;

                my $gtf1 = "blat\texon";
                my $gtf2 = "$score\t.\t.\tinfo_name \"$est_id\"; "
                    . "info_start \"$est_start\"; info_end \"$est_end\"; "
                    . "info_length \"$est_len\"; "
                    . "transcript_id \"$chunk.$transcript_id\";";

                push @result, {'start' => $start, 'end' => $end,
                               'gtf1' => $gtf1, 'gtf2' => $gtf2};
            } while (@genome_starts > 0);
        }
        close IN;
    }
    return \@result;
}

############################
# Write makefile rules running blat on every chunk against every database.
# Parameter: $chunk_num - number of chunks, passed on to init_makefile.
# For database number N the rule creates seq<chunk>-N.psl (without
# header) and the marker file seq<chunk>-N.done1.
sub write_make_blat {
    my ($chunk_num) = @_;

    my $db_count = scalar @Databases;

    local *MAKEFILE;
    init_makefile($chunk_num, 1, $db_count, \*MAKEFILE);

    #in the recipes the make variable $* stands for the chunk number
    my $stem = "seq\$*";

    for my $db (1 .. $db_count) {
        #target of the rule (goal: done1)
        print MAKEFILE "seq%-${db}.done1 : seq%.fasta\n";

        #command running blat
        my @fields = (
            get_option("BLAT"), "${stem}.fasta",
            adjust_relpath($Databases[$db-1]),
            "${stem}-${db}.psl",
        );
        printf MAKEFILE "\t%s %s %s -noHead %s\n", @fields;

        #command creating the marker file
        print MAKEFILE "\techo '' >${stem}-${db}.done1\n\n";
    }

    close MAKEFILE or die;
}

############################
# Parse output of blat run with a protein query against a translated
# target (-q=prot -t=dnax) for each database and return gtf records,
# one per block or group of joined blocks.
#
# Parameter: $chunk - chunk number; the files read are
#            $Temp_dir/seq<chunk>-<db>.psl.
# Returns a reference to an array of hashes { start, end, gtf1, gtf2 }.
# Dies on malformed lines or on a strand other than '++' and '+-'.
#
# Format spec.: http://genome.ucsc.edu/FAQ/FAQformat#format2
# Important facts about the psl format:
# - first base is numbered zero
# - when representing range, the end coord is not included in it
#   example: first 100 bases are 0-100, second 100 are 100-200
# - block sizes are used here as amino acid counts (hence the factor 3
#   on the target), block starts on the target are in nucleotides
# - for strand '+-' the target block starts are coordinates on the
#   reverse strand of the target and are converted to the forward strand
#   as target length - coordinate
sub parse_blatp {
    my ($chunk) = @_;

    my @result;

    #counts psl lines over all databases; also used in error messages
    my $transcript_id = 0;

    my $db_num = scalar @Databases;
    foreach my $db (1..$db_num) {
        local *IN;
        my $filename = "$Temp_dir/seq$chunk-$db.psl";
        open IN, "<", $filename
            or die "Cannot open blat output '$filename'";

        while(my $line = <IN>) {
            # there may be multiple records (blocks) on one line
            chomp $line;
            $transcript_id++;
            my @data = split(/\t/, $line);
            if (@data != 21) {
                die "Wrong PSL format ($filename, line $transcript_id)";
            }

            #blocks closer to each other than this are joined
            my $blockGap = get_task_option("BLAT_PROTEIN_GAPSIZE");
            if (!defined($blockGap)) {$blockGap = 10;}

            # common values for records on one line
            my $score = int(($data[0] + $data[2])
                            / ($data[0]+$data[1]+$data[2])*100);
            my $strand = $data[8];
            die "Wrong PSL format ($filename, line $transcript_id)"
                unless $strand eq '++' || $strand eq '+-';
            #strand of the target, as written to the gtf record
            my $strand_out = ($strand eq '++') ? "+" : "-";

            my $prot_id = $data[9];
            my $prot_len = $data[10];
            my $target_len = $data[14];

            my @blockLengths = split(/,/, $data[18]);
            my @protStarts = split(/,/, $data[19]);
            my @targetStarts = split(/,/, $data[20]);

            # there is always at least one record (block)
            do {
                my $length = shift @blockLengths;
                my $prot_start = shift @protStarts;
                my $prot_end = $prot_start + $length;

                # Extent of the (possibly joined) block in the coordinates
                # of the aligned target strand, i.e. the same coordinates
                # as used by @targetStarts. Gaps between blocks must be
                # measured in these coordinates: comparing a block start
                # with a forward-strand end is meaningless for '+-'.
                my $strand_start = shift @targetStarts;
                my $strand_end = $strand_start + 3*$length;

                while ((@targetStarts > 0)
                       && (($targetStarts[0] - $strand_end) < $blockGap)) {
                    # join blocks while gaps are small enough
                    my $length2 = shift @blockLengths;
                    my $start2 = shift @targetStarts;
                    my $prot2 = shift @protStarts;

                    $prot_end = $prot2 + $length2;
                    $strand_end = $start2 + 3*$length2;
                }

                # convert to forward strand coordinates of the target
                my ($start, $end);
                if ($strand eq '++') {
                    ($start, $end) = ($strand_start, $strand_end);
                }
                else {
                    # query sequence (protein) remains the same,
                    # target (genome) is reversed
                    ($start, $end) = ($target_len - $strand_end,
                                      $target_len - $strand_start);
                }

                $start++; # because of different indexing
                $prot_start++;

                my $gtf1 = "blat\tCDS";
                my $gtf2 = "$score\t$strand_out\t0\tinfo_name \"$prot_id\"; "
                    . "info_start \"$prot_start\"; info_end \"$prot_end\"; "
                    . "info_length \"$prot_len\"; "
                    . "transcript_id \"$chunk.$transcript_id\";";

                die "Wrong data ($filename, line $transcript_id), start=$start, end=$end" if ($end < $start);
                push @result, {'start' => $start, 'end' => $end,
                               'gtf1' => $gtf1, 'gtf2' => $gtf2};
            } while (@targetStarts > 0);
        }
        close IN;
    }
    return \@result;
}

############################
# Write makefile rules running blat with a protein query against
# a translated target on every chunk and every database.
# Parameter: $chunk_num - number of chunks, passed on to init_makefile.
# The minimum score comes from option BLAT_MIN_PROTEIN_SCORE.
# For database number N the rule creates seq<chunk>-N.psl (without
# header) and the marker file seq<chunk>-N.done1.
sub write_make_blatp {
    my ($chunk_num) = @_;

    my $db_count = scalar @Databases;

    local *MAKEFILE;
    init_makefile($chunk_num, 1, $db_count, \*MAKEFILE);

    #in the recipes the make variable $* stands for the chunk number
    my $stem = "seq\$*";

    for my $db (1 .. $db_count) {
        #target of the rule (goal: done1)
        print MAKEFILE "seq%-${db}.done1 : seq%.fasta\n";

        #command running blat
        my @fields = (
            get_option("BLAT"), "${stem}.fasta",
            adjust_relpath($Databases[$db-1]),
            get_option("BLAT_MIN_PROTEIN_SCORE"),
            "${stem}-${db}.psl",
        );
        printf MAKEFILE
            "\t%s %s %s -q=prot -t=dnax -minScore=%s -noHead %s\n",
            @fields;

        #command creating the marker file
        print MAKEFILE "\techo '' >${stem}-${db}.done1\n\n";
    }

    close MAKEFILE or die;
}

############################
# Parse sim4 output (seq<chunk>-<db>.sim4 in the temporary directory)
# for each database and return gtf records, one per exon line.
#
# Parameter: $chunk - chunk number.
# Returns a reference to an array of hashes { start, end, gtf1, gtf2 }.
# Dies on lines that do not have the expected format.
sub parse_sim4 {
    my ($chunk) = @_;

    my @result;

    #counts alignments (seq1/seq2 headers) over all databases
    my $transcript_id = 0;

    for my $db (1 .. scalar @Databases) {

        local *IN;
        my $filename = "$Temp_dir/seq$chunk-$db.sim4";
        open IN, "<$filename" or die "Cannot open sim4 output '$filename'";

        #state of the alignment currently being read
        my ($est_len, $est_id, $info_strand);
        my $strand = '.';

        while(my $line = <IN>) {

            #skip empty lines
            next if $line =~ /^\s*$/;

            if($line =~ /^\(complement\)/) {
                #the EST was aligned as reverse complement
                $info_strand = '-';
            }
            elsif($line =~ /^seq1/) {
                #new sequence starts;
                #seq1 should be followed by seq2, store EST name and length
                $line = <IN>;
                ($est_id, $est_len)
                    = $line =~ /^seq2\s+.*\s+\((.*)\),\s*\s(\d+)\s+bp\s*$/
                    or die;
                $strand = '.';
                $info_strand = '+';
                $transcript_id++;
            }
            else {
                #should be line with coordinates
                my ($start, $end, $est_start, $est_end,
                    $score, $intron_sign)
                    = $line =~ /\s*(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)\%\s*(\S*)\s*$/
                    or die "Wrong format of sim4 output $filename";

                #sim4 reports coordinates in compl.
                if($info_strand eq '-') {
                    ($est_start, $est_end) = ($est_len - $est_end + 1,
                                              $est_len - $est_start + 1);
                }

                #the intron arrow, if present, determines the strand;
                #the value is kept for the following exons
                $strand = '-' if $intron_sign eq '<-';
                $strand = '+' if $intron_sign eq '->';

                my $gtf1 = "sim4\texon";
                my $gtf2 = join('',
                    "$score\t$strand\t.\tinfo_name \"$est_id\"; ",
                    "info_start \"$est_start\"; info_end \"$est_end\"; ",
                    "info_strand \"$info_strand\"; info_length \"$est_len\"; ",
                    "transcript_id \"$chunk.$transcript_id\";");

                push @result, {'start' => $start, 'end' => $end,
                               'gtf1' => $gtf1, 'gtf2' => $gtf2 };
            }
        }
        close IN;
    }
    return \@result;
}

############################
# Write makefile rules running sim4 on every chunk against every database.
# Parameter: $chunk_num - number of chunks, passed on to init_makefile.
# For database number N the rule redirects the output of sim4 to
# seq<chunk>-N.sim4 and creates the marker file seq<chunk>-N.done1.
sub write_make_sim4 {
    my ($chunk_num) = @_;

    my $db_count = scalar @Databases;

    local *MAKEFILE;
    init_makefile($chunk_num, 1, $db_count, \*MAKEFILE);

    #in the recipes the make variable $* stands for the chunk number
    my $stem = "seq\$*";

    for my $db (1 .. $db_count) {
        #target of the rule (goal: done1)
        print MAKEFILE "seq%-${db}.done1 : seq%.fasta\n";

        #command running sim4
        my @fields = (
            get_option("SIM4"), "${stem}.fasta",
            adjust_relpath($Databases[$db-1]),
            "${stem}-${db}.sim4",
        );
        printf MAKEFILE "\t%s %s %s >%s\n", @fields;

        #command creating the marker file
        print MAKEFILE "\techo '' >${stem}-${db}.done1\n\n";
    }

    close MAKEFILE or die;
}



############################
# Write makefile rules for the pipeline ph -> filter -> sim4.
# Parameter: $chunk_num - number of chunks.
# Only the ph command line is built here; the rest of the makefile
# is written by write_make_filtersim4.
sub write_make_phsim4 {
    my ($chunk_num) = @_;

    #in the recipes the make variable $* stands for the chunk number
    my $stem = "seq\$*";

    #callback returning the ph command line for a given database number
    my $ph_command = sub {
        my ($db) = @_;
        my @fields = (
            get_option("PH"), get_task_option("PHSIM4_PHOPTIONS"),
            adjust_relpath($Databases[$db-1]), "${stem}.fasta",
            "${stem}-${db}.ph", "${stem}-${db}.phlog",
            "${stem}-${db}.phlog2",
        );
        return sprintf "\t%s %s -i %s -j %s -o %s >%s 2>%s\n", @fields;
    };

    write_make_filtersim4($chunk_num, "ph", $ph_command);
}

############################
# Write makefile rules for the pipeline wublast -> filter -> sim4
# (the databases are formatted for wublast first if necessary).
# Parameter: $chunk_num - number of chunks.
# Only the wublast command line is built here; the rest of the makefile
# is written by write_make_filtersim4.
sub write_make_wublastsim4 {
    my ($chunk_num) = @_;

    format_wublast();

    #in the recipes the make variable $* stands for the chunk number
    my $stem = "seq\$*";

    #callback returning the wublast command line for a given database number
    my $wublast_command = sub {
        my ($db) = @_;
        my @fields = (
            get_option("WUBLASTN"),
            adjust_relpath($Databases[$db-1]), "${stem}.fasta",
            get_task_option("WUBLASTSIM4_WUBLASTOPTIONS"),
            "${stem}-${db}.wublast",
            "${stem}-${db}.wublastlog", "${stem}-${db}.wublastlog2",
        );
        return sprintf
            "\t%s %s %s %s mformat=2 shortqueryok -O %s >%s 2>%s\n",
            @fields;
    };

    write_make_filtersim4($chunk_num, "wublast", $wublast_command);
}



############################
# Write makefile rules for a three step pipeline run on every chunk
# against every database:
#   done1: run a fast search tool (the filter),
#   done2: run filter-evidence on its output, producing
#          seq<chunk>-<db>.fasta,
#   done3: run sim4 on the chunk and that fasta file.
#
# Parameters:
#   $chunk_num         - number of chunks, passed on to init_makefile
#   $filter_name       - name of the filter tool; used as the extension
#                        of its output file and passed to filter-evidence
#   $format_run_filter - code reference which, given a database number,
#                        returns the makefile line running the filter
sub write_make_filtersim4 {
    my ($chunk_num, $filter_name, $format_run_filter) = @_;

    my $db_count = scalar @Databases;

    local *MAKEFILE;
    init_makefile($chunk_num, 3, $db_count, \*MAKEFILE);

    #in the recipes the make variable $* stands for the chunk number
    my $stem = "seq\$*";

    for my $db (1 .. $db_count) {
        #prefix of all files for this chunk and database
        my $prefix = "${stem}-${db}";

        #rule for running the filter (goal: done1)
        print MAKEFILE "seq%-${db}.done1 : seq%.fasta\n";
        print MAKEFILE $format_run_filter->($db);
        print MAKEFILE "\techo '' >${prefix}.done1\n\n";

        #rule for filtering the evidence (goal: done2)
        print MAKEFILE "seq%-${db}.done2 : seq%-${db}.done1\n";
        my @filter_fields = (
            adjust_relpath($Bin), "${prefix}.${filter_name}", $filter_name,
            adjust_relpath($Databases[$db-1]), "${prefix}.fasta",
            "${prefix}.filterlog", "${prefix}.filterlog2",
        );
        printf MAKEFILE "\t%s/filter-evidence %s %s %s %s >%s 2>%s\n",
            @filter_fields;
        print MAKEFILE "\techo '' >${prefix}.done2\n\n";

        #rule for running sim4 (goal: done3)
        print MAKEFILE "seq%-${db}.done3 : seq%-${db}.done2\n";
        my @sim4_fields = (
            get_option("SIM4"), "${stem}.fasta", "${prefix}.fasta",
            "${prefix}.sim4",
        );
        printf MAKEFILE "\t%s %s %s >%s\n", @sim4_fields;
        print MAKEFILE "\techo '' >${prefix}.done3\n\n";
    }

    close MAKEFILE or die;
}


############################
# Write makefile rules running rpsblast on every chunk against every
# database.
# Parameter: $chunk_num - number of chunks, passed on to init_makefile.
# For database number N the rule writes tabular output (-outfmt 6) to
# seq<chunk>-N.blast, standard error to seq<chunk>-N.log, and creates
# the marker file seq<chunk>-N.done1.
sub write_make_rpsblast {
    my ($chunk_num) = @_;

    my $db_num = scalar @Databases;

    local *MAKEFILE;
    init_makefile($chunk_num, 1, $db_num, \*MAKEFILE);

    #rule for each db
    foreach my $db (1..$db_num) {
        print MAKEFILE "seq%-$db.done1 : seq%.fasta\n";

        #the format has five conversions (three %s, two %d),
        #so exactly five arguments are passed
        printf MAKEFILE "\t%s -db %s -outfmt 6 %s -query seq\$*.fasta"
            . " > seq\$*-%d.blast 2>seq\$*-%d.log\n",
            get_option("RPSBLAST"), adjust_relpath($Databases[$db-1]),
            get_task_option("RPSBLAST_OPTIONS"), $db, $db;
        print MAKEFILE "\techo '' >seq\$*-$db.done1\n\n";
    }

    close MAKEFILE or die "Cannot close makefile: $!";
}

############################
# Write makefile rules running blastx on every chunk against every
# database.
# Parameter: $chunk_num - number of chunks, passed on to init_makefile.
# Databases whose .pin, .psq and .phr files are not all readable are
# first formatted with the program from option FORMATDB.
# For database number N the rule creates seq<chunk>-N.blast (plus logs)
# and the marker file seq<chunk>-N.done1.
sub write_make_blastx {
    my ($chunk_num) = @_;

    #format databases for blast, unless already formatted
    for my $db_file (@Databases) {
        my $formatted = (-r "$db_file.pin")
            && (-r "$db_file.psq") && (-r "$db_file.phr");

        if($formatted) {
            print STDERR "DB $db_file already formatted\n";
            next;
        }

        my $formatdb = get_option("FORMATDB");
        my_run("$formatdb -p T -i $db_file -l $Temp_dir/formatdb.log");
    }

    my $db_count = scalar @Databases;

    local *MAKEFILE;
    init_makefile($chunk_num, 1, $db_count, \*MAKEFILE);

    #rule for each db
    for my $db (1 .. $db_count) {
        #target of the rule (goal: done1)
        print MAKEFILE "seq%-${db}.done1 : seq%.fasta\n";

        #command running blastx with tabular output (-m 9)
        my @fields = (
            get_option("BLAST"), adjust_relpath($Databases[$db-1]),
            get_task_option("BLASTX_OPTIONS"), $db, $db, $db,
        );
        printf MAKEFILE "\t%s -p blastx -d %s -m 9 %s -i seq\$*.fasta"
            . " -o seq\$*-%d.blast >seq\$*-%d.log 2>seq\$*-%d.log2\n",
            @fields;

        #command creating the marker file
        print MAKEFILE "\techo '' >seq\$*-${db}.done1\n\n";
    }

    close MAKEFILE or die;
}


############################
sub parse_blastx {
    # blastx results: tabular parsing with lengths of database
    # sequences added to the records
    my @parser_args = @_;
    my $get_lengths = 1;
    return parse_blast_tabular_output($get_lengths, @parser_args);
}

############################
sub parse_rpsblast {
    # rpsblast results: tabular parsing without reading lengths of
    # database sequences
    my @parser_args = @_;
    my $get_lengths = 0;
    return parse_blast_tabular_output($get_lengths, @parser_args);
}

############################
sub parse_blast_tabular_output {
    my ($get_lengths, $chunk) = @_;

    # Parse blast tabular output from files
    # $Temp_dir/seq<chunk>-<db>.blast for each database,
    # and return gtf records.
    #
    # $get_lengths: if true, lengths of database sequences are read
    #    from the database fasta file and added to matching records
    #    as info_length.
    # $chunk: number of the chunk whose results are parsed.
    #
    # Returns a reference to an array of hashes with keys
    # start, end (coordinates in the chunk, start <= end),
    # gtf1 and gtf2 (remaining portions of the gtf record).
    # Dies if an output file cannot be opened.

    my @result;

    my $db_num = scalar @Databases;
    foreach my $db (1..$db_num) {

        # start from an empty table so that the lookup below never
        # has to autovivify an undefined scalar
        my $length_data = {};
        if($get_lengths) {
            $length_data = read_fasta_lengths($Databases[$db-1]);
        }

        my $filename = "$Temp_dir/seq$chunk-$db.blast";
        open my $in, '<', $filename
            or die "Cannot open blast output '$filename': $!";

        while(my $line = <$in>) {
            # drop comments and trailing whitespace; lines that do not
            # have exactly 12 columns afterwards are ignored
            $line =~ s/\#.*$//;
            $line =~ s/\s+$//;
            my @parts = split("\t", $line);
            if(scalar(@parts) != 12) { next; }

            #Line contains coordinates of alignment;
            # query id, subject id, % identity, alignment length,
            # mismatches, gap openings, q. start, q. end, s. start, s. end,
            # e-value, bit score

            # position in the chunk (the blast query)
            my ($start, $end) = ($parts[6], $parts[7]);
            # database (subject) sequence and position in it
            my ($s_id, $s_start, $s_end) = ($parts[1], $parts[8], $parts[9]);
            my $score = $parts[11];

            my $strand = '+';      #forward strand label
            if($start>$end) {
                # coordinates are reported in decreasing order:
                # store them increasing and mark reverse strand
                ($start, $end) = ($end, $start);
                $strand = '-';
            }

            my $gtf1 = lc($Tool) . "\tCDS";
            my $gtf2 = "$score\t$strand\t0\tinfo_name \"$s_id\"; "
                . "info_start \"$s_start\"; info_end \"$s_end\";";

            if(exists $length_data->{$s_id}) {
                my $subject_length = $length_data->{$s_id};
                $gtf2 .= " info_length \"$subject_length\";";
            }

            push @result, {'start' => $start, 'end' => $end,
                           'gtf1' => $gtf1, 'gtf2' => $gtf2 };
        }
        close $in;
    }

    return \@result;
}

############################
sub parse_exonerate {
	my ($chunk) = @_;

    # Parse exonerate output from files
    # $Temp_dir/seq<chunk>-<db>.exonerate for each database and return
    # a reference to an array of gtf records (hashes with keys start,
    # end, gtf1, gtf2), one for each exon, intron, splice5 and splice3
    # row of the GFF dumps.
    # Dies if an output file cannot be opened.

    my @result;

    # NOTE: shadowed by the declaration of the same name inside the
    # loop below; the counter is only incremented, its value is used
    # solely in commented-out code.
    my $transcript_id = 0;

    my $db_num = scalar @Databases;
    foreach my $db (1..$db_num) {
	    local *IN;
	    my $filename = "$Temp_dir/seq$chunk-$db.exonerate";
	    open IN, "<$filename" or die "Cannot open Exonerate output '$filename'";

		# Exonerate output parsing is done in stages:
		# stage 0: skip a few lines at the beginning of the file until the first line starting with hash (#) is reached
		# stage 1: skip lines at the beginning of the GFF dump (starting with hash)
		# stage 2: process the GFF dump until a line starting with hash is reached; the "similarity" feature is handled here
		# stage 3: skip lines at the end of the GFF dump (starting with hash)
		# stage 4: process one line - the length of query sequence (not included in GFF dump) and then goto stage 1
		# The stage tests below are separate ifs (not elsif), so a line that
		# changes the stage is immediately examined by the following stage.
	    my $stage = 0;
		my $line_number = 0;
		my $transcript_id = 0;
		# rows (array refs of GFF columns) of the alignment being read
		my @transcript_data = ();
		# score taken from the similarity row of the current alignment
		my $score = 0;
        while(my $line = <IN>) {
			$line_number++;
			if ($stage == 0) {
				if (substr($line, 0, 1) eq "#") {
					$stage = 1;
				} else {
					next;
				}			
			}
			if ($stage == 1) {
				if (substr($line, 0, 1) eq "#") {
					next;
				} else {
					$stage = 2;
				}
			} 
			if ($stage == 2) {
				if (substr($line, 0, 1) eq "#") {
					$stage = 3;
				} else {
					chomp $line;
					my @data = split(/\t/, $line);
					if ($data[2] eq "exon" or $data[2] eq "intron" or $data[2] eq "splice5" or $data[2] eq "splice3") {
						# store the row without its last column
						pop(@data);
						push(@transcript_data, \@data);	
					} elsif ($data[2] eq "similarity") {
						$score = $data[5];
						# last column is a list of items separated by " ; ";
						# only items starting with the word Align are kept
						my @sim = split(/ ; /, $data[8]);
						my @similarity = ();
						foreach my $item (@sim) {
							my @sim_items = split(/ /, $item);
							if ($sim_items[0] eq "Align") {
								push(@similarity, \@sim_items);
							}
						}
						# The j-th stored exon row gets two values from the j-th
						# Align item appended: field 2, and field 2 + field 3 / 3 - 1.
						# NOTE(review): presumably start and end of the aligned
						# part of the protein, with field 3 a length in
						# nucleotides - confirm against exonerate documentation.
						my $j = 0;
						foreach my $item (@transcript_data) {
							next if ($item->[2] ne "exon");
							push(@$item, $similarity[$j][2]);
							push(@$item, $similarity[$j][2] + $similarity[$j][3] / 3 - 1);
							$j++;
							
						}						
					}
				}
			}
			if ($stage == 3) {
				if (substr($line, 0, 1) eq "#") {
					next;
				} else {
					$stage = 4;
				}
			}
			if ($stage == 4) {
				# First line after the GFF dump: tab-separated, column 1 is
				# written as info_name and column 2 as info_length
				# (write_make_exonerate requests this line by
				# --ryo 'queryLength\t%qi\t%ql\n').
				chomp $line;
				my @data = split(/\t/, $line);
				foreach my $item (@transcript_data) {
					my $gtf1 = "exonerate\t".$item->[2];
					# columns 6 and 7 of the stored row follow the score
					# (strand and frame in GFF - TODO confirm)
                	my $gtf2 = $score."\t".$item->[6]."\t".$item->[7]."\tinfo_name \"".$data[1]."\"; ";
					if ($item->[2] eq "exon") {
						# indices 8 and 9 hold the two values appended in stage 2,
						# assuming the GFF row had 9 columns before pop
                	    $gtf2 = $gtf2."info_start \"".$item->[8]."\"; info_end \"".$item->[9]."\"; ";
					}
					$gtf2 = $gtf2."info_length \"".$data[2]."\";"; #transcript_id \"$chunk.$transcript_id\";";        
                	push @result, {'start' => $item->[3], 'end' => $item->[4], 'gtf1' => $gtf1, 'gtf2' => $gtf2};
				}
				# reset the state for the next alignment
				$transcript_id++;
				$stage = 1;
				$score = 0;			
				@transcript_data = ();
			}
			# safety check; $stage is only ever assigned values 0..4 above
			if ($stage < 0 or $stage > 4) {
				die "Unknown file format ($filename, line $line_number)";
			}
        }
	    close IN;
    }
    return \@result;
}

############################
sub write_make_exonerate {
    my ($chunk_num) = @_;

    # Write the makefile running exonerate (protein2genome model) on
    # each chunk against each database; the result of chunk X and
    # database Y goes to seqX-Y.exonerate (goal: done1).

    local *MAKEFILE;
    my $db_count = scalar @Databases;
    init_makefile($chunk_num, 1, $db_count, \*MAKEFILE);

    for my $db (1 .. $db_count) {
        # target and prerequisite of the pattern rule
        print MAKEFILE "seq%-$db.done1 : seq%.fasta\n";

        # configurable parts of the command line
        my $exonerate  = get_option("EXONERATE");
        my $protein_db = adjust_relpath($Databases[$db-1]);
        my $splice5    = get_option("EXONERATE_SPLICE5");
        my $splice3    = get_option("EXONERATE_SPLICE3");
        my $geneseed   = get_option("EXONERATE_GENESEED");
        my $percent    = get_option("EXONERATE_PERCENT");
        my $hspfilter  = get_option("EXONERATE_HSPFILTER");
        my $seedrepeat = get_option("EXONERATE_SEEDREPEAT");
        my $softmask   = get_option("EXONERATE_SOFTMASKTARGET");

        # all pieces of the recipe are separated by single spaces
        my @command = (
            $exonerate,
            "--target", "seq\$*.fasta", "--targettype dna",
            "--query", $protein_db, "--querytype protein",
            "--model protein2genome",
            "--splice5", $splice5, "--splice3", $splice3,
            "--forcegtag yes",
            "--geneseed", $geneseed, "--percent", $percent,
            "--hspfilter", $hspfilter, "--seedrepeat", $seedrepeat,
            "--softmasktarget", $softmask,
            "--showalignment no", "--showtargetgff yes",
            "--showquerygff no", "--showvulgar no",
            "--showcigar no", "--showsugar no",
            # extra output line with name and length of the query
            "--ryo 'queryLength\\t%qi\\t%ql\\n'",
            ">seq\$*-$db.exonerate",
        );
        print MAKEFILE "\t" . join(" ", @command) . "\n";

        # marker file for a finished chunk/db pair
        print MAKEFILE "\techo '' >seq\$*-$db.done1\n\n";
    }

    close MAKEFILE or die;
}

############################
sub chunk_and_make {
    my ($make_function, $parse_function) = @_;

    # Routine that implements the overall framework for most
    # programs: splits sequence to chunks, writes makefile
    # calls make, parses results and writes them to output
    # file $Output_prefix.gtf.

    # $make_function takes number of chunks as argument.
    # $parse_function takes a number of a chunk.
    # It returns a reference to an array containing one gtf 
    # record in a form of hash with records start, end, 
    # gtf1 and gtf2. start and end are coordinates
    # within the chunk (starting from 1) and gtf1 and gtf2
    # are the two remaining portions of a gtf record.

    # Dies if make does not produce file "done" in the temporary
    # directory or if the output file cannot be written.

    if(!defined $Restart
       || !(-r "$Temp_dir/Makefile") || !(-r "$Temp_dir/list")) {
        #prepare files unless restarting and files already exist
        my $chunk_size = get_task_option(uc($Tool) . "_CHUNKSIZE", 0);
        if(!defined $chunk_size) { $chunk_size = 100000; }
        my $chunk_sep = get_task_option(uc($Tool) . "_CHUNKSEP", 0);
        if(!defined $chunk_sep) { $chunk_sep = 1000; }

        my $chunk_num = write_chunks($Temp_dir, $Query_filename,
                                     $chunk_size, $chunk_sep);

        #create makefile by a function supplied as argument
        $make_function->($chunk_num);
    }

    #run make
    my $proc = get_option("PROCESSORS");
    my_run("make -j $proc -C $Temp_dir");

    #check that everything finished
    die "Some error during make" unless -e "$Temp_dir/done";


    #read sequence coordinates
    #usage: $chunk_coordinates[$chunk][$record]{start/end/name}
    my $chunk_coordinates = read_chunk_coordinates("$Temp_dir/list");

    my $gtf_filename = "$Output_prefix.gtf";
    open my $gtf, '>', $gtf_filename
        or die "Cannot write to $gtf_filename: $!";

    print STDERR "Parsing output\n";

    #for each chunk parse result, split to orig. sequences and print
    #(index 0 of the coordinate array is unused, chunks start at 1)
    foreach my $chunk (1..scalar(@$chunk_coordinates)-1) {
        my $gtf_rows = $parse_function->($chunk);
        print_gtf($gtf, $gtf_rows, $chunk_coordinates->[$chunk]);
    }

    #errors of buffered writes show up only when the file is closed
    close $gtf or die "Error writing $gtf_filename: $!";
    return;
}


############################
sub read_chunk_coordinates {
    my ($filename) = @_;

    # Read the list file with lines "<chunk> <start> <end> <name>"
    # and return a reference to an array used as
    # $coordinates->[$chunk][$record]{start/end/name}.
    # Chunk numbers must start at 1 and may only stay the same or
    # grow by one from line to line, otherwise the routine dies.

    local *LIST;
    open LIST, "<", $filename
        or die "Cannot open $filename for reading.";

    my @records_of_chunk;
    my $expected_chunk = 1;
    while (my $entry = <LIST>) {
        $entry =~ s/\s+$//;
        my ($chunk, $start, $end, $name) = split ' ', $entry;

        # a differing number is acceptable only as the next chunk
        $expected_chunk++ if $chunk != $expected_chunk;
        die "Wrong number of chunk $chunk" unless $chunk == $expected_chunk;

        my %record = ('start' => $start, 'end' => $end, 'name' => $name);
        push @{$records_of_chunk[$chunk]}, \%record;
    }
    close LIST or die;

    return \@records_of_chunk;
}

############################
sub init_makefile {
    #open makefile and write the first rule with overall goal
    #needs number of chunks, number of last stage, number of databases
    #and makefile filehandle (a reference to a glob), which is left
    #open so that the caller can add further rules and close it.
    #Dies if the makefile cannot be opened for writing.
    my ($chunk_num, $last_stage, $db_num, $makefile) = @_;

    my $makefile_name = "$Temp_dir/Makefile";
    #without this check a failed open would only show up as
    #prints to a closed filehandle
    open $makefile, '>', $makefile_name
        or die "Cannot open '$makefile_name' for writing: $!";

    #the goal is to make seqX-Y.doneZ for all chunks X and all db Y
    print $makefile "all:";
    foreach my $chunk (1..$chunk_num) {
        foreach my $db (1..$db_num) {
            print $makefile " seq$chunk-$db.done$last_stage";
        }
    }
    #file "done" marks that all goals were reached
    print $makefile "\n\techo '' >done\n\n";
}

############################
sub print_gtf {
    my ($file, $rows, $coordinates) = @_;

    # Print gtf records of one chunk to filehandle $file.
    # $rows is a reference to an array of records (hashes with start,
    # end, gtf1, gtf2) in chunk coordinates; the array is sorted by
    # start in place.  $coordinates is a reference to an array of
    # hashes (start, end, name) describing the sequences in the chunk.
    # A record lying completely inside one sequence is printed with
    # coordinates relative to that sequence, other records are only
    # counted and reported on STDERR.

    @$rows = sort { $a->{'start'} <=> $b->{'start'} } @$rows;

    my $region_count = scalar(@$coordinates);
    die unless $region_count>0;

    # rows come by increasing start, so the index of the current
    # sequence never needs to go back
    my $region_idx = 0;
    my $dropped = 0;
    for my $row (@$rows) {
        my $from = $row->{'start'};
        my $to = $row->{'end'};

        # skip sequences that end before this row starts
        $region_idx++
            while $region_idx < $region_count
                && $coordinates->[$region_idx]{'end'} < $from;

        # no sequence left, or the row sticks out of the current one
        if ($region_idx >= $region_count
            || $coordinates->[$region_idx]{'start'} > $from
            || $coordinates->[$region_idx]{'end'} < $to) {
            $dropped++;
            next;
        }

        my $region = $coordinates->[$region_idx];
        my $name = normalize_fasta_name($region->{'name'});
        my $shift = $region->{'start'} - 1;
        $from -= $shift;
        $to -= $shift;
        printf $file "%s\t%s\t%d\t%d\t%s\n",
            $name, $row->{'gtf1'}, $from, $to, $row->{'gtf2'};
    }

    print STDERR "Skipped $dropped alignments spanning regions\n"
        if $dropped;
}


############################
sub write_chunks {
    my ($dir, $fasta, $chunk_size, $chunk_sep) = @_;

    # Split sequences from fasta file $fasta to chunks written to
    # files $dir/seq1.fasta, $dir/seq2.fasta, ...
    # Consecutive sequences are concatenated, separated by $chunk_sep
    # N's, until the chunk has at least $chunk_size characters.
    # File $dir/list gets one line "<chunk> <start> <end> <name>"
    # for each input sequence; start and end are coordinates in
    # the chunk, starting from 1.
    # Returns the number of chunks written.
    # Dies if a sequence is empty or a file cannot be opened or
    # written.

    print STDERR "Writing chunks\n";

    my $chunk_num = 0;
    local *IN;
    open IN, '<', $fasta or die "Cannot open '$fasta' for reading: $!";

    local *LIST;
    open LIST, '>', "$dir/list"
        or die "Cannot open '$dir/list' for writing: $!";

    my $chunk_seq = "";
    while(1) {
        # $seq is a reference to the sequence string;
        # $name is undefined when there are no more sequences
        my ($name, $seq) = read_fasta(\*IN);

        if(defined $name) {
            #another sequence read - append to $chunk_seq, write to list file

            # test the sequence itself: length of the reference
            # (a string like SCALAR(0x...)) would never be zero
            die "Empty sequence '$name'" unless length($$seq)>0;

            if($chunk_seq ne '') {
                # separate sequences by several N's
                # to make overlaping alignments less likely.
                $chunk_seq .= 'N' x $chunk_sep;
            }
            my $start = length($chunk_seq)+1;
            $chunk_seq .= $$seq;
            my $end = length($chunk_seq);
            print LIST $chunk_num+1, " ", $start, " ", $end, " ", $name, "\n";
        }

        # if at the end or if enough sequence, write the chunk to file;
        # never write an empty chunk (it would otherwise happen at the
        # end of input for $chunk_size <= 0)
        if($chunk_seq ne ''
           && (length($chunk_seq)>=$chunk_size || !defined $name)) {
            $chunk_num++;
            my $chunk_file = "$dir/seq$chunk_num.fasta";
            local *FASTA;
            open FASTA, '>', $chunk_file
                or die "Cannot open '$chunk_file' for writing: $!";
            write_fasta(\*FASTA, ">seq$chunk_num", \$chunk_seq);
            close FASTA or die "Error writing '$chunk_file': $!";
            $chunk_seq = '';
        }
        last unless defined $name;
    }

    die "Internal error" unless $chunk_seq eq '';

    close IN;
    close LIST or die "Error writing '$dir/list': $!";

    return $chunk_num;
}

############################
sub get_task_option {
    my ($name, $die) = @_;
    if(!defined $die) { $die = 1; }

    # Look up option $name, giving priority to its variant specific
    # to the current task: the name prefixed with upper-case task
    # name and underscore (e.g. GENOME_PH_OPTIONS before PH_OPTIONS).
    # A missing task-specific variant is never an error; $die
    # (default 1) applies only to the general option.

    my $task_key = uc($Task) . "_$name";
    my $task_value = get_option($task_key, 0);
    return $task_value if defined $task_value;

    my $general_value = get_option($name, $die);
    return $general_value;
}

############################
sub get_option {
    my ($name, $die) = @_;
    if(!defined $die) { $die = 1; }

    # Return the value of option $name from %Options.
    # If the option is not there, die when $die is true (default),
    # otherwise return nothing (undef in scalar context).

    return $Options{$name} if exists $Options{$name};

    die "Required option $name not set." if $die;
    return;
}

############################
sub read_fasta_lengths {
    my ($filename) = @_;

    #read given fasta file and store lengths of sequences in a hash
    #with normalized sequence names as keys;
    #returns a reference to the hash,
    #dies if the file cannot be opened or closed

    my %result;

    #three-argument open: the name is never interpreted as a mode
    #or a command, whatever characters it contains
    local *IN;
    open IN, '<', $filename or die "Cannot open $filename: $!";

    while(1) {
        #$seq is a reference to the sequence string;
        #undefined $name means there are no more sequences
        my ($name, $seq) = read_fasta(\*IN);
        last unless defined $name;
        $name = normalize_fasta_name($name);
        $result{$name} = length($$seq);
    }

    close IN or die "Error reading $filename: $!";
    return \%result;
}
