#! /usr/bin/perl -w

use strict;
use Data::Dumper;
use Getopt::Long;

use FindBin qw($Bin);   #add directory with the script to the library path
use lib "$Bin";
use shared;

my $USAGE = "
$0 [<options>] <filter file> <format> <input file> <output file> 

Parse sequence names from <filter file>, assuming it was written by a
program specified by <format> (and what we are looking for are subject
sequences) and print sequences that have these names from <input file>
to <output file>.  
";

if(scalar(@ARGV)!=4) { die $USAGE; }
my ($filter, $format, $input_fasta, $output_fasta) = @ARGV;

my $names;  #$names->{$name}=1 iff the $name has a match

open IN, "<$filter" or die "Cannot open $filter";
if($format eq "ph") {
    $names = parse_ph_names(\*IN);
}
elsif($format eq "wublast") {
    $names = parse_wublast_names(\*IN);
}
else {
    die "Uknown format '$format'";
}
close(IN);

#read input and print only those that 
#that apper in %$names
open(IN, "<$input_fasta") or die "Cannot open $input_fasta";
open(OUT, ">$output_fasta") or die "Cannot open $output_fasta";
    
my $skip = 1;
#number of sequences on input, on output, and names that occur multiple times
my ($all, $written, $double) = (0,0);
while(my $line = <IN>) {
    chomp $line;
    if(index($line, '>')==0) {
	my $name = normalize_fasta_name($line);
	$all++;
	if(exists($names->{$name})) {
	    $skip = 0;
	    $written++;
	    if($names->{$name} == 0) { $double++; }
	    $names->{$name} = 0;
	}
	else {
	    $skip = 1;
	}
    }
    if(!$skip) {
	print OUT $line, "\n";
    }
}
print STDERR "$written out of $all sequences written to output\n";
if($double) {
    print STDERR "$double sequences were printed multiple times\n";
}

my $notfound = scalar grep {$_ eq 1} values(%$names);
if($notfound) {
    print STDERR "$notfound sequences from $filter were not found in $input_fasta\n";
}


close IN;
close OUT;


############################
sub parse_wublast_names {

    #parse names of subject sequences from the given file

    my ($file) = @_;
    my %names;

    while(my $line = <$file>) {
	my @parts = split ' ', $line;
	next unless @parts == 22;
	my $name = normalize_fasta_name($parts[1]);
	$names{$name} = 1;
    }
    return \%names;
}


############################
sub parse_ph_names {

    #parse names of subject sequences from the given file

    my ($file) = @_;
    my %names;

    my $last_query = "";
    while(my $line = <$file>) {
        if($line =~ /^Query=\s+(.+)$/) {
            $last_query = normalize_fasta_name($1);
        }
        elsif($line=~/^\>/) {
            $names{$last_query} = 1;
        }
    }
    return \%names;
}
