#! /usr/bin/perl -w

use strict;
use Bio::SeqIO;
use Data::Dumper;

# ---------------------------------------------------------------------------
# Configuration and command-line handling.
#
# Usage: <script> <iterations>
#   iterations - number of hold-out folds to generate (positive integer).
# ---------------------------------------------------------------------------

# Directory that holds all prepared input files read by this script.
my $prepared_data_directory="prepared_data";

# Sub-directory names of every generated data set.  The order matters: the
# main loop pairs index 0 with the training genes and index 1 with the test
# genes.
my @dir_names = ('train','test');

# Annotation (GTF) and sequence (FASTA) files that are filtered down to the
# genes of each train/test subset.
my @gtf_files = ("$prepared_data_directory/all_genes.gtf","$prepared_data_directory/exonerate/exonerate.gtf","$prepared_data_directory/rnaweasel/introns.gtf");
my @fasta_files = ("$prepared_data_directory/all_genes.fasta","$prepared_data_directory/exonerate/exonerate.fasta","$prepared_data_directory/rnaweasel/introns.fasta");

# Validate the fold count up front; an absent or zero value would otherwise
# only surface later as a division/modulus-by-zero error.
my $iterations = $ARGV[0];
die "Usage: $0 <iterations>  (iterations must be a positive integer)\n"
	unless defined $iterations && $iterations =~ /^[1-9]\d*$/;

# The bareword handle NAMES is kept because the partitioning code below
# reads from it; the open itself is now three-argument and checked.
open(NAMES, '<', "$prepared_data_directory/gene_names.txt")
	or die "Cannot open $prepared_data_directory/gene_names.txt: $!\n";

# Filled by the partitioning step: one array reference of gene names per fold.
my @data_sets;

# Unique run directory derived from the current local time, with spaces and
# colons replaced so the name is safe to use in paths.
my $holdout_dir = "holdout-".localtime();
$holdout_dir =~ s/[ :]/-/g;
my $output_dir = "output/$holdout_dir";

# ---------------------------------------------------------------------------
# Randomly partition the gene names into exactly $iterations folds.
#
# Only the first line of the names file is read; it is split on ';' into
# gene names.  The names are shuffled and then dealt round-robin into the
# folds, so fold sizes differ by at most one and every gene lands in exactly
# one fold.  (Chunking by a rounded part size could yield more or fewer
# chunks than $iterations, dropping genes or leaving folds undefined, and
# failed with a modulus by zero when the part size rounded down to 0.)
# Each fold is appended to @data_sets as an array reference.
# ---------------------------------------------------------------------------
die "Number of iterations must be a positive integer\n"
	unless defined $iterations && $iterations =~ /^[1-9]\d*$/;

while(my $line = <NAMES>)
{
	# Remove only the line terminator (LF or CRLF).  chop() would eat the
	# last character of the final gene name when the file lacks a trailing
	# newline.
	$line =~ s/\r?\n\z//;

	# Ignore empty fields produced by stray separators such as "a;;b".
	my @gene_names = grep { length } split(";",$line);
	die "No gene names found in the first line of $prepared_data_directory/gene_names.txt\n"
		unless @gene_names;
	warn "Only ".scalar(@gene_names)." gene(s) for $iterations folds; some folds will be empty\n"
		if @gene_names < $iterations;

	# Shuffle by repeatedly drawing one of the remaining names at random.
	my @shuffled;
	while(@gene_names) {
		my $r = int(rand(scalar @gene_names));
		push @shuffled, splice(@gene_names,$r,1);
	}

	# Deal the shuffled names round-robin into the folds.
	my @folds = map { [] } 1..$iterations;
	for my $k (0..$#shuffled) {
		push @{ $folds[$k % $iterations] }, $shuffled[$k];
	}
	push @data_sets, @folds;

	# Only the first line carries gene names.
	last;
}
die "$prepared_data_directory/gene_names.txt contains no lines\n" unless @data_sets;

# ---------------------------------------------------------------------------
# Materialise every hold-out split on disk.
#
# For fold $i (1-based) the genes of partition $i-1 form the test set and the
# genes of all other partitions form the training set.  For each of the two
# subsets the following files are written to
# $output_dir/dataset_$i/{train,test}/:
#   - one filtered copy of every file in @gtf_files and @fasta_files,
#   - alignments.faa, concatenated from the per-gene alignment files,
#   - tree.tre, the first line of tree.newick plus the organism list.
# The three Conrad command lines for the fold are appended to
# $output_dir/commands.
#
# All directories and files are created directly from Perl with checked
# calls instead of unchecked shell invocations, so failures are reported
# instead of silently producing empty data sets.
# ---------------------------------------------------------------------------
use File::Path qw(make_path);

# Fail early with a clear message instead of dereferencing an undefined fold.
die "Expected $iterations gene partitions but found ".scalar(@data_sets)."\n"
	if @data_sets < $iterations;
warn "Found ".scalar(@data_sets)." gene partitions for $iterations folds; genes in surplus partitions are not used\n"
	if @data_sets > $iterations;

make_path($output_dir);
die "Cannot create output directory $output_dir\n" unless -d $output_dir;

open(my $commands_fh, '>>', "$output_dir/commands")
	or die "Cannot open $output_dir/commands for appending: $!\n";

for my $i (1..$iterations) {
	my $dataset_dir = "$output_dir/dataset_$i";
	# Path under which the generated command lines refer to this data set.
	my $conrad_dir  = "data/datasets/$holdout_dir/dataset_$i";

	# One block of three commands per fold, followed by a blank line.
	print {$commands_fh}
		"nohup ./bin/conrad.sh trainFeatures models/comparative.xml $conrad_dir/train $conrad_dir/train/features.ser > $conrad_dir/train/nohup_features.out\n",
		"nohup ./bin/conrad.sh trainWeights $conrad_dir/train/features.ser $conrad_dir/train $conrad_dir/train/validation.ser > $conrad_dir/train/nohup_weights.out\n",
		"nohup ./bin/conrad.sh test $conrad_dir/train/validation.ser $conrad_dir/test $conrad_dir/test/output > $conrad_dir/test/nohup.out\n\n";

	my @test_set  = @{ $data_sets[$i-1] };
	my @train_set = map { @{ $data_sets[$_] } } grep { $_ != $i-1 } 0..$iterations-1;

	# Same order as @dir_names: index 0 is train, index 1 is test.
	my @subsets = ( \@train_set, \@test_set );
	for my $j (0..$#subsets) {
		my $genes      = $subsets[$j];
		my $target_dir = "$dataset_dir/$dir_names[$j]";
		make_path($target_dir);
		die "Cannot create directory $target_dir\n" unless -d $target_dir;

		for my $gtf_file (@gtf_files) {
			my ($base_name) = $gtf_file =~ m{([^/]+)$};
			die "Cannot derive a file name from '$gtf_file'\n" unless defined $base_name;
			_write_filtered_gtf($gtf_file, "$target_dir/$base_name", $genes);
		}

		for my $fasta_file (@fasta_files) {
			my ($base_name) = $fasta_file =~ m{([^/]+)$};
			die "Cannot derive a file name from '$fasta_file'\n" unless defined $base_name;
			_write_filtered_fasta($fasta_file, "$target_dir/$base_name", $genes);
		}

		#multiple alignments
		_write_alignments("$prepared_data_directory/cds/algn", "$target_dir/alignments.faa", $genes);

		_write_tree("$prepared_data_directory/tree.newick", "$target_dir/tree.tre", $genes);
	}
}

close $commands_fh or die "Cannot close $output_dir/commands: $!\n";

# Copy the lines of a GTF file that belong to the given genes.
#   $in_file  - GTF file to read
#   $out_file - file to create
#   $genes    - array reference of gene names
# A line is attributed to the first run of word characters that is directly
# followed by a tab.  Output is grouped per gene, in the order of @$genes.
sub _write_filtered_gtf {
	my ($in_file, $out_file, $genes) = @_;
	my %lines_for = map { $_ => "" } @$genes;

	open(my $in, '<', $in_file) or die "Cannot read $in_file: $!\n";
	while (my $line = <$in>) {
		next unless $line =~ /(\w+)\t/;
		next unless exists $lines_for{$1};
		$lines_for{$1} .= $line;
	}
	close $in;

	open(my $out, '>', $out_file) or die "Cannot write $out_file: $!\n";
	print {$out} $lines_for{$_} for @$genes;
	close $out or die "Cannot close $out_file: $!\n";
	return;
}

# Copy the FASTA records whose header names one of the given genes.
#   $in_file  - FASTA file to read
#   $out_file - file to create
#   $genes    - array reference of gene names
# A record starts at a '>' header; the word characters right after '>' are
# the record name.  Output is grouped per gene, in the order of @$genes.
sub _write_filtered_fasta {
	my ($in_file, $out_file, $genes) = @_;
	my %record_for = map { $_ => "" } @$genes;

	open(my $in, '<', $in_file) or die "Cannot read $in_file: $!\n";
	my $current;	# gene whose record is being copied; undef while skipping
	while (my $line = <$in>) {
		if ($line =~ /^>(\w+)/) {
			$current = exists $record_for{$1} ? $1 : undef;
		}
		$record_for{$current} .= $line if defined $current;
	}
	close $in;

	open(my $out, '>', $out_file) or die "Cannot write $out_file: $!\n";
	print {$out} $record_for{$_} for @$genes;
	close $out or die "Cannot close $out_file: $!\n";
	return;
}

# Concatenate the per-gene alignment files of the given genes.
#   $algn_dir - directory containing <gene_name>.faa files
#   $out_file - file to create
#   $genes    - array reference of gene names
# The alignment file is selected by the second '_'-separated field of the
# gene name.  Headers matching />(\w+)_(\w+)/ are rewritten to
# ">GENE CAPTURE1"; all other lines are copied unchanged.  A gene without a
# readable alignment file is reported and skipped.
sub _write_alignments {
	my ($algn_dir, $out_file, $genes) = @_;

	open(my $out, '>', $out_file) or die "Cannot write $out_file: $!\n";
	for my $gene (@$genes) {
		my $gene_name = (split("_",$gene))[1];
		unless (defined $gene_name) {
			warn "Gene name '$gene' has no '_' separated gene part; no alignment written\n";
			next;
		}
		my $in_file = "$algn_dir/$gene_name.faa";
		my $in;
		unless (open($in, '<', $in_file)) {
			warn "Cannot read $in_file: $!; no alignment written for $gene\n";
			next;
		}
		while (my $line = <$in>) {
			if ($line =~ /^>(\w+)_(\w+)/) {
				print {$out} ">$gene $1\n";
			} else {
				print {$out} $line;
			}
		}
		close $in;
	}
	close $out or die "Cannot close $out_file: $!\n";
	return;
}

# Write the tree file of one subset.
#   $tree_file - Newick file; only its first line is used
#   $out_file  - file to create
#   $genes     - array reference of gene names
# The output is the first tree line followed by the ';'-joined first
# '_'-separated field of every gene name (no trailing newline).
sub _write_tree {
	my ($tree_file, $out_file, $genes) = @_;

	open(my $in, '<', $tree_file) or die "Cannot read $tree_file: $!\n";
	my $tree_line = <$in>;
	close $in;

	my @organisms;
	for my $gene (@$genes) {
		my ($organism) = split("_",$gene);
		push @organisms, defined $organism ? $organism : "";
	}

	open(my $out, '>', $out_file) or die "Cannot write $out_file: $!\n";
	if (defined $tree_line) {
		chomp($tree_line);
		print {$out} "$tree_line\n";
	}
	print {$out} join(";", @organisms);
	close $out or die "Cannot close $out_file: $!\n";
	return;
}



