/*
 * $Id: main.cpp,v 1.8 2012-02-25 16:38:30 laci Exp $
 *
 * Project      : RNA motif searching in genomic sequences
 * Description  : the main program
 *
 * Author       : Ladislav Rampasek <rampasek@gmail.com>
 * Institution  : Comenius University in Bratislava
 *
 */

#include <iostream>
#include <sstream>
#include <iomanip>
#include <fstream>
#include <ctime>
#include <cmath>

#include "search.h"

using namespace std;

/*
 * Fallback version strings, defined only when the build does not define RELEASE.
 * NOTE(review): when RELEASE is defined, DEF_RELEASE and DEF_RELEASEDATE are
 * presumably expected to come from the build system - confirm.
 */
#ifndef RELEASE
    #define DEF_RELEASE     "dev1.94"
    #define DEF_RELEASEDATE "February 2012"
#endif


/* One-line description of the program. */
static string txt_banner = "RNARobo - RNA pattern searching program";

/*
 * Help text: a version line followed by the invocation synopsis and the list
 * of supported switches. Adjacent string literals are merged by the compiler,
 * so the lines below form one continuous piece of text.
 */
static string txt_usage =
    string("rnarobo: version ") + DEF_RELEASE + ", " + DEF_RELEASEDATE + "\n"
    "Usage: rnarobo <descriptor-file> <sequence-file>\n"
    "\n"
    "  Available options: \n"
    "     -c:    search both strands of database\n"
    "     -f:    print output in plain FASTA format\n"
    "     -s:    print output in FASTA format with element separators\n";
//     -d:    show dot-bracket representation for every match (!!!CAN'T HANDLE INSERTIONS YET!!!)\n

/*
 * Global option flags. Each one is an int used as a boolean and starts out
 * false; main() switches them on while parsing the command line.
 */
//int opt_skip_mult = false;    /* OPTION: TRUE to filter out overlapping hits (currently disabled) */
int opt_be_quiet = false;       /* OPTION: TRUE to silence verbosity; set by -f and -s, suppresses the run header */
int opt_searchcomp = false;     /* OPTION: TRUE to search both strands of database (-c) */
int opt_fasta = false;          /* OPTION: TRUE to print output in FASTA format (-f and -s) */
int opt_dotbracked = false;     /* OPTION: TRUE to print also dot-bracket representation; never set in this file because the -d handler is commented out */

/*
 * Build the printable text for a single hit.
 *
 *   begin, end - coordinates of the hit, printed as given
 *   name       - name of the sequence the hit was found in
 *   details    - remaining text of the sequence header
 *   seq        - the matched sequence, already rendered as text
 *
 * When the global opt_fasta flag is on, the result is a FASTA record:
 *   ><begin>-<end>_<name> <details>
 *   <seq>
 * Otherwise it is one table row (begin and end right-aligned to 6 columns,
 * name and details to 12) followed by the sequence on its own line.
 *
 * Returns the formatted text, terminated by a newline.
 */
string format_output(int begin, int end, string name, string details, string seq){
    ostringstream out;

    if (opt_fasta != true) {
        // Tabular layout; the widths line up with the column header printed by main().
        out << setw(6) << begin
            << " " << setw(6) << end
            << " " << setw(12) << name
            << " " << setw(12) << details << '\n'
            << seq << '\n';
        return out.str();
    }

    // FASTA layout: header line, then the sequence.
    out << ">" << begin << "-" << end << "_" << name << " " << details << '\n'
        << seq << '\n';
    return out.str();
}

/*
 * Program entry point.
 *
 * Usage: rnarobo [-c] [-f] [-s] <descriptor-file> <sequence-file>
 *
 * Reads the motif descriptor, then walks through the sequence file record by
 * record (a '>' header line followed by sequence lines). Each sequence is cut
 * at runs of ten or more 'N's into blocks, each block is chopped into
 * overlapping partitions, and every partition is searched (with -c also its
 * reverse complement). Every distinct (begin, end) hit is printed once per
 * strand; a summary with the number of scanned bases, the number of matches
 * and the processor time used is printed at the end.
 *
 * Fatal conditions (too few arguments, unknown option, unreadable files,
 * invalid descriptor, malformed sequence header) are reported through die().
 *
 * Returns 0 once the whole sequence file has been processed.
 */
int main(int argc, char* argv[]){
    const clock_t tStart = clock(); //to measure runtime

    string file_db;       /* name of sequence file to search           */
    string file_desc;     /* name of file containing descriptor        */
    ifstream dbin, descin;

    string element_separator = "|"; /* the string used to separate individual elements in the output */

  /***********************************************
   * Parse command line arguments
   * TODO: add support for the options OPT_skip_mult, OPT_be_quiet, OPT_print_full as in RNABob
   ***********************************************/
    if (argc < 3) die("Not enough parameters\n\n" + txt_usage);

    // getopt() returns an int. Storing it in a plain char makes the "!= -1"
    // test always true on platforms where char is unsigned, so keep it an int.
    int ch;
    extern int optind, optopt;

    while ((ch = getopt(argc, argv, ":cfsd")) != -1) {
        switch(ch) {
        case 'c':
            opt_searchcomp = true;
            break;
        case 'f':
            // plain FASTA: no separators between the elements of a hit
            opt_fasta = true;
            opt_be_quiet = true;
            element_separator = "";
            break;
        case 's':
            // FASTA with the elements of a hit separated by '|'
            opt_fasta = true;
            opt_be_quiet = true;
            element_separator = "|";
            break;
        case 'd':
            // accepted but currently ignored (dot-bracket output is disabled)
            //opt_dotbracked = true;
            break;
        case ':':
            // missing option argument; no option takes one at the moment
            break;
        case '?':
            // optopt holds the offending option character
            die(string("Unknown arg \"-") + static_cast<char>(optopt) + "\"\n" + txt_usage);
            break;
        }
    }

    // Both positional arguments must still be present once the options have
    // been consumed (e.g. "rnarobo -c -f" passes the argc test above).
    if (argc - optind < 2) die("Not enough parameters\n\n" + txt_usage);
    // Bounds-checked so that argv is never read past argc.
    if (optind < argc) {
        file_desc = argv[optind];
    }
    if (optind + 1 < argc) {
        file_db = argv[optind+1];
    }


  /***********************************************
   * Initialize sequence file, read descriptor
   * TODO: support for predefined file locations
   ***********************************************/
    /// read descriptor
    descin.open( file_desc.c_str() );
    if(! descin.good()) die("Descriptor file "+file_desc+" could not be opened or it does not exist");
    Descriptor desc(descin);
    if(! desc.is_initialized()) die(desc.error_str());

    /// open sequence file
    dbin.open( file_db.c_str() );
    if(! dbin.good()) die("Sequence file "+file_db+" could not be opened or it does not exist");

  /***********************************************
   * Print header and do the search
   ***********************************************/

    if (! opt_be_quiet)
    {
      cout<<"Starting rnarobo: version "<<DEF_RELEASE<<", "<<DEF_RELEASEDATE<<"\n";
      cout<<"---------------------------------------------------\n";
      cout<<"Database file:                 "<<file_db<<"\n";
      cout<<"Descriptor file:               "<<file_desc<<"\n";
      cout<<"Search order:                  "<<desc.search_order_to_str()<<"\n";
      cout<<"Complementary strand searched: "<<(opt_searchcomp? "yes":"no")<<"\n";
      //cout<<"Filter out overlapping hits:   "<<(opt_skip_mult? "yes":"no")<<"\n";
      cout<<"---------------------------------------------------\n";

      cout<<"\n";
      cout<<" seq-f  seq-t     name     description\n";
      cout<<"------ ------ ------------ ------------\n";
    }


    Simple_Search ssearch(desc);
    string line;
    long long total_bases_scanned = 0;
    long long total_matches = 0;

    // One iteration per sequence record.
    while( get_valuable_line(dbin,line) ){
        if(line[0] != '>') die("Incorrect sequence format");
        line[0]=' '; // blank out the '>' so the header can be split on whitespace

        // header: first word is the name, the remaining words are the details
        string sq_name, sq_details, sq;
        istringstream sin(line);
        if( !(sin>>sq_name)) die("Incorrect sequence format");
        while( sin>>line ) sq_details+=" "+line;

        // collect the sequence lines: everything up to the next line that
        // does not start with a character in the range 'A'..'z'
        while('A'<=dbin.peek() && dbin.peek()<='z') {
            get_valuable_line(dbin,line);
            normalize_seq(line);
            sq+=line;
        }
        filter_whitespaces(sq);

        //count number of bases scanned (twice when both strands are searched)
        total_bases_scanned += sq.size();
        if(opt_searchcomp){ total_bases_scanned += sq.size(); }

        const int max_motif_length = ssearch.desc->get_max_motif_length();
        //to how long pieces we will chop up the sequence
        const int max_seq_length = max(300, 5*max_motif_length);

        //positions of matches already reported for this sequence, one set per
        //strand - used to filter out hits found again in overlapping partitions
        set< pair<int, int> > found_matches, found_op_matches;

        //cut out "N" regions longer than 10 -> the sequence is divided into blocks
        //first is block's sequence, second is block's beginning position in original sequence
        //NOTE(review): each block keeps the first 'N' of the run that ends it and
        //the next block starts with the last ten 'N's of that run - confirm intended.
        vector< pair<string,int> > seq_blocks;
        string::size_type prev_found = 0;
        string::size_type found = sq.find("NNNNNNNNNN");
        while(found != string::npos){
            seq_blocks.push_back(make_pair(sq.substr(prev_found, found-prev_found+1),
                                           static_cast<int>(prev_found)));
            //skip to the last ten 'N's of the run without indexing past the end
            while(found+10 < sq.size() && sq[found+10]=='N') ++found;
            prev_found = found;
            found = sq.find("NNNNNNNNNN", found+1);
        }
        seq_blocks.push_back(make_pair(sq.substr(prev_found), static_cast<int>(prev_found)));

        for(size_t k=0; k<seq_blocks.size(); k++){
            const string& block = seq_blocks[k].first;

            //chop up the given block to partitions of length at most max_seq_length
            // ! two partitions must have overlap of max_motif_length !
            //first is the partition's sequence, second its beginning position in the block
            int pointer = 0;
            vector< pair<string,int> > seq_partitions;
            while(static_cast<size_t>(pointer) < block.size()){
                const int start = max(0, pointer-max_motif_length+1);
                seq_partitions.push_back(make_pair(block.substr(start, max_seq_length), start));
                pointer = start + max_seq_length;
            }

            //search for the motif in partitions
            for(size_t j=0; j<seq_partitions.size(); j++){
                string& partition = seq_partitions[j].first;
                //position of the partition's first base in the original sequence
                const int offset = seq_partitions[j].second + seq_blocks[k].second;

                ssearch.search(partition);

                //print found hits
                for(int i=0;i<ssearch.solutions.size();i++){
                    const int match_begin = ssearch.solutions[i][0].first +1 +offset;
                    const int match_end = ssearch.solutions[i].back().second +offset;

                    //insert() reports whether the position was new; skip repeats
                    if(!found_matches.insert(make_pair(match_begin, match_end)).second){
                        continue;
                    }

                    cout<<format_output(match_begin,
                                        match_end,
                                        sq_name,
                                        sq_details,
                                        ssearch.solution_to_str(i, partition, element_separator)
                                    );
                    if (opt_dotbracked == true) {
                        cout<<ssearch.solution_to_dotbracket(i, element_separator)<<endl;
                    }
                }

                //search the partition also in the opposite direction (if opt_searchcomp == true)
                if(opt_searchcomp){
                    //the partition is reverse-complemented in place
                    reverse_complement(partition.begin(), partition.end());
                    ssearch.search(partition);

                    const int partition_length = static_cast<int>(partition.size());

                    //print found hits
                    for(int i=0;i<ssearch.solutions.size();i++){
                        //map positions in the reversed partition back onto the original numbering
                        const int match_begin = partition_length -ssearch.solutions[i][0].first + offset;
                        const int match_end = partition_length -ssearch.solutions[i].back().second +1 +offset;

                        if(!found_op_matches.insert(make_pair(match_begin, match_end)).second){
                            continue;
                        }

                        cout<<format_output(match_begin,
                                            match_end,
                                            sq_name,
                                            sq_details,
                                            ssearch.solution_to_str(i, partition, element_separator)
                                        );
                        if (opt_dotbracked == true) {
                            cout<<ssearch.solution_to_dotbracket(i, element_separator)<<endl;
                        }
                    }
                }
            }

        }
        total_matches += found_matches.size();
        total_matches += found_op_matches.size();

    }

    //NOTE(review): the summary is printed even when opt_be_quiet is set, so it
    //also ends up in FASTA output - confirm whether that is intended.
    cout<<"\n----- SEARCH DONE -----\nTotal scanned bases: "<<total_bases_scanned;
    cout<<"\nFound matches:       "<<total_matches<<endl;
    const double elapsed = static_cast<double>(clock() - tStart)/CLOCKS_PER_SEC;
    //every field is floored so that the seconds column can never round up to "60"
    printf("Time since start:    %02.0fh %02.0fm %02.0fs (%.2fs)\n",
           floor(elapsed/3600.0),
           floor(fmod(elapsed,3600.0)/60.0),
           floor(fmod(elapsed,60.0)),
           elapsed
          );

    return 0;
}
