/*
 * $Id: descriptor.cpp,v 1.3 2011-09-26 06:59:12 laci Exp $
 *
 * Project      : RNA motif searching in genomic sequences
 * Description  : the implementation of a class representing a motif descriptor
 *
 * Author       : Ladislav Rampasek <rampasek@gmail.com>
 * Institution  : Comenius University in Bratislava
 *
 */

#include <sstream>
#include <algorithm>
#include <string>
#include <vector>
#include <map>
#include <cmath>
#include <stack>
#include <iostream>
using namespace std;

#include "types.h"
#include "generalfuncs.h"

// The class of a motif descriptor: it holds the topology map of the motif, the
// search constraints of its secondary structure elements (SSEs) and the order
// in which the elements are searched.
class Descriptor{
    public:
        vector<SSE> sses;           // the array where all SSEs are stored, indexed from 1 (0-th position is garbage)
        vector<int> motif;          // the list of all SSEs, i-th number is index to the array "sses",
                                    //   if the number is negative it stands for the second strand
                                    //   of the helix which has index "-number" in "sses"
        vector<int> pknot_levels;   // pseudoknot levels filled in by compute_pknot_levels(); entries are
                                    //   read and written at SSE indices (the same indices as in "sses"),
                                    //   the vector itself is sized to motif.size()
        map<string, int> transl;    // a transl. dictionary from original names (h1,s5,...) to indices in "sses"
        vector<int> srch_order;     // the ordered list of SSEs to be searched (indices to the vector "sses")

        // creates an empty descriptor which reports is_initialized()==false
        Descriptor():initialized(false){}
        // reads and parses a descriptor from "fin"; when anything fails the object
        //   stays uninitialized and error_str() holds the reason
        Descriptor(ifstream &fin);
        // true only if the descriptor was read and validated successfully
        bool is_initialized() const { return initialized;}
        // the message describing why the construction failed (empty if it did not)
        string error_str() const { return err_str;}
        string search_order_to_str();
        int get_max_motif_length();
        string get_dotbracket_notation();

    private:
        bool initialized;           // set to true at the very end of a successful construction
        string err_str;             // human-readable reason of a failed construction

        bool parse_desc_map(string line);
        bool parse_desc_properties(string line);
        bool parse_desc_order(string line);
        bool expand_wildcards(string &s);
        bool check_consistency();
        bool has_no_duplicates(vector<int> vec);
        void complete_srch_order();
        void compute_sizes();
        void compute_pknot_levels();
};

// Reads a descriptor from "fin": the first valuable line is the topology map,
// every following valuable line is either the definition of one SSE or a
// reorder command. On any failure err_str is set and the object is left
// uninitialized; "initialized" becomes true only after all steps succeeded.
Descriptor::Descriptor(ifstream &fin){
    initialized=false;
    err_str="";

    string line;

    // the topology map; the map parser runs even if no line could be read,
    // both failures are reported by the same message
    bool map_ok = true;
    map_ok &= get_valuable_line(fin,line);
    map_ok &= parse_desc_map(line);
    if( !map_ok ){
        err_str="Invalid descriptor topology map";
        return;
    }

    // the remaining lines: try them as an SSE definition first and as
    // a reorder command second
    while( get_valuable_line(fin,line) ){
        normalize_seq(line);

        if( parse_desc_properties(line) ) continue;
        if( parse_desc_order(line) ) continue;

        err_str="Invalid descriptor line: "+line+
            "\nInvalid syntax of an secondary structure element or of a reorder command";
        return;
    }

    // all SSEs have to be defined and the descriptor must not contain inconsistencies
    if( !check_consistency() ){
        err_str="The descriptor is not complete or contains a logical error";
        return;
    }

    // derive the remaining data: sizes of SSEs and distances between strands
    // of helices, the full search order and the pseudoknot levels
    compute_sizes();
    complete_srch_order();
    compute_pknot_levels();

    initialized=true;
}

// Parses the topology map of the descriptor: a whitespace separated list of
// element names. A name starting with 'S' declares a single strand, a name
// starting with 'H' or 'R' declares (the first strand of) a helix and a name
// followed by an apostrophe is the second strand of an already declared helix.
// The parsing is case insensitive (the line is uppercased first).
// Fills "sses", "motif" and "transl" from scratch.
// Returns false if an element has an unknown type, a name is declared twice,
// a second strand refers to an unknown name or to a single strand, a strand
// occurs more than once, or some helix is left without its second strand.
bool Descriptor::parse_desc_map(string line){
    //uppercase the line; the cast keeps toupper well defined for bytes outside of ASCII
    for(string::size_type i=0;i<line.size();++i){
        line[i] = static_cast<char>( ::toupper(static_cast<unsigned char>(line[i])) );
    }
    istringstream sin(line);

    sses.clear();
    sses.resize(1); //to have this array indexed from 1 !!!
    motif.clear();
    transl.clear();

    string item;
    int open_helices=0;
    while( sin>>item ){
        if( item[item.length()-1] == '\'' ){ //if it is the second strand of a helix
            map<string, int>::iterator it=transl.find( item.substr(0,item.length()-1) );
            if( it==transl.end() ) return false;
            //only a helix has a second strand; accepting it for a single strand
            //would corrupt the balance of opened and closed helices
            if( !sses[it->second].is_helix ) return false;
            motif.push_back( -(it->second) );
            open_helices--;
        } else {
            //a redeclared name would silently replace the previous entry of "transl"
            //and leave the previous element without any definition
            if( transl.find(item)!=transl.end() ) return false;

            SSE new_sse;
            switch (item[0]){
                case 'S': new_sse.is_helix=false;
                          break;
                case 'R':
                case 'H': new_sse.is_helix=true;
                          open_helices++;
                          break;
                default: return false;
            }
            new_sse.id=sses.size();
            sses.push_back(new_sse);
            motif.push_back(sses.size()-1);
            transl[item]=motif.back();
        }
    }

    //a second strand listed twice shows up as a duplicate in the motif
    if( !has_no_duplicates(motif) ) return false;
    return open_helices==0;
}

// Parses one line defining the properties of an SSE declared in the map.
// Expected whitespace separated fields:
//   helix:         name  mismatches:mispairings[:insertions]  pattern:complement[:allowed_insertion]  [matrix]
//   single strand: name  mismatches[:insertions]              pattern[:allowed_insertion]
// The 4 characters long transformation matrix is read only for names starting
// with 'R'; names starting with 'H' get the matrix "TGYR". When the optional
// parts are left out, no insertions are tolerated and the allowed insertion is 'N'.
// Returns false if the name is unknown or the line does not fit the format;
// the SSE may be partially overwritten in that case.
bool Descriptor::parse_desc_properties(string line){
    istringstream tokens(line);
    string field;

    //the name of the sse, it has to be known from the map
    if( !(tokens>>field) ) return false;
    map<string, int>::iterator found=transl.find(field);
    if( found==transl.end() ) return false;
    const char kind=field[0];
    const int idx=found->second;

    //a helix carries one more mandatory colon separated value per field than a single strand
    const bool helix=sses[idx].is_helix;
    const int min_colons = helix ? 1 : 0;
    const int max_colons = min_colons+1;

    //numbers of tolerated mismatches, (mispairings) and insertions
    if( !(tokens>>field) ) return false;
    int colons=static_cast<int>( count(field.begin(), field.end(), ':') );
    replace(field.begin(), field.end(), ':', ' ');
    if( colons<min_colons || colons>max_colons ) return false;

    istringstream counts(field);
    if( !(counts>>sses[idx].num_mismatches) ) return false;
    if( helix && !(counts>>sses[idx].num_mispairings) ) return false;
    //by default we do not allow insertions
    sses[idx].num_insertions=0;
    if( colons==max_colons && !(counts>>sses[idx].num_insertions) ) return false;

    //primary structure restrictions
    if( !(tokens>>field) ) return false;
    colons=static_cast<int>( count(field.begin(), field.end(), ':') );
    replace(field.begin(), field.end(), ':', ' ');
    if( colons<min_colons || colons>max_colons ) return false;

    istringstream patterns(field);
    if( !(patterns>>sses[idx].pattern) ) return false;
    if( helix && !(patterns>>sses[idx].complement) ) return false;
    //by default we allow insertion of all nucleotides
    sses[idx].allowed_insertion='N';
    if( colons==max_colons && !(patterns>>sses[idx].allowed_insertion) ) return false;

    //transformation matrix of a helix: explicit for 'R', the default one for 'H'
    if( helix ){
        if( kind=='R' ){
            if( !(tokens>>field) ) return false;
            if( field.size()!=4 ) return false;
            sses[idx].transf_matrix = field;
        } else if( kind=='H' ){
            sses[idx].transf_matrix = "TGYR";
        } else {
            return false; //unexpected first character of the name
        }
    }

    //no more input is expected in this line
    return !(tokens>>field);
}

// Parses a reorder command: the keyword "R" followed by names of elements in
// the order in which they should be searched. The previous content of
// "srch_order" is discarded as soon as the keyword is recognized.
// Returns false if the line does not start with "R", mentions an unknown name
// or lists an element more than once.
bool Descriptor::parse_desc_order(string line){
    istringstream tokens(line);
    string word;

    //the command has to be introduced by the keyword
    if( !(tokens>>word) || word!="R" ) return false;

    srch_order.clear();
    while( tokens>>word ){
        const map<string, int>::iterator entry=transl.find(word);
        if( entry==transl.end() ) return false; //an unknown name
        srch_order.push_back(entry->second);
    }

    return has_no_duplicates(srch_order);
}

// Appends to "srch_order" all SSEs which are not listed there yet. The added
// elements are ordered by a trivial specificity heuristic: every non-wildcard
// position of the pattern adds one point and every wildcard ('*') takes one
// away; a helix gets twice the score plus one and twice the length. Elements
// with the higher (score, length, index) triple are searched earlier.
void Descriptor::complete_srch_order(){
    //elements whose position has been already given by a reorder command
    vector<bool> already_ordered(sses.size(),false);
    for(vector<int>::const_iterator it=srch_order.begin(); it!=srch_order.end(); ++it){
        already_ordered[*it] = true;
    }

    srch_order.reserve(sses.size()-1);

    //((score, length), index) of every element which still has to be placed
    vector<pair< pair<int,int> ,int > > ranked;
    ranked.reserve(sses.size()-srch_order.size());

    for(size_t i=1;i<sses.size();++i){
        if(already_ordered[i]) continue;

        const string &pattern = sses[i].pattern;
        int score=0;
        for(string::const_iterator c=pattern.begin(); c!=pattern.end(); ++c){
            score += (*c=='*') ? -1 : 1;
        }

        pair<int,int> key(score, static_cast<int>(pattern.size()));
        if(sses[i].is_helix){
            key = pair<int,int>(score*2 +1, static_cast<int>(pattern.size()*2));
        }
        ranked.push_back( make_pair(key, static_cast<int>(i)) );
    }

    //sorting through reverse iterators leaves the best ranked element at the front
    sort(ranked.rbegin(), ranked.rend());
    for(size_t k=0;k<ranked.size();++k){
        srch_order.push_back(ranked[k].second);
    }
}

// returns true if no value occurs in the vector more than once; the vector is
// taken by value, so the caller's data are not reordered
bool Descriptor::has_no_duplicates(vector<int> vec){
    //after sorting, equal values are neighbours
    sort(vec.begin(), vec.end());
    return adjacent_find(vec.begin(), vec.end()) == vec.end();
}

//expands every "[x]" in the given string to x wildcards ('*'), x being a decimal
//number; all other characters are copied unchanged
//returns false (and leaves the string untouched) if some '[' is not followed
//by digits only and a closing ']'
bool Descriptor::expand_wildcards(string &s){
    string expanded;
    expanded.reserve(s.size());

    string::size_type pos=0;
    while(pos<s.size()){
        const char c=s[pos++];
        if(c!='['){
            expanded+=c;
            continue;
        }

        //read the number of repetitions
        int repeat=0;
        for(; pos<s.size() && s[pos]>='0' && s[pos]<='9'; ++pos){
            repeat = repeat*10 + (s[pos] - (int)'0');
        }

        //the number has to be closed by the bracket
        if(pos>=s.size() || s[pos]!=']') return false;
        ++pos;

        if(repeat>0) expanded.append(static_cast<string::size_type>(repeat), '*');
    }

    s=expanded;
    return true;
}

// Consistency check + wildcards expansion.
// For every SSE the wildcard shorthands of its pattern (and of the complement
// of a helix) are expanded and the following is required:
//   - the pattern is not empty,
//   - the numbers of tolerated mismatches (and mispairings of a helix) are
//     neither negative nor greater than the length of the pattern,
//   - the number of tolerated insertions is not negative,
//   - for a helix: the pattern and the complement have the same length and
//     are complementary position by position under the transformation matrix.
// As a side effect the complement of every helix is stored reversed.
// Finally the search order must not list any element twice.
// Returns true if the descriptor passed all the checks.
bool Descriptor::check_consistency(){
    for(size_t i=1;i<sses.size();i++){
        if( !expand_wildcards(sses[i].pattern) ) return false;

        if( sses[i].is_helix ){
            if( !expand_wildcards(sses[i].complement) ) return false;
            if( sses[i].pattern.size() != sses[i].complement.size() ) return false;
        }

        //an element which was declared in the map but never defined has an empty
        //pattern; this test has to precede any look at its numeric properties
        if( sses[i].pattern.empty() ) return false;
        const string::size_type len = sses[i].pattern.size();

        //the tolerated counts are checked for the sign explicitly instead of
        //relying on a signed to unsigned conversion in the comparison
        if( sses[i].num_mismatches < 0 ) return false;
        if( static_cast<string::size_type>(sses[i].num_mismatches) > len ) return false;
        if( sses[i].num_insertions < 0 ) return false;

        if( sses[i].is_helix ){
            if( sses[i].num_mispairings < 0 ) return false;
            if( static_cast<string::size_type>(sses[i].num_mispairings) > len ) return false;

            reverse(sses[i].complement.begin(), sses[i].complement.end()); //reverse the complement
            for(string::size_type j=0;j<len;j++){ //complementarity check
                if(!is_complemntary(sses[i].pattern[j], sses[i].complement[j], sses[i].transf_matrix)) return false;
            }
        }
    }

    return has_no_duplicates(srch_order);
}

// Computes for every SSE the range of its size: the minimum is the length of
// the pattern without its wildcards ('*'), the maximum is the whole length of
// the pattern plus the number of tolerated insertions.
// For every helix it then computes the range of the distance between its
// strands as the sum of the size ranges of all motif elements which lie
// between the first and the second strand.
void Descriptor::compute_sizes(){
    //size ranges of sses
    for(size_t i=1;i<sses.size();++i){
        const string &pattern = sses[i].pattern;
        const int wildcards = static_cast<int>( count(pattern.begin(), pattern.end(), '*') );

        sses[i].size_range.first = pattern.size() - wildcards;
        sses[i].size_range.second = pattern.size() + sses[i].num_insertions;
    }

    //strand distances of helices
    for(size_t i=1;i<sses.size();++i){
        if(!sses[i].is_helix) continue;

        const int id = static_cast<int>(i);
        const vector<int>::const_iterator first_strand = find(motif.begin(), motif.end(), id);
        const vector<int>::const_iterator second_strand = find(motif.begin(), motif.end(), -id);

        int shortest=0, longest=0;
        if(first_strand < second_strand){
            //only the elements strictly between the two strands are counted
            for(vector<int>::const_iterator it=first_strand+1; it!=second_strand; ++it){
                shortest += sses[abs(*it)].size_range.first;
                longest += sses[abs(*it)].size_range.second;
            }
        }
        sses[i].strand_dist.first = shortest;
        sses[i].strand_dist.second = longest;
    }
}

//returns the order in which the elements of the motif are going to be searched
//in a human-readable form: the original names of the elements, each followed
//by a space; an index with no name in "transl" is skipped
string Descriptor::search_order_to_str(){
    string names;
    for(vector<int>::const_iterator pos=srch_order.begin(); pos!=srch_order.end(); ++pos){
        //look up the first name which is translated to this index
        map<string, int>::const_iterator entry=transl.begin();
        while( entry!=transl.end() && entry->second!=*pos ) ++entry;

        if( entry!=transl.end() ){
            names += entry->first;
            names += ' ';
        }
    }
    return names;
}

//returns the maximal length of the motif specified by the instance of Descriptor,
//i.e. the sum of the maximal sizes of all strands listed in the motif
int Descriptor::get_max_motif_length(){
    int total=0;
    for(vector<int>::const_iterator strand=motif.begin(); strand!=motif.end(); ++strand){
        total += sses[abs(*strand)].size_range.second;
    }
    return total;
}

//compute which helical elements cause a pseudoknot and of which level, pseudoknots
// of up to level 2 are supported; the result is stored in vector pknot_levels
// (sized to motif.size(), zero-filled and then written at indices of helices in "sses")
void Descriptor::compute_pknot_levels(){
    pknot_levels.clear();
    pknot_levels.resize(motif.size());

    //helices whose first strand has been passed and the second one has not
    stack<int> open;

    for(size_t pos=0;pos<motif.size();++pos){
        const int id = abs(motif[pos]);
        if(!sses[id].is_helix) continue;

        if(motif[pos]>0){ //the first strand just opens the helix
            open.push(id);
            continue;
        }

        //the second strand: every helix opened after this one and still not
        //closed crosses it; take them out, the topmost one first
        vector<int> crossing;
        while(open.top() != id){
            crossing.push_back(open.top());
            open.pop();
        }
        open.pop(); //the helix being closed is removed for good

        //put the crossing helices back in their original order and lift
        //their level above the level of the helix being closed
        for(vector<int>::reverse_iterator h=crossing.rbegin(); h!=crossing.rend(); ++h){
            open.push(*h);

            if(pknot_levels[*h] <= pknot_levels[id]){
                pknot_levels[*h] = pknot_levels[id] + 1;
            }
        }
    }
}

//returns the secondary structure in dot-bracket notation; every strand of the
// motif is written as its maximal size many copies of one symbol: '.' for
// a single strand and a bracket for a strand of a helix, the kind of the bracket
// is given by the pseudoknot level of the helix - "()" for 0, "[]" for 1, "{}" for 2;
// any other level is reported through die()
string Descriptor::get_dotbracket_notation(){
    //brackets indexed by the pseudoknot level
    const char opening[] = "([{";
    const char closing[] = ")]}";

    string notation;
    for(size_t pos=0;pos<motif.size();++pos){
        const int id = abs(motif[pos]);
        char symbol = '.';

        if(sses[id].is_helix == true){
            const bool first_strand = motif[pos]>0;
            const int level = pknot_levels[id];

            if(level>=0 && level<=2){
                symbol = first_strand ? opening[level] : closing[level];
            } else if(first_strand){
                symbol = '-';
                die("Unexpected situation + in Descriptor::get_dotbracket_notation()");
            } else {
                symbol = '+';
                die("Unexpected situation in Descriptor::get_dotbracket_notation()");
            }
        }

        //the strand is drawn in its maximal size
        for(int k=0;k<sses[id].size_range.second;++k){
            notation += symbol;
        }
    }

    return notation;
}
