LocARNA-1.9.2: src/LocARNA/exact_matcher.hh Source File

00001 #ifndef EXACT_MATCHER_HH
00002 #define EXACT_MATCHER_HH
00003 
00004 #ifdef HAVE_CONFIG_H
00005 #include <config.h>
00006 #endif
00007 
00008 #include <iostream>
00009 #include <sstream>
00010 #include <list>
00011 #include <algorithm>
00012 #include <limits>
00013 #include <iterator>
00014 #include "trace_controller.hh"
00015 #include "sparsification_mapper.hh"
00016 #include "tuples.hh"
00017 #include "scoring.hh"
00018 #include "ext_rna_data.hh"
00019 #include "aux.hh"
00020 
00021 extern "C" {
00022 #include <ViennaRNA/fold_vars.h>
00023 #include <ViennaRNA/utils.h>
00024 #include <ViennaRNA/PS_dot.h>
00025 #include <ViennaRNA/fold.h>
00026 int
00027 PS_rna_plot(char *string, char *structure, char *file);
00028 int
00029 PS_rna_plot_a(char *string, char *structure, char *file, char *pre, char *post);
00030 float
00031 fold(const char *sequence, char *structure);
00032 }
00033 
00034 namespace LocARNA {
00035 
00036     typedef size_t size_type;
00037     typedef std::vector<unsigned int> intVec;
00038     typedef std::pair<unsigned int, unsigned int> intPair;
00039     typedef std::pair<intPair, intPair> intPPair;
00040     typedef const intPPair *intPPairPTR;
00041     typedef std::vector<intPPair>::const_iterator IntPPairCITER;
00042 
00046     class SinglePattern {
00047     public:
00048         SinglePattern(){};
00049 
00056         SinglePattern(const std::string &myId_,
00057                       const std::string &seqId_,
00058                       const intVec &mySinglePattern_)
00059             : myId(myId_), seqId(seqId_), pattern(mySinglePattern_){};
00060 
00064         virtual ~SinglePattern() { pattern.clear(); };
00065 
00070         const std::string &
00071         getmyId() const {
00072             return myId;
00073         };
00074 
00079         const std::string &
00080         getseqId() const {
00081             return seqId;
00082         };
00083 
00088         const intVec &
00089         getPat() const {
00090             return pattern;
00091         };
00092 
00093     private:
00094         std::string myId;  
00095         std::string seqId; 
00096         intVec pattern;    
00097     };
00098 
00103     class PatternPair {
00104     public:
00105         PatternPair(){};
00106 
00115         PatternPair(const std::string &myId,
00116                     const SinglePattern &myFirstPat,
00117                     const SinglePattern &mySecPat,
00118                     const std::string &structure_,
00119                     int &score_)
00120             : id(myId),
00121               first(myFirstPat),
00122               second(mySecPat),
00123               structure(structure_),
00124               EPMscore(score_) {
00125             if (first.getPat().size() != second.getPat().size()) {
00126                 std::cerr << "Error! PatternPair cannot be constructed due to "
00127                              "different sizes of SinglePatterns!"
00128                           << std::endl;
00129             }
00130             score = EPMscore;
00131             size = first.getPat().size();
00132         };
00133 
00135         virtual ~PatternPair() { insideBounds.clear(); };
00136 
00141         const std::string &
00142         getId() const {
00143             return id;
00144         };
00145 
00150         const int &
00151         getSize() const {
00152             return size;
00153         };
00154 
00159         const SinglePattern &
00160         getFirstPat() const {
00161             return first;
00162         };
00163 
00168         const SinglePattern &
00169         getSecPat() const {
00170             return second;
00171         };
00172 
00174         void
00175         resetBounds();
00176 
00181         void
00182         setOutsideBounds(intPPair myPPair);
00183 
00188         const intPPair
00189         getOutsideBounds() const {
00190             return outsideBounds;
00191         };
00192 
00194         void
00195         addInsideBounds(intPPair myPPair);
00196 
00201         const std::vector<intPPair> &
00202         getInsideBounds() const {
00203             return insideBounds;
00204         };
00205 
00210         void
00211         setEPMScore(int myScore);
00212 
00217         const int
00218         getScore() const {
00219             return score;
00220         };
00221 
00226         const int
00227         getEPMScore() const {
00228             return EPMscore;
00229         };
00230 
00235         const std::string &
00236         get_struct() const {
00237             return structure;
00238         };
00239 
00240     private:
00241         std::string id;       
00242         int size;             
00243         SinglePattern first;  
00244         SinglePattern second; 
00245 
00246         std::string structure;              
00247         int score;                          
00248         int EPMscore;                       
00249         std::vector<intPPair> insideBounds; 
00250         intPPair outsideBounds;             
00251     };
00252 
00256     class PatternPairMap {
00257     public:
00258         typedef PatternPair selfValueTYPE; 
00259         typedef PatternPair *SelfValuePTR; 
00260 
00261         typedef std::multimap<int, SelfValuePTR, std::greater<int> >
00262             orderedMapTYPE; 
00263         typedef orderedMapTYPE::const_iterator
00264             orderedMapCITER; 
00265         typedef orderedMapTYPE::iterator
00266             orderedMapITER;                          
00267         typedef std::list<SelfValuePTR> patListTYPE; 
00268         typedef patListTYPE::iterator
00269             patListITER; 
00270         typedef patListTYPE::const_iterator
00271             patListCITER; 
00272         typedef unordered_map<std::string, SelfValuePTR>::type
00273             PatternIdMapTYPE; 
00274 
00276         PatternPairMap();
00277 
00280         PatternPairMap(const PatternPairMap &myPairMap)
00281             : patternList(myPairMap.patternList),
00282               patternOrderedMap(myPairMap.patternOrderedMap),
00283               idMap(myPairMap.idMap) {
00284             minPatternSize = 100000;
00285         };
00286 
00288         virtual ~PatternPairMap();
00289 
00299         void
00300         add(const std::string &id,
00301             const SinglePattern &first,
00302             const SinglePattern &second,
00303             const std::string &structure,
00304             int score);
00305 
00310         void
00311         add(const SelfValuePTR value);
00312 
00314         void
00315         makeOrderedMap();
00316 
00318         void
00319         updateFromMap();
00320 
00326         const PatternPair &
00327         getPatternPair(const std::string &id) const;
00328 
00334         const SelfValuePTR
00335         getPatternPairPTR(const std::string &id) const;
00336 
00341         const patListTYPE &
00342         getList() const;
00343 
00348         const orderedMapTYPE &
00349         getOrderedMap() const;
00350 
00355         orderedMapTYPE &
00356         getOrderedMap2();
00357 
00362         const int
00363         size() const;
00364 
00369         int
00370         getMapBases();
00371 
00376         int
00377         getMapEPMScore();
00378 
00383         const int
00384         getMinPatternSize() const {
00385             return minPatternSize;
00386         };
00387 
00388     private:
00389         patListTYPE patternList;          
00390         orderedMapTYPE patternOrderedMap; 
00391 
00392         PatternIdMapTYPE idMap; 
00393         int minPatternSize;     
00394     };
00395 
00402     std::ostream &
00403     operator<<(std::ostream &out,
00404                const PatternPairMap::patListTYPE &pat_pair_map);
00405 
00409     class LCSEPM {
00410     public:
00418         LCSEPM(const Sequence &seqA_,
00419                const Sequence &seqB_,
00420                const PatternPairMap &myPatterns,
00421                PatternPairMap &myLCSEPM)
00422 
00423             : seqA(seqA_),
00424               seqB(seqB_),
00425               matchedEPMs(myLCSEPM),
00426               patterns(myPatterns){};
00427 
00429         virtual ~LCSEPM();
00430 
00439         void
00440         MapToPS(const std::string &sequenceA,
00441                 const std::string &sequenceB,
00442                 PatternPairMap &myMap,
00443                 const std::string &file1,
00444                 const std::string &file2);
00445 
00447         void
00448         calculateLCSEPM(bool quiet);
00449 
00451         std::pair<SequenceAnnotation, SequenceAnnotation>
00452         anchor_annotation();
00453 
00455         void
00456         output_locarna(const std::string &sequenceA,
00457                        const std::string &sequenceB,
00458                        const std::string &outfile);
00459 
00461         void
00462         output_clustal(const std::string &outfile_name);
00463 
00464     private:
00465         struct HoleCompare2 {
00466             bool
00467             operator()(const intPPairPTR &h1, const intPPairPTR &h2) const {
00468                 // first compare size of holes
00469                 if (h1->first.second - h1->first.first - 1 <
00470                     h2->first.second - h2->first.first - 1) {
00471                     return true;
00472                 }
00473                 // compare if holes are identical in both structures
00474                 if (h1->first.second - h1->first.first - 1 ==
00475                     h2->first.second - h2->first.first - 1) {
00476                     if ((h1->first.first == h2->first.first) &&
00477                         (h1->first.second == h2->first.second) &&
00478                         (h1->second.first == h2->second.first) &&
00479                         (h1->second.second == h2->second.second)) {
00480                         return true;
00481                     }
00482                 }
00483 
00484                 return false;
00485             }
00486         };
00487 
00488         typedef std::multimap<intPPairPTR,
00489                               PatternPairMap::SelfValuePTR,
00490                               HoleCompare2>
00491             HoleOrderingMapTYPE2;
00492         typedef HoleOrderingMapTYPE2::const_iterator HoleMapCITER2;
00493 
00494         void
00495         preProcessing();
00496         void
00497         calculateHoles3(bool quiet);
00498         void
00499         calculatePatternBoundaries(PatternPair *myPair);
00500         void
00501         calculateTraceback2(const int i,
00502                             const int j,
00503                             const int k,
00504                             const int l,
00505                             std::vector<std::vector<int> > holeVec);
00506         int
00507         D_rec2(const int &i,
00508                const int &j,
00509                const int &k,
00510                const int &l,
00511                std::vector<std::vector<int> > &D_h,
00512                const bool debug);
00513 
00514         int
00515         max3(int a, int b, int c) {
00516             int tmp = a > b ? a : b;
00517             return (tmp > c ? tmp : c);
00518         };
00519 
00521         char *
00522         getStructure(PatternPairMap &myMap, bool firstSeq, int length);
00523 
00524         std::string
00525         intvec2str(const std::vector<unsigned int> &V,
00526                    const std::string &delim) {
00527             std::stringstream oss;
00528             copy(V.begin(), V.end(),
00529                  std::ostream_iterator<unsigned int>(oss, delim.c_str()));
00530             std::string tmpstr;
00531             tmpstr = oss.str();
00532             if (tmpstr.length() > 0)
00533                 tmpstr.erase(tmpstr.end() - 1);
00534             return tmpstr;
00535         }
00536 
00537         std::string
00538         upperCase(const std::string &seq) {
00539             std::string s = "";
00540             for (unsigned int i = 0; i < seq.length(); i++)
00541                 s += toupper(seq[i]);
00542             return s;
00543         }
00544 
00545         std::vector<std::vector<std::vector<PatternPairMap::SelfValuePTR> > >
00546             EPM_Table2;
00547         HoleOrderingMapTYPE2 holeOrdering2;
00548         const Sequence &seqA;
00549         const Sequence &seqB;
00550         PatternPairMap &matchedEPMs;
00551         const PatternPairMap &patterns;
00552     };
00553 
00557     class SparseTraceController : public TraceController {
00558     private:
00559         typedef SparsificationMapper::matidx_t
00560             matidx_t; 
00561         typedef SparsificationMapper::seq_pos_t
00562             seqpos_t; 
00563         typedef SparsificationMapper::index_t
00564             index_t; 
00565 
00566     public:
00567         typedef std::pair<matidx_t, matidx_t>
00568             matpos_t; 
00569         typedef std::pair<seqpos_t, seqpos_t>
00570             pair_seqpos_t; 
00571 
00572     private:
00573         const SparsificationMapper
00574             &sparse_mapperA; 
00575         const SparsificationMapper
00576             &sparse_mapperB; 
00577 
00578     public:
00586         SparseTraceController(const SparsificationMapper &sparse_mapperA_,
00587                               const SparsificationMapper &sparse_mapperB_,
00588                               const TraceController &trace_controller_)
00589             : TraceController::TraceController(trace_controller_),
00590               sparse_mapperA(sparse_mapperA_),
00591               sparse_mapperB(sparse_mapperB_)
00592 
00593         {}
00594 
00595         virtual ~SparseTraceController(){}; 
00596 
00598         const SparsificationMapper &
00599         get_sparse_mapperA() const {
00600             return sparse_mapperA;
00601         }
00602 
00604         const SparsificationMapper &
00605         get_sparse_mapperB() const {
00606             return sparse_mapperB;
00607         }
00608 
00620         matidx_t
00621         min_col_idx(
00622             index_t indexA,
00623             index_t indexB,
00624             matidx_t idx_i,
00625             index_t left_endB = std::numeric_limits<index_t>::max()) const {
00626             seqpos_t i = sparse_mapperA.get_pos_in_seq_new(indexA, idx_i);
00627             return sparse_mapperB.idx_geq(indexB, min_col(i), left_endB);
00628         }
00629 
00642         matidx_t
00643         idx_after_max_col_idx(
00644             index_t indexA,
00645             index_t indexB,
00646             matidx_t idx_i,
00647             index_t left_endB = std::numeric_limits<index_t>::max()) const {
00648             seqpos_t i = sparse_mapperA.get_pos_in_seq_new(indexA, idx_i);
00649             return sparse_mapperB.idx_after_leq(indexB, max_col(i), left_endB);
00650         }
00651 
00671         matpos_t
00672         diag_pos_bef(
00673             index_t indexA,
00674             index_t indexB,
00675             pair_seqpos_t cur_pos_seq,
00676             index_t left_endA = std::numeric_limits<index_t>::max(),
00677             index_t left_endB = std::numeric_limits<index_t>::max()) const {
00678             bool debug_valid_mat_pos = false;
00679 
00680             if (debug_valid_mat_pos)
00681                 std::cout << "first valid mat pos before with tc " << std::endl;
00682 
00683             seqpos_t i = cur_pos_seq.first;
00684             seqpos_t j = cur_pos_seq.second;
00685 
00686             matidx_t idx_after_max_col;
00687 
00688             // find valid matrix position based on the SparsificationMapper
00689             matidx_t cur_row =
00690                 sparse_mapperA.first_valid_mat_pos_before(indexA, i, left_endA);
00691             matidx_t col_before =
00692                 sparse_mapperB.first_valid_mat_pos_before(indexB, j, left_endB);
00693 
00694             // bool valid_pos_found = false;
00695 
00696             // find a valid position that is valid also based on the
00697             // TraceController
00698             // go through the rows and find an interval that includes the column
00699             // col_before or lies
00700             // before the column col_before
00701             for (;; --cur_row) {
00702                 matidx_t min_col =
00703                     min_col_idx(indexA, indexB, cur_row, left_endB);
00704                 idx_after_max_col =
00705                     idx_after_max_col_idx(indexA, indexB, cur_row, left_endB);
00706 
00707                 if (debug_valid_mat_pos)
00708                     std::cout << "interval " << min_col << ","
00709                               << idx_after_max_col << std::endl;
00710 
00711                 // valid interval found
00712                 if (min_col < idx_after_max_col && min_col <= col_before) {
00713                     // valid_pos_found=true;
00714                     break;
00715                 }
00716 
00717                 if (cur_row == 0) {
00718                     break;
00719                 }
00720             }
00721 
00722             // assert(valid_pos_found);
00723             assert(idx_after_max_col > 0);
00724 
00725             matidx_t max_col = idx_after_max_col - 1;
00726 
00727             // the column of the new position is the col_before or lies before
00728             // it
00729             matpos_t result = matpos_t(cur_row, std::min(max_col, col_before));
00730 
00731             assert(is_valid_idx_pos(indexA, indexB, result));
00732 
00733             return result;
00734         }
00735 
00744         pair_seqpos_t
00745         pos_in_seq(index_t idxA,
00746                    index_t idxB, // const Arc &a, const Arc &b,
00747                    const matpos_t &cur_pos) const {
00748             return pair_seqpos_t(
00749                 sparse_mapperA.get_pos_in_seq_new(idxA, cur_pos.first),
00750                 sparse_mapperB.get_pos_in_seq_new(idxB, cur_pos.second));
00751         }
00752 
00766         bool
00767         matching_wo_gap(index_t idxA,
00768                         index_t idxB,
00769                         const matpos_t &idx_pos_diag,
00770                         pair_seqpos_t seq_pos_to_be_matched) const {
00771             pair_seqpos_t pos_diag = pos_in_seq(idxA, idxB, idx_pos_diag);
00772             return (pos_diag.first + 1 == seq_pos_to_be_matched.first) &&
00773                 (pos_diag.second + 1 == seq_pos_to_be_matched.second);
00774         }
00775 
00783         bool
00784         pos_unpaired(index_t idxA, index_t idxB, matpos_t pos) const {
00785             return sparse_mapperA.pos_unpaired(idxA, pos.first) &&
00786                 sparse_mapperB.pos_unpaired(idxB, pos.second);
00787         }
00788 
00796         bool
00797         is_valid_idx_pos(index_t idxA, index_t idxB, matpos_t mat_pos) const {
00798             pair_seqpos_t seq_pos = pos_in_seq(idxA, idxB, mat_pos);
00799             return is_valid(seq_pos.first, seq_pos.second);
00800         }
00801     };
00802 
00806     class EPM {
00807     public:
00808         typedef BasePairs__Arc Arc; 
00809 
00810         typedef SparsificationMapper::seq_pos_t
00811             seqpos_t; 
00812         typedef SparseTraceController::matpos_t
00813             matpos_t; 
00814         typedef SparsificationMapper::ArcIdx ArcIdx; 
00815         typedef SparseTraceController::pair_seqpos_t
00816             pair_seqpos_t; 
00817         typedef std::pair<ArcIdx, ArcIdx> PairArcIdx; 
00818         typedef std::vector<PairArcIdx>
00819             PairArcIdxVec; 
00820 
00823         typedef triple<seqpos_t, seqpos_t, char> el_pat_vec;
00824 
00825         typedef std::vector<el_pat_vec> pat_vec_t; 
00826 
00827     private:
00828         pat_vec_t pat_vec; 
00829 
00830         score_t score; 
00831         int state; 
00832 
00833         matpos_t cur_pos;     
00834 
00835         score_t max_tol_left; 
00836 
00837         bool first_insertion; 
00838 
00839         bool invalid; 
00840 
00841 
00842 
00843 
00844         PairArcIdxVec am_to_do; 
00845 
00846 
00848         struct compare_el_pat_vec {
00849         public:
00850             bool
00851             operator()(const EPM::el_pat_vec &el1,
00852                        const EPM::el_pat_vec &el2) const {
00853                 seqpos_t el1_pos1 = el1.first;
00854                 seqpos_t el1_pos2 = el1.second;
00855                 seqpos_t el2_pos1 = el2.first;
00856                 seqpos_t el2_pos2 = el2.second;
00857                 char el1_struc = el1.third;
00858                 char el2_struc = el2.third;
00859                 return (el1_pos1 < el2_pos1) ||
00860                     (el1_pos1 == el2_pos1 && el1_pos2 < el2_pos2) ||
00861                     (el1_pos1 == el2_pos1 && el1_pos2 == el2_pos2 &&
00862                      el1_struc < el2_struc);
00863             }
00864         };
00865 
00866         struct compare_el_am_to_do {
00867         public:
00868             bool
00869             operator()(const EPM::PairArcIdx &el1,
00870                        const EPM::PairArcIdx &el2) const {
00871                 return (el1.first < el2.first) ||
00872                     ((el1.first == el2.first) && el1.second < el2.second);
00873             }
00874         };
00875 
00876     public:
00878         EPM()
00879             : score(0),
00880               state(0),
00881               cur_pos(matpos_t(0, 0)),
00882               max_tol_left(0),
00883               first_insertion(true),
00884               invalid(false) {}
00885 
00886         virtual ~EPM() {} 
00887 
00888         //-----------------------------------------------------------------------
00889         // getter methods
00890         //-----------------------------------------------------------------------
00891 
00893         score_t
00894         get_score() const {
00895             return score;
00896         }
00897 
00899         int
00900         get_state() const {
00901             return state;
00902         }
00903 
00905         const matpos_t &
00906         get_cur_pos() const {
00907             return cur_pos;
00908         }
00909 
00911         const score_t &
00912         get_max_tol_left() const {
00913             return max_tol_left;
00914         }
00915 
00917         bool
00918         get_first_insertion() const {
00919             return first_insertion;
00920         }
00921 
00925         bool
00926         is_invalid() const {
00927             return invalid;
00928         }
00929 
00930         //-----------------------------------------------------------------------
00931         // setter methods
00932         //-----------------------------------------------------------------------
00933 
00938         void
00939         set_score(score_t score_) {
00940             score = score_;
00941         }
00942 
00947         void
00948         set_state(int state_) {
00949             state = state_;
00950         }
00951 
00956         void
00957         set_cur_pos(const matpos_t &cur_pos_) {
00958             cur_pos = cur_pos_;
00959         }
00960 
00965         void
00966         set_max_tol_left(score_t tol) {
00967             max_tol_left = tol;
00968         }
00969 
00974         void
00975         set_first_insertion(bool first_insertion_) {
00976             first_insertion = first_insertion_;
00977         }
00978 
00980         void
00981         set_invalid() {
00982             invalid = true;
00983         }
00984 
00991         const PairArcIdx &
00992         get_am(PairArcIdxVec::size_type idx) const {
00993             assert(idx < am_to_do.size());
00994             return am_to_do[idx];
00995         }
00996 
01001         PairArcIdxVec::size_type
01002         number_of_am() {
01003             return am_to_do.size();
01004         }
01005 
01007         void
01008         clear_am_to_do() {
01009             am_to_do.clear();
01010         }
01011 
01017         PairArcIdxVec::const_iterator
01018         am_begin() const {
01019             return am_to_do.begin();
01020         }
01021 
01027         PairArcIdxVec::const_iterator
01028         am_end() const {
01029             return am_to_do.end();
01030         }
01031 
01037         el_pat_vec
01038         pat_vec_at(pat_vec_t::size_type idx) const {
01039             assert(idx < pat_vec.size());
01040             return pat_vec[idx];
01041         }
01042 
01047         pat_vec_t::size_type
01048         pat_vec_size() const {
01049             return pat_vec.size();
01050         }
01051 
01056         pat_vec_t::const_iterator
01057         begin() const {
01058             return pat_vec.begin();
01059         }
01060 
01065         pat_vec_t::const_iterator
01066         end() const {
01067             return pat_vec.end();
01068         }
01069 
01074         pair_seqpos_t
01075         last_matched_pos() {
01076             assert(!pat_vec.empty());
01077             return pair_seqpos_t(pat_vec.back().first, pat_vec.back().second);
01078         }
01079 
01089         void
01090         add(seqpos_t posA, seqpos_t posB, char c) {
01091             pat_vec.push_back(el_pat_vec(posA, posB, c));
01092         }
01093 
01104         void
01105         overwrite(seqpos_t posA,
01106                   seqpos_t posB,
01107                   char c,
01108                   pat_vec_t::size_type pos) {
01109             if (pat_vec.size() <= pos) {
01110                 pat_vec.push_back(el_pat_vec(posA, posB, c));
01111             }
01112             pat_vec.at(pos) = el_pat_vec(posA, posB, c);
01113         }
01114 
01120         void
01121         add_am(const Arc &a, const Arc &b) {
01122             add(a.right(), b.right(), ')');
01123             add(a.left(), b.left(), '(');
01124         }
01125 
01131         void
01132         store_am(const Arc &a, const Arc &b) {
01133             const PairArcIdx &pair_arc_idx = PairArcIdx(a.idx(), b.idx());
01134             // store the pair of arc indices in the am_to_do datastructure
01135             am_to_do.push_back(pair_arc_idx);
01136         }
01137 
01142         PairArcIdx
01143         next_arcmatch() {
01144             PairArcIdx arc_idx = am_to_do.back();
01145             am_to_do.pop_back();
01146             return arc_idx;
01147         }
01148 
01155         void
01156         sort_patVec() {
01157             sort(pat_vec.begin(), pat_vec.end(), compare_el_pat_vec());
01158         }
01159 
01163         void
01164         sort_am_to_do() {
01165             sort(am_to_do.begin(), am_to_do.end(), compare_el_am_to_do());
01166         }
01167 
01173         void
01174         insert_epm(const EPM &epm_to_insert) {
01175             pat_vec.insert(pat_vec.end(), epm_to_insert.begin(),
01176                            epm_to_insert.end());
01177         }
01178 
01185         bool
01186         includes(const EPM &epm_to_test) const {
01187             assert(pat_vec_size() >= epm_to_test.pat_vec_size());
01188             return std::includes(this->begin(), this->end(),
01189                                  epm_to_test.begin(), epm_to_test.end(),
01190                                  compare_el_pat_vec());
01191         }
01192 
01201         bool
01202         includes_am(const EPM &epm_to_test) const {
01203             return std::includes(am_begin(), am_end(), epm_to_test.am_begin(),
01204                                  epm_to_test.am_end(), compare_el_am_to_do());
01205         }
01206 
01212         void
01213         print_epm(std::ostream &out, bool verbose) const {
01214             out << "_________________________________________________"
01215                 << std::endl;
01216             out << "epm with score " << this->score << std::endl;
01217             out << " ";
01218             for (pat_vec_t::const_iterator it = pat_vec.begin();
01219                  it != pat_vec.end(); ++it) {
01220                 out << it->first << ":" << it->second << " ";
01221             }
01222             out << std::endl;
01223             out << " ";
01224             for (pat_vec_t::const_iterator it = pat_vec.begin();
01225                  it != pat_vec.end(); ++it) {
01226                 out << it->third;
01227             }
01228             out << std::endl;
01229             out << "am_to_do " << am_to_do << std::endl;
01230             out << "tolerance left " << this->max_tol_left << std::endl;
01231             if (verbose) {
01232                 out << "score " << score << std::endl;
01233                 out << "pos " << this->cur_pos.first << ","
01234                     << this->cur_pos.second << std::endl;
01235                 out << "state " << this->state << std::endl;
01236             }
01237             out << "______________________________________________________"
01238                 << std::endl;
01239         }
01240     };
01241 
01250     inline bool
01251     operator<(const EPM &epm1, const EPM &epm2) {
01252         return epm1.get_max_tol_left() > epm2.get_max_tol_left();
01253     }
01254 
01261     inline std::ostream &
01262     operator<<(std::ostream &out, const EPM &epm) {
01263         epm.print_epm(out, false);
01264         return out;
01265     }
01266 
01274     template <class T1>
01275     T1
01276     max3(const T1 &first, const T1 &second, const T1 &third) {
01277         return max(max(first, second), third);
01278     }
01279 
01288     template <class T1>
01289     T1
01290     max4(const T1 &first, const T1 &second, const T1 &third, const T1 &fourth) {
01291         return max(max3(first, second, third), fourth);
01292     }
01293 
01309     class ExactMatcher {
01310         typedef BasePairs__Arc Arc; 
01311 
01312         typedef SparsificationMapper::ArcIdx ArcIdx; 
01313         typedef SparsificationMapper::ArcIdxVec
01314             ArcIdxVec; 
01315         typedef SparsificationMapper::matidx_t
01316             matidx_t; 
01317         typedef SparsificationMapper::seq_pos_t
01318             seqpos_t; 
01319         typedef SparsificationMapper::index_t index_t; 
01320 
01321 
01322 
01323         typedef SparseTraceController::matpos_t
01324             matpos_t; 
01325         typedef SparseTraceController::pair_seqpos_t
01326             pair_seqpos_t; 
01327 
01328         typedef EPM::PairArcIdx PairArcIdx; 
01329         typedef EPM::PairArcIdxVec
01330             PairArcIdxVec; 
01331 
01332         typedef std::list<EPM>
01333             epm_cont_t; 
01334         typedef epm_cont_t::iterator epm_it_t; 
01335         typedef std::pair<score_t, epm_cont_t> el_map_am_to_do_t; 
01336 
01337 
01338 
01339 
01340 
01341 
01344         typedef unordered_map<PairArcIdx,
01345                               el_map_am_to_do_t,
01346                               pair_of_size_t_hash>::type map_am_to_do_t;
01347 
01348     private:
01352         //<state, max_tol, current matrix position, potential arcMatch, sequence
01353         //position to be matched>
01354         typedef quintuple<int,
01355                           infty_score_t,
01356                           matpos_t,
01357                           PairArcIdx,
01358                           pair_seqpos_t>
01359             poss_L_LR;
01360 
01361         // infty_score_t because of the check_poss, change to score_t!!!
01364         typedef triple<int, infty_score_t, matpos_t>
01365             poss_in_G; //<state,max_tol,current matrix position>
01366 
01367         const Sequence &seqA; 
01368         const Sequence &seqB; 
01369 
01370         const RnaData &rna_dataA; 
01371 
01372         const RnaData &rna_dataB; 
01373 
01374         const ArcMatches
01375             &arc_matches; 
01376 
01377         const BasePairs &bpsA; 
01378         const BasePairs &bpsB; 
01379         const SparseTraceController
01380             &sparse_trace_controller; 
01381 
01382         // (valid positions in the matrix)
01383         const SparsificationMapper
01384             &sparse_mapperA; 
01385         const SparsificationMapper
01386             &sparse_mapperB;       
01387         PatternPairMap &foundEPMs; 
01388 
01389 
01390 
01391         ScoreMatrix L;   
01392         ScoreMatrix G_A; 
01393 
01394 
01395 
01396         ScoreMatrix G_AB; 
01397 
01398         ScoreMatrix
01399             LR; 
01400         ScoreMatrix F;    
01401         ScoreMatrix Dmat; 
01402 
01403 
01404         int alpha_1; 
01405         int alpha_2; 
01406         int alpha_3; 
01407 
01408         int difference_to_opt_score; 
01409 
01410 
01411         // worse than the optimal score are considered
01412         int min_score;               
01413         long int max_number_of_EPMs; 
01414 
01415         long int cur_number_of_EPMs; 
01416 
01417 
01418         bool inexact_struct_match;     
01419 
01420         score_t struct_mismatch_score; 
01421 
01422         bool add_filter; 
01423 
01424 
01425         bool verbose; 
01426 
01427         pair_seqpos_t pos_of_max; 
01428 
01429         enum {
01430             in_LR,
01431             in_G_A,
01432             in_G_AB,
01433             in_L,
01434             in_F
01435         }; 
01436 
01437         const Arc pseudo_arcA; 
01438         const Arc pseudo_arcB; 
01439 
01447         const infty_score_t &
01448         D(const ArcMatch &am) const {
01449             return D(am.arcA(), am.arcB());
01450         }
01451 
01459         infty_score_t &
01460         D(const ArcMatch &am) {
01461             return D(am.arcA(), am.arcB());
01462         }
01463 
01472         const infty_score_t &
01473         D(const Arc &a, const Arc &b) const {
01474             return Dmat(a.idx(), b.idx());
01475         }
01476 
01485         infty_score_t &
01486         D(const Arc &a, const Arc &b) {
01487             return Dmat(a.idx(), b.idx());
01488         }
01489 
01490         bool
01491         nucleotide_match(seqpos_t pos_seqA, seqpos_t pos_seqB) const {
01492             assert(pos_seqA >= 1 && pos_seqA <= seqA.length() &&
01493                    pos_seqB >= 1 &&
01494                    pos_seqB <= seqB.length()); // seqA and seqB are 1-based
01495             return (seqA[pos_seqA] == seqB[pos_seqB]);
01496         }
01497 
01498         bool
01499         seq_matching(ArcIdx idxA,
01500                      ArcIdx idxB,
01501                      matpos_t cur_mat_pos,
01502                      pair_seqpos_t cur_seq_pos) const {
01503             seqpos_t i = cur_seq_pos.first;
01504             seqpos_t j = cur_seq_pos.second;
01505 
01506             return sparse_trace_controller.pos_unpaired(idxA, idxB,
01507                                                         cur_mat_pos) &&
01508                 nucleotide_match(i, j);
01509         }
01510 
01511         // ----------------------------------------
01512         // fill matrices
01513 
01519         void
01520         initialize_gap_matrices();
01521 
01523         void
01524         init_Fmat();
01525 
01537         void
01538         init_mat(ScoreMatrix &mat,
01539                  const Arc &a,
01540                  const Arc &b,
01541                  infty_score_t first_entry,
01542                  infty_score_t first_col,
01543                  infty_score_t first_row);
01544 
01556         pair_seqpos_t
01557         compute_LGLR(const Arc &a, const Arc &b, bool suboptimal);
01558 
01573         infty_score_t
01574         compute_matrix_entry(const Arc &a,
01575                              const Arc &b,
01576                              matpos_t mat_pos,
01577                              matpos_t mat_pos_diag,
01578                              bool matrixLR,
01579                              bool suboptimal);
01580 
01599         infty_score_t
01600         seq_str_matching(const Arc &a,
01601                          const Arc &b,
01602                          matpos_t mat_pos_diag,
01603                          pair_seqpos_t seq_pos_to_be_matched,
01604                          score_t add_score,
01605                          bool matrixLR,
01606                          bool suboptimal);
01607 
01609         void
01610         compute_F();
01611 
01612         // --------------------------------------------
01613         // helper functions
01614 
01620         score_t
01621         score_for_seq_match();
01622 
01631         infty_score_t
01632         score_for_am(const Arc &a, const Arc &b) const;
01633 
01646         score_t
01647         score_for_stacking(const Arc &a,
01648                            const Arc &b,
01649                            const Arc &inner_a,
01650                            const Arc &inner_b);
01651 
01659         void
01660         add_foundEPM(EPM &cur_epm, bool count_EPMs);
01661 
01662         bool
01663         check_PPM() {
01664             if (this->difference_to_opt_score != -1)
01665                 return true; // when we use the parameter
01666                              // difference_to_opt_score,
01667             // we enumerate all EPMs regardless of whether the number extends
01668             // the max_number_of_EPMs
01669             if (cur_number_of_EPMs >= max_number_of_EPMs + 1)
01670                 return false;
01671             else
01672                 return true;
01673         }
01674 
01685         void
01686         find_start_pos_for_tb(bool suboptimal,
01687                               score_t difference_to_opt_score = -1,
01688                               bool count_EPMs = false);
01689 
01690         bool
01691         check_num_EPMs() {
01692             double valid_deviation = 0.8;
01693             return (cur_number_of_EPMs >=
01694                         max_number_of_EPMs * valid_deviation &&
01695                     cur_number_of_EPMs <= max_number_of_EPMs);
01696         }
01697 
01698         // --------------------------------------------
01699         // heuristic traceback
01700 
01709         void
01710         trace_F_heuristic(pos_type i, pos_type j, EPM &cur_epm);
01711 
01720         void
01721         trace_LGLR_heuristic(const Arc &a, const Arc &b, EPM &cur_epm);
01722 
01741         bool
01742         trace_seq_str_matching_heuristic(const Arc &a,
01743                                          const Arc &b,
01744                                          int &state,
01745                                          matpos_t &cur_mat_pos,
01746                                          matpos_t mat_pos_diag,
01747                                          pair_seqpos_t seq_pos_to_be_matched,
01748                                          score_t add_score);
01749 
01750         // --------------------------------------------
01751         // suboptimal traceback
01752 
01764         void
01765         trace_F_suboptimal(pos_type i,
01766                            pos_type j,
01767                            score_t max_tol,
01768                            bool recurse,
01769                            bool count_EPMs);
01770 
01780         void
01781         apply_filter(epm_cont_t &found_epms);
01782 
01795         void
01796         trace_LGLR_suboptimal(const Arc &a,
01797                               const Arc &b,
01798                               score_t max_tol,
01799                               epm_cont_t &found_epms,
01800                               bool recurse,
01801                               bool count_EPMs);
01802 
01828         void
01829         trace_seq_str_matching_subopt(const Arc &a,
01830                                       const Arc &b,
01831                                       score_t score_contr,
01832                                       matpos_t mat_pos_diag,
01833                                       pair_seqpos_t seq_pos_to_be_matched,
01834                                       const PairArcIdx &am,
01835                                       poss_L_LR &poss,
01836                                       epm_it_t cur_epm,
01837                                       epm_cont_t &found_epms,
01838                                       map_am_to_do_t &map_am_to_do,
01839                                       bool count_EPMs);
01840 
01860         bool
01861         check_poss(const Arc &a,
01862                    const Arc &b,
01863                    const poss_L_LR &pot_new_poss,
01864                    poss_L_LR &poss,
01865                    epm_it_t cur_epm,
01866                    epm_cont_t &found_epms,
01867                    map_am_to_do_t &am_to_do_for_cur_am,
01868                    bool count_EPMs);
01869 
01890         void
01891         store_new_poss(const Arc &a,
01892                        const Arc &b,
01893                        bool last_poss,
01894                        const poss_L_LR &new_poss,
01895                        poss_L_LR &poss,
01896                        epm_it_t cur_epm,
01897                        epm_cont_t &found_epms,
01898                        map_am_to_do_t &am_to_do_for_cur_am,
01899                        bool count_EPMs);
01900 
01919         void
01920         trace_G_suboptimal(const Arc &a,
01921                            const Arc &b,
01922                            const poss_L_LR &pot_new_poss,
01923                            poss_L_LR &poss,
01924                            epm_it_t cur_epm,
01925                            epm_cont_t &found_epms,
01926                            map_am_to_do_t &map_am_to_do,
01927                            bool count_EPMs);
01928 
01941         bool
01942         is_valid_gap(const Arc &a, const Arc &b, const poss_L_LR &pot_new_poss);
01943 
01964         void
01965         preproc_fill_epm(map_am_to_do_t &am_to_do,
01966                          epm_it_t cur_epm,
01967                          epm_cont_t &found_epms,
01968                          bool count_EPMs,
01969                          score_t min_allowed_score = -1);
01970 
01998         void
01999         fill_epm(const map_am_to_do_t &map_am_to_do,
02000                  size_type vec_idx,
02001                  std::vector<score_t> &max_tol_left_up_to_pos,
02002                  std::vector<const EPM *> &epms_to_insert,
02003                  score_t min_score,
02004                  epm_it_t cur_epm,
02005                  epm_cont_t &found_epms,
02006                  bool count_EPMs);
02007 
02008         // --------------------------------------------
02009         // debugging/testing
02010 
02011         // print the matrices in the condensed form
02012         void
02013         print_matrices(const Arc &a,
02014                        const Arc &b,
02015                        size_type offset_A,
02016                        size_type offset_B,
02017                        bool suboptimal,
02018                        bool add_info);
02019 
02020         // checks whether an epm is valid, i.e. only one gap per arc match etc.
02021         bool
02022         validate_epm(const EPM &epm_to_test) const;
02023 
02024         // checks the validity of the epm list, i.e. that no epm is contained in
02025         // another epm (all
02026         // epms are maximally extended)
02027         bool
02028         validate_epm_list(epm_cont_t &found_epms) const;
02029 
02030     public:
02060         ExactMatcher(const Sequence &seqA_,
02061                      const Sequence &seqB_,
02062                      const RnaData &rna_dataA_,
02063                      const RnaData &rna_dataB_,
02064                      const ArcMatches &arc_matches_,
02065                      const SparseTraceController &sparse_trace_controller_,
02066                      PatternPairMap &foundEPMs_,
02067                      int alpha_1_,
02068                      int alpha_2_,
02069                      int alpha_3_,
02070                      score_t difference_to_opt_score_,
02071                      score_t min_score_,
02072                      long int max_number_of_EPMs_,
02073                      bool inexact_struct_match_,
02074                      score_t struct_mismatch_score_,
02075                      bool apply_filter_,
02076                      bool verbose_);
02077 
02078         ~ExactMatcher();
02079 
02082         void
02083         compute_arcmatch_score();
02084 
02086         void
02087         test_arcmatch_score();
02088 
02095         void
02096         trace_EPMs(bool suboptimal);
02097     };
02098 
02099 } // end namespace
02100 
02101 #endif //  EXACT_MATCHER_HH