LocARNA-1.9.2
|
00001 #ifndef EXACT_MATCHER_HH 00002 #define EXACT_MATCHER_HH 00003 00004 #ifdef HAVE_CONFIG_H 00005 #include <config.h> 00006 #endif 00007 00008 #include <iostream> 00009 #include <sstream> 00010 #include <list> 00011 #include <algorithm> 00012 #include <limits> 00013 #include <iterator> 00014 #include "trace_controller.hh" 00015 #include "sparsification_mapper.hh" 00016 #include "tuples.hh" 00017 #include "scoring.hh" 00018 #include "ext_rna_data.hh" 00019 #include "aux.hh" 00020 00021 extern "C" { 00022 #include <ViennaRNA/fold_vars.h> 00023 #include <ViennaRNA/utils.h> 00024 #include <ViennaRNA/PS_dot.h> 00025 #include <ViennaRNA/fold.h> 00026 int 00027 PS_rna_plot(char *string, char *structure, char *file); 00028 int 00029 PS_rna_plot_a(char *string, char *structure, char *file, char *pre, char *post); 00030 float 00031 fold(const char *sequence, char *structure); 00032 } 00033 00034 namespace LocARNA { 00035 00036 typedef size_t size_type; 00037 typedef std::vector<unsigned int> intVec; 00038 typedef std::pair<unsigned int, unsigned int> intPair; 00039 typedef std::pair<intPair, intPair> intPPair; 00040 typedef const intPPair *intPPairPTR; 00041 typedef std::vector<intPPair>::const_iterator IntPPairCITER; 00042 00046 class SinglePattern { 00047 public: 00048 SinglePattern(){}; 00049 00056 SinglePattern(const std::string &myId_, 00057 const std::string &seqId_, 00058 const intVec &mySinglePattern_) 00059 : myId(myId_), seqId(seqId_), pattern(mySinglePattern_){}; 00060 00064 virtual ~SinglePattern() { pattern.clear(); }; 00065 00070 const std::string & 00071 getmyId() const { 00072 return myId; 00073 }; 00074 00079 const std::string & 00080 getseqId() const { 00081 return seqId; 00082 }; 00083 00088 const intVec & 00089 getPat() const { 00090 return pattern; 00091 }; 00092 00093 private: 00094 std::string myId; 00095 std::string seqId; 00096 intVec pattern; 00097 }; 00098 00103 class PatternPair { 00104 public: 00105 PatternPair(){}; 00106 00115 PatternPair(const std::string &myId, 00116 const SinglePattern &myFirstPat, 00117 const SinglePattern &mySecPat, 00118 const std::string &structure_, 00119 int &score_) 00120 : id(myId), 00121 first(myFirstPat), 00122 second(mySecPat), 00123 structure(structure_), 00124 EPMscore(score_) { 00125 if (first.getPat().size() != second.getPat().size()) { 00126 std::cerr << "Error! PatternPair cannot be constructed due to " 00127 "different sizes of SinglePatterns!" 00128 << std::endl; 00129 } 00130 score = EPMscore; 00131 size = first.getPat().size(); 00132 }; 00133 00135 virtual ~PatternPair() { insideBounds.clear(); }; 00136 00141 const std::string & 00142 getId() const { 00143 return id; 00144 }; 00145 00150 const int & 00151 getSize() const { 00152 return size; 00153 }; 00154 00159 const SinglePattern & 00160 getFirstPat() const { 00161 return first; 00162 }; 00163 00168 const SinglePattern & 00169 getSecPat() const { 00170 return second; 00171 }; 00172 00174 void 00175 resetBounds(); 00176 00181 void 00182 setOutsideBounds(intPPair myPPair); 00183 00188 const intPPair 00189 getOutsideBounds() const { 00190 return outsideBounds; 00191 }; 00192 00194 void 00195 addInsideBounds(intPPair myPPair); 00196 00201 const std::vector<intPPair> & 00202 getInsideBounds() const { 00203 return insideBounds; 00204 }; 00205 00210 void 00211 setEPMScore(int myScore); 00212 00217 const int 00218 getScore() const { 00219 return score; 00220 }; 00221 00226 const int 00227 getEPMScore() const { 00228 return EPMscore; 00229 }; 00230 00235 const std::string & 00236 get_struct() const { 00237 return structure; 00238 }; 00239 00240 private: 00241 std::string id; 00242 int size; 00243 SinglePattern first; 00244 SinglePattern second; 00245 00246 std::string structure; 00247 int score; 00248 int EPMscore; 00249 std::vector<intPPair> insideBounds; 00250 intPPair outsideBounds; 00251 }; 00252 00256 class PatternPairMap { 00257 public: 00258 typedef PatternPair selfValueTYPE; 00259 typedef PatternPair *SelfValuePTR; 00260 00261 typedef std::multimap<int, SelfValuePTR, std::greater<int> > 00262 orderedMapTYPE; 00263 typedef orderedMapTYPE::const_iterator 00264 orderedMapCITER; 00265 typedef orderedMapTYPE::iterator 00266 orderedMapITER; 00267 typedef std::list<SelfValuePTR> patListTYPE; 00268 typedef patListTYPE::iterator 00269 patListITER; 00270 typedef patListTYPE::const_iterator 00271 patListCITER; 00272 typedef unordered_map<std::string, SelfValuePTR>::type 00273 PatternIdMapTYPE; 00274 00276 PatternPairMap(); 00277 00280 PatternPairMap(const PatternPairMap &myPairMap) 00281 : patternList(myPairMap.patternList), 00282 patternOrderedMap(myPairMap.patternOrderedMap), 00283 idMap(myPairMap.idMap) { 00284 minPatternSize = 100000; 00285 }; 00286 00288 virtual ~PatternPairMap(); 00289 00299 void 00300 add(const std::string &id, 00301 const SinglePattern &first, 00302 const SinglePattern &second, 00303 const std::string &structure, 00304 int score); 00305 00310 void 00311 add(const SelfValuePTR value); 00312 00314 void 00315 makeOrderedMap(); 00316 00318 void 00319 updateFromMap(); 00320 00326 const PatternPair & 00327 getPatternPair(const std::string &id) const; 00328 00334 const SelfValuePTR 00335 getPatternPairPTR(const std::string &id) const; 00336 00341 const patListTYPE & 00342 getList() const; 00343 00348 const orderedMapTYPE & 00349 getOrderedMap() const; 00350 00355 orderedMapTYPE & 00356 getOrderedMap2(); 00357 00362 const int 00363 size() const; 00364 00369 int 00370 getMapBases(); 00371 00376 int 00377 getMapEPMScore(); 00378 00383 const int 00384 getMinPatternSize() const { 00385 return minPatternSize; 00386 }; 00387 00388 private: 00389 patListTYPE patternList; 00390 orderedMapTYPE patternOrderedMap; 00391 00392 PatternIdMapTYPE idMap; 00393 int minPatternSize; 00394 }; 00395 00402 std::ostream & 00403 operator<<(std::ostream &out, 00404 const PatternPairMap::patListTYPE &pat_pair_map); 00405 00409 class LCSEPM { 00410 public: 00418 LCSEPM(const Sequence &seqA_, 00419 const Sequence &seqB_, 00420 const PatternPairMap &myPatterns, 00421 PatternPairMap &myLCSEPM) 00422 00423 : seqA(seqA_), 00424 seqB(seqB_), 00425 matchedEPMs(myLCSEPM), 00426 patterns(myPatterns){}; 00427 00429 virtual ~LCSEPM(); 00430 00439 void 00440 MapToPS(const std::string &sequenceA, 00441 const std::string &sequenceB, 00442 PatternPairMap &myMap, 00443 const std::string &file1, 00444 const std::string &file2); 00445 00447 void 00448 calculateLCSEPM(bool quiet); 00449 00451 std::pair<SequenceAnnotation, SequenceAnnotation> 00452 anchor_annotation(); 00453 00455 void 00456 output_locarna(const std::string &sequenceA, 00457 const std::string &sequenceB, 00458 const std::string &outfile); 00459 00461 void 00462 output_clustal(const std::string &outfile_name); 00463 00464 private: 00465 struct HoleCompare2 { 00466 bool 00467 operator()(const intPPairPTR &h1, const intPPairPTR &h2) const { 00468 // first compare size of holes 00469 if (h1->first.second - h1->first.first - 1 < 00470 h2->first.second - h2->first.first - 1) { 00471 return true; 00472 } 00473 // compare if holes are identical in both structures 00474 if (h1->first.second - h1->first.first - 1 == 00475 h2->first.second - h2->first.first - 1) { 00476 if ((h1->first.first == h2->first.first) && 00477 (h1->first.second == h2->first.second) && 00478 (h1->second.first == h2->second.first) && 00479 (h1->second.second == h2->second.second)) { 00480 return true; 00481 } 00482 } 00483 00484 return false; 00485 } 00486 }; 00487 00488 typedef std::multimap<intPPairPTR, 00489 PatternPairMap::SelfValuePTR, 00490 HoleCompare2> 00491 HoleOrderingMapTYPE2; 00492 typedef HoleOrderingMapTYPE2::const_iterator HoleMapCITER2; 00493 00494 void 00495 preProcessing(); 00496 void 00497 calculateHoles3(bool quiet); 00498 void 00499 calculatePatternBoundaries(PatternPair *myPair); 00500 void 00501 calculateTraceback2(const int i, 00502 const int j, 00503 const int k, 00504 const int l, 00505 std::vector<std::vector<int> > holeVec); 00506 int 00507 D_rec2(const int &i, 00508 const int &j, 00509 const int &k, 00510 const int &l, 00511 std::vector<std::vector<int> > &D_h, 00512 const bool debug); 00513 00514 int 00515 max3(int a, int b, int c) { 00516 int tmp = a > b ? a : b; 00517 return (tmp > c ? tmp : c); 00518 }; 00519 00521 char * 00522 getStructure(PatternPairMap &myMap, bool firstSeq, int length); 00523 00524 std::string 00525 intvec2str(const std::vector<unsigned int> &V, 00526 const std::string &delim) { 00527 std::stringstream oss; 00528 copy(V.begin(), V.end(), 00529 std::ostream_iterator<unsigned int>(oss, delim.c_str())); 00530 std::string tmpstr; 00531 tmpstr = oss.str(); 00532 if (tmpstr.length() > 0) 00533 tmpstr.erase(tmpstr.end() - 1); 00534 return tmpstr; 00535 } 00536 00537 std::string 00538 upperCase(const std::string &seq) { 00539 std::string s = ""; 00540 for (unsigned int i = 0; i < seq.length(); i++) 00541 s += toupper(seq[i]); 00542 return s; 00543 } 00544 00545 std::vector<std::vector<std::vector<PatternPairMap::SelfValuePTR> > > 00546 EPM_Table2; 00547 HoleOrderingMapTYPE2 holeOrdering2; 00548 const Sequence &seqA; 00549 const Sequence &seqB; 00550 PatternPairMap &matchedEPMs; 00551 const PatternPairMap &patterns; 00552 }; 00553 00557 class SparseTraceController : public TraceController { 00558 private: 00559 typedef SparsificationMapper::matidx_t 00560 matidx_t; 00561 typedef SparsificationMapper::seq_pos_t 00562 seqpos_t; 00563 typedef SparsificationMapper::index_t 00564 index_t; 00565 00566 public: 00567 typedef std::pair<matidx_t, matidx_t> 00568 matpos_t; 00569 typedef std::pair<seqpos_t, seqpos_t> 00570 pair_seqpos_t; 00571 00572 private: 00573 const SparsificationMapper 00574 &sparse_mapperA; 00575 const SparsificationMapper 00576 &sparse_mapperB; 00577 00578 public: 00586 SparseTraceController(const SparsificationMapper &sparse_mapperA_, 00587 const SparsificationMapper &sparse_mapperB_, 00588 const TraceController &trace_controller_) 00589 : TraceController::TraceController(trace_controller_), 00590 sparse_mapperA(sparse_mapperA_), 00591 sparse_mapperB(sparse_mapperB_) 00592 00593 {} 00594 00595 virtual ~SparseTraceController(){}; 00596 00598 const SparsificationMapper & 00599 get_sparse_mapperA() const { 00600 return sparse_mapperA; 00601 } 00602 00604 const SparsificationMapper & 00605 get_sparse_mapperB() const { 00606 return sparse_mapperB; 00607 } 00608 00620 matidx_t 00621 min_col_idx( 00622 index_t indexA, 00623 index_t indexB, 00624 matidx_t idx_i, 00625 index_t left_endB = std::numeric_limits<index_t>::max()) const { 00626 seqpos_t i = sparse_mapperA.get_pos_in_seq_new(indexA, idx_i); 00627 return sparse_mapperB.idx_geq(indexB, min_col(i), left_endB); 00628 } 00629 00642 matidx_t 00643 idx_after_max_col_idx( 00644 index_t indexA, 00645 index_t indexB, 00646 matidx_t idx_i, 00647 index_t left_endB = std::numeric_limits<index_t>::max()) const { 00648 seqpos_t i = sparse_mapperA.get_pos_in_seq_new(indexA, idx_i); 00649 return sparse_mapperB.idx_after_leq(indexB, max_col(i), left_endB); 00650 } 00651 00671 matpos_t 00672 diag_pos_bef( 00673 index_t indexA, 00674 index_t indexB, 00675 pair_seqpos_t cur_pos_seq, 00676 index_t left_endA = std::numeric_limits<index_t>::max(), 00677 index_t left_endB = std::numeric_limits<index_t>::max()) const { 00678 bool debug_valid_mat_pos = false; 00679 00680 if (debug_valid_mat_pos) 00681 std::cout << "first valid mat pos before with tc " << std::endl; 00682 00683 seqpos_t i = cur_pos_seq.first; 00684 seqpos_t j = cur_pos_seq.second; 00685 00686 matidx_t idx_after_max_col; 00687 00688 // find valid matrix position based on the SparsificationMapper 00689 matidx_t cur_row = 00690 sparse_mapperA.first_valid_mat_pos_before(indexA, i, left_endA); 00691 matidx_t col_before = 00692 sparse_mapperB.first_valid_mat_pos_before(indexB, j, left_endB); 00693 00694 // bool valid_pos_found = false; 00695 00696 // find a valid position that is valid also based on the 00697 // TraceController 00698 // go through the rows and find an interval that includes the column 00699 // col_before or lies 00700 // before the column col_before 00701 for (;; --cur_row) { 00702 matidx_t min_col = 00703 min_col_idx(indexA, indexB, cur_row, left_endB); 00704 idx_after_max_col = 00705 idx_after_max_col_idx(indexA, indexB, cur_row, left_endB); 00706 00707 if (debug_valid_mat_pos) 00708 std::cout << "interval " << min_col << "," 00709 << idx_after_max_col << std::endl; 00710 00711 // valid interval found 00712 if (min_col < idx_after_max_col && min_col <= col_before) { 00713 // valid_pos_found=true; 00714 break; 00715 } 00716 00717 if (cur_row == 0) { 00718 break; 00719 } 00720 } 00721 00722 // assert(valid_pos_found); 00723 assert(idx_after_max_col > 0); 00724 00725 matidx_t max_col = idx_after_max_col - 1; 00726 00727 // the column of the new position is the col_before or lies before 00728 // it 00729 matpos_t result = matpos_t(cur_row, std::min(max_col, col_before)); 00730 00731 assert(is_valid_idx_pos(indexA, indexB, result)); 00732 00733 return result; 00734 } 00735 00744 pair_seqpos_t 00745 pos_in_seq(index_t idxA, 00746 index_t idxB, // const Arc &a, const Arc &b, 00747 const matpos_t &cur_pos) const { 00748 return pair_seqpos_t( 00749 sparse_mapperA.get_pos_in_seq_new(idxA, cur_pos.first), 00750 sparse_mapperB.get_pos_in_seq_new(idxB, cur_pos.second)); 00751 } 00752 00766 bool 00767 matching_wo_gap(index_t idxA, 00768 index_t idxB, 00769 const matpos_t &idx_pos_diag, 00770 pair_seqpos_t seq_pos_to_be_matched) const { 00771 pair_seqpos_t pos_diag = pos_in_seq(idxA, idxB, idx_pos_diag); 00772 return (pos_diag.first + 1 == seq_pos_to_be_matched.first) && 00773 (pos_diag.second + 1 == seq_pos_to_be_matched.second); 00774 } 00775 00783 bool 00784 pos_unpaired(index_t idxA, index_t idxB, matpos_t pos) const { 00785 return sparse_mapperA.pos_unpaired(idxA, pos.first) && 00786 sparse_mapperB.pos_unpaired(idxB, pos.second); 00787 } 00788 00796 bool 00797 is_valid_idx_pos(index_t idxA, index_t idxB, matpos_t mat_pos) const { 00798 pair_seqpos_t seq_pos = pos_in_seq(idxA, idxB, mat_pos); 00799 return is_valid(seq_pos.first, seq_pos.second); 00800 } 00801 }; 00802 00806 class EPM { 00807 public: 00808 typedef BasePairs__Arc Arc; 00809 00810 typedef SparsificationMapper::seq_pos_t 00811 seqpos_t; 00812 typedef SparseTraceController::matpos_t 00813 matpos_t; 00814 typedef SparsificationMapper::ArcIdx ArcIdx; 00815 typedef SparseTraceController::pair_seqpos_t 00816 pair_seqpos_t; 00817 typedef std::pair<ArcIdx, ArcIdx> PairArcIdx; 00818 typedef std::vector<PairArcIdx> 00819 PairArcIdxVec; 00820 00823 typedef triple<seqpos_t, seqpos_t, char> el_pat_vec; 00824 00825 typedef std::vector<el_pat_vec> pat_vec_t; 00826 00827 private: 00828 pat_vec_t pat_vec; 00829 00830 score_t score; 00831 int state; 00832 00833 matpos_t cur_pos; 00834 00835 score_t max_tol_left; 00836 00837 bool first_insertion; 00838 00839 bool invalid; 00840 00841 00842 00843 00844 PairArcIdxVec am_to_do; 00845 00846 00848 struct compare_el_pat_vec { 00849 public: 00850 bool 00851 operator()(const EPM::el_pat_vec &el1, 00852 const EPM::el_pat_vec &el2) const { 00853 seqpos_t el1_pos1 = el1.first; 00854 seqpos_t el1_pos2 = el1.second; 00855 seqpos_t el2_pos1 = el2.first; 00856 seqpos_t el2_pos2 = el2.second; 00857 char el1_struc = el1.third; 00858 char el2_struc = el2.third; 00859 return (el1_pos1 < el2_pos1) || 00860 (el1_pos1 == el2_pos1 && el1_pos2 < el2_pos2) || 00861 (el1_pos1 == el2_pos1 && el1_pos2 == el2_pos2 && 00862 el1_struc < el2_struc); 00863 } 00864 }; 00865 00866 struct compare_el_am_to_do { 00867 public: 00868 bool 00869 operator()(const EPM::PairArcIdx &el1, 00870 const EPM::PairArcIdx &el2) const { 00871 return (el1.first < el2.first) || 00872 ((el1.first == el2.first) && el1.second < el2.second); 00873 } 00874 }; 00875 00876 public: 00878 EPM() 00879 : score(0), 00880 state(0), 00881 cur_pos(matpos_t(0, 0)), 00882 max_tol_left(0), 00883 first_insertion(true), 00884 invalid(false) {} 00885 00886 virtual ~EPM() {} 00887 00888 //----------------------------------------------------------------------- 00889 // getter methods 00890 //----------------------------------------------------------------------- 00891 00893 score_t 00894 get_score() const { 00895 return score; 00896 } 00897 00899 int 00900 get_state() const { 00901 return state; 00902 } 00903 00905 const matpos_t & 00906 get_cur_pos() const { 00907 return cur_pos; 00908 } 00909 00911 const score_t & 00912 get_max_tol_left() const { 00913 return max_tol_left; 00914 } 00915 00917 bool 00918 get_first_insertion() const { 00919 return first_insertion; 00920 } 00921 00925 bool 00926 is_invalid() const { 00927 return invalid; 00928 } 00929 00930 //----------------------------------------------------------------------- 00931 // setter methods 00932 //----------------------------------------------------------------------- 00933 00938 void 00939 set_score(score_t score_) { 00940 score = score_; 00941 } 00942 00947 void 00948 set_state(int state_) { 00949 state = state_; 00950 } 00951 00956 void 00957 set_cur_pos(const matpos_t &cur_pos_) { 00958 cur_pos = cur_pos_; 00959 } 00960 00965 void 00966 set_max_tol_left(score_t tol) { 00967 max_tol_left = tol; 00968 } 00969 00974 void 00975 set_first_insertion(bool first_insertion_) { 00976 first_insertion = first_insertion_; 00977 } 00978 00980 void 00981 set_invalid() { 00982 invalid = true; 00983 } 00984 00991 const PairArcIdx & 00992 get_am(PairArcIdxVec::size_type idx) const { 00993 assert(idx < am_to_do.size()); 00994 return am_to_do[idx]; 00995 } 00996 01001 PairArcIdxVec::size_type 01002 number_of_am() { 01003 return am_to_do.size(); 01004 } 01005 01007 void 01008 clear_am_to_do() { 01009 am_to_do.clear(); 01010 } 01011 01017 PairArcIdxVec::const_iterator 01018 am_begin() const { 01019 return am_to_do.begin(); 01020 } 01021 01027 PairArcIdxVec::const_iterator 01028 am_end() const { 01029 return am_to_do.end(); 01030 } 01031 01037 el_pat_vec 01038 pat_vec_at(pat_vec_t::size_type idx) const { 01039 assert(idx < pat_vec.size()); 01040 return pat_vec[idx]; 01041 } 01042 01047 pat_vec_t::size_type 01048 pat_vec_size() const { 01049 return pat_vec.size(); 01050 } 01051 01056 pat_vec_t::const_iterator 01057 begin() const { 01058 return pat_vec.begin(); 01059 } 01060 01065 pat_vec_t::const_iterator 01066 end() const { 01067 return pat_vec.end(); 01068 } 01069 01074 pair_seqpos_t 01075 last_matched_pos() { 01076 assert(!pat_vec.empty()); 01077 return pair_seqpos_t(pat_vec.back().first, pat_vec.back().second); 01078 } 01079 01089 void 01090 add(seqpos_t posA, seqpos_t posB, char c) { 01091 pat_vec.push_back(el_pat_vec(posA, posB, c)); 01092 } 01093 01104 void 01105 overwrite(seqpos_t posA, 01106 seqpos_t posB, 01107 char c, 01108 pat_vec_t::size_type pos) { 01109 if (pat_vec.size() <= pos) { 01110 pat_vec.push_back(el_pat_vec(posA, posB, c)); 01111 } 01112 pat_vec.at(pos) = el_pat_vec(posA, posB, c); 01113 } 01114 01120 void 01121 add_am(const Arc &a, const Arc &b) { 01122 add(a.right(), b.right(), ')'); 01123 add(a.left(), b.left(), '('); 01124 } 01125 01131 void 01132 store_am(const Arc &a, const Arc &b) { 01133 const PairArcIdx &pair_arc_idx = PairArcIdx(a.idx(), b.idx()); 01134 // store the pair of arc indices in the am_to_do datastructure 01135 am_to_do.push_back(pair_arc_idx); 01136 } 01137 01142 PairArcIdx 01143 next_arcmatch() { 01144 PairArcIdx arc_idx = am_to_do.back(); 01145 am_to_do.pop_back(); 01146 return arc_idx; 01147 } 01148 01155 void 01156 sort_patVec() { 01157 sort(pat_vec.begin(), pat_vec.end(), compare_el_pat_vec()); 01158 } 01159 01163 void 01164 sort_am_to_do() { 01165 sort(am_to_do.begin(), am_to_do.end(), compare_el_am_to_do()); 01166 } 01167 01173 void 01174 insert_epm(const EPM &epm_to_insert) { 01175 pat_vec.insert(pat_vec.end(), epm_to_insert.begin(), 01176 epm_to_insert.end()); 01177 } 01178 01185 bool 01186 includes(const EPM &epm_to_test) const { 01187 assert(pat_vec_size() >= epm_to_test.pat_vec_size()); 01188 return std::includes(this->begin(), this->end(), 01189 epm_to_test.begin(), epm_to_test.end(), 01190 compare_el_pat_vec()); 01191 } 01192 01201 bool 01202 includes_am(const EPM &epm_to_test) const { 01203 return std::includes(am_begin(), am_end(), epm_to_test.am_begin(), 01204 epm_to_test.am_end(), compare_el_am_to_do()); 01205 } 01206 01212 void 01213 print_epm(std::ostream &out, bool verbose) const { 01214 out << "_________________________________________________" 01215 << std::endl; 01216 out << "epm with score " << this->score << std::endl; 01217 out << " "; 01218 for (pat_vec_t::const_iterator it = pat_vec.begin(); 01219 it != pat_vec.end(); ++it) { 01220 out << it->first << ":" << it->second << " "; 01221 } 01222 out << std::endl; 01223 out << " "; 01224 for (pat_vec_t::const_iterator it = pat_vec.begin(); 01225 it != pat_vec.end(); ++it) { 01226 out << it->third; 01227 } 01228 out << std::endl; 01229 out << "am_to_do " << am_to_do << std::endl; 01230 out << "tolerance left " << this->max_tol_left << std::endl; 01231 if (verbose) { 01232 out << "score " << score << std::endl; 01233 out << "pos " << this->cur_pos.first << "," 01234 << this->cur_pos.second << std::endl; 01235 out << "state " << this->state << std::endl; 01236 } 01237 out << "______________________________________________________" 01238 << std::endl; 01239 } 01240 }; 01241 01250 inline bool 01251 operator<(const EPM &epm1, const EPM &epm2) { 01252 return epm1.get_max_tol_left() > epm2.get_max_tol_left(); 01253 } 01254 01261 inline std::ostream & 01262 operator<<(std::ostream &out, const EPM &epm) { 01263 epm.print_epm(out, false); 01264 return out; 01265 } 01266 01274 template <class T1> 01275 T1 01276 max3(const T1 &first, const T1 &second, const T1 &third) { 01277 return max(max(first, second), third); 01278 } 01279 01288 template <class T1> 01289 T1 01290 max4(const T1 &first, const T1 &second, const T1 &third, const T1 &fourth) { 01291 return max(max3(first, second, third), fourth); 01292 } 01293 01309 class ExactMatcher { 01310 typedef BasePairs__Arc Arc; 01311 01312 typedef SparsificationMapper::ArcIdx ArcIdx; 01313 typedef SparsificationMapper::ArcIdxVec 01314 ArcIdxVec; 01315 typedef SparsificationMapper::matidx_t 01316 matidx_t; 01317 typedef SparsificationMapper::seq_pos_t 01318 seqpos_t; 01319 typedef SparsificationMapper::index_t index_t; 01320 01321 01322 01323 typedef SparseTraceController::matpos_t 01324 matpos_t; 01325 typedef SparseTraceController::pair_seqpos_t 01326 pair_seqpos_t; 01327 01328 typedef EPM::PairArcIdx PairArcIdx; 01329 typedef EPM::PairArcIdxVec 01330 PairArcIdxVec; 01331 01332 typedef std::list<EPM> 01333 epm_cont_t; 01334 typedef epm_cont_t::iterator epm_it_t; 01335 typedef std::pair<score_t, epm_cont_t> el_map_am_to_do_t; 01336 01337 01338 01339 01340 01341 01344 typedef unordered_map<PairArcIdx, 01345 el_map_am_to_do_t, 01346 pair_of_size_t_hash>::type map_am_to_do_t; 01347 01348 private: 01352 //<state, max_tol, current matrix position, potential arcMatch, sequence 01353 //position to be matched> 01354 typedef quintuple<int, 01355 infty_score_t, 01356 matpos_t, 01357 PairArcIdx, 01358 pair_seqpos_t> 01359 poss_L_LR; 01360 01361 // infty_score_t because of the check_poss, change to score_t!!! 01364 typedef triple<int, infty_score_t, matpos_t> 01365 poss_in_G; //<state,max_tol,current matrix position> 01366 01367 const Sequence &seqA; 01368 const Sequence &seqB; 01369 01370 const RnaData &rna_dataA; 01371 01372 const RnaData &rna_dataB; 01373 01374 const ArcMatches 01375 &arc_matches; 01376 01377 const BasePairs &bpsA; 01378 const BasePairs &bpsB; 01379 const SparseTraceController 01380 &sparse_trace_controller; 01381 01382 // (valid positions in the matrix) 01383 const SparsificationMapper 01384 &sparse_mapperA; 01385 const SparsificationMapper 01386 &sparse_mapperB; 01387 PatternPairMap &foundEPMs; 01388 01389 01390 01391 ScoreMatrix L; 01392 ScoreMatrix G_A; 01393 01394 01395 01396 ScoreMatrix G_AB; 01397 01398 ScoreMatrix 01399 LR; 01400 ScoreMatrix F; 01401 ScoreMatrix Dmat; 01402 01403 01404 int alpha_1; 01405 int alpha_2; 01406 int alpha_3; 01407 01408 int difference_to_opt_score; 01409 01410 01411 // worse than the optimal score are considered 01412 int min_score; 01413 long int max_number_of_EPMs; 01414 01415 long int cur_number_of_EPMs; 01416 01417 01418 bool inexact_struct_match; 01419 01420 score_t struct_mismatch_score; 01421 01422 bool add_filter; 01423 01424 01425 bool verbose; 01426 01427 pair_seqpos_t pos_of_max; 01428 01429 enum { 01430 in_LR, 01431 in_G_A, 01432 in_G_AB, 01433 in_L, 01434 in_F 01435 }; 01436 01437 const Arc pseudo_arcA; 01438 const Arc pseudo_arcB; 01439 01447 const infty_score_t & 01448 D(const ArcMatch &am) const { 01449 return D(am.arcA(), am.arcB()); 01450 } 01451 01459 infty_score_t & 01460 D(const ArcMatch &am) { 01461 return D(am.arcA(), am.arcB()); 01462 } 01463 01472 const infty_score_t & 01473 D(const Arc &a, const Arc &b) const { 01474 return Dmat(a.idx(), b.idx()); 01475 } 01476 01485 infty_score_t & 01486 D(const Arc &a, const Arc &b) { 01487 return Dmat(a.idx(), b.idx()); 01488 } 01489 01490 bool 01491 nucleotide_match(seqpos_t pos_seqA, seqpos_t pos_seqB) const { 01492 assert(pos_seqA >= 1 && pos_seqA <= seqA.length() && 01493 pos_seqB >= 1 && 01494 pos_seqB <= seqB.length()); // seqA and seqB are 1-based 01495 return (seqA[pos_seqA] == seqB[pos_seqB]); 01496 } 01497 01498 bool 01499 seq_matching(ArcIdx idxA, 01500 ArcIdx idxB, 01501 matpos_t cur_mat_pos, 01502 pair_seqpos_t cur_seq_pos) const { 01503 seqpos_t i = cur_seq_pos.first; 01504 seqpos_t j = cur_seq_pos.second; 01505 01506 return sparse_trace_controller.pos_unpaired(idxA, idxB, 01507 cur_mat_pos) && 01508 nucleotide_match(i, j); 01509 } 01510 01511 // ---------------------------------------- 01512 // fill matrices 01513 01519 void 01520 initialize_gap_matrices(); 01521 01523 void 01524 init_Fmat(); 01525 01537 void 01538 init_mat(ScoreMatrix &mat, 01539 const Arc &a, 01540 const Arc &b, 01541 infty_score_t first_entry, 01542 infty_score_t first_col, 01543 infty_score_t first_row); 01544 01556 pair_seqpos_t 01557 compute_LGLR(const Arc &a, const Arc &b, bool suboptimal); 01558 01573 infty_score_t 01574 compute_matrix_entry(const Arc &a, 01575 const Arc &b, 01576 matpos_t mat_pos, 01577 matpos_t mat_pos_diag, 01578 bool matrixLR, 01579 bool suboptimal); 01580 01599 infty_score_t 01600 seq_str_matching(const Arc &a, 01601 const Arc &b, 01602 matpos_t mat_pos_diag, 01603 pair_seqpos_t seq_pos_to_be_matched, 01604 score_t add_score, 01605 bool matrixLR, 01606 bool suboptimal); 01607 01609 void 01610 compute_F(); 01611 01612 // -------------------------------------------- 01613 // helper functions 01614 01620 score_t 01621 score_for_seq_match(); 01622 01631 infty_score_t 01632 score_for_am(const Arc &a, const Arc &b) const; 01633 01646 score_t 01647 score_for_stacking(const Arc &a, 01648 const Arc &b, 01649 const Arc &inner_a, 01650 const Arc &inner_b); 01651 01659 void 01660 add_foundEPM(EPM &cur_epm, bool count_EPMs); 01661 01662 bool 01663 check_PPM() { 01664 if (this->difference_to_opt_score != -1) 01665 return true; // when we use the parameter 01666 // difference_to_opt_score, 01667 // we enumerate all EPMs regardless of whether the number extends 01668 // the max_number_of_EPMs 01669 if (cur_number_of_EPMs >= max_number_of_EPMs + 1) 01670 return false; 01671 else 01672 return true; 01673 } 01674 01685 void 01686 find_start_pos_for_tb(bool suboptimal, 01687 score_t difference_to_opt_score = -1, 01688 bool count_EPMs = false); 01689 01690 bool 01691 check_num_EPMs() { 01692 double valid_deviation = 0.8; 01693 return (cur_number_of_EPMs >= 01694 max_number_of_EPMs * valid_deviation && 01695 cur_number_of_EPMs <= max_number_of_EPMs); 01696 } 01697 01698 // -------------------------------------------- 01699 // heuristic traceback 01700 01709 void 01710 trace_F_heuristic(pos_type i, pos_type j, EPM &cur_epm); 01711 01720 void 01721 trace_LGLR_heuristic(const Arc &a, const Arc &b, EPM &cur_epm); 01722 01741 bool 01742 trace_seq_str_matching_heuristic(const Arc &a, 01743 const Arc &b, 01744 int &state, 01745 matpos_t &cur_mat_pos, 01746 matpos_t mat_pos_diag, 01747 pair_seqpos_t seq_pos_to_be_matched, 01748 score_t add_score); 01749 01750 // -------------------------------------------- 01751 // suboptimal traceback 01752 01764 void 01765 trace_F_suboptimal(pos_type i, 01766 pos_type j, 01767 score_t max_tol, 01768 bool recurse, 01769 bool count_EPMs); 01770 01780 void 01781 apply_filter(epm_cont_t &found_epms); 01782 01795 void 01796 trace_LGLR_suboptimal(const Arc &a, 01797 const Arc &b, 01798 score_t max_tol, 01799 epm_cont_t &found_epms, 01800 bool recurse, 01801 bool count_EPMs); 01802 01828 void 01829 trace_seq_str_matching_subopt(const Arc &a, 01830 const Arc &b, 01831 score_t score_contr, 01832 matpos_t mat_pos_diag, 01833 pair_seqpos_t seq_pos_to_be_matched, 01834 const PairArcIdx &am, 01835 poss_L_LR &poss, 01836 epm_it_t cur_epm, 01837 epm_cont_t &found_epms, 01838 map_am_to_do_t &map_am_to_do, 01839 bool count_EPMs); 01840 01860 bool 01861 check_poss(const Arc &a, 01862 const Arc &b, 01863 const poss_L_LR &pot_new_poss, 01864 poss_L_LR &poss, 01865 epm_it_t cur_epm, 01866 epm_cont_t &found_epms, 01867 map_am_to_do_t &am_to_do_for_cur_am, 01868 bool count_EPMs); 01869 01890 void 01891 store_new_poss(const Arc &a, 01892 const Arc &b, 01893 bool last_poss, 01894 const poss_L_LR &new_poss, 01895 poss_L_LR &poss, 01896 epm_it_t cur_epm, 01897 epm_cont_t &found_epms, 01898 map_am_to_do_t &am_to_do_for_cur_am, 01899 bool count_EPMs); 01900 01919 void 01920 trace_G_suboptimal(const Arc &a, 01921 const Arc &b, 01922 const poss_L_LR &pot_new_poss, 01923 poss_L_LR &poss, 01924 epm_it_t cur_epm, 01925 epm_cont_t &found_epms, 01926 map_am_to_do_t &map_am_to_do, 01927 bool count_EPMs); 01928 01941 bool 01942 is_valid_gap(const Arc &a, const Arc &b, const poss_L_LR &pot_new_poss); 01943 01964 void 01965 preproc_fill_epm(map_am_to_do_t &am_to_do, 01966 epm_it_t cur_epm, 01967 epm_cont_t &found_epms, 01968 bool count_EPMs, 01969 score_t min_allowed_score = -1); 01970 01998 void 01999 fill_epm(const map_am_to_do_t &map_am_to_do, 02000 size_type vec_idx, 02001 std::vector<score_t> &max_tol_left_up_to_pos, 02002 std::vector<const EPM *> &epms_to_insert, 02003 score_t min_score, 02004 epm_it_t cur_epm, 02005 epm_cont_t &found_epms, 02006 bool count_EPMs); 02007 02008 // -------------------------------------------- 02009 // debugging/testing 02010 02011 // print the matrices in the condensed form 02012 void 02013 print_matrices(const Arc &a, 02014 const Arc &b, 02015 size_type offset_A, 02016 size_type offset_B, 02017 bool suboptimal, 02018 bool add_info); 02019 02020 // checks whether an epm is valid, i.e. only one gap per arc match etc. 02021 bool 02022 validate_epm(const EPM &epm_to_test) const; 02023 02024 // checks the validity of the epm list, i.e. that no epm is contained in 02025 // another epm (all 02026 // epms are maximally extended) 02027 bool 02028 validate_epm_list(epm_cont_t &found_epms) const; 02029 02030 public: 02060 ExactMatcher(const Sequence &seqA_, 02061 const Sequence &seqB_, 02062 const RnaData &rna_dataA_, 02063 const RnaData &rna_dataB_, 02064 const ArcMatches &arc_matches_, 02065 const SparseTraceController &sparse_trace_controller_, 02066 PatternPairMap &foundEPMs_, 02067 int alpha_1_, 02068 int alpha_2_, 02069 int alpha_3_, 02070 score_t difference_to_opt_score_, 02071 score_t min_score_, 02072 long int max_number_of_EPMs_, 02073 bool inexact_struct_match_, 02074 score_t struct_mismatch_score_, 02075 bool apply_filter_, 02076 bool verbose_); 02077 02078 ~ExactMatcher(); 02079 02082 void 02083 compute_arcmatch_score(); 02084 02086 void 02087 test_arcmatch_score(); 02088 02095 void 02096 trace_EPMs(bool suboptimal); 02097 }; 02098 02099 } // end namespace 02100 02101 #endif // EXACT_MATCHER_HH