LocARNA-1.9.2
|
00001 #ifndef SPARSIFICATION_MAPPER_HH 00002 #define SPARSIFICATION_MAPPER_HH 00003 00004 #ifdef HAVE_CONFIG_H 00005 #include <config.h> 00006 #endif 00007 00008 #include <iostream> 00009 #include <limits> 00010 00011 #include "aux.hh" 00012 #include "basepairs.hh" 00013 #include "ext_rna_data.hh" 00014 00015 // use for type safe index_t 00016 // #include "type_wrapper.hh" 00017 00018 namespace LocARNA { 00019 00020 // print a pair 00021 template <class T1, class T2> 00022 std::ostream & 00023 operator<<(std::ostream &out, const std::pair<T1, T2> &pair) { 00024 return out << "(" << pair.first << "," << pair.second << ") "; 00025 } 00026 00033 class SparsificationMapper { 00034 public: 00035 typedef BasePairs__Arc Arc; 00036 typedef size_t ArcIdx; 00037 typedef std::vector<ArcIdx> ArcIdxVec; 00038 typedef pos_type matidx_t; 00039 typedef pos_type seq_pos_t; 00040 00041 // note: the type safe index_t breaks current code, since 00042 // casts to size_t need to be explicite (using index_t's val()-method) 00043 // //! type-safe index type this is useful to distinguish index type 00044 // //! from other types that are defined as unsigned int 00045 // typedef type_wrapper<size_t> index_t; 00046 00047 typedef size_t index_t; 00048 00051 struct info_for_pos { 00052 seq_pos_t seq_pos; 00053 bool 00054 unpaired; 00055 ArcIdxVec valid_arcs; 00056 00057 00059 void 00060 reset() { 00061 this->seq_pos = 0; 00062 this->valid_arcs.clear(); 00063 this->unpaired = false; 00064 } 00065 }; 00066 00067 typedef std::vector<info_for_pos> InfoForPosVec; 00068 00069 00070 00071 00072 00073 00074 private: 00075 const BasePairs &bps; 00076 const ExtRnaData &rnadata; 00077 const double prob_unpaired_in_loop_threshold; 00078 00079 00080 const double prob_basepair_in_loop_threshold; 00081 00082 size_type max_info_vec_size; 00083 00084 00085 00086 std::vector<InfoForPosVec> info_valid_seq_pos_vecs; 00087 00091 std::vector<std::vector<matidx_t> > valid_mat_pos_vecs_before_eq; 00092 00096 std::vector<std::vector<ArcIdxVec> > left_adj_vec; 00097 00100 void 00101 compute_mapping_idx_arcs(); 00102 00105 void 00106 compute_mapping_idx_left_ends(); 00107 00112 void 00113 iterate_left_adj_list(pos_type cur_left_end, 00114 pos_type cur_pos, 00115 const Arc *inner_arc, 00116 info_for_pos &struct_pos); 00117 00118 void 00119 valid_pos_external(pos_type cur_pos, 00120 const Arc *inner_arc, 00121 info_for_pos &struct_pos); 00122 00123 public: 00138 SparsificationMapper(const BasePairs &bps_, 00139 const ExtRnaData &rnadata_, 00140 double prob_unpaired_in_loop_threshold_, 00141 double prob_basepair_in_loop_threshold_, 00142 bool index_left_ends) 00143 : bps(bps_), 00144 rnadata(rnadata_), 00145 prob_unpaired_in_loop_threshold(prob_unpaired_in_loop_threshold_), 00146 prob_basepair_in_loop_threshold(prob_basepair_in_loop_threshold_), 00147 max_info_vec_size(0) { 00148 if (index_left_ends) { 00149 compute_mapping_idx_left_ends(); 00150 } else { 00151 compute_mapping_idx_arcs(); 00152 } 00153 } 00154 00158 size_type 00159 get_max_info_vec_size() const { 00160 return max_info_vec_size; 00161 } 00162 00169 const InfoForPosVec & 00170 valid_seq_positions(index_t idx) const { 00171 return info_valid_seq_pos_vecs.at(idx); 00172 } 00173 00180 const ArcIdxVec & 00181 valid_arcs_right_adj(index_t idx, matidx_t pos) const { 00182 return info_valid_seq_pos_vecs.at(idx).at(pos).valid_arcs; 00183 } 00184 00196 matidx_t 00197 first_valid_mat_pos_before_eq( 00198 index_t index, 00199 seq_pos_t pos, 00200 index_t left_end = std::numeric_limits<index_t>::max()) const { 00201 if (left_end == std::numeric_limits<index_t>::max()) 00202 left_end = index; 00203 assert(pos >= left_end); // tocheck 00204 return valid_mat_pos_vecs_before_eq.at(index).at(pos - left_end); 00205 } 00206 00216 inline matidx_t 00217 first_valid_mat_pos_before( 00218 index_t index, 00219 seq_pos_t pos, 00220 index_t left_end = std::numeric_limits<index_t>::max()) const { 00221 // if (left_end == std::numeric_limits<index_t>::max()) assert (pos 00222 // > index); 00223 return first_valid_mat_pos_before_eq(index, pos - 1, left_end); 00224 } 00225 00233 inline seq_pos_t 00234 get_pos_in_seq_new(index_t idx, matidx_t pos) const { 00235 assert(pos < number_of_valid_mat_pos(idx)); 00236 return ( 00237 info_valid_seq_pos_vecs.at(idx).at(pos).seq_pos); //+arc.left(); 00238 } 00239 00245 size_type 00246 number_of_valid_mat_pos(index_t idx) const { 00247 return info_valid_seq_pos_vecs.at(idx).size(); 00248 } 00249 00257 bool 00258 pos_unpaired(index_t idx, matidx_t pos) const { 00259 return info_valid_seq_pos_vecs.at(idx).at(pos).unpaired; 00260 } 00261 00274 bool 00275 matching_without_gap_possible(const Arc &arc, seq_pos_t pos) const { 00276 const pos_type &mat_pos = 00277 first_valid_mat_pos_before(arc.idx(), pos, arc.left()); 00278 return get_pos_in_seq_new(arc.idx(), mat_pos) == pos - 1; 00279 } 00280 00288 const ArcIdxVec & 00289 valid_arcs_left_adj(const Arc &arc, seq_pos_t pos) const { 00290 return left_adj_vec.at(arc.idx()).at(pos - arc.left()); 00291 } 00292 00294 virtual ~SparsificationMapper() {} 00295 00307 matidx_t 00308 idx_geq(index_t index, 00309 seq_pos_t min_col, 00310 index_t left_end = std::numeric_limits<index_t>::max()) const { 00311 if (left_end == std::numeric_limits<index_t>::max()) 00312 left_end = index; 00313 00314 size_t num_pos = number_of_valid_mat_pos(index); 00315 seq_pos_t last_mapped_seq_pos = 00316 get_pos_in_seq_new(index, num_pos - 1); 00317 00318 // if the position min_col is smaller than or equal to the left end 00319 // (first mapped position), return 0 00320 if (min_col <= left_end) 00321 return 0; 00322 00323 // if the position min_col is greater than the last mapped sequence 00324 // position, the position does not exists 00325 // return the number of valid positions 00326 if (min_col > last_mapped_seq_pos) 00327 return num_pos; 00328 00329 // the matrix position after the first valid position before the 00330 // position min_col (first matrix position that 00331 // stores a sequence position that is greater than or equal to 00332 // min_col-1) 00333 matidx_t idx_geq = 00334 first_valid_mat_pos_before_eq(index, min_col - 1, left_end) + 1; 00335 00336 assert(get_pos_in_seq_new(index, idx_geq) >= min_col && 00337 !(get_pos_in_seq_new(index, idx_geq - 1) >= min_col)); 00338 00339 return idx_geq; 00340 } 00341 00352 matidx_t 00353 idx_after_leq( 00354 index_t index, 00355 seq_pos_t max_col, 00356 index_t left_end = std::numeric_limits<index_t>::max()) const { 00357 if (left_end == std::numeric_limits<index_t>::max()) 00358 left_end = index; 00359 00360 size_t num_pos = number_of_valid_mat_pos(index); 00361 seq_pos_t last_mapped_seq_pos = 00362 get_pos_in_seq_new(index, num_pos - 1); 00363 00364 // if the position max_col is smaller than the left_end (first 00365 // mapped position), return 0 00366 if (max_col < left_end) 00367 return 0; 00368 00369 // if the position max_col is greater than or equal to the last 00370 // mapped sequence position, 00371 // return the number of positions 00372 if (max_col >= last_mapped_seq_pos) 00373 return num_pos; 00374 00375 // the matrix position after the first matrix position before or 00376 // equal the sequence position max_col 00377 // (last matrix position that stores a sequence position that is 00378 // lower than or equal max_col) 00379 matidx_t idx_after_leq = 00380 first_valid_mat_pos_before_eq(index, max_col, left_end) + 1; 00381 00382 assert(get_pos_in_seq_new(index, idx_after_leq - 1) <= max_col && 00383 !(get_pos_in_seq_new(index, idx_after_leq) <= max_col)); 00384 00385 return idx_after_leq; 00386 } 00387 00388 private: 00399 bool 00400 is_valid_arc(const Arc &inner_arc, const Arc &arc) const { 00401 assert(inner_arc.left() > arc.left() && 00402 inner_arc.right() < arc.right()); 00403 return rnadata.arc_in_loop_prob(inner_arc.left(), inner_arc.right(), 00404 arc.left(), arc.right()) >= 00405 prob_basepair_in_loop_threshold; 00406 } 00407 00416 bool 00417 is_valid_arc_external(const Arc &inner_arc) const { 00418 return rnadata.arc_external_prob(inner_arc.left(), 00419 inner_arc.right()) >= 00420 prob_basepair_in_loop_threshold; 00421 } 00422 00423 public: 00434 bool 00435 is_valid_pos(const Arc &arc, seq_pos_t pos) const { 00436 assert(arc.left() < pos && pos < arc.right()); 00437 return rnadata.unpaired_in_loop_prob(pos, arc.left(), 00438 arc.right()) >= 00439 prob_unpaired_in_loop_threshold; // todo: additional variable 00440 // for external case 00441 } 00442 00449 bool 00450 is_valid_pos_external(seq_pos_t pos) const { 00451 return rnadata.unpaired_external_prob(pos) >= 00452 prob_unpaired_in_loop_threshold; // todo: additional variable 00453 // for external case 00454 } 00455 }; 00456 00463 template <class T> 00464 std::ostream & 00465 operator<<(std::ostream &out, const std::vector<T> &vec) { 00466 for (typename std::vector<T>::const_iterator it = vec.begin(); 00467 it != vec.end(); it++) { 00468 out << *it << " "; 00469 } 00470 return out; 00471 } 00472 00480 std::ostream & 00481 operator<<( 00482 std::ostream &out, 00483 const std::vector<SparsificationMapper::InfoForPosVec> &pos_vecs_); 00484 00492 std::ostream & 00493 operator<<(std::ostream &out, 00494 const SparsificationMapper::InfoForPosVec &pos_vec_); 00495 00496 } // end namespace 00497 00498 #endif // SPARSIFICATION_MAPPER_HH