LocARNA-1.9.2
|
00001 #ifndef LOCARNA_MULTIPLE_ALIGNMENT_HH 00002 #define LOCARNA_MULTIPLE_ALIGNMENT_HH 00003 00004 #ifdef HAVE_CONFIG_H 00005 #include <config.h> 00006 #endif 00007 00008 #include <iosfwd> 00009 #include <string> 00010 #include <vector> 00011 #include <map> 00012 00013 #include "aux.hh" 00014 #include "string1.hh" 00015 #include "scoring_fwd.hh" 00016 #include "sequence_annotation.hh" 00017 00018 #include <assert.h> 00019 00020 #include <iostream> 00021 00022 namespace LocARNA { 00023 00024 class Alignment; 00025 class AlignmentEdges; 00026 template <class T> 00027 class Alphabet; 00028 class BasePairs; 00029 class Scoring; 00030 class Sequence; 00031 00065 class MultipleAlignment { 00066 public: 00067 typedef size_t size_type; 00068 00072 struct FormatType { 00074 enum type { 00075 STOCKHOLM, 00076 PP, 00077 CLUSTAL, 00078 FASTA 00079 }; 00080 00082 static size_t 00083 size() { 00084 return 4; 00085 } 00086 }; 00087 00090 struct AnnoType { 00092 enum type { 00094 consensus_structure, 00097 structure, 00100 fixed_structure, 00102 anchors 00103 }; 00104 00106 static size_t 00107 size() { 00108 return 4; 00109 } 00110 }; 00111 00112 private: 00118 typedef std::vector<std::vector<std::string> > annotation_tags_t; 00119 00120 static annotation_tags_t annotation_tags; 00121 00123 static void 00124 init_annotation_tags(); 00125 00126 public: 00129 static size_t 00130 num_of_annotypes() { 00131 return annotation_tags.size(); 00132 } 00133 00142 class SeqEntry { 00143 public: 00144 typedef MultipleAlignment::size_type size_type; 00145 00146 typedef std::pair<pos_type, pos_type> 00147 pos_pair_t; 00148 00149 private: 00150 std::string name_; 00151 std::string description_; 00152 string1 seq_; //<! alignment string of the sequence 00153 00154 public: 00162 SeqEntry(const std::string &name, const std::string &seq) 00163 : name_(name), description_(""), seq_((string1)seq) {} 00164 00172 SeqEntry(const std::string &name, const string1 &seq) 00173 : name_(name), description_(""), seq_(seq) {} 00174 00182 SeqEntry(const std::string &name, 00183 const std::string &description, 00184 const std::string &seq) 00185 : name_(name), description_(description), seq_((string1)seq) {} 00186 00195 SeqEntry(const std::string &name, 00196 const std::string &description, 00197 const string1 &seq) 00198 : name_(name), description_(description), seq_(seq) {} 00199 00200 // access 00201 00203 const std::string & 00204 name() const { 00205 return name_; 00206 } 00207 00209 const std::string & 00210 description() const { 00211 return description_; 00212 } 00213 00215 const string1 & 00216 seq() const { 00217 return seq_; 00218 } 00219 00221 size_type 00222 length_wogaps() const; 00223 00224 //**************************************** 00225 // projections 00226 00234 pos_type 00235 pos_to_col(pos_type pos) const; 00236 00248 pos_pair_t 00249 col_to_pos(pos_type col) const; 00250 00255 void 00256 reverse() { 00257 seq_.reverse(); 00258 } 00259 00264 void 00265 push_back(char c) { 00266 seq_.push_back(c); 00267 } 00268 00270 void 00271 set_seq(const string1 &seq) { 00272 seq_ = seq; 00273 } 00274 }; 00275 00282 class AliColumn { 00283 const MultipleAlignment &ma_; 00284 size_type col_index_; 00285 00286 public: 00293 AliColumn(const MultipleAlignment &ma, size_type col_index) 00294 : ma_(ma), col_index_(col_index) { 00295 assert(1 <= col_index); 00296 assert(col_index <= ma.length()); 00297 } 00298 00306 const char &operator[](size_type row_index) const { 00307 return ma_.seqentry(row_index).seq()[col_index_]; 00308 } 00309 00314 size_type 00315 size() const { 00316 return ma_.num_of_rows(); 00317 } 00318 00326 bool 00327 operator==(const AliColumn &ac) const { 00328 bool ret = this->size() == ac.size(); 00329 for (size_type i = 0; ret && i < size(); i++) { 00330 ret = (this->ma_.seqentry(i).seq()[this->col_index_] == 00331 ac.ma_.seqentry(i).seq()[ac.col_index_]); 00332 } 00333 return ret; 00334 } 00335 00343 bool 00344 operator!=(const AliColumn &ac) const { 00345 return !(*this == ac); 00346 } 00347 }; 00348 00349 private: 00351 typedef std::map<std::string, size_type> str2idx_map_t; 00352 00354 typedef std::map<size_t, SequenceAnnotation> annotation_map_t; 00355 00356 //************************************************************ 00357 // attributes of MultipleAlignment 00358 00360 std::vector<SeqEntry> alig_; 00361 00363 annotation_map_t annotations_; 00364 00369 str2idx_map_t name2idx_; 00370 00371 // end attributes 00372 //************************************************************ 00373 00375 void 00376 create_name2idx_map(); 00377 00386 void 00387 read_clustallike(std::istream &in, FormatType::type format); 00388 00395 void 00396 read_stockholm(std::istream &in); 00397 00410 void 00411 read_clustalw(std::istream &in); 00412 00432 void 00433 read_fasta(std::istream &in); 00434 00435 public: 00437 typedef std::vector<SeqEntry>::const_iterator const_iterator; 00438 00440 MultipleAlignment(); 00441 00450 MultipleAlignment(const std::string &file, 00451 FormatType::type format = FormatType::CLUSTAL); 00452 00460 MultipleAlignment(std::istream &in, 00461 FormatType::type format = FormatType::CLUSTAL); 00462 00468 MultipleAlignment(const std::string &name, const std::string &sequence); 00469 00481 MultipleAlignment(const std::string &nameA, 00482 const std::string &nameB, 00483 const std::string &alistringA, 00484 const std::string &alistringB); 00485 00499 MultipleAlignment(const Alignment &alignment, 00500 bool only_local = false, 00501 bool special_gap_symbols = false); 00502 00514 MultipleAlignment(const AlignmentEdges &edges, 00515 const Sequence &seqA, 00516 const Sequence &seqB); 00517 00518 protected: 00528 void 00529 init(const AlignmentEdges &edges, 00530 const Sequence &seqA, 00531 const Sequence &seqB, 00532 bool special_gap_symbols); 00533 00534 public: 00538 virtual ~MultipleAlignment(); 00539 00546 const Sequence & 00547 as_sequence() const; 00548 00556 void 00557 normalize_rna_symbols(); 00558 00563 size_type 00564 num_of_rows() const { 00565 return alig_.size(); 00566 } 00567 00576 bool 00577 empty() const { 00578 return alig_.empty(); 00579 } 00580 00587 const SequenceAnnotation & 00588 annotation(const AnnoType::type &annotype) const; 00589 00597 void 00598 set_annotation(const AnnoType::type &annotype, 00599 const SequenceAnnotation &annotation) { 00600 assert(0 <= annotype && annotype < num_of_annotypes()); 00601 annotations_[(size_t)annotype] = annotation; 00602 } 00603 00609 bool 00610 has_annotation(const AnnoType::type &annotype) const { 00611 assert(0 <= annotype && annotype < num_of_annotypes()); 00612 return annotations_.find(annotype) != annotations_.end(); 00613 } 00614 00619 bool 00620 is_proper() const; 00621 00629 pos_type 00630 length() const { 00631 return alig_.empty() ? 0 : alig_[0].seq().length(); 00632 } 00633 00638 const_iterator 00639 begin() const { 00640 return alig_.begin(); 00641 } 00642 00647 const_iterator 00648 end() const { 00649 return alig_.end(); 00650 } 00651 00657 bool 00658 contains(const std::string &name) const; 00659 00660 /* index access saves time over access by sequence name */ 00661 00669 size_type 00670 index(const std::string &name) const { 00671 str2idx_map_t::const_iterator it = name2idx_.find(name); 00672 assert(it != name2idx_.end()); 00673 return it->second; 00674 } 00675 00683 const SeqEntry & 00684 seqentry(size_type index) const { 00685 return alig_[index]; 00686 } 00687 00694 const SeqEntry & 00695 seqentry(const std::string &name) const { 00696 return alig_[index(name)]; 00697 } 00698 00709 size_type 00710 deviation(const MultipleAlignment &ma) const; 00711 00727 double 00728 sps(const MultipleAlignment &ma, bool compalign = true) const; 00729 00745 double 00746 cmfinder_realignment_score(const MultipleAlignment &ma) const; 00747 00761 double 00762 avg_deviation_score(const MultipleAlignment &ma) const; 00763 00772 std::string 00773 consensus_sequence() const; 00774 00782 AliColumn 00783 column(size_type col_index) const { 00784 return AliColumn(*this, col_index); 00785 } 00786 00794 void 00795 append(const SeqEntry &seqentry); 00796 00807 void 00808 prepend(const SeqEntry &seqentry); 00809 00815 void 00816 operator+=(const AliColumn &c); 00817 00823 void 00824 operator+=(char c); 00825 00829 void 00830 reverse(); 00831 00832 // ------------------------------------------------------------ 00833 // output 00834 00849 std::ostream & 00850 write(std::ostream &out, 00851 FormatType::type format = 00852 MultipleAlignment::FormatType::CLUSTAL) const; 00853 00868 std::ostream & 00869 write(std::ostream &out, 00870 size_t width, 00871 FormatType::type format = 00872 MultipleAlignment::FormatType::CLUSTAL) const; 00873 00886 std::ostream & 00887 write_name_sequence_line(std::ostream &out, 00888 const std::string &name, 00889 const std::string &sequence, 00890 size_t namewidth) const; 00891 00905 std::ostream & 00906 write(std::ostream &out, 00907 size_type start, 00908 size_type end, 00909 FormatType::type format = 00910 MultipleAlignment::FormatType::CLUSTAL) const; 00911 00922 bool 00923 checkAlphabet(const Alphabet<char> &alphabet) const; 00924 00925 private: 00936 static size_type 00937 deviation2(const string1 &a1, 00938 const string1 &a2, 00939 const string1 &ref1, 00940 const string1 &ref2); 00941 00956 static double 00957 pairwise_match_score(const SeqEntry &a1, 00958 const SeqEntry &a2, 00959 const SeqEntry &ref1, 00960 const SeqEntry &ref2, 00961 bool score_common_gaps); 00962 00973 static std::vector<int> 00974 match_vector(const string1 &s, const string1 &t); 00975 00986 static std::vector<int> 00987 match_vector2(const string1 &s, const string1 &t); 00988 00997 static size_t 00998 count_matches(const SeqEntry &a1, const SeqEntry &a2); 00999 01012 static size_t 01013 count_exclusive_matches(const SeqEntry &a1, 01014 const SeqEntry &a2, 01015 const SeqEntry &ref1, 01016 const SeqEntry &ref2); 01017 01035 static double 01036 pairwise_deviation_score(const SeqEntry &a1, 01037 const SeqEntry &a2, 01038 const SeqEntry &ref1, 01039 const SeqEntry &ref2); 01040 01041 public: 01046 void 01047 write_debug(std::ostream &out = std::cout) const; 01048 }; 01049 01056 std::ostream & 01057 operator<<(std::ostream &out, const MultipleAlignment &ma); 01058 01059 } // end namespace 01060 01061 #endif // LOCARNA_MULTIPLE_ALIGNMENT_HH