LocARNA-1.9.2
src/LocARNA/multiple_alignment.hh
00001 #ifndef LOCARNA_MULTIPLE_ALIGNMENT_HH
00002 #define LOCARNA_MULTIPLE_ALIGNMENT_HH
00003 
00004 #ifdef HAVE_CONFIG_H
00005 #include <config.h>
00006 #endif
00007 
00008 #include <iosfwd>
00009 #include <string>
00010 #include <vector>
00011 #include <map>
00012 
00013 #include "aux.hh"
00014 #include "string1.hh"
00015 #include "scoring_fwd.hh"
00016 #include "sequence_annotation.hh"
00017 
00018 #include <assert.h>
00019 
00020 #include <iostream>
00021 
00022 namespace LocARNA {
00023 
00024     class Alignment;
00025     class AlignmentEdges;
00026     template <class T>
00027     class Alphabet;
00028     class BasePairs;
00029     class Scoring;
00030     class Sequence;
00031 
00065     class MultipleAlignment {
00066     public:
00067         typedef size_t size_type; 
00068 
00072         struct FormatType {
00074             enum type {
00075                 STOCKHOLM, 
00076                 PP,        
00077                 CLUSTAL,   
00078                 FASTA      
00079             };
00080 
00082             static size_t
00083             size() {
00084                 return 4;
00085             }
00086         };
00087 
00090         struct AnnoType {
00092             enum type {
00094                 consensus_structure,
00097                 structure,
00100                 fixed_structure,
00102                 anchors
00103             };
00104 
00106             static size_t
00107             size() {
00108                 return 4;
00109             }
00110         };
00111 
00112     private:
00118         typedef std::vector<std::vector<std::string> > annotation_tags_t;
00119 
00120         static annotation_tags_t annotation_tags;
00121 
00123         static void
00124         init_annotation_tags();
00125 
00126     public:
00129         static size_t
00130         num_of_annotypes() {
00131             return annotation_tags.size();
00132         }
00133 
00142         class SeqEntry {
00143         public:
00144             typedef MultipleAlignment::size_type size_type; 
00145 
00146             typedef std::pair<pos_type, pos_type>
00147                 pos_pair_t; 
00148 
00149         private:
00150             std::string name_;        
00151             std::string description_; 
00152             string1 seq_;             //<! alignment string of the sequence
00153 
00154         public:
00162             SeqEntry(const std::string &name, const std::string &seq)
00163                 : name_(name), description_(""), seq_((string1)seq) {}
00164 
00172             SeqEntry(const std::string &name, const string1 &seq)
00173                 : name_(name), description_(""), seq_(seq) {}
00174 
00182             SeqEntry(const std::string &name,
00183                      const std::string &description,
00184                      const std::string &seq)
00185                 : name_(name), description_(description), seq_((string1)seq) {}
00186 
00195             SeqEntry(const std::string &name,
00196                      const std::string &description,
00197                      const string1 &seq)
00198                 : name_(name), description_(description), seq_(seq) {}
00199 
00200             // access
00201 
00203             const std::string &
00204             name() const {
00205                 return name_;
00206             }
00207 
00209             const std::string &
00210             description() const {
00211                 return description_;
00212             }
00213 
00215             const string1 &
00216             seq() const {
00217                 return seq_;
00218             }
00219 
00221             size_type
00222             length_wogaps() const;
00223 
00224             //****************************************
00225             // projections
00226 
00234             pos_type
00235             pos_to_col(pos_type pos) const;
00236 
00248             pos_pair_t
00249             col_to_pos(pos_type col) const;
00250 
00255             void
00256             reverse() {
00257                 seq_.reverse();
00258             }
00259 
00264             void
00265             push_back(char c) {
00266                 seq_.push_back(c);
00267             }
00268 
00270             void
00271             set_seq(const string1 &seq) {
00272                 seq_ = seq;
00273             }
00274         };
00275 
00282         class AliColumn {
00283             const MultipleAlignment &ma_;
00284             size_type col_index_;
00285 
00286         public:
00293             AliColumn(const MultipleAlignment &ma, size_type col_index)
00294                 : ma_(ma), col_index_(col_index) {
00295                 assert(1 <= col_index);
00296                 assert(col_index <= ma.length());
00297             }
00298 
00306             const char &operator[](size_type row_index) const {
00307                 return ma_.seqentry(row_index).seq()[col_index_];
00308             }
00309 
00314             size_type
00315             size() const {
00316                 return ma_.num_of_rows();
00317             }
00318 
00326             bool
00327             operator==(const AliColumn &ac) const {
00328                 bool ret = this->size() == ac.size();
00329                 for (size_type i = 0; ret && i < size(); i++) {
00330                     ret = (this->ma_.seqentry(i).seq()[this->col_index_] ==
00331                            ac.ma_.seqentry(i).seq()[ac.col_index_]);
00332                 }
00333                 return ret;
00334             }
00335 
00343             bool
00344             operator!=(const AliColumn &ac) const {
00345                 return !(*this == ac);
00346             }
00347         };
00348 
00349     private:
00351         typedef std::map<std::string, size_type> str2idx_map_t;
00352 
00354         typedef std::map<size_t, SequenceAnnotation> annotation_map_t;
00355 
00356         //************************************************************
00357         // attributes of MultipleAlignment
00358 
00360         std::vector<SeqEntry> alig_;
00361 
00363         annotation_map_t annotations_;
00364 
00369         str2idx_map_t name2idx_;
00370 
00371         // end attributes
00372         //************************************************************
00373 
00375         void
00376         create_name2idx_map();
00377 
00386         void
00387         read_clustallike(std::istream &in, FormatType::type format);
00388 
00395         void
00396         read_stockholm(std::istream &in);
00397 
00410         void
00411         read_clustalw(std::istream &in);
00412 
00432         void
00433         read_fasta(std::istream &in);
00434 
00435     public:
00437         typedef std::vector<SeqEntry>::const_iterator const_iterator;
00438 
00440         MultipleAlignment();
00441 
00450         MultipleAlignment(const std::string &file,
00451                           FormatType::type format = FormatType::CLUSTAL);
00452 
00460         MultipleAlignment(std::istream &in,
00461                           FormatType::type format = FormatType::CLUSTAL);
00462 
00468         MultipleAlignment(const std::string &name, const std::string &sequence);
00469 
00481         MultipleAlignment(const std::string &nameA,
00482                           const std::string &nameB,
00483                           const std::string &alistringA,
00484                           const std::string &alistringB);
00485 
00499         MultipleAlignment(const Alignment &alignment,
00500                           bool only_local = false,
00501                           bool special_gap_symbols = false);
00502 
00514         MultipleAlignment(const AlignmentEdges &edges,
00515                           const Sequence &seqA,
00516                           const Sequence &seqB);
00517 
00518     protected:
00528         void
00529         init(const AlignmentEdges &edges,
00530              const Sequence &seqA,
00531              const Sequence &seqB,
00532              bool special_gap_symbols);
00533 
00534     public:
00538         virtual ~MultipleAlignment();
00539 
00546         const Sequence &
00547         as_sequence() const;
00548 
00556         void
00557         normalize_rna_symbols();
00558 
00563         size_type
00564         num_of_rows() const {
00565             return alig_.size();
00566         }
00567 
00576         bool
00577         empty() const {
00578             return alig_.empty();
00579         }
00580 
00587         const SequenceAnnotation &
00588         annotation(const AnnoType::type &annotype) const;
00589 
00597         void
00598         set_annotation(const AnnoType::type &annotype,
00599                        const SequenceAnnotation &annotation) {
00600             assert(0 <= annotype && annotype < num_of_annotypes());
00601             annotations_[(size_t)annotype] = annotation;
00602         }
00603 
00609         bool
00610         has_annotation(const AnnoType::type &annotype) const {
00611             assert(0 <= annotype && annotype < num_of_annotypes());
00612             return annotations_.find(annotype) != annotations_.end();
00613         }
00614 
00619         bool
00620         is_proper() const;
00621 
00629         pos_type
00630         length() const {
00631             return alig_.empty() ? 0 : alig_[0].seq().length();
00632         }
00633 
00638         const_iterator
00639         begin() const {
00640             return alig_.begin();
00641         }
00642 
00647         const_iterator
00648         end() const {
00649             return alig_.end();
00650         }
00651 
00657         bool
00658         contains(const std::string &name) const;
00659 
00660         /* index access saves time over access by sequence name */
00661 
00669         size_type
00670         index(const std::string &name) const {
00671             str2idx_map_t::const_iterator it = name2idx_.find(name);
00672             assert(it != name2idx_.end());
00673             return it->second;
00674         }
00675 
00683         const SeqEntry &
00684         seqentry(size_type index) const {
00685             return alig_[index];
00686         }
00687 
00694         const SeqEntry &
00695         seqentry(const std::string &name) const {
00696             return alig_[index(name)];
00697         }
00698 
00709         size_type
00710         deviation(const MultipleAlignment &ma) const;
00711 
00727         double
00728         sps(const MultipleAlignment &ma, bool compalign = true) const;
00729 
00745         double
00746         cmfinder_realignment_score(const MultipleAlignment &ma) const;
00747 
00761         double
00762         avg_deviation_score(const MultipleAlignment &ma) const;
00763 
00772         std::string
00773         consensus_sequence() const;
00774 
00782         AliColumn
00783         column(size_type col_index) const {
00784             return AliColumn(*this, col_index);
00785         }
00786 
00794         void
00795         append(const SeqEntry &seqentry);
00796 
00807         void
00808         prepend(const SeqEntry &seqentry);
00809 
00815         void
00816         operator+=(const AliColumn &c);
00817 
00823         void
00824         operator+=(char c);
00825 
00829         void
00830         reverse();
00831 
00832         // ------------------------------------------------------------
00833         // output
00834 
00849         std::ostream &
00850         write(std::ostream &out,
00851               FormatType::type format =
00852                   MultipleAlignment::FormatType::CLUSTAL) const;
00853 
00868         std::ostream &
00869         write(std::ostream &out,
00870               size_t width,
00871               FormatType::type format =
00872                   MultipleAlignment::FormatType::CLUSTAL) const;
00873 
00886         std::ostream &
00887         write_name_sequence_line(std::ostream &out,
00888                                  const std::string &name,
00889                                  const std::string &sequence,
00890                                  size_t namewidth) const;
00891 
00905         std::ostream &
00906         write(std::ostream &out,
00907               size_type start,
00908               size_type end,
00909               FormatType::type format =
00910                   MultipleAlignment::FormatType::CLUSTAL) const;
00911 
00922         bool
00923         checkAlphabet(const Alphabet<char> &alphabet) const;
00924 
00925     private:
00936         static size_type
00937         deviation2(const string1 &a1,
00938                    const string1 &a2,
00939                    const string1 &ref1,
00940                    const string1 &ref2);
00941 
00956         static double
00957         pairwise_match_score(const SeqEntry &a1,
00958                              const SeqEntry &a2,
00959                              const SeqEntry &ref1,
00960                              const SeqEntry &ref2,
00961                              bool score_common_gaps);
00962 
00973         static std::vector<int>
00974         match_vector(const string1 &s, const string1 &t);
00975 
00986         static std::vector<int>
00987         match_vector2(const string1 &s, const string1 &t);
00988 
00997         static size_t
00998         count_matches(const SeqEntry &a1, const SeqEntry &a2);
00999 
01012         static size_t
01013         count_exclusive_matches(const SeqEntry &a1,
01014                                 const SeqEntry &a2,
01015                                 const SeqEntry &ref1,
01016                                 const SeqEntry &ref2);
01017 
01035         static double
01036         pairwise_deviation_score(const SeqEntry &a1,
01037                                  const SeqEntry &a2,
01038                                  const SeqEntry &ref1,
01039                                  const SeqEntry &ref2);
01040 
01041     public:
01046         void
01047         write_debug(std::ostream &out = std::cout) const;
01048     };
01049 
01056     std::ostream &
01057     operator<<(std::ostream &out, const MultipleAlignment &ma);
01058 
01059 } // end namespace
01060 
01061 #endif // LOCARNA_MULTIPLE_ALIGNMENT_HH
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends