src/biu/Alphabet.cc
Go to the documentation of this file.00001 #include <biu/Alphabet.hh>
00002 #include <biu/assertbiu.hh>
00003
00004 #include <iostream>
00005 #include <math.h>
00006
00007 namespace biu {
00008
00009 Alphabet::Alphabet( const std::string& alphabetString,
00010 const size_t _elementLength)
00011 : elementLength(_elementLength)
00012 {
00013 assertbiu (alphabetString.length() % elementLength == 0,
00014 "length of the alphabetstring has to be multiple of the element length");
00015
00016
00017 alph2string = std::vector<std::string>(
00018 alphabetString.length() / elementLength, "");
00019 AlphElem actElem = 0;
00020 std::string actStr = "";
00021 for (std::string::size_type i = 0; i<alphabetString.length();
00022 i+= elementLength) {
00023 actStr = alphabetString.substr(i,elementLength);
00024 string2alph[actStr] = actElem;
00025 alph2string[actElem] = actStr;
00026 actElem++;
00027 }
00028 }
00029
00030 Alphabet::Alphabet(const std::vector<std::string> & alphabetStrings) {
00031
00032 size_t alphabetSize=alphabetStrings.size();
00033
00034 assertbiu (alphabetSize>0,"No init of empty alphabet allowed");
00035
00036 elementLength = alphabetStrings[0].length();
00037
00038 #ifdef NDEBUG
00039 {
00040 size_t i;
00041 for(i=0; i<alphabetSize; i++) {
00042 assertbiu(alphabetStrings[i].length() == elementLength,
00043 "All strings must have equal length");
00044 }
00045 }
00046 #endif
00047
00048 alph2string = std::vector<std::string>(alphabetSize, "");
00049
00050 AlphElem actElem = 0;
00051 std::string actStr = "";
00052 for (std::string::size_type i = 0; i<alphabetSize; i++) {
00053 actStr = alphabetStrings[i];
00054 string2alph[actStr] = actElem;
00055 alph2string[actElem] = actStr;
00056 actElem++;
00057 }
00058 }
00059
00060 Alphabet::~Alphabet() {
00061 }
00062
00063 bool
00064 Alphabet::operator== (const Alphabet& alph2) const {
00065 return string2alph == alph2.string2alph
00066 && alph2string == alph2.alph2string
00067 && elementLength == alph2.elementLength;
00068 }
00069
00070 bool
00071 Alphabet::operator!= (const Alphabet& alph2) const {
00072 return string2alph != alph2.string2alph
00073 && alph2string != alph2.alph2string
00074 && elementLength != alph2.elementLength;
00075 }
00076
00077
00078
00079 Alphabet::Sequence
00080 Alphabet::getSequence(const std::string& seqString) const {
00081 assertbiu (seqString.length() % elementLength == 0,
00082 "the sequence length has to be a multiple of the element length");
00083
00084 Alphabet::Sequence seq(seqString.length() / elementLength,0);
00085 Alphabet::Sequence::size_type seqPos = 0;
00086 for (std::string::size_type i=0; i<seqString.length();
00087 i+= elementLength) {
00088 seq[seqPos++] = getElement(seqString.substr(i,elementLength));
00089 }
00090 return seq;
00091 }
00092
00093 std::string
00094 Alphabet::getString(const Alphabet::Sequence& sequence) const {
00095 std::string str(sequence.size()*elementLength,' ');
00096 std::string::size_type i = 0, j = 0;
00097 for (Alphabet::Sequence::const_iterator it = sequence.begin();
00098 it != sequence.end(); it++) {
00099 for (j=0; j<elementLength; j++) {
00100 str[i++] = alph2string.at(*it)[j];
00101 }
00102 }
00103 return str;
00104 }
00105
00106 std::string
00107 Alphabet::getString(const Alphabet::AlphElem& elem) const {
00108 assertbiu( elem < alph2string.size(),
00109 "elem is no valid element of the alphabet");
00110 return alph2string.at(elem);
00111 }
00112
00113 Alphabet::AlphElem
00114 Alphabet::getElement(const std::string& alphElemStr) const {
00115 STR2ALPH_TYPE::const_iterator member = string2alph.find(alphElemStr);
00116 assertbiu(member != string2alph.end(),
00117 "element '"+alphElemStr+"' is no member of the alphabet");
00118 return member->second;
00119 }
00120
00121 bool
00122 Alphabet::isAlphabetString(const std::string& str) const {
00123
00124 bool retVal = (str.length() % elementLength == 0);
00125 for (std::string::size_type i = 0; retVal && i< str.size();
00126 i+=elementLength) {
00127 retVal = string2alph.find(str.substr(i,elementLength))
00128 != string2alph.end();
00129 }
00130 return retVal;
00131 }
00132
00133 size_t
00134 Alphabet::getIndex(const AlphElem& elem) const {
00135 return (size_t)elem;
00136 }
00137
00138 size_t
00139 Alphabet::getIndex(const std::string& elemStr) const {
00140 return getIndex(getElement(elemStr));
00141 }
00142
00143 Alphabet::AlphElem
00144 Alphabet::getElement(const size_t index) const {
00145 assertbiu(index < getAlphabetSize(),
00146 "the index leaves the valid indices of the alphabet");
00147 return index;
00148 }
00149
00150
00151 Alphabet::CSequence
00152 Alphabet::compress(const Alphabet::Sequence& seq) const
00153 {
00154
00155 size_t base = this->getAlphabetSize();
00156
00157 size_t numOfPos = (size_t) (log(256.) / log((double) base));
00158
00159
00160 CSequence cs(
00161 (size_t) ceil((double) seq.size() / (double) numOfPos), 0);
00162
00163
00164 for (size_t i = 0; i < seq.size(); i++) {
00165 cs[i/numOfPos] += seq.at(i) * (int) pow(base, i%numOfPos);
00166 }
00167 return cs;
00168 }
00169
00170 Alphabet::Sequence
00171 Alphabet::decompress(const Alphabet::CSequence & seq,
00172 const size_t seqLength) const
00173 {
00174
00175 size_t base = this->getAlphabetSize();
00176
00177 size_t numOfPos = (size_t) (log(256.) / log((double) base));
00178 Sequence uncompressedSeq(seqLength, this->getElement(0));
00179
00180 unsigned char c = seq[0];
00181
00182 assertbiu(seqLength <= seq.size()*numOfPos,
00183 "The compressed sequence can't hold <seqLength> many elements.");
00184
00185
00186 for (size_t i = 0; i < seqLength; i++) {
00187
00188 if (i%numOfPos == 0) {
00189 c = seq[i/numOfPos];
00190 }
00191 uncompressedSeq[i] = this->getElement( c % base );
00192 c /= base;
00193 }
00194
00195 return uncompressedSeq;
00196 }
00197
00198 bool
00199 Alphabet::isAlphabetSequence(const Alphabet::Sequence& seq) const {
00200 const size_t size = getAlphabetSize();
00201 for (Sequence::const_iterator it = seq.begin(); it != seq.end(); it++) {
00202 if ((size_t)*it > size)
00203 return false;
00204 }
00205 return true;
00206 }
00207
00208
00209
00210 }
00211