librostlab  1.0.20
readFasta.h
Go to the documentation of this file.
1 /*
2  Copyright (C) 2011 Laszlo Kajan, Technical University of Munich, Germany
3 
4  This file is part of librostlab.
5 
6  librostlab is free software: you can redistribute it and/or modify
7  it under the terms of the GNU Lesser General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU Lesser General Public License for more details.
15 
16  You should have received a copy of the GNU Lesser General Public License
17  along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19 #ifndef ROSTLAB_READFASTA
20 #define ROSTLAB_READFASTA 1
21 
22 #include <boost/regex.hpp>
23 #include <iostream>
24 #include <fstream>
26 
27 namespace bo = boost;
28 
29 namespace rostlab {
30 namespace bio {
31 
32  namespace fmt {
33  class fasta{}; // fasta format class
34  };
35 
/// A biological sequence record: a display identifier, a description and the
/// sequence string itself.
///
/// @tparam _FmtT  Format tag type (for example bio::fmt::fasta). The class
///                itself does not use it; it only distinguishes
///                instantiations so that format-specific stream operators
///                can be overloaded on them.
template<typename _FmtT>
class seq {
  private:
    std::string _desc;        ///< description text of the record
    std::string _display_id;  ///< identifier of the record
    std::string _seqstr;      ///< the sequence string
  public:
    /// Construct an empty record: all three strings are empty.
    seq() {}

    /// Construct a record from its three components.
    ///
    /// @param __desc        description text
    /// @param __display_id  identifier
    /// @param __seqstr      sequence string
    seq( const std::string& __desc, const std::string& __display_id, const std::string& __seqstr )
      : _desc(__desc), _display_id(__display_id), _seqstr(__seqstr) {}

    virtual ~seq() {}

    /// Mutable access to the sequence string.
    std::string& seqstr() { return _seqstr; }

    /// Read-only access to the sequence string, usable on const objects.
    const std::string& seqstr() const { return _seqstr; }

    /// Read-only access to the description text.
    const std::string& desc() const { return _desc; }

    /// Read-only access to the display identifier.
    const std::string& display_id() const { return _display_id; }
};
49 
50 /*template<> // could specialize it...
51 class seq<bio::fmt::fasta>
52 {
53  private:
54  public:
55 };*/
56 
57 /*template<typename _FmtT>
58 std::istream& operator>>( std::istream& __is, bio::seq<_FmtT>& __n )
59 {
60  return __is;
61 }*/
62 
63 inline std::istream& operator>>( std::istream& __is, bio::seq<bio::fmt::fasta>& __seq )
64 {
65  // based on Bio/SeqIO/fasta.pm
66  std::string rec; rec.reserve(1024);
67  while( __is.peek() != std::istream::traits_type::eof() )
68  {
69  if(rec.capacity() == rec.size()) rec.reserve(rec.capacity() * 2);
70  if( rec.size() && __is.peek() == '>' && *rec.rbegin() == '\n' ) break;
71  else rec += __is.get();
72  }
73 
74  if( !rec.size() || *rec.begin() != '>' ) throw runtime_error( std::string("FASTA syntax error in record '") + rec + "': no leading '>'" );
75 
76  rec = bo::regex_replace( rec, bo::regex("^>"), "" ); // $entry =~ s/^>//;
77 
78  bo::sregex_token_iterator i(rec.begin(), rec.end(), bo::regex("\n"), -1); // split(/\n/,$entry,2);
79 
80  if( i == boost::sregex_token_iterator() ) throw runtime_error( std::string("FASTA syntax error in record '") + rec + "': only one line" );
81 
82  std::string top = *i++;
83  std::string sequence( i->first, static_cast<std::string::const_iterator>( rec.end() ) );
84 
85  sequence = bo::regex_replace( sequence, bo::regex(">"), "" ); // $sequence =~ s/>//g;
86 
87  bo::match_results<std::string::const_iterator> what;
88  std::string id, fulldesc;
89  if( bo::regex_search( top, what, bo::regex("^[[:space:]]*([^[:space:]]+)[:space:]*(.*)") ) )
90  { id = std::string( what[1].first, what[1].second ); fulldesc = std::string( what[2].first, what[2].second ); }
91 
92  if( id.empty() ) id = fulldesc;
93 
94  sequence = bo::regex_replace( sequence, bo::regex("[ \t\n\r]"), "" );
95 
96  // alphabet? would be good to have this
97 
98  __seq = bio::seq<bio::fmt::fasta>( fulldesc, id, sequence );
99 
100  return __is;
101 }
102 
103 }; // namespace bio
104 }; // namespace rostlab
105 
106 #endif /* ROSTLAB_READFASTA */
107 // vim:et:ts=2:ai:
virtual ~seq()
Definition: readFasta.h:45
seq(const std::string &__desc, const std::string &__display_id, const std::string &__seqstr)
Definition: readFasta.h:44
std::string & seqstr()
Definition: readFasta.h:47
std::istream & operator>>(std::istream &__is, bio::seq< bio::fmt::fasta > &__seq)
Definition: readFasta.h:63