ROSE  0.11.145.0
Clexer.h
1 // WARNING: Changes to this file must be contributed back to Sawyer or else they will
2 // be clobbered by the next update from Sawyer. The Sawyer repository is at
3 // https://github.com/matzke1/sawyer.
4 
5 
6 
7 
8 // Lexical analyzer for C-like languages
9 #ifndef Sawyer_Clexer_H
10 #define Sawyer_Clexer_H
11 
12 #include <Sawyer/Sawyer.h>
13 
14 #include <Sawyer/Assert.h>
15 #include <Sawyer/Buffer.h>
16 #include <Sawyer/Interval.h>
17 #include <Sawyer/LineVector.h>
18 
19 #include <string>
20 #include <vector>
21 
22 namespace Sawyer {
23 namespace Language {
24 namespace Clexer {
25 
// Kinds of tokens recognized by the lexical analyzer.
enum TokenType {
    // End of file.
    TOK_EOF = 0,
    // Opening bracket: '(', '[', or '{'.
    TOK_LEFT,
    // Closing bracket: ')', ']', or '}'.
    TOK_RIGHT,
    // Character literal.
    TOK_CHAR,
    // String literal.
    TOK_STRING,
    // Numeric constant, including optional leading sign.
    TOK_NUMBER,
    // Word or symbol name.
    TOK_WORD,
    // Preprocessor statement starting with '#'.
    TOK_CPP,
    // Comment starting with '//' or '/*'.
    TOK_COMMENT,
    // Anything not covered by the other kinds.
    TOK_OTHER
};
38 
// Converts a token type to a string. Only the declaration is visible in this header; presumably the result is a
// printable name for the enumerator -- confirm against the definition.
39 std::string toString(TokenType);
40 
// Interval of size_t values. Token::where() builds one from a token's first and last character locations, and
// TokenStream uses one to describe the region it parses.
41 using Indices = Container::Interval<size_t>;
42 
43 class Token {
44  friend class TokenStream;
45 
46  TokenType type_;
47  size_t prior_; // start of skipped stuff (whitespace, etc) before begin_
48  size_t begin_; // location for first character of token
49  size_t end_; // location one past end of token
50 
51 public:
52  Token(): type_(TOK_EOF), prior_(0), begin_(0), end_(0) {} // for std::vector, otherwise not used
53 
54  Token(TokenType type, size_t prior, size_t begin, size_t end)
55  : type_(type), prior_(prior), begin_(begin), end_(end) {
56  ASSERT_require(prior <= begin_);
57  ASSERT_require(begin <= end);
58  }
59 
60  TokenType type() const {
61  return type_;
62  }
63 
64  size_t prior() const {
65  return prior_;
66  }
67 
68  size_t begin() const {
69  return begin_;
70  }
71 
72  size_t end() const {
73  return end_;
74  }
75 
76  size_t size() const {
77  return end_ - begin_;
78  }
79 
80  Indices where() const {
81  return end_ > begin_ ? Indices::hull(begin_, end_-1) : Indices();
82  }
83 
84  explicit operator bool() const {
85  return type_ != TOK_EOF;
86  }
87 
88  bool operator!() const {
89  return type_ == TOK_EOF;
90  }
91 };
92 
93 class TokenStream {
94 private:
95  std::string fileName_; // name of source file
96  Sawyer::Container::LineVector content_; // contents of source file
97  Indices parseRegion_; // parse only within this region
98  size_t prior_; // one past end of previous token
99  size_t at_; // cursor position in buffer
100  std::vector<Token> tokens_; // token stream filled on demand
101  bool skipPreprocessorTokens_; // skip over '#' preprocessor directives
102  bool skipCommentTokens_; // skip over '//' and '/*' comments
103 
104 public:
105  // Parse the contents of a file
106  explicit TokenStream(const std::string &fileName)
107  : fileName_(fileName), content_(fileName), parseRegion_(Indices::whole()), prior_(0), at_(0),
108  skipPreprocessorTokens_(true), skipCommentTokens_(true) {}
109 
110  // Parse from buffer
111  TokenStream(const std::string &fileName, const Sawyer::Container::Buffer<size_t, char>::Ptr &buffer)
112  : fileName_(fileName), content_(buffer), parseRegion_(Indices::whole()), prior_(0), at_(0),
113  skipPreprocessorTokens_(true), skipCommentTokens_(true) {}
114 
115  // Reparse part of another token stream. Position info, error messages, lines, etc. are from the enclosing token stream.
116  TokenStream(TokenStream &super, const Indices &region)
117  : fileName_(super.fileName_), content_(super.content_), parseRegion_(region), prior_(region.least()), at_(region.least()),
118  skipPreprocessorTokens_(true), skipCommentTokens_(true) {
119  ASSERT_require(region);
120  }
121 
122  const std::string fileName() const { return fileName_; }
123 
124  bool skipPreprocessorTokens() const { return skipPreprocessorTokens_; }
125  void skipPreprocessorTokens(bool b) { skipPreprocessorTokens_ = b; }
126 
127  bool skipCommentTokens() const { return skipCommentTokens_; }
128  void skipCommentTokens(bool b) { skipCommentTokens_ = b; }
129 
130  int getChar(size_t position);
131 
132  const Token& operator[](size_t lookahead);
133 
134  void consume(size_t n = 1);
135 
136  std::string lexeme(const Token &t) const;
137 
138  std::string toString(const Token &t) const;
139 
140  // Return the line of source in which this token appears, including line termination if present.
141  std::string line(const Token &t) const;
142 
143  bool matches(const Token &token, const char *s2) const;
144  bool startsWith(const Token &token, const char *prefix) const;
145 
146  void emit(std::ostream &out, const std::string &fileName, const Token &token, const std::string &message) const;
147 
148  void emit(std::ostream &out, const std::string &fileName, const Token &begin, const Token &locus, const Token &end,
149  const std::string &message) const;
150 
151  std::pair<size_t, size_t> location(const Token &token) const;
152 
153  const Sawyer::Container::LineVector& content() const {
154  return content_;
155  }
156 
157 private:
158  void scanString();
159  void makeNextToken();
160 };
161 
162 
163 } // namespace
164 } // namespace
165 } // namespace
166 
167 #endif
const char * Language(int64_t)
Convert ClangToSageTranslator::Language enum constant to a string.
A buffer of characters indexed by line number.
Definition: LineVector.h:24
Reference-counting intrusive smart pointer.
Definition: SharedPointer.h:68
Namespace for the entire library.
Definition: FeasiblePath.h:767
ROSE_UTIL_API std::string toString(const Path &)
Convert a path to a string.
T least() const
Returns lower limit.
Definition: Interval.h:207
static Interval whole()
Construct an interval that covers the entire domain.
Definition: Interval.h:180
static Interval hull(size_t v1, size_t v2)
Construct an interval from two endpoints.
Definition: Interval.h:151