ROSE 0.11.145.192
Clexer.h
1// WARNING: Changes to this file must be contributed back to Sawyer or else they will
2// be clobbered by the next update from Sawyer. The Sawyer repository is at
3// https://gitlab.com/charger7534/sawyer.git.
4
5
6
7
8// Lexical analyzer for C-like languages
9#ifndef Sawyer_Clexer_H
10#define Sawyer_Clexer_H
11
12#include <Sawyer/Sawyer.h>
13
14#include <Sawyer/Assert.h>
15#include <Sawyer/Buffer.h>
16#include <Sawyer/Interval.h>
17#include <Sawyer/LineVector.h>
18
19#include <string>
20#include <vector>
21
22namespace Sawyer {
23namespace Language {
24namespace Clexer {
25
// Categories of tokens produced by the lexer. Enumerators keep their implicit values (TOK_EOF is zero), so existing
// code that depends on the numeric order is unaffected.
enum TokenType {
    TOK_EOF,                                            // end of file
    TOK_LEFT,                                           // '(', '[', or '{'
    TOK_RIGHT,                                          // ')', ']', or '}'
    TOK_CHAR,                                           // character literal
    TOK_STRING,                                         // string literal
    TOK_NUMBER,                                         // numeric constant, including optional leading sign
    TOK_WORD,                                           // word or symbol name
    TOK_CPP,                                            // preprocessor statement starting with '#'
    TOK_COMMENT,                                        // comment starting with '//' or '/*'
    TOK_OTHER                                           // anything else
};
38
39std::string toString(TokenType);
40
41using Indices = Container::Interval<size_t>;
42
43class Token {
44 friend class TokenStream;
45
46 TokenType type_;
47 size_t prior_; // start of skipped stuff (whitespace, etc) before begin_
48 size_t begin_; // location for first character of token
49 size_t end_; // location one past end of token
50
51public:
52 Token(): type_(TOK_EOF), prior_(0), begin_(0), end_(0) {} // for std::vector, otherwise not used
53
54 Token(TokenType type, size_t prior, size_t begin, size_t end)
55 : type_(type), prior_(prior), begin_(begin), end_(end) {
56 ASSERT_require(prior <= begin_);
57 ASSERT_require(begin <= end);
58 }
59
60 TokenType type() const {
61 return type_;
62 }
63
64 size_t prior() const {
65 return prior_;
66 }
67
68 size_t begin() const {
69 return begin_;
70 }
71
72 size_t end() const {
73 return end_;
74 }
75
76 size_t size() const {
77 return end_ - begin_;
78 }
79
80 Indices where() const {
81 return end_ > begin_ ? Indices::hull(begin_, end_-1) : Indices();
82 }
83
84 explicit operator bool() const {
85 return type_ != TOK_EOF;
86 }
87
88 bool operator!() const {
89 return type_ == TOK_EOF;
90 }
91};
92
94private:
95 std::string fileName_; // name of source file
96 Sawyer::Container::LineVector content_; // contents of source file
97 Indices parseRegion_; // parse only within this region
98 size_t prior_; // one past end of previous token
99 size_t at_; // cursor position in buffer
100 std::vector<Token> tokens_; // token stream filled on demand
101 bool skipPreprocessorTokens_; // skip over '#' preprocessor directives
102 bool skipCommentTokens_; // skip over '//' and '/*' comments
103
104public:
105 // Parse the contents of a file
106 explicit TokenStream(const std::string &fileName)
107 : fileName_(fileName), content_(fileName), parseRegion_(Indices::whole()), prior_(0), at_(0),
108 skipPreprocessorTokens_(true), skipCommentTokens_(true) {}
109
110 // Parse from buffer
111 TokenStream(const std::string &fileName, const Sawyer::Container::Buffer<size_t, char>::Ptr &buffer)
112 : fileName_(fileName), content_(buffer), parseRegion_(Indices::whole()), prior_(0), at_(0),
113 skipPreprocessorTokens_(true), skipCommentTokens_(true) {}
114
115 // Reparse part of another token stream. Position info, error messages, lines, etc. are from the enclosing token stream.
116 TokenStream(TokenStream &super, const Indices &region)
117 : fileName_(super.fileName_), content_(super.content_), parseRegion_(region), prior_(region.least()), at_(region.least()),
118 skipPreprocessorTokens_(true), skipCommentTokens_(true) {
119 ASSERT_require(region);
120 }
121
122 const std::string fileName() const { return fileName_; }
123
124 bool skipPreprocessorTokens() const { return skipPreprocessorTokens_; }
125 void skipPreprocessorTokens(bool b) { skipPreprocessorTokens_ = b; }
126
127 bool skipCommentTokens() const { return skipCommentTokens_; }
128 void skipCommentTokens(bool b) { skipCommentTokens_ = b; }
129
130 int getChar(size_t position);
131
132 const Token& operator[](size_t lookahead);
133
134 void consume(size_t n = 1);
135
136 std::string lexeme(const Token &t) const;
137
138 std::string toString(const Token &t) const;
139
140 // Return the line of source in which this token appears, including line termination if present.
141 std::string line(const Token &t) const;
142
143 bool matches(const Token &token, const char *s2) const;
144 bool startsWith(const Token &token, const char *prefix) const;
145
146 void emit(std::ostream &out, const std::string &fileName, const Token &token, const std::string &message) const;
147
148 void emit(std::ostream &out, const std::string &fileName, const Token &begin, const Token &locus, const Token &end,
149 const std::string &message) const;
150
151 std::pair<size_t, size_t> location(const Token &token) const;
152
153 const Sawyer::Container::LineVector& content() const {
154 return content_;
155 }
156
157private:
158 void scanString();
159 void makeNextToken();
160};
161
162
163} // namespace
164} // namespace
165} // namespace
166
167#endif
static Interval hull(size_t v1, size_t v2)
Construct an interval from two endpoints.
Definition Interval.h:162
T least() const
Returns lower limit.
Definition Interval.h:218
static Interval whole()
Construct an interval that covers the entire domain.
Definition Interval.h:191
A buffer of characters indexed by line number.
Definition LineVector.h:24
Reference-counting intrusive smart pointer.
ROSE_UTIL_API std::string toString(const Path &)
Convert a path to a string.
Sawyer support library.
const char * Language(int64_t)
Convert ClangToSageTranslator::Language enum constant to a string.