1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
|
//===- Lexer.h - Lexer for the Toy language -------------------------------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements a simple Lexer for the Toy language.
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_TUTORIAL_TOY_LEXER_H_
#define MLIR_TUTORIAL_TOY_LEXER_H_
#include "llvm/ADT/StringRef.h"

#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
namespace toy {
/// Structure describing a location in a file.
struct Location {
std::shared_ptr<std::string> file; ///< Filename, held through a shared pointer.
int line; ///< Line number.
int col; ///< Column number.
};
/// List of Token kinds returned by the lexer.
///
/// Single-character punctuation tokens use the character itself as their
/// value; end-of-file, keywords, identifiers and numbers use negative values.
enum Token : int {
tok_semicolon = ';',
tok_parenthese_open = '(',
tok_parenthese_close = ')',
tok_bracket_open = '{',
tok_bracket_close = '}',
tok_sbracket_open = '[',
tok_sbracket_close = ']',
// End of the input stream.
tok_eof = -1,
// Commands (keywords of the language).
tok_return = -2,
tok_var = -3,
tok_def = -4,
// Primary: identifiers and numeric literals.
tok_identifier = -5,
tok_number = -6,
};
/// The Lexer is an abstract base class providing all the facilities that the
/// Parser expects. It goes through the stream one token at a time and keeps
/// track of the location in the file for debugging purposes.
///
/// It relies on a subclass to provide a `readNextLine()` method. The subclass
/// can proceed by reading the next line from the standard input or from a
/// memory mapped file.
class Lexer {
public:
  /// Create a lexer for the given filename. The filename is kept only for
  /// debugging purposes (attaching a location to a Token).
  Lexer(std::string filename)
      : lastLocation(
            {std::make_shared<std::string>(std::move(filename)), 0, 0}) {}
  virtual ~Lexer() = default;

  /// Look at the current token in the stream.
  Token getCurToken() const { return curTok; }

  /// Move to the next token in the stream and return it.
  Token getNextToken() { return curTok = getTok(); }

  /// Move to the next token in the stream, asserting that the current token
  /// matches the expectation `tok`.
  void consume(Token tok) {
    assert(tok == curTok && "consume Token mismatch expectation");
    getNextToken();
  }

  /// Return the current identifier (prereq: getCurToken() == tok_identifier).
  llvm::StringRef getId() const {
    assert(curTok == tok_identifier);
    return IdentifierStr;
  }

  /// Return the current number (prereq: getCurToken() == tok_number).
  double getValue() const {
    assert(curTok == tok_number);
    return NumVal;
  }

  /// Return the location for the beginning of the current token.
  Location getLastLocation() const { return lastLocation; }

  /// Return the current line in the file.
  int getLine() const { return curLineNum; }

  /// Return the current column in the file.
  int getCol() const { return curCol; }

private:
  /// Delegate to a derived class fetching the next line. Returns an empty
  /// string to signal end of file (EOF). Lines are expected to always finish
  /// with "\n".
  virtual llvm::StringRef readNextLine() = 0;

  /// Return the next character from the stream, or EOF once the input is
  /// exhausted. This manages the buffer for the current line and requests the
  /// next line buffer from the derived class as needed.
  int getNextChar() {
    // The current line buffer should not be empty unless it is the end of file.
    if (curLineBuffer.empty())
      return EOF;
    ++curCol;
    // Widen through `unsigned char` so that bytes >= 0x80 stay non-negative.
    // A sign-extended `char` would be an invalid argument for the <cctype>
    // classification functions and would collide with EOF and with the
    // negative Token values (e.g. byte 0xFE would compare equal to
    // tok_return).
    const int nextchar = static_cast<unsigned char>(curLineBuffer.front());
    curLineBuffer = curLineBuffer.drop_front();
    // Refill eagerly so that an empty buffer always means end of file.
    if (curLineBuffer.empty())
      curLineBuffer = readNextLine();
    if (nextchar == '\n') {
      ++curLineNum;
      curCol = 0;
    }
    return nextchar;
  }

  /// Return the next token from the input stream. On return `LastChar` holds
  /// the first character following the token (one character of lookahead),
  /// except at end of file where it stays EOF.
  Token getTok() {
    // Skip any whitespace.
    while (std::isspace(LastChar))
      LastChar = Token(getNextChar());

    // Save the current location before reading the token characters.
    lastLocation.line = curLineNum;
    lastLocation.col = curCol;

    if (std::isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9_]*
      IdentifierStr = static_cast<char>(LastChar);
      while (std::isalnum((LastChar = Token(getNextChar()))) || LastChar == '_')
        IdentifierStr += static_cast<char>(LastChar);

      // Keywords are lexed as identifiers first and recognized afterwards.
      if (IdentifierStr == "return")
        return tok_return;
      if (IdentifierStr == "def")
        return tok_def;
      if (IdentifierStr == "var")
        return tok_var;
      return tok_identifier;
    }

    if (std::isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+
      std::string NumStr;
      do {
        NumStr += static_cast<char>(LastChar);
        LastChar = Token(getNextChar());
      } while (std::isdigit(LastChar) || LastChar == '.');

      // strtod converts the longest valid prefix of the collected characters.
      NumVal = std::strtod(NumStr.c_str(), nullptr);
      return tok_number;
    }

    if (LastChar == '#') {
      // Comment until end of line.
      do
        LastChar = Token(getNextChar());
      while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');

      // Lex the token that follows the comment, unless the input ended.
      if (LastChar != EOF)
        return getTok();
    }

    // Check for end of file. Don't eat the EOF.
    if (LastChar == EOF)
      return tok_eof;

    // Otherwise, just return the character as its ascii value.
    Token ThisChar = Token(LastChar);
    LastChar = Token(getNextChar());
    return ThisChar;
  }

  /// The last token read from the input.
  Token curTok = tok_eof;

  /// Location for `curTok`.
  Location lastLocation;

  /// If the current Token is an identifier, this string contains the value.
  std::string IdentifierStr;

  /// If the current Token is a number, this contains the value.
  double NumVal = 0;

  /// The last value returned by getNextChar(). We need to keep it around as we
  /// always need to read ahead one character to decide when to end a token and
  /// we can't put it back in the stream after reading from it. It starts as a
  /// space so that the first getTok() call begins by skipping whitespace.
  Token LastChar = Token(' ');

  /// Keep track of the current line number in the input stream.
  int curLineNum = 0;

  /// Keep track of the current column number in the input stream.
  int curCol = 0;

  /// Buffer supplied by the derived class on calls to `readNextLine()`. It is
  /// seeded with a single newline so that the first getNextChar() call finds a
  /// non-empty buffer and then requests the first real line.
  llvm::StringRef curLineBuffer = "\n";
};
/// A lexer implementation operating on a buffer in memory.
class LexerBuffer final : public Lexer {
public:
  /// Create a lexer over the characters in [begin, end). `end` is treated as
  /// one past the last character and is never dereferenced. `filename` is
  /// forwarded to the base Lexer.
  LexerBuffer(const char *begin, const char *end, std::string filename)
      : Lexer(std::move(filename)), current(begin), end(end) {}

private:
  /// Provide one line at a time to the Lexer, return an empty string when
  /// reaching the end of the buffer. The returned line includes its trailing
  /// '\n' when one is present. A NUL character also terminates the line and is
  /// not consumed, so every later call returns an empty string.
  llvm::StringRef readNextLine() override {
    const char *lineBegin = current;
    // Bound the scan strictly below `end`: reading `*end` would access memory
    // past the buffer whenever the buffer is not NUL-terminated.
    while (current < end && *current && *current != '\n')
      ++current;
    // Step over the newline so that it is part of the returned line.
    if (current < end && *current)
      ++current;
    return llvm::StringRef(lineBegin, static_cast<size_t>(current - lineBegin));
  }

  /// Read position in the buffer and the one-past-the-end sentinel.
  const char *current, *end;
};
} // namespace toy
#endif // MLIR_TUTORIAL_TOY_LEXER_H_
|