//===- Lexer.h - Lexer for the Toy language -------------------------------===// // // Copyright 2019 The MLIR Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================= // // This file implements a simple Lexer for the Toy language. // //===----------------------------------------------------------------------===// #ifndef MLIR_TUTORIAL_TOY_LEXER_H_ #define MLIR_TUTORIAL_TOY_LEXER_H_ #include "llvm/ADT/StringRef.h" #include #include namespace toy { /// Structure definition a location in a file. struct Location { std::shared_ptr file; ///< filename int line; ///< line number. int col; ///< column number. }; // List of Token returned by the lexer. enum Token : int { tok_semicolon = ';', tok_parenthese_open = '(', tok_parenthese_close = ')', tok_bracket_open = '{', tok_bracket_close = '}', tok_sbracket_open = '[', tok_sbracket_close = ']', tok_eof = -1, // commands tok_return = -2, tok_var = -3, tok_def = -4, // primary tok_identifier = -5, tok_number = -6, }; /// The Lexer is an abstract base class providing all the facilities that the /// Parser expects. It goes through the stream one token at a time and keeps /// track of the location in the file for debugging purpose. /// It relies on a subclass to provide a `readNextLine()` method. The subclass /// can proceed by reading the next line from the standard input or from a /// memory mapped file. class Lexer { public: /// Create a lexer for the given filename. The filename is kept only for /// debugging purpose (attaching a location to a Token). Lexer(std::string filename) : lastLocation( {std::make_shared(std::move(filename)), 0, 0}) {} virtual ~Lexer() = default; /// Look at the current token in the stream. Token getCurToken() { return curTok; } /// Move to the next token in the stream and return it. Token getNextToken() { return curTok = getTok(); } /// Move to the next token in the stream, asserting on the current token /// matching the expectation. void consume(Token tok) { assert(tok == curTok && "consume Token mismatch expectation"); getNextToken(); } /// Return the current identifier (prereq: getCurToken() == tok_identifier) llvm::StringRef getId() { assert(curTok == tok_identifier); return IdentifierStr; } /// Return the current number (prereq: getCurToken() == tok_number) double getValue() { assert(curTok == tok_number); return NumVal; } /// Return the location for the beginning of the current token. Location getLastLocation() { return lastLocation; } // Return the current line in the file. int getLine() { return curLineNum; } // Return the current column in the file. int getCol() { return curCol; } private: /// Delegate to a derived class fetching the next line. Returns an empty /// string to signal end of file (EOF). Lines are expected to always finish /// with "\n" virtual llvm::StringRef readNextLine() = 0; /// Return the next character from the stream. This manages the buffer for the /// current line and request the next line buffer to the derived class as /// needed. int getNextChar() { // The current line buffer should not be empty unless it is the end of file. if (curLineBuffer.empty()) return EOF; ++curCol; auto nextchar = curLineBuffer.front(); curLineBuffer = curLineBuffer.drop_front(); if (curLineBuffer.empty()) curLineBuffer = readNextLine(); if (nextchar == '\n') { ++curLineNum; curCol = 0; } return nextchar; } /// Return the next token from standard input. Token getTok() { // Skip any whitespace. while (isspace(LastChar)) LastChar = Token(getNextChar()); // Save the current location before reading the token characters. lastLocation.line = curLineNum; lastLocation.col = curCol; if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9_]* IdentifierStr = (char)LastChar; while (isalnum((LastChar = Token(getNextChar()))) || LastChar == '_') IdentifierStr += (char)LastChar; if (IdentifierStr == "return") return tok_return; if (IdentifierStr == "def") return tok_def; if (IdentifierStr == "var") return tok_var; return tok_identifier; } if (isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+ std::string NumStr; do { NumStr += LastChar; LastChar = Token(getNextChar()); } while (isdigit(LastChar) || LastChar == '.'); NumVal = strtod(NumStr.c_str(), nullptr); return tok_number; } if (LastChar == '#') { // Comment until end of line. do LastChar = Token(getNextChar()); while (LastChar != EOF && LastChar != '\n' && LastChar != '\r'); if (LastChar != EOF) return getTok(); } // Check for end of file. Don't eat the EOF. if (LastChar == EOF) return tok_eof; // Otherwise, just return the character as its ascii value. Token ThisChar = Token(LastChar); LastChar = Token(getNextChar()); return ThisChar; } /// The last token read from the input. Token curTok = tok_eof; /// Location for `curTok`. Location lastLocation; /// If the current Token is an identifier, this string contains the value. std::string IdentifierStr; /// If the current Token is a number, this contains the value. double NumVal = 0; /// The last value returned by getNextChar(). We need to keep it around as we /// always need to read ahead one character to decide when to end a token and /// we can't put it back in the stream after reading from it. Token LastChar = Token(' '); /// Keep track of the current line number in the input stream int curLineNum = 0; /// Keep track of the current column number in the input stream int curCol = 0; /// Buffer supplied by the derived class on calls to `readNextLine()` llvm::StringRef curLineBuffer = "\n"; }; /// A lexer implementation operating on a buffer in memory. class LexerBuffer final : public Lexer { public: LexerBuffer(const char *begin, const char *end, std::string filename) : Lexer(std::move(filename)), current(begin), end(end) {} private: /// Provide one line at a time to the Lexer, return an empty string when /// reaching the end of the buffer. llvm::StringRef readNextLine() override { auto *begin = current; while (current <= end && *current && *current != '\n') ++current; if (current <= end && *current) ++current; llvm::StringRef result{begin, static_cast(current - begin)}; return result; } const char *current, *end; }; } // namespace toy #endif // MLIR_TUTORIAL_TOY_LEXER_H_