1 files changed, 239 insertions, 0 deletions
diff --git a/include/toy/Lexer.h b/include/toy/Lexer.h
new file mode 100644
index 0000000..d73adb9
--- /dev/null
+++ b/include/toy/Lexer.h
@@ -0,0 +1,239 @@
+//===- Lexer.h - Lexer for the Toy language -------------------------------===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file implements a simple Lexer for the Toy language.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TUTORIAL_TOY_LEXER_H_
+#define MLIR_TUTORIAL_TOY_LEXER_H_
+
+#include "llvm/ADT/StringRef.h"
+
+#include <memory>
+#include <string>
+
+namespace toy {
+
+/// Structure definition a location in a file.
+struct Location {
+  std::shared_ptr<std::string> file; ///< filename
+  int line;                          ///< line number.
+  int col;                           ///< column number.
+};
+
+// List of Token returned by the lexer.
+enum Token : int {
+  tok_semicolon = ';',
+  tok_parenthese_open = '(',
+  tok_parenthese_close = ')',
+  tok_bracket_open = '{',
+  tok_bracket_close = '}',
+  tok_sbracket_open = '[',
+  tok_sbracket_close = ']',
+
+  tok_eof = -1,
+
+  // commands
+  tok_return = -2,
+  tok_var = -3,
+  tok_def = -4,
+
+  // primary
+  tok_identifier = -5,
+  tok_number = -6,
+};
+
+/// The Lexer is an abstract base class providing all the facilities that the
+/// Parser expects. It goes through the stream one token at a time and keeps
+/// track of the location in the file for debugging purpose.
+/// It relies on a subclass to provide a `readNextLine()` method. The subclass
+/// can proceed by reading the next line from the standard input or from a
+/// memory mapped file.
+class Lexer {
+public:
+  /// Create a lexer for the given filename. The filename is kept only for
+  /// debugging purpose (attaching a location to a Token).
+  Lexer(std::string filename)
+      : lastLocation(
+            {std::make_shared<std::string>(std::move(filename)), 0, 0}) {}
+  virtual ~Lexer() = default;
+
+  /// Look at the current token in the stream.
+  Token getCurToken() { return curTok; }
+
+  /// Move to the next token in the stream and return it.
+  Token getNextToken() { return curTok = getTok(); }
+
+  /// Move to the next token in the stream, asserting on the current token
+  /// matching the expectation.
+  void consume(Token tok) {
+    assert(tok == curTok && "consume Token mismatch expectation");
+    getNextToken();
+  }
+
+  /// Return the current identifier (prereq: getCurToken() == tok_identifier)
+  llvm::StringRef getId() {
+    assert(curTok == tok_identifier);
+    return IdentifierStr;
+  }
+
+  /// Return the current number (prereq: getCurToken() == tok_number)
+  double getValue() {
+    assert(curTok == tok_number);
+    return NumVal;
+  }
+
+  /// Return the location for the beginning of the current token.
+  Location getLastLocation() { return lastLocation; }
+
+  // Return the current line in the file.
+  int getLine() { return curLineNum; }
+
+  // Return the current column in the file.
+  int getCol() { return curCol; }
+
+private:
+  /// Delegate to a derived class fetching the next line. Returns an empty
+  /// string to signal end of file (EOF). Lines are expected to always finish
+  /// with "\n"
+  virtual llvm::StringRef readNextLine() = 0;
+
+  /// Return the next character from the stream. This manages the buffer for the
+  /// current line and request the next line buffer to the derived class as
+  /// needed.
+  int getNextChar() {
+    // The current line buffer should not be empty unless it is the end of file.
+    if (curLineBuffer.empty())
+      return EOF;
+    ++curCol;
+    auto nextchar = curLineBuffer.front();
+    curLineBuffer = curLineBuffer.drop_front();
+    if (curLineBuffer.empty())
+      curLineBuffer = readNextLine();
+    if (nextchar == '\n') {
+      ++curLineNum;
+      curCol = 0;
+    }
+    return nextchar;
+  }
+
+  ///  Return the next token from standard input.
+  Token getTok() {
+    // Skip any whitespace.
+    while (isspace(LastChar))
+      LastChar = Token(getNextChar());
+
+    // Save the current location before reading the token characters.
+    lastLocation.line = curLineNum;
+    lastLocation.col = curCol;
+
+    if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9_]*
+      IdentifierStr = (char)LastChar;
+      while (isalnum((LastChar = Token(getNextChar()))) || LastChar == '_')
+        IdentifierStr += (char)LastChar;
+
+      if (IdentifierStr == "return")
+        return tok_return;
+      if (IdentifierStr == "def")
+        return tok_def;
+      if (IdentifierStr == "var")
+        return tok_var;
+      return tok_identifier;
+    }
+
+    if (isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+
+      std::string NumStr;
+      do {
+        NumStr += LastChar;
+        LastChar = Token(getNextChar());
+      } while (isdigit(LastChar) || LastChar == '.');
+
+      NumVal = strtod(NumStr.c_str(), nullptr);
+      return tok_number;
+    }
+
+    if (LastChar == '#') {
+      // Comment until end of line.
+      do
+        LastChar = Token(getNextChar());
+      while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
+
+      if (LastChar != EOF)
+        return getTok();
+    }
+
+    // Check for end of file.  Don't eat the EOF.
+    if (LastChar == EOF)
+      return tok_eof;
+
+    // Otherwise, just return the character as its ascii value.
+    Token ThisChar = Token(LastChar);
+    LastChar = Token(getNextChar());
+    return ThisChar;
+  }
+
+  /// The last token read from the input.
+  Token curTok = tok_eof;
+
+  /// Location for `curTok`.
+  Location lastLocation;
+
+  /// If the current Token is an identifier, this string contains the value.
+  std::string IdentifierStr;
+
+  /// If the current Token is a number, this contains the value.
+  double NumVal = 0;
+
+  /// The last value returned by getNextChar(). We need to keep it around as we
+  /// always need to read ahead one character to decide when to end a token and
+  /// we can't put it back in the stream after reading from it.
+  Token LastChar = Token(' ');
+
+  /// Keep track of the current line number in the input stream
+  int curLineNum = 0;
+
+  /// Keep track of the current column number in the input stream
+  int curCol = 0;
+
+  /// Buffer supplied by the derived class on calls to `readNextLine()`
+  llvm::StringRef curLineBuffer = "\n";
+};
+
+/// A lexer implementation operating on a buffer in memory.
+class LexerBuffer final : public Lexer {
+public:
+  LexerBuffer(const char *begin, const char *end, std::string filename)
+      : Lexer(std::move(filename)), current(begin), end(end) {}
+
+private:
+  /// Provide one line at a time to the Lexer, return an empty string when
+  /// reaching the end of the buffer.
+  llvm::StringRef readNextLine() override {
+    auto *begin = current;
+    while (current <= end && *current && *current != '\n')
+      ++current;
+    if (current <= end && *current)
+      ++current;
+    llvm::StringRef result{begin, static_cast<size_t>(current - begin)};
+    return result;
+  }
+  const char *current, *end;
+};
+} // namespace toy
+
+#endif // MLIR_TUTORIAL_TOY_LEXER_H_