Lexer.hpp

A general-purpose, fast lexer.

Build a lexer that can convert input strings or streams into a series of provided tokens.

Use AddToken(name, regex) to list out the relevant tokens. 'name' is the unique name for this token. 'regex' is the regular expression that describes this token. It will return a unique ID associated with this lexeme.

IgnoreToken(name, regex) uses the same arguments, but is used for tokens that should be skipped over.

Names and IDs can be recovered later using GetTokenID(name) and GetTokenName(id).

Tokens can be retrieved either one at a time with Process(string) or Process(stream), which will return the next (non-ignored) token, removing it from the input.

Alternatively, an entire series of tokens can be processed with Tokenize().

Finally, GetLexeme() can be used to retrieve the lexeme from the most recent token found.

Note

Status: BETA

struct TokenInfo
#include <Lexer.hpp>

Information about an individual TYPE of token to be processed within a Lexer.

Public Functions

inline TokenInfo()
inline TokenInfo(const std::string &_name, const std::string &_regex, int _id, bool _save_l = true, bool _save_t = true, const std::string &_desc = "")
TokenInfo(const TokenInfo&) = default
TokenInfo(TokenInfo&&) = default
TokenInfo &operator=(const TokenInfo&) = default
TokenInfo &operator=(TokenInfo&&) = default
inline void Print(std::ostream &os = std::cout) const

Print out the status of this token (for debugging)

Public Members

std::string name

Name of this token type.

std::string desc

More detailed description of this token type.

RegEx regex

Pattern to describe token type.

int id

Unique id for token.

bool save_lexeme

Preserve the lexeme for this token?

bool save_token

Keep token at all? (Whitespace and comments are often discarded).

struct Token
#include <Lexer.hpp>

Information about a token instance from an input stream.

Public Functions

inline Token(int _id, const std::string &str = "", size_t _line = 0)
Token(const Token&) = default
Token(Token&&) = default
Token &operator=(const Token&) = default
Token &operator=(Token&&) = default
inline operator int() const

Token will automatically convert to its ID if used as an int.

inline operator const std::string&() const

Token will automatically convert to its matched sequence (lexeme) if used as a string.

Public Members

int id

Which type of token is this?

std::string lexeme

Sequence matched by this token (or empty if not saved)

size_t line_id

Which line did this token start on?

class TokenStream
#include <Lexer.hpp>

Public Functions

inline TokenStream(const std::string &in_name)
TokenStream(const TokenStream&) = default
TokenStream(TokenStream&&) = default
inline TokenStream(const vector<Token> &in_tokens, const std::string &in_name)
TokenStream &operator=(const TokenStream&) = default
TokenStream &operator=(TokenStream&&) = default
inline size_t size() const
inline const Token &Get(size_t pos) const
inline Ptr<const Token> GetPtr(size_t pos) const
inline const std::string &GetName() const
inline Iterator begin() const
inline Iterator end() const
inline const Token &back() const
inline void push_back(const Token &in)
inline void Print(std::ostream &os = std::cout) const

Private Members

std::string name = ""
vector<Token> tokens
class Iterator
#include <Lexer.hpp>

Public Functions

Iterator(const Iterator&) = default
inline Iterator(const TokenStream &in_ts, size_t in_pos)
Iterator &operator=(const Iterator&) = default
inline const TokenStream &GetTokenStream() const
inline size_t GetIndex() const
inline Ptr<const Token> ToPtr() const
inline Token operator*() const
inline const Token *operator->() const
inline bool operator==(const Iterator &in) const
inline bool operator!=(const Iterator &in) const
inline bool operator<(const Iterator &in) const
inline bool operator<=(const Iterator &in) const
inline bool operator>(const Iterator &in) const
inline bool operator>=(const Iterator &in) const
inline Iterator &operator++()
inline Iterator operator++(int)
inline Iterator &operator--()
inline Iterator operator--(int)
inline bool IsValid() const
inline bool AtEnd() const
inline operator bool() const

Private Members

Ptr<const TokenStream> ts
size_t pos
class Lexer
#include <Lexer.hpp>

A lexer with a set of token types (and associated regular expressions)

Public Functions

Lexer() = default
Lexer(const Lexer&) = default
Lexer(Lexer&&) = default
~Lexer() = default
Lexer &operator=(const Lexer&) = default
Lexer &operator=(Lexer&&) = default
inline size_t GetNumTokens() const

How many types of tokens can be identified in this Lexer?

inline void Reset()
inline bool TokenOK(int id) const
inline int AddToken(const std::string &name, const std::string &regex, bool save_lexeme = true, bool save_token = true, const std::string &desc = "")

Add a new token, specified by a name and the regex used to identify it. Note that token ids count down with highest IDs having priority.

inline int IgnoreToken(const std::string &name, const std::string &regex, const std::string &desc = "")

Add a token to ignore, specified by a name and the regex used to identify it.

inline int GetTokenID(const std::string &name) const

Get the ID associated with a token type (you provide the token name)

inline const TokenInfo &GetTokenInfo(int id) const

Get the full information about a token (you provide the id)

inline std::string GetTokenName(int id) const

Get the name associated with a token type (you provide the ID)

inline bool GetSaveToken(int id) const

Identify if a token should be saved.

inline void Generate() const

Create the NFA that will identify the current set of tokens in a sequence.

inline Token Process(std::istream &is) const

Get the next token found in an input stream. Do so by examining one character at a time. Keep going as long as there is a chance of a valid lexeme (since we want to choose the longest one we can find.) Every time we do hit a valid lexeme, store it as the current “best” and keep going. Once we hit a point where no other valid lexemes are possible, stop and return the best we’ve found so far.

inline Token Process(std::string &in_str) const

Shortcut to process a string rather than a stream, chopping off one token each time.

inline Token ToToken(std::string_view in_str) const

Shortcut to just get a single token.

inline TokenStream Tokenize(std::istream &is, const std::string &name = "in_stream") const

Turn an input stream of text into a vector of tokens.

inline TokenStream Tokenize(std::string_view str, const std::string &name = "in_view") const

Turn an input string into a vector of tokens.

inline TokenStream Tokenize(const vector<std::string> &str_v, const std::string &name = "in_string vector") const

Turn a vector of strings into a vector of tokens.

inline const std::string &GetLexeme() const

Get the lexeme associated with the last token identified.

inline void Print(std::ostream &os = std::cout) const

Print the full information about this lexer (for debugging)

inline void DebugString(std::string test_string) const

Try out the lexer on a string and demonstrate how it’s tokenized.

inline void DebugToken(int token_id) const

Private Members

vector<TokenInfo> token_set

List of all active token types.

map<std::string, int> token_map

Map of token names to id.

int cur_token_id = MAX_ID

Which ID should the next new token get?

mutable bool generate_lexer = false

Do we need to regenerate the lexer?

mutable DFA lexer_dfa

Table driven lexer implementation.

mutable std::string lexeme

Current state of lexeme being generated.

Private Static Functions

static inline const TokenInfo &ERROR_TOKEN()

Private Static Attributes

static constexpr int MAX_ID = 255

IDs count down so that first ones have priority.

static constexpr int ERROR_ID = -1

Code for unknown token ID.