Empirical
Lexer.h
Go to the documentation of this file.
1 
11 #ifndef EMP_LEXER_H
12 #define EMP_LEXER_H
13 
14 #include <map>
15 #include <string>
16 
17 #include "../base/vector.h"
18 
19 #include "lexer_utils.h"
20 #include "RegEx.h"
21 
22 namespace emp {
23 
25  struct TokenInfo {
26  std::string name;
28  size_t id;
29  bool save_lexeme;
30 
31  // TokenInfo() : name(""), regex(""), id(-1), save_lexeme(false) { ; }
32  TokenInfo(const std::string & n, const std::string & r, size_t i, bool s=false)
33  : name(n), regex(r), id(i), save_lexeme(s) { ; }
34  TokenInfo(const TokenInfo &) = default;
35  TokenInfo & operator=(const TokenInfo &) = default;
36 
38  void Print(std::ostream & os=std::cout) const {
39  os << "Name:" << name
40  << " RegEx:" << regex.AsString()
41  << " ID:" << id
42  << " save_lexeme:" << save_lexeme
43  << std::endl;
44  }
45  };
46 
48  struct Token {
49  size_t token_id;
50  std::string lexeme;
51 
52  Token(size_t id, const std::string & str="") : token_id(id), lexeme(str) { ; }
53  Token(const Token &) = default;
54  Token & operator=(const Token &) = default;
55 
57  operator size_t() { return token_id; }
58 
60  operator const std::string &() { return lexeme; }
61  };
62 
64  class Lexer {
65  private:
66  emp::vector<TokenInfo> token_set;
67  size_t cur_token_id;
68  mutable bool generate_lexer;
69  mutable DFA lexer_dfa;
70  std::string lexeme;
71 
72  public:
73  static const size_t MAX_TOKEN_ID = 256; // How many token IDs are possible?
74  static const size_t ERROR_ID = MAX_TOKEN_ID; // Code for unknown token ID.
75  static inline bool TokenOK(size_t id) { return id < MAX_TOKEN_ID; }
76 
78  : token_set(), cur_token_id(MAX_TOKEN_ID), generate_lexer(false), lexer_dfa(), lexeme() { }
79  ~Lexer() { ; }
80 
82  size_t GetNumTokens() const { return token_set.size(); }
83 
85  size_t AddToken(const std::string & in_name, const std::string & in_regex) {
86  --cur_token_id;
87  generate_lexer = true;
88  token_set.emplace_back( in_name, in_regex, cur_token_id );
89  return cur_token_id;
90  }
91 
93  constexpr static size_t MaxTokenID() { return MAX_TOKEN_ID; }
94 
96  size_t GetTokenID(const std::string & name) const {
97  for (const auto & t : token_set) {
98  if (t.name == name) return t.id;
99  }
100  return ERROR_ID;
101  }
102 
104  std::string GetTokenName(size_t id) const {
105  if (id >= MAX_TOKEN_ID) return "Error";
106  if (id == 0) return "EOF";
107  if (id < 128) return emp::to_escaped_string((char) id); // Individual characters.
108  for (const auto & t : token_set) {
109  if (t.id == id) return t.name;
110  }
111  return "Unknown";
112  }
113 
115  TokenInfo GetTokenInfo(const std::string & name) const {
116  for (const auto & t : token_set) {
117  if (t.name == name) return t;
118  }
119  return TokenInfo("", "", ERROR_ID);
120  }
121 
123  void Generate() const {
124  NFA lexer_nfa;
125  for (const auto & t : token_set) {
126  lexer_nfa.Merge( to_NFA(t.regex, t.id) );
127  }
128  generate_lexer = false; // We just generated it! Don't again unless another change is made.
129  lexer_dfa = to_DFA(lexer_nfa);
130  }
131 
133  Token Process(std::istream & is) {
134  if (generate_lexer) Generate();
135  size_t cur_pos = 0;
136  size_t best_pos = 0;
137  int cur_state = 0;
138  int cur_stop = 0;
139  int best_stop = -1;
140  lexeme.resize(0);
141 
142  // Keep looking as long as:
143  // 1: We may still be able to contine the current lexeme.
144  // 2: We have not entered an invalid state.
145  // 3: Our input stream has more symbols.
146  while (cur_stop >= 0 && cur_state >= 0 && is) {
147  const char next_char = (char) is.get();
148  cur_pos++;
149  cur_state = lexer_dfa.Next(cur_state, (size_t) next_char);
150  cur_stop = lexer_dfa.GetStop(cur_state);
151  if (cur_stop > 0) { best_pos = cur_pos; best_stop = cur_stop; }
152  lexeme.push_back( next_char );
153  }
154 
155  // If best_pos < cur_pos, we need to rewind the input stream and adjust the lexeme.
156  if (best_pos > 0 && best_pos < cur_pos) {
157  lexeme.resize(best_pos);
158  while (best_pos < cur_pos) { is.unget(); cur_pos--; }
159  }
160 
161  // If we are at the end of this input stream (with no token to return) give back a 0.
162  if (best_stop < 0) {
163  if (!is) return { 0, "" };
164  return { ERROR_ID, lexeme };
165  }
166 
167  return { (size_t) best_stop, lexeme };
168  }
169 
171  Token Process(std::string & in_str) {
172  std::stringstream ss;
173  ss << in_str;
174  auto out_val = Process(ss);
175  in_str = ss.str();
176  return out_val;
177  }
178 
180  const std::string & GetLexeme() { return lexeme; }
181 
183  void Print(std::ostream & os=std::cout) const {
184  for (const auto & t : token_set) t.Print(os);
185  if (generate_lexer) Generate(); // Do we need to regenerate the lexer?
186  lexer_dfa.Print(os); // Table driven lexer implementation.
187  }
188  };
189 
190 
191 }
192 
193 #endif
static const NFA & to_NFA(const NFA &nfa)
Converting NFA to NFA – no change needed.
Definition: lexer_utils.h:30
TokenInfo & operator=(const TokenInfo &)=default
int Next(int state, size_t sym) const
Return the new state after a symbol occurs.
Definition: DFA.h:90
Token(size_t id, const std::string &str="")
Definition: Lexer.h:52
A basic regular expression handler.
Definition: RegEx.h:55
size_t GetNumTokens() const
How many types of tokens can be identified in this Lexer?
Definition: Lexer.h:82
A dynamic NFA class, for easily building non-deterministic finite automata.
Definition: NFA.h:37
void Merge(const tNFA< NUM_SYMBOLS, STOP_TYPE > &nfa2)
Merge another NFA into this one.
Definition: NFA.h:189
static bool TokenOK(size_t id)
Definition: Lexer.h:75
size_t token_id
Which type of token is this?
Definition: Lexer.h:49
void Print(std::ostream &os=std::cout)
Print details about this DFA.
Definition: DFA.h:109
TokenInfo(const std::string &n, const std::string &r, size_t i, bool s=false)
Definition: Lexer.h:32
A lexer with a set of token types (and associated regular expressions)
Definition: Lexer.h:64
std::string AsString() const
Convert the RegEx to a standard string, readable from outside this class.
Definition: RegEx.h:482
Lexer()
Definition: Lexer.h:77
static constexpr size_t MaxTokenID()
How many total token types are allowed in this lexer?
Definition: Lexer.h:93
size_t size() const
Definition: vector.h:151
static const DFA & to_DFA(const DFA &dfa)
Converting DFA to DFA – no change needed.
Definition: lexer_utils.h:27
Token Process(std::istream &is)
Get the next token found in an input stream.
Definition: Lexer.h:133
void emplace_back(ARGS &&...args)
Definition: vector.h:219
Information about a token instance from an input stream.
Definition: Lexer.h:48
RegEx regex
Pattern to describe token type.
Definition: Lexer.h:27
const std::string & GetLexeme()
Get the lexeme associated with the last token identified.
Definition: Lexer.h:180
TokenInfo GetTokenInfo(const std::string &name) const
Get the full information about a token (you provide the name)
Definition: Lexer.h:115
~Lexer()
Definition: Lexer.h:79
static const PrintStr endl("<br>")
Pre-define emp::endl to insert a "<br>" and thus acting like a newline.
std::string lexeme
The specific sequence matched by this token.
Definition: Lexer.h:50
Information about an individual token type to be processed within a Lexer.
Definition: Lexer.h:25
void Print(std::ostream &os=std::cout) const
Print the full information about this lexer (for debugging)
Definition: Lexer.h:183
static std::string to_escaped_string(char value)
Convert a single character to one that uses a proper escape sequence (in a string) if needed...
Definition: string_utils.h:36
void Print(std::ostream &os=std::cout) const
Print out the status of this token (for debugging)
Definition: Lexer.h:38
void Generate() const
Create the NFA that will identify the current set of tokens in a sequence.
Definition: Lexer.h:123
If we are in emscripten, make sure to include the header.
Definition: array.h:37
std::string GetTokenName(size_t id) const
Get the name associated with a token type (you provide the ID)
Definition: Lexer.h:104
Build a debug wrapper emp::vector around std::vector.
Definition: vector.h:42
Basic regular expression handler.
std::string name
Name of this token type.
Definition: Lexer.h:26
bool save_lexeme
Should we preserve the lexeme for this token?
Definition: Lexer.h:29
A set of utilities to convert between NFAs and DFAs.
Token Process(std::string &in_str)
Shortcut to process a string rather than a stream.
Definition: Lexer.h:171
stop_t GetStop(int state) const
Get the stop value associated with a state.
Definition: DFA.h:76
size_t id
Unique id for token.
Definition: Lexer.h:28
size_t GetTokenID(const std::string &name) const
Get the ID associated with a token type (you provide the token name)
Definition: Lexer.h:96
size_t AddToken(const std::string &in_name, const std::string &in_regex)
Add a new token, specified by a name and the regex used to identify it.
Definition: Lexer.h:85