43 #include "../base/vector.h" 44 #include "../base/Ptr.h" 57 constexpr
static size_t NUM_SYMBOLS = 128;
65 mutable bool dfa_ready;
67 template <
typename... T>
68 void Error(T &&... args) {
81 virtual ~re_base() { ; }
82 virtual void Print(std::ostream & os)
const { os <<
"[]"; }
87 virtual size_t GetSize()
const {
return 0; }
88 virtual bool Simplify() {
return false; }
93 struct re_string :
public re_base {
95 re_string() : str() { ; }
96 re_string(
char c) : str() { str.push_back(c); }
97 re_string(
const std::string & s) : str(s) { ; }
100 size_t GetSize()
const override {
return str.size(); }
101 virtual void AddToNFA(
NFA & nfa,
size_t start,
size_t stop)
const override {
102 size_t prev_id = start;
113 struct re_charset :
public re_base {
115 re_charset() : char_set() { ; }
116 re_charset(
char x,
bool neg=
false) : char_set() {
117 char_set[(size_t)x]=
true;
120 re_charset(
const std::string & s,
bool neg=
false) : char_set() {
121 for (
char x : s) char_set[(size_t)x]=
true;
124 void Print(std::ostream & os)
const override {
125 auto chars = char_set.
GetOnes();
126 bool use_not =
false;
127 if (chars.size() > 64) { chars = (~char_set).GetOnes(); use_not =
true; }
129 if (use_not) os <<
"NOT ";
135 char First()
const {
return (
char) char_set.
FindBit(); }
136 virtual void AddToNFA(
NFA & nfa,
size_t start,
size_t stop)
const override {
137 for (
size_t i = 0; i < NUM_SYMBOLS; i++)
if (char_set[i]) nfa.
AddTransition(start, stop, i);
142 struct re_parent :
public re_base {
146 re_parent() : nodes() { }
147 ~re_parent() {
for (
auto x : nodes) x.Delete(); }
148 void Clear() {
for (
auto x : nodes) x.Delete(); nodes.
resize(0); }
151 size_t GetSize()
const override {
return nodes.
size(); }
153 bool Simplify()
override {
155 for (
auto & x : nodes) {
159 if (x->AsBlock() && x->GetSize() == 1) {
160 auto child = x->AsParent()->nodes[0];
161 x->AsParent()->nodes.resize(0);
172 struct re_block :
public re_parent {
173 void Print(std::ostream & os)
const override {
174 os <<
"BLOCK[";
for (
auto x : nodes) x->Print(os); os <<
"]";
177 bool Simplify()
override {
180 for (
size_t i = 0; i < nodes.size(); i++) {
182 if (nodes[i]->AsCharSet() && nodes[i]->GetSize() == 1) {
183 auto new_node = NewPtr<re_string>(nodes[i]->AsCharSet()->First());
190 nodes[i-1]->AsString()->str += nodes[i]->AsString()->str;
192 nodes.erase(nodes.begin() + (int) i);
199 if (nodes[i]->AsBlock()) {
200 auto old_node = nodes[i]->AsBlock();
201 nodes.erase(nodes.begin() + (int) i);
202 nodes.insert(nodes.begin() + (int) i, old_node->nodes.begin(), old_node->nodes.end());
203 old_node->nodes.resize(0);
212 modify |= re_parent::Simplify();
216 virtual void AddToNFA(
NFA & nfa,
size_t start,
size_t stop)
const override {
217 size_t prev_id = start;
218 for (
auto x : nodes) {
220 x->AddToNFA(nfa, prev_id, next_id);
228 struct re_or :
public re_parent {
230 void Print(std::ostream & os)
const override {
236 virtual void AddToNFA(
NFA & nfa,
size_t start,
size_t stop)
const override {
237 nodes[0]->AddToNFA(nfa, start, stop);
238 nodes[1]->AddToNFA(nfa, start, stop);
243 struct re_star :
public re_parent {
245 void Print(std::ostream & os)
const override { os <<
"*["; nodes[0]->Print(os); os <<
"]"; }
247 virtual void AddToNFA(
NFA & nfa,
size_t start,
size_t stop)
const override {
249 nodes[0]->AddToNFA(nfa, start, target);
256 struct re_plus :
public re_parent {
258 void Print(std::ostream & os)
const override { os <<
"+["; nodes[0]->Print(os); os <<
"]"; }
259 virtual void AddToNFA(
NFA & nfa,
size_t start,
size_t stop)
const override {
261 nodes[0]->AddToNFA(nfa, start, target);
269 struct re_qm :
public re_parent {
271 void Print(std::ostream & os)
const override { os <<
"?["; nodes[0]->Print(os); os <<
"]"; }
272 virtual void AddToNFA(
NFA & nfa,
size_t start,
size_t stop)
const override {
273 nodes[0]->AddToNFA(nfa, start, stop);
282 bool EnsureNext(
char x) {
283 if (pos >= regex.size()) Error(
"Expected ", x,
" before end.");
284 else if (regex[pos] != x) Error(
"Expected ", x,
" at position ", pos,
285 "; found ", regex[pos],
".");
292 char c = regex[pos++];
294 if (c ==
'^') { neg =
true; c = regex[pos++]; }
295 auto out = NewPtr<re_charset>();
297 while (c !=
']' && pos < regex.size()) {
298 if (c ==
'-' && prev_c != -1) {
300 if (c < prev_c) { Error(
"Invalid character range ", prev_c,
'-', c);
continue; }
301 for (
char x = prev_c; x <= c; x++) {
302 out->char_set[(size_t)x] =
true;
308 else if (c ==
'\\') {
311 case 'n': c =
'\n';
break;
312 case 'r': c =
'\r';
break;
313 case 't': c =
'\t';
break;
322 Error(
"Unknown escape char for char sets: '\\", c,
"'.");
325 out->char_set[(size_t)c] =
true;
329 if (neg) out->char_set.NOT_SELF();
336 char c = regex[pos++];
337 auto out = NewPtr<re_string>();
338 while (c !=
'\"' && pos < regex.size()) {
343 case 'n': c =
'\n';
break;
344 case 'r': c =
'\r';
break;
345 case 't': c =
'\t';
break;
351 Error(
"Unknown escape char for literal string: '\\", c,
"'.");
354 out->str.push_back(c);
357 if (c ==
'\"') --pos;
365 char c = regex[pos++];
368 result = NewPtr<re_charset>(
'\n',
true);
375 result = ConstructSet();
379 result = ConstructString();
385 case 'n': c =
'\n';
break;
386 case 'r': c =
'\r';
break;
387 case 't': c =
'\t';
break;
402 Error(
"Unknown escape char for regex: '\\", c,
"'.");
404 result = NewPtr<re_string>(c);
413 Error(
"Expected regex segment but got '", c,
"' at position ", pos,
".");
414 result = NewPtr<re_string>(c);
419 result = NewPtr<re_string>(c);
428 emp_assert(pos >= 0 && pos < regex.size(), pos, regex.size());
431 if (cur_block==
nullptr) cur_block = NewPtr<re_block>();
434 cur_block->push( ConstructSegment() );
436 while (pos < regex.size()) {
437 const char c = regex[pos++];
440 case '|': cur_block->push( NewPtr<re_or>( cur_block->pop(), Process() ) );
break;
441 case '*': cur_block->push( NewPtr<re_star>( cur_block->pop() ) );
break;
442 case '+': cur_block->push( NewPtr<re_plus>( cur_block->pop() ) );
break;
443 case '?': cur_block->push( NewPtr<re_qm>( cur_block->pop() ) );
break;
444 case ')': pos--;
return cur_block;
448 cur_block->push( ConstructSegment() );
458 : regex(r), notes(), valid(true), pos(0), dfa(), dfa_ready(false), head() {
459 Process(
ToPtr(&head));
460 while(head.Simplify());
463 : regex(r.regex), notes(), valid(true), pos(0), dfa(), dfa_ready(false), head() {
464 Process(
ToPtr(&head));
465 while(head.Simplify());
476 Process(
ToPtr(&head));
477 while (head.Simplify());
485 void AddToNFA(
NFA & nfa,
size_t start,
size_t stop)
const { head.AddToNFA(nfa, start, stop); }
491 bool Test(
const std::string & str)
const {
493 return dfa.
Test(str);
501 for (
const std::string & n : notes) {
513 std::cout <<
"INTERNAL: ";
Ptr< T > ToPtr(T *_in, bool own=false)
Convert a T* to a Ptr<T>. By default, don't track.
Definition: Ptr.h:816
static const NFA & to_NFA(const NFA &nfa)
Converting NFA to MFA – no change needed.
Definition: lexer_utils.h:30
stop_t Test(const std::string &str) const
Determine if an entire series of symbols is valid.
Definition: DFA.h:103
RegEx(const std::string &r)
Definition: RegEx.h:457
std::string to_string(ALL_TYPES &&...all_values)
Definition: string_utils.h:511
A basic regular expression handler.
Definition: RegEx.h:55
void Delete()
Definition: Ptr.h:737
RegEx(const RegEx &r)
Definition: RegEx.h:462
A dynamic NFA class, for easily building non-determanistic finite automata.
Definition: NFA.h:37
size_t CountOnes() const
Count the number of ones in the BitSet using bit tricks for a speedup.
Definition: BitSet.h:371
void AddFreeTransition(size_t from, size_t to)
Create a free transition between 'from' and 'to'.
Definition: NFA.h:152
void Generate() const
Assume the RegEx is ready and setup processing for it.
Definition: RegEx.h:532
void push_back(PB_Ts &&...args)
Definition: vector.h:189
std::string AsString() const
Convert the RegEx to an standard string, readable from outsite this class.
Definition: RegEx.h:482
void AddTransition(size_t from, size_t to, size_t sym)
Add a transition between states 'from' and 'to' that can be taken with the provided symbol...
Definition: NFA.h:131
Simple functions to manipulate strings.
size_t size() const
Definition: vector.h:151
static const DFA & to_DFA(const DFA &dfa)
Converting DFA to DFA – no change needed.
Definition: lexer_utils.h:27
emp::vector< size_t > GetOnes() const
Return a vector indicating the posistions of all ones in the BitSet.
Definition: BitSet.h:402
static const PrintStr endl("<br>")
Pre-define emp::endl to insert a "<br>" and thus acting like a newline.
void pop_back()
Definition: vector.h:194
void AddToNFA(NFA &nfa, size_t start, size_t stop) const
Add this regex to an NFA being built.
Definition: RegEx.h:485
std::string to_literal(const LIT_TYPE &value)
Take a value and convert it to a C++-style literal.
Definition: string_utils.h:99
size_t AddNewState()
Add a new state into the NFA and return its id.
Definition: NFA.h:128
A drop-in replacement for std::bitset, with additional bit magic features.
~RegEx()
Definition: RegEx.h:467
void resize(size_t new_size)
Definition: vector.h:161
int FindBit() const
Return the index of the first one in the sequence; return -1 if no ones are available.
Definition: BitSet.h:374
static std::string to_escaped_string(char value)
Convert a single chararcter to one that uses a proper escape sequence (in a string) if needed...
Definition: string_utils.h:36
void PrintInternal()
For debugging: print the internal representation of the regex.
Definition: RegEx.h:497
If we are in emscripten, make sure to include the header.
Definition: array.h:37
void SetStop(size_t state, T stop_val=1)
Set the specified state to be a stop state (with an optional stop value.)
Definition: NFA.h:174
#define emp_assert(...)
Definition: assert.h:199
T & back()
Definition: vector.h:183
RegEx & operator=(const RegEx &r)
Set this RegEx equal to another.
Definition: RegEx.h:470
void PrintDebug()
Print general debuging information about this regex.
Definition: RegEx.h:507
void PrintNotes()
For debugging: print any internal notes generated about this regex.
Definition: RegEx.h:500
bool Test(const std::string &str) const
Test if a string statisfies this regex.
Definition: RegEx.h:491
void Print(const emp::vector< T > &v, std::ostream &os=std::cout, const std::string &spacer=" ")
Print the contects of a vector.
Definition: vector_utils.h:35
A set of utilities to convert between NFAs and DFAs.
BitSet & NOT_SELF()
Perform a Boolean NOT on this BitSet, store result here, and return this object.
Definition: BitSet.h:467
constexpr size_t GetSize(T(&)[N])
Determine the size of a built-in array.
Definition: functions.h:81
A Non-deterministic Finite Automata simulator.