Commit 7b654a83 by Maarten L. Hekkelman

with reserved words automaton

parent ae9d247d
...@@ -112,7 +112,8 @@ class sac_parser ...@@ -112,7 +112,8 @@ class sac_parser
DATA, DATA,
LOOP, LOOP,
GLOBAL, GLOBAL,
SAVE, SAVE_,
SAVE_NAME,
STOP, STOP,
Tag, Tag,
Value Value
...@@ -127,7 +128,8 @@ class sac_parser ...@@ -127,7 +128,8 @@ class sac_parser
case CIFToken::DATA: return "DATA"; case CIFToken::DATA: return "DATA";
case CIFToken::LOOP: return "LOOP"; case CIFToken::LOOP: return "LOOP";
case CIFToken::GLOBAL: return "GLOBAL"; case CIFToken::GLOBAL: return "GLOBAL";
case CIFToken::SAVE: return "SAVE"; case CIFToken::SAVE_: return "SAVE";
case CIFToken::SAVE_NAME: return "SAVE+name";
case CIFToken::STOP: return "STOP"; case CIFToken::STOP: return "STOP";
case CIFToken::Tag: return "Tag"; case CIFToken::Tag: return "Tag";
case CIFToken::Value: return "Value"; case CIFToken::Value: return "Value";
...@@ -135,32 +137,6 @@ class sac_parser ...@@ -135,32 +137,6 @@ class sac_parser
} }
} }
enum class CIFValue
{
Int,
Float,
Numeric,
String,
TextField,
Inapplicable,
Unknown
};
static constexpr const char *get_value_name(CIFValue type)
{
switch (type)
{
case CIFValue::Int: return "Int";
case CIFValue::Float: return "Float";
case CIFValue::Numeric: return "Numeric";
case CIFValue::String: return "String";
case CIFValue::TextField: return "TextField";
case CIFValue::Inapplicable: return "Inapplicable";
case CIFValue::Unknown: return "Unknown";
default: return "Invalid type parameter";
}
}
// get_next_char takes a char from the buffer, or if it is empty // get_next_char takes a char from the buffer, or if it is empty
// from the istream. This function also does carriage/linefeed // from the istream. This function also does carriage/linefeed
// translation. // translation.
......
...@@ -127,7 +127,7 @@ class dictionary_parser : public parser ...@@ -127,7 +127,7 @@ class dictionary_parser : public parser
datablock dict(m_token_value); datablock dict(m_token_value);
datablock::iterator cat = dict.end(); datablock::iterator cat = dict.end();
match(CIFToken::SAVE); match(CIFToken::SAVE_NAME);
while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag) while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag)
{ {
if (m_lookahead == CIFToken::LOOP) if (m_lookahead == CIFToken::LOOP)
...@@ -183,7 +183,7 @@ class dictionary_parser : public parser ...@@ -183,7 +183,7 @@ class dictionary_parser : public parser
} }
} }
match(CIFToken::SAVE); match(CIFToken::SAVE_);
if (isCategorySaveFrame) if (isCategorySaveFrame)
{ {
......
...@@ -40,142 +40,181 @@ namespace cif ...@@ -40,142 +40,181 @@ namespace cif
// -------------------------------------------------------------------- // --------------------------------------------------------------------
sac_parser::sac_parser(std::istream &is, bool init) class reserved_words_automaton
: m_source(*is.rdbuf())
{
if (is.rdbuf() == nullptr)
throw std::runtime_error("Attempt to read from uninitialised stream");
m_validate = true;
m_line_nr = 1;
m_bol = true;
if (init)
m_lookahead = get_next_token();
}
bool sac_parser::is_unquoted_string(std::string_view text)
{ {
bool result = text.empty() or is_ordinary(text.front()); public:
int state = 1; reserved_words_automaton() {}
for (char ch : text) enum move_result
{ {
if (not is_non_blank(ch)) undefined,
no_keyword,
data,
global,
loop,
save,
save_plus,
stop
};
constexpr bool finished() const
{ {
result = false; return m_state <= 0;
break;
} }
switch (state) constexpr bool matched() const
{ {
case 0: return m_state < 0;
break;
case 1:
switch (ch & ~0x20)
{
case 'D': // data_
state = 10;
break;
case 'G':
state = 20; // global_
break;
case 'L':
state = 30; // loop_
break;
case 'S':
state = 40; // stop_ | save_
break;
default:
state = 0;
break;
} }
break;
case 10: state = ((ch & ~0x20) == 'A') ? state + 1 : 0; break; constexpr move_result move(int ch)
case 11: state = ((ch & ~0x20) == 'T') ? state + 1 : 0; break; {
case 12: state = ((ch & ~0x20) == 'A') ? 100 : 0; break; move_result result = undefined;
case 20: state = ((ch & ~0x20) == 'L') ? state + 1 : 0; break;
case 21: state = ((ch & ~0x20) == 'O') ? state + 1 : 0; break;
case 22: state = ((ch & ~0x20) == 'B') ? state + 1 : 0; break;
case 23: state = ((ch & ~0x20) == 'A') ? state + 1 : 0; break;
case 24: state = ((ch & ~0x20) == 'L') ? 200 : 0; break;
case 30: state = ((ch & ~0x20) == 'O') ? state + 1 : 0; break; switch (m_state)
case 31: state = ((ch & ~0x20) == 'O') ? state + 1 : 0; break; {
case 32: state = ((ch & ~0x20) == 'P') ? 200 : 0; break; case 0:
break;
case 40: case -1: // data_
if ((ch & ~0x20) == 'A') if (sac_parser::is_non_blank(ch))
state = 41; m_seen_trailing_chars = true;
else if ((ch & ~0x20) == 'T') else if (m_seen_trailing_chars)
state = 51; result = data;
else else
state = 0; result = no_keyword;
break; break;
case 41: state = ((ch & ~0x20) == 'V') ? state + 1 : 0; break; case -2: // global_
case 42: state = ((ch & ~0x20) == 'E') ? 100 : 0; break; result = sac_parser::is_non_blank(ch) ? no_keyword : global;
break;
case 51: state = ((ch & ~0x20) == 'O') ? state + 1 : 0; break; case -3: // loop_
case 52: state = ((ch & ~0x20) == 'P') ? 200 : 0; break; result = sac_parser::is_non_blank(ch) ? no_keyword : loop;
break;
case 100: case -4: // save_
state = ((ch & ~0x20) == '_') ? 101 : 0; break; if (sac_parser::is_non_blank(ch))
m_seen_trailing_chars = true;
else if (m_seen_trailing_chars)
result = save_plus;
else
result = save;
break;
case 101: case -5: // stop_
result = false; result = sac_parser::is_non_blank(ch) ? no_keyword : stop;
state = 0;
break; break;
case 200: default:
if ((ch & ~0x20) == '_') assert(m_state > 0 and m_state < NODE_COUNT);
for (;;)
{ {
result = false; if (s_dag[m_state].ch == (ch & ~0x20))
state = 201; {
} m_state = s_dag[m_state].next_match;
else
state = 0;
break; break;
}
case 201: m_state = s_dag[m_state].next_nomatch;
result = true;
if (m_state == 0)
{
result = no_keyword;
break; break;
} }
} }
break;
}
if (result != undefined)
m_state = 0;
return result; return result;
}
private:
static constexpr struct node
{
int16_t ch;
int8_t next_match;
int8_t next_nomatch;
} s_dag[] = {
{ 0 },
{ 'D', 5, 2 },
{ 'G', 9, 3 },
{ 'L', 15, 4 },
{ 'S', 19, 0 },
{ 'A', 6, 0 },
{ 'T', 7, 0 },
{ 'A', 8, 0 },
{ '_', -1, 0 },
{ 'L', 10, 0 },
{ 'O', 11, 0 },
{ 'B', 12, 0 },
{ 'A', 13, 0 },
{ 'L', 14, 0 },
{ '_', -2, 0 },
{ 'O', 16, 0},
{ 'O', 17, 0 },
{ 'P', 18, 0 },
{ '_', -3, 0 },
{ 'A', 21, 20 },
{ 'T', 24, 0 },
{ 'V', 22, 0 },
{ 'E', 23, 0 },
{ '_', -4, 0 },
{ 'O', 25, 0 },
{ 'P', 26, 0 },
{ '_', -5, 0 },
};
static constexpr int NODE_COUNT = sizeof(s_dag) / sizeof(node);
int m_state = 1;
bool m_seen_trailing_chars = false;
};
// bool result = text.empty() or is_ordinary(text.front()); // --------------------------------------------------------------------
// int state = 0; sac_parser::sac_parser(std::istream &is, bool init)
: m_source(*is.rdbuf())
{
if (is.rdbuf() == nullptr)
throw std::runtime_error("Attempt to read from uninitialised stream");
// if (result) m_validate = true;
// { m_line_nr = 1;
// for (auto ch : text) m_bol = true;
// {
// switch (state)
// {
// case 0:
// switch
// } if (init)
m_lookahead = get_next_token();
}
bool sac_parser::is_unquoted_string(std::string_view text)
{
bool result = text.empty() or is_ordinary(text.front());
if (result)
{
reserved_words_automaton automaton;
// if (is_non_blank(ch)) for (char ch : text)
// continue; {
// result = false; if (not is_non_blank(ch))
// break; {
// } result = false;
// } break;
}
// // static const std::regex kReservedRx(R"(loop_|stop_|global_|data_\S+|save_\S+)", std::regex_constants::icase); automaton.move(ch);
}
// // but be careful it does not contain e.g. stop_ if (automaton.matched())
// return result and not std::regex_match(text.begin(), text.end(), kReservedRx); result = false;
}
return result;
} }
// get_next_char takes a char from the buffer, or if it is empty // get_next_char takes a char from the buffer, or if it is empty
...@@ -281,6 +320,8 @@ sac_parser::CIFToken sac_parser::get_next_token() ...@@ -281,6 +320,8 @@ sac_parser::CIFToken sac_parser::get_next_token()
// mTokenType = CIFValue::Unknown; // mTokenType = CIFValue::Unknown;
m_token_value = {}; m_token_value = {};
reserved_words_automaton dag;
while (result == CIFToken::Unknown) while (result == CIFToken::Unknown)
{ {
auto ch = get_next_char(); auto ch = get_next_char();
...@@ -517,79 +558,122 @@ sac_parser::CIFToken sac_parser::get_next_token() ...@@ -517,79 +558,122 @@ sac_parser::CIFToken sac_parser::get_next_token()
break; break;
case State::Reserved: case State::Reserved:
switch (ch & ~0x20) switch (dag.move(ch))
{ {
case 'D': // data_ case reserved_words_automaton::undefined:
state = State::Reserved + 10;
break;
case 'G':
state = State::Reserved + 20; // global_
break;
case 'L':
state = State::Reserved + 30; // loop_
break;
case 'S':
state = State::Reserved + 40; // stop_ | save_
break; break;
default:
case reserved_words_automaton::no_keyword:
state = start = restart(start); state = start = restart(start);
break; break;
}
break;
case State::Reserved + 10: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break; case reserved_words_automaton::data:
case State::Reserved + 11: if ((ch & ~0x20) == 'T') ++state; else state = start = restart(start); break;
case State::Reserved + 12: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break;
case State::Reserved + 13: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break;
case State::Reserved + 14: if (is_non_blank(ch)) ++state; else state = start = restart(start); break;
case State::Reserved + 15:
if (not is_non_blank(ch))
{
retract(); retract();
result = CIFToken::DATA;
m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size()); m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size());
} result = CIFToken::DATA;
break; break;
case State::Reserved + 20: if ((ch & ~0x20) == 'L') ++state; else state = start = restart(start); break; case reserved_words_automaton::global:
case State::Reserved + 21: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break; retract();
case State::Reserved + 22: if ((ch & ~0x20) == 'B') ++state; else state = start = restart(start); break; result = CIFToken::GLOBAL;
case State::Reserved + 23: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break; break;
case State::Reserved + 24: if ((ch & ~0x20) == 'L') ++state; else state = start = restart(start); break;
case State::Reserved + 25: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break;
case State::Reserved + 26: if (not is_non_blank(ch)) result = CIFToken::GLOBAL; else state = start = restart(start); break;
case State::Reserved + 30: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break; case reserved_words_automaton::loop:
case State::Reserved + 31: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break; retract();
case State::Reserved + 32: if ((ch & ~0x20) == 'P') ++state; else state = start = restart(start); break; result = CIFToken::LOOP;
case State::Reserved + 33: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break; break;
case State::Reserved + 34: if (not is_non_blank(ch)) result = CIFToken::LOOP; else state = start = restart(start); break;
case State::Reserved + 40: case reserved_words_automaton::save:
if ((ch & ~0x20) == 'A') retract();
state = State::Reserved + 41; result = CIFToken::SAVE_;
else if ((ch & ~0x20) == 'T')
state = State::Reserved + 51;
else
state = start = restart(start);
break; break;
case State::Reserved + 41: if ((ch & ~0x20) == 'V') ++state; else state = start = restart(start); break; case reserved_words_automaton::save_plus:
case State::Reserved + 42: if ((ch & ~0x20) == 'E') ++state; else state = start = restart(start); break;
case State::Reserved + 43: if (is_non_blank(ch)) ++state; else state = start = restart(start); break;
case State::Reserved + 44:
if (not is_non_blank(ch))
{
retract(); retract();
result = CIFToken::SAVE;
m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size()); m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size());
result = CIFToken::SAVE_NAME;
break;
case reserved_words_automaton::stop:
retract();
result = CIFToken::STOP;
break;
} }
break; break;
case State::Reserved + 51: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break; // switch (ch & ~0x20)
case State::Reserved + 52: if ((ch & ~0x20) == 'P') ++state; else state = start = restart(start); break; // {
case State::Reserved + 53: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break; // case 'D': // data_
case State::Reserved + 54: if (not is_non_blank(ch)) result = CIFToken::STOP; else state = start = restart(start); break; // state = State::Reserved + 10;
// break;
// case 'G':
// state = State::Reserved + 20; // global_
// break;
// case 'L':
// state = State::Reserved + 30; // loop_
// break;
// case 'S':
// state = State::Reserved + 40; // stop_ | save_
// break;
// default:
// state = start = restart(start);
// break;
// }
// break;
// case State::Reserved + 10: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break;
// case State::Reserved + 11: if ((ch & ~0x20) == 'T') ++state; else state = start = restart(start); break;
// case State::Reserved + 12: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break;
// case State::Reserved + 13: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break;
// case State::Reserved + 14: if (is_non_blank(ch)) ++state; else state = start = restart(start); break;
// case State::Reserved + 15:
// if (not is_non_blank(ch))
// {
// retract();
// result = CIFToken::DATA;
// m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size());
// }
// break;
// case State::Reserved + 20: if ((ch & ~0x20) == 'L') ++state; else state = start = restart(start); break;
// case State::Reserved + 21: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break;
// case State::Reserved + 22: if ((ch & ~0x20) == 'B') ++state; else state = start = restart(start); break;
// case State::Reserved + 23: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break;
// case State::Reserved + 24: if ((ch & ~0x20) == 'L') ++state; else state = start = restart(start); break;
// case State::Reserved + 25: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break;
// case State::Reserved + 26: if (not is_non_blank(ch)) result = CIFToken::GLOBAL; else state = start = restart(start); break;
// case State::Reserved + 30: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break;
// case State::Reserved + 31: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break;
// case State::Reserved + 32: if ((ch & ~0x20) == 'P') ++state; else state = start = restart(start); break;
// case State::Reserved + 33: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break;
// case State::Reserved + 34: if (not is_non_blank(ch)) result = CIFToken::LOOP; else state = start = restart(start); break;
// case State::Reserved + 40:
// if ((ch & ~0x20) == 'A')
// state = State::Reserved + 41;
// else if ((ch & ~0x20) == 'T')
// state = State::Reserved + 51;
// else
// state = start = restart(start);
// break;
// case State::Reserved + 41: if ((ch & ~0x20) == 'V') ++state; else state = start = restart(start); break;
// case State::Reserved + 42: if ((ch & ~0x20) == 'E') ++state; else state = start = restart(start); break;
// case State::Reserved + 43: if (is_non_blank(ch)) ++state; else state = start = restart(start); break;
// case State::Reserved + 44:
// if (not is_non_blank(ch))
// {
// retract();
// result = CIFToken::SAVE;
// m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size());
// }
// break;
// case State::Reserved + 51: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break;
// case State::Reserved + 52: if ((ch & ~0x20) == 'P') ++state; else state = start = restart(start); break;
// case State::Reserved + 53: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break;
// case State::Reserved + 54: if (not is_non_blank(ch)) result = CIFToken::STOP; else state = start = restart(start); break;
case State::Value: case State::Value:
if (not is_non_blank(ch)) if (not is_non_blank(ch))
...@@ -608,7 +692,7 @@ sac_parser::CIFToken sac_parser::get_next_token() ...@@ -608,7 +692,7 @@ sac_parser::CIFToken sac_parser::get_next_token()
} }
} }
if (VERBOSE >= 5) // if (VERBOSE >= 5)
{ {
std::cerr << get_token_name(result); std::cerr << get_token_name(result);
// if (mTokenType != CIFValue::Unknown) // if (mTokenType != CIFValue::Unknown)
...@@ -874,7 +958,7 @@ void sac_parser::parse_datablock() ...@@ -874,7 +958,7 @@ void sac_parser::parse_datablock()
static const std::string kUnitializedCategory("<invalid>"); static const std::string kUnitializedCategory("<invalid>");
std::string cat = kUnitializedCategory; // initial value acts as a guard for empty category names std::string cat = kUnitializedCategory; // initial value acts as a guard for empty category names
while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag or m_lookahead == CIFToken::SAVE) while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag or m_lookahead == CIFToken::SAVE_NAME)
{ {
switch (m_lookahead) switch (m_lookahead)
{ {
...@@ -939,7 +1023,7 @@ void sac_parser::parse_datablock() ...@@ -939,7 +1023,7 @@ void sac_parser::parse_datablock()
break; break;
} }
case CIFToken::SAVE: case CIFToken::SAVE_NAME:
parse_save_frame(); parse_save_frame();
break; break;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment