Commit 7b654a83 by Maarten L. Hekkelman

with reserved words automaton

parent ae9d247d
...@@ -112,7 +112,8 @@ class sac_parser ...@@ -112,7 +112,8 @@ class sac_parser
DATA, DATA,
LOOP, LOOP,
GLOBAL, GLOBAL,
SAVE, SAVE_,
SAVE_NAME,
STOP, STOP,
Tag, Tag,
Value Value
...@@ -127,7 +128,8 @@ class sac_parser ...@@ -127,7 +128,8 @@ class sac_parser
case CIFToken::DATA: return "DATA"; case CIFToken::DATA: return "DATA";
case CIFToken::LOOP: return "LOOP"; case CIFToken::LOOP: return "LOOP";
case CIFToken::GLOBAL: return "GLOBAL"; case CIFToken::GLOBAL: return "GLOBAL";
case CIFToken::SAVE: return "SAVE"; case CIFToken::SAVE_: return "SAVE";
case CIFToken::SAVE_NAME: return "SAVE+name";
case CIFToken::STOP: return "STOP"; case CIFToken::STOP: return "STOP";
case CIFToken::Tag: return "Tag"; case CIFToken::Tag: return "Tag";
case CIFToken::Value: return "Value"; case CIFToken::Value: return "Value";
...@@ -135,32 +137,6 @@ class sac_parser ...@@ -135,32 +137,6 @@ class sac_parser
} }
} }
/// Value kinds that get_value_name() can translate into a printable label.
enum class CIFValue
{
	Int,
	Float,
	Numeric,
	String,
	TextField,
	Inapplicable,
	Unknown
};

/// @brief Return a human readable label for @a type.
/// @param type The value kind to describe.
/// @return A pointer to a string literal holding the enumerator's name, or
///         "Invalid type parameter" when @a type is not one of the
///         enumerators declared in CIFValue.
static constexpr const char *get_value_name(CIFValue type)
{
	// The table order mirrors the declaration order of CIFValue, so the
	// numeric value of an enumerator doubles as its index.
	constexpr const char *kNames[] = {
		"Int",
		"Float",
		"Numeric",
		"String",
		"TextField",
		"Inapplicable",
		"Unknown"
	};
	constexpr int kCount = sizeof(kNames) / sizeof(kNames[0]);

	const int ix = static_cast<int>(type);

	// Anything outside the declared range gets the fallback text.
	if (ix < 0 or ix >= kCount)
		return "Invalid type parameter";

	return kNames[ix];
}
// get_next_char takes a char from the buffer, or if it is empty // get_next_char takes a char from the buffer, or if it is empty
// from the istream. This function also does carriage/linefeed // from the istream. This function also does carriage/linefeed
// translation. // translation.
......
...@@ -127,7 +127,7 @@ class dictionary_parser : public parser ...@@ -127,7 +127,7 @@ class dictionary_parser : public parser
datablock dict(m_token_value); datablock dict(m_token_value);
datablock::iterator cat = dict.end(); datablock::iterator cat = dict.end();
match(CIFToken::SAVE); match(CIFToken::SAVE_NAME);
while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag) while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag)
{ {
if (m_lookahead == CIFToken::LOOP) if (m_lookahead == CIFToken::LOOP)
...@@ -183,7 +183,7 @@ class dictionary_parser : public parser ...@@ -183,7 +183,7 @@ class dictionary_parser : public parser
} }
} }
match(CIFToken::SAVE); match(CIFToken::SAVE_);
if (isCategorySaveFrame) if (isCategorySaveFrame)
{ {
......
...@@ -40,142 +40,181 @@ namespace cif ...@@ -40,142 +40,181 @@ namespace cif
// -------------------------------------------------------------------- // --------------------------------------------------------------------
sac_parser::sac_parser(std::istream &is, bool init) class reserved_words_automaton
: m_source(*is.rdbuf())
{ {
if (is.rdbuf() == nullptr) public:
throw std::runtime_error("Attempt to read from uninitialised stream"); reserved_words_automaton() {}
m_validate = true;
m_line_nr = 1;
m_bol = true;
if (init) enum move_result
m_lookahead = get_next_token(); {
} undefined,
no_keyword,
data,
global,
loop,
save,
save_plus,
stop
};
constexpr bool finished() const
{
return m_state <= 0;
}
bool sac_parser::is_unquoted_string(std::string_view text) constexpr bool matched() const
{ {
bool result = text.empty() or is_ordinary(text.front()); return m_state < 0;
int state = 1; }
for (char ch : text) constexpr move_result move(int ch)
{ {
if (not is_non_blank(ch)) move_result result = undefined;
{
result = false;
break;
}
switch (state) switch (m_state)
{ {
case 0: case 0:
break; break;
case 1: case -1: // data_
switch (ch & ~0x20) if (sac_parser::is_non_blank(ch))
{ m_seen_trailing_chars = true;
case 'D': // data_ else if (m_seen_trailing_chars)
state = 10; result = data;
break;
case 'G':
state = 20; // global_
break;
case 'L':
state = 30; // loop_
break;
case 'S':
state = 40; // stop_ | save_
break;
default:
state = 0;
break;
}
break;
case 10: state = ((ch & ~0x20) == 'A') ? state + 1 : 0; break;
case 11: state = ((ch & ~0x20) == 'T') ? state + 1 : 0; break;
case 12: state = ((ch & ~0x20) == 'A') ? 100 : 0; break;
case 20: state = ((ch & ~0x20) == 'L') ? state + 1 : 0; break;
case 21: state = ((ch & ~0x20) == 'O') ? state + 1 : 0; break;
case 22: state = ((ch & ~0x20) == 'B') ? state + 1 : 0; break;
case 23: state = ((ch & ~0x20) == 'A') ? state + 1 : 0; break;
case 24: state = ((ch & ~0x20) == 'L') ? 200 : 0; break;
case 30: state = ((ch & ~0x20) == 'O') ? state + 1 : 0; break;
case 31: state = ((ch & ~0x20) == 'O') ? state + 1 : 0; break;
case 32: state = ((ch & ~0x20) == 'P') ? 200 : 0; break;
case 40:
if ((ch & ~0x20) == 'A')
state = 41;
else if ((ch & ~0x20) == 'T')
state = 51;
else else
state = 0; result = no_keyword;
break; break;
case 41: state = ((ch & ~0x20) == 'V') ? state + 1 : 0; break; case -2: // global_
case 42: state = ((ch & ~0x20) == 'E') ? 100 : 0; break; result = sac_parser::is_non_blank(ch) ? no_keyword : global;
break;
case 51: state = ((ch & ~0x20) == 'O') ? state + 1 : 0; break;
case 52: state = ((ch & ~0x20) == 'P') ? 200 : 0; break;
case 100: case -3: // loop_
state = ((ch & ~0x20) == '_') ? 101 : 0; break; result = sac_parser::is_non_blank(ch) ? no_keyword : loop;
case 101:
result = false;
state = 0;
break; break;
case 200: case -4: // save_
if ((ch & ~0x20) == '_') if (sac_parser::is_non_blank(ch))
{ m_seen_trailing_chars = true;
result = false; else if (m_seen_trailing_chars)
state = 201; result = save_plus;
}
else else
state = 0; result = save;
break;
case -5: // stop_
result = sac_parser::is_non_blank(ch) ? no_keyword : stop;
break; break;
case 201: default:
result = true; assert(m_state > 0 and m_state < NODE_COUNT);
for (;;)
{
if (s_dag[m_state].ch == (ch & ~0x20))
{
m_state = s_dag[m_state].next_match;
break;
}
m_state = s_dag[m_state].next_nomatch;
if (m_state == 0)
{
result = no_keyword;
break;
}
}
break; break;
} }
if (result != undefined)
m_state = 0;
return result;
} }
return result; private:
static constexpr struct node
{
int16_t ch;
int8_t next_match;
int8_t next_nomatch;
} s_dag[] = {
{ 0 },
{ 'D', 5, 2 },
{ 'G', 9, 3 },
{ 'L', 15, 4 },
{ 'S', 19, 0 },
{ 'A', 6, 0 },
{ 'T', 7, 0 },
{ 'A', 8, 0 },
{ '_', -1, 0 },
{ 'L', 10, 0 },
{ 'O', 11, 0 },
{ 'B', 12, 0 },
{ 'A', 13, 0 },
{ 'L', 14, 0 },
{ '_', -2, 0 },
{ 'O', 16, 0},
{ 'O', 17, 0 },
{ 'P', 18, 0 },
{ '_', -3, 0 },
{ 'A', 21, 20 },
{ 'T', 24, 0 },
{ 'V', 22, 0 },
{ 'E', 23, 0 },
{ '_', -4, 0 },
{ 'O', 25, 0 },
{ 'P', 26, 0 },
{ '_', -5, 0 },
};
static constexpr int NODE_COUNT = sizeof(s_dag) / sizeof(node);
int m_state = 1;
bool m_seen_trailing_chars = false;
};
// --------------------------------------------------------------------
// bool result = text.empty() or is_ordinary(text.front()); sac_parser::sac_parser(std::istream &is, bool init)
: m_source(*is.rdbuf())
{
if (is.rdbuf() == nullptr)
throw std::runtime_error("Attempt to read from uninitialised stream");
// int state = 0; m_validate = true;
m_line_nr = 1;
m_bol = true;
// if (result) if (init)
// { m_lookahead = get_next_token();
// for (auto ch : text) }
// {
// switch (state)
// {
// case 0:
// switch
// } bool sac_parser::is_unquoted_string(std::string_view text)
{
bool result = text.empty() or is_ordinary(text.front());
if (result)
{
reserved_words_automaton automaton;
for (char ch : text)
{
if (not is_non_blank(ch))
{
result = false;
break;
}
// if (is_non_blank(ch)) automaton.move(ch);
// continue; }
// result = false;
// break;
// }
// }
// // static const std::regex kReservedRx(R"(loop_|stop_|global_|data_\S+|save_\S+)", std::regex_constants::icase); if (automaton.matched())
result = false;
}
// // but be careful it does not contain e.g. stop_ return result;
// return result and not std::regex_match(text.begin(), text.end(), kReservedRx);
} }
// get_next_char takes a char from the buffer, or if it is empty // get_next_char takes a char from the buffer, or if it is empty
...@@ -281,6 +320,8 @@ sac_parser::CIFToken sac_parser::get_next_token() ...@@ -281,6 +320,8 @@ sac_parser::CIFToken sac_parser::get_next_token()
// mTokenType = CIFValue::Unknown; // mTokenType = CIFValue::Unknown;
m_token_value = {}; m_token_value = {};
reserved_words_automaton dag;
while (result == CIFToken::Unknown) while (result == CIFToken::Unknown)
{ {
auto ch = get_next_char(); auto ch = get_next_char();
...@@ -517,79 +558,122 @@ sac_parser::CIFToken sac_parser::get_next_token() ...@@ -517,79 +558,122 @@ sac_parser::CIFToken sac_parser::get_next_token()
break; break;
case State::Reserved: case State::Reserved:
switch (ch & ~0x20) switch (dag.move(ch))
{ {
case 'D': // data_ case reserved_words_automaton::undefined:
state = State::Reserved + 10;
break; break;
case 'G':
state = State::Reserved + 20; // global_ case reserved_words_automaton::no_keyword:
state = start = restart(start);
break; break;
case 'L':
state = State::Reserved + 30; // loop_ case reserved_words_automaton::data:
retract();
m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size());
result = CIFToken::DATA;
break; break;
case 'S':
state = State::Reserved + 40; // stop_ | save_ case reserved_words_automaton::global:
retract();
result = CIFToken::GLOBAL;
break; break;
default:
state = start = restart(start); case reserved_words_automaton::loop:
retract();
result = CIFToken::LOOP;
break;
case reserved_words_automaton::save:
retract();
result = CIFToken::SAVE_;
break;
case reserved_words_automaton::save_plus:
retract();
m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size());
result = CIFToken::SAVE_NAME;
break;
case reserved_words_automaton::stop:
retract();
result = CIFToken::STOP;
break; break;
} }
break; break;
// switch (ch & ~0x20)
// {
// case 'D': // data_
// state = State::Reserved + 10;
// break;
// case 'G':
// state = State::Reserved + 20; // global_
// break;
// case 'L':
// state = State::Reserved + 30; // loop_
// break;
// case 'S':
// state = State::Reserved + 40; // stop_ | save_
// break;
// default:
// state = start = restart(start);
// break;
// }
// break;
case State::Reserved + 10: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break; // case State::Reserved + 10: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break;
case State::Reserved + 11: if ((ch & ~0x20) == 'T') ++state; else state = start = restart(start); break; // case State::Reserved + 11: if ((ch & ~0x20) == 'T') ++state; else state = start = restart(start); break;
case State::Reserved + 12: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break; // case State::Reserved + 12: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break;
case State::Reserved + 13: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break; // case State::Reserved + 13: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break;
case State::Reserved + 14: if (is_non_blank(ch)) ++state; else state = start = restart(start); break; // case State::Reserved + 14: if (is_non_blank(ch)) ++state; else state = start = restart(start); break;
case State::Reserved + 15: // case State::Reserved + 15:
if (not is_non_blank(ch)) // if (not is_non_blank(ch))
{ // {
retract(); // retract();
result = CIFToken::DATA; // result = CIFToken::DATA;
m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size()); // m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size());
} // }
break; // break;
case State::Reserved + 20: if ((ch & ~0x20) == 'L') ++state; else state = start = restart(start); break; // case State::Reserved + 20: if ((ch & ~0x20) == 'L') ++state; else state = start = restart(start); break;
case State::Reserved + 21: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break; // case State::Reserved + 21: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break;
case State::Reserved + 22: if ((ch & ~0x20) == 'B') ++state; else state = start = restart(start); break; // case State::Reserved + 22: if ((ch & ~0x20) == 'B') ++state; else state = start = restart(start); break;
case State::Reserved + 23: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break; // case State::Reserved + 23: if ((ch & ~0x20) == 'A') ++state; else state = start = restart(start); break;
case State::Reserved + 24: if ((ch & ~0x20) == 'L') ++state; else state = start = restart(start); break; // case State::Reserved + 24: if ((ch & ~0x20) == 'L') ++state; else state = start = restart(start); break;
case State::Reserved + 25: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break; // case State::Reserved + 25: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break;
case State::Reserved + 26: if (not is_non_blank(ch)) result = CIFToken::GLOBAL; else state = start = restart(start); break; // case State::Reserved + 26: if (not is_non_blank(ch)) result = CIFToken::GLOBAL; else state = start = restart(start); break;
case State::Reserved + 30: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break; // case State::Reserved + 30: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break;
case State::Reserved + 31: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break; // case State::Reserved + 31: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break;
case State::Reserved + 32: if ((ch & ~0x20) == 'P') ++state; else state = start = restart(start); break; // case State::Reserved + 32: if ((ch & ~0x20) == 'P') ++state; else state = start = restart(start); break;
case State::Reserved + 33: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break; // case State::Reserved + 33: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break;
case State::Reserved + 34: if (not is_non_blank(ch)) result = CIFToken::LOOP; else state = start = restart(start); break; // case State::Reserved + 34: if (not is_non_blank(ch)) result = CIFToken::LOOP; else state = start = restart(start); break;
case State::Reserved + 40: // case State::Reserved + 40:
if ((ch & ~0x20) == 'A') // if ((ch & ~0x20) == 'A')
state = State::Reserved + 41; // state = State::Reserved + 41;
else if ((ch & ~0x20) == 'T') // else if ((ch & ~0x20) == 'T')
state = State::Reserved + 51; // state = State::Reserved + 51;
else // else
state = start = restart(start); // state = start = restart(start);
break; // break;
case State::Reserved + 41: if ((ch & ~0x20) == 'V') ++state; else state = start = restart(start); break; // case State::Reserved + 41: if ((ch & ~0x20) == 'V') ++state; else state = start = restart(start); break;
case State::Reserved + 42: if ((ch & ~0x20) == 'E') ++state; else state = start = restart(start); break; // case State::Reserved + 42: if ((ch & ~0x20) == 'E') ++state; else state = start = restart(start); break;
case State::Reserved + 43: if (is_non_blank(ch)) ++state; else state = start = restart(start); break; // case State::Reserved + 43: if (is_non_blank(ch)) ++state; else state = start = restart(start); break;
case State::Reserved + 44: // case State::Reserved + 44:
if (not is_non_blank(ch)) // if (not is_non_blank(ch))
{ // {
retract(); // retract();
result = CIFToken::SAVE; // result = CIFToken::SAVE;
m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size()); // m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.data() + m_token_buffer.size());
} // }
break; // break;
case State::Reserved + 51: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break; // case State::Reserved + 51: if ((ch & ~0x20) == 'O') ++state; else state = start = restart(start); break;
case State::Reserved + 52: if ((ch & ~0x20) == 'P') ++state; else state = start = restart(start); break; // case State::Reserved + 52: if ((ch & ~0x20) == 'P') ++state; else state = start = restart(start); break;
case State::Reserved + 53: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break; // case State::Reserved + 53: if ((ch & ~0x20) == '_') ++state; else state = start = restart(start); break;
case State::Reserved + 54: if (not is_non_blank(ch)) result = CIFToken::STOP; else state = start = restart(start); break; // case State::Reserved + 54: if (not is_non_blank(ch)) result = CIFToken::STOP; else state = start = restart(start); break;
case State::Value: case State::Value:
if (not is_non_blank(ch)) if (not is_non_blank(ch))
...@@ -608,7 +692,7 @@ sac_parser::CIFToken sac_parser::get_next_token() ...@@ -608,7 +692,7 @@ sac_parser::CIFToken sac_parser::get_next_token()
} }
} }
if (VERBOSE >= 5) // if (VERBOSE >= 5)
{ {
std::cerr << get_token_name(result); std::cerr << get_token_name(result);
// if (mTokenType != CIFValue::Unknown) // if (mTokenType != CIFValue::Unknown)
...@@ -874,7 +958,7 @@ void sac_parser::parse_datablock() ...@@ -874,7 +958,7 @@ void sac_parser::parse_datablock()
static const std::string kUnitializedCategory("<invalid>"); static const std::string kUnitializedCategory("<invalid>");
std::string cat = kUnitializedCategory; // initial value acts as a guard for empty category names std::string cat = kUnitializedCategory; // initial value acts as a guard for empty category names
while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag or m_lookahead == CIFToken::SAVE) while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag or m_lookahead == CIFToken::SAVE_NAME)
{ {
switch (m_lookahead) switch (m_lookahead)
{ {
...@@ -939,7 +1023,7 @@ void sac_parser::parse_datablock() ...@@ -939,7 +1023,7 @@ void sac_parser::parse_datablock()
break; break;
} }
case CIFToken::SAVE: case CIFToken::SAVE_NAME:
parse_save_frame(); parse_save_frame();
break; break;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment