Commit 19210df6 by Maarten L. Hekkelman

Fix parsing of mmCIF files that contain the unquoted string "??"

parent 15c57307
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
cmake_minimum_required(VERSION 3.16) cmake_minimum_required(VERSION 3.16)
# set the project name # set the project name
project(cifpp VERSION 3.0.2 LANGUAGES CXX) project(cifpp VERSION 3.0.4 LANGUAGES CXX)
list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
......
Version 3.0.4
- Fix in mmCIF parser, now correctly handles the unquoted
string ??
Version 3.0.3 Version 3.0.3
- Better configuration checks, for atomic e.g. - Better configuration checks, for atomic e.g.
- Fixed a problem introduced in refactoring mmcif::Atom - Fixed a problem introduced in refactoring mmcif::Atom
...@@ -17,6 +21,9 @@ Version 3.0.0 ...@@ -17,6 +21,9 @@ Version 3.0.0
- Upgraded mmcif::Structure - Upgraded mmcif::Structure
- various other small fixes - various other small fixes
Version 2.0.5
- Backporting updated CMakeLists.txt file
Version 2.0.4 Version 2.0.4
- Reverted a too strict test when reading cif files. - Reverted a too strict test when reading cif files.
......
...@@ -139,7 +139,7 @@ class SacParser ...@@ -139,7 +139,7 @@ class SacParser
int getNextChar(); int getNextChar();
void retract(); void retract();
void restart(); int restart(int start);
CIFToken getNextToken(); CIFToken getNextToken();
void match(CIFToken token); void match(CIFToken token);
...@@ -181,8 +181,9 @@ class SacParser ...@@ -181,8 +181,9 @@ class SacParser
eStateTextField, eStateTextField,
eStateFloat = 100, eStateFloat = 100,
eStateInt = 110, eStateInt = 110,
// eStateNumericSuffix = 200, eStateValue = 300,
eStateValue = 300 eStateDATA,
eStateSAVE
}; };
std::istream &mData; std::istream &mData;
...@@ -191,7 +192,6 @@ class SacParser ...@@ -191,7 +192,6 @@ class SacParser
bool mValidate; bool mValidate;
uint32_t mLineNr; uint32_t mLineNr;
bool mBol; bool mBol;
int mState, mStart;
CIFToken mLookahead; CIFToken mLookahead;
std::string mTokenValue; std::string mTokenValue;
CIFValueType mTokenType; CIFValueType mTokenType;
......
...@@ -42,7 +42,7 @@ namespace cif ...@@ -42,7 +42,7 @@ namespace cif
const uint32_t kMaxLineLength = 132; const uint32_t kMaxLineLength = 132;
const uint8_t kCharTraitsTable[128] = { const uint8_t kCharTraitsTable[128] = {
// 0 1 2 3 4 5 6 7 8 9 a b c d e f // 0 1 2 3 4 5 6 7 8 9 a b c d e f
14, 15, 14, 14, 14, 15, 15, 14, 15, 15, 15, 15, 15, 15, 15, 15, // 2 14, 15, 14, 14, 14, 15, 15, 14, 15, 15, 15, 15, 15, 15, 15, 15, // 2
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 10, 15, 15, 15, 15, // 3 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 10, 15, 15, 15, 15, // 3
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, // 4 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, // 4
...@@ -151,23 +151,26 @@ void SacParser::retract() ...@@ -151,23 +151,26 @@ void SacParser::retract()
mTokenValue.pop_back(); mTokenValue.pop_back();
} }
void SacParser::restart()
int SacParser::restart(int start)
{ {
int result = 0;
while (not mTokenValue.empty()) while (not mTokenValue.empty())
retract(); retract();
switch (mStart) switch (start)
{ {
case eStateStart: case eStateStart:
mState = mStart = eStateFloat; result = eStateFloat;
break; break;
case eStateFloat: case eStateFloat:
mState = mStart = eStateInt; result = eStateInt;
break; break;
case eStateInt: case eStateInt:
mState = mStart = eStateValue; result = eStateValue;
break; break;
default: default:
...@@ -175,6 +178,8 @@ void SacParser::restart() ...@@ -175,6 +178,8 @@ void SacParser::restart()
} }
mBol = false; mBol = false;
return result;
} }
void SacParser::match(SacParser::CIFToken t) void SacParser::match(SacParser::CIFToken t)
...@@ -191,7 +196,7 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -191,7 +196,7 @@ SacParser::CIFToken SacParser::getNextToken()
CIFToken result = eCIFTokenUnknown; CIFToken result = eCIFTokenUnknown;
int quoteChar = 0; int quoteChar = 0;
mState = mStart = eStateStart; int state = eStateStart, start = eStateStart;
mBol = false; mBol = false;
mTokenValue.clear(); mTokenValue.clear();
...@@ -201,7 +206,7 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -201,7 +206,7 @@ SacParser::CIFToken SacParser::getNextToken()
{ {
auto ch = getNextChar(); auto ch = getNextChar();
switch (mState) switch (state)
{ {
case eStateStart: case eStateStart:
if (ch == kEOF) if (ch == kEOF)
...@@ -209,27 +214,23 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -209,27 +214,23 @@ SacParser::CIFToken SacParser::getNextToken()
else if (ch == '\n') else if (ch == '\n')
{ {
mBol = true; mBol = true;
mState = eStateWhite; state = eStateWhite;
} }
else if (ch == ' ' or ch == '\t') else if (ch == ' ' or ch == '\t')
mState = eStateWhite; state = eStateWhite;
else if (ch == '#') else if (ch == '#')
mState = eStateComment; state = eStateComment;
else if (ch == '.')
mState = eStateDot;
else if (ch == '_') else if (ch == '_')
mState = eStateTag; state = eStateTag;
else if (ch == ';' and mBol) else if (ch == ';' and mBol)
mState = eStateTextField; state = eStateTextField;
else if (ch == '\'' or ch == '"') else if (ch == '\'' or ch == '"')
{ {
quoteChar = ch; quoteChar = ch;
mState = eStateQuotedString; state = eStateQuotedString;
} }
else if (ch == '?')
mState = eStateQuestionMark;
else else
restart(); state = start = restart(start);
break; break;
case eStateWhite: case eStateWhite:
...@@ -237,7 +238,7 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -237,7 +238,7 @@ SacParser::CIFToken SacParser::getNextToken()
result = eCIFTokenEOF; result = eCIFTokenEOF;
else if (not isspace(ch)) else if (not isspace(ch))
{ {
mState = eStateStart; state = eStateStart;
retract(); retract();
mTokenValue.clear(); mTokenValue.clear();
} }
...@@ -248,7 +249,7 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -248,7 +249,7 @@ SacParser::CIFToken SacParser::getNextToken()
case eStateComment: case eStateComment:
if (ch == '\n') if (ch == '\n')
{ {
mState = eStateStart; state = eStateStart;
mBol = true; mBol = true;
mTokenValue.clear(); mTokenValue.clear();
} }
...@@ -258,44 +259,19 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -258,44 +259,19 @@ SacParser::CIFToken SacParser::getNextToken()
error("invalid character in comment"); error("invalid character in comment");
break; break;
case eStateQuestionMark:
if (isNonBlank(ch))
mState = eStateValue;
else
{
retract();
result = eCIFTokenValue;
mTokenValue.clear();
mTokenType = eCIFValueUnknown;
}
break;
case eStateDot:
if (isdigit(ch))
mState = eStateFloat + 2;
else if (isspace(ch))
{
retract();
result = eCIFTokenValue;
mTokenType = eCIFValueInapplicable;
}
else
mState = eStateValue;
break;
case eStateTextField: case eStateTextField:
if (ch == '\n') if (ch == '\n')
mState = eStateTextField + 1; state = eStateTextField + 1;
else if (ch == kEOF) else if (ch == kEOF)
error("unterminated textfield"); error("unterminated textfield");
else if (not isAnyPrint(ch) and cif::VERBOSE >= 0) else if (not isAnyPrint(ch))
// error("invalid character in text field '" + string({ static_cast<char>(ch) }) + "' (" + to_string((int)ch) + ")"); // error("invalid character in text field '" + string({ static_cast<char>(ch) }) + "' (" + to_string((int)ch) + ")");
std::cerr << "invalid character in text field '" << std::string({static_cast<char>(ch)}) << "' (" << ch << ") line: " << mLineNr << std::endl; std::cerr << "invalid character in text field '" << std::string({static_cast<char>(ch)}) << "' (" << ch << ") line: " << mLineNr << std::endl;
break; break;
case eStateTextField + 1: case eStateTextField + 1:
if (isTextLead(ch) or ch == ' ' or ch == '\t') if (isTextLead(ch) or ch == ' ' or ch == '\t')
mState = eStateTextField; state = eStateTextField;
else if (ch == ';') else if (ch == ';')
{ {
assert(mTokenValue.length() >= 2); assert(mTokenValue.length() >= 2);
...@@ -313,7 +289,7 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -313,7 +289,7 @@ SacParser::CIFToken SacParser::getNextToken()
if (ch == kEOF) if (ch == kEOF)
error("unterminated quoted string"); error("unterminated quoted string");
else if (ch == quoteChar) else if (ch == quoteChar)
mState = eStateQuotedStringQuote; state = eStateQuotedStringQuote;
else if (not isAnyPrint(ch)) else if (not isAnyPrint(ch))
error("invalid character in quoted string"); error("invalid character in quoted string");
break; break;
...@@ -331,7 +307,7 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -331,7 +307,7 @@ SacParser::CIFToken SacParser::getNextToken()
else if (ch == quoteChar) else if (ch == quoteChar)
; ;
else if (isAnyPrint(ch)) else if (isAnyPrint(ch))
mState = eStateQuotedString; state = eStateQuotedString;
else if (ch == kEOF) else if (ch == kEOF)
error("unterminated quoted string"); error("unterminated quoted string");
else else
...@@ -349,12 +325,12 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -349,12 +325,12 @@ SacParser::CIFToken SacParser::getNextToken()
case eStateFloat: case eStateFloat:
if (ch == '+' or ch == '-') if (ch == '+' or ch == '-')
{ {
mState = eStateFloat + 1; state = eStateFloat + 1;
} }
else if (isdigit(ch)) else if (isdigit(ch))
mState = eStateFloat + 1; state = eStateFloat + 1;
else else
restart(); state = start = restart(start);
break; break;
case eStateFloat + 1: case eStateFloat + 1:
...@@ -362,9 +338,9 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -362,9 +338,9 @@ SacParser::CIFToken SacParser::getNextToken()
// mState = eStateNumericSuffix; // mState = eStateNumericSuffix;
// else // else
if (ch == '.') if (ch == '.')
mState = eStateFloat + 2; state = eStateFloat + 2;
else if (tolower(ch) == 'e') else if (tolower(ch) == 'e')
mState = eStateFloat + 3; state = eStateFloat + 3;
else if (isWhite(ch) or ch == kEOF) else if (isWhite(ch) or ch == kEOF)
{ {
retract(); retract();
...@@ -372,16 +348,13 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -372,16 +348,13 @@ SacParser::CIFToken SacParser::getNextToken()
mTokenType = eCIFValueInt; mTokenType = eCIFValueInt;
} }
else else
restart(); state = start = restart(start);
break; break;
// parsed '.' // parsed '.'
case eStateFloat + 2: case eStateFloat + 2:
// if (ch == '(') // numeric???
// mState = eStateNumericSuffix;
// else
if (tolower(ch) == 'e') if (tolower(ch) == 'e')
mState = eStateFloat + 3; state = eStateFloat + 3;
else if (isWhite(ch) or ch == kEOF) else if (isWhite(ch) or ch == kEOF)
{ {
retract(); retract();
...@@ -389,30 +362,27 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -389,30 +362,27 @@ SacParser::CIFToken SacParser::getNextToken()
mTokenType = eCIFValueFloat; mTokenType = eCIFValueFloat;
} }
else else
restart(); state = start = restart(start);
break; break;
// parsed 'e' // parsed 'e'
case eStateFloat + 3: case eStateFloat + 3:
if (ch == '-' or ch == '+') if (ch == '-' or ch == '+')
mState = eStateFloat + 4; state = eStateFloat + 4;
else if (isdigit(ch)) else if (isdigit(ch))
mState = eStateFloat + 5; state = eStateFloat + 5;
else else
restart(); state = start = restart(start);
break; break;
case eStateFloat + 4: case eStateFloat + 4:
if (isdigit(ch)) if (isdigit(ch))
mState = eStateFloat + 5; state = eStateFloat + 5;
else else
restart(); state = start = restart(start);
break; break;
case eStateFloat + 5: case eStateFloat + 5:
// if (ch == '(')
// mState = eStateNumericSuffix;
// else
if (isWhite(ch) or ch == kEOF) if (isWhite(ch) or ch == kEOF)
{ {
retract(); retract();
...@@ -420,14 +390,14 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -420,14 +390,14 @@ SacParser::CIFToken SacParser::getNextToken()
mTokenType = eCIFValueFloat; mTokenType = eCIFValueFloat;
} }
else else
restart(); state = start = restart(start);
break; break;
case eStateInt: case eStateInt:
if (isdigit(ch) or ch == '+' or ch == '-') if (isdigit(ch) or ch == '+' or ch == '-')
mState = eStateInt + 1; state = eStateInt + 1;
else else
restart(); state = start = restart(start);
break; break;
case eStateInt + 1: case eStateInt + 1:
...@@ -438,35 +408,11 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -438,35 +408,11 @@ SacParser::CIFToken SacParser::getNextToken()
mTokenType = eCIFValueInt; mTokenType = eCIFValueInt;
} }
else else
restart(); state = start = restart(start);
break; break;
// case eStateNumericSuffix:
// if (isdigit(ch))
// mState = eStateNumericSuffix + 1;
// else
// restart();
// break;
//
// case eStateNumericSuffix + 1:
// if (ch == ')')
// {
// result = eCIFTokenValue;
// mTokenType = eCIFValueNumeric;
// }
// else if (not isdigit(ch))
// restart();
// break;
case eStateValue: case eStateValue:
if (isNonBlank(ch)) if (ch == '_')
mState = eStateValue + 1;
else
error("invalid character at this position");
break;
case eStateValue + 1:
if (ch == '_') // first _, check for keywords
{ {
std::string s = toLowerCopy(mTokenValue); std::string s = toLowerCopy(mTokenValue);
...@@ -476,23 +422,40 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -476,23 +422,40 @@ SacParser::CIFToken SacParser::getNextToken()
result = eCIFTokenSTOP; result = eCIFTokenSTOP;
else if (s == "loop_") else if (s == "loop_")
result = eCIFTokenLOOP; result = eCIFTokenLOOP;
else if (s == "data_" or s == "save_") else if (s == "data_")
mState = eStateValue + 2; {
state = eStateDATA;
continue;
} }
else if (not isNonBlank(ch)) else if (s == "save_")
{
state = eStateSAVE;
continue;
}
}
if (result == eCIFTokenUnknown and not isNonBlank(ch))
{ {
retract(); retract();
result = eCIFTokenValue; result = eCIFTokenValue;
mTokenType = eCIFValueString;
if (mTokenValue == ".")
mTokenType = eCIFValueInapplicable;
else if (mTokenValue == "?")
{
mTokenType = eCIFValueUnknown;
mTokenValue.clear();
}
} }
break; break;
case eStateValue + 2: case eStateDATA:
case eStateSAVE:
if (not isNonBlank(ch)) if (not isNonBlank(ch))
{ {
retract(); retract();
if (tolower(mTokenValue[0]) == 'd') if (state == eStateDATA)
result = eCIFTokenDATA; result = eCIFTokenDATA;
else else
result = eCIFTokenSAVE; result = eCIFTokenSAVE;
...@@ -521,6 +484,7 @@ SacParser::CIFToken SacParser::getNextToken() ...@@ -521,6 +484,7 @@ SacParser::CIFToken SacParser::getNextToken()
return result; return result;
} }
DatablockIndex SacParser::indexDatablocks() DatablockIndex SacParser::indexDatablocks()
{ {
DatablockIndex index; DatablockIndex index;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment