Commit 32f4749d by Maarten L. Hekkelman

faster cif parser

parent da12be87
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
cmake_minimum_required(VERSION 3.16) cmake_minimum_required(VERSION 3.16)
# set the project name # set the project name
project(cifpp VERSION 5.0.9 LANGUAGES CXX) project(cifpp VERSION 5.0.10 LANGUAGES CXX)
list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
......
Version 5.0.10
- Fix in progress_bar, was using too much CPU
- Optimised mmCIF parser
Version 5.0.9 Version 5.0.9
- Fix in dihedral angle calculations - Fix in dihedral angle calculations
- Added create_water to model - Added create_water to model
......
...@@ -63,9 +63,14 @@ class sac_parser ...@@ -63,9 +63,14 @@ class sac_parser
kAnyPrintMask = 1 << 3 kAnyPrintMask = 1 << 3
}; };
static bool is_white(int ch) static constexpr bool is_space(int ch)
{ {
return std::isspace(ch) or ch == '#'; return ch == ' ' or ch == '\t' or ch == '\r' or ch == '\n';
}
static constexpr bool is_white(int ch)
{
return is_space(ch) or ch == '#';
} }
static constexpr bool is_ordinary(int ch) static constexpr bool is_ordinary(int ch)
...@@ -136,15 +141,13 @@ class sac_parser ...@@ -136,15 +141,13 @@ class sac_parser
} }
} }
// get_next_char takes a char from the buffer, or if it is empty // get_next_char takes the next character from the istream.
// from the istream. This function also does carriage/linefeed // This function also does carriage/linefeed translation.
// translation.
int get_next_char(); int get_next_char();
// Put the last read character back into the istream
void retract(); void retract();
int restart(int start);
CIFToken get_next_token(); CIFToken get_next_token();
void match(CIFToken token); void match(CIFToken token);
...@@ -191,7 +194,7 @@ class sac_parser ...@@ -191,7 +194,7 @@ class sac_parser
protected: protected:
enum State enum class State
{ {
Start, Start,
White, White,
...@@ -204,9 +207,8 @@ class sac_parser ...@@ -204,9 +207,8 @@ class sac_parser
UnquotedString, UnquotedString,
Tag, Tag,
TextField, TextField,
Float = 100, TextFieldNL,
Int = 110, Reserved,
Reserved = 300,
Value Value
}; };
...@@ -217,11 +219,6 @@ class sac_parser ...@@ -217,11 +219,6 @@ class sac_parser
bool m_bol; bool m_bol;
CIFToken m_lookahead; CIFToken m_lookahead;
static constexpr size_t kRetractBufferSize = 128;
int m_retract_buffer[kRetractBufferSize];
int *m_retract_buffer_ptr = m_retract_buffer;
// token buffer // token buffer
std::vector<char> m_token_buffer; std::vector<char> m_token_buffer;
std::string_view m_token_value; std::string_view m_token_value;
......
...@@ -228,8 +228,9 @@ class validator_factory ...@@ -228,8 +228,9 @@ class validator_factory
const validator &operator[](std::string_view dictionary_name); const validator &operator[](std::string_view dictionary_name);
const validator &construct_validator(std::string_view name, std::istream &is);
private: private:
void construct_validator(std::string_view name, std::istream &is);
// -------------------------------------------------------------------- // --------------------------------------------------------------------
......
...@@ -222,29 +222,25 @@ bool sac_parser::is_unquoted_string(std::string_view text) ...@@ -222,29 +222,25 @@ bool sac_parser::is_unquoted_string(std::string_view text)
// translation. // translation.
int sac_parser::get_next_char() int sac_parser::get_next_char()
{ {
int result; int result = m_source.sbumpc();
if (m_retract_buffer_ptr == m_retract_buffer) if (result == std::char_traits<char>::eof())
result = m_source.sbumpc(); m_token_buffer.push_back(0);
else else
result = *--m_retract_buffer_ptr; {
// very simple CR/LF translation into LF
if (result == '\r') if (result == '\r')
{ {
int lookahead = m_source.sbumpc(); if (m_source.sgetc() == '\n')
if (lookahead != '\n') m_source.sbumpc();
*m_retract_buffer_ptr++ = lookahead;
++m_line_nr;
result = '\n'; result = '\n';
} }
else if (result == '\n')
++m_line_nr;
if (result == std::char_traits<char>::eof())
m_token_buffer.push_back(0);
else
m_token_buffer.push_back(std::char_traits<char>::to_char_type(result)); m_token_buffer.push_back(std::char_traits<char>::to_char_type(result));
}
if (result == '\n')
++m_line_nr;
return result; return result;
} }
...@@ -257,45 +253,16 @@ void sac_parser::retract() ...@@ -257,45 +253,16 @@ void sac_parser::retract()
if (ch == '\n') if (ch == '\n')
--m_line_nr; --m_line_nr;
if (m_retract_buffer_ptr == m_retract_buffer + kRetractBufferSize) if (ch != 0)
throw cif::parse_error(m_line_nr, "Buffer overflow");
*m_retract_buffer_ptr++ = ch == 0 ? std::char_traits<char>::eof() : std::char_traits<char>::to_int_type(ch);
m_token_buffer.pop_back();
}
int sac_parser::restart(int start)
{
int result = 0;
while (not m_token_buffer.empty())
retract();
switch (start)
{ {
case State::Start: // since we always putback at most a single character,
result = State::Float; // the test below should never fail.
break;
case State::Float:
result = State::Int;
break;
case State::Int:
result = State::Value;
break;
case State::Reserved: if (m_source.sputbackc(ch) == std::char_traits<char>::eof())
result = State::Value; throw std::runtime_error("putback failure");
break;
default:
error("Invalid state in SacParser");
} }
m_bol = false; m_token_buffer.pop_back();
return result;
} }
sac_parser::CIFToken sac_parser::get_next_token() sac_parser::CIFToken sac_parser::get_next_token()
...@@ -304,7 +271,7 @@ sac_parser::CIFToken sac_parser::get_next_token() ...@@ -304,7 +271,7 @@ sac_parser::CIFToken sac_parser::get_next_token()
CIFToken result = CIFToken::Unknown; CIFToken result = CIFToken::Unknown;
int quoteChar = 0; int quoteChar = 0;
int state = State::Start, start = State::Start; State state = State::Start;
m_bol = false; m_bol = false;
m_token_buffer.clear(); m_token_buffer.clear();
...@@ -344,13 +311,13 @@ sac_parser::CIFToken sac_parser::get_next_token() ...@@ -344,13 +311,13 @@ sac_parser::CIFToken sac_parser::get_next_token()
else if (dag.move(ch) == reserved_words_automaton::undefined) else if (dag.move(ch) == reserved_words_automaton::undefined)
state = State::Reserved; state = State::Reserved;
else else
state = start = restart(start); state = State::Value;
break; break;
case State::White: case State::White:
if (ch == kEOF) if (ch == kEOF)
result = CIFToken::Eof; result = CIFToken::Eof;
else if (not isspace(ch)) else if (not is_space(ch))
{ {
state = State::Start; state = State::Start;
retract(); retract();
...@@ -380,19 +347,19 @@ sac_parser::CIFToken sac_parser::get_next_token() ...@@ -380,19 +347,19 @@ sac_parser::CIFToken sac_parser::get_next_token()
result = CIFToken::Value; result = CIFToken::Value;
} }
else else
state = start = restart(start); state = State::Value;
break; break;
case State::TextField: case State::TextField:
if (ch == '\n') if (ch == '\n')
state = State::TextField + 1; state = State::TextFieldNL;
else if (ch == kEOF) else if (ch == kEOF)
error("unterminated textfield"); error("unterminated textfield");
else if (not is_any_print(ch) and cif::VERBOSE > 2) else if (not is_any_print(ch) and cif::VERBOSE > 2)
warning("invalid character in text field '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")"); warning("invalid character in text field '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
break; break;
case State::TextField + 1: case State::TextFieldNL:
if (is_text_lead(ch) or ch == ' ' or ch == '\t') if (is_text_lead(ch) or ch == ' ' or ch == '\t')
state = State::TextField; state = State::TextField;
else if (ch == ';') else if (ch == ';')
...@@ -445,98 +412,21 @@ sac_parser::CIFToken sac_parser::get_next_token() ...@@ -445,98 +412,21 @@ sac_parser::CIFToken sac_parser::get_next_token()
} }
break; break;
case State::Float: case State::Reserved:
if (ch == '+' or ch == '-') switch (dag.move(ch))
state = State::Float + 1;
else if ((ch >= '0' and ch <= '9'))
state = State::Float + 1;
else
state = start = restart(start);
break;
case State::Float + 1:
if (ch == '.')
state = State::Float + 2;
else if ((ch & ~0x20) == 'E')
state = State::Float + 3;
else if (is_white(ch) or ch == kEOF)
{
retract();
result = CIFToken::Value;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
else
state = start = restart(start);
break;
// parsed '.'
case State::Float + 2:
if ((ch & ~0x20) == 'E')
state = State::Float + 3;
else if (is_white(ch) or ch == kEOF)
{
retract();
result = CIFToken::Value;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
else
state = start = restart(start);
break;
// parsed 'e'
case State::Float + 3:
if (ch == '-' or ch == '+')
state = State::Float + 4;
else if ((ch >= '0' and ch <= '9'))
state = State::Float + 5;
else
state = start = restart(start);
break;
case State::Float + 4:
if ((ch >= '0' and ch <= '9'))
state = State::Float + 5;
else
state = start = restart(start);
break;
case State::Float + 5:
if (is_white(ch) or ch == kEOF)
{ {
retract(); case reserved_words_automaton::undefined:
result = CIFToken::Value;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
else
state = start = restart(start);
break;
case State::Int:
if ((ch >= '0' and ch <= '9') or ch == '+' or ch == '-')
state = State::Int + 1;
else
state = start = restart(start);
break; break;
case State::Int + 1: case reserved_words_automaton::no_keyword:
if (is_white(ch) or ch == kEOF) if (not is_non_blank(ch))
{ {
retract(); retract();
result = CIFToken::Value; result = CIFToken::Value;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size()); m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
} }
else else
state = start = restart(start); state = State::Value;
break;
case State::Reserved:
switch (dag.move(ch))
{
case reserved_words_automaton::undefined:
break;
case reserved_words_automaton::no_keyword:
state = start = restart(start);
break; break;
case reserved_words_automaton::data: case reserved_words_automaton::data:
...@@ -664,7 +554,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock) ...@@ -664,7 +554,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock)
break; break;
case string_quote: case string_quote:
if (std::isspace(ch)) if (is_space(ch))
state = start; state = start;
else else
state = string; state = string;
...@@ -676,7 +566,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock) ...@@ -676,7 +566,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock)
break; break;
case data: case data:
if (isspace(ch) and dblk[si] == 0) if (is_space(ch) and dblk[si] == 0)
found = true; found = true;
else if (dblk[si++] != ch) else if (dblk[si++] != ch)
state = start; state = start;
...@@ -754,7 +644,7 @@ sac_parser::datablock_index sac_parser::index_datablocks() ...@@ -754,7 +644,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
break; break;
case string_quote: case string_quote:
if (std::isspace(ch)) if (is_space(ch))
state = start; state = start;
else else
state = string; state = string;
...@@ -778,7 +668,7 @@ sac_parser::datablock_index sac_parser::index_datablocks() ...@@ -778,7 +668,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
case data_name: case data_name:
if (is_non_blank(ch)) if (is_non_blank(ch))
datablock.insert(datablock.end(), char(ch)); datablock.insert(datablock.end(), char(ch));
else if (isspace(ch)) else if (is_space(ch))
{ {
if (not datablock.empty()) if (not datablock.empty())
index[datablock] = m_source.pubseekoff(0, std::ios_base::cur, std::ios_base::in); index[datablock] = m_source.pubseekoff(0, std::ios_base::cur, std::ios_base::in);
......
...@@ -491,9 +491,9 @@ const validator &validator_factory::operator[](std::string_view dictionary_name) ...@@ -491,9 +491,9 @@ const validator &validator_factory::operator[](std::string_view dictionary_name)
} }
} }
void validator_factory::construct_validator(std::string_view name, std::istream &is) const validator &validator_factory::construct_validator(std::string_view name, std::istream &is)
{ {
m_validators.emplace_back(parse_dictionary(name, is)); return m_validators.emplace_back(parse_dictionary(name, is));
} }
} // namespace cif } // namespace cif
#include <iostream>

#include <cif++.hpp>
/// Minimal cif::sac_parser subclass whose callbacks do nothing.
///
/// Every produce_* override below has an empty body, so whatever the base
/// class reports through them is simply dropped. The parameter names are
/// commented out to avoid unused-parameter warnings.
class dummy_parser : public cif::sac_parser
{
  public:
	/// Construct a parser reading from @a is (forwarded to cif::sac_parser).
	/// Explicit, so a std::istream is never silently converted into a parser.
	explicit dummy_parser(std::istream &is)
		: sac_parser(is)
	{
	}

	/// Ignores the reported datablock name.
	void produce_datablock(std::string_view /*name*/) override
	{
	}

	/// Ignores the reported category name.
	void produce_category(std::string_view /*name*/) override
	{
	}

	/// Ignores the row notification.
	void produce_row() override
	{
	}

	/// Ignores the reported category / item / value triple.
	void produce_item(std::string_view /*category*/, std::string_view /*item*/, std::string_view /*value*/) override
	{
	}
};
int main()
{
cif::gzio::ifstream in("/srv/data/pdb/mmCIF/gl/8glv.cif.gz");
dummy_parser parser(in);
parser.parse_file();
// cif::file f("/srv/data/pdb/mmCIF/gl/8glv.cif.gz");
return 0;
}
\ No newline at end of file
...@@ -2861,7 +2861,7 @@ save__cat_1.name ...@@ -2861,7 +2861,7 @@ save__cat_1.name
std::istream is_dict(&buffer); std::istream is_dict(&buffer);
auto validator = cif::parse_dictionary("test_dict.dic", is_dict); auto &validator = cif::validator_factory::instance().construct_validator("test_dict.dic", is_dict);
cif::file f; cif::file f;
f.set_validator(&validator); f.set_validator(&validator);
...@@ -2899,8 +2899,6 @@ _cat_1.name ...@@ -2899,8 +2899,6 @@ _cat_1.name
ss << f; ss << f;
cif::file f2(ss); cif::file f2(ss);
f2.set_validator(&validator);
BOOST_ASSERT(f2.is_valid()); BOOST_ASSERT(f2.is_valid());
auto &audit_conform = f2.front()["audit_conform"]; auto &audit_conform = f2.front()["audit_conform"];
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment