Commit 32f4749d by Maarten L. Hekkelman

faster cif parser

parent da12be87
......@@ -25,7 +25,7 @@
cmake_minimum_required(VERSION 3.16)
# set the project name
project(cifpp VERSION 5.0.9 LANGUAGES CXX)
project(cifpp VERSION 5.0.10 LANGUAGES CXX)
list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
......
Version 5.0.10
- Fix in progress_bar, was using too much CPU
- Optimised mmCIF parser
Version 5.0.9
- Fix in dihedral angle calculations
- Added create_water to model
......
......@@ -63,9 +63,14 @@ class sac_parser
kAnyPrintMask = 1 << 3
};
static bool is_white(int ch)
static constexpr bool is_space(int ch)
{
return std::isspace(ch) or ch == '#';
return ch == ' ' or ch == '\t' or ch == '\r' or ch == '\n';
}
// Tells whether ch counts as white for the tokenizer: that is the case
// for the '#' character and for every character accepted by is_space.
static constexpr bool is_white(int ch)
{
    return ch == '#' or is_space(ch);
}
static constexpr bool is_ordinary(int ch)
......@@ -136,15 +141,13 @@ class sac_parser
}
}
// get_next_char takes a char from the buffer, or if it is empty
// from the istream. This function also does carriage/linefeed
// translation.
// get_next_char takes the next character from the istream.
// This function also does carriage/linefeed translation.
int get_next_char();
// Put the last read character back into the istream
void retract();
int restart(int start);
CIFToken get_next_token();
void match(CIFToken token);
......@@ -191,7 +194,7 @@ class sac_parser
protected:
enum State
enum class State
{
Start,
White,
......@@ -204,9 +207,8 @@ class sac_parser
UnquotedString,
Tag,
TextField,
Float = 100,
Int = 110,
Reserved = 300,
TextFieldNL,
Reserved,
Value
};
......@@ -217,11 +219,6 @@ class sac_parser
bool m_bol;
CIFToken m_lookahead;
static constexpr size_t kRetractBufferSize = 128;
int m_retract_buffer[kRetractBufferSize];
int *m_retract_buffer_ptr = m_retract_buffer;
// token buffer
std::vector<char> m_token_buffer;
std::string_view m_token_value;
......
......@@ -228,8 +228,9 @@ class validator_factory
const validator &operator[](std::string_view dictionary_name);
const validator &construct_validator(std::string_view name, std::istream &is);
private:
void construct_validator(std::string_view name, std::istream &is);
// --------------------------------------------------------------------
......
......@@ -222,29 +222,25 @@ bool sac_parser::is_unquoted_string(std::string_view text)
// translation.
int sac_parser::get_next_char()
{
int result;
if (m_retract_buffer_ptr == m_retract_buffer)
result = m_source.sbumpc();
else
result = *--m_retract_buffer_ptr;
// very simple CR/LF translation into LF
if (result == '\r')
{
int lookahead = m_source.sbumpc();
if (lookahead != '\n')
*m_retract_buffer_ptr++ = lookahead;
result = '\n';
}
int result = m_source.sbumpc();
if (result == std::char_traits<char>::eof())
m_token_buffer.push_back(0);
else
m_token_buffer.push_back(std::char_traits<char>::to_char_type(result));
{
if (result == '\r')
{
if (m_source.sgetc() == '\n')
m_source.sbumpc();
if (result == '\n')
++m_line_nr;
++m_line_nr;
result = '\n';
}
else if (result == '\n')
++m_line_nr;
m_token_buffer.push_back(std::char_traits<char>::to_char_type(result));
}
return result;
}
......@@ -257,45 +253,16 @@ void sac_parser::retract()
if (ch == '\n')
--m_line_nr;
if (m_retract_buffer_ptr == m_retract_buffer + kRetractBufferSize)
throw cif::parse_error(m_line_nr, "Buffer overflow");
*m_retract_buffer_ptr++ = ch == 0 ? std::char_traits<char>::eof() : std::char_traits<char>::to_int_type(ch);
m_token_buffer.pop_back();
}
int sac_parser::restart(int start)
{
int result = 0;
while (not m_token_buffer.empty())
retract();
switch (start)
if (ch != 0)
{
case State::Start:
result = State::Float;
break;
case State::Float:
result = State::Int;
break;
// since we always putback at most a single character,
// the test below should never fail.
case State::Int:
result = State::Value;
break;
case State::Reserved:
result = State::Value;
break;
default:
error("Invalid state in SacParser");
if (m_source.sputbackc(ch) == std::char_traits<char>::eof())
throw std::runtime_error("putback failure");
}
m_bol = false;
return result;
m_token_buffer.pop_back();
}
sac_parser::CIFToken sac_parser::get_next_token()
......@@ -304,7 +271,7 @@ sac_parser::CIFToken sac_parser::get_next_token()
CIFToken result = CIFToken::Unknown;
int quoteChar = 0;
int state = State::Start, start = State::Start;
State state = State::Start;
m_bol = false;
m_token_buffer.clear();
......@@ -344,13 +311,13 @@ sac_parser::CIFToken sac_parser::get_next_token()
else if (dag.move(ch) == reserved_words_automaton::undefined)
state = State::Reserved;
else
state = start = restart(start);
state = State::Value;
break;
case State::White:
if (ch == kEOF)
result = CIFToken::Eof;
else if (not isspace(ch))
else if (not is_space(ch))
{
state = State::Start;
retract();
......@@ -380,19 +347,19 @@ sac_parser::CIFToken sac_parser::get_next_token()
result = CIFToken::Value;
}
else
state = start = restart(start);
state = State::Value;
break;
case State::TextField:
if (ch == '\n')
state = State::TextField + 1;
state = State::TextFieldNL;
else if (ch == kEOF)
error("unterminated textfield");
else if (not is_any_print(ch) and cif::VERBOSE > 2)
warning("invalid character in text field '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
break;
case State::TextField + 1:
case State::TextFieldNL:
if (is_text_lead(ch) or ch == ' ' or ch == '\t')
state = State::TextField;
else if (ch == ';')
......@@ -445,90 +412,6 @@ sac_parser::CIFToken sac_parser::get_next_token()
}
break;
case State::Float:
if (ch == '+' or ch == '-')
state = State::Float + 1;
else if ((ch >= '0' and ch <= '9'))
state = State::Float + 1;
else
state = start = restart(start);
break;
case State::Float + 1:
if (ch == '.')
state = State::Float + 2;
else if ((ch & ~0x20) == 'E')
state = State::Float + 3;
else if (is_white(ch) or ch == kEOF)
{
retract();
result = CIFToken::Value;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
else
state = start = restart(start);
break;
// parsed '.'
case State::Float + 2:
if ((ch & ~0x20) == 'E')
state = State::Float + 3;
else if (is_white(ch) or ch == kEOF)
{
retract();
result = CIFToken::Value;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
else
state = start = restart(start);
break;
// parsed 'e'
case State::Float + 3:
if (ch == '-' or ch == '+')
state = State::Float + 4;
else if ((ch >= '0' and ch <= '9'))
state = State::Float + 5;
else
state = start = restart(start);
break;
case State::Float + 4:
if ((ch >= '0' and ch <= '9'))
state = State::Float + 5;
else
state = start = restart(start);
break;
case State::Float + 5:
if (is_white(ch) or ch == kEOF)
{
retract();
result = CIFToken::Value;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
else
state = start = restart(start);
break;
case State::Int:
if ((ch >= '0' and ch <= '9') or ch == '+' or ch == '-')
state = State::Int + 1;
else
state = start = restart(start);
break;
case State::Int + 1:
if (is_white(ch) or ch == kEOF)
{
retract();
result = CIFToken::Value;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
else
state = start = restart(start);
break;
case State::Reserved:
switch (dag.move(ch))
{
......@@ -536,7 +419,14 @@ sac_parser::CIFToken sac_parser::get_next_token()
break;
case reserved_words_automaton::no_keyword:
state = start = restart(start);
if (not is_non_blank(ch))
{
retract();
result = CIFToken::Value;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
else
state = State::Value;
break;
case reserved_words_automaton::data:
......@@ -664,7 +554,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock)
break;
case string_quote:
if (std::isspace(ch))
if (is_space(ch))
state = start;
else
state = string;
......@@ -676,7 +566,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock)
break;
case data:
if (isspace(ch) and dblk[si] == 0)
if (is_space(ch) and dblk[si] == 0)
found = true;
else if (dblk[si++] != ch)
state = start;
......@@ -754,7 +644,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
break;
case string_quote:
if (std::isspace(ch))
if (is_space(ch))
state = start;
else
state = string;
......@@ -778,7 +668,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
case data_name:
if (is_non_blank(ch))
datablock.insert(datablock.end(), char(ch));
else if (isspace(ch))
else if (is_space(ch))
{
if (not datablock.empty())
index[datablock] = m_source.pubseekoff(0, std::ios_base::cur, std::ios_base::in);
......
......@@ -491,9 +491,9 @@ const validator &validator_factory::operator[](std::string_view dictionary_name)
}
}
void validator_factory::construct_validator(std::string_view name, std::istream &is)
const validator &validator_factory::construct_validator(std::string_view name, std::istream &is)
{
m_validators.emplace_back(parse_dictionary(name, is));
return m_validators.emplace_back(parse_dictionary(name, is));
}
} // namespace cif
#include <cif++.hpp>

#include <exception>
#include <iostream>
// A cif::sac_parser whose produce_* callbacks all do nothing, so running it
// over an input exercises the parser without keeping any of the parsed data.
class dummy_parser final : public cif::sac_parser
{
  public:
    // Hands the input stream to the sac_parser base class.
    // Marked explicit so a stream is never silently converted into a parser.
    explicit dummy_parser(std::istream &is)
        : sac_parser(is)
    {
    }

    // The callbacks below intentionally ignore their arguments. The
    // parameters are left unnamed to avoid unused-parameter warnings.

    void produce_datablock(std::string_view /*name*/) override
    {
    }

    void produce_category(std::string_view /*name*/) override
    {
    }

    void produce_row() override
    {
    }

    void produce_item(std::string_view /*category*/, std::string_view /*item*/, std::string_view /*value*/) override
    {
    }
};
int main()
{
cif::gzio::ifstream in("/srv/data/pdb/mmCIF/gl/8glv.cif.gz");
dummy_parser parser(in);
parser.parse_file();
// cif::file f("/srv/data/pdb/mmCIF/gl/8glv.cif.gz");
return 0;
}
\ No newline at end of file
......@@ -2861,7 +2861,7 @@ save__cat_1.name
std::istream is_dict(&buffer);
auto validator = cif::parse_dictionary("test_dict.dic", is_dict);
auto &validator = cif::validator_factory::instance().construct_validator("test_dict.dic", is_dict);
cif::file f;
f.set_validator(&validator);
......@@ -2899,8 +2899,6 @@ _cat_1.name
ss << f;
cif::file f2(ss);
f2.set_validator(&validator);
BOOST_ASSERT(f2.is_valid());
auto &audit_conform = f2.front()["audit_conform"];
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment