faster cif parser

32f4749d · Maarten L. Hekkelman · da12be87 · 32f4749d · 32f4749d · 32f4749d
Commit 32f4749d authored Jun 07, 2023 by Maarten L. Hekkelman
Showing with 101 additions and 171 deletions

CMakeLists.txt
+1 -1

changelog
+4 -0

include/cif++/parser.hpp
+13 -16

include/cif++/validate.hpp
+2 -1

src/parser.cpp
+38 -148

src/validate.cpp
+2 -2

test/io-test.cpp
+40 -0

test/unit-v2-test.cpp
+1 -3

No files found.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,7 @@
 cmake_minimum_required(VERSION 3.16)

 # set the project name
-project(cifpp VERSION 5.0.9 LANGUAGES CXX)
+project(cifpp VERSION 5.0.10 LANGUAGES CXX)

 list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")


--- a/changelog
+++ b/changelog
+Version 5.0.10
+- Fix in progress_bar, was using too much CPU
+- Optimised mmCIF parser
+
 Version 5.0.9
 - Fix in dihedral angle calculations
 - Added create_water to model

--- a/include/cif++/parser.hpp
+++ b/include/cif++/parser.hpp
@@ -63,9 +63,14 @@ class sac_parser
 		kAnyPrintMask = 1 << 3
 	};

-	static bool is_white(int ch)
+	static constexpr bool is_space(int ch) // true only for space, tab, CR and LF
 	{
-		return std::isspace(ch) or ch == '#';
+		return ch == ' ' or ch == '\t' or ch == '\r' or ch == '\n'; // narrower than the std::isspace call it replaces: '\v' and '\f' are not accepted
+	}
+
+	static constexpr bool is_white(int ch) // is_space() characters plus '#'
+	{
+		return is_space(ch) or ch == '#';
 	}

 	static constexpr bool is_ordinary(int ch)
@@ -136,15 +141,13 @@ class sac_parser
 		}
 	}

-	// get_next_char takes a char from the buffer, or if it is empty
-	// from the istream. This function also does carriage/linefeed
-	// translation.
+	// get_next_char takes the next character from the istream.
+	// This function also does carriage/linefeed translation.
 	int get_next_char();

+	// Put the last read character back into the istream
 	void retract();

-	int restart(int start);
-
 	CIFToken get_next_token();

 	void match(CIFToken token);
@@ -191,7 +194,7 @@ class sac_parser

  protected:

-	enum State
+	enum class State
 	{
 		Start,
 		White,
@@ -204,9 +207,8 @@ class sac_parser
 		UnquotedString,
 		Tag,
 		TextField,
-		Float = 100,
-		Int = 110,
-		Reserved = 300,
+		TextFieldNL,
+		Reserved,
 		Value
 	};

@@ -217,11 +219,6 @@ class sac_parser
 	bool m_bol;
 	CIFToken m_lookahead;

-	static constexpr size_t kRetractBufferSize = 128;
-
-	int m_retract_buffer[kRetractBufferSize];
-	int *m_retract_buffer_ptr = m_retract_buffer;
-
 	// token buffer
 	std::vector<char> m_token_buffer;
 	std::string_view m_token_value;

--- a/include/cif++/validate.hpp
+++ b/include/cif++/validate.hpp
@@ -228,8 +228,9 @@ class validator_factory

 	const validator &operator[](std::string_view dictionary_name);

+	const validator &construct_validator(std::string_view name, std::istream &is);
+
  private:
-	void construct_validator(std::string_view name, std::istream &is);

 	// --------------------------------------------------------------------


--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -222,29 +222,25 @@ bool sac_parser::is_unquoted_string(std::string_view text)
 // translation.
 int sac_parser::get_next_char()
 {
-	int result;
-
-	if (m_retract_buffer_ptr == m_retract_buffer)
-		result = m_source.sbumpc();
-	else
-		result = *--m_retract_buffer_ptr;
-
-	// very simple CR/LF translation into LF
-	if (result == '\r')
-	{
-		int lookahead = m_source.sbumpc();
-		if (lookahead != '\n')
-			*m_retract_buffer_ptr++ = lookahead;
-		result = '\n';
-	}
+	int result = m_source.sbumpc(); // read one character straight from m_source

 	if (result == std::char_traits<char>::eof())
 		m_token_buffer.push_back(0);
 	else
-		m_token_buffer.push_back(std::char_traits<char>::to_char_type(result));
+	{
+		if (result == '\r') // a CR, with or without a following LF, is returned as a single '\n'
+		{
+			if (m_source.sgetc() == '\n') // peek without consuming; swallow the LF only when it directly follows the CR
+				m_source.sbumpc();

-	if (result == '\n')
-		++m_line_nr;
+			++m_line_nr;
+			result = '\n'; // NOTE(review): for a lone CR the token buffer gets '\n' while the stream held '\r' - confirm retract()'s sputbackc() copes with that mismatch
+		}
+		else if (result == '\n')
+			++m_line_nr;
+		
+		m_token_buffer.push_back(std::char_traits<char>::to_char_type(result)); // stores the translated character, so a CR is never placed in the token buffer
+	}

 	return result;
 }
@@ -257,45 +253,16 @@ void sac_parser::retract()
 	if (ch == '\n')
 		--m_line_nr;

-	if (m_retract_buffer_ptr == m_retract_buffer + kRetractBufferSize)
-		throw cif::parse_error(m_line_nr, "Buffer overflow");
-
-	*m_retract_buffer_ptr++ = ch == 0 ? std::char_traits<char>::eof() : std::char_traits<char>::to_int_type(ch);
-	m_token_buffer.pop_back();
-}
-
-int sac_parser::restart(int start)
-{
-	int result = 0;
-
-	while (not m_token_buffer.empty())
-		retract();
-
-	switch (start)
+	if (ch != 0)
 	{
-		case State::Start:
-			result = State::Float;
-			break;
-
-		case State::Float:
-			result = State::Int;
-			break;
+		// since we always put back at most a single character,
+		// the test below should never fail.

-		case State::Int:
-			result = State::Value;
-			break;
-		
-		case State::Reserved:
-			result = State::Value;
-			break;
-
-		default:
-			error("Invalid state in SacParser");
+		if (m_source.sputbackc(ch) == std::char_traits<char>::eof())
+			throw std::runtime_error("putback failure");
 	}

-	m_bol = false;
-
-	return result;
+	m_token_buffer.pop_back();
 }

 sac_parser::CIFToken sac_parser::get_next_token()
@@ -304,7 +271,7 @@ sac_parser::CIFToken sac_parser::get_next_token()

 	CIFToken result = CIFToken::Unknown;
 	int quoteChar = 0;
-	int state = State::Start, start = State::Start;
+	State state = State::Start;
 	m_bol = false;

 	m_token_buffer.clear();
@@ -344,13 +311,13 @@ sac_parser::CIFToken sac_parser::get_next_token()
 				else if (dag.move(ch) == reserved_words_automaton::undefined)
 					state = State::Reserved;
 				else
-					state = start = restart(start);
+					state = State::Value;
 				break;

 			case State::White:
 				if (ch == kEOF)
 					result = CIFToken::Eof;
-				else if (not isspace(ch))
+				else if (not is_space(ch))
 				{
 					state = State::Start;
 					retract();
@@ -380,19 +347,19 @@ sac_parser::CIFToken sac_parser::get_next_token()
 					result = CIFToken::Value;
 				}
 				else
-					state = start = restart(start);
+					state = State::Value;
 				break;

 			case State::TextField:
 				if (ch == '\n')
-					state = State::TextField + 1;
+					state = State::TextFieldNL;
 				else if (ch == kEOF)
 					error("unterminated textfield");
 				else if (not is_any_print(ch) and cif::VERBOSE > 2)
 					warning("invalid character in text field '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
 				break;

-			case State::TextField + 1:
+			case State::TextFieldNL:
 				if (is_text_lead(ch) or ch == ' ' or ch == '\t')
 					state = State::TextField;
 				else if (ch == ';')
@@ -445,90 +412,6 @@ sac_parser::CIFToken sac_parser::get_next_token()
 				}
 				break;

-			case State::Float:
-				if (ch == '+' or ch == '-')
-					state = State::Float + 1;
-				else if ((ch >= '0' and ch <= '9'))
-					state = State::Float + 1;
-				else
-					state = start = restart(start);
-				break;
-
-			case State::Float + 1:
-				if (ch == '.')
-					state = State::Float + 2;
-				else if ((ch & ~0x20) == 'E')
-					state = State::Float + 3;
-				else if (is_white(ch) or ch == kEOF)
-				{
-					retract();
-					result = CIFToken::Value;
-					m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
-				}
-				else
-					state = start = restart(start);
-				break;
-
-			// parsed '.'
-			case State::Float + 2:
-				if ((ch & ~0x20) == 'E')
-					state = State::Float + 3;
-				else if (is_white(ch) or ch == kEOF)
-				{
-					retract();
-					result = CIFToken::Value;
-					m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
-				}
-				else
-					state = start = restart(start);
-				break;
-
-			// parsed 'e'
-			case State::Float + 3:
-				if (ch == '-' or ch == '+')
-					state = State::Float + 4;
-				else if ((ch >= '0' and ch <= '9'))
-					state = State::Float + 5;
-				else
-					state = start = restart(start);
-				break;
-
-			case State::Float + 4:
-				if ((ch >= '0' and ch <= '9'))
-					state = State::Float + 5;
-				else
-					state = start = restart(start);
-				break;
-
-			case State::Float + 5:
-				if (is_white(ch) or ch == kEOF)
-				{
-					retract();
-					result = CIFToken::Value;
-					m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
-				}
-				else
-					state = start = restart(start);
-				break;
-
-			case State::Int:
-				if ((ch >= '0' and ch <= '9') or ch == '+' or ch == '-')
-					state = State::Int + 1;
-				else
-					state = start = restart(start);
-				break;
-
-			case State::Int + 1:
-				if (is_white(ch) or ch == kEOF)
-				{
-					retract();
-					result = CIFToken::Value;
-					m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
-				}
-				else
-					state = start = restart(start);
-				break;
-
 			case State::Reserved:
 				switch (dag.move(ch))
 				{
@@ -536,7 +419,14 @@ sac_parser::CIFToken sac_parser::get_next_token()
 						break;

 					case reserved_words_automaton::no_keyword:
-						state = start = restart(start);
+						if (not is_non_blank(ch))
+						{
+							retract();
+							result = CIFToken::Value;
+							m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
+						}
+						else
+							state = State::Value;
 						break;

 					case reserved_words_automaton::data:
@@ -664,7 +554,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock)
 				break;

 			case string_quote:
-				if (std::isspace(ch))
+				if (is_space(ch))
 					state = start;
 				else
 					state = string;
@@ -676,7 +566,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock)
 				break;

 			case data:
-				if (isspace(ch) and dblk[si] == 0)
+				if (is_space(ch) and dblk[si] == 0)
 					found = true;
 				else if (dblk[si++] != ch)
 					state = start;
@@ -754,7 +644,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
 				break;

 			case string_quote:
-				if (std::isspace(ch))
+				if (is_space(ch))
 					state = start;
 				else
 					state = string;
@@ -778,7 +668,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
 			case data_name:
 				if (is_non_blank(ch))
 					datablock.insert(datablock.end(), char(ch));
-				else if (isspace(ch))
+				else if (is_space(ch))
 				{
 					if (not datablock.empty())
 						index[datablock] = m_source.pubseekoff(0, std::ios_base::cur, std::ios_base::in);

--- a/src/validate.cpp
+++ b/src/validate.cpp
@@ -491,9 +491,9 @@ const validator &validator_factory::operator[](std::string_view dictionary_name)
 	}
 }

-void validator_factory::construct_validator(std::string_view name, std::istream &is)
+const validator &validator_factory::construct_validator(std::string_view name, std::istream &is) // parses a dictionary from `is`, stores it in m_validators and returns a reference to the stored validator
 {
-	m_validators.emplace_back(parse_dictionary(name, is));
+	return m_validators.emplace_back(parse_dictionary(name, is)); // NOTE(review): the returned reference is only as stable as m_validators' element storage - confirm the container type does not relocate on growth
 }

 } // namespace cif
--- a/test/io-test.cpp
+++ b/test/io-test.cpp
+#include <cif++.hpp>
+
+class dummy_parser : public cif::sac_parser // sac_parser subclass whose produce_* callbacks all have empty bodies
+{
+  public:
+	dummy_parser(std::istream &is) // forwards the input stream to the sac_parser constructor
+		: sac_parser(is)
+	{
+	}
+
+	void produce_datablock(std::string_view name) override // intentionally empty: the argument is ignored
+	{
+	}
+
+	void produce_category(std::string_view name) override // intentionally empty: the argument is ignored
+	{
+	}
+
+	void produce_row() override // intentionally empty
+	{
+	}
+
+	void produce_item(std::string_view category, std::string_view item, std::string_view value) override // intentionally empty: all arguments are ignored
+	{
+	}
+};
+
+
+int main() // runs dummy_parser::parse_file() over one hard-coded input file; always returns 0
+{
+	cif::gzio::ifstream in("/srv/data/pdb/mmCIF/gl/8glv.cif.gz"); // NOTE(review): absolute, machine-specific path, and the stream is not checked before use - confirm this is meant as a local benchmark only
+
+	dummy_parser parser(in);
+	parser.parse_file(); // any result is discarded here; the dummy_parser callbacks do nothing
+
+	// cif::file f("/srv/data/pdb/mmCIF/gl/8glv.cif.gz");
+
+	return 0;
+}
\ No newline at end of file
--- a/test/unit-v2-test.cpp
+++ b/test/unit-v2-test.cpp
@@ -2861,7 +2861,7 @@ save__cat_1.name

 	std::istream is_dict(&buffer);

-	auto validator = cif::parse_dictionary("test_dict.dic", is_dict);
+	auto &validator = cif::validator_factory::instance().construct_validator("test_dict.dic", is_dict);

 	cif::file f;
 	f.set_validator(&validator);
@@ -2899,8 +2899,6 @@ _cat_1.name
 	ss << f;

 	cif::file f2(ss);
-
-	f2.set_validator(&validator);
 	BOOST_ASSERT(f2.is_valid());

 	auto &audit_conform = f2.front()["audit_conform"];