libcifpp / Commits / 82086a93

Unverified commit 82086a93, authored Jan 02, 2024 by Maarten L. Hekkelman
parent abd97cc1

PDBx validation and reconstruction code, take 1

Showing 7 changed files with 1011 additions and 28 deletions
CMakeLists.txt              +3    -1
include/cif++/pdb.hpp       +35   -19
src/pdb/pdb2cif.cpp         +4    -0
src/pdb/reconstruct.cpp     +505  -2
src/pdb/validate-pdbx.cpp   +211  -0
test/test-main.cpp          +5    -6
test/validate-pdbx-test.cpp +248  -0
CMakeLists.txt
View file @ 82086a93

...
@@ -300,6 +300,7 @@ set(project_sources
 	${PROJECT_SOURCE_DIR}/src/pdb/pdb2cif_remark_3.hpp
 	${PROJECT_SOURCE_DIR}/src/pdb/pdb2cif_remark_3.cpp
 	${PROJECT_SOURCE_DIR}/src/pdb/reconstruct.cpp
+	${PROJECT_SOURCE_DIR}/src/pdb/validate-pdbx.cpp
 )

 set(project_headers
...
@@ -562,7 +563,8 @@ if(BUILD_TESTING)
 		model
 		rename-compound
 		sugar
-		spinner)
+		spinner
+		validate-pdbx)

 	foreach(CIFPP_TEST IN LISTS CIFPP_tests)
 		set(CIFPP_TEST "${CIFPP_TEST}-test")
...
include/cif++/pdb.hpp
View file @ 82086a93

 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2023 NKI/AVL, Netherlands Cancer Institute
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice, this
  *    list of conditions and the following disclaimer
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  *    this list of conditions and the following disclaimer in the documentation
  *    and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
...
@@ -30,13 +30,13 @@
 /**
  * @file pdb.hpp
  *
  * This file presents the API to read and write files in the
  * legacy and ancient PDB format.
  *
  * The code works on the basis of best effort since it is
  * impossible to have correct round trip fidelity.
  */

 namespace cif::pdb
...
@@ -81,7 +81,7 @@ inline void write(std::ostream &os, const file &f)
 /** @brief Write out the data in @a db to file @a file
  * in legacy PDB format or mmCIF format, depending on the
  * filename extension.
  *
  * If extension of @a file is *.gz* the resulting file will
  * be written in gzip compressed format.
  */
...
@@ -90,7 +90,7 @@ void write(const std::filesystem::path &file, const datablock &db);
 /** @brief Write out the data in @a f to file @a file
  * in legacy PDB format or mmCIF format, depending on the
  * filename extension.
  *
  * If extension of @a file is *.gz* the resulting file will
  * be written in gzip compressed format.
  */
...
@@ -102,13 +102,30 @@ inline void write(const std::filesystem::path &p, const file &f)
 // --------------------------------------------------------------------

 /** \brief Reconstruct all missing categories for an assumed PDBx file.
  *
  * Some people believe that simply dumping some atom records is enough.
  *
- * \param db The cif::datablock that hopefully contains some valid data
+ * \param file The cif::file that hopefully contains some valid data
+ * \param dictionary The mmcif dictionary to use
  */

-void reconstruct_pdbx(datablock &db);
+void reconstruct_pdbx(file &pdbx_file, std::string_view dictionary = "mmcif_pdbx");
+
+/** \brief This is an extension to cif::validator, use the logic in common
+ * PDBx files to see if the file is internally consistent.
+ *
+ * This function for now checks if the following categories are consistent:
+ *
+ * atom_site -> pdbx_poly_seq_scheme -> entity_poly_seq -> entity_poly -> entity
+ *
+ * Use the common \ref cif::VERBOSE flag to turn on diagnostic messages.
+ *
+ * \param file The input file
+ * \param dictionary The mmcif dictionary to use
+ * \result Returns true if the file was valid and consistent
+ */
+
+bool is_valid_pdbx_file(const file &pdbx_file, std::string_view dictionary = "mmcif_pdbx");

 // --------------------------------------------------------------------
 // Other I/O related routines
...
@@ -117,7 +134,7 @@ void reconstruct_pdbx(datablock &db);
  * The line returned should be compatible with the legacy PDB
  * format and is e.g. used in the DSSP program.
  *
  * @param data The datablock to use as source for the requested data
  * @param truncate_at The maximum length of the line returned
  */
...
@@ -127,7 +144,7 @@ std::string get_HEADER_line(const datablock &data, std::string::size_type trunca
  * The line returned should be compatible with the legacy PDB
  * format and is e.g. used in the DSSP program.
  *
  * @param data The datablock to use as source for the requested data
  * @param truncate_at The maximum length of the line returned
  */
...
@@ -137,7 +154,7 @@ std::string get_COMPND_line(const datablock &data, std::string::size_type trunca
  * The line returned should be compatible with the legacy PDB
  * format and is e.g. used in the DSSP program.
  *
  * @param data The datablock to use as source for the requested data
  * @param truncate_at The maximum length of the line returned
  */
...
@@ -147,12 +164,11 @@ std::string get_SOURCE_line(const datablock &data, std::string::size_type trunca
  * The line returned should be compatible with the legacy PDB
  * format and is e.g. used in the DSSP program.
  *
  * @param data The datablock to use as source for the requested data
  * @param truncate_at The maximum length of the line returned
  */
 std::string get_AUTHOR_line(const datablock &data, std::string::size_type truncate_at = 127);

-} // namespace pdbx
+} // namespace cif::pdb
src/pdb/pdb2cif.cpp
View file @ 82086a93

...
@@ -6478,6 +6478,10 @@ file read(std::istream &is)
 		{
 			std::throw_with_nested(std::runtime_error("Since the file did not start with a valid PDB HEADER line mmCIF was assumed, but that failed."));
 		}
+
+		// Since we're using the cif::pdb way of reading the file, the data may need
+		// reconstruction
+		reconstruct_pdbx(result);
 	}
 }
...
src/pdb/reconstruct.cpp
View file @ 82086a93

...
@@ -24,20 +24,523 @@
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-#include "cif++/pdb.hpp"
+#include "cif++.hpp"

// --------------------------------------------------------------------

namespace cif::pdb
{
-void reconstruct_pdbx(datablock &db)
+void checkAtomRecords(datablock &db)
{
	using namespace literals;

	auto &cf = compound_factory::instance();

	auto &atom_site = db["atom_site"];
	auto &atom_type = db["atom_type"];
	auto &chem_comp = db["chem_comp"];

	for (auto row : atom_site)
	{
		const auto &[symbol, label_asym_id, auth_asym_id, label_comp_id, auth_comp_id, label_seq_id, auth_seq_id, label_atom_id, auth_atom_id] =
			row.get<std::string, std::optional<std::string>, std::optional<std::string>, std::optional<std::string>, std::optional<std::string>,
				std::optional<int>, std::optional<std::string>, std::optional<std::string>, std::optional<std::string>>(
				"type_symbol", "label_asym_id", "auth_asym_id", "label_comp_id", "auth_comp_id",
				"label_seq_id", "auth_seq_id", "label_atom_id", "auth_atom_id");

		if (symbol.empty())
			throw std::runtime_error("Missing type symbol in atom_site record");

		if (atom_type.count("symbol"_key == symbol) == 0)
			atom_type.emplace({ { "symbol", symbol } });

		if (not (label_asym_id.has_value() or auth_asym_id.has_value()))
			throw std::runtime_error("atom_site records does not have a label_asym_id nor an auth_asym_id, cannot continue");

		if (not (label_comp_id.has_value() or auth_comp_id.has_value()))
			throw std::runtime_error("atom_site records does not have a label_comp_id nor an auth_comp_id, cannot continue");

		if (not (label_atom_id.has_value() or auth_atom_id.has_value()))
			throw std::runtime_error("atom_site records does not have a label_atom_id nor an auth_atom_id, cannot continue");

		std::string asym_id = label_asym_id.value_or(*auth_asym_id);
		std::string comp_id = label_comp_id.value_or(*auth_comp_id);

		bool is_peptide = cf.is_known_peptide(comp_id);

		auto compound = cf.create(comp_id);
		if (not compound)
			throw std::runtime_error("Missing compound information for " + comp_id);

		std::string mon_nstd_flag(".");
		if (is_peptide)
		{
			if (compound_factory::kAAMap.find(comp_id) != compound_factory::kAAMap.end())
				mon_nstd_flag = "y";
			else
				mon_nstd_flag = "n";
		}

		auto chem_comp_entry = chem_comp.find_first("id"_key == comp_id);
		if (not chem_comp_entry)
		{
			chem_comp.emplace({ //
				{ "id", comp_id },
				{ "type", compound->type() },
				{ "mon_nstd_flag", mon_nstd_flag },
				{ "name", compound->name() },
				{ "formula", compound->formula() },
				{ "formula_weight", compound->formula_weight() } });
		}
		else
		{
			std::vector<item> items;

			if (not chem_comp_entry["type"])
				items.emplace_back(item{ "type", compound->type() });
			if (not chem_comp_entry["mon_nstd_flag"])
				items.emplace_back(item{ "mon_nstd_flag", mon_nstd_flag });
			if (not chem_comp_entry["name"])
				items.emplace_back(item{ "name", compound->name() });
			if (not chem_comp_entry["formula"])
				items.emplace_back(item{ "formula", compound->formula() });
			if (not chem_comp_entry["formula_weight"])
				items.emplace_back(item{ "formula_weight", compound->formula_weight() });

			if (not items.empty())
				chem_comp_entry.assign(std::move(items));
		}

		if (is_peptide and not (label_seq_id.has_value() or auth_seq_id.has_value()))
			throw std::runtime_error("atom_site record has peptide comp_id but no sequence number, cannot continue");

		std::string seq_id;
		if (label_seq_id.has_value())
			seq_id = std::to_string(*label_seq_id);
		else if (auth_seq_id.has_value())
			seq_id = *auth_seq_id;

		row.assign({ //
			{ "auth_asym_id", auth_asym_id.value_or(*label_asym_id) },
			{ "auth_seq_id", auth_seq_id.value_or(std::to_string(*label_seq_id)) },
			{ "auth_comp_id", auth_comp_id.value_or(*label_comp_id) },
			{ "auth_atom_id", auth_atom_id.value_or(*label_atom_id) } });
	}
}
void createStructAsym(datablock &db)
{
	auto &atom_site = db["atom_site"];
	auto &struct_asym = db["struct_asym"];

	for (auto label_asym_id : atom_site.rows<std::string>("label_asym_id"))
	{
		if (label_asym_id.empty())
			throw std::runtime_error("File contains atom_site records without a label_asym_id");

		if (struct_asym.count(key("id") == label_asym_id) == 0)
		{
			struct_asym.emplace({ //
				{ "id", label_asym_id } });
		}
	}
}
void createEntity(datablock &db)
{
	using namespace literals;

	auto &cf = compound_factory::instance();

	auto &atom_site = db["atom_site"];
	atom_site.add_column("label_entity_id");

	auto &struct_asym = db["struct_asym"];
	struct_asym.add_column("entity_id");

	std::map<std::string, std::vector<std::tuple<std::string, int>>> asyms;

	for (auto asym_id : db["struct_asym"].rows<std::string>("id"))
	{
		int last_seq_id = -1;
		for (const auto &[comp_id, seq_id] : atom_site.find<std::string, int>(
				 "label_asym_id"_key == asym_id, "label_comp_id", "label_seq_id"))
		{
			if (seq_id == last_seq_id)
				continue;
			last_seq_id = seq_id;

			asyms[asym_id].emplace_back(comp_id, last_seq_id);
		}
	}

	auto less = [](const std::vector<std::tuple<std::string, int>> &a, const std::vector<std::tuple<std::string, int>> &b)
	{
		int d = static_cast<int>(a.size()) - static_cast<int>(b.size());
		return d == 0 ? a > b : d > 0;
	};

	std::set<std::vector<std::tuple<std::string, int>>, decltype(less)> entities(less);

	for (const auto &[asym_id, content] : asyms)
		entities.emplace(content);

	auto water_weight = cf.create("HOH")->formula_weight();

	int poly_count = 0;

	auto &entity = db["entity"];
	for (auto &content : entities)
	{
		auto entity_id = entity.get_unique_id("");

		std::string type, desc;
		float weight = 0;
		int count = 0;

		auto first_comp_id = std::get<0>(content.front());
		if (first_comp_id == "HOH")
		{
			type = "water";
			desc = "water";
			weight = water_weight;
		}
		else if (content.size() == 1)
		{
			auto c = cf.create(first_comp_id);
			type = "non-polymer";
			desc = c->name();
			weight = c->formula_weight();
		}
		else
		{
			type = "polymer";
			desc = "polymer-" + std::to_string(++poly_count);
			weight = water_weight;
			for (const auto &[comp_id, seq_id] : content)
				weight += cf.create(comp_id)->formula_weight() - water_weight;
		}

		for (const auto &[asym_id, ac] : asyms)
		{
			if (ac != content)
				continue;

			atom_site.update_value("label_asym_id"_key == asym_id, "label_entity_id", entity_id);
			struct_asym.update_value("id"_key == asym_id, "entity_id", entity_id);

			if (type != "water")
				++count;
			else
				count = atom_site.count("label_asym_id"_key == asym_id and "label_atom_id"_key == "O");
		}

		entity.emplace({ //
			{ "id", entity_id },
			{ "type", type },
			{ "pdbx_description", desc },
			{ "formula_weight", weight },
			{ "pdbx_number_of_molecules", count } });
	}
}
void createEntityPoly(datablock &db)
{
	using namespace literals;

	auto &cf = compound_factory::instance();

	auto &atom_site = db["atom_site"];
	auto &entity_poly = db["entity_poly"];

	for (auto entity_id : db["entity"].find<std::string>("type"_key == "polymer", "id"))
	{
		std::string type;
		int last_seq_id = -1;
		std::string seq, seq_can;
		bool non_std_monomer = false;
		bool non_std_linkage = false;
		std::string pdb_strand_id;

		for (const auto &[comp_id, seq_id, auth_asym_id] : atom_site.find<std::string, int, std::string>(
				 "label_entity_id"_key == entity_id, "label_comp_id", "label_seq_id", "auth_asym_id"))
		{
			if (seq_id == last_seq_id)
				continue;
			last_seq_id = seq_id;

			auto c = cf.create(comp_id);

			std::string letter, letter_can;

			// TODO: Perhaps we should improve this...
			if (type != "other")
			{
				std::string c_type;

				if (cf.is_known_base(comp_id))
				{
					c_type = "polydeoxyribonucleotide";
					letter = letter_can = compound_factory::kBaseMap.at(comp_id);
				}
				else if (cf.is_known_peptide(comp_id))
				{
					c_type = "polypeptide(L)";
					letter = letter_can = compound_factory::kAAMap.at(comp_id);
				}
				else if (iequals(c->type(), "D-PEPTIDE LINKING"))
				{
					c_type = "polypeptide(D)";
					letter = 'X';
					letter_can = '(' + comp_id + ')';
					non_std_linkage = true;
					non_std_monomer = true;
				}
				else if (iequals(c->type(), "L-PEPTIDE LINKING") or iequals(c->type(), "PEPTIDE LINKING"))
				{
					c_type = "polypeptide(L)";
					letter = 'X';
					letter_can = '(' + comp_id + ')';
					non_std_monomer = true;
				}

				if (type.empty())
					type = c_type;
				else if (type != c_type)
					type = "other";
			}

			seq += letter;
			seq_can += letter_can;
			pdb_strand_id = auth_asym_id;
		}

		for (auto i = seq.begin() + 80; i < seq.end(); i += 80)
			i = seq.insert(i, '\n') + 1;

		for (auto i = seq_can.begin() + 76; i < seq_can.end(); i += 76)
		{
			auto j = i;
			while (j < i + 4 and j < seq_can.end())
			{
				if (*j == '(')
					break;
				++j;
			}

			if (j < seq_can.end())
				i = seq_can.insert(j, '\n') + 1;
			else
				i = j;
		}

		entity_poly.emplace({ //
			{ "entity_id", entity_id },
			{ "type", type },
			{ "nstd_linkage", non_std_linkage },
			{ "nstd_monomer", non_std_monomer },
			{ "pdbx_seq_one_letter_code", seq },
			{ "pdbx_seq_one_letter_code_can", seq_can },
			{ "pdbx_strand_id", pdb_strand_id } });
	}
}
void createEntityPolySeq(datablock &db)
{
	if (db.get("entity_poly") == nullptr)
		createEntityPoly(db);

	using namespace literals;

	auto &atom_site = db["atom_site"];
	auto &entity_poly = db["entity_poly"];
	auto &entity_poly_seq = db["entity_poly_seq"];
	auto &struct_asym = db["struct_asym"];

	for (auto entity_id : entity_poly.rows<std::string>("entity_id"))
	{
		int last_seq_id = -1;
		std::string last_comp_id;

		std::string asym_id = struct_asym.find_first<std::string>("entity_id"_key == entity_id, "id");

		for (const auto &[comp_id, seq_id] : atom_site.find<std::string, int>(
				 "label_entity_id"_key == entity_id and "label_asym_id"_key == asym_id,
				 "label_comp_id", "label_seq_id"))
		{
			bool hetero = false;

			if (seq_id == last_seq_id)
			{
				if (last_comp_id != comp_id)
					hetero = true;
				else
					continue;
			}

			if (hetero)
			{
				entity_poly_seq.back().assign({ { "hetero", true } });
			}

			entity_poly_seq.emplace({ //
				{ "entity_id", entity_id },
				{ "num", seq_id },
				{ "mon_id", comp_id },
				{ "hetero", hetero } });

			last_seq_id = seq_id;
			last_comp_id = comp_id;
		}

		// you cannot assume this is correct...
		entity_poly_seq.sort([](row_handle a, row_handle b)
			{ return a.get<int>("num") < b.get<int>("num"); });
	}
}
void createPdbxPolySeqScheme(datablock &db)
{
	if (db.get("entity_poly_seq") == nullptr)
		createEntityPolySeq(db);

	using namespace literals;

	auto &atom_site = db["atom_site"];
	auto &entity_poly = db["entity_poly"];
	auto &entity_poly_seq = db["entity_poly_seq"];
	auto &struct_asym = db["struct_asym"];
	auto &pdbx_poly_seq_scheme = db["pdbx_poly_seq_scheme"];

	for (const auto &[entity_id, pdb_strand_id] : entity_poly.rows<std::string, std::string>("entity_id", "pdbx_strand_id"))
	{
		for (auto asym_id : struct_asym.find<std::string>("entity_id"_key == entity_id, "id"))
		{
			for (const auto &[comp_id, num, hetero] : entity_poly_seq.find<std::string, int, bool>(
					 "entity_id"_key == entity_id, "mon_id", "num", "hetero"))
			{
				const auto &[auth_seq_num, auth_mon_id, ins_code] =
					atom_site.find_first<std::string, std::string, std::optional<std::string>>(
						"label_asym_id"_key == asym_id and "label_seq_id"_key == num,
						"auth_seq_id", "auth_comp_id", "pdbx_PDB_ins_code");

				pdbx_poly_seq_scheme.emplace({ //
					{ "asym_id", asym_id },
					{ "entity_id", entity_id },
					{ "seq_id", num },
					{ "mon_id", comp_id },
					{ "ndb_seq_num", num },
					{ "pdb_seq_num", auth_seq_num },
					{ "auth_seq_num", auth_seq_num },
					{ "pdb_mon_id", auth_mon_id },
					{ "auth_mon_id", auth_mon_id },
					{ "pdb_strand_id", pdb_strand_id },
					{ "pdb_ins_code", ins_code },
					{ "hetero", hetero } });
			}
		}
	}
}
void reconstruct_pdbx(file &file, std::string_view dictionary)
{
	if (file.empty())
		throw std::runtime_error("Cannot reconstruct PDBx, file seems to be empty");

	auto &db = file.front();

	if (db.get("atom_site") == nullptr)
		throw std::runtime_error("Cannot reconstruct PDBx file, atom data missing");

	auto &validator = validator_factory::instance()[dictionary];

	std::string entry_id;

	// Phenix files do not have an entry record
	if (db.get("entry") == nullptr)
	{
		entry_id = db.name();
		category entry("entry");
		entry.emplace({ { "id", entry_id } });
		db.emplace_back(std::move(entry));
	}
	else
	{
		auto &entry = db["entry"];
		if (entry.size() != 1)
			throw std::runtime_error("Unexpected size of entry category");
		entry_id = entry.front().get<std::string>("id");
	}

	for (auto &cat : db)
	{
		auto cv = validator.get_validator_for_category(cat.name());
		if (not cv)
			continue;

		for (auto link : validator.get_links_for_child(cat.name()))
		{
			if (link->m_parent_category != "entry")
				continue;

			// So, this cat should have a link to the entry
			auto pk = find(link->m_parent_keys.begin(), link->m_parent_keys.end(), "id");
			if (pk == link->m_parent_keys.end())
				continue;

			auto ix = pk - link->m_parent_keys.begin();
			auto key = link->m_child_keys[ix];

			for (auto row : cat)
			{
				row.assign({ { key, entry_id } });
			}
		}

		// See if all categories that need a key do have a value
		if (cv->m_keys.size() == 1)
		{
			auto key = cv->m_keys.front();
			for (auto row : cat)
			{
				auto ord = row.get<std::string>(key.c_str());
				if (ord.empty())
					row.assign({ //
						{ key, cat.get_unique_id([](int nr)
							  { return std::to_string(nr); }) } });
			}
		}
	}

	file.load_dictionary(dictionary);

	// Now create any missing categories

	// First, see if atom records make sense at all
	// Will take care of atom_type and chem_comp as well.
	checkAtomRecords(db);

	// Next make sure we have struct_asym records
	if (db.get("struct_asym") == nullptr)
		createStructAsym(db);

	if (db.get("entity") == nullptr)
		createEntity(db);

	if (db.get("pdbx_poly_seq_scheme") == nullptr)
		createPdbxPolySeqScheme(db);
}

} // namespace cif::pdb
src/pdb/validate-pdbx.cpp
0 → 100644
View file @ 82086a93
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2024 NKI/AVL, Netherlands Cancer Institute
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cif++.hpp"

namespace cif::pdb
{

condition get_parents_condition(const validator &validator, row_handle rh, const category &parentCat)
{
	condition result;

	auto &childCat = rh.get_category();
	auto childName = childCat.name();
	auto parentName = parentCat.name();

	auto links = validator.get_links_for_child(childName);
	links.erase(remove_if(links.begin(), links.end(), [n = parentName](auto &l)
					{ return l->m_parent_category != n; }),
		links.end());

	if (not links.empty())
	{
		for (auto &link : links)
		{
			condition cond;

			for (size_t ix = 0; ix < link->m_child_keys.size(); ++ix)
			{
				auto childValue = rh[link->m_child_keys[ix]];
				if (childValue.empty())
					continue;

				cond = std::move(cond) and key(link->m_parent_keys[ix]) == childValue.text();
			}

			result = std::move(result) or std::move(cond);
		}
	}
	else if (cif::VERBOSE > 0)
		std::cerr << "warning: no child to parent links were found for child " << childName << " and parent " << parentName << '\n';

	return result;
}
bool is_valid_pdbx_file(const file &file, std::string_view dictionary)
{
	using namespace cif::literals;

	auto &cf = cif::compound_factory::instance();
	auto &validator = cif::validator_factory::instance().operator[](dictionary);

	bool result = true;

	try
	{
		if (file.empty())
			throw validation_error("Empty file");

		auto &db = file.front();
		if (db.empty())
			throw validation_error("Empty datablock");

		auto &atom_site = db["atom_site"];
		if (atom_site.empty())
			throw validation_error("Empty or missing atom_site category");

		auto &pdbx_poly_seq_scheme = db["pdbx_poly_seq_scheme"];

		std::string last_asym_id;
		int last_seq_id = -1;

		for (auto r : atom_site)
		{
			auto seq_id = r.get<std::optional<int>>("label_seq_id");
			if (not seq_id.has_value()) // not a residue in a polymer
				continue;

			if (*seq_id == last_seq_id)
				continue;
			last_seq_id = *seq_id;

			auto comp_id = r.get<std::string>("label_comp_id");
			if (not cf.is_known_peptide(comp_id))
				continue;

			auto p = pdbx_poly_seq_scheme.find(get_parents_condition(validator, r, pdbx_poly_seq_scheme));
			if (p.size() != 1)
				throw validation_error("For each residue in atom_site that is a residue in a polymer there should be exactly one pdbx_poly_seq_scheme record");
		}

		auto &entity = db["entity"];
		if (entity.empty())
			throw validation_error("Entity category is missing or empty");

		auto &entity_poly = db["entity_poly"];
		if (entity_poly.empty())
			throw validation_error("Entity_poly category is missing or empty");

		auto &entity_poly_seq = db["entity_poly_seq"];
		if (entity_poly_seq.empty())
			throw validation_error("Entity_poly_seq category is missing or empty");

		auto &struct_asym = db["struct_asym"];
		if (struct_asym.empty())
			throw validation_error("struct_asym category is missing or empty");

		for (auto entity_id : entity.find<std::string>("type"_key == "polymer", "id"))
		{
			if (entity_poly.count("entity_id"_key == entity_id) != 1)
				throw validation_error("There should be exactly one entity_poly record per polymer entity");

			// const auto entity_poly_type = entity_poly.find1<std::string>("entity_id"_key == entity_id, "type");

			std::map<int, std::set<std::string>> mon_per_seq_id;

			for (const auto &[num, mon_id, hetero] : entity_poly_seq.find<int, std::string, bool>(
					 "entity_id"_key == entity_id, "num", "mon_id", "hetero"))
			{
				mon_per_seq_id[num].emplace(mon_id);

				for (auto asym_id : struct_asym.find<std::string>("entity_id"_key == entity_id, "id"))
				{
					if (pdbx_poly_seq_scheme.count("asym_id"_key == asym_id and "mon_id"_key == mon_id and
							"seq_id"_key == num and "hetero"_key == hetero) != 1)
					{
						throw validation_error("For each entity_poly_seq record there should be exactly one pdbx_poly_seq record");
					}
				}
			}

			for (const auto &[seq_id, mon_id, hetero] : pdbx_poly_seq_scheme.find<int, std::string, bool>(
					 "entity_id"_key == entity_id, "seq_id", "mon_id", "hetero"))
			{
				if (entity_poly_seq.count("mon_id"_key == mon_id and "num"_key == seq_id and "hetero"_key == hetero) != 1)
				{
					throw validation_error("For each pdbx_poly_seq/struct_asym record there should be exactly one entity_poly_seq record");
				}

				if ((mon_per_seq_id[seq_id].size() > 1) != hetero)
					throw validation_error("Mismatch between the hetero flag in the poly seq schemes and the number residues per seq_id");
			}

			for (const auto &[seq_id, mon_ids] : mon_per_seq_id)
			{
				for (auto asym_id : struct_asym.find<std::string>("entity_id"_key == entity_id, "id"))
				{
					condition cond;

					for (auto mon_id : mon_ids)
					{
						if (cond)
							cond = std::move(cond) or "label_comp_id"_key == mon_id;
						else
							cond = "label_comp_id"_key == mon_id;
					}

					cond = "label_entity_id"_key == entity_id and "label_asym_id"_key == asym_id and
					       "label_seq_id"_key == seq_id and not std::move(cond);

					if (atom_site.exists(std::move(cond)))
						throw validation_error("An atom_site record exists that has no parent in the poly seq scheme categories");
				}
			}
		}

		result = true;
	}
	catch (const std::exception &ex)
	{
		result = false;
		if (cif::VERBOSE > 0)
			std::clog << ex.what() << '\n';
	}

	return result;
}

} // namespace cif::pdb
\ No newline at end of file
test/test-main.cpp
View file @
82086a93
@@ -13,10 +13,11 @@ int main(int argc, char *argv[])
 	// Build a new parser on top of Catch2's
 	using namespace Catch::clara;
 	auto cli = session.cli()                    // Get Catch2's command line parser
 	           | Opt(gTestDir, "data-dir")      // bind variable to a new option, with a hint string
 	                 ["-D"]["--data-dir"]       // the option names it will respond to
-	           ("The directory containing the data files"); // description string for the help output
+	           ("The directory containing the data files")  // description string for the help output
+	           | Opt(cif::VERBOSE, "verbose")["-v"]["--cif-verbose"]("Flag for cif::VERBOSE");

 	// Now pass the new composite back to Catch2 so it uses that
 	session.cli(cli);
@@ -34,6 +35,5 @@ int main(int argc, char *argv[])
 	cif::compound_factory::instance().push_dictionary(gTestDir / "HEM.cif");

 	return session.run();
 }
\ No newline at end of file
test/validate-pdbx-test.cpp
0 → 100644
View file @
82086a93
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "test-main.hpp"
#include <catch2/catch.hpp>
#include <cif++.hpp>
#include <stdexcept>
// --------------------------------------------------------------------
cif::file operator""_cf(const char *text, size_t length)
{
	struct membuf : public std::streambuf
	{
		membuf(char *text, size_t length)
		{
			this->setg(text, text, text + length);
		}
	} buffer(const_cast<char *>(text), length);

	std::istream is(&buffer);
	return cif::file(is);
}

// --------------------------------------------------------------------

TEST_CASE("test-1")
{
	auto f = R"(data_1CBS
#
_entry.id 1CBS
#
_entity.id 1
_entity.type polymer
#
_entity_poly.entity_id 1
_entity_poly.type 'polypeptide(L)'
_entity_poly.nstd_linkage no
_entity_poly.nstd_monomer no
_entity_poly.pdbx_seq_one_letter_code
;PNFSG
;
_entity_poly.pdbx_seq_one_letter_code_can
;PNFSG
;
_entity_poly.pdbx_strand_id A
_entity_poly.pdbx_target_identifier ?
#
loop_
_entity_poly_seq.entity_id
_entity_poly_seq.num
_entity_poly_seq.mon_id
_entity_poly_seq.hetero
1 1 PRO n
1 2 ASN n
1 3 PHE n
1 4 SER n
1 5 GLY n
#
loop_
_struct_asym.id
_struct_asym.pdbx_blank_PDB_chainid_flag
_struct_asym.pdbx_modified
_struct_asym.entity_id
_struct_asym.details
A N N 1 ?
#
loop_
_atom_type.symbol
C
N
O
S
#
loop_
_atom_site.group_PDB
_atom_site.id
_atom_site.type_symbol
_atom_site.label_atom_id
_atom_site.label_alt_id
_atom_site.label_comp_id
_atom_site.label_asym_id
_atom_site.label_entity_id
_atom_site.label_seq_id
_atom_site.pdbx_PDB_ins_code
_atom_site.Cartn_x
_atom_site.Cartn_y
_atom_site.Cartn_z
_atom_site.occupancy
_atom_site.B_iso_or_equiv
_atom_site.pdbx_formal_charge
_atom_site.auth_seq_id
_atom_site.auth_comp_id
_atom_site.auth_asym_id
_atom_site.auth_atom_id
_atom_site.pdbx_PDB_model_num
ATOM 2 C CA . PRO A 1 1 ? 18.150 13.525 43.680 1.00 28.82 ? 1 PRO A CA 1
ATOM 9 C CA . ASN A 1 2 ? 20.576 16.457 43.578 1.00 20.79 ? 2 ASN A CA 1
ATOM 17 C CA . PHE A 1 3 ? 21.144 17.838 40.087 1.00 12.62 ? 3 PHE A CA 1
ATOM 28 C CA . SER A 1 4 ? 23.170 20.780 41.464 1.00 11.30 ? 4 SER A CA 1
ATOM 34 C CA . GLY A 1 5 ? 26.628 21.486 40.103 1.00 10.86 ? 5 GLY A CA 1
#
loop_
_pdbx_poly_seq_scheme.asym_id
_pdbx_poly_seq_scheme.entity_id
_pdbx_poly_seq_scheme.seq_id
_pdbx_poly_seq_scheme.mon_id
_pdbx_poly_seq_scheme.ndb_seq_num
_pdbx_poly_seq_scheme.pdb_seq_num
_pdbx_poly_seq_scheme.auth_seq_num
_pdbx_poly_seq_scheme.pdb_mon_id
_pdbx_poly_seq_scheme.auth_mon_id
_pdbx_poly_seq_scheme.pdb_strand_id
_pdbx_poly_seq_scheme.pdb_ins_code
_pdbx_poly_seq_scheme.hetero
A 1 1 PRO 1 1 1 PRO PRO A . n
A 1 2 ASN 2 2 2 ASN ASN A . n
A 1 3 PHE 3 3 3 PHE PHE A . n
A 1 4 SER 4 4 4 SER SER A . n
A 1 5 GLY 5 5 5 GLY GLY A . n
#
)"_cf;

	SECTION("Plain file")
	{
		REQUIRE(cif::pdb::is_valid_pdbx_file(f));
	}

	SECTION("Delete one atom_site")
	{
		auto &db = f.front();
		auto n = db["atom_site"].erase(cif::key("id") == 2);
		REQUIRE(n == 1);
		REQUIRE(cif::pdb::is_valid_pdbx_file(f));
	}

	SECTION("Delete a pdbx_poly_seq_scheme record")
	{
		auto &db = f.front();
		auto n = db["pdbx_poly_seq_scheme"].erase(cif::key("seq_id") == 2);
		REQUIRE(n == 1);
		REQUIRE_FALSE(cif::pdb::is_valid_pdbx_file(f));
	}

	SECTION("Delete an entity_poly_seq record")
	{
		auto &db = f.front();
		auto n = db["entity_poly_seq"].erase(cif::key("num") == 2);
		REQUIRE(n == 1);
		REQUIRE_FALSE(cif::pdb::is_valid_pdbx_file(f));
	}

	SECTION("Delete an entity_poly record")
	{
		auto &db = f.front();
		auto n = db["entity_poly"].erase(cif::key("entity_id") == 1);
		REQUIRE(n == 1);
		REQUIRE_FALSE(cif::pdb::is_valid_pdbx_file(f));
	}

	SECTION("Mutate an atom_site record")
	{
		auto &db = f.front();
		auto r = db["atom_site"].find1(cif::key("id") == 9);
		r.assign({ { "label_comp_id", "ALA" },
			{ "auth_comp_id", "ALA" } });
		REQUIRE_FALSE(cif::pdb::is_valid_pdbx_file(f));
	}

	SECTION("Hetero consistency")
	{
		auto &db = f.front();
		db["entity_poly_seq"].emplace({ //
			{ "entity_id", 1 },
			{ "num", 1 },
			{ "mon_id", "ALA" },
			{ "hetero", "n" } });
		db["pdbx_poly_seq_scheme"].emplace({ //
			{ "asym_id", "A" },
			{ "entity_id", "1" },
			{ "seq_id", "1" },
			{ "mon_id", "ALA" },
			{ "ndb_seq_num", "1" },
			{ "pdb_seq_num", "1" },
			{ "auth_seq_num", "1" },
			{ "pdb_mon_id", "ALA" },
			{ "auth_mon_id", "ALA" },
			{ "pdb_strand_id", "A" },
			{ "pdb_ins_code", "." },
			{ "hetero", "n" } });
		REQUIRE_FALSE(cif::pdb::is_valid_pdbx_file(f));
	}
}