Merge remote-tracking branch 'github/develop-cif2fasta' into develop

afc541b9 · Maarten L. Hekkelman · 22537c0e · 7e4d2ffb · afc541b9 · afc541b9
Commit afc541b9 authored Jan 22, 2024 by Maarten L. Hekkelman
12 changed files
--- a/include/cif++/category.hpp
+++ b/include/cif++/category.hpp
@@ -70,6 +70,26 @@ class duplicate_key_error : public std::runtime_error
 	}
 };

+/// @brief A missing_key_error is thrown when an attempt is made
+/// to create an index when one of the key fields is missing.
+class missing_key_error : public std::runtime_error
+{
+  public:
+    /**
+     * @brief Construct a new duplicate key error object
+     */
+	missing_key_error(const std::string &msg, const std::string &key)
+		: std::runtime_error(msg)
+		, m_key(key)
+	{
+	}
+
+	const std::string &get_key() const noexcept { return m_key; }
+
+  private:
+	std::string m_key;
+};
+
 /// @brief A multiple_results_error is throw when you request a single
 /// row using a query but the query contains more than exactly one row.
 class multiple_results_error : public std::runtime_error
@@ -516,7 +536,7 @@ class category
 	/// @param column The name of the column to return the value for
 	/// @return The value found
 	template <typename T>
-	T find1(condition &&cond, const char *column) const
+	T find1(condition &&cond, std::string_view column) const
 	{
 		return find1<T>(cbegin(), std::move(cond), column);
 	}
@@ -530,7 +550,7 @@ class category
 	/// @param column The name of the column to return the value for
 	/// @return The value found
 	template <typename T, std::enable_if_t<not is_optional_v<T>, int> = 0>
-	T find1(const_iterator pos, condition &&cond, const char *column) const
+	T find1(const_iterator pos, condition &&cond, std::string_view column) const
 	{
 		auto h = find<T>(pos, std::move(cond), column);

@@ -549,7 +569,7 @@ class category
 	/// @param column The name of the column to return the value for
 	/// @return The value found, can be empty if no row matches the condition
 	template <typename T, std::enable_if_t<is_optional_v<T>, int> = 0>
-	T find1(const_iterator pos, condition &&cond, const char *column) const
+	T find1(const_iterator pos, condition &&cond, std::string_view column) const
 	{
 		auto h = find<typename T::value_type>(pos, std::move(cond), column);

@@ -644,7 +664,7 @@ class category
 	/// @param column The column for which the value should be returned
 	/// @return The value found or a default constructed value if not found
 	template <typename T>
-	T find_first(condition &&cond, const char *column) const
+	T find_first(condition &&cond, std::string_view column) const
 	{
 		return find_first<T>(cbegin(), std::move(cond), column);
 	}
@@ -657,7 +677,7 @@ class category
 	/// @param column The column for which the value should be returned
 	/// @return The value found or a default constructed value if not found
 	template <typename T>
-	T find_first(const_iterator pos, condition &&cond, const char *column) const
+	T find_first(const_iterator pos, condition &&cond, std::string_view column) const
 	{
 		auto h = find<T>(pos, std::move(cond), column);

@@ -701,7 +721,7 @@ class category
 	/// @param cond The condition to search for
 	/// @return The value found or the minimal value for the type
 	template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, int> = 0>
-	T find_max(const char *column, condition &&cond) const
+	T find_max(std::string_view column, condition &&cond) const
 	{
 		T result = std::numeric_limits<T>::min();

@@ -719,7 +739,7 @@ class category
 	/// @param column The column to use for the value
 	/// @return The value found or the minimal value for the type
 	template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, int> = 0>
-	T find_max(const char *column) const
+	T find_max(std::string_view column) const
 	{
 		return find_max<T>(column, all());
 	}
@@ -730,7 +750,7 @@ class category
 	/// @param cond The condition to search for
 	/// @return The value found or the maximum value for the type
 	template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, int> = 0>
-	T find_min(const char *column, condition &&cond) const
+	T find_min(std::string_view column, condition &&cond) const
 	{
 		T result = std::numeric_limits<T>::max();

@@ -748,7 +768,7 @@ class category
 	/// @param column The column to use for the value
 	/// @return The value found or the maximum value for the type
 	template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, int> = 0>
-	T find_min(const char *column) const
+	T find_min(std::string_view column) const
 	{
 		return find_min<T>(column, all());
 	}
@@ -756,8 +776,17 @@ class category
 	/// @brief Return whether a row exists that matches condition @a cond
 	/// @param cond The condition to match
 	/// @return True if a row exists
+	[[deprecated("Use contains instead")]]
 	bool exists(condition &&cond) const
 	{
+		return contains(std::move(cond));
+	}
+
+	/// @brief Return whether a row exists that matches condition @a cond
+	/// @param cond The condition to match
+	/// @return True if a row exists
+	bool contains(condition &&cond) const
+	{
 		bool result = false;

 		if (cond)
@@ -922,6 +951,11 @@ class category
 			{ return prefix + std::to_string(nr + 1); });
 	}

+	/// @brief Generate a new, unique value for a item named @a tag
+	/// @param tag The name of the item
+	/// @return a new unique value
+	std::string get_unique_value(std::string_view tag);
+
 	// --------------------------------------------------------------------

 	/// \brief Update a single column named @a tag in the rows that match \a cond to value \a value

--- a/include/cif++/condition.hpp
+++ b/include/cif++/condition.hpp
@@ -934,6 +934,16 @@ struct key
 	{
 	}

+	/**
+	 * @brief Construct a new key object using @a itemTag as name
+	 * 
+	 * @param itemTag 
+	 */
+	explicit key(std::string_view itemTag)
+		: m_item_tag(itemTag)
+	{
+	}
+
 	key(const key &) = delete;
 	key &operator=(const key &) = delete;


--- a/include/cif++/item.hpp
+++ b/include/cif++/item.hpp
@@ -556,7 +556,7 @@ struct item_handle::item_value_as<T, std::enable_if_t<std::is_arithmetic_v<T> an

 		auto txt = ref.text();

-		if (txt.empty())
+		if (ref.empty())
 			result = 1;
 		else
 		{

--- a/include/cif++/iterator.hpp
+++ b/include/cif++/iterator.hpp
@@ -562,22 +562,23 @@ class conditional_iterator_proxy

 		reference operator*()
 		{
-			return *mBegin;
+			return *m_begin;
 		}

 		pointer operator->()
 		{
-			return &*mBegin;
+			m_current = *m_begin;
+			return &m_current;
 		}

 		conditional_iterator_impl &operator++()
 		{
-			while (mBegin != mEnd)
+			while (m_begin != m_end)
 			{
-				if (++mBegin == mEnd)
+				if (++m_begin == m_end)
 					break;
 				
-				if (m_condition->operator()(mBegin))
+				if (m_condition->operator()(m_begin))
 					break;
 			}

@@ -591,18 +592,22 @@ class conditional_iterator_proxy
 			return result;
 		}

-		bool operator==(const conditional_iterator_impl &rhs) const { return mBegin == rhs.mBegin; }
-		bool operator!=(const conditional_iterator_impl &rhs) const { return mBegin != rhs.mBegin; }
+		bool operator==(const conditional_iterator_impl &rhs) const { return m_begin == rhs.m_begin; }
+		bool operator!=(const conditional_iterator_impl &rhs) const { return m_begin != rhs.m_begin; }
+
+		bool operator==(const row_iterator &rhs) const { return m_begin == rhs; }
+		bool operator!=(const row_iterator &rhs) const { return m_begin != rhs; }

 		template <typename IRowType, typename... ITs>
-		bool operator==(const iterator_impl<IRowType, ITs...> &rhs) const { return mBegin == rhs; }
+		bool operator==(const iterator_impl<IRowType, ITs...> &rhs) const { return m_begin == rhs; }

 		template <typename IRowType, typename... ITs>
-		bool operator!=(const iterator_impl<IRowType, ITs...> &rhs) const { return mBegin != rhs; }
+		bool operator!=(const iterator_impl<IRowType, ITs...> &rhs) const { return m_begin != rhs; }

 	  private:
-		CategoryType *mCat;
-		base_iterator mBegin, mEnd;
+		CategoryType *m_cat;
+		base_iterator m_begin, m_end;
+		value_type m_current;
 		const condition *m_condition;
 	};

@@ -673,13 +678,13 @@ iterator_proxy<Category, Ts...>::iterator_proxy(Category &cat, row_iterator pos,
 template <typename Category, typename... Ts>
 conditional_iterator_proxy<Category, Ts...>::conditional_iterator_impl::conditional_iterator_impl(
 	Category &cat, row_iterator pos, const condition &cond, const std::array<uint16_t, N> &cix)
-	: mCat(&cat)
-	, mBegin(pos, cix)
-	, mEnd(cat.end(), cix)
+	: m_cat(&cat)
+	, m_begin(pos, cix)
+	, m_end(cat.end(), cix)
 	, m_condition(&cond)
 {
 	if (m_condition == nullptr or m_condition->empty())
-		mBegin = mEnd;
+		m_begin = m_end;
 }

 template <typename Category, typename... Ts>

--- a/include/cif++/validate.hpp
+++ b/include/cif++/validate.hpp
@@ -202,7 +202,7 @@ struct category_validator
 	}

 	/// @brief Add item_validator @a v to the list of item validators
-	void addItemValidator(item_validator &&v);
+	void add_item_validator(item_validator &&v);

 	/// @brief Return the item_validator for item @a tag, may return nullptr
 	const item_validator *get_validator_for_item(std::string_view tag) const;

--- a/src/category.cpp
+++ b/src/category.cpp
@@ -672,7 +672,7 @@ void category::set_validator(const validator *v, datablock &db)
 				std::ostringstream msg;
 				msg << "Cannot construct index since the key field" << (missing.size() > 1 ? "s" : "") << " "
 							<< cif::join(missing, ", ") << " in " << m_name << " " << (missing.size() == 1 ? "is" : "are") << " missing\n";
-				throw std::runtime_error(msg.str());
+				throw missing_key_error(msg.str(), *missing.begin());
 			}
 		}
 	}
@@ -860,7 +860,7 @@ bool category::validate_links() const
 			auto cond = get_parents_condition(r, *parent);
 			if (not cond)
 				continue;
-			if (not parent->exists(std::move(cond)))
+			if (not parent->contains(std::move(cond)))
 			{
 				++missing;
 				if (VERBOSE and first_missing_rows.size() < 5)
@@ -997,7 +997,7 @@ bool category::has_children(row_handle r) const

 	for (auto &&[childCat, link] : m_child_links)
 	{
-		if (not childCat->exists(get_children_condition(r, *childCat)))
+		if (not childCat->contains(get_children_condition(r, *childCat)))
 			continue;

 		result = true;
@@ -1013,7 +1013,7 @@ bool category::has_parents(row_handle r) const

 	for (auto &&[parentCat, link] : m_parent_links)
 	{
-		if (not parentCat->exists(get_parents_condition(r, *parentCat)))
+		if (not parentCat->contains(get_parents_condition(r, *parentCat)))
 			continue;

 		result = true;
@@ -1214,7 +1214,7 @@ void category::erase_orphans(condition &&cond, category &parent)
 		if (not cond(r))
 			continue;
 		
-		if (parent.exists(get_parents_condition(r, parent)))
+		if (parent.contains(get_parents_condition(r, parent)))
 			continue;

 		if (VERBOSE > 1)
@@ -1263,7 +1263,7 @@ std::string category::get_unique_id(std::function<std::string(int)> generator)
 	{
 		for (;;)
 		{
-			if (not exists(key(id_tag) == result))
+			if (not contains(key(id_tag) == result))
 				break;
 			
 			result = generator(static_cast<int>(m_last_unique_num++));
@@ -1273,6 +1273,36 @@ std::string category::get_unique_id(std::function<std::string(int)> generator)
 	return result;
 }

+std::string category::get_unique_value(std::string_view tag)
+{
+	std::string result;
+
+	if (m_validator and m_cat_validator)
+	{
+		auto iv = m_cat_validator->get_validator_for_item(tag);
+
+		if (iv and iv->m_type and iv->m_type->m_primitive_type == DDL_PrimitiveType::Numb)
+		{
+			uint64_t v = find_max<uint64_t>(tag);
+			result = std::to_string(v + 1);
+		}
+	}
+
+	if (result.empty())
+	{
+		// brain-dead implementation
+		for (size_t ix = 0; ix < size(); ++ix)
+		{
+			// result = m_name + "-" + std::to_string(ix);
+			result = cif_id_for_number(ix);
+			if (not contains(key(tag) == result))
+				break;
+		}
+	}
+
+	return result;
+}
+
 void category::update_value(const std::vector<row_handle> &rows, std::string_view tag, std::string_view value)
 {
 	using namespace std::literals;
@@ -1376,7 +1406,7 @@ void category::update_value(const std::vector<row_handle> &rows, std::string_vie
 						check = std::move(check) && key(ck) == parent[pk].text();
 				}

-				if (childCat->exists(std::move(check))) // phew..., narrow escape
+				if (childCat->contains(std::move(check))) // phew..., narrow escape
 					continue;

 				// create the actual copy, if we can...
@@ -1405,7 +1435,7 @@ void category::update_value(const std::vector<row_handle> &rows, std::string_vie
 void category::update_value(row *row, uint16_t column, std::string_view value, bool updateLinked, bool validate)
 {
 	// make sure we have an index, if possible
-	if (m_index == nullptr and m_cat_validator != nullptr)
+	if ((updateLinked or validate) and m_index == nullptr and m_cat_validator != nullptr)
 		m_index = new category_index(*this);

 	auto &col = m_columns[column];
@@ -1444,7 +1474,7 @@ void category::update_value(row *row, uint16_t column, std::string_view value, b
 	if (not value.empty())
 		row->append(column, { value });

-	if (reinsert)
+	if (reinsert and m_index != nullptr)
 		m_index->insert(*this, row);

 	// see if we need to update any child categories that depend on this value

--- a/src/dictionary_parser.cpp
+++ b/src/dictionary_parser.cpp
@@ -87,7 +87,7 @@ class dictionary_parser : public parser
 				error("Undefined category '" + iv.first);

 			for (auto &v : iv.second)
-				const_cast<category_validator *>(cv)->addItemValidator(std::move(v));
+				const_cast<category_validator *>(cv)->add_item_validator(std::move(v));
 		}

 		// check all item validators for having a typeValidator

--- a/src/model.cpp
+++ b/src/model.cpp
@@ -1783,7 +1783,7 @@ atom &structure::emplace_atom(atom &&atom)
 	std::string symbol = atom.get_property("type_symbol");

 	using namespace cif::literals;
-	if (not atom_type.exists("symbol"_key == symbol))
+	if (not atom_type.contains("symbol"_key == symbol))
 		atom_type.emplace({ { "symbol", symbol } });

 	return m_atoms.emplace_back(std::move(atom));
@@ -1969,7 +1969,7 @@ void structure::change_residue(residue &res, const std::string &newCompound,

 		// create rest
 		auto &chemComp = m_db["chem_comp"];
-		if (not chemComp.exists(key("id") == newCompound))
+		if (not chemComp.contains(key("id") == newCompound))
 		{
 			chemComp.emplace({{"id", newCompound},
 				{"name", compound->name()},
@@ -2702,7 +2702,7 @@ void structure::cleanup_empty_categories()
 	for (auto chemComp : chem_comp)
 	{
 		std::string compID = chemComp["id"].as<std::string>();
-		if (atomSite.exists("label_comp_id"_key == compID or "auth_comp_id"_key == compID))
+		if (atomSite.contains("label_comp_id"_key == compID or "auth_comp_id"_key == compID))
 			continue;

 		obsoleteChemComps.push_back(chemComp);
@@ -2719,7 +2719,7 @@ void structure::cleanup_empty_categories()
 	for (auto entity : entities)
 	{
 		std::string entityID = entity["id"].as<std::string>();
-		if (atomSite.exists("label_entity_id"_key == entityID))
+		if (atomSite.contains("label_entity_id"_key == entityID))
 			continue;

 		obsoleteEntities.push_back(entity);

--- a/src/pdb/reconstruct.cpp
+++ b/src/pdb/reconstruct.cpp
@@ -31,6 +31,100 @@
 namespace cif::pdb
 {

+void fillLabelAsymID(category &atom_site)
+{
+	// pray that label_entity_id is filled in and use that to discriminate between asyms
+
+	std::map<std::tuple<std::string, std::string>, std::string> mapAuthAsymIDAndEntityToLabelAsymID;
+
+	for (const auto &[label_entity_id, auth_asym_id, label_asym_id] :
+		atom_site.find<std::optional<std::string>, std::string, std::string>(
+			key("label_asym_id") != cif::null, "label_entity_id", "auth_asym_id", "label_asym_id"))
+	{
+		if (not label_entity_id.has_value())
+			continue;
+
+		auto key = make_tuple(auth_asym_id, *label_entity_id);
+		auto i = mapAuthAsymIDAndEntityToLabelAsymID.find(key);
+
+		if (i == mapAuthAsymIDAndEntityToLabelAsymID.end())
+			mapAuthAsymIDAndEntityToLabelAsymID.emplace(make_pair(key, label_asym_id));
+		else if (i->second != label_asym_id)
+		{
+			if (cif::VERBOSE > 0)
+				std::clog << "Inconsistent assignment of label_asym_id for the tuple entity_id: " << *label_entity_id << " and auth_asym_id: " << auth_asym_id << '\n';
+
+			mapAuthAsymIDAndEntityToLabelAsymID.clear();
+			break;
+		}
+	}
+
+	for (const auto &[key, value] : mapAuthAsymIDAndEntityToLabelAsymID)
+	{
+		const auto &[auth_asym_id, label_entity_id] = key;
+
+		for (auto row : atom_site.find(cif::key("label_asym_id") == null and
+									   cif::key("auth_asym_id") == auth_asym_id and
+									   cif::key("label_entity_id") == label_entity_id))
+		{
+			row.assign("label_asym_id", value, false, true);
+		}
+	}
+
+	// Check to see if we're done
+	if (atom_site.contains(key("label_asym_id") == cif::null))
+	{
+		// nope, not yet.
+		throw std::runtime_error("atom_site category still contains records with empty label_asym_id, don't know how to continue");
+	}
+}
+
+void fixNegativeSeqID(category &atom_site)
+{
+	std::set<std::string> asymsWithNegativeSeqID;
+	for (auto asym_id : atom_site.find<std::string>(key("label_seq_id") < 0, "label_asym_id"))
+		asymsWithNegativeSeqID.emplace(asym_id);
+
+	for (auto asym_id : asymsWithNegativeSeqID)
+	{
+		// create a pseudo entity_poly_seq first
+
+		std::vector<std::tuple<std::string, int>> poly_seq;
+		for (auto key : atom_site.find<std::string, int>(key("label_asym_id") == asym_id, "auth_seq_id", "label_seq_id"))
+		{
+			if (poly_seq.empty() or poly_seq.back() != key)
+				poly_seq.emplace_back(key);
+		}
+
+		// simply renumber all items, but only if it is really a poly (i.e. size > 1)
+		if (poly_seq.size() > 1)
+		{
+			int seq_id = 1;
+			for (const auto &[auth_seq_id, label_seq_id] : poly_seq)
+			{
+				for (auto row : atom_site.find(key("label_asym_id") == asym_id and
+											   key("auth_seq_id") == auth_seq_id and
+											   key("label_seq_id") == label_seq_id))
+				{
+					row.assign("label_seq_id", std::to_string(seq_id), false, false);
+				}
+
+				++seq_id;
+			}
+		}
+		else if (poly_seq.size() == 1) // a monomer?
+		{
+			const auto &[auth_seq_id, label_seq_id] = poly_seq.front();
+
+			for (auto row : atom_site.find(key("label_asym_id") == asym_id and
+											key("auth_seq_id") == auth_seq_id and
+											key("label_seq_id") == label_seq_id))
+			{
+				row.assign("label_seq_id", ".", false, false);
+			}
+		}
+	}
+}

 void checkAtomRecords(datablock &db)
 {
@@ -42,6 +136,14 @@ void checkAtomRecords(datablock &db)
 	auto &atom_type = db["atom_type"];
 	auto &chem_comp = db["chem_comp"];

+	// Some common errors: missing label_asym_id for some of the atom records
+	if (atom_site.contains(key("label_asym_id") == cif::null))
+		fillLabelAsymID(atom_site);
+
+	// And negative seq_id values
+	if (atom_site.contains(key("label_seq_id") < 0))
+		fixNegativeSeqID(atom_site);
+
 	for (auto row : atom_site)
 	{
 		const auto &[symbol, label_asym_id, auth_asym_id, label_comp_id, auth_comp_id, label_seq_id, auth_seq_id, label_atom_id, auth_atom_id] =
@@ -117,16 +219,50 @@ void checkAtomRecords(datablock &db)
 			throw std::runtime_error("atom_site record has peptide comp_id but no sequence number, cannot continue");

 		std::string seq_id;
-		if (label_seq_id.has_value())
+		if (label_seq_id.has_value() and *label_seq_id > 0)
 			seq_id = std::to_string(*label_seq_id);
 		else if (auth_seq_id.has_value())
+		{
 			seq_id = *auth_seq_id;

+			row.assign("label_seq_id", seq_id, false, true);
+		}
+
+		if (not label_atom_id.has_value())
+			row.assign("label_atom_id", asym_id, false, true);
+
 		row.assign({ //
 			{ "auth_asym_id", auth_asym_id.value_or(*label_asym_id) },
 			{ "auth_seq_id", auth_seq_id.value_or(std::to_string(*label_seq_id)) },
 			{ "auth_comp_id", auth_comp_id.value_or(*label_comp_id) },
 			{ "auth_atom_id", auth_atom_id.value_or(*label_atom_id) } });
+		
+		// Rewrite the coordinates and other fields that look better in a fixed format
+		// Be careful not to nuke invalidly formatted data here
+		for (const auto &[tag, prec] : std::initializer_list<std::tuple<std::string_view,std::string::size_type>>{
+				{ "cartn_x", 3 },
+				{ "cartn_y", 3 },
+				{ "cartn_z", 3 },
+				{ "occupancy", 2 },
+				{ "b_iso_or_equiv", 2 }
+			})
+		{
+			if (row[tag].empty())
+				continue;
+			
+			float v;
+			auto s = row.get<std::string>(tag);
+			if (auto [ptr, ec] = cif::from_chars(s.data(), s.data() + s.length(), v); ec != std::errc())
+				continue;
+
+			if (s.length() < prec + 1 or s[s.length() - prec - 1] != '.')
+			{
+				char b[12];
+
+				if (auto [ptr, ec] = cif::to_chars(b, b + sizeof(b), v, cif::chars_format::fixed, prec); ec == std::errc())
+					row.assign(tag, {b, ptr}, false, false);
+			}
+		}
 	}
 }

@@ -159,34 +295,34 @@ void createEntity(datablock &db)
 	auto &struct_asym = db["struct_asym"];
 	struct_asym.add_column("entity_id");

-	std::map<std::string,std::vector<std::tuple<std::string,int>>> asyms;
+	std::map<std::string, std::vector<std::tuple<std::string, int>>> asyms;

 	for (auto asym_id : db["struct_asym"].rows<std::string>("id"))
 	{
 		int last_seq_id = -1;

-		for (const auto &[comp_id, seq_id] : atom_site.find<std::string,int>("label_asym_id"_key == asym_id, "label_comp_id", "label_seq_id"))
+		for (const auto &[comp_id, seq_id] : atom_site.find<std::string, int>("label_asym_id"_key == asym_id, "label_comp_id", "label_seq_id"))
 		{
 			if (seq_id == last_seq_id)
 				continue;
-			
+
 			last_seq_id = seq_id;

 			asyms[asym_id].emplace_back(comp_id, last_seq_id);
 		}
 	}

-	auto less = [](const std::vector<std::tuple<std::string,int>> &a, const std::vector<std::tuple<std::string,int>> &b)
+	auto less = [](const std::vector<std::tuple<std::string, int>> &a, const std::vector<std::tuple<std::string, int>> &b)
 	{
 		int d = static_cast<int>(a.size()) - static_cast<int>(b.size());
 		return d == 0 ? a > b : d > 0;
 	};

-	std::set<std::vector<std::tuple<std::string,int>>,decltype(less)> entities(less);
+	std::set<std::vector<std::tuple<std::string, int>>, decltype(less)> entities(less);

 	for (const auto &[asym_id, content] : asyms)
 		entities.emplace(content);
-	
+
 	auto water_weight = cf.create("HOH")->formula_weight();

 	int poly_count = 0;
@@ -230,7 +366,7 @@ void createEntity(datablock &db)
 		{
 			if (ac != content)
 				continue;
-			
+
 			atom_site.update_value("label_asym_id"_key == asym_id, "label_entity_id", entity_id);
 			struct_asym.update_value("id"_key == asym_id, "entity_id", entity_id);

@@ -240,13 +376,12 @@ void createEntity(datablock &db)
 				count = atom_site.count("label_asym_id"_key == asym_id and "label_atom_id"_key == "O");
 		}

-		entity.emplace({ // 
+		entity.emplace({ //
 			{ "id", entity_id },
 			{ "type", type },
 			{ "pdbx_description", desc },
 			{ "formula_weight", weight },
-			{ "pdbx_number_of_molecules", count }
-		});
+			{ "pdbx_number_of_molecules", count } });
 	}
 }

@@ -268,11 +403,11 @@ void createEntityPoly(datablock &db)
 		bool non_std_linkage = false;
 		std::string pdb_strand_id;

-		for (const auto &[comp_id, seq_id, auth_asym_id] : atom_site.find<std::string,int,std::string>("label_entity_id"_key == entity_id, "label_comp_id", "label_seq_id", "auth_asym_id"))
+		for (const auto &[comp_id, seq_id, auth_asym_id] : atom_site.find<std::string, int, std::string>("label_entity_id"_key == entity_id, "label_comp_id", "label_seq_id", "auth_asym_id"))
 		{
 			if (seq_id == last_seq_id)
 				continue;
-			
+
 			last_seq_id = seq_id;

 			auto c = cf.create(comp_id);
@@ -280,7 +415,7 @@ void createEntityPoly(datablock &db)
 			std::string letter;
 			char letter_can;

-			// TODO: Perhaps we should improve this... 
+			// TODO: Perhaps we should improve this...
 			if (type != "other")
 			{
 				std::string c_type;
@@ -301,7 +436,7 @@ void createEntityPoly(datablock &db)
 					letter_can = c->one_letter_code();
 					if (letter_can == 0)
 						letter_can = 'X';
-					
+
 					letter = '(' + comp_id + ')';

 					non_std_linkage = true;
@@ -334,7 +469,7 @@ void createEntityPoly(datablock &db)

 		for (auto i = seq.begin() + 80; i < seq.end(); i += 80)
 			i = seq.insert(i, '\n') + 1;
-		
+
 		for (auto i = seq_can.begin() + 76; i < seq_can.end(); i += 76)
 		{
 			auto j = i;
@@ -351,15 +486,14 @@ void createEntityPoly(datablock &db)
 				i = j;
 		}

-		entity_poly.emplace({ // 
+		entity_poly.emplace({ //
 			{ "entity_id", entity_id },
 			{ "type", type },
 			{ "nstd_linkage", non_std_linkage },
 			{ "nstd_monomer", non_std_monomer },
 			{ "pdbx_seq_one_letter_code", seq },
 			{ "pdbx_seq_one_letter_code_can", seq_can },
-			{ "pdbx_strand_id", pdb_strand_id }
-		});
+			{ "pdbx_strand_id", pdb_strand_id } });
 	}
 }

@@ -381,7 +515,7 @@ void createEntityPolySeq(datablock &db)
 		std::string last_comp_id;
 		std::string asym_id = struct_asym.find_first<std::string>("entity_id"_key == entity_id, "id");

-		for (const auto &[comp_id, seq_id] : atom_site.find<std::string,int>("label_entity_id"_key == entity_id and "label_asym_id"_key == asym_id, "label_comp_id", "label_seq_id"))
+		for (const auto &[comp_id, seq_id] : atom_site.find<std::string, int>("label_entity_id"_key == entity_id and "label_asym_id"_key == asym_id, "label_comp_id", "label_seq_id"))
 		{
 			bool hetero = false;

@@ -395,27 +529,22 @@ void createEntityPolySeq(datablock &db)

 			if (hetero)
 			{
-				entity_poly_seq.back().assign({
-					{ "hetero", true }
-				});
+				entity_poly_seq.back().assign({ { "hetero", true } });
 			}

-			entity_poly_seq.emplace({ // 
+			entity_poly_seq.emplace({ //
 				{ "entity_id", entity_id },
 				{ "num", seq_id },
 				{ "mon_id", comp_id },
-				{ "hetero", hetero }
-			});
-			
+				{ "hetero", hetero } });
+
 			last_seq_id = seq_id;
 			last_comp_id = comp_id;
 		}

 		// you cannot assume this is correct...
 		entity_poly_seq.sort([](row_handle a, row_handle b)
-		{
-			return a.get<int>("num") < b.get<int>("num");
-		});
+			{ return a.get<int>("num") < b.get<int>("num"); });
 	}
 }

@@ -436,17 +565,16 @@ void createPdbxPolySeqScheme(datablock &db)
 	{
 		for (auto asym_id : struct_asym.find<std::string>("entity_id"_key == entity_id, "id"))
 		{
-			for (const auto &[comp_id, num, hetero] : entity_poly_seq.find<std::string,int,bool>("entity_id"_key == entity_id, "mon_id", "num", "hetero"))
+			for (const auto &[comp_id, num, hetero] : entity_poly_seq.find<std::string, int, bool>("entity_id"_key == entity_id, "mon_id", "num", "hetero"))
 			{
 				const auto &[auth_seq_num, auth_mon_id, ins_code] =
-					atom_site.find_first<std::string,std::string,std::optional<std::string>>(
+					atom_site.find_first<std::string, std::string, std::optional<std::string>>(
 						"label_asym_id"_key == asym_id and "label_seq_id"_key == num,
-						"auth_seq_id", "auth_comp_id", "pdbx_PDB_ins_code"
-					);
-				
+						"auth_seq_id", "auth_comp_id", "pdbx_PDB_ins_code");
+
 				pdbx_poly_seq_scheme.emplace({ //
 					{ "asym_id", asym_id },
-					{ "entity_id", entity_id  },
+					{ "entity_id", entity_id },
 					{ "seq_id", num },
 					{ "mon_id", comp_id },
 					{ "ndb_seq_num", num },
@@ -456,8 +584,81 @@ void createPdbxPolySeqScheme(datablock &db)
 					{ "auth_mon_id", auth_mon_id },
 					{ "pdb_strand_id", pdb_strand_id },
 					{ "pdb_ins_code", ins_code },
-					{ "hetero", hetero }
-				});
+					{ "hetero", hetero } });
+			}
+		}
+	}
+}
+
+// Some programs write out a ndb_poly_seq_scheme, which has been replaced by pdbx_poly_seq_scheme
+void comparePolySeqSchemes(datablock &db)
+{
+	auto &ndb_poly_seq_scheme = db["ndb_poly_seq_scheme"];
+	auto &pdbx_poly_seq_scheme = db["pdbx_poly_seq_scheme"];
+
+	// Since often ndb_poly_seq_scheme only contains an id and mon_id field
+	// we assume that it should match the accompanying pdbx_poly_seq
+
+	std::vector<std::string> asym_ids_ndb, asym_ids_pdbx;
+
+	for (auto asym_id : ndb_poly_seq_scheme.rows<std::string>("id"))
+	{
+		auto i = std::lower_bound(asym_ids_ndb.begin(), asym_ids_ndb.end(), asym_id);
+		if (i == asym_ids_ndb.end() or *i != asym_id)
+			asym_ids_ndb.insert(i, asym_id);
+	}
+	
+	for (auto asym_id : pdbx_poly_seq_scheme.rows<std::string>("asym_id"))
+	{
+		auto i = std::lower_bound(asym_ids_pdbx.begin(), asym_ids_pdbx.end(), asym_id);
+		if (i == asym_ids_pdbx.end() or *i != asym_id)
+			asym_ids_pdbx.insert(i, asym_id);
+	}
+	
+	// If we have different Asym ID's assume the ndb is invalid.
+	if (asym_ids_ndb != asym_ids_pdbx)
+	{
+		if (cif::VERBOSE > 0)
+			std::clog << "The asym ID's of ndb_poly_seq_scheme and pdbx_poly_seq_scheme are not equal, dropping ndb_poly_seq_scheme\n";
+		ndb_poly_seq_scheme.clear();
+	}
+	else
+	{
+		for (const auto &asym_id : asym_ids_ndb)
+		{
+			bool valid = true;
+
+			auto ndb_range = ndb_poly_seq_scheme.find(key("id") == asym_id);
+			auto pdbx_range = pdbx_poly_seq_scheme.find(key("asym_id") == asym_id);
+
+			for (auto ndb_i = ndb_range.begin(), pdbx_i = pdbx_range.begin();
+				ndb_i != ndb_range.end() or pdbx_i != pdbx_range.end(); ++ndb_i, ++pdbx_i)
+			{
+				if (ndb_i == ndb_range.end() or pdbx_i == pdbx_range.end())
+				{
+					if (cif::VERBOSE > 0)
+						std::clog << "The sequences in ndb_poly_seq_scheme and pdbx_poly_seq_scheme are unequal in size for asym ID " << asym_id << '\n';
+					valid = false;
+					break;
+				}
+
+				auto ndb_mon_id = ndb_i->get<std::string>("mon_id");
+				auto pdbx_mon_id = pdbx_i->get<std::string>("mon_id");
+
+				if (ndb_mon_id != pdbx_mon_id)
+				{
+					if (cif::VERBOSE > 0)
+						std::clog << "The sequences in ndb_poly_seq_scheme and pdbx_poly_seq_scheme contain different mon ID's for asym ID " << asym_id << '\n';
+					valid = false;
+					break;
+				}
+			}
+
+			if (not valid)
+			{
+				if (cif::VERBOSE > 0)
+					std::clog << "Dropping asym ID " << asym_id << " from ndb_poly_seq_scheme\n";
+				ndb_poly_seq_scheme.erase(key("id") == asym_id);
 			}
 		}
 	}
@@ -498,45 +699,154 @@ void reconstruct_pdbx(file &file, std::string_view dictionary)
 		entry_id = entry.front().get<std::string>("id");
 	}

+	std::vector<std::string> invalidCategories;
+
 	for (auto &cat : db)
 	{
-		auto cv = validator.get_validator_for_category(cat.name());
-		if (not cv)
-			continue;
-
-		for (auto link : validator.get_links_for_child(cat.name()))
+		try
 		{
-			if (link->m_parent_category != "entry")
+			auto cv = validator.get_validator_for_category(cat.name());
+			if (not cv)
 				continue;

-			// So, this cat should have a link to the entry
+			for (auto link : validator.get_links_for_child(cat.name()))
+			{
+				if (link->m_parent_category != "entry")
+					continue;

-			auto pk = find(link->m_parent_keys.begin(), link->m_parent_keys.end(), "id");
-			if (pk == link->m_parent_keys.end())
-				continue;
+				// So, this cat should have a link to the entry

-			auto ix = pk - link->m_parent_keys.begin();
-			auto key = link->m_child_keys[ix];
+				auto pk = find(link->m_parent_keys.begin(), link->m_parent_keys.end(), "id");
+				if (pk == link->m_parent_keys.end())
+					continue;
+
+				auto ix = pk - link->m_parent_keys.begin();
+				auto key = link->m_child_keys[ix];
+
+				for (auto row : cat)
+				{
+					row.assign({ { key, entry_id } });
+				}
+			}

-			for (auto row : cat)
+			// Fill in all mandatory fields
+			for (auto key : cv->m_mandatory_fields)
 			{
-				row.assign({ { key, entry_id } });
+				if (not cat.has_column(key))
+				{
+					if (cif::VERBOSE > 0)
+						std::clog << "Adding mandatory key " << key << " to category " << cat.name() << '\n';
+					cat.add_column(key);
+				}
 			}
-		}

-		// See if all categories that need a key do have a value
-		if (cv->m_keys.size() == 1)
-		{
-			auto key = cv->m_keys.front();
-			for (auto row : cat)
+			enum class State
 			{
-				auto ord = row.get<std::string>(key.c_str());
-				if (ord.empty())
-					row.assign({ //
-						{ key, cat.get_unique_id([](int nr)
-								   { return std::to_string(nr); }) } });
+				Start,
+				MissingKeys,
+				DuplicateKeys
+			} state = State::Start;
+
+			for (;;)
+			{
+				// See if we can build an index
+				try
+				{
+					cat.set_validator(&validator, db);
+				}
+				catch (const missing_key_error &ex)
+				{
+					if (state == State::MissingKeys)
+					{
+						if (cif::VERBOSE > 0)
+							std::clog << "Repairing failed for category " << cat.name() << ", missing keys remain: " << ex.what() << '\n';
+
+						throw;
+					}
+
+					state = State::MissingKeys;
+
+					auto key = ex.get_key();
+
+					if (cif::VERBOSE > 0)
+						std::clog << "Need to add key " << key << " to category " << cat.name() << '\n';
+
+					for (auto row : cat)
+					{
+						auto ord = row.get<std::string>(key.c_str());
+						if (ord.empty())
+							row.assign({ //
+								{ key, cat.get_unique_value(key) } });
+					}
+
+					continue;
+				}
+				catch (const duplicate_key_error &ex)
+				{
+					if (state == State::DuplicateKeys)
+					{
+						if (cif::VERBOSE > 0)
+							std::clog << "Repairing failed for category " << cat.name() << ", duplicate keys remain: " << ex.what() << '\n';
+
+						throw;
+					}
+
+					state = State::DuplicateKeys;
+
+					if (cif::VERBOSE > 0)
+						std::clog << "Attempt to fix " << cat.name() << " failed: " << ex.what() << '\n';
+
+					// replace fields that do not define a relation to a parent
+
+					std::set<std::string> replaceableKeys;
+					for (auto key : cv->m_keys)
+					{
+						bool replaceable = true;
+						for (auto lv : validator.get_links_for_child(cat.name()))
+						{
+							if (find(lv->m_child_keys.begin(), lv->m_child_keys.end(), key) != lv->m_child_keys.end())
+							{
+								replaceable = false;
+								break;
+							}
+						}
+
+						if (replaceable)
+							replaceableKeys.insert(key);
+					}
+
+					if (replaceableKeys.empty())
+						throw std::runtime_error("Cannot repair category " + cat.name() + " since it contains duplicate keys that cannot be replaced");
+
+					for (auto key : replaceableKeys)
+					{
+						for (auto row : cat)
+							row.assign(key, cat.get_unique_value(key), false, false);
+					}
+
+					continue;
+				}
+
+				break;
 			}
 		}
+		catch (const std::exception &ex)
+		{
+			if (cif::VERBOSE > 0)
+				std::clog << ex.what() << '\n';
+
+			std::clog << "Will drop category " << cat.name() << " since it cannot be repaired\n";
+
+			invalidCategories.emplace_back(cat.name());
+		}
+	}
+
+	for (auto cat_name : invalidCategories)
+	{
+		auto i = find_if(db.begin(), db.end(), [cat_name](const category &cat)
+			{ return cat.name() == cat_name; });
+		if (i != db.end())
+			db.erase(i);
 	}

 	file.load_dictionary(dictionary);
@@ -550,12 +860,15 @@ void reconstruct_pdbx(file &file, std::string_view dictionary)
 	// Next make sure we have struct_asym records
 	if (db.get("struct_asym") == nullptr)
 		createStructAsym(db);
-	
+
 	if (db.get("entity") == nullptr)
 		createEntity(db);

 	if (db.get("pdbx_poly_seq_scheme") == nullptr)
 		createPdbxPolySeqScheme(db);
+
+	if (db.get("ndb_poly_seq_scheme") != nullptr)
+		comparePolySeqSchemes(db);
 }

 } // namespace cif::pdb
--- a/src/pdb/validate-pdbx.cpp
+++ b/src/pdb/validate-pdbx.cpp
@@ -183,7 +183,7 @@ bool is_valid_pdbx_file(const file &file, std::string_view dictionary)
 						"label_asym_id"_key == asym_id and
 						"label_seq_id"_key == seq_id and not std::move(cond);
 					
-					if (atom_site.exists(std::move(cond)))
+					if (atom_site.contains(std::move(cond)))
 						throw validation_error("An atom_site record exists that has no parent in the poly seq scheme categories");
 				}
 			}

--- a/src/validate.cpp
+++ b/src/validate.cpp
@@ -233,7 +233,7 @@ void item_validator::operator()(std::string_view value) const

 // --------------------------------------------------------------------

-void category_validator::addItemValidator(item_validator &&v)
+void category_validator::add_item_validator(item_validator &&v)
 {
 	if (v.m_mandatory)
 		m_mandatory_fields.insert(v.m_tag);

--- a/test/unit-v2-test.cpp
+++ b/test/unit-v2-test.cpp
@@ -568,7 +568,7 @@ _test.value
 	auto &test = db["test"];
 	REQUIRE(test.size() == 5);

-	REQUIRE(test.exists("value"_key == cif::null));
+	REQUIRE(test.contains("value"_key == cif::null));
 	REQUIRE(test.find("value"_key == cif::null).size() == 2);
 }