Added remove column

f4506438 · Maarten L. Hekkelman · fc14a655 · f4506438 · f4506438 · f4506438
Commit f4506438 authored Jan 22, 2024 by Maarten L. Hekkelman
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 179 additions and 114 deletions

include/cif++/category.hpp
+36 -34

src/category.cpp
+100 -65

src/datablock.cpp
+4 -0

src/pdb/reconstruct.cpp
+39 -15

No files found.
--- a/include/cif++/category.hpp
+++ b/include/cif++/category.hpp
@@ -31,22 +31,22 @@
 #include "cif++/condition.hpp"
 #include "cif++/iterator.hpp"
 #include "cif++/row.hpp"
-#include "cif++/validate.hpp"
 #include "cif++/text.hpp"
+#include "cif++/validate.hpp"
 #include <array>
 /** \file category.hpp
-  * Documentation for the cif::category class
+ * Documentation for the cif::category class
-  *
+ *
-  * The category class should meet the requirements of Container and
+ * The category class should meet the requirements of Container and
-  * SequenceContainer.
+ * SequenceContainer.
-  * 
+ *
-  * TODO: implement all of:
+ * TODO: implement all of:
-  * https://en.cppreference.com/w/cpp/named_req/Container
+ * https://en.cppreference.com/w/cpp/named_req/Container
-  * https://en.cppreference.com/w/cpp/named_req/SequenceContainer
+ * https://en.cppreference.com/w/cpp/named_req/SequenceContainer
-  * and more?
+ * and more?
-  */
+ */
 namespace cif
 {
@@ -61,9 +61,9 @@ namespace cif
 class duplicate_key_error : public std::runtime_error
 {
  public:
-    /**
+	/**
-     * @brief Construct a new duplicate key error object
+	 * @brief Construct a new duplicate key error object
-     */
+	 */
 	duplicate_key_error(const std::string &msg)
 		: std::runtime_error(msg)
 	{
@@ -75,9 +75,9 @@ class duplicate_key_error : public std::runtime_error
 class missing_key_error : public std::runtime_error
 {
  public:
-    /**
+	/**
-     * @brief Construct a new duplicate key error object
+	 * @brief Construct a new missing key error object
-     */
+	 */
 	missing_key_error(const std::string &msg, const std::string &key)
 		: std::runtime_error(msg)
 		, m_key(key)
@@ -95,9 +95,9 @@ class missing_key_error : public std::runtime_error
 class multiple_results_error : public std::runtime_error
 {
  public:
-    /**
+	/**
-     * @brief Construct a new multiple results error object
+	 * @brief Construct a new multiple results error object
-     */
+	 */
 	multiple_results_error()
 		: std::runtime_error("query should have returned exactly one row")
 	{
@@ -156,8 +156,8 @@ class category
 	// --------------------------------------------------------------------
 	const std::string &name() const { return m_name; } ///< Returns the name of the category
-	iset key_fields() const; ///< Returns the cif::iset of key field names. Retrieved from the @ref category_validator for this category
+	iset key_fields() const;                           ///< Returns the cif::iset of key field names. Retrieved from the @ref category_validator for this category
-	std::set<uint16_t> key_field_indices() const; ///< Returns a set of indices for the key fields.
+	std::set<uint16_t> key_field_indices() const;      ///< Returns a set of indices for the key fields.
 	/// @brief Set the validator for this category to @a v
 	/// @param v The category_validator to assign. A nullptr value is allowed.
@@ -182,7 +182,7 @@ class category
 	/// @brief Validate links, that means, values in this category should have an
 	/// accompanying value in parent categories.
-	/// 
+	///
 	/// @note
 	/// The code makes one exception when validating missing links and that's between
 	/// *atom_site* and a parent *pdbx_poly_seq_scheme* or *entity_poly_seq*.
@@ -285,7 +285,7 @@ class category
 	/// Return the theoretical maximum number or rows that can be stored
 	size_t max_size() const
 	{
-		return std::numeric_limits<size_t>::max();	// this is a bit optimistic, I guess
+		return std::numeric_limits<size_t>::max(); // this is a bit optimistic, I guess
 	}
 	/// Return true if the category is empty
@@ -321,7 +321,7 @@ class category
 	/// @code{.cpp}
 	/// for (const auto &[name, value] : cat.rows<std::string,int>("item_name", "item_value"))
 	///   std::cout << name << ": " << value << '\n';
-	/// @endcode 
+	/// @endcode
 	///
 	/// @tparam Ts The types for the columns requested
 	/// @param names The names for the columns requested
@@ -344,7 +344,7 @@ class category
 	///
 	/// for (int id : cat.rows<int>("id"))
 	///   std::cout << id << '\n';
-	/// @endcode 
+	/// @endcode
 	///
 	/// @tparam Ts The types for the columns requested
 	/// @param names The names for the columns requested
@@ -363,7 +363,7 @@ class category
 	/// @code{.cpp}
 	/// for (row_handle rh : cat.find(cif::key("first_name") == "John" and cif::key("last_name") == "Doe"))
 	///    .. // do something with rh
-	/// @endcode 
+	/// @endcode
 	///
 	/// @param cond The condition for the query
 	/// @return A special iterator that loops over all elements that match. The iterator can be dereferenced
@@ -417,7 +417,7 @@ class category
 	/// @code{.cpp}
 	/// for (const auto &[name, value] : cat.find<std::string,int>(cif::key("item_value") > 10, "item_name", "item_value"))
 	///    std::cout << name << ": " << value << '\n';
-	/// @endcode 
+	/// @endcode
 	///
 	/// @param cond The condition for the query
 	/// @tparam Ts The types for the columns requested
@@ -776,8 +776,7 @@ class category
 	/// @brief Return whether a row exists that matches condition @a cond
 	/// @param cond The condition to match
 	/// @return True if a row exists
-	[[deprecated("Use contains instead")]]
+	[[deprecated("Use contains instead")]] bool exists(condition &&cond) const
-	bool exists(condition &&cond) const
 	{
 		return contains(std::move(cond));
 	}
@@ -875,7 +874,7 @@ class category
 	// 	insert_impl(pos, std::move(row));
 	// }
-	/// Erase the row pointed to by @a pos and return the iterator to the 
+	/// Erase the row pointed to by @a pos and return the iterator to the
 	/// row following pos.
 	iterator erase(iterator pos);
@@ -941,7 +940,6 @@ class category
 	/// result is unique in the context of this category
 	std::string get_unique_id(std::function<std::string(int)> generator = cif::cif_id_for_number);
 	/// @brief Generate a new, unique ID based on a string prefix followed by a number
 	/// @param prefix The string prefix
 	/// @return a new unique ID
@@ -1038,6 +1036,11 @@ class category
 		return result;
 	}
+	/** @brief Remove the column named @a column_name
+	 * @param column_name The column to be removed
+	 */
+	void remove_column(std::string_view column_name);
 	/// @brief Return whether a column with name @a name exists in this category
 	/// @param name The name of the column
 	/// @return True if the column exists
@@ -1082,11 +1085,10 @@ class category
 	void write(std::ostream &os, const std::vector<uint16_t> &order, bool includeEmptyColumns) const;
  public:
 	/// friend function to make it possible to do:
 	/// @code {.cpp}
 	/// std::cout << my_category;
-	/// @endcode 
+	/// @endcode
 	friend std::ostream &operator<<(std::ostream &os, const category &cat)
 	{
 		cat.write(os);

--- a/src/category.cpp
+++ b/src/category.cpp
--- a/src/datablock.cpp
+++ b/src/datablock.cpp
@@ -374,6 +374,10 @@ void datablock::write(std::ostream &os, const std::vector<std::string> &tag_orde
 bool datablock::operator==(const datablock &rhs) const
 {
+	// shortcut
+	if (this == &rhs)
+		return true;
 	auto &dbA = *this;
 	auto &dbB = rhs;

--- a/src/pdb/reconstruct.cpp
+++ b/src/pdb/reconstruct.cpp
@@ -117,8 +117,8 @@ void fixNegativeSeqID(category &atom_site)
 			const auto &[auth_seq_id, label_seq_id] = poly_seq.front();
 			for (auto row : atom_site.find(key("label_asym_id") == asym_id and
-											key("auth_seq_id") == auth_seq_id and
+										   key("auth_seq_id") == auth_seq_id and
-											key("label_seq_id") == label_seq_id))
+										   key("label_seq_id") == label_seq_id))
 			{
 				row.assign("label_seq_id", ".", false, false);
 			}
@@ -236,20 +236,19 @@ void checkAtomRecords(datablock &db)
 			{ "auth_seq_id", auth_seq_id.value_or(std::to_string(*label_seq_id)) },
 			{ "auth_comp_id", auth_comp_id.value_or(*label_comp_id) },
 			{ "auth_atom_id", auth_atom_id.value_or(*label_atom_id) } });
 		// Rewrite the coordinates and other fields that look better in a fixed format
 		// Be careful not to nuke invalidly formatted data here
-		for (auto [tag, prec] : std::vector<std::tuple<std::string_view,std::string::size_type>>{
+		for (auto [tag, prec] : std::vector<std::tuple<std::string_view, std::string::size_type>>{
-				{ "cartn_x", 3 },
+				 { "cartn_x", 3 },
-				{ "cartn_y", 3 },
+				 { "cartn_y", 3 },
-				{ "cartn_z", 3 },
+				 { "cartn_z", 3 },
-				{ "occupancy", 2 },
+				 { "occupancy", 2 },
-				{ "b_iso_or_equiv", 2 }
+				 { "b_iso_or_equiv", 2 } })
-			})
 		{
 			if (row[tag].empty())
 				continue;
 			float v;
 			auto s = row.get<std::string>(tag);
 			if (auto [ptr, ec] = cif::from_chars(s.data(), s.data() + s.length(), v); ec != std::errc())
@@ -260,8 +259,30 @@ void checkAtomRecords(datablock &db)
 				char b[12];
 				if (auto [ptr, ec] = cif::to_chars(b, b + sizeof(b), v, cif::chars_format::fixed, prec); ec == std::errc())
-					row.assign(tag, {b, static_cast<std::string::size_type>(ptr - b)}, false, false);
+					row.assign(tag, { b, static_cast<std::string::size_type>(ptr - b) }, false, false);
+			}
+		}
+	}
+	auto *cv = atom_site.get_cat_validator();
+	if (cv)
+	{
+		// See if there are columns that are no longer known
+		for (auto tag : atom_site.get_columns())
+		{
+			if (cv->get_validator_for_item(tag) != nullptr)
+				continue;
+			auto r = atom_site.find_first(key(tag) != null);
+			if (not r)
+			{
+				if (cif::VERBOSE > 0)
+					std::clog << "Dropping unknown column " << tag << '\n';
+				atom_site.remove_column(tag);
 			}
+			else if (cif::VERBOSE > 0)
+				std::clog << "Keeping unknown column " << std::quoted(tag) << " in atom_site since it is not empty\n";
 		}
 	}
 }
@@ -607,14 +628,14 @@ void comparePolySeqSchemes(datablock &db)
 		if (i == asym_ids_ndb.end() or *i != asym_id)
 			asym_ids_ndb.insert(i, asym_id);
 	}
 	for (auto asym_id : pdbx_poly_seq_scheme.rows<std::string>("asym_id"))
 	{
 		auto i = std::lower_bound(asym_ids_pdbx.begin(), asym_ids_pdbx.end(), asym_id);
 		if (i == asym_ids_pdbx.end() or *i != asym_id)
 			asym_ids_pdbx.insert(i, asym_id);
 	}
 	// If we have different Asym ID's assume the ndb is invalid.
 	if (asym_ids_ndb != asym_ids_pdbx)
 	{
@@ -632,7 +653,7 @@ void comparePolySeqSchemes(datablock &db)
 			auto pdbx_range = pdbx_poly_seq_scheme.find(key("asym_id") == asym_id);
 			for (auto ndb_i = ndb_range.begin(), pdbx_i = pdbx_range.begin();
-				ndb_i != ndb_range.end() or pdbx_i != pdbx_range.end(); ++ndb_i, ++pdbx_i)
+				 ndb_i != ndb_range.end() or pdbx_i != pdbx_range.end(); ++ndb_i, ++pdbx_i)
 			{
 				if (ndb_i == ndb_range.end() or pdbx_i == pdbx_range.end())
 				{
@@ -662,6 +683,9 @@ void comparePolySeqSchemes(datablock &db)
 			}
 		}
 	}
+	if (ndb_poly_seq_scheme.empty())
+		db.erase(std::remove(db.begin(), db.end(), ndb_poly_seq_scheme), db.end());
 }
 void reconstruct_pdbx(file &file, std::string_view dictionary)