Commit afc541b9 by Maarten L. Hekkelman

Merge remote-tracking branch 'github/develop-cif2fasta' into develop

parents 22537c0e 7e4d2ffb
......@@ -70,6 +70,26 @@ class duplicate_key_error : public std::runtime_error
}
};
/// @brief A missing_key_error is thrown when an attempt is made
/// to create an index when one of the key fields is missing.
class missing_key_error : public std::runtime_error
{
public:
/**
* @brief Construct a new duplicate key error object
*/
missing_key_error(const std::string &msg, const std::string &key)
: std::runtime_error(msg)
, m_key(key)
{
}
const std::string &get_key() const noexcept { return m_key; }
private:
std::string m_key;
};
/// @brief A multiple_results_error is throw when you request a single
/// row using a query but the query contains more than exactly one row.
class multiple_results_error : public std::runtime_error
......@@ -516,7 +536,7 @@ class category
/// @param column The name of the column to return the value for
/// @return The value found
template <typename T>
T find1(condition &&cond, const char *column) const
T find1(condition &&cond, std::string_view column) const
{
return find1<T>(cbegin(), std::move(cond), column);
}
......@@ -530,7 +550,7 @@ class category
/// @param column The name of the column to return the value for
/// @return The value found
template <typename T, std::enable_if_t<not is_optional_v<T>, int> = 0>
T find1(const_iterator pos, condition &&cond, const char *column) const
T find1(const_iterator pos, condition &&cond, std::string_view column) const
{
auto h = find<T>(pos, std::move(cond), column);
......@@ -549,7 +569,7 @@ class category
/// @param column The name of the column to return the value for
/// @return The value found, can be empty if no row matches the condition
template <typename T, std::enable_if_t<is_optional_v<T>, int> = 0>
T find1(const_iterator pos, condition &&cond, const char *column) const
T find1(const_iterator pos, condition &&cond, std::string_view column) const
{
auto h = find<typename T::value_type>(pos, std::move(cond), column);
......@@ -644,7 +664,7 @@ class category
/// @param column The column for which the value should be returned
/// @return The value found or a default constructed value if not found
template <typename T>
T find_first(condition &&cond, const char *column) const
T find_first(condition &&cond, std::string_view column) const
{
return find_first<T>(cbegin(), std::move(cond), column);
}
......@@ -657,7 +677,7 @@ class category
/// @param column The column for which the value should be returned
/// @return The value found or a default constructed value if not found
template <typename T>
T find_first(const_iterator pos, condition &&cond, const char *column) const
T find_first(const_iterator pos, condition &&cond, std::string_view column) const
{
auto h = find<T>(pos, std::move(cond), column);
......@@ -701,7 +721,7 @@ class category
/// @param cond The condition to search for
/// @return The value found or the minimal value for the type
template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, int> = 0>
T find_max(const char *column, condition &&cond) const
T find_max(std::string_view column, condition &&cond) const
{
T result = std::numeric_limits<T>::min();
......@@ -719,7 +739,7 @@ class category
/// @param column The column to use for the value
/// @return The value found or the minimal value for the type
template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, int> = 0>
T find_max(const char *column) const
T find_max(std::string_view column) const
{
return find_max<T>(column, all());
}
......@@ -730,7 +750,7 @@ class category
/// @param cond The condition to search for
/// @return The value found or the maximum value for the type
template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, int> = 0>
T find_min(const char *column, condition &&cond) const
T find_min(std::string_view column, condition &&cond) const
{
T result = std::numeric_limits<T>::max();
......@@ -748,7 +768,7 @@ class category
/// @param column The column to use for the value
/// @return The value found or the maximum value for the type
template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, int> = 0>
T find_min(const char *column) const
T find_min(std::string_view column) const
{
return find_min<T>(column, all());
}
......@@ -756,8 +776,17 @@ class category
/// @brief Return whether a row exists that matches condition @a cond
/// @param cond The condition to match
/// @return True if a row exists
[[deprecated("Use contains instead")]]
bool exists(condition &&cond) const
{
return contains(std::move(cond));
}
/// @brief Return whether a row exists that matches condition @a cond
/// @param cond The condition to match
/// @return True if a row exists
bool contains(condition &&cond) const
{
bool result = false;
if (cond)
......@@ -922,6 +951,11 @@ class category
{ return prefix + std::to_string(nr + 1); });
}
/// @brief Generate a new, unique value for a item named @a tag
/// @param tag The name of the item
/// @return a new unique value
std::string get_unique_value(std::string_view tag);
// --------------------------------------------------------------------
/// \brief Update a single column named @a tag in the rows that match \a cond to value \a value
......
......@@ -934,6 +934,16 @@ struct key
{
}
/**
* @brief Construct a new key object using @a itemTag as name
*
* @param itemTag
*/
explicit key(std::string_view itemTag)
: m_item_tag(itemTag)
{
}
key(const key &) = delete;
key &operator=(const key &) = delete;
......
......@@ -556,7 +556,7 @@ struct item_handle::item_value_as<T, std::enable_if_t<std::is_arithmetic_v<T> an
auto txt = ref.text();
if (txt.empty())
if (ref.empty())
result = 1;
else
{
......
......@@ -562,22 +562,23 @@ class conditional_iterator_proxy
reference operator*()
{
return *mBegin;
return *m_begin;
}
pointer operator->()
{
return &*mBegin;
m_current = *m_begin;
return &m_current;
}
conditional_iterator_impl &operator++()
{
while (mBegin != mEnd)
while (m_begin != m_end)
{
if (++mBegin == mEnd)
if (++m_begin == m_end)
break;
if (m_condition->operator()(mBegin))
if (m_condition->operator()(m_begin))
break;
}
......@@ -591,18 +592,22 @@ class conditional_iterator_proxy
return result;
}
bool operator==(const conditional_iterator_impl &rhs) const { return mBegin == rhs.mBegin; }
bool operator!=(const conditional_iterator_impl &rhs) const { return mBegin != rhs.mBegin; }
bool operator==(const conditional_iterator_impl &rhs) const { return m_begin == rhs.m_begin; }
bool operator!=(const conditional_iterator_impl &rhs) const { return m_begin != rhs.m_begin; }
bool operator==(const row_iterator &rhs) const { return m_begin == rhs; }
bool operator!=(const row_iterator &rhs) const { return m_begin != rhs; }
template <typename IRowType, typename... ITs>
bool operator==(const iterator_impl<IRowType, ITs...> &rhs) const { return mBegin == rhs; }
bool operator==(const iterator_impl<IRowType, ITs...> &rhs) const { return m_begin == rhs; }
template <typename IRowType, typename... ITs>
bool operator!=(const iterator_impl<IRowType, ITs...> &rhs) const { return mBegin != rhs; }
bool operator!=(const iterator_impl<IRowType, ITs...> &rhs) const { return m_begin != rhs; }
private:
CategoryType *mCat;
base_iterator mBegin, mEnd;
CategoryType *m_cat;
base_iterator m_begin, m_end;
value_type m_current;
const condition *m_condition;
};
......@@ -673,13 +678,13 @@ iterator_proxy<Category, Ts...>::iterator_proxy(Category &cat, row_iterator pos,
template <typename Category, typename... Ts>
conditional_iterator_proxy<Category, Ts...>::conditional_iterator_impl::conditional_iterator_impl(
Category &cat, row_iterator pos, const condition &cond, const std::array<uint16_t, N> &cix)
: mCat(&cat)
, mBegin(pos, cix)
, mEnd(cat.end(), cix)
: m_cat(&cat)
, m_begin(pos, cix)
, m_end(cat.end(), cix)
, m_condition(&cond)
{
if (m_condition == nullptr or m_condition->empty())
mBegin = mEnd;
m_begin = m_end;
}
template <typename Category, typename... Ts>
......
......@@ -202,7 +202,7 @@ struct category_validator
}
/// @brief Add item_validator @a v to the list of item validators
void addItemValidator(item_validator &&v);
void add_item_validator(item_validator &&v);
/// @brief Return the item_validator for item @a tag, may return nullptr
const item_validator *get_validator_for_item(std::string_view tag) const;
......
......@@ -672,7 +672,7 @@ void category::set_validator(const validator *v, datablock &db)
std::ostringstream msg;
msg << "Cannot construct index since the key field" << (missing.size() > 1 ? "s" : "") << " "
<< cif::join(missing, ", ") << " in " << m_name << " " << (missing.size() == 1 ? "is" : "are") << " missing\n";
throw std::runtime_error(msg.str());
throw missing_key_error(msg.str(), *missing.begin());
}
}
}
......@@ -860,7 +860,7 @@ bool category::validate_links() const
auto cond = get_parents_condition(r, *parent);
if (not cond)
continue;
if (not parent->exists(std::move(cond)))
if (not parent->contains(std::move(cond)))
{
++missing;
if (VERBOSE and first_missing_rows.size() < 5)
......@@ -997,7 +997,7 @@ bool category::has_children(row_handle r) const
for (auto &&[childCat, link] : m_child_links)
{
if (not childCat->exists(get_children_condition(r, *childCat)))
if (not childCat->contains(get_children_condition(r, *childCat)))
continue;
result = true;
......@@ -1013,7 +1013,7 @@ bool category::has_parents(row_handle r) const
for (auto &&[parentCat, link] : m_parent_links)
{
if (not parentCat->exists(get_parents_condition(r, *parentCat)))
if (not parentCat->contains(get_parents_condition(r, *parentCat)))
continue;
result = true;
......@@ -1214,7 +1214,7 @@ void category::erase_orphans(condition &&cond, category &parent)
if (not cond(r))
continue;
if (parent.exists(get_parents_condition(r, parent)))
if (parent.contains(get_parents_condition(r, parent)))
continue;
if (VERBOSE > 1)
......@@ -1263,7 +1263,7 @@ std::string category::get_unique_id(std::function<std::string(int)> generator)
{
for (;;)
{
if (not exists(key(id_tag) == result))
if (not contains(key(id_tag) == result))
break;
result = generator(static_cast<int>(m_last_unique_num++));
......@@ -1273,6 +1273,36 @@ std::string category::get_unique_id(std::function<std::string(int)> generator)
return result;
}
std::string category::get_unique_value(std::string_view tag)
{
std::string result;
if (m_validator and m_cat_validator)
{
auto iv = m_cat_validator->get_validator_for_item(tag);
if (iv and iv->m_type and iv->m_type->m_primitive_type == DDL_PrimitiveType::Numb)
{
uint64_t v = find_max<uint64_t>(tag);
result = std::to_string(v + 1);
}
}
if (result.empty())
{
// brain-dead implementation
for (size_t ix = 0; ix < size(); ++ix)
{
// result = m_name + "-" + std::to_string(ix);
result = cif_id_for_number(ix);
if (not contains(key(tag) == result))
break;
}
}
return result;
}
void category::update_value(const std::vector<row_handle> &rows, std::string_view tag, std::string_view value)
{
using namespace std::literals;
......@@ -1376,7 +1406,7 @@ void category::update_value(const std::vector<row_handle> &rows, std::string_vie
check = std::move(check) && key(ck) == parent[pk].text();
}
if (childCat->exists(std::move(check))) // phew..., narrow escape
if (childCat->contains(std::move(check))) // phew..., narrow escape
continue;
// create the actual copy, if we can...
......@@ -1405,7 +1435,7 @@ void category::update_value(const std::vector<row_handle> &rows, std::string_vie
void category::update_value(row *row, uint16_t column, std::string_view value, bool updateLinked, bool validate)
{
// make sure we have an index, if possible
if (m_index == nullptr and m_cat_validator != nullptr)
if ((updateLinked or validate) and m_index == nullptr and m_cat_validator != nullptr)
m_index = new category_index(*this);
auto &col = m_columns[column];
......@@ -1444,7 +1474,7 @@ void category::update_value(row *row, uint16_t column, std::string_view value, b
if (not value.empty())
row->append(column, { value });
if (reinsert)
if (reinsert and m_index != nullptr)
m_index->insert(*this, row);
// see if we need to update any child categories that depend on this value
......
......@@ -87,7 +87,7 @@ class dictionary_parser : public parser
error("Undefined category '" + iv.first);
for (auto &v : iv.second)
const_cast<category_validator *>(cv)->addItemValidator(std::move(v));
const_cast<category_validator *>(cv)->add_item_validator(std::move(v));
}
// check all item validators for having a typeValidator
......
......@@ -1783,7 +1783,7 @@ atom &structure::emplace_atom(atom &&atom)
std::string symbol = atom.get_property("type_symbol");
using namespace cif::literals;
if (not atom_type.exists("symbol"_key == symbol))
if (not atom_type.contains("symbol"_key == symbol))
atom_type.emplace({ { "symbol", symbol } });
return m_atoms.emplace_back(std::move(atom));
......@@ -1969,7 +1969,7 @@ void structure::change_residue(residue &res, const std::string &newCompound,
// create rest
auto &chemComp = m_db["chem_comp"];
if (not chemComp.exists(key("id") == newCompound))
if (not chemComp.contains(key("id") == newCompound))
{
chemComp.emplace({{"id", newCompound},
{"name", compound->name()},
......@@ -2702,7 +2702,7 @@ void structure::cleanup_empty_categories()
for (auto chemComp : chem_comp)
{
std::string compID = chemComp["id"].as<std::string>();
if (atomSite.exists("label_comp_id"_key == compID or "auth_comp_id"_key == compID))
if (atomSite.contains("label_comp_id"_key == compID or "auth_comp_id"_key == compID))
continue;
obsoleteChemComps.push_back(chemComp);
......@@ -2719,7 +2719,7 @@ void structure::cleanup_empty_categories()
for (auto entity : entities)
{
std::string entityID = entity["id"].as<std::string>();
if (atomSite.exists("label_entity_id"_key == entityID))
if (atomSite.contains("label_entity_id"_key == entityID))
continue;
obsoleteEntities.push_back(entity);
......
......@@ -31,6 +31,100 @@
namespace cif::pdb
{
void fillLabelAsymID(category &atom_site)
{
// pray that label_entity_id is filled in and use that to discriminate between asyms
std::map<std::tuple<std::string, std::string>, std::string> mapAuthAsymIDAndEntityToLabelAsymID;
for (const auto &[label_entity_id, auth_asym_id, label_asym_id] :
atom_site.find<std::optional<std::string>, std::string, std::string>(
key("label_asym_id") != cif::null, "label_entity_id", "auth_asym_id", "label_asym_id"))
{
if (not label_entity_id.has_value())
continue;
auto key = make_tuple(auth_asym_id, *label_entity_id);
auto i = mapAuthAsymIDAndEntityToLabelAsymID.find(key);
if (i == mapAuthAsymIDAndEntityToLabelAsymID.end())
mapAuthAsymIDAndEntityToLabelAsymID.emplace(make_pair(key, label_asym_id));
else if (i->second != label_asym_id)
{
if (cif::VERBOSE > 0)
std::clog << "Inconsistent assignment of label_asym_id for the tuple entity_id: " << *label_entity_id << " and auth_asym_id: " << auth_asym_id << '\n';
mapAuthAsymIDAndEntityToLabelAsymID.clear();
break;
}
}
for (const auto &[key, value] : mapAuthAsymIDAndEntityToLabelAsymID)
{
const auto &[auth_asym_id, label_entity_id] = key;
for (auto row : atom_site.find(cif::key("label_asym_id") == null and
cif::key("auth_asym_id") == auth_asym_id and
cif::key("label_entity_id") == label_entity_id))
{
row.assign("label_asym_id", value, false, true);
}
}
// Check to see if we're done
if (atom_site.contains(key("label_asym_id") == cif::null))
{
// nope, not yet.
throw std::runtime_error("atom_site category still contains records with empty label_asym_id, don't know how to continue");
}
}
void fixNegativeSeqID(category &atom_site)
{
std::set<std::string> asymsWithNegativeSeqID;
for (auto asym_id : atom_site.find<std::string>(key("label_seq_id") < 0, "label_asym_id"))
asymsWithNegativeSeqID.emplace(asym_id);
for (auto asym_id : asymsWithNegativeSeqID)
{
// create a pseudo entity_poly_seq first
std::vector<std::tuple<std::string, int>> poly_seq;
for (auto key : atom_site.find<std::string, int>(key("label_asym_id") == asym_id, "auth_seq_id", "label_seq_id"))
{
if (poly_seq.empty() or poly_seq.back() != key)
poly_seq.emplace_back(key);
}
// simply renumber all items, but only if it is really a poly (i.e. size > 1)
if (poly_seq.size() > 1)
{
int seq_id = 1;
for (const auto &[auth_seq_id, label_seq_id] : poly_seq)
{
for (auto row : atom_site.find(key("label_asym_id") == asym_id and
key("auth_seq_id") == auth_seq_id and
key("label_seq_id") == label_seq_id))
{
row.assign("label_seq_id", std::to_string(seq_id), false, false);
}
++seq_id;
}
}
else if (poly_seq.size() == 1) // a monomer?
{
const auto &[auth_seq_id, label_seq_id] = poly_seq.front();
for (auto row : atom_site.find(key("label_asym_id") == asym_id and
key("auth_seq_id") == auth_seq_id and
key("label_seq_id") == label_seq_id))
{
row.assign("label_seq_id", ".", false, false);
}
}
}
}
void checkAtomRecords(datablock &db)
{
......@@ -42,6 +136,14 @@ void checkAtomRecords(datablock &db)
auto &atom_type = db["atom_type"];
auto &chem_comp = db["chem_comp"];
// Some common errors: missing label_asym_id for some of the atom records
if (atom_site.contains(key("label_asym_id") == cif::null))
fillLabelAsymID(atom_site);
// And negative seq_id values
if (atom_site.contains(key("label_seq_id") < 0))
fixNegativeSeqID(atom_site);
for (auto row : atom_site)
{
const auto &[symbol, label_asym_id, auth_asym_id, label_comp_id, auth_comp_id, label_seq_id, auth_seq_id, label_atom_id, auth_atom_id] =
......@@ -117,16 +219,50 @@ void checkAtomRecords(datablock &db)
throw std::runtime_error("atom_site record has peptide comp_id but no sequence number, cannot continue");
std::string seq_id;
if (label_seq_id.has_value())
if (label_seq_id.has_value() and *label_seq_id > 0)
seq_id = std::to_string(*label_seq_id);
else if (auth_seq_id.has_value())
{
seq_id = *auth_seq_id;
row.assign("label_seq_id", seq_id, false, true);
}
if (not label_atom_id.has_value())
row.assign("label_atom_id", asym_id, false, true);
row.assign({ //
{ "auth_asym_id", auth_asym_id.value_or(*label_asym_id) },
{ "auth_seq_id", auth_seq_id.value_or(std::to_string(*label_seq_id)) },
{ "auth_comp_id", auth_comp_id.value_or(*label_comp_id) },
{ "auth_atom_id", auth_atom_id.value_or(*label_atom_id) } });
// Rewrite the coordinates and other fields that look better in a fixed format
// Be careful not to nuke invalidly formatted data here
for (const auto &[tag, prec] : std::initializer_list<std::tuple<std::string_view,std::string::size_type>>{
{ "cartn_x", 3 },
{ "cartn_y", 3 },
{ "cartn_z", 3 },
{ "occupancy", 2 },
{ "b_iso_or_equiv", 2 }
})
{
if (row[tag].empty())
continue;
float v;
auto s = row.get<std::string>(tag);
if (auto [ptr, ec] = cif::from_chars(s.data(), s.data() + s.length(), v); ec != std::errc())
continue;
if (s.length() < prec + 1 or s[s.length() - prec - 1] != '.')
{
char b[12];
if (auto [ptr, ec] = cif::to_chars(b, b + sizeof(b), v, cif::chars_format::fixed, prec); ec == std::errc())
row.assign(tag, {b, ptr}, false, false);
}
}
}
}
......@@ -159,34 +295,34 @@ void createEntity(datablock &db)
auto &struct_asym = db["struct_asym"];
struct_asym.add_column("entity_id");
std::map<std::string,std::vector<std::tuple<std::string,int>>> asyms;
std::map<std::string, std::vector<std::tuple<std::string, int>>> asyms;
for (auto asym_id : db["struct_asym"].rows<std::string>("id"))
{
int last_seq_id = -1;
for (const auto &[comp_id, seq_id] : atom_site.find<std::string,int>("label_asym_id"_key == asym_id, "label_comp_id", "label_seq_id"))
for (const auto &[comp_id, seq_id] : atom_site.find<std::string, int>("label_asym_id"_key == asym_id, "label_comp_id", "label_seq_id"))
{
if (seq_id == last_seq_id)
continue;
last_seq_id = seq_id;
asyms[asym_id].emplace_back(comp_id, last_seq_id);
}
}
auto less = [](const std::vector<std::tuple<std::string,int>> &a, const std::vector<std::tuple<std::string,int>> &b)
auto less = [](const std::vector<std::tuple<std::string, int>> &a, const std::vector<std::tuple<std::string, int>> &b)
{
int d = static_cast<int>(a.size()) - static_cast<int>(b.size());
return d == 0 ? a > b : d > 0;
};
std::set<std::vector<std::tuple<std::string,int>>,decltype(less)> entities(less);
std::set<std::vector<std::tuple<std::string, int>>, decltype(less)> entities(less);
for (const auto &[asym_id, content] : asyms)
entities.emplace(content);
auto water_weight = cf.create("HOH")->formula_weight();
int poly_count = 0;
......@@ -230,7 +366,7 @@ void createEntity(datablock &db)
{
if (ac != content)
continue;
atom_site.update_value("label_asym_id"_key == asym_id, "label_entity_id", entity_id);
struct_asym.update_value("id"_key == asym_id, "entity_id", entity_id);
......@@ -240,13 +376,12 @@ void createEntity(datablock &db)
count = atom_site.count("label_asym_id"_key == asym_id and "label_atom_id"_key == "O");
}
entity.emplace({ //
entity.emplace({ //
{ "id", entity_id },
{ "type", type },
{ "pdbx_description", desc },
{ "formula_weight", weight },
{ "pdbx_number_of_molecules", count }
});
{ "pdbx_number_of_molecules", count } });
}
}
......@@ -268,11 +403,11 @@ void createEntityPoly(datablock &db)
bool non_std_linkage = false;
std::string pdb_strand_id;
for (const auto &[comp_id, seq_id, auth_asym_id] : atom_site.find<std::string,int,std::string>("label_entity_id"_key == entity_id, "label_comp_id", "label_seq_id", "auth_asym_id"))
for (const auto &[comp_id, seq_id, auth_asym_id] : atom_site.find<std::string, int, std::string>("label_entity_id"_key == entity_id, "label_comp_id", "label_seq_id", "auth_asym_id"))
{
if (seq_id == last_seq_id)
continue;
last_seq_id = seq_id;
auto c = cf.create(comp_id);
......@@ -280,7 +415,7 @@ void createEntityPoly(datablock &db)
std::string letter;
char letter_can;
// TODO: Perhaps we should improve this...
// TODO: Perhaps we should improve this...
if (type != "other")
{
std::string c_type;
......@@ -301,7 +436,7 @@ void createEntityPoly(datablock &db)
letter_can = c->one_letter_code();
if (letter_can == 0)
letter_can = 'X';
letter = '(' + comp_id + ')';
non_std_linkage = true;
......@@ -334,7 +469,7 @@ void createEntityPoly(datablock &db)
for (auto i = seq.begin() + 80; i < seq.end(); i += 80)
i = seq.insert(i, '\n') + 1;
for (auto i = seq_can.begin() + 76; i < seq_can.end(); i += 76)
{
auto j = i;
......@@ -351,15 +486,14 @@ void createEntityPoly(datablock &db)
i = j;
}
entity_poly.emplace({ //
entity_poly.emplace({ //
{ "entity_id", entity_id },
{ "type", type },
{ "nstd_linkage", non_std_linkage },
{ "nstd_monomer", non_std_monomer },
{ "pdbx_seq_one_letter_code", seq },
{ "pdbx_seq_one_letter_code_can", seq_can },
{ "pdbx_strand_id", pdb_strand_id }
});
{ "pdbx_strand_id", pdb_strand_id } });
}
}
......@@ -381,7 +515,7 @@ void createEntityPolySeq(datablock &db)
std::string last_comp_id;
std::string asym_id = struct_asym.find_first<std::string>("entity_id"_key == entity_id, "id");
for (const auto &[comp_id, seq_id] : atom_site.find<std::string,int>("label_entity_id"_key == entity_id and "label_asym_id"_key == asym_id, "label_comp_id", "label_seq_id"))
for (const auto &[comp_id, seq_id] : atom_site.find<std::string, int>("label_entity_id"_key == entity_id and "label_asym_id"_key == asym_id, "label_comp_id", "label_seq_id"))
{
bool hetero = false;
......@@ -395,27 +529,22 @@ void createEntityPolySeq(datablock &db)
if (hetero)
{
entity_poly_seq.back().assign({
{ "hetero", true }
});
entity_poly_seq.back().assign({ { "hetero", true } });
}
entity_poly_seq.emplace({ //
entity_poly_seq.emplace({ //
{ "entity_id", entity_id },
{ "num", seq_id },
{ "mon_id", comp_id },
{ "hetero", hetero }
});
{ "hetero", hetero } });
last_seq_id = seq_id;
last_comp_id = comp_id;
}
// you cannot assume this is correct...
entity_poly_seq.sort([](row_handle a, row_handle b)
{
return a.get<int>("num") < b.get<int>("num");
});
{ return a.get<int>("num") < b.get<int>("num"); });
}
}
......@@ -436,17 +565,16 @@ void createPdbxPolySeqScheme(datablock &db)
{
for (auto asym_id : struct_asym.find<std::string>("entity_id"_key == entity_id, "id"))
{
for (const auto &[comp_id, num, hetero] : entity_poly_seq.find<std::string,int,bool>("entity_id"_key == entity_id, "mon_id", "num", "hetero"))
for (const auto &[comp_id, num, hetero] : entity_poly_seq.find<std::string, int, bool>("entity_id"_key == entity_id, "mon_id", "num", "hetero"))
{
const auto &[auth_seq_num, auth_mon_id, ins_code] =
atom_site.find_first<std::string,std::string,std::optional<std::string>>(
atom_site.find_first<std::string, std::string, std::optional<std::string>>(
"label_asym_id"_key == asym_id and "label_seq_id"_key == num,
"auth_seq_id", "auth_comp_id", "pdbx_PDB_ins_code"
);
"auth_seq_id", "auth_comp_id", "pdbx_PDB_ins_code");
pdbx_poly_seq_scheme.emplace({ //
{ "asym_id", asym_id },
{ "entity_id", entity_id },
{ "entity_id", entity_id },
{ "seq_id", num },
{ "mon_id", comp_id },
{ "ndb_seq_num", num },
......@@ -456,8 +584,81 @@ void createPdbxPolySeqScheme(datablock &db)
{ "auth_mon_id", auth_mon_id },
{ "pdb_strand_id", pdb_strand_id },
{ "pdb_ins_code", ins_code },
{ "hetero", hetero }
});
{ "hetero", hetero } });
}
}
}
}
// Some programs write out a ndb_poly_seq_scheme, which has been replaced by pdbx_poly_seq_scheme
void comparePolySeqSchemes(datablock &db)
{
auto &ndb_poly_seq_scheme = db["ndb_poly_seq_scheme"];
auto &pdbx_poly_seq_scheme = db["pdbx_poly_seq_scheme"];
// Since often ndb_poly_seq_scheme only contains an id and mon_id field
// we assume that it should match the accompanying pdbx_poly_seq
std::vector<std::string> asym_ids_ndb, asym_ids_pdbx;
for (auto asym_id : ndb_poly_seq_scheme.rows<std::string>("id"))
{
auto i = std::lower_bound(asym_ids_ndb.begin(), asym_ids_ndb.end(), asym_id);
if (i == asym_ids_ndb.end() or *i != asym_id)
asym_ids_ndb.insert(i, asym_id);
}
for (auto asym_id : pdbx_poly_seq_scheme.rows<std::string>("asym_id"))
{
auto i = std::lower_bound(asym_ids_pdbx.begin(), asym_ids_pdbx.end(), asym_id);
if (i == asym_ids_pdbx.end() or *i != asym_id)
asym_ids_pdbx.insert(i, asym_id);
}
// If we have different Asym ID's assume the ndb is invalid.
if (asym_ids_ndb != asym_ids_pdbx)
{
if (cif::VERBOSE > 0)
std::clog << "The asym ID's of ndb_poly_seq_scheme and pdbx_poly_seq_scheme are not equal, dropping ndb_poly_seq_scheme\n";
ndb_poly_seq_scheme.clear();
}
else
{
for (const auto &asym_id : asym_ids_ndb)
{
bool valid = true;
auto ndb_range = ndb_poly_seq_scheme.find(key("id") == asym_id);
auto pdbx_range = pdbx_poly_seq_scheme.find(key("asym_id") == asym_id);
for (auto ndb_i = ndb_range.begin(), pdbx_i = pdbx_range.begin();
ndb_i != ndb_range.end() or pdbx_i != pdbx_range.end(); ++ndb_i, ++pdbx_i)
{
if (ndb_i == ndb_range.end() or pdbx_i == pdbx_range.end())
{
if (cif::VERBOSE > 0)
std::clog << "The sequences in ndb_poly_seq_scheme and pdbx_poly_seq_scheme are unequal in size for asym ID " << asym_id << '\n';
valid = false;
break;
}
auto ndb_mon_id = ndb_i->get<std::string>("mon_id");
auto pdbx_mon_id = pdbx_i->get<std::string>("mon_id");
if (ndb_mon_id != pdbx_mon_id)
{
if (cif::VERBOSE > 0)
std::clog << "The sequences in ndb_poly_seq_scheme and pdbx_poly_seq_scheme contain different mon ID's for asym ID " << asym_id << '\n';
valid = false;
break;
}
}
if (not valid)
{
if (cif::VERBOSE > 0)
std::clog << "Dropping asym ID " << asym_id << " from ndb_poly_seq_scheme\n";
ndb_poly_seq_scheme.erase(key("id") == asym_id);
}
}
}
......@@ -498,45 +699,154 @@ void reconstruct_pdbx(file &file, std::string_view dictionary)
entry_id = entry.front().get<std::string>("id");
}
std::vector<std::string> invalidCategories;
for (auto &cat : db)
{
auto cv = validator.get_validator_for_category(cat.name());
if (not cv)
continue;
for (auto link : validator.get_links_for_child(cat.name()))
try
{
if (link->m_parent_category != "entry")
auto cv = validator.get_validator_for_category(cat.name());
if (not cv)
continue;
// So, this cat should have a link to the entry
for (auto link : validator.get_links_for_child(cat.name()))
{
if (link->m_parent_category != "entry")
continue;
auto pk = find(link->m_parent_keys.begin(), link->m_parent_keys.end(), "id");
if (pk == link->m_parent_keys.end())
continue;
// So, this cat should have a link to the entry
auto ix = pk - link->m_parent_keys.begin();
auto key = link->m_child_keys[ix];
auto pk = find(link->m_parent_keys.begin(), link->m_parent_keys.end(), "id");
if (pk == link->m_parent_keys.end())
continue;
auto ix = pk - link->m_parent_keys.begin();
auto key = link->m_child_keys[ix];
for (auto row : cat)
{
row.assign({ { key, entry_id } });
}
}
for (auto row : cat)
// Fill in all mandatory fields
for (auto key : cv->m_mandatory_fields)
{
row.assign({ { key, entry_id } });
if (not cat.has_column(key))
{
if (cif::VERBOSE > 0)
std::clog << "Adding mandatory key " << key << " to category " << cat.name() << '\n';
cat.add_column(key);
}
}
}
// See if all categories that need a key do have a value
if (cv->m_keys.size() == 1)
{
auto key = cv->m_keys.front();
for (auto row : cat)
enum class State
{
auto ord = row.get<std::string>(key.c_str());
if (ord.empty())
row.assign({ //
{ key, cat.get_unique_id([](int nr)
{ return std::to_string(nr); }) } });
Start,
MissingKeys,
DuplicateKeys
} state = State::Start;
for (;;)
{
// See if we can build an index
try
{
cat.set_validator(&validator, db);
}
catch (const missing_key_error &ex)
{
if (state == State::MissingKeys)
{
if (cif::VERBOSE > 0)
std::clog << "Repairing failed for category " << cat.name() << ", missing keys remain: " << ex.what() << '\n';
throw;
}
state = State::MissingKeys;
auto key = ex.get_key();
if (cif::VERBOSE > 0)
std::clog << "Need to add key " << key << " to category " << cat.name() << '\n';
for (auto row : cat)
{
auto ord = row.get<std::string>(key.c_str());
if (ord.empty())
row.assign({ //
{ key, cat.get_unique_value(key) } });
}
continue;
}
catch (const duplicate_key_error &ex)
{
if (state == State::DuplicateKeys)
{
if (cif::VERBOSE > 0)
std::clog << "Repairing failed for category " << cat.name() << ", duplicate keys remain: " << ex.what() << '\n';
throw;
}
state = State::DuplicateKeys;
if (cif::VERBOSE > 0)
std::clog << "Attempt to fix " << cat.name() << " failed: " << ex.what() << '\n';
// replace fields that do not define a relation to a parent
std::set<std::string> replaceableKeys;
for (auto key : cv->m_keys)
{
bool replaceable = true;
for (auto lv : validator.get_links_for_child(cat.name()))
{
if (find(lv->m_child_keys.begin(), lv->m_child_keys.end(), key) != lv->m_child_keys.end())
{
replaceable = false;
break;
}
}
if (replaceable)
replaceableKeys.insert(key);
}
if (replaceableKeys.empty())
throw std::runtime_error("Cannot repair category " + cat.name() + " since it contains duplicate keys that cannot be replaced");
for (auto key : replaceableKeys)
{
for (auto row : cat)
row.assign(key, cat.get_unique_value(key), false, false);
}
continue;
}
break;
}
}
catch (const std::exception &ex)
{
if (cif::VERBOSE > 0)
std::clog << ex.what() << '\n';
std::clog << "Will drop category " << cat.name() << " since it cannot be repaired\n";
invalidCategories.emplace_back(cat.name());
}
}
for (auto cat_name : invalidCategories)
{
auto i = find_if(db.begin(), db.end(), [cat_name](const category &cat)
{ return cat.name() == cat_name; });
if (i != db.end())
db.erase(i);
}
file.load_dictionary(dictionary);
......@@ -550,12 +860,15 @@ void reconstruct_pdbx(file &file, std::string_view dictionary)
// Next make sure we have struct_asym records
if (db.get("struct_asym") == nullptr)
createStructAsym(db);
if (db.get("entity") == nullptr)
createEntity(db);
if (db.get("pdbx_poly_seq_scheme") == nullptr)
createPdbxPolySeqScheme(db);
if (db.get("ndb_poly_seq_scheme") != nullptr)
comparePolySeqSchemes(db);
}
} // namespace cif::pdb
......@@ -183,7 +183,7 @@ bool is_valid_pdbx_file(const file &file, std::string_view dictionary)
"label_asym_id"_key == asym_id and
"label_seq_id"_key == seq_id and not std::move(cond);
if (atom_site.exists(std::move(cond)))
if (atom_site.contains(std::move(cond)))
throw validation_error("An atom_site record exists that has no parent in the poly seq scheme categories");
}
}
......
......@@ -233,7 +233,7 @@ void item_validator::operator()(std::string_view value) const
// --------------------------------------------------------------------
void category_validator::addItemValidator(item_validator &&v)
void category_validator::add_item_validator(item_validator &&v)
{
if (v.m_mandatory)
m_mandatory_fields.insert(v.m_tag);
......
......@@ -568,7 +568,7 @@ _test.value
auto &test = db["test"];
REQUIRE(test.size() == 5);
REQUIRE(test.exists("value"_key == cif::null));
REQUIRE(test.contains("value"_key == cif::null));
REQUIRE(test.find("value"_key == cif::null).size() == 2);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment