Commit 227ff1b8 by Maarten L. Hekkelman

sequence recovery and validation

parent 82086a93
......@@ -166,6 +166,9 @@ class compound
return m_id == "HOH" or m_id == "H2O" or m_id == "WAT";
}
char one_letter_code() const { return m_one_letter_code; }; ///< Return the one letter code to use in a canonical sequence. If unknown the value '\0' is returned
std::string parent_id() const { return m_parent_id; }; ///< Return the parent id code in case a parent is specified (e.g. MET for MSE)
private:
friend class compound_factory_impl;
friend class local_compound_factory_impl;
......@@ -176,11 +179,9 @@ class compound
std::string m_id;
std::string m_name;
std::string m_type;
/// @cond
// m_group is no longer used
std::string __m_group;
/// @endcond
std::string m_formula;
char m_one_letter_code = 0;
std::string m_parent_id;
float m_formula_weight = 0;
int m_formal_charge = 0;
std::vector<compound_atom> m_atoms;
......
......@@ -136,8 +136,8 @@ compound::compound(cif::datablock &db)
if (chemComp.size() != 1)
throw std::runtime_error("Invalid compound file, chem_comp should contain a single row");
cif::tie(m_id, m_name, m_type, m_formula, m_formula_weight, m_formal_charge) =
chemComp.front().get("id", "name", "type", "formula", "formula_weight", "pdbx_formal_charge");
cif::tie(m_id, m_name, m_type, m_formula, m_formula_weight, m_formal_charge, m_one_letter_code, m_parent_id) =
chemComp.front().get("id", "name", "type", "formula", "formula_weight", "pdbx_formal_charge", "one_letter_code", "mon_nstd_parent_comp_id");
// The name should not contain newline characters since that triggers validation errors later on
cif::replace_all(m_name, "\n", "");
......
......@@ -277,7 +277,8 @@ void createEntityPoly(datablock &db)
auto c = cf.create(comp_id);
std::string letter, letter_can;
std::string letter;
char letter_can;
// TODO: Perhaps we should improve this...
if (type != "other")
......@@ -296,16 +297,26 @@ void createEntityPoly(datablock &db)
else if (iequals(c->type(), "D-PEPTIDE LINKING"))
{
c_type = "polypeptide(D)";
letter = 'X';
letter_can = '(' + comp_id + ')';
letter_can = c->one_letter_code();
if (letter_can == 0)
letter_can = 'X';
letter = '(' + comp_id + ')';
non_std_linkage = true;
non_std_monomer = true;
}
else if (iequals(c->type(), "L-PEPTIDE LINKING") or iequals(c->type(), "PEPTIDE LINKING"))
{
c_type = "polypeptide(L)";
letter = 'X';
letter_can = '(' + comp_id + ')';
letter_can = c->one_letter_code();
if (letter_can == 0)
letter_can = 'X';
letter = '(' + comp_id + ')';
non_std_monomer = true;
}
......
......@@ -135,7 +135,7 @@ bool is_valid_pdbx_file(const file &file, std::string_view dictionary)
if (entity_poly.count("entity_id"_key == entity_id) != 1)
throw validation_error("There should be exactly one entity_poly record per polymer entity");
// const auto entity_poly_type = entity_poly.find1<std::string>("entity_id"_key == entity_id, "type");
const auto entity_poly_type = entity_poly.find1<std::string>("entity_id"_key == entity_id, "type");
std::map<int,std::set<std::string>> mon_per_seq_id;
......@@ -177,12 +177,7 @@ bool is_valid_pdbx_file(const file &file, std::string_view dictionary)
condition cond;
for (auto mon_id : mon_ids)
{
if (cond)
cond = std::move(cond) or "label_comp_id"_key == mon_id;
else
cond = "label_comp_id"_key == mon_id;
}
cond = std::move(cond) or "label_comp_id"_key == mon_id;
cond = "label_entity_id"_key == entity_id and
"label_asym_id"_key == asym_id and
......@@ -192,6 +187,74 @@ bool is_valid_pdbx_file(const file &file, std::string_view dictionary)
throw validation_error("An atom_site record exists that has no parent in the poly seq scheme categories");
}
}
const auto &[seq, seq_can] = entity_poly.find1<std::optional<std::string>, std::optional<std::string>>("entity_id"_key == entity_id,
"pdbx_seq_one_letter_code", "pdbx_seq_one_letter_code_can");
std::string::const_iterator si, sci, se, sce;
auto seq_match = [&](bool can, std::string::const_iterator si, std::string::const_iterator se)
{
for (const auto &[seq_id, comp_ids] : mon_per_seq_id)
{
if (si == se)
return false;
bool match = false;
for (auto comp_id : comp_ids)
{
std::string letter;
if (cf.is_known_base(comp_id))
letter = compound_factory::kBaseMap.at(comp_id);
else if (cf.is_known_peptide(comp_id))
letter = compound_factory::kAAMap.at(comp_id);
else
{
if (can)
{
auto c = cf.create(comp_id);
if (c and c->one_letter_code())
letter = c->one_letter_code();
else
letter = "X";
}
else
letter = '(' + comp_id + ')';
}
if (iequals(std::string{si, si + letter.length()}, letter))
{
match = true;
si += letter.length();
break;
}
else
return false;
}
if (not match)
break;
}
return si == se;
};
if (not seq.has_value())
{
if (cif::VERBOSE > 0)
std::clog << "Warning: entity_poly has no sequence for entity_id " << entity_id << '\n';
}
else if (not seq_match(false, seq->begin(), seq->end()))
throw validation_error("Sequences do not match for entity " + entity_id);
if (not seq_can.has_value())
{
if (cif::VERBOSE > 0)
std::clog << "Warning: entity_poly has no sequence for entity_id " << entity_id << '\n';
}
else if (not seq_match(true, seq_can->begin(), seq_can->end()))
throw validation_error("Canonical sequences do not match for entity " + entity_id);
}
result = true;
......
......@@ -232,17 +232,55 @@ A 1 5 GLY 5 5 5 GLY GLY A . n
REQUIRE_FALSE(cif::pdb::is_valid_pdbx_file(f));
}
}
SECTION("Missing hetero for record in atom_site")
{
auto &db = f.front();
auto r1 = db["atom_site"].front();
cif::row_initializer cr(r1);
cr.set_value("id", "3");
cr.set_value("label_comp_id", "ALA");
db["atom_site"].emplace(std::move(cr));
REQUIRE_FALSE(cif::pdb::is_valid_pdbx_file(f));
}
SECTION("Missing letter in entity_poly.pdbx_seq_one_letter_code")
{
auto &db = f.front();
auto &entity_poly = db["entity_poly"];
entity_poly.front().assign({
{ "pdbx_seq_one_letter_code", "PNSG" }
});
REQUIRE_FALSE(cif::pdb::is_valid_pdbx_file(f));
}
SECTION("Too many letters in entity_poly.pdbx_seq_one_letter_code")
{
auto &db = f.front();
auto &entity_poly = db["entity_poly"];
entity_poly.front().assign({
{ "pdbx_seq_one_letter_code", "PNFSGX" }
});
REQUIRE_FALSE(cif::pdb::is_valid_pdbx_file(f));
}
SECTION("Mismatch in entity_poly.pdbx_seq_one_letter_code")
{
auto &db = f.front();
auto &entity_poly = db["entity_poly"];
entity_poly.front().assign({
{ "pdbx_seq_one_letter_code", "PNASG" }
});
REQUIRE_FALSE(cif::pdb::is_valid_pdbx_file(f));
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment