Commit f1ca916d by Maarten L. Hekkelman

Merge remote-tracking branch 'origin/develop' into develop-cif2fasta

parents 4782a4e0 6aae012a
Version 6.0.1
- Add formula weight to entity in pdb2cif
- Change order of categories inside a datablock to match order in file
- Change default order to write out categories in a file based on
parent/child relationship
Version 6.0.0
- Drop the use of CCP4's monomer library for compound information
......
......@@ -18,7 +18,7 @@ Loading Resources
No matter where the resource is located, you should always use the single libcifpp API call :cpp:func:`cif::load_resource` to load them. This function returns a *std::istream* wrapped inside a *std::unique_ptr*.
The order in which resources are search for is:
The order in which resources are searched for is:
* Use the resource that was defined by calling :cpp:func:`cif::add_file_resource`
for this name.
......
......@@ -168,14 +168,18 @@ class compound
private:
friend class compound_factory_impl;
friend class local_compound_factory_impl;
compound(cif::datablock &db);
compound(cif::datablock &db, const std::string &id, const std::string &name, const std::string &type, const std::string &group);
compound(cif::datablock &db, int);
std::string m_id;
std::string m_name;
std::string m_type;
std::string m_group;
/// @cond
// m_group is no longer used
std::string __m_group;
/// @endcond
std::string m_formula;
float m_formula_weight = 0;
int m_formal_charge = 0;
......@@ -214,6 +218,20 @@ class compound_factory
/// Override any previously loaded dictionary with @a inDictFile
void push_dictionary(const std::filesystem::path &inDictFile);
/** @brief Override any previously loaded dictionary with the data in @a file
*
* @note experimental feature
*
* Load the file @a file as a source for compound information. This may
* be e.g. a regular mmCIF file with extra files containing compound
* information.
*
* Be carefull to remove the block again, best use @ref cif::compound_source
* as a stack based object.
*/
void push_dictionary(const file &file);
/// Remove the last pushed dictionary
void pop_dictionary();
......@@ -251,4 +269,35 @@ class compound_factory
std::shared_ptr<compound_factory_impl> m_impl;
};
// --------------------------------------------------------------------
/**
* @brief Stack based source for compound info.
*
* Use this class to temporarily add a compound source to the
* compound_factory.
*
* @code{.cpp}
* cif::file f("1cbs-with-custom-rea.cif");
* cif::compound_source cs(f);
*
* auto &cf = cif::compound_factory::instance();
* auto rea_compound = cf.create("REA");
* @endcode
*/
class compound_source
{
public:
compound_source(const cif::file &file)
{
cif::compound_factory::instance().push_dictionary(file);
}
~compound_source()
{
cif::compound_factory::instance().pop_dictionary();
}
};
} // namespace cif
......@@ -290,6 +290,13 @@ class row_handle
return operator[](get_column_ix(column)).template as<T>();
}
/// \brief Get the value of column @a column cast to type @a T
template <typename T>
T get(std::string_view column) const
{
return operator[](get_column_ix(column)).template as<T>();
}
/// \brief assign each of the columns named in @a values to their respective value
void assign(const std::vector<item> &values)
{
......
......@@ -142,8 +142,6 @@ compound::compound(cif::datablock &db)
// The name should not contain newline characters since that triggers validation errors later on
cif::replace_all(m_name, "\n", "");
m_group = "non-polymer";
auto &chemCompAtom = db["chem_comp_atom"];
for (auto row : chemCompAtom)
{
......@@ -153,6 +151,9 @@ compound::compound(cif::datablock &db)
row.get("atom_id", "type_symbol", "charge", "pdbx_aromatic_flag", "pdbx_leaving_atom_flag", "pdbx_stereo_config",
"model_Cartn_x", "model_Cartn_y", "model_Cartn_z");
atom.type_symbol = atom_type_traits(type_symbol).type();
if (stereo_config.empty())
atom.stereo_config = stereo_config_type::N;
else
atom.stereo_config = parse_stereo_config_from_string(stereo_config);
m_atoms.push_back(std::move(atom));
}
......@@ -163,17 +164,28 @@ compound::compound(cif::datablock &db)
compound_bond bond;
std::string valueOrder;
cif::tie(bond.atom_id[0], bond.atom_id[1], valueOrder, bond.aromatic, bond.stereo_config) = row.get("atom_id_1", "atom_id_2", "value_order", "pdbx_aromatic_flag", "pdbx_stereo_config");
if (valueOrder.empty())
bond.type = bond_type::sing;
else
bond.type = parse_bond_type_from_string(valueOrder);
m_bonds.push_back(std::move(bond));
}
}
compound::compound(cif::datablock &db, const std::string &id, const std::string &name, const std::string &type, const std::string &group)
: m_id(id)
, m_name(name)
, m_type(type)
, m_group(group)
compound::compound(cif::datablock &db, int)
{
auto &chemComp = db["chem_comp"];
if (chemComp.size() != 1)
throw std::runtime_error("Invalid compound file, chem_comp should contain a single row");
cif::tie(m_id, m_name) =
chemComp.front().get("id", "name");
cif::trim(m_name);
m_type = "NON-POLYMER";
auto &chemCompAtom = db["chem_comp_atom"];
for (auto row : chemCompAtom)
{
......@@ -184,7 +196,6 @@ compound::compound(cif::datablock &db, const std::string &id, const std::string
atom.type_symbol = atom_type_traits(type_symbol).type();
m_formal_charge += atom.charge;
m_formula_weight += atom_type_traits(atom.type_symbol).weight();
m_atoms.push_back(std::move(atom));
}
......@@ -209,11 +220,39 @@ compound::compound(cif::datablock &db, const std::string &id, const std::string
else
{
if (cif::VERBOSE > 0)
std::cerr << "Unimplemented chem_comp_bond.type " << btype << " in " << id << '\n';
std::cerr << "Unimplemented chem_comp_bond.type " << btype << " in " << db.name() << '\n';
bond.type = bond_type::sing;
}
m_bonds.push_back(std::move(bond));
}
// reconstruct a formula and weight
m_formula_weight = 0;
std::map<atom_type, int> f;
for (auto &atom : m_atoms)
f[atom.type_symbol] += 1;
if (f.count(atom_type::C))
{
atom_type_traits att(atom_type::C);
m_formula += att.symbol() + std::to_string(f[atom_type::C]) + ' ';
m_formula_weight += att.weight() * f[atom_type::C];
}
for (const auto &[type, count] : f)
{
if (type == atom_type::C)
continue;
atom_type_traits att(type);
m_formula += att.symbol() + std::to_string(count) + ' ';
m_formula_weight += att.weight() * count;
}
if (not m_formula.empty())
m_formula.pop_back();
}
compound_atom compound::get_atom_by_atom_id(const std::string &atom_id) const
......@@ -260,13 +299,12 @@ float compound::bond_length(const std::string &atomId_1, const std::string &atom
auto a = get_atom_by_atom_id(atomId_1);
auto b = get_atom_by_atom_id(atomId_2);
result = distance(point{a.x, a.y, a.z}, point{b.x, b.y, b.z});
result = distance(point{ a.x, a.y, a.z }, point{ b.x, b.y, b.z });
}
return result;
}
// --------------------------------------------------------------------
// known amino acids and bases
......@@ -316,7 +354,7 @@ class compound_factory_impl : public std::enable_shared_from_this<compound_facto
compound_factory_impl();
compound_factory_impl(const fs::path &file, std::shared_ptr<compound_factory_impl> next);
~compound_factory_impl()
virtual ~compound_factory_impl()
{
for (auto c : m_compounds)
delete c;
......@@ -378,8 +416,10 @@ class compound_factory_impl : public std::enable_shared_from_this<compound_facto
m_next->describe(os);
}
private:
compound *create(const std::string &id);
protected:
compound_factory_impl(std::shared_ptr<compound_factory_impl> next);
virtual compound *create(const std::string &id);
std::shared_timed_mutex mMutex;
......@@ -395,10 +435,15 @@ compound_factory_impl::compound_factory_impl()
{
}
compound_factory_impl::compound_factory_impl(std::shared_ptr<compound_factory_impl> next)
: m_next(next)
{
}
compound_factory_impl::compound_factory_impl(const fs::path &file, std::shared_ptr<compound_factory_impl> next)
: m_file(file)
, m_next(next)
: compound_factory_impl(next)
{
m_file = file;
}
compound *compound_factory_impl::create(const std::string &id)
......@@ -476,6 +521,45 @@ compound *compound_factory_impl::create(const std::string &id)
// --------------------------------------------------------------------
class local_compound_factory_impl : public compound_factory_impl
{
public:
local_compound_factory_impl(const cif::file &file, std::shared_ptr<compound_factory_impl> next)
: compound_factory_impl(next)
, m_local_file(file)
{
}
compound *create(const std::string &id) override;
private:
const cif::file &m_local_file;
};
compound *local_compound_factory_impl::create(const std::string &id)
{
compound *result = nullptr;
for (auto &db : m_local_file)
{
if (db.name() == "comp_" + id)
{
cif::datablock db_copy(db);
result = new compound(db_copy, 1);
std::shared_lock lock(mMutex);
m_compounds.push_back(result);
break;
}
}
return result;
}
// --------------------------------------------------------------------
std::unique_ptr<compound_factory> compound_factory::s_instance;
thread_local std::unique_ptr<compound_factory> compound_factory::tl_instance;
bool compound_factory::s_use_thread_local_instance;
......@@ -553,6 +637,18 @@ void compound_factory::push_dictionary(const fs::path &inDictFile)
}
}
void compound_factory::push_dictionary(const cif::file &inDictFile)
{
try
{
m_impl.reset(new local_compound_factory_impl(inDictFile, m_impl));
}
catch (const std::exception &)
{
std::throw_with_nested(std::runtime_error("Error loading dictionary from local mmCIF file"));
}
}
void compound_factory::pop_dictionary()
{
if (m_impl)
......@@ -584,7 +680,8 @@ void compound_factory::report_missing_compound(const std::string &compound_id)
{
using namespace cif::colour;
std::clog << "\n" << cif::coloured("Configuration error:", white, red) << "\n\n"
std::clog << "\n"
<< cif::coloured("Configuration error:", white, red) << "\n\n"
<< "The attempt to retrieve compound information for " << std::quoted(compound_id) << " failed.\n\n"
<< "This information is searched for in a CCD file called components.cif or\n"
<< "components.cif.gz which should be located in one of the following directories:\n\n";
......@@ -601,7 +698,7 @@ void compound_factory::report_missing_compound(const std::string &compound_id)
<< "update=true\n\n"
<< "If you do not have a working cron script, you can manually update the files\n"
<< "in /var/cache/libcifpp using the following commands:\n\n"
<< "curl -o " << CACHE_DIR << "/components.cif https://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif.gz\n"
<< "curl -o " << CACHE_DIR << "/components.cif https://files.wwpdb.org/pub/pdb/data/monomers/components.cif.gz\n"
<< "curl -o " << CACHE_DIR << "/mmcif_pdbx.dic https://mmcif.wwpdb.org/dictionaries/ascii/mmcif_pdbx_v50.dic.gz\n"
<< "curl -o " << CACHE_DIR << "/mmcif_ma.dic https://github.com/ihmwg/ModelCIF/raw/master/dist/mmcif_ma.dic\n\n";
#endif
......
......@@ -158,11 +158,12 @@ std::tuple<datablock::iterator, bool> datablock::emplace(std::string_view name)
if (is_new)
{
auto &c = emplace_front(name);
auto &c = emplace_back(name);
c.set_validator(m_validator, *this);
}
return std::make_tuple(begin(), is_new);
assert(end() != begin());
return std::make_tuple(std::prev(end()), is_new);
}
std::vector<std::string> datablock::get_tag_order() const
......@@ -171,14 +172,16 @@ std::vector<std::string> datablock::get_tag_order() const
// for entry and audit_conform on top
auto ci = find_if(begin(), end(), [](const category &cat) { return cat.name() == "entry"; });
auto ci = find_if(begin(), end(), [](const category &cat)
{ return cat.name() == "entry"; });
if (ci != end())
{
auto cto = ci->get_tag_order();
result.insert(result.end(), cto.begin(), cto.end());
}
ci = find_if(begin(), end(), [](const category &cat) { return cat.name() == "audit_conform"; });
ci = find_if(begin(), end(), [](const category &cat)
{ return cat.name() == "audit_conform"; });
if (ci != end())
{
auto cto = ci->get_tag_order();
......@@ -196,11 +199,107 @@ std::vector<std::string> datablock::get_tag_order() const
return result;
}
namespace
{
using elem_t = std::tuple<std::string, int, bool>;
using cat_order_t = std::vector<elem_t>;
using iter_t = cat_order_t::iterator;
constexpr inline int get_count(iter_t i)
{
return std::get<1>(*i);
}
constexpr inline bool is_on_stack(iter_t i)
{
return std::get<2>(*i);
}
void calculate_cat_order(cat_order_t &cat_order, iter_t i, const validator &validator)
{
if (i == cat_order.end() or get_count(i) >= 0)
return;
auto &&[cat, count, on_stack] = *i;
on_stack = true;
int parent_count = 0;
for (auto link : validator.get_links_for_child(cat))
{
auto ei = std::find_if(cat_order.begin(), cat_order.end(), [parent = link->m_parent_category](elem_t &a)
{ return std::get<0>(a) == parent; });
if (ei == cat_order.end())
continue;
if (not is_on_stack(ei))
calculate_cat_order(cat_order, ei, validator);
parent_count += get_count(ei);
}
count = parent_count + 1;
}
}
void datablock::write(std::ostream &os) const
{
os << "data_" << m_name << '\n'
<< "# \n";
if (m_validator and size() > 0)
{
// If the dictionary declares an audit_conform category, put it in,
// but only if it does not exist already!
if (get("audit_conform") == nullptr and m_validator->get_validator_for_category("audit_conform") != nullptr)
{
category auditConform("audit_conform");
auditConform.emplace({ { "dict_name", m_validator->name() },
{ "dict_version", m_validator->version() } });
auditConform.write(os);
}
// base order on parent child relationships, parents first
cat_order_t cat_order;
for (auto &cat : *this)
cat_order.emplace_back(cat.name(), -1, false);
for (auto i = cat_order.begin(); i != cat_order.end(); ++i)
calculate_cat_order(cat_order, i, *m_validator);
std::sort(cat_order.begin(), cat_order.end(), [](const elem_t &a, const elem_t &b)
{
const auto &[cat_a, count_a, on_stack_a] = a;
const auto &[cat_b, count_b, on_stack_b] = b;
int d = 0;
if (cat_a == "audit_conform")
d = -1;
else if (cat_b == "audit_conform")
d = 1;
else if (cat_a == "entry")
d = -1;
else if (cat_b == "entry")
d = 1;
else
{
d = std::get<1>(a) - std::get<1>(b);
if (d == 0)
d = cat_b.compare(cat_a);
}
return d < 0; });
for (auto &&[cat, count, on_stack] : cat_order)
get(cat)->write(os);
}
else
{
// mmcif support, sort of. First write the 'entry' Category
// and if it exists, _AND_ we have a Validator, write out the
// audit_conform record.
......@@ -219,20 +318,13 @@ void datablock::write(std::ostream &os) const
// but only if it does not exist already!
if (get("audit_conform"))
get("audit_conform")->write(os);
else if (m_validator != nullptr and m_validator->get_validator_for_category("audit_conform") != nullptr)
{
category auditConform("audit_conform");
auditConform.emplace({
{"dict_name", m_validator->name()},
{"dict_version", m_validator->version()}});
auditConform.write(os);
}
for (auto &cat : *this)
{
if (cat.name() != "entry" and cat.name() != "audit_conform")
cat.write(os);
}
}
}
void datablock::write(std::ostream &os, const std::vector<std::string> &tag_order)
......@@ -337,7 +429,7 @@ bool datablock::operator==(const datablock &rhs) const
++catA_i;
else
{
if (not (*dbA.get(*catA_i) == *dbB.get(*catB_i)))
if (not(*dbA.get(*catA_i) == *dbB.get(*catB_i)))
return false;
++catA_i;
++catB_i;
......@@ -347,4 +439,4 @@ bool datablock::operator==(const datablock &rhs) const
return true;
}
} // namespace cif::cif
\ No newline at end of file
} // namespace cif
\ No newline at end of file
......@@ -173,11 +173,12 @@ std::tuple<file::iterator, bool> file::emplace(std::string_view name)
if (is_new)
{
auto &db = emplace_front(name);
auto &db = emplace_back(name);
db.set_validator(m_validator);
}
return std::make_tuple(begin(), is_new);
assert(begin() != end());
return std::make_tuple(std::prev(end()), is_new);
}
void file::load(const std::filesystem::path &p)
......
......@@ -4511,6 +4511,47 @@ void PDBFileParser::ConstructEntities()
}
}
}
// Finish by calculating the formula_weight for each entity
for (auto entity : *getCategory("entity"))
{
auto entity_id = entity["id"].as<std::string>();
float formula_weight = 0;
if (entity["type"] == "polymer")
{
int n = 0;
for (std::string comp_id : getCategory("pdbx_poly_seq_scheme")->find<std::string>(cif::key("entity_id") == entity_id, "mon_id"))
{
auto compound = cif::compound_factory::instance().create(comp_id);
assert(compound);
if (not compound)
throw std::runtime_error("missing information for compound " + comp_id);
formula_weight += compound->formula_weight();
++n;
}
formula_weight -= (n - 1) * 18.015;
}
else if (entity["type"] == "water")
formula_weight = 18.015;
else
{
auto comp_id = getCategory("pdbx_nonpoly_scheme")->find_first<std::optional<std::string>>(cif::key("entity_id") == entity_id, "mon_id");
if (comp_id.has_value())
{
auto compound = cif::compound_factory::instance().create(*comp_id);
assert(compound);
if (not compound)
throw std::runtime_error("missing information for compound " + *comp_id);
formula_weight = compound->formula_weight();
}
}
if (formula_weight > 0)
entity["formula_weight"] = formula_weight;
}
}
void PDBFileParser::ConstructSugarTrees(int &asymNr)
......
......@@ -3468,3 +3468,22 @@ TEST_CASE("compound_not_found_test_1")
auto cmp = cif::compound_factory::instance().create("&&&");
REQUIRE(cmp == nullptr);
}
// --------------------------------------------------------------------
// PDB2CIF tests
TEST_CASE("pdb2cif_formula_weight")
{
cif::compound_factory::instance().push_dictionary(gTestDir / "REA.cif");
cif::file a = cif::pdb::read(gTestDir / "pdb1cbs.ent.gz");
auto fw = a.front()["entity"].find1<float>(cif::key("id") == 1, "formula_weight");
CHECK(std::abs(fw - 15581.802f) < 0.1f);
fw = a.front()["entity"].find1<float>(cif::key("id") == 2, "formula_weight");
CHECK(fw == 300.435f);
fw = a.front()["entity"].find1<float>(cif::key("id") == 3, "formula_weight");
CHECK(fw == 18.015f);
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment