Commit ce28cb7a by Maarten L. Hekkelman

Moved BondMap from libpdb-redo to libcifpp

parent 1e47fa55
...@@ -110,6 +110,7 @@ $(OBJDIR): ...@@ -110,6 +110,7 @@ $(OBJDIR):
mkdir -p $(OBJDIR) mkdir -p $(OBJDIR)
OBJECTS = $(OBJDIR)/AtomType.lo \ OBJECTS = $(OBJDIR)/AtomType.lo \
$(OBJDIR)/BondMap.lo \
$(OBJDIR)/Cif2PDB.lo \ $(OBJDIR)/Cif2PDB.lo \
$(OBJDIR)/Cif++.lo \ $(OBJDIR)/Cif++.lo \
$(OBJDIR)/CifParser.lo \ $(OBJDIR)/CifParser.lo \
...@@ -227,6 +228,7 @@ test: $(TESTS:%=%-test) ...@@ -227,6 +228,7 @@ test: $(TESTS:%=%-test)
HEADERS = \ HEADERS = \
AtomType.hpp \ AtomType.hpp \
BondMap.hpp \
CifParser.hpp \ CifParser.hpp \
Compound.hpp \ Compound.hpp \
PDB2CifRemark3.hpp \ PDB2CifRemark3.hpp \
......
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <unordered_map>
#include <filesystem>
#include <stdexcept>
#include "cif++/Structure.hpp"
namespace mmcif
{
class BondMapException : public std::runtime_error
{
public:
BondMapException(const std::string& msg)
: runtime_error(msg) {}
};
class BondMap
{
public:
BondMap(const Structure& p);
BondMap(const BondMap&) = delete;
BondMap& operator=(const BondMap&) = delete;
bool operator()(const Atom& a, const Atom& b) const
{
return isBonded(index.at(a.id()), index.at(b.id()));
}
bool is1_4(const Atom& a, const Atom& b) const
{
uint32_t ixa = index.at(a.id());
uint32_t ixb = index.at(b.id());
return bond_1_4.count(key(ixa, ixb));
}
// links coming from the struct_conn records:
std::vector<std::string> linked(const Atom& a) const;
// This list of atomID's is comming from either CCD or the CCP4 dictionaries loaded
static std::vector<std::string> atomIDsForCompound(const std::string& compoundID);
private:
bool isBonded(uint32_t ai, uint32_t bi) const
{
return bond.count(key(ai, bi)) != 0;
}
uint64_t key(uint32_t a, uint32_t b) const
{
if (a > b)
std::swap(a, b);
return static_cast<uint64_t>(a) | (static_cast<uint64_t>(b) << 32);
}
std::tuple<uint32_t,uint32_t> dekey(uint64_t k) const
{
return std::make_tuple(
static_cast<uint32_t>(k >> 32),
static_cast<uint32_t>(k)
);
}
uint32_t dim;
std::unordered_map<std::string,uint32_t> index;
std::set<uint64_t> bond, bond_1_4;
std::map<std::string,std::set<std::string>> link;
};
}
...@@ -69,10 +69,10 @@ struct CompoundAtom ...@@ -69,10 +69,10 @@ struct CompoundAtom
{ {
std::string id; std::string id;
AtomType typeSymbol; AtomType typeSymbol;
int charge; int charge = 0;
bool aromatic; bool aromatic = false;
bool leavingAtom; bool leavingAtom = false;
bool stereoConfig; bool stereoConfig = false;
float x, y, z; float x, y, z;
}; };
...@@ -83,7 +83,7 @@ struct CompoundBond ...@@ -83,7 +83,7 @@ struct CompoundBond
{ {
std::string atomID[2]; std::string atomID[2];
BondType type; BondType type;
bool aromatic, stereoConfig; bool aromatic = false, stereoConfig = false;
}; };
/// -------------------------------------------------------------------- /// --------------------------------------------------------------------
...@@ -97,8 +97,6 @@ struct CompoundBond ...@@ -97,8 +97,6 @@ struct CompoundBond
class Compound class Compound
{ {
public: public:
Compound(cif::Datablock &db);
/// \brief factory method, create a Compound based on the three letter code /// \brief factory method, create a Compound based on the three letter code
/// (for amino acids) or the one-letter code (for bases) or the /// (for amino acids) or the one-letter code (for bases) or the
/// code as it is known in the CCD. /// code as it is known in the CCD.
...@@ -131,12 +129,18 @@ class Compound ...@@ -131,12 +129,18 @@ class Compound
private: private:
friend class CompoundFactoryImpl;
friend class CCDCompoundFactoryImpl;
Compound(cif::Datablock &db);
Compound(cif::Datablock &db, const std::string &id, const std::string &name, const std::string &type);
std::string mID; std::string mID;
std::string mName; std::string mName;
std::string mType; std::string mType;
std::string mFormula; std::string mFormula;
float mFormulaWeight; float mFormulaWeight = 0;
int mFormalCharge; int mFormalCharge = 0;
std::vector<CompoundAtom> mAtoms; std::vector<CompoundAtom> mAtoms;
std::vector<CompoundBond> mBonds; std::vector<CompoundBond> mBonds;
}; };
......
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if __has_include("Config.hpp")
#include "Config.hpp"
#endif
#include <fstream>
#include <algorithm>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/copy.hpp>
#include "cif++/Cif++.hpp"
#include "cif++/Compound.hpp"
#include "cif++/CifUtils.hpp"
#include "cif++/BondMap.hpp"
namespace mmcif
{
namespace
{
union IDType
{
IDType() : id_n(0){}
IDType(const IDType& rhs) : id_n(rhs.id_n) {}
IDType(const std::string& s)
: IDType()
{
assert(s.length() <= 4);
if (s.length() > 4)
throw BondMapException("Atom ID '" + s + "' is too long");
std::copy(s.begin(), s.end(), id_s);
}
IDType& operator=(const IDType& rhs)
{
id_n = rhs.id_n;
return *this;
}
IDType& operator=(const std::string& s)
{
id_n = 0;
assert(s.length() <= 4);
if (s.length() > 4)
throw BondMapException("Atom ID '" + s + "' is too long");
std::copy(s.begin(), s.end(), id_s);
return *this;
}
bool operator<(const IDType& rhs) const
{
return id_n < rhs.id_n;
}
bool operator<=(const IDType& rhs) const
{
return id_n <= rhs.id_n;
}
bool operator==(const IDType& rhs) const
{
return id_n == rhs.id_n;
}
bool operator!=(const IDType& rhs) const
{
return id_n != rhs.id_n;
}
char id_s[4];
uint32_t id_n;
};
static_assert(sizeof(IDType) == 4, "atom_id_type should be 4 bytes");
}
// // --------------------------------------------------------------------
// void createBondInfoFile(const fs::path& components, const fs::path& infofile)
// {
// std::ofstream outfile(infofile.string() + ".tmp", std::ios::binary);
// if (not outfile.is_open())
// throw BondMapException("Could not create bond info file " + infofile.string() + ".tmp");
// cif::File infile(components);
// std::set<atom_id_type> atomIDs;
// std::vector<atom_id_type> compoundIDs;
// for (auto& db: infile)
// {
// auto chem_comp_bond = db.get("chem_comp_bond");
// if (not chem_comp_bond)
// {
// if (cif::VERBOSE > 1)
// std::cerr << "Missing chem_comp_bond category in data block " << db.getName() << std::endl;
// continue;
// }
// for (const auto& [atom_id_1, atom_id_2]: chem_comp_bond->rows<std::string,std::string>({"atom_id_1", "atom_id_2"}))
// {
// atomIDs.insert(atom_id_1);
// atomIDs.insert(atom_id_2);
// }
// compoundIDs.push_back({ db.getName() });
// }
// if (cif::VERBOSE)
// std::cout << "Number of unique atom names is " << atomIDs.size() << std::endl
// << "Number of unique residue names is " << compoundIDs.size() << std::endl;
// CompoundBondInfoFileHeader header = {};
// header.indexEntries = compoundIDs.size();
// header.atomEntries = atomIDs.size();
// outfile << header;
// for (auto atomID: atomIDs)
// outfile << atomID;
// auto dataOffset = outfile.tellp();
// std::vector<CompoundBondInfo> entries;
// entries.reserve(compoundIDs.size());
// std::map<atom_id_type, uint16_t> atomIDMap;
// for (auto& atomID: atomIDs)
// atomIDMap[atomID] = atomIDMap.size();
// for (auto& db: infile)
// {
// auto chem_comp_bond = db.get("chem_comp_bond");
// if (not chem_comp_bond)
// continue;
// std::set<uint16_t> bondedAtoms;
// for (const auto& [atom_id_1, atom_id_2]: chem_comp_bond->rows<std::string,std::string>({"atom_id_1", "atom_id_2"}))
// {
// bondedAtoms.insert(atomIDMap[atom_id_1]);
// bondedAtoms.insert(atomIDMap[atom_id_2]);
// }
// std::map<uint16_t, int32_t> bondedAtomMap;
// for (auto id: bondedAtoms)
// bondedAtomMap[id] = static_cast<int32_t>(bondedAtomMap.size());
// CompoundBondInfo info = {
// db.getName(),
// static_cast<uint32_t>(bondedAtomMap.size()),
// outfile.tellp() - dataOffset
// };
// entries.push_back(info);
// // An now first write the array of atom ID's in this compound
// for (uint16_t id: bondedAtoms)
// write(outfile, id);
// // And then the symmetric matrix with bonds
// size_t N = bondedAtoms.size();
// size_t M = (N * (N - 1)) / 2;
// size_t K = M / 8;
// if (M % 8)
// K += 1;
// std::vector<uint8_t> m(K);
// for (const auto& [atom_id_1, atom_id_2]: chem_comp_bond->rows<std::string,std::string>({"atom_id_1", "atom_id_2"}))
// {
// auto a = bondedAtomMap[atomIDMap[atom_id_1]];
// auto b = bondedAtomMap[atomIDMap[atom_id_2]];
// assert(a != b);
// assert((int)b < (int)N);
// if (a > b)
// std::swap(a, b);
// size_t ix = ((b - 1) * b) / 2 + a;
// assert(ix < M);
// auto Bix = ix / 8;
// auto bix = ix % 8;
// m[Bix] |= 1 << bix;
// }
// outfile.write(reinterpret_cast<char*>(m.data()), m.size());
// }
// header.dataSize = outfile.tellp() - dataOffset;
// std::sort(entries.begin(), entries.end(), [](CompoundBondInfo& a, CompoundBondInfo& b)
// {
// return a.id < b.id;
// });
// for (auto& info: entries)
// outfile << info;
// outfile.seekp(0);
// outfile << header;
// // compress
// outfile.close();
// std::ifstream in(infofile.string() + ".tmp", std::ios::binary);
// std::ofstream out(infofile, std::ios::binary);
// {
// io::filtering_stream<io::output> os;
// os.push(io::gzip_compressor());
// os.push(out);
// io::copy(in, os);
// }
// in.close();
// out.close();
// fs::remove(infofile.string() + ".tmp");
// }
// --------------------------------------------------------------------
struct CompoundBondInfo
{
IDType mID;
std::set<std::tuple<uint32_t,uint32_t>> mBonded;
bool bonded(uint32_t a1, uint32_t a2) const
{
return mBonded.count({ a1, a2 }) > 0;
}
};
// --------------------------------------------------------------------
class CompoundBondMap
{
public:
static CompoundBondMap &instance()
{
static std::unique_ptr<CompoundBondMap> s_instance(new CompoundBondMap);
return *s_instance;
}
bool bonded(const std::string& compoundID, const std::string& atomID1, const std::string& atomID2);
private:
CompoundBondMap() {}
uint32_t getAtomID(const std::string& atomID)
{
IDType id(atomID);
uint32_t result;
auto i = mAtomIDIndex.find(id);
if (i == mAtomIDIndex.end())
{
result = mAtomIDIndex.size();
mAtomIDIndex[id] = result;
}
else
result = i->second;
return result;
}
std::map<IDType,uint32_t> mAtomIDIndex;
std::vector<CompoundBondInfo> mCompounds;
std::mutex mMutex;
};
bool CompoundBondMap::bonded(const std::string &compoundID, const std::string& atomID1, const std::string& atomID2)
{
std::lock_guard lock(mMutex);
using namespace std::literals;
IDType id(compoundID);
uint32_t a1 = getAtomID(atomID1);
uint32_t a2 = getAtomID(atomID2);
if (a1 > a2)
std::swap(a1, a2);
for (auto &bi: mCompounds)
{
if (bi.mID != id)
continue;
return bi.bonded(a1, a2);
}
// not found in our cache, calculate
auto compound = CompoundFactory::instance().create(compoundID);
if (not compound)
throw BondMapException("Missing compound bond info for " + compoundID);
bool result = false;
CompoundBondInfo bondInfo{ id };
for (auto &atom: compound->bonds())
{
uint32_t ca1 = getAtomID(atom.atomID[0]);
uint32_t ca2 = getAtomID(atom.atomID[1]);
if (ca1 > ca2)
std::swap(ca1, ca2);
bondInfo.mBonded.insert({ca1, ca2});
result = result or (a1 == ca1 and a2 == ca2);
}
mCompounds.push_back(bondInfo);
return result;
}
// --------------------------------------------------------------------
BondMap::BondMap(const Structure& p)
{
auto& compoundBondInfo = CompoundBondMap::instance();
auto atoms = p.atoms();
dim = atoms.size();
// bond = std::vector<bool>(dim * (dim - 1), false);
for (auto& atom: atoms)
{
size_t ix = index.size();
index[atom.id()] = ix;
};
auto bindAtoms = [this](const std::string& a, const std::string& b)
{
uint32_t ixa = index[a];
uint32_t ixb = index[b];
bond.insert(key(ixa, ixb));
};
auto linkAtoms = [this,&bindAtoms](const std::string& a, const std::string& b)
{
bindAtoms(a, b);
link[a].insert(b);
link[b].insert(a);
};
cif::Datablock& db = p.getFile().data();
// collect all compounds first
std::set<std::string> compounds;
for (auto c: db["chem_comp"])
compounds.insert(c["id"].as<std::string>());
// make sure we also have all residues in the polyseq
for (auto m: db["entity_poly_seq"])
{
std::string c = m["mon_id"].as<std::string>();
if (compounds.count(c))
continue;
if (cif::VERBOSE > 1)
std::cerr << "Warning: mon_id " << c << " is missing in the chem_comp category" << std::endl;
compounds.insert(c);
}
cif::Progress progress(compounds.size(), "Creating bond map");
// some helper indices to speed things up a bit
std::map<std::tuple<std::string,int,std::string>,std::string> atomMapByAsymSeqAndAtom;
for (auto& a: p.atoms())
{
auto key = make_tuple(a.labelAsymID(), a.labelSeqID(), a.labelAtomID());
atomMapByAsymSeqAndAtom[key] = a.id();
}
// first link all residues in a polyseq
std::string lastAsymID;
int lastSeqID = 0;
for (auto r: db["pdbx_poly_seq_scheme"])
{
std::string asymID;
int seqID;
cif::tie(asymID, seqID) = r.get("asym_id", "seq_id");
if (asymID != lastAsymID) // first in a new sequece
{
lastAsymID = asymID;
lastSeqID = seqID;
continue;
}
auto c = atomMapByAsymSeqAndAtom[make_tuple(asymID, lastSeqID, "C")];
auto n = atomMapByAsymSeqAndAtom[make_tuple(asymID, seqID, "N")];
if (not (c.empty() or n.empty()))
bindAtoms(c, n);
lastSeqID = seqID;
}
for (auto l: db["struct_conn"])
{
std::string asym1, asym2, atomId1, atomId2;
int seqId1 = 0, seqId2 = 0;
cif::tie(asym1, asym2, atomId1, atomId2, seqId1, seqId2) =
l.get("ptnr1_label_asym_id", "ptnr2_label_asym_id",
"ptnr1_label_atom_id", "ptnr2_label_atom_id",
"ptnr1_label_seq_id", "ptnr2_label_seq_id");
std::string a = atomMapByAsymSeqAndAtom[make_tuple(asym1, seqId1, atomId1)];
std::string b = atomMapByAsymSeqAndAtom[make_tuple(asym2, seqId2, atomId2)];
if (not (a.empty() or b.empty()))
linkAtoms(a, b);
}
// then link all atoms in the compounds
for (auto c: compounds)
{
if (c == "HOH" or c == "H2O" or c == "WAT")
{
if (cif::VERBOSE)
std::cerr << "skipping water in bond map calculation" << std::endl;
continue;
}
auto bonded = [c, &compoundBondInfo](const Atom& a, const Atom& b)
{
auto label_a = a.labelAtomID();
auto label_b = b.labelAtomID();
return compoundBondInfo.bonded(c, label_a, label_b);
};
// loop over poly_seq_scheme
for (auto r: db["pdbx_poly_seq_scheme"].find(cif::Key("mon_id") == c))
{
std::string asymID;
int seqID;
cif::tie(asymID, seqID) = r.get("asym_id", "seq_id");
std::vector<Atom> rAtoms;
copy_if(atoms.begin(), atoms.end(), back_inserter(rAtoms),
[&](auto& a) { return a.labelAsymID() == asymID and a.labelSeqID() == seqID; });
for (uint32_t i = 0; i + 1 < rAtoms.size(); ++i)
{
for (uint32_t j = i + 1; j < rAtoms.size(); ++j)
{
if (bonded(rAtoms[i], rAtoms[j]))
bindAtoms(rAtoms[i].id(), rAtoms[j].id());
}
}
}
// loop over pdbx_nonpoly_scheme
for (auto r: db["pdbx_nonpoly_scheme"].find(cif::Key("mon_id") == c))
{
std::string asymID;
cif::tie(asymID) = r.get("asym_id");
std::vector<Atom> rAtoms;
copy_if(atoms.begin(), atoms.end(), back_inserter(rAtoms),
[&](auto& a) { return a.labelAsymID() == asymID; });
for (uint32_t i = 0; i + 1 < rAtoms.size(); ++i)
{
for (uint32_t j = i + 1; j < rAtoms.size(); ++j)
{
if (bonded(rAtoms[i], rAtoms[j]))
{
uint32_t ixa = index[rAtoms[i].id()];
uint32_t ixb = index[rAtoms[j].id()];
bond.insert(key(ixa, ixb));
}
}
}
}
// loop over pdbx_branch_scheme
for (auto r: db["pdbx_branch_scheme"].find(cif::Key("mon_id") == c))
{
std::string asymID;
cif::tie(asymID) = r.get("asym_id");
std::vector<Atom> rAtoms;
copy_if(atoms.begin(), atoms.end(), back_inserter(rAtoms),
[&](auto& a) { return a.labelAsymID() == asymID; });
for (uint32_t i = 0; i + 1 < rAtoms.size(); ++i)
{
for (uint32_t j = i + 1; j < rAtoms.size(); ++j)
{
if (bonded(rAtoms[i], rAtoms[j]))
{
uint32_t ixa = index[rAtoms[i].id()];
uint32_t ixb = index[rAtoms[j].id()];
bond.insert(key(ixa, ixb));
}
}
}
}
}
// start by creating an index for single bonds
std::multimap<uint32_t,uint32_t> b1_2;
for (auto& bk: bond)
{
uint32_t a, b;
std::tie(a, b) = dekey(bk);
b1_2.insert({ a, b });
b1_2.insert({ b, a });
}
std::multimap<uint32_t,uint32_t> b1_3;
for (uint32_t i = 0; i < dim; ++i)
{
auto a = b1_2.equal_range(i);
std::vector<uint32_t> s;
for (auto j = a.first; j != a.second; ++j)
s.push_back(j->second);
for (size_t si1 = 0; si1 + 1 < s.size(); ++si1)
{
for (size_t si2 = si1 + 1; si2 < s.size(); ++si2)
{
uint32_t x = s[si1];
uint32_t y = s[si2];
if (isBonded(x, y))
continue;
b1_3.insert({ x, y });
b1_3.insert({ y, x });
}
}
}
for (uint32_t i = 0; i < dim; ++i)
{
auto a1 = b1_2.equal_range(i);
auto a2 = b1_3.equal_range(i);
for (auto ai1 = a1.first; ai1 != a1.second; ++ai1)
{
for (auto ai2 = a2.first; ai2 != a2.second; ++ai2)
{
uint32_t b1 = ai1->second;
uint32_t b2 = ai2->second;
if (isBonded(b1, b2))
continue;
bond_1_4.insert(key(b1, b2));
}
}
}
}
std::vector<std::string> BondMap::linked(const Atom& a) const
{
auto i = link.find(a.id());
std::vector<std::string> result;
if (i != link.end())
result = std::vector<std::string>(i->second.begin(), i->second.end());
return result;
}
std::vector<std::string> BondMap::atomIDsForCompound(const std::string& compoundID)
{
std::vector<std::string> result;
auto* compound = mmcif::Compound::create(compoundID);
if (compound == nullptr)
throw BondMapException("Missing bond information for compound " + compoundID);
for (auto& compAtom: compound->atoms())
result.push_back(compAtom.id);
return result;
}
}
...@@ -144,6 +144,51 @@ Compound::Compound(cif::Datablock &db) ...@@ -144,6 +144,51 @@ Compound::Compound(cif::Datablock &db)
} }
} }
Compound::Compound(cif::Datablock &db, const std::string &id, const std::string &name, const std::string &type)
: mID(id)
, mName(name)
, mType(type)
{
auto &chemCompAtom = db["chem_comp_atom"];
for (auto row: chemCompAtom)
{
CompoundAtom atom;
std::string typeSymbol;
cif::tie(atom.id, typeSymbol, atom.charge, atom.x, atom.y, atom.z) =
row.get("atom_id", "type_symbol", "charge", "x", "y", "z");
atom.typeSymbol = AtomTypeTraits(typeSymbol).type();
mFormalCharge += atom.charge;
mFormulaWeight += AtomTypeTraits(atom.typeSymbol).weight();
mAtoms.push_back(std::move(atom));
}
auto &chemCompBond = db["chem_comp_bond"];
for (auto row: chemCompBond)
{
CompoundBond bond;
std::string type;
cif::tie(bond.atomID[0], bond.atomID[1], type, bond.aromatic)
= row.get("atom_id_1", "atom_id_2", "type", "aromatic");
using cif::iequals;
if (iequals(type, "single")) bond.type = BondType::sing;
else if (iequals(type, "double")) bond.type = BondType::doub;
else if (iequals(type, "triple")) bond.type = BondType::trip;
else if (iequals(type, "deloc") or iequals(type, "aromat") or iequals(type, "aromatic"))
bond.type = BondType::delo;
else
{
if (cif::VERBOSE)
std::cerr << "Unimplemented chem_comp_bond.type " << type << " in " << id << std::endl;
bond.type = BondType::sing;
}
mBonds.push_back(std::move(bond));
}
}
CompoundAtom Compound::getAtomByID(const std::string &atomID) const CompoundAtom Compound::getAtomByID(const std::string &atomID) const
{ {
CompoundAtom result = {}; CompoundAtom result = {};
...@@ -332,7 +377,7 @@ class CompoundFactoryImpl ...@@ -332,7 +377,7 @@ class CompoundFactoryImpl
(mNext != nullptr and mNext->isKnownBase(resName)); (mNext != nullptr and mNext->isKnownBase(resName));
} }
private: protected:
std::shared_timed_mutex mMutex; std::shared_timed_mutex mMutex;
std::vector<Compound *> mCompounds; std::vector<Compound *> mCompounds;
...@@ -357,14 +402,61 @@ CompoundFactoryImpl::CompoundFactoryImpl(const std::string &file, CompoundFactor ...@@ -357,14 +402,61 @@ CompoundFactoryImpl::CompoundFactoryImpl(const std::string &file, CompoundFactor
: mNext(next) : mNext(next)
{ {
cif::File cifFile(file); cif::File cifFile(file);
if (not cifFile.isValid())
throw std::runtime_error("Invalid compound file");
for (auto &db: cifFile) auto compList = cifFile.get("comp_list");
if (compList) // So this is a CCP4 restraints file, special handling
{
auto &chemComp = (*compList)["chem_comp"];
for (const auto &[id, name, group]: chemComp.rows<std::string,std::string,std::string>({"id", "name", "group"}))
{
std::string type;
// known groups are (counted from ccp4 monomer dictionary)
// D-pyranose
// DNA
// L-PEPTIDE LINKING
// L-SACCHARIDE
// L-peptide
// L-pyranose
// M-peptide
// NON-POLYMER
// P-peptide
// RNA
// furanose
// non-polymer
// non_polymer
// peptide
// pyranose
// saccharide
if (cif::iequals(id, "gly"))
type = "peptide linking";
else if (cif::iequals(group, "l-peptide") or cif::iequals(group, "L-peptide linking") or cif::iequals(group, "peptide"))
type = "L-peptide linking";
else if (cif::iequals(group, "DNA"))
type = "DNA linking";
else if (cif::iequals(group, "RNA"))
type = "RNA linking";
else
type = "non-polymer";
auto &db = cifFile["comp_" + id];
mCompounds.push_back(new Compound(db, id, name, type));
}
}
else
{ {
auto compound = std::make_unique<Compound>(db); // A CCD components file, validate it first
cifFile.loadDictionary();
mCompounds.push_back(compound.release()); if (not cifFile.isValid())
throw std::runtime_error("Invalid compound file");
for (auto &db: cifFile)
mCompounds.push_back(new Compound(db));
} }
} }
...@@ -466,6 +558,9 @@ Compound *CCDCompoundFactoryImpl::create(std::string id) ...@@ -466,6 +558,9 @@ Compound *CCDCompoundFactoryImpl::create(std::string id)
Compound *result = get(id); Compound *result = get(id);
if (result)
return result;
auto ccd = cif::loadResource("components.cif"); auto ccd = cif::loadResource("components.cif");
if (not ccd) if (not ccd)
throw std::runtime_error("Could not locate the CCD components.cif file, please make sure the software is installed properly and/or use the update-dictionary-script to fetch the data."); throw std::runtime_error("Could not locate the CCD components.cif file, please make sure the software is installed properly and/or use the update-dictionary-script to fetch the data.");
...@@ -474,7 +569,7 @@ Compound *CCDCompoundFactoryImpl::create(std::string id) ...@@ -474,7 +569,7 @@ Compound *CCDCompoundFactoryImpl::create(std::string id)
if (mIndex.empty()) if (mIndex.empty())
{ {
if (cif::VERBOSE) if (cif::VERBOSE > 1)
{ {
std::cout << "Creating component index " << "..."; std::cout << "Creating component index " << "...";
std::cout.flush(); std::cout.flush();
...@@ -483,14 +578,14 @@ Compound *CCDCompoundFactoryImpl::create(std::string id) ...@@ -483,14 +578,14 @@ Compound *CCDCompoundFactoryImpl::create(std::string id)
cif::Parser parser(*ccd, file, false); cif::Parser parser(*ccd, file, false);
mIndex = parser.indexDatablocks(); mIndex = parser.indexDatablocks();
if (cif::VERBOSE) if (cif::VERBOSE > 1)
std::cout << " done" << std::endl; std::cout << " done" << std::endl;
// reload the resource, perhaps this should be improved... // reload the resource, perhaps this should be improved...
ccd = cif::loadResource("components.cif"); ccd = cif::loadResource("components.cif");
} }
if (cif::VERBOSE) if (cif::VERBOSE > 1)
{ {
std::cout << "Loading component " << id << "..."; std::cout << "Loading component " << id << "...";
std::cout.flush(); std::cout.flush();
...@@ -499,13 +594,22 @@ Compound *CCDCompoundFactoryImpl::create(std::string id) ...@@ -499,13 +594,22 @@ Compound *CCDCompoundFactoryImpl::create(std::string id)
cif::Parser parser(*ccd, file, false); cif::Parser parser(*ccd, file, false);
parser.parseSingleDatablock(id, mIndex); parser.parseSingleDatablock(id, mIndex);
if (cif::VERBOSE) if (cif::VERBOSE > 1)
std::cout << " done" << std::endl; std::cout << " done" << std::endl;
auto &db = file.firstDatablock(); if (not file.empty())
if (db.getName() == id) {
result = new Compound(db); auto &db = file.firstDatablock();
else if (cif::VERBOSE) if (db.getName() == id)
{
result = new Compound(db);
std::shared_lock lock(mMutex);
mCompounds.push_back(result);
}
}
if (result == nullptr and cif::VERBOSE > 1)
std::cerr << "Could not locate compound " << id << " in the CCD components file" << std::endl; std::cerr << "Could not locate compound " << id << " in the CCD components file" << std::endl;
return result; return result;
......
#
data_comp_list
loop_
_chem_comp.id
_chem_comp.three_letter_code
_chem_comp.name
_chem_comp.group
_chem_comp.number_atoms_all
_chem_comp.number_atoms_nh
_chem_comp.desc_level
UN_ UN_ UN_NINE L-peptide 13 6 .
#
data_comp_UN_
#
loop_
_chem_comp_atom.comp_id
_chem_comp_atom.atom_id
_chem_comp_atom.type_symbol
_chem_comp_atom.type_energy
_chem_comp_atom.charge
_chem_comp_atom.x
_chem_comp_atom.y
_chem_comp_atom.z
UN_ N N NT3 1 0.227 -1.259 0.452
UN_ H H H 0 0.069 -1.019 1.421
UN_ H2 H H 0 1.104 -1.640 0.356
UN_ H3 H H 0 -0.424 -1.909 0.174
UN_ CA C CH1 0 0.103 -0.030 -0.392
UN_ HA H H 0 0.160 -0.299 -1.339
UN_ CB C CH3 0 -1.244 0.625 -0.159
UN_ HB3 H H 0 -1.857 -0.018 0.234
UN_ HB2 H H 0 -1.605 0.932 -1.008
UN_ HB1 H H 0 -1.150 1.385 0.442
UN_ C C C 0 1.270 0.922 -0.094
UN_ O O O 0 2.008 1.323 -0.994
UN_ OXT O OC -1 1.498 1.305 1.054
loop_
_chem_comp_tree.comp_id
_chem_comp_tree.atom_id
_chem_comp_tree.atom_back
_chem_comp_tree.atom_forward
_chem_comp_tree.connect_type
UN_ N n/a CA START
UN_ H N . .
UN_ H2 N . .
UN_ H3 N . .
UN_ CA N C .
UN_ HA CA . .
UN_ CB CA HB3 .
UN_ HB1 CB . .
UN_ HB2 CB . .
UN_ HB3 CB . .
UN_ C CA . END
UN_ O C . .
UN_ OXT C . .
loop_
_chem_comp_bond.comp_id
_chem_comp_bond.atom_id_1
_chem_comp_bond.atom_id_2
_chem_comp_bond.type
_chem_comp_bond.aromatic
_chem_comp_bond.value_dist
_chem_comp_bond.value_dist_esd
UN_ CB CA SINGLE n 1.509 0.014
UN_ CA C SINGLE n 1.533 0.011
UN_ C O DOUBLE n 1.247 0.019
UN_ C OXT SINGLE n 1.247 0.019
UN_ CA N SINGLE n 1.482 0.010
UN_ CB HB3 SINGLE n 0.972 0.015
UN_ CB HB2 SINGLE n 0.972 0.015
UN_ CB HB1 SINGLE n 0.972 0.015
UN_ CA HA SINGLE n 0.986 0.020
UN_ N H SINGLE n 0.911 0.020
UN_ N H2 SINGLE n 0.911 0.020
UN_ N H3 SINGLE n 0.911 0.020
loop_
_chem_comp_angle.comp_id
_chem_comp_angle.atom_id_1
_chem_comp_angle.atom_id_2
_chem_comp_angle.atom_id_3
_chem_comp_angle.value_angle
_chem_comp_angle.value_angle_esd
UN_ CA CB HB3 109.546 1.50
UN_ CA CB HB2 109.546 1.50
UN_ CA CB HB1 109.546 1.50
UN_ HB3 CB HB2 109.386 1.50
UN_ HB3 CB HB1 109.386 1.50
UN_ HB2 CB HB1 109.386 1.50
UN_ CB CA C 111.490 1.50
UN_ CB CA N 109.912 1.50
UN_ CB CA HA 108.878 1.50
UN_ C CA N 109.627 1.50
UN_ C CA HA 108.541 1.50
UN_ N CA HA 108.529 1.50
UN_ CA C O 117.159 1.57
UN_ CA C OXT 117.159 1.57
UN_ O C OXT 125.683 1.50
UN_ CA N H 109.643 1.50
UN_ CA N H2 109.643 1.50
UN_ CA N H3 109.643 1.50
UN_ H N H2 109.028 2.41
UN_ H N H3 109.028 2.41
UN_ H2 N H3 109.028 2.41
loop_
_chem_comp_tor.comp_id
_chem_comp_tor.id
_chem_comp_tor.atom_id_1
_chem_comp_tor.atom_id_2
_chem_comp_tor.atom_id_3
_chem_comp_tor.atom_id_4
_chem_comp_tor.value_angle
_chem_comp_tor.value_angle_esd
_chem_comp_tor.period
UN_ hh1 N CA CB HB3 60.000 15.000 3
UN_ sp2_sp3_1 O C CA CB 0.000 10.00 6
UN_ sp3_sp3_10 CB CA N H 180.000 10.00 3
loop_
_chem_comp_chir.comp_id
_chem_comp_chir.id
_chem_comp_chir.atom_id_centre
_chem_comp_chir.atom_id_1
_chem_comp_chir.atom_id_2
_chem_comp_chir.atom_id_3
_chem_comp_chir.volume_sign
UN_ chir_1 CA N C CB positive
loop_
_chem_comp_plane_atom.comp_id
_chem_comp_plane_atom.plane_id
_chem_comp_plane_atom.atom_id
_chem_comp_plane_atom.dist_esd
UN_ plan-1 C 0.020
UN_ plan-1 CA 0.020
UN_ plan-1 O 0.020
UN_ plan-1 OXT 0.020
loop_
_pdbx_chem_comp_descriptor.comp_id
_pdbx_chem_comp_descriptor.type
_pdbx_chem_comp_descriptor.program
_pdbx_chem_comp_descriptor.program_version
_pdbx_chem_comp_descriptor.descriptor
UN_ SMILES ACDLabs 10.04 "O=C(O)C(N)C"
UN_ SMILES_CANONICAL CACTVS 3.341 "C[C@H](N)C(O)=O"
UN_ SMILES CACTVS 3.341 "C[CH](N)C(O)=O"
UN_ SMILES_CANONICAL "OpenEye OEToolkits" 1.5.0 "C[C@@H](C(=O)O)N"
UN_ SMILES "OpenEye OEToolkits" 1.5.0 "CC(C(=O)O)N"
UN_ InChI InChI 1.03 "InChI=1S/C3H7NO2/c1-2(4)3(5)6/h2H,4H2,1H3,(H,5,6)/t2-/m0/s1"
UN_ InChIKey InChI 1.03 QNAYBMKLOCPYGJ-REOHCLBHSA-N
UN_ ? acedrg 195 "dictionary generator"
UN_ ? acedrg_database 11 "data source"
UN_ ? rdkit 2017.03.2 "Chemoinformatics tool"
UN_ ? refmac5 5.8.0189 "optimization tool"
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
// #include "cif++/DistanceMap.hpp" // #include "cif++/DistanceMap.hpp"
#include "cif++/Cif++.hpp" #include "cif++/Cif++.hpp"
#include "cif++/BondMap.hpp"
// -------------------------------------------------------------------- // --------------------------------------------------------------------
...@@ -1208,3 +1209,156 @@ _test.name ...@@ -1208,3 +1209,156 @@ _test.name
BOOST_CHECK(id == 1); BOOST_CHECK(id == 1);
BOOST_CHECK(name == "aap"); BOOST_CHECK(name == "aap");
} }
// --------------------------------------------------------------------
BOOST_AUTO_TEST_CASE(bondmap_1)
{
cif::VERBOSE = 2;
// sections taken from CCD compounds.cif
auto components = R"(
data_ASN
loop_
_chem_comp_bond.comp_id
_chem_comp_bond.atom_id_1
_chem_comp_bond.atom_id_2
_chem_comp_bond.value_order
_chem_comp_bond.pdbx_aromatic_flag
_chem_comp_bond.pdbx_stereo_config
_chem_comp_bond.pdbx_ordinal
ASN N CA SING N N 1
ASN N H SING N N 2
ASN N H2 SING N N 3
ASN CA C SING N N 4
ASN CA CB SING N N 5
ASN CA HA SING N N 6
ASN C O DOUB N N 7
ASN C OXT SING N N 8
ASN CB CG SING N N 9
ASN CB HB2 SING N N 10
ASN CB HB3 SING N N 11
ASN CG OD1 DOUB N N 12
ASN CG ND2 SING N N 13
ASN ND2 HD21 SING N N 14
ASN ND2 HD22 SING N N 15
ASN OXT HXT SING N N 16
data_PHE
loop_
_chem_comp_bond.comp_id
_chem_comp_bond.atom_id_1
_chem_comp_bond.atom_id_2
_chem_comp_bond.value_order
_chem_comp_bond.pdbx_aromatic_flag
_chem_comp_bond.pdbx_stereo_config
_chem_comp_bond.pdbx_ordinal
PHE N CA SING N N 1
PHE N H SING N N 2
PHE N H2 SING N N 3
PHE CA C SING N N 4
PHE CA CB SING N N 5
PHE CA HA SING N N 6
PHE C O DOUB N N 7
PHE C OXT SING N N 8
PHE CB CG SING N N 9
PHE CB HB2 SING N N 10
PHE CB HB3 SING N N 11
PHE CG CD1 DOUB Y N 12
PHE CG CD2 SING Y N 13
PHE CD1 CE1 SING Y N 14
PHE CD1 HD1 SING N N 15
PHE CD2 CE2 DOUB Y N 16
PHE CD2 HD2 SING N N 17
PHE CE1 CZ DOUB Y N 18
PHE CE1 HE1 SING N N 19
PHE CE2 CZ SING Y N 20
PHE CE2 HE2 SING N N 21
PHE CZ HZ SING N N 22
PHE OXT HXT SING N N 23
data_PRO
loop_
_chem_comp_bond.comp_id
_chem_comp_bond.atom_id_1
_chem_comp_bond.atom_id_2
_chem_comp_bond.value_order
_chem_comp_bond.pdbx_aromatic_flag
_chem_comp_bond.pdbx_stereo_config
_chem_comp_bond.pdbx_ordinal
PRO N CA SING N N 1
PRO N CD SING N N 2
PRO N H SING N N 3
PRO CA C SING N N 4
PRO CA CB SING N N 5
PRO CA HA SING N N 6
PRO C O DOUB N N 7
PRO C OXT SING N N 8
PRO CB CG SING N N 9
PRO CB HB2 SING N N 10
PRO CB HB3 SING N N 11
PRO CG CD SING N N 12
PRO CG HG2 SING N N 13
PRO CG HG3 SING N N 14
PRO CD HD2 SING N N 15
PRO CD HD3 SING N N 16
PRO OXT HXT SING N N 17
)"_cf;
const std::filesystem::path example("../examples/1cbs.cif.gz");
mmcif::File file(example);
mmcif::Structure structure(file);
mmcif::BondMap bm(structure);
// Test the bonds of the first three residues, that's PRO A 1, ASN A 2, PHE A 3
for (const auto& [compound, seqnr]: std::initializer_list<std::tuple<std::string,int>>{ { "PRO", 1 }, { "ASN", 2 }, { "PHE", 3 } })
{
auto& res = structure.getResidue("A", compound, seqnr);
auto atoms = res.atoms();
auto dc = components.get(compound);
BOOST_ASSERT(dc != nullptr);
auto cc = dc->get("chem_comp_bond");
BOOST_ASSERT(cc != nullptr);
std::set<std::tuple<std::string,std::string>> bonded;
for (const auto& [atom_id_1, atom_id_2]: cc->rows<std::string,std::string>({ "atom_id_1", "atom_id_2" }))
{
if (atom_id_1 > atom_id_2)
bonded.insert({ atom_id_2, atom_id_1 });
else
bonded.insert({ atom_id_1, atom_id_2 });
}
for (size_t i = 0; i + 1 < atoms.size(); ++i)
{
auto label_i = atoms[i].labelAtomID();
for (size_t j = i + 1; j < atoms.size(); ++j)
{
auto label_j = atoms[j].labelAtomID();
bool bonded_1 = bm(atoms[i], atoms[j]);
bool bonded_1_i = bm(atoms[j], atoms[i]);
bool bonded_t = label_i > label_j
? bonded.count({ label_j, label_i })
: bonded.count({ label_i, label_j });
BOOST_CHECK(bonded_1 == bonded_t);
BOOST_CHECK(bonded_1_i == bonded_t);
}
}
}
}
BOOST_AUTO_TEST_CASE(bondmap_2)
{
BOOST_CHECK_THROW(mmcif::BondMap::atomIDsForCompound("UN_"), mmcif::BondMapException);
mmcif::CompoundFactory::instance().pushDictionary("./UN_.cif");
BOOST_CHECK(mmcif::BondMap::atomIDsForCompound("UN_").empty() == false);
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment