Moved BondMap from libpdb-redo to libcifpp

ce28cb7a · Maarten L. Hekkelman · 1e47fa55 · ce28cb7a · ce28cb7a · ce28cb7a
Commit ce28cb7a authored Apr 14, 2021 by Maarten L. Hekkelman
Showing with 1175 additions and 23 deletions

GNUmakefile.in
+2 -0

include/cif++/BondMap.hpp
+101 -0

include/cif++/Compound.hpp
+13 -9

src/BondMap.cpp
+634 -0

src/Compound.cpp
+118 -14

test/UN_.cif
+152 -0

test/unit-test.cpp
+155 -0

No files found.
--- a/GNUmakefile.in
+++ b/GNUmakefile.in
@@ -110,6 +110,7 @@ $(OBJDIR):
 	mkdir -p $(OBJDIR)

 OBJECTS		= $(OBJDIR)/AtomType.lo \
+			  $(OBJDIR)/BondMap.lo \
 			  $(OBJDIR)/Cif2PDB.lo \
 			  $(OBJDIR)/Cif++.lo \
 			  $(OBJDIR)/CifParser.lo \
@@ -227,6 +228,7 @@ test: $(TESTS:%=%-test)

 HEADERS = \
 	AtomType.hpp \
+	BondMap.hpp \
 	CifParser.hpp \
 	Compound.hpp \
 	PDB2CifRemark3.hpp \

--- a/include/cif++/BondMap.hpp
+++ b/include/cif++/BondMap.hpp
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ * 
+ * Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <unordered_map>
+#include <filesystem>
+#include <stdexcept>
+
+#include "cif++/Structure.hpp"
+
+namespace mmcif
+{
+
+class BondMapException : public std::runtime_error
+{
+  public:
+	BondMapException(const std::string& msg)
+		: runtime_error(msg) {}
+};
+
+class BondMap
+{
+  public:
+	BondMap(const Structure& p);
+	
+	BondMap(const BondMap&) = delete;
+	BondMap& operator=(const BondMap&) = delete;
+
+	bool operator()(const Atom& a, const Atom& b) const
+	{
+		return isBonded(index.at(a.id()), index.at(b.id()));
+	}
+
+	bool is1_4(const Atom& a, const Atom& b) const
+	{
+		uint32_t ixa = index.at(a.id());
+		uint32_t ixb = index.at(b.id());
+	
+		return bond_1_4.count(key(ixa, ixb));
+	}
+	
+	// links coming from the struct_conn records:
+	std::vector<std::string> linked(const Atom& a) const;
+
+	// This list of atomID's is comming from either CCD or the CCP4 dictionaries loaded
+	static std::vector<std::string> atomIDsForCompound(const std::string& compoundID);
+	
+  private:
+
+	bool isBonded(uint32_t ai, uint32_t bi) const
+	{
+		return bond.count(key(ai, bi)) != 0;
+	}
+
+	uint64_t key(uint32_t a, uint32_t b) const
+	{
+		if (a > b)
+			std::swap(a, b);
+		return static_cast<uint64_t>(a) | (static_cast<uint64_t>(b) << 32);
+	}
+	
+	std::tuple<uint32_t,uint32_t> dekey(uint64_t k) const
+	{
+		return std::make_tuple(
+			static_cast<uint32_t>(k >> 32),
+			static_cast<uint32_t>(k)
+		);
+	}
+	
+	uint32_t dim;
+	std::unordered_map<std::string,uint32_t> index;
+	std::set<uint64_t> bond, bond_1_4;
+
+	std::map<std::string,std::set<std::string>> link;
+};
+
+}
--- a/include/cif++/Compound.hpp
+++ b/include/cif++/Compound.hpp
@@ -69,10 +69,10 @@ struct CompoundAtom
 {
 	std::string id;
 	AtomType typeSymbol;
-	int charge;
-	bool aromatic;
-	bool leavingAtom;
-	bool stereoConfig;
+	int charge = 0;
+	bool aromatic = false;
+	bool leavingAtom = false;
+	bool stereoConfig = false;
 	float x, y, z;
 };

@@ -83,7 +83,7 @@ struct CompoundBond
 {
 	std::string atomID[2];
 	BondType type;
-	bool aromatic, stereoConfig;
+	bool aromatic = false, stereoConfig = false;
 };

 /// --------------------------------------------------------------------
@@ -97,8 +97,6 @@ struct CompoundBond
 class Compound
 {
  public:
-	Compound(cif::Datablock &db);
-
 	/// \brief factory method, create a Compound based on the three letter code
 	/// (for amino acids) or the one-letter code (for bases) or the
 	/// code as it is known in the CCD.
@@ -131,12 +129,18 @@ class Compound

  private:

+	friend class CompoundFactoryImpl;
+	friend class CCDCompoundFactoryImpl;
+
+	Compound(cif::Datablock &db);
+	Compound(cif::Datablock &db, const std::string &id, const std::string &name, const std::string &type);
+
 	std::string mID;
 	std::string mName;
 	std::string mType;
 	std::string mFormula;
-	float mFormulaWeight;
-	int mFormalCharge;
+	float mFormulaWeight = 0;
+	int mFormalCharge = 0;
 	std::vector<CompoundAtom> mAtoms;
 	std::vector<CompoundBond> mBonds;
 };

--- a/src/BondMap.cpp
+++ b/src/BondMap.cpp
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ * 
+ * Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#if __has_include("Config.hpp")
+#include "Config.hpp"
+#endif
+
+#include <fstream>
+#include <algorithm>
+
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <boost/iostreams/copy.hpp>
+
+#include "cif++/Cif++.hpp"
+#include "cif++/Compound.hpp"
+#include "cif++/CifUtils.hpp"
+#include "cif++/BondMap.hpp"
+
+namespace mmcif
+{
+
+namespace
+{
+
+union IDType
+{
+	IDType() : id_n(0){}
+	IDType(const IDType& rhs) : id_n(rhs.id_n) {}
+	IDType(const std::string& s)
+		: IDType()
+	{
+		assert(s.length() <= 4);
+		if (s.length() > 4)
+			throw BondMapException("Atom ID '" + s + "' is too long");
+		std::copy(s.begin(), s.end(), id_s);
+	}
+
+	IDType& operator=(const IDType& rhs)
+	{
+		id_n = rhs.id_n;
+		return *this;
+	}
+
+	IDType& operator=(const std::string& s)
+	{
+		id_n = 0;
+		assert(s.length() <= 4);
+		if (s.length() > 4)
+			throw BondMapException("Atom ID '" + s + "' is too long");
+		std::copy(s.begin(), s.end(), id_s);
+		return *this;
+	}
+
+	bool operator<(const IDType& rhs) const
+	{
+		return id_n < rhs.id_n;
+	}
+
+	bool operator<=(const IDType& rhs) const
+	{
+		return id_n <= rhs.id_n;
+	}
+
+	bool operator==(const IDType& rhs) const
+	{
+		return id_n == rhs.id_n;
+	}
+
+	bool operator!=(const IDType& rhs) const
+	{
+		return id_n != rhs.id_n;
+	}
+
+	char		id_s[4];
+	uint32_t	id_n;
+};
+
+static_assert(sizeof(IDType) == 4, "atom_id_type should be 4 bytes");	
+}
+
+// // --------------------------------------------------------------------
+
+// void createBondInfoFile(const fs::path& components, const fs::path& infofile)
+// {
+// 	std::ofstream outfile(infofile.string() + ".tmp", std::ios::binary);
+// 	if (not outfile.is_open())
+// 		throw BondMapException("Could not create bond info file " + infofile.string() + ".tmp");
+
+// 	cif::File infile(components);
+
+// 	std::set<atom_id_type> atomIDs;
+// 	std::vector<atom_id_type> compoundIDs;
+
+// 	for (auto& db: infile)
+// 	{
+// 		auto chem_comp_bond = db.get("chem_comp_bond");
+// 		if (not chem_comp_bond)
+// 		{
+// 			if (cif::VERBOSE > 1)
+// 				std::cerr << "Missing chem_comp_bond category in data block " << db.getName() << std::endl;
+// 			continue;
+// 		}
+
+// 		for (const auto& [atom_id_1, atom_id_2]: chem_comp_bond->rows<std::string,std::string>({"atom_id_1", "atom_id_2"}))
+// 		{
+// 			atomIDs.insert(atom_id_1);
+// 			atomIDs.insert(atom_id_2);
+// 		}
+
+// 		compoundIDs.push_back({ db.getName() });
+// 	}
+
+// 	if (cif::VERBOSE)
+// 		std::cout << "Number of unique atom names is " << atomIDs.size() << std::endl
+// 				  << "Number of unique residue names is " << compoundIDs.size() << std::endl;
+
+// 	CompoundBondInfoFileHeader header = {};
+// 	header.indexEntries = compoundIDs.size();
+// 	header.atomEntries = atomIDs.size();
+
+// 	outfile << header;
+	
+// 	for (auto atomID: atomIDs)
+// 		outfile << atomID;
+
+// 	auto dataOffset = outfile.tellp();
+
+// 	std::vector<CompoundBondInfo> entries;
+// 	entries.reserve(compoundIDs.size());
+
+// 	std::map<atom_id_type, uint16_t> atomIDMap;
+// 	for (auto& atomID: atomIDs)
+// 		atomIDMap[atomID] = atomIDMap.size();
+
+// 	for (auto& db: infile)
+// 	{
+// 		auto chem_comp_bond = db.get("chem_comp_bond");
+// 		if (not chem_comp_bond)
+// 			continue;
+
+// 		std::set<uint16_t> bondedAtoms;
+
+// 		for (const auto& [atom_id_1, atom_id_2]: chem_comp_bond->rows<std::string,std::string>({"atom_id_1", "atom_id_2"}))
+// 		{
+// 			bondedAtoms.insert(atomIDMap[atom_id_1]);
+// 			bondedAtoms.insert(atomIDMap[atom_id_2]);
+// 		}
+
+// 		std::map<uint16_t, int32_t> bondedAtomMap;
+// 		for (auto id: bondedAtoms)
+// 			bondedAtomMap[id] = static_cast<int32_t>(bondedAtomMap.size());
+		
+// 		CompoundBondInfo info = {
+// 			db.getName(),
+// 			static_cast<uint32_t>(bondedAtomMap.size()),
+// 			outfile.tellp() - dataOffset
+// 		};
+
+// 		entries.push_back(info);
+
+// 		// An now first write the array of atom ID's in this compound
+// 		for (uint16_t id: bondedAtoms)
+// 			write(outfile, id);
+
+// 		// And then the symmetric matrix with bonds
+// 		size_t N = bondedAtoms.size();
+// 		size_t M = (N * (N - 1)) / 2;
+
+// 		size_t K = M / 8;
+// 		if (M % 8)
+// 			K += 1;
+		
+// 		std::vector<uint8_t> m(K);
+
+// 		for (const auto& [atom_id_1, atom_id_2]: chem_comp_bond->rows<std::string,std::string>({"atom_id_1", "atom_id_2"}))
+// 		{
+// 			auto a = bondedAtomMap[atomIDMap[atom_id_1]];
+// 			auto b = bondedAtomMap[atomIDMap[atom_id_2]];
+
+// 			assert(a != b);
+// 			assert((int)b < (int)N);
+
+// 			if (a > b)
+// 				std::swap(a, b);
+			
+// 			size_t ix = ((b - 1) * b) / 2 + a;
+// 			assert(ix < M);
+
+// 			auto Bix = ix / 8;
+// 			auto bix = ix % 8;
+
+// 			m[Bix] |= 1 << bix;
+// 		}
+
+// 		outfile.write(reinterpret_cast<char*>(m.data()), m.size());
+// 	}
+
+// 	header.dataSize = outfile.tellp() - dataOffset;
+
+// 	std::sort(entries.begin(), entries.end(), [](CompoundBondInfo& a, CompoundBondInfo& b)
+// 	{
+// 		return a.id < b.id;
+// 	});
+
+// 	for (auto& info: entries)
+// 		outfile << info;
+
+// 	outfile.seekp(0);
+// 	outfile << header;
+
+// 	// compress
+// 	outfile.close();
+
+// 	std::ifstream in(infofile.string() + ".tmp", std::ios::binary);
+// 	std::ofstream out(infofile, std::ios::binary);
+
+// 	{
+// 		io::filtering_stream<io::output> os;
+// 		os.push(io::gzip_compressor());
+// 		os.push(out);
+// 		io::copy(in, os);
+// 	}
+
+// 	in.close();
+// 	out.close();
+
+// 	fs::remove(infofile.string() + ".tmp");
+// }
+
+// --------------------------------------------------------------------
+
+struct CompoundBondInfo
+{
+	IDType	mID;
+	std::set<std::tuple<uint32_t,uint32_t>> mBonded;
+
+	bool bonded(uint32_t a1, uint32_t a2) const
+	{
+		return mBonded.count({ a1, a2 }) > 0;
+	}
+
+};
+
+// --------------------------------------------------------------------
+
+class CompoundBondMap
+{
+  public:
+
+	static CompoundBondMap &instance()
+	{
+		static std::unique_ptr<CompoundBondMap> s_instance(new CompoundBondMap);
+		return *s_instance;
+	}
+
+	bool bonded(const std::string& compoundID, const std::string& atomID1, const std::string& atomID2);
+
+  private:
+
+	CompoundBondMap() {}
+
+	uint32_t getAtomID(const std::string& atomID)
+	{
+		IDType id(atomID);
+
+		uint32_t result;
+
+		auto i = mAtomIDIndex.find(id);
+		if (i == mAtomIDIndex.end())
+		{
+			result = mAtomIDIndex.size();
+			mAtomIDIndex[id] = result;
+		}
+		else
+			result = i->second;
+		
+		return result;
+	}
+
+	std::map<IDType,uint32_t> mAtomIDIndex;
+	std::vector<CompoundBondInfo> mCompounds;
+	std::mutex mMutex;
+};
+
+bool CompoundBondMap::bonded(const std::string &compoundID, const std::string& atomID1, const std::string& atomID2)
+{
+	std::lock_guard lock(mMutex);
+
+	using namespace std::literals;
+
+	IDType id(compoundID);
+	uint32_t a1 = getAtomID(atomID1);
+	uint32_t a2 = getAtomID(atomID2);
+	if (a1 > a2)
+		std::swap(a1, a2);
+	
+	for (auto &bi: mCompounds)
+	{
+		if (bi.mID != id)
+			continue;
+		
+		return bi.bonded(a1, a2);
+	}
+
+	// not found in our cache, calculate
+	auto compound = CompoundFactory::instance().create(compoundID);
+	if (not compound)
+		throw BondMapException("Missing compound bond info for " + compoundID);
+
+	bool result = false;
+
+	CompoundBondInfo bondInfo{ id };
+	for (auto &atom: compound->bonds())
+	{
+		uint32_t ca1 = getAtomID(atom.atomID[0]);
+		uint32_t ca2 = getAtomID(atom.atomID[1]);
+		if (ca1 > ca2)
+			std::swap(ca1, ca2);
+		
+		bondInfo.mBonded.insert({ca1, ca2});
+		result = result or (a1 == ca1 and a2 == ca2);
+	}
+
+	mCompounds.push_back(bondInfo);
+
+	return result;
+}
+
+// --------------------------------------------------------------------
+
+BondMap::BondMap(const Structure& p)
+{
+	auto& compoundBondInfo = CompoundBondMap::instance();
+
+	auto atoms = p.atoms();
+	dim = atoms.size();
+
+//	bond = std::vector<bool>(dim * (dim - 1), false);
+
+	for (auto& atom: atoms)
+	{
+		size_t ix = index.size();
+		index[atom.id()] = ix;
+	};
+	
+	auto bindAtoms = [this](const std::string& a, const std::string& b)
+	{
+		uint32_t ixa = index[a];
+		uint32_t ixb = index[b];
+		
+		bond.insert(key(ixa, ixb));
+	};
+
+	auto linkAtoms = [this,&bindAtoms](const std::string& a, const std::string& b)
+	{
+		bindAtoms(a, b);
+
+		link[a].insert(b);
+		link[b].insert(a);
+	};
+
+	cif::Datablock& db = p.getFile().data();
+
+	// collect all compounds first
+	std::set<std::string> compounds;
+	for (auto c: db["chem_comp"])
+		compounds.insert(c["id"].as<std::string>());
+	
+	// make sure we also have all residues in the polyseq
+	for (auto m: db["entity_poly_seq"])
+	{
+		std::string c = m["mon_id"].as<std::string>();
+		if (compounds.count(c))
+			continue;
+		
+		if (cif::VERBOSE > 1)
+			std::cerr << "Warning: mon_id " << c << " is missing in the chem_comp category" << std::endl;
+		compounds.insert(c);
+	}
+
+	cif::Progress progress(compounds.size(), "Creating bond map");
+
+	// some helper indices to speed things up a bit
+	std::map<std::tuple<std::string,int,std::string>,std::string> atomMapByAsymSeqAndAtom;
+	for (auto& a: p.atoms())
+	{
+		auto key = make_tuple(a.labelAsymID(), a.labelSeqID(), a.labelAtomID());
+		atomMapByAsymSeqAndAtom[key] = a.id();
+	}
+	
+	// first link all residues in a polyseq
+	
+	std::string lastAsymID;
+	int lastSeqID = 0;
+	for (auto r: db["pdbx_poly_seq_scheme"])
+	{
+		std::string asymID;
+		int seqID;
+
+		cif::tie(asymID, seqID) = r.get("asym_id", "seq_id");
+
+		if (asymID != lastAsymID)		// first in a new sequece
+		{
+			lastAsymID = asymID;
+			lastSeqID = seqID;
+			continue;
+		}
+		
+		auto c = atomMapByAsymSeqAndAtom[make_tuple(asymID, lastSeqID, "C")];
+		auto n = atomMapByAsymSeqAndAtom[make_tuple(asymID, seqID, "N")];
+
+		if (not (c.empty() or n.empty()))
+			bindAtoms(c, n);
+		
+		lastSeqID = seqID;
+	}
+
+	for (auto l: db["struct_conn"])
+	{
+		std::string asym1, asym2, atomId1, atomId2;
+		int seqId1 = 0, seqId2 = 0;
+		cif::tie(asym1, asym2, atomId1, atomId2, seqId1, seqId2) =
+			l.get("ptnr1_label_asym_id", "ptnr2_label_asym_id",
+				  "ptnr1_label_atom_id", "ptnr2_label_atom_id",
+				  "ptnr1_label_seq_id", "ptnr2_label_seq_id");
+
+		std::string a = atomMapByAsymSeqAndAtom[make_tuple(asym1, seqId1, atomId1)];
+		std::string b = atomMapByAsymSeqAndAtom[make_tuple(asym2, seqId2, atomId2)];
+			
+		if (not (a.empty() or b.empty()))
+			linkAtoms(a, b);
+	}
+
+	// then link all atoms in the compounds
+	
+	for (auto c: compounds)
+	{
+		if (c == "HOH" or c == "H2O" or c == "WAT")
+		{
+			if (cif::VERBOSE)
+				std::cerr << "skipping water in bond map calculation" << std::endl;
+			continue;
+		}
+
+		auto bonded = [c, &compoundBondInfo](const Atom& a, const Atom& b)
+		{
+			auto label_a = a.labelAtomID();
+			auto label_b = b.labelAtomID();
+
+			return compoundBondInfo.bonded(c, label_a, label_b);
+		};
+
+		// loop over poly_seq_scheme
+		for (auto r: db["pdbx_poly_seq_scheme"].find(cif::Key("mon_id") == c))
+		{
+			std::string asymID;
+			int seqID;
+			cif::tie(asymID, seqID) = r.get("asym_id", "seq_id");
+			
+			std::vector<Atom> rAtoms;
+			copy_if(atoms.begin(), atoms.end(), back_inserter(rAtoms),
+				[&](auto& a) { return a.labelAsymID() == asymID and a.labelSeqID() == seqID; });
+			
+			for (uint32_t i = 0; i + 1 < rAtoms.size(); ++i)
+			{
+				for (uint32_t j = i + 1; j < rAtoms.size(); ++j)
+				{
+					if (bonded(rAtoms[i], rAtoms[j]))
+						bindAtoms(rAtoms[i].id(), rAtoms[j].id());
+				}
+			}
+		}
+
+		// loop over pdbx_nonpoly_scheme
+		for (auto r: db["pdbx_nonpoly_scheme"].find(cif::Key("mon_id") == c))
+		{
+			std::string asymID;
+			cif::tie(asymID) = r.get("asym_id");
+			
+			std::vector<Atom> rAtoms;
+			copy_if(atoms.begin(), atoms.end(), back_inserter(rAtoms),
+				[&](auto& a) { return a.labelAsymID() == asymID; });
+			
+			for (uint32_t i = 0; i + 1 < rAtoms.size(); ++i)
+			{
+				for (uint32_t j = i + 1; j < rAtoms.size(); ++j)
+				{
+					if (bonded(rAtoms[i], rAtoms[j]))
+					{
+						uint32_t ixa = index[rAtoms[i].id()];
+						uint32_t ixb = index[rAtoms[j].id()];
+						
+						bond.insert(key(ixa, ixb));
+					}
+				}
+			}
+		}
+
+		// loop over pdbx_branch_scheme
+		for (auto r: db["pdbx_branch_scheme"].find(cif::Key("mon_id") == c))
+		{
+			std::string asymID;
+			cif::tie(asymID) = r.get("asym_id");
+			
+			std::vector<Atom> rAtoms;
+			copy_if(atoms.begin(), atoms.end(), back_inserter(rAtoms),
+				[&](auto& a) { return a.labelAsymID() == asymID; });
+			
+			for (uint32_t i = 0; i + 1 < rAtoms.size(); ++i)
+			{
+				for (uint32_t j = i + 1; j < rAtoms.size(); ++j)
+				{
+					if (bonded(rAtoms[i], rAtoms[j]))
+					{
+						uint32_t ixa = index[rAtoms[i].id()];
+						uint32_t ixb = index[rAtoms[j].id()];
+						
+						bond.insert(key(ixa, ixb));
+					}
+				}
+			}
+		}
+	}
+	
+	// start by creating an index for single bonds
+	
+	std::multimap<uint32_t,uint32_t> b1_2;
+	for (auto& bk: bond)
+	{
+		uint32_t a, b;
+		std::tie(a, b) = dekey(bk);
+		
+		b1_2.insert({ a, b });
+		b1_2.insert({ b, a });
+	}
+	
+	std::multimap<uint32_t,uint32_t> b1_3;
+	for (uint32_t i = 0; i < dim; ++i)
+	{
+		auto a = b1_2.equal_range(i);
+		
+		std::vector<uint32_t> s;
+		for (auto j = a.first; j != a.second; ++j)
+			s.push_back(j->second);
+
+		for (size_t si1 = 0; si1 + 1 < s.size(); ++si1)
+		{
+			for (size_t si2 = si1 + 1; si2 < s.size(); ++si2)
+			{
+				uint32_t x = s[si1];
+				uint32_t y = s[si2];
+				
+				if (isBonded(x, y))
+					continue;
+				
+				b1_3.insert({ x, y });
+				b1_3.insert({ y, x });
+			}
+		}
+	}
+
+	for (uint32_t i = 0; i < dim; ++i)
+	{
+		auto a1 = b1_2.equal_range(i);
+		auto a2 = b1_3.equal_range(i);
+		
+		for (auto ai1 = a1.first; ai1 != a1.second; ++ai1)
+		{
+			for (auto ai2 = a2.first; ai2 != a2.second; ++ai2)
+			{
+				uint32_t b1 = ai1->second;
+				uint32_t b2 = ai2->second;
+				
+				if (isBonded(b1, b2))
+					continue;
+				
+				bond_1_4.insert(key(b1, b2));
+			}
+		}
+	}
+}
+
+std::vector<std::string> BondMap::linked(const Atom& a) const
+{
+	auto i = link.find(a.id());
+	
+	std::vector<std::string> result;
+	
+	if (i != link.end())
+		result = std::vector<std::string>(i->second.begin(), i->second.end());
+
+	return result;
+}
+
+std::vector<std::string> BondMap::atomIDsForCompound(const std::string& compoundID)
+{
+	std::vector<std::string> result;
+
+	auto* compound = mmcif::Compound::create(compoundID);
+
+	if (compound == nullptr)
+		throw BondMapException("Missing bond information for compound " + compoundID);
+
+	for (auto& compAtom: compound->atoms())
+		result.push_back(compAtom.id);
+
+	return result;
+}
+
+}
--- a/src/Compound.cpp
+++ b/src/Compound.cpp
@@ -144,6 +144,51 @@ Compound::Compound(cif::Datablock &db)
 	}
 }

+Compound::Compound(cif::Datablock &db, const std::string &id, const std::string &name, const std::string &type)
+	: mID(id)
+	, mName(name)
+	, mType(type)
+{
+	auto &chemCompAtom = db["chem_comp_atom"];
+	for (auto row: chemCompAtom)
+	{
+		CompoundAtom atom;
+		std::string typeSymbol;
+		cif::tie(atom.id, typeSymbol, atom.charge, atom.x, atom.y, atom.z) =
+			row.get("atom_id", "type_symbol", "charge", "x", "y", "z");
+		atom.typeSymbol = AtomTypeTraits(typeSymbol).type();
+
+		mFormalCharge += atom.charge;
+		mFormulaWeight += AtomTypeTraits(atom.typeSymbol).weight();
+
+		mAtoms.push_back(std::move(atom));
+	}
+
+	auto &chemCompBond = db["chem_comp_bond"];
+	for (auto row: chemCompBond)
+	{
+		CompoundBond bond;
+		std::string type;
+		cif::tie(bond.atomID[0], bond.atomID[1], type, bond.aromatic)
+			= row.get("atom_id_1", "atom_id_2", "type", "aromatic");
+
+		using cif::iequals;
+
+			 if (iequals(type, "single"))	bond.type = BondType::sing;
+		else if (iequals(type, "double"))	bond.type = BondType::doub;
+		else if (iequals(type, "triple"))	bond.type = BondType::trip;
+		else if (iequals(type, "deloc") or iequals(type, "aromat") or iequals(type, "aromatic"))
+			bond.type = BondType::delo;
+		else
+		{
+			if (cif::VERBOSE)
+				std::cerr << "Unimplemented chem_comp_bond.type " << type << " in " << id << std::endl;
+			bond.type = BondType::sing;
+		}
+		mBonds.push_back(std::move(bond));
+	}
+}
+
 CompoundAtom Compound::getAtomByID(const std::string &atomID) const
 {
 	CompoundAtom result = {};
@@ -332,7 +377,7 @@ class CompoundFactoryImpl
 			   (mNext != nullptr and mNext->isKnownBase(resName));
 	}

-  private:
+  protected:
 	std::shared_timed_mutex mMutex;

 	std::vector<Compound *> mCompounds;
@@ -357,14 +402,61 @@ CompoundFactoryImpl::CompoundFactoryImpl(const std::string &file, CompoundFactor
 	: mNext(next)
 {
 	cif::File cifFile(file);
-	if (not cifFile.isValid())
-		throw std::runtime_error("Invalid compound file");

-	for (auto &db: cifFile)
+	auto compList = cifFile.get("comp_list");
+	if (compList)	// So this is a CCP4 restraints file, special handling
+	{
+		auto &chemComp = (*compList)["chem_comp"];
+
+		for (const auto &[id, name, group]: chemComp.rows<std::string,std::string,std::string>({"id", "name", "group"}))
+		{
+			std::string type;
+			
+			// known groups are (counted from ccp4 monomer dictionary)
+
+			//	D-pyranose
+			//	DNA
+			//	L-PEPTIDE LINKING
+			//	L-SACCHARIDE
+			//	L-peptide
+			//	L-pyranose
+			//	M-peptide
+			//	NON-POLYMER
+			//	P-peptide
+			//	RNA
+			//	furanose
+			//	non-polymer
+			//	non_polymer
+			//	peptide
+			//	pyranose
+			//	saccharide
+			
+			if (cif::iequals(id, "gly"))
+				type = "peptide linking";
+			else if (cif::iequals(group, "l-peptide") or cif::iequals(group, "L-peptide linking") or cif::iequals(group, "peptide"))
+				type = "L-peptide linking";
+			else if (cif::iequals(group, "DNA"))
+				type = "DNA linking";
+			else if (cif::iequals(group, "RNA"))
+				type = "RNA linking";
+			else
+				type = "non-polymer";
+			
+			auto &db = cifFile["comp_" + id];
+
+			mCompounds.push_back(new Compound(db, id, name, type));
+		}
+	}
+	else
 	{
-		auto compound = std::make_unique<Compound>(db);
+		// A CCD components file, validate it first
+		cifFile.loadDictionary();

-		mCompounds.push_back(compound.release());
+		if (not cifFile.isValid())
+			throw std::runtime_error("Invalid compound file");
+
+		for (auto &db: cifFile)
+			mCompounds.push_back(new Compound(db));
 	}
 }

@@ -466,6 +558,9 @@ Compound *CCDCompoundFactoryImpl::create(std::string id)

 	Compound *result = get(id);

+	if (result)
+		return result;
+
 	auto ccd = cif::loadResource("components.cif");
 	if (not ccd)
 		throw std::runtime_error("Could not locate the CCD components.cif file, please make sure the software is installed properly and/or use the update-dictionary-script to fetch the data.");
@@ -474,7 +569,7 @@ Compound *CCDCompoundFactoryImpl::create(std::string id)

 	if (mIndex.empty())
 	{
-		if (cif::VERBOSE)
+		if (cif::VERBOSE > 1)
 		{
 			std::cout << "Creating component index " << "...";
 			std::cout.flush();
@@ -483,14 +578,14 @@ Compound *CCDCompoundFactoryImpl::create(std::string id)
 		cif::Parser parser(*ccd, file, false);
 		mIndex = parser.indexDatablocks();

-		if (cif::VERBOSE)
+		if (cif::VERBOSE > 1)
 			std::cout << " done" << std::endl;
 		
 		// reload the resource, perhaps this should be improved...
 		ccd = cif::loadResource("components.cif");
 	}

-	if (cif::VERBOSE)
+	if (cif::VERBOSE > 1)
 	{
 		std::cout << "Loading component " << id << "...";
 		std::cout.flush();
@@ -499,13 +594,22 @@ Compound *CCDCompoundFactoryImpl::create(std::string id)
 	cif::Parser parser(*ccd, file, false);
 	parser.parseSingleDatablock(id, mIndex);

-	if (cif::VERBOSE)
+	if (cif::VERBOSE > 1)
 		std::cout << " done" << std::endl;

-	auto &db = file.firstDatablock();
-	if (db.getName() == id)
-		result = new Compound(db);
-	else if (cif::VERBOSE)
+	if (not file.empty())
+	{
+		auto &db = file.firstDatablock();
+		if (db.getName() == id)
+		{
+			result = new Compound(db);
+
+			std::shared_lock lock(mMutex);
+			mCompounds.push_back(result);
+		}
+	}
+
+	if (result == nullptr and cif::VERBOSE > 1)
 		std::cerr << "Could not locate compound " << id << " in the CCD components file" << std::endl;

 	return result;

--- a/test/UN_.cif
+++ b/test/UN_.cif
+#
+data_comp_list
+loop_
+_chem_comp.id
+_chem_comp.three_letter_code
+_chem_comp.name
+_chem_comp.group
+_chem_comp.number_atoms_all
+_chem_comp.number_atoms_nh
+_chem_comp.desc_level
+UN_     UN_     UN_NINE     L-peptide     13     6     .     
+#
+data_comp_UN_
+#
+loop_
+_chem_comp_atom.comp_id
+_chem_comp_atom.atom_id
+_chem_comp_atom.type_symbol
+_chem_comp_atom.type_energy
+_chem_comp_atom.charge
+_chem_comp_atom.x
+_chem_comp_atom.y
+_chem_comp_atom.z
+UN_     N       N       NT3     1       0.227       -1.259      0.452       
+UN_     H       H       H       0       0.069       -1.019      1.421       
+UN_     H2      H       H       0       1.104       -1.640      0.356       
+UN_     H3      H       H       0       -0.424      -1.909      0.174       
+UN_     CA      C       CH1     0       0.103       -0.030      -0.392      
+UN_     HA      H       H       0       0.160       -0.299      -1.339      
+UN_     CB      C       CH3     0       -1.244      0.625       -0.159      
+UN_     HB3     H       H       0       -1.857      -0.018      0.234       
+UN_     HB2     H       H       0       -1.605      0.932       -1.008      
+UN_     HB1     H       H       0       -1.150      1.385       0.442       
+UN_     C       C       C       0       1.270       0.922       -0.094      
+UN_     O       O       O       0       2.008       1.323       -0.994      
+UN_     OXT     O       OC      -1      1.498       1.305       1.054       
+
+loop_
+_chem_comp_tree.comp_id
+_chem_comp_tree.atom_id
+_chem_comp_tree.atom_back
+_chem_comp_tree.atom_forward
+_chem_comp_tree.connect_type
+ UN_      N      n/a    CA     START
+ UN_      H      N      .      .
+ UN_      H2     N      .      .
+ UN_      H3     N      .      .
+ UN_      CA     N      C      .
+ UN_      HA     CA     .      .
+ UN_      CB     CA     HB3    .
+ UN_      HB1    CB     .      .
+ UN_      HB2    CB     .      .
+ UN_      HB3    CB     .      .
+ UN_      C      CA     .      END
+ UN_      O      C      .      .
+ UN_      OXT      C      .      .
+loop_
+_chem_comp_bond.comp_id
+_chem_comp_bond.atom_id_1
+_chem_comp_bond.atom_id_2
+_chem_comp_bond.type
+_chem_comp_bond.aromatic
+_chem_comp_bond.value_dist
+_chem_comp_bond.value_dist_esd
+UN_          CB          CA      SINGLE       n     1.509   0.014
+UN_          CA           C      SINGLE       n     1.533   0.011
+UN_           C           O      DOUBLE       n     1.247   0.019
+UN_           C         OXT      SINGLE       n     1.247   0.019
+UN_          CA           N      SINGLE       n     1.482   0.010
+UN_          CB         HB3      SINGLE       n     0.972   0.015
+UN_          CB         HB2      SINGLE       n     0.972   0.015
+UN_          CB         HB1      SINGLE       n     0.972   0.015
+UN_          CA          HA      SINGLE       n     0.986   0.020
+UN_           N           H      SINGLE       n     0.911   0.020
+UN_           N          H2      SINGLE       n     0.911   0.020
+UN_           N          H3      SINGLE       n     0.911   0.020
+loop_
+_chem_comp_angle.comp_id
+_chem_comp_angle.atom_id_1
+_chem_comp_angle.atom_id_2
+_chem_comp_angle.atom_id_3
+_chem_comp_angle.value_angle
+_chem_comp_angle.value_angle_esd
+UN_          CA          CB         HB3     109.546    1.50
+UN_          CA          CB         HB2     109.546    1.50
+UN_          CA          CB         HB1     109.546    1.50
+UN_         HB3          CB         HB2     109.386    1.50
+UN_         HB3          CB         HB1     109.386    1.50
+UN_         HB2          CB         HB1     109.386    1.50
+UN_          CB          CA           C     111.490    1.50
+UN_          CB          CA           N     109.912    1.50
+UN_          CB          CA          HA     108.878    1.50
+UN_           C          CA           N     109.627    1.50
+UN_           C          CA          HA     108.541    1.50
+UN_           N          CA          HA     108.529    1.50
+UN_          CA           C           O     117.159    1.57
+UN_          CA           C         OXT     117.159    1.57
+UN_           O           C         OXT     125.683    1.50
+UN_          CA           N           H     109.643    1.50
+UN_          CA           N          H2     109.643    1.50
+UN_          CA           N          H3     109.643    1.50
+UN_           H           N          H2     109.028    2.41
+UN_           H           N          H3     109.028    2.41
+UN_          H2           N          H3     109.028    2.41
+loop_
+_chem_comp_tor.comp_id
+_chem_comp_tor.id
+_chem_comp_tor.atom_id_1
+_chem_comp_tor.atom_id_2
+_chem_comp_tor.atom_id_3
+_chem_comp_tor.atom_id_4
+_chem_comp_tor.value_angle
+_chem_comp_tor.value_angle_esd
+_chem_comp_tor.period
+ UN_      hh1      N      CA     CB     HB3       60.000   15.000   3
+UN_             sp2_sp3_1           O           C          CA          CB       0.000   10.00     6
+UN_            sp3_sp3_10          CB          CA           N           H     180.000   10.00     3
+loop_
+_chem_comp_chir.comp_id
+_chem_comp_chir.id
+_chem_comp_chir.atom_id_centre
+_chem_comp_chir.atom_id_1
+_chem_comp_chir.atom_id_2
+_chem_comp_chir.atom_id_3
+_chem_comp_chir.volume_sign
+UN_    chir_1    CA    N    C    CB    positive
+loop_
+_chem_comp_plane_atom.comp_id
+_chem_comp_plane_atom.plane_id
+_chem_comp_plane_atom.atom_id
+_chem_comp_plane_atom.dist_esd
+UN_    plan-1           C   0.020
+UN_    plan-1          CA   0.020
+UN_    plan-1           O   0.020
+UN_    plan-1         OXT   0.020
+loop_
+_pdbx_chem_comp_descriptor.comp_id
+_pdbx_chem_comp_descriptor.type
+_pdbx_chem_comp_descriptor.program
+_pdbx_chem_comp_descriptor.program_version
+_pdbx_chem_comp_descriptor.descriptor
+UN_ SMILES           ACDLabs              10.04 "O=C(O)C(N)C"
+UN_ SMILES_CANONICAL CACTVS               3.341 "C[C@H](N)C(O)=O"
+UN_ SMILES           CACTVS               3.341 "C[CH](N)C(O)=O"
+UN_ SMILES_CANONICAL "OpenEye OEToolkits" 1.5.0 "C[C@@H](C(=O)O)N"
+UN_ SMILES           "OpenEye OEToolkits" 1.5.0 "CC(C(=O)O)N"
+UN_ InChI            InChI                1.03  "InChI=1S/C3H7NO2/c1-2(4)3(5)6/h2H,4H2,1H3,(H,5,6)/t2-/m0/s1"
+UN_ InChIKey         InChI                1.03  QNAYBMKLOCPYGJ-REOHCLBHSA-N
+UN_ ?                acedrg               195         "dictionary generator"                  
+UN_ ?                acedrg_database      11          "data source"                           
+UN_ ?                rdkit                2017.03.2   "Chemoinformatics tool"
+UN_ ?                refmac5              5.8.0189    "optimization tool"                     
--- a/test/unit-test.cpp
+++ b/test/unit-test.cpp
@@ -31,6 +31,7 @@

 // #include "cif++/DistanceMap.hpp"
 #include "cif++/Cif++.hpp"
+#include "cif++/BondMap.hpp"

 // --------------------------------------------------------------------

@@ -1208,3 +1209,156 @@ _test.name
    BOOST_CHECK(id == 1);
    BOOST_CHECK(name == "aap");
 }
+
+// --------------------------------------------------------------------
+
+BOOST_AUTO_TEST_CASE(bondmap_1)
+{
+    cif::VERBOSE = 2;
+
+	// sections taken from CCD compounds.cif
+	auto components = R"(
+data_ASN
+loop_
+_chem_comp_bond.comp_id
+_chem_comp_bond.atom_id_1
+_chem_comp_bond.atom_id_2
+_chem_comp_bond.value_order
+_chem_comp_bond.pdbx_aromatic_flag
+_chem_comp_bond.pdbx_stereo_config
+_chem_comp_bond.pdbx_ordinal
+ASN N   CA   SING N N 1
+ASN N   H    SING N N 2
+ASN N   H2   SING N N 3
+ASN CA  C    SING N N 4
+ASN CA  CB   SING N N 5
+ASN CA  HA   SING N N 6
+ASN C   O    DOUB N N 7
+ASN C   OXT  SING N N 8
+ASN CB  CG   SING N N 9
+ASN CB  HB2  SING N N 10
+ASN CB  HB3  SING N N 11
+ASN CG  OD1  DOUB N N 12
+ASN CG  ND2  SING N N 13
+ASN ND2 HD21 SING N N 14
+ASN ND2 HD22 SING N N 15
+ASN OXT HXT  SING N N 16
+data_PHE
+loop_
+_chem_comp_bond.comp_id
+_chem_comp_bond.atom_id_1
+_chem_comp_bond.atom_id_2
+_chem_comp_bond.value_order
+_chem_comp_bond.pdbx_aromatic_flag
+_chem_comp_bond.pdbx_stereo_config
+_chem_comp_bond.pdbx_ordinal
+PHE N   CA  SING N N 1
+PHE N   H   SING N N 2
+PHE N   H2  SING N N 3
+PHE CA  C   SING N N 4
+PHE CA  CB  SING N N 5
+PHE CA  HA  SING N N 6
+PHE C   O   DOUB N N 7
+PHE C   OXT SING N N 8
+PHE CB  CG  SING N N 9
+PHE CB  HB2 SING N N 10
+PHE CB  HB3 SING N N 11
+PHE CG  CD1 DOUB Y N 12
+PHE CG  CD2 SING Y N 13
+PHE CD1 CE1 SING Y N 14
+PHE CD1 HD1 SING N N 15
+PHE CD2 CE2 DOUB Y N 16
+PHE CD2 HD2 SING N N 17
+PHE CE1 CZ  DOUB Y N 18
+PHE CE1 HE1 SING N N 19
+PHE CE2 CZ  SING Y N 20
+PHE CE2 HE2 SING N N 21
+PHE CZ  HZ  SING N N 22
+PHE OXT HXT SING N N 23
+data_PRO
+loop_
+_chem_comp_bond.comp_id
+_chem_comp_bond.atom_id_1
+_chem_comp_bond.atom_id_2
+_chem_comp_bond.value_order
+_chem_comp_bond.pdbx_aromatic_flag
+_chem_comp_bond.pdbx_stereo_config
+_chem_comp_bond.pdbx_ordinal
+PRO N   CA  SING N N 1
+PRO N   CD  SING N N 2
+PRO N   H   SING N N 3
+PRO CA  C   SING N N 4
+PRO CA  CB  SING N N 5
+PRO CA  HA  SING N N 6
+PRO C   O   DOUB N N 7
+PRO C   OXT SING N N 8
+PRO CB  CG  SING N N 9
+PRO CB  HB2 SING N N 10
+PRO CB  HB3 SING N N 11
+PRO CG  CD  SING N N 12
+PRO CG  HG2 SING N N 13
+PRO CG  HG3 SING N N 14
+PRO CD  HD2 SING N N 15
+PRO CD  HD3 SING N N 16
+PRO OXT HXT SING N N 17
+)"_cf;
+
+	const std::filesystem::path example("../examples/1cbs.cif.gz");
+	mmcif::File file(example);
+	mmcif::Structure structure(file);
+
+	mmcif::BondMap bm(structure);
+
+	// Test the bonds of the first three residues, that's PRO A 1, ASN A 2, PHE A 3	
+
+	for (const auto& [compound, seqnr]: std::initializer_list<std::tuple<std::string,int>>{ { "PRO", 1 }, { "ASN", 2 }, { "PHE", 3 } })
+	{
+		auto& res = structure.getResidue("A", compound, seqnr);
+		auto atoms = res.atoms();
+
+		auto dc = components.get(compound);
+		BOOST_ASSERT(dc != nullptr);
+
+		auto cc = dc->get("chem_comp_bond");
+		BOOST_ASSERT(cc != nullptr);
+
+		std::set<std::tuple<std::string,std::string>> bonded;
+
+		for (const auto& [atom_id_1, atom_id_2]: cc->rows<std::string,std::string>({ "atom_id_1", "atom_id_2" }))
+		{
+			if (atom_id_1 > atom_id_2)
+				bonded.insert({ atom_id_2, atom_id_1 });
+			else
+				bonded.insert({ atom_id_1, atom_id_2 });
+		}
+
+		for (size_t i = 0; i + 1 < atoms.size(); ++i)
+		{
+			auto label_i = atoms[i].labelAtomID();
+
+			for (size_t j = i + 1; j < atoms.size(); ++j)
+			{
+				auto label_j = atoms[j].labelAtomID();
+
+				bool bonded_1 = bm(atoms[i], atoms[j]);
+				bool bonded_1_i = bm(atoms[j], atoms[i]);
+
+				bool bonded_t = label_i > label_j
+					? bonded.count({ label_j, label_i })
+					: bonded.count({ label_i, label_j });
+
+				BOOST_CHECK(bonded_1 == bonded_t);
+				BOOST_CHECK(bonded_1_i == bonded_t);
+			}
+		}
+	}
+}
+
+BOOST_AUTO_TEST_CASE(bondmap_2)
+{
+	BOOST_CHECK_THROW(mmcif::BondMap::atomIDsForCompound("UN_"), mmcif::BondMapException);
+
+	mmcif::CompoundFactory::instance().pushDictionary("./UN_.cif");
+
+	BOOST_CHECK(mmcif::BondMap::atomIDsForCompound("UN_").empty() == false);
+}
\ No newline at end of file