renaming, first steps

2e2fc11f · Maarten L. Hekkelman · d44ed57c · 2e2fc11f · 2e2fc11f · 2e2fc11f
Commit 2e2fc11f authored Apr 20, 2021 by Maarten L. Hekkelman
9 changed files
--- a/GNUmakefile.in
+++ b/GNUmakefile.in
@@ -231,7 +231,7 @@ $(1)-test: test/$(1)-test
 endef
-TESTS = unit pdb2cif
+TESTS = unit pdb2cif rename-compound
 $(foreach part,$(TESTS),$(eval $(call TEST_template,$(part))))

--- a/configure.ac
+++ b/configure.ac
@@ -38,7 +38,7 @@ LT_INIT([disable-shared pic-only])
 AC_SUBST(LIBTOOL_DEPS)
-dnl versioning, first for libtool
+dnl versioning, for libtool
 LIBCIF_CURRENT=1
 LIBCIF_REVISION=1
 LIBCIF_AGE=1

--- a/include/cif++/Cif++.hpp
+++ b/include/cif++/Cif++.hpp
@@ -1881,6 +1881,12 @@ class Category
 	void reorderByIndex();
 	void sort(std::function<int(const Row&, const Row&)> comparator);
+	// --------------------------------------------------------------------
+	// Generate a new ID that is not yet used as an 'id' value in this
+	// category. Pass it a function that turns a sequence number into a
+	// candidate ID. That function is called with increasing sequence
+	// numbers, starting at size() + 1, until the result is unique in the
+	// context of this category. By default cif::cifIdForNumber is used.
+	std::string getUniqueID(std::function<std::string(int)> generator = cif::cifIdForNumber);
  private:
 	void write(std::ostream& os);

--- a/include/cif++/CifUtils.hpp
+++ b/include/cif++/CifUtils.hpp
@@ -92,6 +92,11 @@ inline char tolower(char ch)
 std::tuple<std::string,std::string> splitTagName(const std::string& tag);
 // --------------------------------------------------------------------
+// Generate a cif name from a sequence number, mainly used to generate
+// asym_id's. Numbers below 26 * 26 * 26 result in one to three upper case
+// letters ("A" for 0, "AA" for 26), larger numbers result in an 'L'
+// followed by the decimal number.
+std::string cifIdForNumber(int number);
+// --------------------------------------------------------------------
 //	custom wordwrapping routine
 std::vector<std::string> wordWrap(const std::string& text, unsigned int width);

--- a/include/cif++/Structure.hpp
+++ b/include/cif++/Structure.hpp
@@ -445,7 +445,8 @@ class Structure
 	Atom getAtomByLabel(const std::string& atomID, const std::string& asymID,
 		const std::string& compID, int seqID, const std::string& altID = "");
-	const Residue& getResidue(const std::string& asymID, const std::string& compID, int seqID) const;
+	/// \brief Get a residue; if \a seqID is zero, the non-polymers are searched
+	const Residue& getResidue(const std::string& asymID, const std::string& compID, int seqID = 0) const;
 	// map between auth and label locations

--- a/src/Cif++.cpp
+++ b/src/Cif++.cpp
@@ -1385,6 +1385,23 @@ void Category::sort(std::function<int(const Row&, const Row&)> comparator)
 	assert(size() == rows.size());
 }
+// Return an ID that does not yet occur in the 'id' column of this category.
+// The generator is fed consecutive sequence numbers, the first one being
+// the current number of rows plus one, and the first candidate for which
+// no row exists is returned.
+std::string Category::getUniqueID(std::function<std::string(int)> generator)
+{
+	using namespace cif::literals;
+	int seqNr = size() + 1;
+	std::string candidate;
+	// keep asking for the next candidate as long as it is already taken
+	do
+		candidate = generator(seqNr++);
+	while (exists("id"_key == candidate));
+	return candidate;
+}
 size_t Category::size() const
 {
 	size_t result = 0;

--- a/src/CifUtils.cpp
+++ b/src/CifUtils.cpp
@@ -215,6 +215,37 @@ std::tuple<std::string, std::string> splitTagName(const std::string &tag)
 }
 // --------------------------------------------------------------------
+// Generate a cif name for a zero-based sequence number, mainly used to
+// generate asym_id's.
+//
+// Numbers below 26 * 26 * 26 are written in bijective base 26 using upper
+// case letters: 0 .. 25 give "A" .. "Z", 26 gives "AA", 701 gives "ZZ" and
+// 702 gives "AAA". Every number in this range gets a name of its own, so
+// e.g. 676 gives "ZA" and does not clash with the "AA" generated for 26.
+//
+// Numbers of 26 * 26 * 26 and up result in an 'L' followed by the decimal
+// number. Negative numbers are not supported.
+std::string cifIdForNumber(int number)
+{
+	assert(number >= 0);
+	std::string result;
+	if (number >= 26 * 26 * 26)
+		result = 'L' + std::to_string(number);
+	else
+	{
+		// Peel off the letters, least significant one first, and put each
+		// in front of what we have so far. The extra -1 is what makes the
+		// numbering bijective: there is no zero digit, "AA" directly
+		// follows "Z".
+		do
+		{
+			result.insert(result.begin(), static_cast<char>('A' + number % 26));
+			number = number / 26 - 1;
+		}
+		while (number >= 0);
+	}
+	assert(not result.empty());
+	return result;
+}
+// --------------------------------------------------------------------
 // Simplified line breaking code taken from a decent text editor.
 // In this case, simplified means it only supports ASCII.

--- a/src/PDB2Cif.cpp
+++ b/src/PDB2Cif.cpp
@@ -566,21 +566,19 @@ class PDBFileParser
 			if (not result.empty() and result.back() != ']')
 				result += '-';
-				 if (sugar->c1.resName == "MAN") result += "alpha-D-mannopyranose";
+			auto compound = CompoundFactory::instance().create(sugar->c1.resName);
+			if (compound)
+				result += compound->name();
+			else if (sugar->c1.resName == "MAN") result += "alpha-D-mannopyranose";
 			else if (sugar->c1.resName == "BMA") result += "beta-D-mannopyranose";
 			else if (sugar->c1.resName == "NAG") result += "2-acetamido-2-deoxy-beta-D-glucopyranose";
 			else if (sugar->c1.resName == "NDG") result += "2-acetamido-2-deoxy-alpha-D-glucopyranose";
 			else if (sugar->c1.resName == "FUC") result += "alpha-L-fucopyranose";
 			else if (sugar->c1.resName == "FUL") result += "beta-L-fucopyranose";
 			else
-			{
+				result += sugar->c1.resName;
-				auto compound = CompoundFactory::instance().create(sugar->c1.resName);
-				if (compound)
-					result += compound->name();
-				else
-					result += sugar->c1.resName;
-			}
 			return result;
 		}
@@ -952,35 +950,6 @@ class PDBFileParser
 		return c;
 	}
-	std::string cifIdForInt(int nr) const
-	{
-		std::string result;
-		if (nr >= 26 * 26 * 26)
-			result = 'L' + std::to_string(nr);
-		else
-		{
-			if (nr >= 26 * 26)
-			{
-				int v = nr / (26 * 26);
-				result += 'A' - 1 + v;
-				nr %= (26 * 26);
-			}
-			if (nr >= 26)
-			{
-				int v = nr / 26;
-				result += 'A' - 1 + v;
-				nr %= 26;
-			}
-			result += 'A' + nr;
-		}
-		assert(not result.empty());
-		return result;
-	}
 	std::vector<char> altLocsForAtom(char chainID, int seqNum, char iCode, std::string atomName);
 	void MapChainID2AsymIDS(char chainID, std::vector<std::string>& asymIds);
@@ -1484,7 +1453,7 @@ void PDBFileParser::ParseTitle()
 		//	37 - 40       ...
-		std::string old		= vS(22, 25);
+		std::string old			= vS(22, 25);
 		std::string date		= pdb2cifDate(vS(12, 20));
 		cat = getCategory("pdbx_database_PDB_obs");
@@ -1506,7 +1475,7 @@ void PDBFileParser::ParseTitle()
 	Match("TITLE ", false);
 	std::string title;
 	if (mRec->is("TITLE "))	//	 1 -  6       Record name    "TITLE "
-	{							//	 9 - 10       Continuation   continuation  Allows concatenation of multiple records.
+	{						//	 9 - 10       Continuation   continuation  Allows concatenation of multiple records.
 		title = vS(11);		//	11 - 80       String         title         Title of the  experiment.
 		GetNextRecord();
 	}
@@ -3770,7 +3739,7 @@ void PDBFileParser::ConstructEntities()
 	int asymNr = 0;
 	for (auto& chain: mChains)
 	{
-		std::string asymID = cifIdForInt(asymNr++);
+		std::string asymID = cif::cifIdForNumber(asymNr++);
 		std::string entityID = mMolID2EntityID[chain.mMolID];
 		mAsymID2EntityID[asymID] = entityID;
@@ -4182,7 +4151,7 @@ void PDBFileParser::ConstructEntities()
 		if (ih != chain.mSeqres.end())
 			continue;
-		heti.asymID = cifIdForInt(asymNr++);
+		heti.asymID = cif::cifIdForNumber(asymNr++);
 	}
 	std::set<std::string> writtenAsyms;
@@ -4261,7 +4230,7 @@ void PDBFileParser::ConstructEntities()
 			{
 				if (waterChains.count(het.chainID) == 0)
 				{
-					asymID = cifIdForInt(asymNr++);
+					asymID = cif::cifIdForNumber(asymNr++);
 					waterChains[het.chainID] = asymID;
 				}
 				else
@@ -4604,7 +4573,7 @@ void PDBFileParser::ConstructSugarTrees(int& asymNr)
 				// create an asym for this sugar tree
-				std::string asymID = cifIdForInt(asymNr++);
+				std::string asymID = cif::cifIdForNumber(asymNr++);
 				getCategory("struct_asym")->emplace({
 					{ "id", asymID },

--- a/src/Structure.cpp
+++ b/src/Structure.cpp
@@ -2237,13 +2237,51 @@ void Structure::moveAtom(Atom& a, Point p)
 void Structure::changeResidue(const Residue& res, const std::string& newCompound,
 		const std::vector<std::tuple<std::string,std::string>>& remappedAtoms)
 {
+	using namespace cif::literals;
+	const auto compound = Compound::create(newCompound);
+	if (not compound)
+		throw std::runtime_error("Unknown compound " + newCompound);
 	cif::Datablock& db = *mFile.impl().mDb;
+	std::string asymID = res.asymID();
 	std::string entityID;
+	std::tie(entityID) = db["struct_asym"].find1<std::string>("id"_key == asymID, { "entity_id" });
 	// First make sure the compound is already known or insert it.
 	// And if the residue is an entity, we must make sure it exists
 	insertCompound(newCompound, res.isEntity());
+	// Next, if it is a non-polymer, update the entityID
+	if (db["pdbx_entity_nonpoly"].exists("entity_id"_key == entityID and "comp_id"_key == res.compoundID()))
+	{
+		try
+		{
+			std::tie(entityID) = db["entity"].find1<std::string>("type"_key == "non-polymer" and "pdbx_description"_key == compound->name(), { "id" });
+		}
+		catch (const std::exception& ex)
+		{
+			entityID = db["entity"].getUniqueID([](int i) { return std::to_string(i); });
+			db["entity"].emplace({
+				{ "id", entityID },
+				{ "type", "non-polymer" },
+				{ "src_method", "man" },
+				{ "pdbx_description", compound->name() },
+				{ "formula_weight", compound->formulaWeight() }
+			});
+		}
+		if (not db["pdbx_entity_nonpoly"].exists("entity_id"_key == entityID and "comp_id"_key == newCompound))
+		{
+			db["pdbx_entity_nonpoly"].emplace({
+				{ "entity_id", entityID },
+				{ "name", compound->name() },
+				{ "comp_id", newCompound }
+			});
+		}
+	}
 	auto& atomSites = db["atom_site"];
 	auto atoms = res.atoms();