Commit c5d277fb by maarten

refactored main using nested exceptions, fixed PDB parser a bit

git-svn-id: svn+ssh://gitlab/srv/svn-repos/pdb-redo/trunk@180 a1961a4f-ab94-4bcc-80e8-33b5a54de466
parent d0b7e21c
/*
Created by: Maarten L. Hekkelman
Date: Tuesday 07 November, 2017
Copyright 2017 NKI AVL
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <memory>
#include <string>
#include <tuple>
#include <vector>

#include "cif++/Cif++.h"
namespace cif
{

// Integer constants used by the TLS selection code. Only declared here; the
// definitions (and therefore the actual values) live in another translation unit.
// NOTE(review): the names suggest sentinel values for "any residue number" and
// "no sequence number" -- confirm against the definitions.
extern const int kResidueNrWildcard;
extern const int kNoSeqNum;

struct TLSSelection;
struct TLSResidue;

// Owning pointer to a polymorphic TLSSelection.
using TLSSelectionPtr = std::unique_ptr<TLSSelection>;

// Abstract base class for a TLS group selection.
// Concrete selections implement CollectResidues; GetRanges is a non-virtual
// member defined elsewhere.
struct TLSSelection
{
	// Virtual so that a derived selection can be destroyed through a
	// TLSSelectionPtr.
	virtual ~TLSSelection() = default;

	// Implemented by each concrete selection.
	//   db          - datablock the selection works on
	//   residues    - residue list passed by non-const reference, so the
	//                 implementation may modify it
	//   indentLevel - defaults to 0; presumably only used to indent diagnostic
	//                 output of nested selections (TODO confirm)
	virtual void CollectResidues(cif::Datablock& db, std::vector<TLSResidue>& residues, int indentLevel = 0) const = 0;

	// Returns a list of (string, int, int) tuples for this selection.
	// NOTE(review): presumably (chain/asym id, first seq nr, last seq nr), with
	// pdbNamespace choosing between PDB and mmCIF numbering -- verify against
	// the implementation.
	std::vector<std::tuple<std::string,int,int>> GetRanges(cif::Datablock& db, bool pdbNamespace) const;
};

// Low level: get the selections
// Builds a TLSSelection from the selection text `selection`; `program` is
// presumably the name of the program that wrote the selection and decides
// which syntax is parsed (TODO confirm).
TLSSelectionPtr ParseSelectionDetails(const std::string& program, const std::string& selection);

}
...@@ -1773,7 +1773,7 @@ Row& Row::operator=(const Row& rhs) ...@@ -1773,7 +1773,7 @@ Row& Row::operator=(const Row& rhs)
void Row::assign(const string& name, const string& value, bool emplacing) void Row::assign(const string& name, const string& value, bool emplacing)
{ {
if (mData == nullptr) if (mData == nullptr)
throw logic_error("invalid Row, no data"); throw logic_error("invalid Row, no data assigning value '" + value + "' to " + name);
auto cat = mData->mCategory; auto cat = mData->mCategory;
auto cix = cat->addColumn(name); auto cix = cat->addColumn(name);
...@@ -1905,7 +1905,7 @@ bool Row::empty() const ...@@ -1905,7 +1905,7 @@ bool Row::empty() const
auto Row::begin() const -> const_iterator auto Row::begin() const -> const_iterator
{ {
return const_iterator(mData, mData->mValues); return const_iterator(mData, mData ? mData->mValues : nullptr);
} }
auto Row::end() const -> const_iterator auto Row::end() const -> const_iterator
......
...@@ -264,6 +264,7 @@ const Compound* CompoundFactory::create(std::string id) ...@@ -264,6 +264,7 @@ const Compound* CompoundFactory::create(std::string id)
value = 1.5; value = 1.5;
else else
{ {
if (VERBOSE)
cerr << "Unimplemented chem_comp_bond.type " << type << " in file " << resFile << endl; cerr << "Unimplemented chem_comp_bond.type " << type << " in file " << resFile << endl;
value = 1.0; value = 1.0;
} }
......
...@@ -9,6 +9,8 @@ ...@@ -9,6 +9,8 @@
#include <boost/format.hpp> #include <boost/format.hpp>
#include <boost/numeric/ublas/matrix.hpp> #include <boost/numeric/ublas/matrix.hpp>
#include <clipper/core/spacegroup.h>
#include "cif++/PDB2Cif.h" #include "cif++/PDB2Cif.h"
#include "cif++/AtomType.h" #include "cif++/AtomType.h"
#include "cif++/Compound.h" #include "cif++/Compound.h"
...@@ -31,7 +33,8 @@ namespace error ...@@ -31,7 +33,8 @@ namespace error
{ {
enum pdbErrors enum pdbErrors
{ {
residueNotFound = 1000 residueNotFound = 1000,
invalidDate
}; };
namespace detail namespace detail
...@@ -52,6 +55,9 @@ namespace error ...@@ -52,6 +55,9 @@ namespace error
case residueNotFound: case residueNotFound:
return "Residue not found"; return "Residue not found";
case invalidDate:
return "Invalid date";
default: default:
return "Error in PDB format"; return "Error in PDB format";
} }
...@@ -123,7 +129,6 @@ PDBRecord::PDBRecord(uint32 lineNr, const string& name, const string& value) ...@@ -123,7 +129,6 @@ PDBRecord::PDBRecord(uint32 lineNr, const string& name, const string& value)
PDBRecord::~PDBRecord() PDBRecord::~PDBRecord()
{ {
delete mNext;
} }
void* PDBRecord::operator new(size_t size, size_t vLen) void* PDBRecord::operator new(size_t size, size_t vLen)
...@@ -373,7 +378,13 @@ class PDBFileParser ...@@ -373,7 +378,13 @@ class PDBFileParser
~PDBFileParser() ~PDBFileParser()
{ {
delete mData; PDBRecord* r = mData;
while (r != nullptr)
{
PDBRecord* d = r;
r = d->mNext;
delete d;
}
} }
void Parse(istream& is, cif::File& result); void Parse(istream& is, cif::File& result);
...@@ -489,8 +500,8 @@ class PDBFileParser ...@@ -489,8 +500,8 @@ class PDBFileParser
int mSeqNum; int mSeqNum;
char mIcode; char mIcode;
bool operator==(const AtomRes& rhs) const { return mMonId == rhs.mMonId and mSeqNum == rhs.mSeqNum and mIcode == rhs.mIcode; } bool operator==(const AtomRes& rhs) const { return mSeqNum == rhs.mSeqNum and mIcode == rhs.mIcode; }
bool operator!=(const AtomRes& rhs) const { return mMonId != rhs.mMonId or mSeqNum != rhs.mSeqNum or mIcode != rhs.mIcode; } bool operator!=(const AtomRes& rhs) const { return mSeqNum != rhs.mSeqNum or mIcode != rhs.mIcode; }
}; };
vector<AtomRes> mResiduesSeen; vector<AtomRes> mResiduesSeen;
...@@ -593,16 +604,7 @@ class PDBFileParser ...@@ -593,16 +604,7 @@ class PDBFileParser
int vI(int columnFirst, int columnLast) const int vI(int columnFirst, int columnLast) const
{ {
int result = 0; return mRec->vI(columnFirst, columnLast);
try
{
result = mRec->vI(columnFirst, columnLast);
}
catch (const exception& ex)
{
Error(ex.what());
}
return result;
} }
// ---------------------------------------------------------------- // ----------------------------------------------------------------
...@@ -643,14 +645,14 @@ class PDBFileParser ...@@ -643,14 +645,14 @@ class PDBFileParser
void GetNextRecord(); void GetNextRecord();
void Match(const string& expected); void Match(const string& expected);
void Error(const string& msg) const // void Error(const string& msg) const
{ // {
string lineNr; // string lineNr;
if (mRec != nullptr) // if (mRec != nullptr)
lineNr = " (at line " + to_string(mRec->mLineNr) + ')'; // lineNr = " (at line " + to_string(mRec->mLineNr) + ')';
//
throw runtime_error("Error parsing PDB file" + lineNr + ": " + msg); // throw runtime_error("Error parsing PDB file" + lineNr + ": " + msg);
} // }
void ParseTitle(); void ParseTitle();
void ParseCitation(const string& id); void ParseCitation(const string& id);
...@@ -686,7 +688,7 @@ class PDBFileParser ...@@ -686,7 +688,7 @@ class PDBFileParser
vector<string> SplitCSV(const string& value); vector<string> SplitCSV(const string& value);
string pdb2cifDate(string s) string pdb2cifDate(string s, boost::system::error_code& ec)
{ {
smatch m; smatch m;
const regex const regex
...@@ -700,7 +702,7 @@ class PDBFileParser ...@@ -700,7 +702,7 @@ class PDBFileParser
int day = stoi(m[1].str()); int day = stoi(m[1].str());
auto mi = kMonths.find(m[2].str()); auto mi = kMonths.find(m[2].str());
if (mi == kMonths.end()) if (mi == kMonths.end())
Error("Invalid month"); throw runtime_error("Invalid month: '" + m[2].str() + '\'');
int month = mi->second; int month = mi->second;
int year = 1900 + stoi(m[3].str()); int year = 1900 + stoi(m[3].str());
if (year < 1950) if (year < 1950)
...@@ -714,7 +716,7 @@ class PDBFileParser ...@@ -714,7 +716,7 @@ class PDBFileParser
{ {
auto mi = kMonths.find(m[1].str()); auto mi = kMonths.find(m[1].str());
if (mi == kMonths.end()) if (mi == kMonths.end())
Error("Invalid month"); throw runtime_error("Invalid month: '" + m[1].str() + '\'');
int month = mi->second; int month = mi->second;
int year = 1900 + stoi(m[2].str()); int year = 1900 + stoi(m[2].str());
if (year < 1950) if (year < 1950)
...@@ -722,10 +724,19 @@ class PDBFileParser ...@@ -722,10 +724,19 @@ class PDBFileParser
s = (boost::format("%04d-%02d") % year % month).str(); s = (boost::format("%04d-%02d") % year % month).str();
} }
else
ec = error::make_error_code(error::pdbErrors::invalidDate);
return s; return s;
} }
// Convenience overload of pdb2cifDate(s, ec) for callers that do not care
// whether the date could be recognised.
// Returns whatever the two-argument overload returns: the reformatted date,
// or the input string unchanged when it matches none of the known formats
// (the error code reporting that is deliberately dropped here).
// Exceptions thrown by the two-argument overload (e.g. "Invalid month")
// still propagate.
string pdb2cifDate(string s)
{
	boost::system::error_code ec;

	// The overload takes its argument by value, so the converted text is only
	// available through its return value; returning the local s would hand
	// back the unconverted input.
	return pdb2cifDate(s, ec);
}
string pdb2cifAuth(string author) string pdb2cifAuth(string author)
{ {
ba::trim(author); ba::trim(author);
...@@ -925,7 +936,16 @@ void PDBFileParser::PreParseInput(istream& is) ...@@ -925,7 +936,16 @@ void PDBFileParser::PreParseInput(istream& is)
getline(is, lookahead); getline(is, lookahead);
if (ba::starts_with(lookahead, "HEADER") == false) if (ba::starts_with(lookahead, "HEADER") == false)
Error("This does not look like a PDB file, should start with a HEADER line"); throw runtime_error("This does not look like a PDB file, should start with a HEADER line");
// Parse the continuation number stored in the len characters starting at
// offset of the current lookahead line.
// A blank field, or a line too short to contain the field at all, yields 0,
// which never matches an expected continuation number (the callers start
// counting at 2). A non-blank, non-numeric field still makes stoi throw.
auto contNr = [&lookahead](int offset, int len) -> int
{
	// substr throws std::out_of_range when offset lies beyond the end of the
	// line; treat a missing field the same as a blank one.
	if (offset < 0 or static_cast<size_t>(offset) >= lookahead.length())
		return 0;

	string cs = lookahead.substr(offset, len);
	ba::trim(cs);
	return cs.empty() ? 0 : stoi(cs);
};
PDBRecord* last = nullptr; PDBRecord* last = nullptr;
set<string> dropped; set<string> dropped;
...@@ -964,7 +984,7 @@ void PDBFileParser::PreParseInput(istream& is) ...@@ -964,7 +984,7 @@ void PDBFileParser::PreParseInput(istream& is)
type == "TITLE ") type == "TITLE ")
{ {
int n = 2; int n = 2;
while (lookahead.substr(0, 6) == type and stoi(lookahead.substr(7, 3)) == n) while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
{ {
value += ba::trim_right_copy(lookahead.substr(10)); value += ba::trim_right_copy(lookahead.substr(10));
getline(is, lookahead); getline(is, lookahead);
...@@ -976,7 +996,7 @@ void PDBFileParser::PreParseInput(istream& is) ...@@ -976,7 +996,7 @@ void PDBFileParser::PreParseInput(istream& is)
{ {
int n = 2; int n = 2;
value += '\n'; value += '\n';
while (lookahead.substr(0, 6) == type and stoi(lookahead.substr(7, 3)) == n) while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
{ {
value += ba::trim_right_copy(lookahead.substr(10)); value += ba::trim_right_copy(lookahead.substr(10));
value += '\n'; value += '\n';
...@@ -991,7 +1011,7 @@ void PDBFileParser::PreParseInput(istream& is) ...@@ -991,7 +1011,7 @@ void PDBFileParser::PreParseInput(istream& is)
int n = 2; int n = 2;
while (lookahead.substr(0, 6) == type and while (lookahead.substr(0, 6) == type and
stoi(lookahead.substr(7, 3)) == revNr and stoi(lookahead.substr(7, 3)) == revNr and
stoi(lookahead.substr(10, 2)) == n) contNr(10, 2) == n)
{ {
value += lookahead.substr(38); value += lookahead.substr(38);
getline(is, lookahead); getline(is, lookahead);
...@@ -1002,7 +1022,7 @@ void PDBFileParser::PreParseInput(istream& is) ...@@ -1002,7 +1022,7 @@ void PDBFileParser::PreParseInput(istream& is)
else if (type == "CAVEAT") else if (type == "CAVEAT")
{ {
int n = 2; int n = 2;
while (lookahead.substr(0, 6) == type and stoi(lookahead.substr(7, 3)) == n) while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
{ {
value += ba::trim_right_copy(lookahead.substr(13)); value += ba::trim_right_copy(lookahead.substr(13));
getline(is, lookahead); getline(is, lookahead);
...@@ -1023,7 +1043,7 @@ void PDBFileParser::PreParseInput(istream& is) ...@@ -1023,7 +1043,7 @@ void PDBFileParser::PreParseInput(istream& is)
{ {
value += '\n'; value += '\n';
int n = 2; int n = 2;
while (lookahead.substr(0, 6) == type and stoi(lookahead.substr(7, 3)) == n) while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
{ {
value += ba::trim_copy(lookahead.substr(10)); value += ba::trim_copy(lookahead.substr(10));
value += '\n'; value += '\n';
...@@ -1038,8 +1058,7 @@ void PDBFileParser::PreParseInput(istream& is) ...@@ -1038,8 +1058,7 @@ void PDBFileParser::PreParseInput(istream& is)
int n = 2; int n = 2;
while (lookahead.substr(0, 6) == type and while (lookahead.substr(0, 6) == type and
stoi(lookahead.substr(7, 3)) == compNr and stoi(lookahead.substr(7, 3)) == compNr and
lookahead.substr(16, 2) != " " and contNr(16, 2) == n)
stoi(lookahead.substr(16, 2)) == n)
{ {
value += lookahead.substr(19); value += lookahead.substr(19);
getline(is, lookahead); getline(is, lookahead);
...@@ -1051,9 +1070,7 @@ void PDBFileParser::PreParseInput(istream& is) ...@@ -1051,9 +1070,7 @@ void PDBFileParser::PreParseInput(istream& is)
type == "HETSYN") type == "HETSYN")
{ {
int n = 2; int n = 2;
while (lookahead.substr(0, 6) == type and while (lookahead.substr(0, 6) == type and contNr(8, 2) == n)
lookahead.substr(8, 2) != " " and
stoi(lookahead.substr(8, 2)) == n)
{ {
value += lookahead.substr(16); value += lookahead.substr(16);
getline(is, lookahead); getline(is, lookahead);
...@@ -1116,6 +1133,9 @@ void PDBFileParser::PreParseInput(istream& is) ...@@ -1116,6 +1133,9 @@ void PDBFileParser::PreParseInput(istream& is)
last->mNext = cur; last->mNext = cur;
last = cur; last = cur;
if (type == "END ")
break;
} }
if (not dropped.empty()) if (not dropped.empty())
...@@ -1141,7 +1161,7 @@ void PDBFileParser::GetNextRecord() ...@@ -1141,7 +1161,7 @@ void PDBFileParser::GetNextRecord()
void PDBFileParser::Match(const string& expected) void PDBFileParser::Match(const string& expected)
{ {
if (mRec->mName != expected) if (mRec->mName != expected)
Error("Expected record " + expected + " but found " + mRec->mName); throw runtime_error("At line " + to_string(mRec->mLineNr) + ": expected record " + expected + " but found " + mRec->mName);
} }
vector<string> PDBFileParser::SplitCSV(const string& value) vector<string> PDBFileParser::SplitCSV(const string& value)
...@@ -1224,17 +1244,20 @@ void PDBFileParser::ParseTitle() ...@@ -1224,17 +1244,20 @@ void PDBFileParser::ParseTitle()
// 1 - 6 Record name "SPLIT " // 1 - 6 Record name "SPLIT "
// 9 - 10 Continuation continuation Allows concatenation of multiple records. // 9 - 10 Continuation continuation Allows concatenation of multiple records.
// 12 - 15 IDcode idCode ID code of related datablock. // 12 - 15 IDcode idCode ID code of related datablock.
if (VERBOSE)
Error("skipping unimplemented SPLIT record"); throw runtime_error("SPLIT PDB files are not supported");
GetNextRecord();
// if (VERBOSE)
// Error("skipping unimplemented SPLIT record");
// GetNextRecord();
} }
// CAVEAT // CAVEAT
if (mRec->is("CAVEAT")) // 1 - 6 Record name "CAVEAT" int caveatID = 1;
{ // 9 - 10 Continuation continuation Allows concatenation of multiple records. while (mRec->is("CAVEAT")) // 1 - 6 Record name "CAVEAT"
{
getCategory("database_PDB_caveat")->emplace({ getCategory("database_PDB_caveat")->emplace({
// { "id", vS(12, 15) }, // 12 - 15 IDcode idCode PDB ID code of this datablock. { "id", caveatID++ },
{ "id", 1 }, // 12 - 15 IDcode idCode PDB ID code of this datablock.
{ "text", string{mRec->vS(20) } } // 20 - 79 String comment Free text giving the reason for the CAVEAT. { "text", string{mRec->vS(20) } } // 20 - 79 String comment Free text giving the reason for the CAVEAT.
}); });
...@@ -1347,7 +1370,7 @@ void PDBFileParser::ParseTitle() ...@@ -1347,7 +1370,7 @@ void PDBFileParser::ParseTitle()
} }
if (source == nullptr) if (source == nullptr)
Error("MOL_ID missing"); throw runtime_error("At line " + to_string(mRec->mLineNr) + ": missing MOL_ID in SOURCE");
(*source)[key] = value; (*source)[key] = value;
} }
...@@ -1382,11 +1405,21 @@ void PDBFileParser::ParseTitle() ...@@ -1382,11 +1405,21 @@ void PDBFileParser::ParseTitle()
mExpMethod = vS(11); mExpMethod = vS(11);
cat = getCategory("exptl"); cat = getCategory("exptl");
for (auto si = ba::make_split_iterator(mExpMethod, ba::token_finder(ba::is_any_of(";"), ba::token_compress_on)); not si.eof(); ++si)
{
string expMethod(si->begin(), si->end());
ba::trim(expMethod);
if (expMethod.empty())
continue;
cat->emplace({ cat->emplace({
{ "entry_id", mStructureId }, { "entry_id", mStructureId },
{ "method", mExpMethod }, { "method", expMethod },
{ "crystals_number", mRemark200["NUMBER OF CRYSTALS USED"] } { "crystals_number", mRemark200["NUMBER OF CRYSTALS USED"] }
}); });
}
GetNextRecord(); GetNextRecord();
} }
...@@ -1395,7 +1428,7 @@ void PDBFileParser::ParseTitle() ...@@ -1395,7 +1428,7 @@ void PDBFileParser::ParseTitle()
if (mRec->is("NUMMDL")) if (mRec->is("NUMMDL"))
{ {
if (VERBOSE) if (VERBOSE)
Error("skipping unimplemented NUMMDL record"); cerr << "skipping unimplemented NUMMDL record" << endl;
GetNextRecord(); GetNextRecord();
} }
...@@ -1624,6 +1657,8 @@ void PDBFileParser::ParseRemarks() ...@@ -1624,6 +1657,8 @@ void PDBFileParser::ParseRemarks()
{ {
int remarkNr = vI(8, 10); int remarkNr = vI(8, 10);
try
{
switch (remarkNr) switch (remarkNr)
{ {
case 1: case 1:
...@@ -1780,9 +1815,9 @@ void PDBFileParser::ParseRemarks() ...@@ -1780,9 +1815,9 @@ void PDBFileParser::ParseRemarks()
break; break;
} }
// case 290: // case 290:
// //
// break; // break;
case 350: case 350:
// postponed since we don't have the required information yet // postponed since we don't have the required information yet
...@@ -1939,7 +1974,7 @@ void PDBFileParser::ParseRemarks() ...@@ -1939,7 +1974,7 @@ void PDBFileParser::ParseRemarks()
else if (subtopic == "MAIN CHAIN PLANARITY") state = eMCP; else if (subtopic == "MAIN CHAIN PLANARITY") state = eMCP;
else if (subtopic == "CHIRAL CENTERS") state = eChC; else if (subtopic == "CHIRAL CENTERS") state = eChC;
else if (VERBOSE) else if (VERBOSE)
Error("Unknown subtopic in REMARK 500: " + subtopic); throw runtime_error("Unknown subtopic in REMARK 500: " + subtopic);
headerSeen = false; headerSeen = false;
id = 0; id = 0;
...@@ -2021,15 +2056,15 @@ void PDBFileParser::ParseRemarks() ...@@ -2021,15 +2056,15 @@ void PDBFileParser::ParseRemarks()
{ "auth_asym_id_1", string{ chain1 } }, { "auth_asym_id_1", string{ chain1 } },
{ "auth_comp_id_1", res1 }, { "auth_comp_id_1", res1 },
{ "auth_seq_id_1", seq1 }, { "auth_seq_id_1", seq1 },
// { "PDB_ins_code_1", "" }, // { "PDB_ins_code_1", "" },
// { "label_alt_id_1", "" }, // { "label_alt_id_1", "" },
{ "site_symmetry_1", "1_555" }, { "site_symmetry_1", "1_555" },
{ "auth_atom_id_2", atom2 }, { "auth_atom_id_2", atom2 },
{ "auth_asym_id_2", string { chain2 } }, { "auth_asym_id_2", string { chain2 } },
{ "auth_comp_id_2", res2 }, { "auth_comp_id_2", res2 },
{ "auth_seq_id_2", seq2 }, { "auth_seq_id_2", seq2 },
// { "PDB_ins_code_2", "" }, // { "PDB_ins_code_2", "" },
// { "label_alt_id_2", "" }, // { "label_alt_id_2", "" },
{ "site_symmetry_2", symop }, { "site_symmetry_2", symop },
{ "dist", distance } { "dist", distance }
}); });
...@@ -2042,7 +2077,7 @@ void PDBFileParser::ParseRemarks() ...@@ -2042,7 +2077,7 @@ void PDBFileParser::ParseRemarks()
if (not headerSeen) if (not headerSeen)
{ {
if (ba::starts_with(line, "FORMAT: ") and line != "FORMAT: (10X,I3,1X,2(A3,1X,A1,I4,A1,1X,A4,3X),1X,F6.3)") if (ba::starts_with(line, "FORMAT: ") and line != "FORMAT: (10X,I3,1X,2(A3,1X,A1,I4,A1,1X,A4,3X),1X,F6.3)")
Error("Unexpected format in REMARK 500"); throw runtime_error("Unexpected format in REMARK 500");
headerSeen = line == "M RES CSSEQI ATM1 RES CSSEQI ATM2 DEVIATION"; headerSeen = line == "M RES CSSEQI ATM1 RES CSSEQI ATM2 DEVIATION";
} }
...@@ -2096,7 +2131,7 @@ void PDBFileParser::ParseRemarks() ...@@ -2096,7 +2131,7 @@ void PDBFileParser::ParseRemarks()
if (not headerSeen) if (not headerSeen)
{ {
if (ba::starts_with(line, "FORMAT: ") and line != "FORMAT: (10X,I3,1X,A3,1X,A1,I4,A1,3(1X,A4,2X),12X,F5.1)") if (ba::starts_with(line, "FORMAT: ") and line != "FORMAT: (10X,I3,1X,A3,1X,A1,I4,A1,3(1X,A4,2X),12X,F5.1)")
Error("Unexpected format in REMARK 500"); throw runtime_error("Unexpected format in REMARK 500");
headerSeen = line == "M RES CSSEQI ATM1 ATM2 ATM3"; headerSeen = line == "M RES CSSEQI ATM1 ATM2 ATM3";
} }
...@@ -2115,6 +2150,8 @@ void PDBFileParser::ParseRemarks() ...@@ -2115,6 +2150,8 @@ void PDBFileParser::ParseRemarks()
string atoms[3] = { vS(27, 30), vS(34, 37), vS(41, 44) }; string atoms[3] = { vS(27, 30), vS(34, 37), vS(41, 44) };
string deviation = vF(57, 62); string deviation = vF(57, 62);
if (deviation == "*****")
deviation.clear();
getCategory("pdbx_validate_rmsd_angle")->emplace({ getCategory("pdbx_validate_rmsd_angle")->emplace({
{ "id", to_string(++id) }, { "id", to_string(++id) },
...@@ -2144,7 +2181,7 @@ void PDBFileParser::ParseRemarks() ...@@ -2144,7 +2181,7 @@ void PDBFileParser::ParseRemarks()
if (not headerSeen) if (not headerSeen)
{ {
if (ba::starts_with(line, "FORMAT: ") and line != "FORMAT:(10X,I3,1X,A3,1X,A1,I4,A1,4X,F7.2,3X,F7.2)") if (ba::starts_with(line, "FORMAT: ") and line != "FORMAT:(10X,I3,1X,A3,1X,A1,I4,A1,4X,F7.2,3X,F7.2)")
Error("Unexpected format in REMARK 500"); throw runtime_error("Unexpected format in REMARK 500");
headerSeen = line == "M RES CSSEQI PSI PHI"; headerSeen = line == "M RES CSSEQI PSI PHI";
} }
...@@ -2347,7 +2384,7 @@ void PDBFileParser::ParseRemarks() ...@@ -2347,7 +2384,7 @@ void PDBFileParser::ParseRemarks()
if (s == "SITE") if (s == "SITE")
state = sId; state = sId;
else if (VERBOSE) else if (VERBOSE)
Error("Invalid REMARK 800 record, expected SITE"); throw runtime_error("Invalid REMARK 800 record, expected SITE");
break; break;
case sId: case sId:
...@@ -2357,7 +2394,7 @@ void PDBFileParser::ParseRemarks() ...@@ -2357,7 +2394,7 @@ void PDBFileParser::ParseRemarks()
state = sEvidence; state = sEvidence;
} }
else if (VERBOSE) else if (VERBOSE)
Error("Invalid REMARK 800 record, expected SITE_IDENTIFIER"); throw runtime_error("Invalid REMARK 800 record, expected SITE_IDENTIFIER");
break; break;
case sEvidence: case sEvidence:
...@@ -2367,7 +2404,7 @@ void PDBFileParser::ParseRemarks() ...@@ -2367,7 +2404,7 @@ void PDBFileParser::ParseRemarks()
state = sDesc; state = sDesc;
} }
else if (VERBOSE) else if (VERBOSE)
Error("Invalid REMARK 800 record, expected SITE_IDENTIFIER"); throw runtime_error("Invalid REMARK 800 record, expected SITE_IDENTIFIER");
break; break;
case sDesc: case sDesc:
...@@ -2454,6 +2491,11 @@ void PDBFileParser::ParseRemarks() ...@@ -2454,6 +2491,11 @@ void PDBFileParser::ParseRemarks()
} }
} }
} }
catch (const exception& ex)
{
throw_with_nested(runtime_error("Error parsing REMARK " + to_string(remarkNr)));
}
}
if (not (compoundDetails.empty() and sequenceDetails.empty() and sourceDetails.empty())) if (not (compoundDetails.empty() and sequenceDetails.empty() and sourceDetails.empty()))
{ {
...@@ -2566,11 +2608,24 @@ void PDBFileParser::ParseRemark200() ...@@ -2566,11 +2608,24 @@ void PDBFileParser::ParseRemark200()
{ "crystal_id", 1 } { "crystal_id", 1 }
}); });
string collectionDate;
boost::system::error_code ec;
collectionDate = pdb2cifDate(rm200("DATE OF DATA COLLECTION", diffrnNr), ec);
if (ec)
{
if (VERBOSE)
cerr << ec.message() << " for pdbx_collection_date" << endl;
// The date field can become truncated when multiple values are available
if (diffrnNr != 1)
collectionDate.clear();
}
getCategory("diffrn_detector")->emplace({ getCategory("diffrn_detector")->emplace({
{ "diffrn_id", diffrnNr }, { "diffrn_id", diffrnNr },
{ "detector", rm200("DETECTOR TYPE", diffrnNr) }, { "detector", rm200("DETECTOR TYPE", diffrnNr) },
{ "type", rm200("DETECTOR MANUFACTURER", diffrnNr) }, { "type", rm200("DETECTOR MANUFACTURER", diffrnNr) },
{ "pdbx_collection_date", pdb2cifDate(rm200("DATE OF DATA COLLECTION", diffrnNr)) }, { "pdbx_collection_date", collectionDate },
{ "details", rm200("OPTICS", diffrnNr) } { "details", rm200("OPTICS", diffrnNr) }
}); });
...@@ -2586,7 +2641,7 @@ void PDBFileParser::ParseRemark200() ...@@ -2586,7 +2641,7 @@ void PDBFileParser::ParseRemark200()
vector<string> wavelengths; vector<string> wavelengths;
string wl = rm200("WAVELENGTH OR RANGE (A)", diffrnNr); string wl = rm200("WAVELENGTH OR RANGE (A)", diffrnNr);
ba::split(wavelengths, wl, ba::is_any_of(", "), ba::token_compress_on); ba::split(wavelengths, wl, ba::is_any_of(", -"), ba::token_compress_on);
diffrnWaveLengths.insert(wavelengths.begin(), wavelengths.end()); diffrnWaveLengths.insert(wavelengths.begin(), wavelengths.end());
...@@ -2997,7 +3052,7 @@ void PDBFileParser::ParsePrimaryStructure() ...@@ -2997,7 +3052,7 @@ void PDBFileParser::ParsePrimaryStructure()
else if (mRec->is("DBREF2")) // 1 - 6 Record name "DBREF2" else if (mRec->is("DBREF2")) // 1 - 6 Record name "DBREF2"
{ // 8 - 11 IDcode idCode ID code of this datablock. { // 8 - 11 IDcode idCode ID code of this datablock.
if (vC(13) != cur.chainID) // 13 Character chainID Chain identifier. if (vC(13) != cur.chainID) // 13 Character chainID Chain identifier.
Error("Chain ID's for DBREF1/DBREF2 records do not match"); throw runtime_error("Chain ID's for DBREF1/DBREF2 records do not match");
cur.dbAccession = vS(19, 40); // 19 - 40 LString dbAccession Sequence database accession code, cur.dbAccession = vS(19, 40); // 19 - 40 LString dbAccession Sequence database accession code,
// left justified. // left justified.
cur.dbSeqBegin = vI(46, 55); // 46 - 55 Integer seqBegin Initial sequence number of the cur.dbSeqBegin = vI(46, 55); // 46 - 55 Integer seqBegin Initial sequence number of the
...@@ -3096,7 +3151,9 @@ void PDBFileParser::ParseHeterogen() ...@@ -3096,7 +3151,9 @@ void PDBFileParser::ParseHeterogen()
GetNextRecord(); GetNextRecord();
} }
while (mRec->is("HETNAM")) // 1 - 6 Record name "HETNAM" for (;;)
{
if (mRec->is("HETNAM")) // 1 - 6 Record name "HETNAM"
{ // 9 - 10 Continuation continuation Allows concatenation of multiple records. { // 9 - 10 Continuation continuation Allows concatenation of multiple records.
string hetID = vS(12, 14); // 12 - 14 LString(3) hetID Het identifier, right-justified. string hetID = vS(12, 14); // 12 - 14 LString(3) hetID Het identifier, right-justified.
string text = vS(16); // 16 - 70 String text Chemical name. string text = vS(16); // 16 - 70 String text Chemical name.
...@@ -3105,9 +3162,10 @@ void PDBFileParser::ParseHeterogen() ...@@ -3105,9 +3162,10 @@ void PDBFileParser::ParseHeterogen()
InsertChemComp(hetID); InsertChemComp(hetID);
GetNextRecord(); GetNextRecord();
continue;
} }
while (mRec->is("HETSYN")) // 1 - 6 Record name "HETSYN" if (mRec->is("HETSYN")) // 1 - 6 Record name "HETSYN"
{ // 9 - 10 Continuation continuation Allows concatenation of multiple records. { // 9 - 10 Continuation continuation Allows concatenation of multiple records.
string hetID = vS(12, 14); // 12 - 14 LString(3) hetID Het identifier, right-justified. string hetID = vS(12, 14); // 12 - 14 LString(3) hetID Het identifier, right-justified.
string syn = vS(16); // 16 - 70 SList hetSynonyms List of synonyms. string syn = vS(16); // 16 - 70 SList hetSynonyms List of synonyms.
...@@ -3115,6 +3173,10 @@ void PDBFileParser::ParseHeterogen() ...@@ -3115,6 +3173,10 @@ void PDBFileParser::ParseHeterogen()
mHetsyns[hetID] = syn; mHetsyns[hetID] = syn;
GetNextRecord(); GetNextRecord();
continue;
}
break;
} }
while (mRec->is("FORMUL")) // 1 - 6 Record name "FORMUL" while (mRec->is("FORMUL")) // 1 - 6 Record name "FORMUL"
...@@ -3145,7 +3207,7 @@ void PDBFileParser::ConstructEntities() ...@@ -3145,7 +3207,7 @@ void PDBFileParser::ConstructEntities()
{ {
if (r->is("MODEL ")) if (r->is("MODEL "))
{ {
modelNr = vI(11, 14); modelNr = r->vI(11, 14);
continue; continue;
} }
...@@ -3245,6 +3307,8 @@ void PDBFileParser::ConstructEntities() ...@@ -3245,6 +3307,8 @@ void PDBFileParser::ConstructEntities()
} }
} }
set<char> terminatedChains;
for (auto r = mData; r != nullptr; r = r->mNext) for (auto r = mData; r != nullptr; r = r->mNext)
{ {
if (r->is("ATOM ") or r->is("HETATM")) if (r->is("ATOM ") or r->is("HETATM"))
...@@ -3295,7 +3359,8 @@ void PDBFileParser::ConstructEntities() ...@@ -3295,7 +3359,8 @@ void PDBFileParser::ConstructEntities()
} }
} }
if (r->is("HETATM")) // There appears to be a program that writes out HETATM records as ATOM records....
if (r->is("HETATM") or terminatedChains.count(chainID))
{ {
if (isWater(resName)) if (isWater(resName))
mWaterHetId = resName; mWaterHetId = resName;
...@@ -3317,6 +3382,12 @@ void PDBFileParser::ConstructEntities() ...@@ -3317,6 +3382,12 @@ void PDBFileParser::ConstructEntities()
continue; continue;
} }
if (r->is("TER "))
{
char chainID = r->vC(22); // 22 Character chainID Chain identifier.
terminatedChains.insert(chainID);
}
} }
// Create missing compounds // Create missing compounds
...@@ -3371,23 +3442,28 @@ void PDBFileParser::ConstructEntities() ...@@ -3371,23 +3442,28 @@ void PDBFileParser::ConstructEntities()
int seqNr = 1; int seqNr = 1;
for (auto& res: chain.mSeqres) for (auto& res: chain.mSeqres)
{ {
mChainSeq2AsymSeq[make_tuple(chain.mDbref.chainID, res.mSeqNum, res.mIcode)] = make_tuple(asymId, seqNr, true);
string seqId = to_string(seqNr);
++seqNr;
set<string> monIds = { res.mMonId };
monIds.insert(res.mAlts.begin(), res.mAlts.end());
for (string monId: monIds)
{
string authMonId, authSeqNum; string authMonId, authSeqNum;
if (res.mSeen) if (res.mSeen)
{ {
authMonId = res.mMonId; authMonId = monId;
authSeqNum = to_string(res.mSeqNum); authSeqNum = to_string(res.mSeqNum);
} }
mChainSeq2AsymSeq[make_tuple(chain.mDbref.chainID, res.mSeqNum, res.mIcode)] = make_tuple(asymId, seqNr, true);
string seqId = to_string(seqNr);
++seqNr;
cat->emplace({ cat->emplace({
{ "asym_id", asymId }, { "asym_id", asymId },
{ "entity_id", mMolID2EntityID[chain.mMolId] }, { "entity_id", mMolID2EntityID[chain.mMolId] },
{ "seq_id", seqId }, { "seq_id", seqId },
{ "mon_id", res.mMonId }, { "mon_id", monId },
{ "ndb_seq_num", seqId }, { "ndb_seq_num", seqId },
{ "pdb_seq_num", res.mSeqNum }, { "pdb_seq_num", res.mSeqNum },
{ "auth_seq_num", authSeqNum }, { "auth_seq_num", authSeqNum },
...@@ -3399,6 +3475,7 @@ void PDBFileParser::ConstructEntities() ...@@ -3399,6 +3475,7 @@ void PDBFileParser::ConstructEntities()
}); });
} }
} }
}
// We have now created all compounds, write them out // We have now created all compounds, write them out
uint32 structRefId = 0, structRefSeqAlignId = 0; uint32 structRefId = 0, structRefSeqAlignId = 0;
...@@ -3431,9 +3508,12 @@ void PDBFileParser::ConstructEntities() ...@@ -3431,9 +3508,12 @@ void PDBFileParser::ConstructEntities()
{ "gene_src_common_name", cmp.mSource["ORGANISM_COMMON"] }, { "gene_src_common_name", cmp.mSource["ORGANISM_COMMON"] },
{ "pdbx_gene_src_gene", cmp.mSource["GENE"] }, { "pdbx_gene_src_gene", cmp.mSource["GENE"] },
{ "gene_src_strain", cmp.mSource["STRAIN"] }, { "gene_src_strain", cmp.mSource["STRAIN"] },
{ "gene_src_tissue", cmp.mSource["TISSUE"] },
{ "gene_src_tissue_fraction", cmp.mSource["TISSUE_FRACTION"] },
{ "pdbx_gene_src_cell_line", cmp.mSource["CELL_LINE"] }, { "pdbx_gene_src_cell_line", cmp.mSource["CELL_LINE"] },
{ "pdbx_gene_src_organelle", cmp.mSource["ORGANELLE"] }, { "pdbx_gene_src_organelle", cmp.mSource["ORGANELLE"] },
{ "pdbx_gene_src_cellular_location", cmp.mSource["CELLULAR_LOCATION"] }, { "pdbx_gene_src_cellular_location", cmp.mSource["CELLULAR_LOCATION"] },
{ "host_org_common_name", cmp.mSource["EXPRESSION_SYSTEM_COMMON"] },
{ "pdbx_gene_src_scientific_name", cmp.mSource["ORGANISM_SCIENTIFIC"] }, { "pdbx_gene_src_scientific_name", cmp.mSource["ORGANISM_SCIENTIFIC"] },
{ "pdbx_gene_src_ncbi_taxonomy_id", cmp.mSource["ORGANISM_TAXID"] }, { "pdbx_gene_src_ncbi_taxonomy_id", cmp.mSource["ORGANISM_TAXID"] },
{ "pdbx_host_org_scientific_name", cmp.mSource["EXPRESSION_SYSTEM"] }, { "pdbx_host_org_scientific_name", cmp.mSource["EXPRESSION_SYSTEM"] },
...@@ -4613,9 +4693,21 @@ void PDBFileParser::ParseCrystallographic() ...@@ -4613,9 +4693,21 @@ void PDBFileParser::ParseCrystallographic()
{ "Z_PDB", vF(67, 70) } // 67 - 70 Integer z Z value. { "Z_PDB", vF(67, 70) } // 67 - 70 Integer z Z value.
}); });
string spageGroup, intTablesNr;
try
{
spageGroup = vS(56, 66);
clipper::Spacegroup sg(clipper::Spgr_descr{spageGroup});
intTablesNr = to_string(sg.spacegroup_number());
}
catch (...)
{
}
getCategory("symmetry")->emplace({ getCategory("symmetry")->emplace({
{ "entry_id", mStructureId }, { "entry_id", mStructureId },
{ "space_group_name_H-M", vS(56, 66) } { "space_group_name_H-M", spageGroup },
{ "Int_Tables_number", intTablesNr }
}); });
GetNextRecord(); GetNextRecord();
...@@ -4901,7 +4993,7 @@ void PDBFileParser::ParseCoordinate(int modelNr) ...@@ -4901,7 +4993,7 @@ void PDBFileParser::ParseCoordinate(int modelNr)
int u23 = vI(64, 70); // 64 - 70 Integer u[1][2] U(2,3) int u23 = vI(64, 70); // 64 - 70 Integer u[1][2] U(2,3)
if (vS(7, 11) + vS(77, 80) != check) if (vS(7, 11) + vS(77, 80) != check)
Error("ANISOU record should follow corresponding ATOM record"); throw runtime_error("ANISOU record should follow corresponding ATOM record");
auto f = [](float f) -> string { return (boost::format("%6.4f") % f).str(); }; auto f = [](float f) -> string { return (boost::format("%6.4f") % f).str(); };
...@@ -4912,7 +5004,7 @@ void PDBFileParser::ParseCoordinate(int modelNr) ...@@ -4912,7 +5004,7 @@ void PDBFileParser::ParseCoordinate(int modelNr)
{ "pdbx_label_alt_id", altLoc != ' ' ? string { altLoc } : "." }, { "pdbx_label_alt_id", altLoc != ' ' ? string { altLoc } : "." },
{ "pdbx_label_comp_id", resName }, { "pdbx_label_comp_id", resName },
{ "pdbx_label_asym_id", asymId }, { "pdbx_label_asym_id", asymId },
{ "pdbx_label_seq_id", seqId }, { "pdbx_label_seq_id", (isResseq and seqId > 0) ? to_string(seqId) : "." },
{ "U[1][1]", f(u11 / 10000.f) }, { "U[1][1]", f(u11 / 10000.f) },
{ "U[2][2]", f(u22 / 10000.f) }, { "U[2][2]", f(u22 / 10000.f) },
{ "U[3][3]", f(u33 / 10000.f) }, { "U[3][3]", f(u33 / 10000.f) },
...@@ -5028,12 +5120,9 @@ void PDBFileParser::Parse(istream& is, cif::File& result) ...@@ -5028,12 +5120,9 @@ void PDBFileParser::Parse(istream& is, cif::File& result)
} }
catch (const exception& ex) catch (const exception& ex)
{ {
cerr << "Error parsing REMARK 3: " << endl throw_with_nested(runtime_error("Error parsing REMARK 3"));
<< ex.what() << endl;
} }
// //
//
//
// auto cat = getCategory("pdbx_refine_tls_group"); // auto cat = getCategory("pdbx_refine_tls_group");
// for (Row r: *cat) // for (Row r: *cat)
// { // {
...@@ -5062,7 +5151,10 @@ void PDBFileParser::Parse(istream& is, cif::File& result) ...@@ -5062,7 +5151,10 @@ void PDBFileParser::Parse(istream& is, cif::File& result)
} }
catch (const exception& ex) catch (const exception& ex)
{ {
Error(ex.what()); if (mRec != nullptr)
throw_with_nested(runtime_error("Error parsing PDB at line " + to_string(mRec->mLineNr)));
else
throw;
} }
} }
...@@ -5175,6 +5267,8 @@ void PDBFileParser::PDBChain::AlignResToSeqRes() ...@@ -5175,6 +5267,8 @@ void PDBFileParser::PDBChain::AlignResToSeqRes()
x = highX; x = highX;
y = highY; y = highY;
try
{
while (x >= 0 and y >= 0) while (x >= 0 and y >= 0)
{ {
switch (tb(x, y)) switch (tb(x, y))
...@@ -5182,6 +5276,7 @@ void PDBFileParser::PDBChain::AlignResToSeqRes() ...@@ -5182,6 +5276,7 @@ void PDBFileParser::PDBChain::AlignResToSeqRes()
case -1: case -1:
throw runtime_error("A residue found in the ATOM records (" + ry[y].mMonId + throw runtime_error("A residue found in the ATOM records (" + ry[y].mMonId +
" @ " + string{mDbref.chainID} + ":" + to_string(ry[y].mSeqNum) + " @ " + string{mDbref.chainID} + ":" + to_string(ry[y].mSeqNum) +
((ry[y].mIcode == ' ' or ry[y].mIcode == 0) ? "" : string{ ry[y].mIcode })+
") was not found in the SEQRES records"); ") was not found in the SEQRES records");
--y; --y;
break; break;
...@@ -5206,6 +5301,42 @@ void PDBFileParser::PDBChain::AlignResToSeqRes() ...@@ -5206,6 +5301,42 @@ void PDBFileParser::PDBChain::AlignResToSeqRes()
--y; --y;
} }
} }
}
catch (const exception& ex)
{
if (VERBOSE)
{
std::vector<pair<string,string>> alignment;
for (x = highX, y = highY; x >= 0 and y >= 0; )
{
switch (tb(x, y))
{
case -1:
alignment.push_back(make_pair("...", ry[y].mMonId));
--y;
break;
case 1:
alignment.push_back(make_pair(rx[x].mMonId, "..."));
--x;
break;
case 0:
alignment.push_back(make_pair(rx[x].mMonId, ry[y].mMonId));
--x;
--y;
break;
}
}
reverse(alignment.begin(), alignment.end());
for (auto a: alignment)
cerr << " " << a.first << " -- " << a.second << endl;
}
throw;
}
// assign numbers to the residues that don't have them yet // assign numbers to the residues that don't have them yet
stack<int> unnumbered; stack<int> unnumbered;
...@@ -5248,15 +5379,7 @@ void ReadPDBFile(istream& pdbFile, cif::File& cifFile) ...@@ -5248,15 +5379,7 @@ void ReadPDBFile(istream& pdbFile, cif::File& cifFile)
cifFile.loadDictionary("mmcif_pdbx"); cifFile.loadDictionary("mmcif_pdbx");
try
{
p.Parse(pdbFile, cifFile); p.Parse(pdbFile, cifFile);
}
catch (const exception& ex)
{
cerr << "Error parsing PDB file" << endl;
throw;
}
cifFile.validate(); cifFile.validate();
} }
...@@ -978,7 +978,10 @@ float Remark3Parser::parse() ...@@ -978,7 +978,10 @@ float Remark3Parser::parse()
} }
if (not remarks.empty() and not iequals(remarks, "NULL")) if (not remarks.empty() and not iequals(remarks, "NULL"))
{
if (not mDb["refine"].empty())
mDb["refine"].front()["details"] = remarks; mDb["refine"].front()["details"] = remarks;
}
float score = float(lineCount - dropped) / lineCount; float score = float(lineCount - dropped) / lineCount;
......
/*
Created by: Maarten L. Hekkelman
Date: dinsdag 07 november, 2017
Copyright 2017 NKI AVL
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "cif++/Config.h"
#include <termios.h>
#include <sys/ioctl.h>
#include <iostream>
#include <iomanip>
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/filesystem/path.hpp>
#include "cif++/CifUtils.h"
#include "cif++/Structure.h"
#include "cif++/TlsParser.h"
using namespace std;
namespace po = boost::program_options;
namespace ba = boost::algorithm;
namespace fs = boost::filesystem;
namespace c = libcif;
namespace cif
{
// ANSI escape sequences used to highlight the "Empty selection" diagnostic
// printed by DumpSelection: kRedOn selects bold white text on a red
// background, kRedOff resets all attributes again.
static const char* kRedOn = "\033[37;1;41m";
static const char* kRedOff = "\033[0m";
// Sentinel values for residue numbers:
// kResidueNrWildcard marks an open bound of a TLSSelectionRangeSeq,
// kNoSeqNum is stored as seqID for residues taken from pdbx_nonpoly_scheme.
const int
kResidueNrWildcard = numeric_limits<int>::min(),
kNoSeqNum = numeric_limits<int>::max() - 1;
using namespace std;
// --------------------------------------------------------------------
// We parse selection statements and create a selection expression tree
// which is then interpreted by setting the selected flag for the
// residues. After that, the selected ranges are collected and printed.
// One residue as seen by the selection expressions. It carries both the
// identifiers used to evaluate a selection (chainID, seqNr, iCode, name)
// and the mmCIF asym/seq identifiers used when reporting ranges, plus the
// 'selected' flag that the selection tree toggles.
struct TLSResidue
{
	string	chainID;	// chain identifier the selection is matched against
	int		seqNr;		// residue number the selection is matched against
	char	iCode;		// insertion code, 0 when absent
	string	name;		// residue (monomer) name
	bool	selected;	// result flag, set by the CollectResidues implementations

	string	asymID;		// mmCIF asym id
	int		seqID;		// mmCIF seq id, kNoSeqNum for non-polymer residues

	// Equality deliberately looks at the selection related fields only,
	// asymID and seqID take no part in it. Names compare case insensitive.
	bool operator==(const TLSResidue& rhs) const
	{
		if (chainID != rhs.chainID or seqNr != rhs.seqNr or iCode != rhs.iCode)
			return false;

		return iequals(name, rhs.name) and selected == rhs.selected;
	}
};
void DumpSelection(const vector<TLSResidue>& selected, int indentLevel)
{
string indent(indentLevel * 2, ' ');
auto i = selected.begin();
bool first = true;
// First print in PDB space
while (i != selected.end())
{
auto b = find_if(i, selected.end(), [](auto s) -> bool { return s.selected; });
if (b == selected.end())
break;
if (first)
cout << indent << "PDB:" << endl;
first = false;
auto e = find_if(b, selected.end(), [b](auto s) -> bool { return s.chainID != b->chainID or not s.selected; });
cout << indent << " >> " << b->chainID << ' ' << b->seqNr << ':' << (e - 1)->seqNr << endl;
i = e;
}
// Then in mmCIF space
if (not first)
cout << indent << "mmCIF:" << endl;
i = selected.begin();
while (i != selected.end())
{
auto b = find_if(i, selected.end(), [](auto s) -> bool { return s.selected; });
if (b == selected.end())
break;
auto e = find_if(b, selected.end(), [b](auto s) -> bool { return s.asymID != b->asymID or not s.selected; });
string asymID = b->asymID;
int from = b->seqID, to = from;
for (auto j = b + 1; j != e; ++j)
{
if (j->seqID == to + 1)
to = j->seqID;
else if (j->seqID != to) // probably an insertion code
{
if (from == kNoSeqNum or to == kNoSeqNum)
cout << indent << " >> " << asymID << endl;
else
cout << indent << " >> " << asymID << ' ' << from << ':' << to << endl;
asymID = b->asymID;
from = to = b->seqID;
}
}
if (from == kNoSeqNum or to == kNoSeqNum)
cout << indent << " >> " << asymID << endl;
else
cout << indent << " >> " << asymID << ' ' << from << ':' << to << endl;
i = e;
}
if (first)
{
if (isatty(STDOUT_FILENO))
cout << indent << kRedOn << "Empty selection" << kRedOff << endl;
else
cout << indent << kRedOn << "Empty selection" << kRedOff << endl;
}
}
// Evaluate this selection against the residues found in db and return the
// selected residues as a list of (asym id, first seq id, last seq id)
// ranges with strictly increasing sequence numbers.
//
// db            datablock providing pdbx_poly_seq_scheme and pdbx_nonpoly_scheme
// pdbNamespace  when true the selection is matched against the PDB
//               identifiers (pdb_strand_id, pdb_seq_num, pdb_ins_code),
//               otherwise against the mmCIF ones (asym_id, seq_id)
//
// Waters (HOH/H2O) are never part of the result. Throws runtime_error when
// an insertion code is longer than one character; stoi may throw when a
// sequence number is not numeric.
vector<tuple<string,int,int>> TLSSelection::GetRanges(Datablock& db, bool pdbNamespace) const
{
	vector<TLSResidue> selected;

	// Collect the residues from poly seq scheme...
	for (auto r: db["pdbx_poly_seq_scheme"])
	{
		string chain, seqNr, iCode, name;
		string asymID;
		int seqID = 0;

		if (pdbNamespace)
			cif::tie(chain, seqNr, iCode, name, asymID, seqID) = r.get("pdb_strand_id", "pdb_seq_num", "pdb_ins_code", "pdb_comp_id", "asym_id", "seq_id");
		else
			cif::tie(chain, seqNr, name) = r.get("asym_id", "seq_id", "mon_id");

		// skip residues without a number *before* trying to convert it,
		// stoi would throw on an empty string
		if (seqNr.empty())
			continue;

		if (not pdbNamespace)
		{
			asymID = chain;
			seqID = stoi(seqNr);
		}

		if (iCode.length() > 1)
			throw runtime_error("invalid iCode");

		// iCode[0] is the terminating 0 for an empty insertion code
		selected.push_back({chain, stoi(seqNr), iCode[0], name, false, asymID, seqID});
	}

	// ... those from the nonpoly scheme
	for (auto r: db["pdbx_nonpoly_scheme"])
	{
		string chain, seqNr, iCode, name, asymID;

		if (pdbNamespace)
		{
			cif::tie(chain, seqNr, iCode, name, asymID) = r.get("pdb_strand_id", "pdb_seq_num", "pdb_ins_code", "pdb_mon_id", "asym_id");
			if (seqNr.empty())
				continue;
		}
		else
		{
			cif::tie(chain, name) = r.get("asym_id", "mon_id");
			asymID = chain;
			seqNr = "0";
		}

		if (iequals(name, "HOH") or iequals(name, "H2O"))
			continue;

		if (iCode.length() > 1)
			throw runtime_error("invalid iCode");

		selected.push_back({chain, stoi(seqNr), iCode[0], name, false, asymID, kNoSeqNum});
	}

	// selected might consist of multiple ranges
	// output per chain
	stable_sort(selected.begin(), selected.end(), [](const TLSResidue& a, const TLSResidue& b) -> bool
	{
		const int d = a.chainID.compare(b.chainID);

		// compare the numbers directly, a subtraction could overflow
		return d != 0 ? d < 0 : a.seqNr < b.seqNr;
	});

	CollectResidues(db, selected);

	vector<tuple<string,int,int>> result;

	auto i = selected.begin();
	while (i != selected.end())
	{
		auto b = find_if(i, selected.end(), [](const TLSResidue& r) -> bool { return r.selected; });
		if (b == selected.end())
			break;

		auto e = find_if(b, selected.end(),
			[b](const TLSResidue& r) -> bool { return r.asymID != b->asymID or not r.selected; });

		// return ranges with strict increasing sequence numbers.
		// So when there's a gap in the sequence we split the range.
		// Beware of iCodes though
		result.push_back(make_tuple(b->asymID, b->seqID, b->seqID));
		for (auto j = b + 1; j != e; ++j)
		{
			if (j->seqID == get<2>(result.back()) + 1)
				get<2>(result.back()) = j->seqID;
			else if (j->seqID != get<2>(result.back()))	// probably an insertion code
				result.push_back(make_tuple(b->asymID, j->seqID, j->seqID));
		}

		i = e;
	}

	return result;
}
struct TLSSelectionNot : public TLSSelection
{
TLSSelectionNot(TLSSelectionPtr selection)
: selection(selection.release()) {}
virtual void CollectResidues(Datablock& db, vector<TLSResidue>& residues, int indentLevel) const
{
selection->CollectResidues(db, residues, indentLevel + 1);
for (auto& r: residues)
r.selected = not r.selected;
if (VERBOSE)
{
cout << string(indentLevel * 2, ' ') << "NOT" << endl;
DumpSelection(residues, indentLevel);
}
}
TLSSelectionPtr selection;
};
struct TLSSelectionAll : public TLSSelection
{
TLSSelectionAll() {}
virtual void CollectResidues(Datablock& db, vector<TLSResidue>& residues, int indentLevel) const
{
for (auto& r: residues)
r.selected = true;
if (VERBOSE)
{
cout << string(indentLevel * 2, ' ') << "ALL" << endl;
DumpSelection(residues, indentLevel);
}
}
};
// Leaf node that selects all residues of one chain, or of every chain
// when the chain id is the wildcard "*".
struct TLSSelectionChain : public TLSSelectionAll
{
	TLSSelectionChain(const string& chainID)
		: m_chain(chainID) {}

	virtual void CollectResidues(Datablock& db, vector<TLSResidue>& residues, int indentLevel) const
	{
		if (m_chain == "*")
		{
			for (auto& res: residues)
				res.selected = true;
		}
		else
		{
			for (auto& res: residues)
				res.selected = res.chainID == m_chain;
		}

		if (not VERBOSE)
			return;

		cout << string(indentLevel * 2, ' ') << "CHAIN " << m_chain << endl;
		DumpSelection(residues, indentLevel);
	}

	string m_chain;	// chain id to match, "*" matches all
};
struct TLSSelectionResID : public TLSSelectionAll
{
TLSSelectionResID(int seqNr, char iCode)
: m_seq_nr(seqNr), m_icode(iCode) {}
virtual void CollectResidues(Datablock& db, vector<TLSResidue>& residues, int indentLevel) const
{
for (auto& r: residues)
r.selected = r.seqNr == m_seq_nr and r.iCode == m_icode;
if (VERBOSE)
{
cout << string(indentLevel * 2, ' ') << "ResID " << m_seq_nr << (m_icode ? string { m_icode} : "") << endl;
DumpSelection(residues, indentLevel);
}
}
int m_seq_nr;
char m_icode;
};
// Leaf node that selects residues by residue number: everything with
// m_first <= seqNr <= m_last, in any chain. Either bound may be
// kResidueNrWildcard, which leaves that side of the range open.
struct TLSSelectionRangeSeq : public TLSSelectionAll
{
	TLSSelectionRangeSeq(int first, int last)
		: m_first(first), m_last(last) {}

	virtual void CollectResidues(Datablock& db, vector<TLSResidue>& residues, int indentLevel) const
	{
		const bool openLow = m_first == kResidueNrWildcard;
		const bool openHigh = m_last == kResidueNrWildcard;

		for (auto& res: residues)
		{
			const bool aboveLow = res.seqNr >= m_first or openLow;
			const bool belowHigh = res.seqNr <= m_last or openHigh;

			res.selected = aboveLow and belowHigh;
		}

		if (not VERBOSE)
			return;

		cout << string(indentLevel * 2, ' ') << "Range " << m_first << ':' << m_last << endl;
		DumpSelection(residues, indentLevel);
	}

	int m_first, m_last;	// inclusive bounds, kResidueNrWildcard for 'open'
};
struct TLSSelectionRangeID : public TLSSelectionAll
{
TLSSelectionRangeID(int first, int last, char icodeFirst = 0, char icodeLast = 0)
: m_first(first), m_last(last), m_icode_first(icodeFirst), m_icode_last(icodeLast) {}
virtual void CollectResidues(Datablock& db, vector<TLSResidue>& residues, int indentLevel) const
{
// need to do this per chain
set<string> chains;
for (auto& r: residues)
chains.insert(r.chainID);
for (string chain: chains)
{
auto f = find_if(residues.begin(), residues.end(),
[=](auto r) -> bool {
return r.chainID == chain and r.seqNr == m_first and r.iCode == m_icode_first;
});
auto l = find_if(residues.begin(), residues.end(),
[=](auto r) -> bool {
return r.chainID == chain and r.seqNr == m_last and r.iCode == m_icode_last;
});
if (f != residues.end() and l != residues.end() and f <= l)
{
++l;
for (; f != l; ++f)
f->selected = true;
}
}
if (VERBOSE)
{
cout << string(indentLevel * 2, ' ') << "Through " << m_first << ':' << m_last << endl;
DumpSelection(residues, indentLevel);
}
}
int m_first, m_last;
char m_icode_first, m_icode_last;
};
struct TLSSelectionUnion : public TLSSelection
{
TLSSelectionUnion(TLSSelectionPtr& lhs, TLSSelectionPtr& rhs)
: lhs(lhs.release()), rhs(rhs.release()) {}
TLSSelectionUnion(TLSSelectionPtr& lhs, TLSSelectionPtr&& rhs)
: lhs(lhs.release()), rhs(rhs.release()) {}
virtual void CollectResidues(Datablock& db, vector<TLSResidue>& residues, int indentLevel) const
{
auto a = residues;
for_each(a.begin(), a.end(), [](auto& r) { r.selected = false; });
auto b = residues;
for_each(b.begin(), b.end(), [](auto& r) { r.selected = false; });
lhs->CollectResidues(db, a, indentLevel + 1);
rhs->CollectResidues(db, b, indentLevel + 1);
for (auto ai = a.begin(), bi = b.begin(), ri = residues.begin(); ri != residues.end(); ++ai, ++bi, ++ri)
ri->selected = ai->selected or bi->selected;
if (VERBOSE)
{
cout << string(indentLevel * 2, ' ') << "Union" << endl;
DumpSelection(residues, indentLevel);
}
}
TLSSelectionPtr lhs;
TLSSelectionPtr rhs;
};
struct TLSSelectionIntersection : public TLSSelection
{
TLSSelectionIntersection(TLSSelectionPtr& lhs, TLSSelectionPtr& rhs)
: lhs(lhs.release()), rhs(rhs.release()) {}
TLSSelectionIntersection(TLSSelectionPtr& lhs, TLSSelectionPtr&& rhs)
: lhs(lhs.release()), rhs(rhs.release()) {}
virtual void CollectResidues(Datablock& db, vector<TLSResidue>& residues, int indentLevel) const
{
auto a = residues;
for_each(a.begin(), a.end(), [](auto& r) { r.selected = false; });
auto b = residues;
for_each(b.begin(), b.end(), [](auto& r) { r.selected = false; });
lhs->CollectResidues(db, a, indentLevel + 1);
rhs->CollectResidues(db, b, indentLevel + 1);
for (auto ai = a.begin(), bi = b.begin(), ri = residues.begin(); ri != residues.end(); ++ai, ++bi, ++ri)
ri->selected = ai->selected and bi->selected;
if (VERBOSE)
{
cout << string(indentLevel * 2, ' ') << "Intersection" << endl;
DumpSelection(residues, indentLevel);
}
}
TLSSelectionPtr lhs;
TLSSelectionPtr rhs;
};
struct TLSSelectionByName : public TLSSelectionAll
{
public:
TLSSelectionByName(const string& resname)
: m_name(resname) {}
virtual void CollectResidues(Datablock& db, vector<TLSResidue>& residues, int indentLevel) const
{
for (auto& r: residues)
r.selected = r.name == m_name;
if (VERBOSE)
{
cout << string(indentLevel * 2, ' ') << "Name " << m_name << endl;
DumpSelection(residues, indentLevel);
}
}
string m_name;
};
struct TLSSelectionByElement : public TLSSelectionAll
{
public:
TLSSelectionByElement(const string& element)
: m_element(element) {}
virtual void CollectResidues(Datablock& db, vector<TLSResidue>& residues, int indentLevel) const
{
// rationale... We want to select residues only. So we select
// residues that have just a single atom of type m_element.
// And we assume these have as residue name... m_element.
// ... Right?
for (auto& r: residues)
r.selected = iequals(r.name, m_element);
if (VERBOSE)
{
cout << string(indentLevel * 2, ' ') << "Element " << m_element << endl;
DumpSelection(residues, indentLevel);
}
}
string m_element;
};
// --------------------------------------------------------------------
// Common base of the program specific selection parsers. It keeps a copy
// of the selection text, the read position in that copy and a one-token
// lookahead; derived classes supply the lexer (GetNextToken), the token
// names (ToString) and the grammar (Parse).
class TLSSelectionParserImpl
{
  public:
	TLSSelectionParserImpl(const string& selection)
		: m_selection(selection), m_p(m_selection.begin()), m_end(m_selection.end()) {}

	// polymorphic base: make destruction through a base pointer well defined
	virtual ~TLSSelectionParserImpl() = default;

	// Parse the selection text and return the resulting expression tree
	virtual TLSSelectionPtr Parse() = 0;

  protected:

	// Return the next token from the input, values below 256 are plain characters
	virtual int GetNextToken() = 0;

	// Consume the lookahead token if it equals token, throw runtime_error otherwise
	virtual void Match(int token);

	// Human readable name for a token value of 256 or higher
	virtual string ToString(int token) = 0;

	// NOTE: m_selection must stay declared before the iterators into it,
	// the constructor relies on that initialisation order.
	string				m_selection;
	string::iterator	m_p, m_end;

	// Filled by the derived class constructors, zero until then so it
	// never holds an indeterminate value.
	int					m_lookahead = 0;

	// Text of the current token as recorded by the lexer, used in error messages
	string				m_token;
};
// Consume the lookahead token when it is the expected one and fetch the
// next token. Otherwise throw a runtime_error naming both the expected
// and the found token.
void TLSSelectionParserImpl::Match(int token)
{
	if (m_lookahead != token)
	{
		// token values below 256 are literal characters, the others have a name
		const string expected = token >= 256 ? ToString(token) : string(1, char(token));

		const string found = m_lookahead >= 256
			? ToString(m_lookahead) + " (" + m_token + ')'
			: string(1, char(m_lookahead));

		throw runtime_error("Expected " + expected + " but found " + found);
	}

	m_lookahead = GetNextToken();
}
// --------------------------------------------------------------------
// Recursive descent parser for Phenix style selections, with the grammar
//
//   selection      := atom-selection [ ')' ] end-of-line
//   atom-selection := term { OR term }
//   term           := factor { AND factor }
//   factor         := '(' atom-selection ')' | NOT ... | CHAIN ... | RESNAME ...
//                   | ELEMENT ... | RESSEQ ... | RESID ... | ALL
class TLSSelectionParserImplPhenix : public TLSSelectionParserImpl
{
  public:
	TLSSelectionParserImplPhenix(const string& selection)
		: TLSSelectionParserImpl(selection)
	{
		// load the first token so the parse functions can inspect it
		m_lookahead = GetNextToken();
	}

	virtual TLSSelectionPtr Parse();

  private:

	TLSSelectionPtr ParseAtomSelection();
	TLSSelectionPtr ParseTerm();
	TLSSelectionPtr ParseFactor();

	// Token values; everything below 256 is a plain character
	enum TOKEN {
		pt_NONE		= 0,
		pt_IDENT	= 256,
		pt_STRING,
		pt_NUMBER,
		pt_RESID,
		pt_EOLN,

		pt_KW_ALL,
		pt_KW_CHAIN,
		pt_KW_RESSEQ,
		pt_KW_RESID,
		pt_KW_ICODE,
		pt_KW_RESNAME,
		pt_KW_ELEMENT,
		pt_KW_AND,
		pt_KW_OR,
		pt_KW_NOT,
		pt_KW_PDB,
		pt_KW_ENTRY,
		pt_KW_THROUGH
	};

	virtual int GetNextToken();
	virtual string ToString(int token);

	// Values belonging to the current lookahead token, set by GetNextToken
	int		m_value_i;	// numeric value of a number or resid token
	string	m_value_s;	// text of an identifier or string token
	char	m_icode;	// insertion code of a resid token, 0 otherwise
};
// Lexer for the Phenix selection syntax.
//
// The token is recognised by trying a series of small state machines in
// a fixed order: RESID (number followed by one insertion code letter),
// NUM, IDENT, single quoted string, double quoted string and finally a
// single character. Whenever a machine fails, restart() rewinds the input
// to the start of the token and moves on to the next machine.
//
// On return m_value_i, m_value_s and m_icode hold the values of the token
// and m_token its text (used for error messages). Identifiers that spell a
// keyword (case insensitive) are returned as the keyword token.
//
// End of input is presented to the state machines as a character with
// value 0. The read position never moves beyond m_end, so calling this
// function again after pt_EOLN was returned simply yields pt_EOLN again.
//
// Throws runtime_error for an unterminated quoted string.
int TLSSelectionParserImplPhenix::GetNextToken()
{
	int result = pt_NONE;

	enum STATE {
		st_START,
		st_RESID	= 200,
		st_NUM		= 300,
		st_IDENT	= 400,
		st_QUOTED	= 500,
		st_DQUOTED	= 550,
		st_OTHER	= 600
	};

	int state = st_START;

	m_value_i = 0;
	m_icode = 0;
	m_value_s.clear();

	auto s = m_p;			// start of the token, restart() rewinds to here
	auto start = state;		// the state machine currently being tried

	m_token.clear();

	// true while the current character is the synthetic end-of-input 0,
	// in which case nothing was read and nothing may be un-read
	bool atEnd = false;

	// give up on the current state machine and try the next one
	auto restart = [&]()
	{
		switch (start)
		{
			case st_START:	state = start = st_RESID;	break;
			case st_RESID:	state = start = st_NUM;		break;
			case st_NUM:	state = start = st_IDENT;	break;
			case st_IDENT:	state = start = st_QUOTED;	break;
			case st_QUOTED:	state = start = st_DQUOTED;	break;
			case st_DQUOTED:state = start = st_OTHER;	break;
		}
		m_token.clear();
		m_p = s;
	};

	// push the character just read back into the input
	auto retract = [&]()
	{
		if (atEnd)
			return;

		--m_p;
		m_token.pop_back();
	};

	while (result == pt_NONE)
	{
		atEnd = m_p >= m_end;

		char ch = 0;
		if (not atEnd)
		{
			ch = *m_p++;
			m_token += ch;
		}

		// the <cctype> functions require a value representable as unsigned char
		const unsigned char uch = static_cast<unsigned char>(ch);

		switch (state)
		{
			// start block
			case st_START:
				if (ch == 0)
					result = pt_EOLN;
				else if (isspace(uch))
				{
					// skip leading white space, the token starts after it
					m_token.clear();
					++s;
				}
				else
					restart();
				break;

			// RESID block
			case st_RESID:
				if (ch == '-')
					state = st_RESID + 1;
				else if (isdigit(uch))
				{
					m_value_i = (ch - '0');
					state = st_RESID + 2;
				}
				else
					restart();
				break;

			case st_RESID + 1:
				if (isdigit(uch))
				{
					m_value_i = -(ch - '0');
					state = st_RESID + 2;
				}
				else
					restart();
				break;

			case st_RESID + 2:
				if (isdigit(uch))
					m_value_i = 10 * m_value_i + (m_value_i < 0 ? -1 : 1) * (ch - '0');
				else if (isalpha(uch))
				{
					m_icode = ch;
					state = st_RESID + 3;
				}
				else
					restart();
				break;

			case st_RESID + 3:
				// only a single letter is accepted as insertion code
				if (isalnum(uch))
					restart();
				else
				{
					retract();
					result = pt_RESID;
				}
				break;

			// NUM block
			case st_NUM:
				if (ch == '-')
					state = st_NUM + 1;
				else if (isdigit(uch))
				{
					m_value_i = ch - '0';
					state = st_NUM + 2;
				}
				else
					restart();
				break;

			case st_NUM + 1:
				if (isdigit(uch))
				{
					m_value_i = -(ch - '0');
					state = st_NUM + 2;
				}
				else
					restart();
				break;

			case st_NUM + 2:
				if (isdigit(uch))
					m_value_i = 10 * m_value_i + (m_value_i < 0 ? -1 : 1) * (ch - '0');
				else if (not isalpha(uch))
				{
					result = pt_NUMBER;
					retract();
				}
				else
					restart();
				break;

			// IDENT block
			case st_IDENT:
				if (isalnum(uch))
				{
					m_value_s = { ch };
					state = st_IDENT + 1;
				}
				else
					restart();
				break;

			case st_IDENT + 1:
				if (isalnum(uch) or ch == '\'')
					m_value_s += ch;
				else
				{
					// un-read the delimiter, and keep it out of m_token as well
					retract();
					result = pt_IDENT;
				}
				break;

			// QUOTED block
			case st_QUOTED:
				if (ch == '\'')
				{
					m_value_s.clear();
					state = st_QUOTED + 1;
				}
				else
					restart();
				break;

			case st_QUOTED + 1:
				if (ch == '\'')
					result = pt_STRING;
				else if (ch == 0)
					throw runtime_error("Unexpected end of selection, missing quote character?");
				else
					m_value_s += ch;
				break;

			// DQUOTED block
			case st_DQUOTED:
				if (ch == '\"')
				{
					m_value_s.clear();
					state = st_DQUOTED + 1;
				}
				else
					restart();
				break;

			case st_DQUOTED + 1:
				if (ch == '\"')
					result = pt_STRING;
				else if (ch == 0)
					throw runtime_error("Unexpected end of selection, missing quote character?");
				else
					m_value_s += ch;
				break;

			// OTHER block, any other character is a token by itself
			case st_OTHER:
				result = ch;
				break;
		}
	}

	// identifiers might be keywords
	if (result == pt_IDENT)
	{
		if (iequals(m_value_s, "CHAIN"))
			result = pt_KW_CHAIN;
		else if (iequals(m_value_s, "ALL"))
			result = pt_KW_ALL;
		else if (iequals(m_value_s, "AND"))
			result = pt_KW_AND;
		else if (iequals(m_value_s, "OR"))
			result = pt_KW_OR;
		else if (iequals(m_value_s, "NOT"))
			result = pt_KW_NOT;
		else if (iequals(m_value_s, "RESSEQ"))
			result = pt_KW_RESSEQ;
		else if (iequals(m_value_s, "RESID") or iequals(m_value_s, "RESI"))
			result = pt_KW_RESID;
		else if (iequals(m_value_s, "RESNAME"))
			result = pt_KW_RESNAME;
		else if (iequals(m_value_s, "ELEMENT"))
			result = pt_KW_ELEMENT;
		else if (iequals(m_value_s, "PDB"))
			result = pt_KW_PDB;
		else if (iequals(m_value_s, "ENTRY"))
			result = pt_KW_ENTRY;
		else if (iequals(m_value_s, "THROUGH"))
			result = pt_KW_THROUGH;
	}

	return result;
}
string TLSSelectionParserImplPhenix::ToString(int token)
{
switch (token)
{
case pt_IDENT: return "identifier";
case pt_STRING: return "string";
case pt_NUMBER: return "number";
case pt_RESID: return "resid";
case pt_EOLN: return "end of line";
case pt_KW_ALL: return "ALL";
case pt_KW_CHAIN: return "CHAIN";
case pt_KW_RESSEQ: return "RESSEQ";
case pt_KW_RESID: return "RESID";
case pt_KW_RESNAME: return "RESNAME";
case pt_KW_ELEMENT: return "ELEMENT";
case pt_KW_AND: return "AND";
case pt_KW_OR: return "OR";
case pt_KW_NOT: return "NOT";
case pt_KW_PDB: return "PDB";
case pt_KW_ENTRY: return "ENTRY";
case pt_KW_THROUGH: return "THROUGH";
default: return "character";
}
}
// Parse the complete selection text and return the expression tree.
// One superfluous closing parenthesis at the end is tolerated with a
// warning on stderr. A selection starting with the PDB keyword is not
// supported and results in a runtime_error.
TLSSelectionPtr TLSSelectionParserImplPhenix::Parse()
{
	if (m_lookahead == pt_KW_PDB)
	{
		Match(pt_KW_PDB);
//		Match(pt_KW_ENTRY);

		throw runtime_error("Unimplemented PDB ENTRY specification");
	}

	TLSSelectionPtr selection = ParseAtomSelection();

	// swallow a stray ')' but remember to complain about it,
	// the warning is only printed when the rest of the input is fine
	const bool strayParenthesis = m_lookahead == ')';
	if (strayParenthesis)
		m_lookahead = GetNextToken();

	Match(pt_EOLN);

	if (strayParenthesis)
		cerr << "WARNING: too many closing parenthesis in TLS selection statement" << endl;

	return selection;
}
// atom-selection := term { OR term }
// The terms are combined left to right into TLSSelectionUnion nodes.
TLSSelectionPtr TLSSelectionParserImplPhenix::ParseAtomSelection()
{
	TLSSelectionPtr lhs = ParseTerm();

	while (m_lookahead == pt_KW_OR)
	{
		Match(pt_KW_OR);

		// the union node takes ownership of both operands
		TLSSelectionPtr rhs = ParseTerm();
		lhs.reset(new TLSSelectionUnion(lhs, rhs));
	}

	return lhs;
}
// term := factor { AND factor }
// The factors are combined left to right into TLSSelectionIntersection nodes.
TLSSelectionPtr TLSSelectionParserImplPhenix::ParseTerm()
{
	TLSSelectionPtr lhs = ParseFactor();

	while (m_lookahead == pt_KW_AND)
	{
		Match(pt_KW_AND);

		// the intersection node takes ownership of both operands
		TLSSelectionPtr rhs = ParseFactor();
		lhs.reset(new TLSSelectionIntersection(lhs, rhs));
	}

	return lhs;
}
// Parse a single factor of the selection grammar:
//
//   '(' atom-selection ')'
//   NOT atom-selection
//   CHAIN (number | string | identifier)
//   RESNAME identifier
//   ELEMENT identifier
//   RESSEQ number [ ':' number ]
//   RESID resid [ (':' | '-' | THROUGH) resid ]
//   ALL
//
// The value of a token has to be picked up *before* the call to Match
// that consumes it, since Match immediately reads the next token.
// Throws runtime_error on any other token.
TLSSelectionPtr TLSSelectionParserImplPhenix::ParseFactor()
{
	TLSSelectionPtr factor;

	switch (m_lookahead)
	{
		case '(':
			Match('(');
			factor = ParseAtomSelection();

			// be lenient about a missing ')' at the very end of the input
			if (m_lookahead != pt_EOLN)
				Match(')');
			else
				cerr << "WARNING: missing closing parenthesis in TLS selection statement" << endl;
			break;

		case pt_KW_NOT:
			Match(pt_KW_NOT);

			// NOTE(review): NOT applies to a complete atom-selection here, so it
			// also covers any AND/OR that follows -- confirm this is intended
			factor.reset(new TLSSelectionNot(ParseAtomSelection()));
			break;

		case pt_KW_CHAIN:
		{
			Match(pt_KW_CHAIN);

			string chainID;
			if (m_lookahead == pt_NUMBER)	// sigh
			{
				chainID = to_string(m_value_i);
				Match(pt_NUMBER);
			}
			else
			{
				chainID = m_value_s;
				Match(m_lookahead == pt_STRING ? pt_STRING : pt_IDENT);
			}

			factor.reset(new TLSSelectionChain(chainID));
			break;
		}

		case pt_KW_RESNAME:
		{
			Match(pt_KW_RESNAME);

			const string resName = m_value_s;
			Match(pt_IDENT);

			factor.reset(new TLSSelectionByName(resName));
			break;
		}

		case pt_KW_ELEMENT:
		{
			Match(pt_KW_ELEMENT);

			const string symbol = m_value_s;
			Match(pt_IDENT);

			factor.reset(new TLSSelectionByElement(symbol));
			break;
		}

		case pt_KW_RESSEQ:
		{
			Match(pt_KW_RESSEQ);

			const int first = m_value_i;
			Match(pt_NUMBER);

			// a single number is a range of one
			int last = first;
			if (m_lookahead == ':')
			{
				Match(':');
				last = m_value_i;
				Match(pt_NUMBER);
			}

			factor.reset(new TLSSelectionRangeSeq(first, last));
			break;
		}

		case pt_KW_RESID:
		{
			Match(pt_KW_RESID);

			const int first = m_value_i;
			int last = first;
			char firstICode = 0, lastICode = 0;

			// a resid is either a plain number or a number with insertion code
			if (m_lookahead == pt_NUMBER)
				Match(pt_NUMBER);
			else
			{
				firstICode = m_icode;
				Match(pt_RESID);
			}

			const bool isRange =
				m_lookahead == ':' or m_lookahead == pt_KW_THROUGH or m_lookahead == '-';

			if (not isRange)
			{
				factor.reset(new TLSSelectionResID(first, firstICode));
				break;
			}

			// only THROUGH honours insertion codes, ':' and '-' select by number
			const bool through = m_lookahead == pt_KW_THROUGH;
			Match(m_lookahead);

			last = m_value_i;
			if (m_lookahead == pt_NUMBER)
				Match(pt_NUMBER);
			else
			{
				lastICode = m_icode;
				Match(pt_RESID);
			}

			if (through)
				factor.reset(new TLSSelectionRangeID(first, last, firstICode, lastICode));
			else
			{
				if (VERBOSE and (firstICode or lastICode))
					cerr << "Warning, ignoring insertion codes" << endl;

				factor.reset(new TLSSelectionRangeSeq(first, last));
			}
			break;
		}

		case pt_KW_ALL:
			Match(pt_KW_ALL);
			factor.reset(new TLSSelectionAll());
			break;

		default:
			throw runtime_error("Unexpected token " + ToString(m_lookahead) + " (" + m_token + ')');
	}

	return factor;
}
// --------------------------------------------------------------------
// Parser for Buster style selections: a group in curly braces containing
// atoms of the form chain|number[:atom], optionally combined into ranges
// with a '-'.
class TLSSelectionParserImplBuster : public TLSSelectionParserImpl
{
  public:
	TLSSelectionParserImplBuster(const string& selection);

	virtual TLSSelectionPtr Parse();

  protected:

	// Token values; everything below 256 is a plain character
	enum TOKEN {
		bt_NONE		= 0,
		bt_IDENT	= 256,
		bt_NUMBER,
		bt_EOLN,
	};

	virtual int GetNextToken();
	virtual string ToString(int token);

	TLSSelectionPtr ParseGroup();
	tuple<string,int> ParseAtom();

	TLSSelectionPtr ParseOldGroup();

	// Values belonging to the current lookahead token, set by GetNextToken
	int		m_value_i;	// value of a number token
	string	m_value_s;	// text of an identifier token

	bool	m_parsing_old_style = false;
};
// Store the selection text and load the first token into the lookahead,
// so the parse functions can start inspecting it right away.
TLSSelectionParserImplBuster::TLSSelectionParserImplBuster(const string& selection)
	: TLSSelectionParserImpl(selection)
{
	m_lookahead = GetNextToken();
}
// Lexer for the Buster selection syntax. Recognised tokens are numbers
// (optionally negative, value in m_value_i), identifiers (a letter
// followed by letters and digits, text in m_value_s), end of line and
// single characters, which are returned as themselves.
//
// End of input is presented to the state machine as a character with
// value 0. The read position never moves beyond m_end, so calling this
// function again after bt_EOLN was returned simply yields bt_EOLN again.
int TLSSelectionParserImplBuster::GetNextToken()
{
	int result = bt_NONE;

	enum STATE { st_START, st_NEGATE, st_NUM, st_IDENT } state = st_START;

	m_value_i = 0;
	m_value_s.clear();
	bool negative = false;

	// true while the current character is the synthetic end-of-input 0,
	// in which case nothing was read and nothing may be un-read
	bool atEnd = false;

	// push the character just read back into the input
	auto unread = [&]()
	{
		if (not atEnd)
			--m_p;
	};

	while (result == bt_NONE)
	{
		atEnd = m_p >= m_end;
		const char ch = atEnd ? 0 : *m_p++;

		// the <cctype> functions require a value representable as unsigned char
		const unsigned char uch = static_cast<unsigned char>(ch);

		switch (state)
		{
			case st_START:
				if (ch == 0)
					result = bt_EOLN;
				else if (isspace(uch))
					continue;		// skip white space between tokens
				else if (isdigit(uch))
				{
					m_value_i = ch - '0';
					state = st_NUM;
				}
				else if (isalpha(uch))
				{
					m_value_s = { ch };
					state = st_IDENT;
				}
				else if (ch == '-')
				{
					// either a negative number or the range operator
					state = st_NEGATE;
				}
				else
					result = ch;
				break;

			case st_NEGATE:
				if (isdigit(uch))
				{
					m_value_i = ch - '0';
					state = st_NUM;
					negative = true;
				}
				else
				{
					unread();
					result = '-';
				}
				break;

			case st_NUM:
				if (isdigit(uch))
					m_value_i = 10 * m_value_i + (ch - '0');
				else
				{
					if (negative)
						m_value_i = -m_value_i;

					result = bt_NUMBER;
					unread();
				}
				break;

			case st_IDENT:
				if (isalnum(uch))
					m_value_s += ch;
				else
				{
					unread();
					result = bt_IDENT;
				}
				break;
		}
	}

	return result;
}
// Return a description of a token for use in error messages. For
// identifiers and numbers the value of the *current* token is included.
// Any value other than the three named tokens is a programming error.
string TLSSelectionParserImplBuster::ToString(int token)
{
	if (token == bt_IDENT)
		return "identifier (" + m_value_s + ')';

	if (token == bt_NUMBER)
		return "number (" + to_string(m_value_i) + ')';

	if (token == bt_EOLN)
		return "end of line";

	assert(false);
	return "unknown token";
}
// group := '{' item { item } '}'
// item  := atom [ '-' ( number | atom ) ]
//
// Every item is turned into 'chain AND residue number range' and all items
// are OR-ed together. A range whose ends are in different chains is
// accepted with a warning and is split into 'first chain from the start
// number onward' plus 'second chain up to the end number'.
TLSSelectionPtr TLSSelectionParserImplBuster::ParseGroup()
{
	TLSSelectionPtr result;

	// selection for the residues from..to in one chain
	auto chainRange = [](const string& chainID, int from, int to) -> TLSSelectionPtr
	{
		TLSSelectionPtr byChain(new TLSSelectionChain(chainID));
		TLSSelectionPtr byNumber(new TLSSelectionRangeSeq(from, to));
		return TLSSelectionPtr(new TLSSelectionIntersection(byChain, byNumber));
	};

	// add a selection to the union built so far
	auto append = [&result](TLSSelectionPtr piece)
	{
		if (result == nullptr)
			result.reset(piece.release());
		else
			result.reset(new TLSSelectionUnion{result, piece });
	};

	Match('{');

	do
	{
		string chain1;
		int seqNr1;
		std::tie(chain1, seqNr1) = ParseAtom();

		// a single atom, not a range
		if (m_lookahead != '-')
		{
			append(chainRange(chain1, seqNr1, seqNr1));
			continue;
		}

		Match('-');

		int seqNr2 = seqNr1;
		bool crossChain = false;

		if (m_lookahead == bt_NUMBER)
		{
			// short form, the end is in the same chain as the start
			seqNr2 = m_value_i;
			Match(bt_NUMBER);
		}
		else
		{
			string chain2;
			std::tie(chain2, seqNr2) = ParseAtom();

			crossChain = chain1 != chain2;
			if (crossChain)
			{
				cerr << "Warning, ranges over multiple chains detected" << endl;

				TLSSelectionPtr head = chainRange(chain1, seqNr1, kResidueNrWildcard);
				TLSSelectionPtr tail = chainRange(chain2, kResidueNrWildcard, seqNr2);

				append(TLSSelectionPtr(new TLSSelectionUnion(head, tail)));
			}
		}

		// a range without a chain id adds nothing
		if (not crossChain and not chain1.empty())
			append(chainRange(chain1, seqNr1, seqNr2));
	}
	while (m_lookahead != '}');

	Match('}');

	return result;
}
// atom := ( '*' | identifier ) '|' ( '*' | number [ ':' identifier ] )
//
// Returns the chain id and the residue number of the atom. A '*' for the
// chain is returned as chain id "*", which TLSSelectionChain treats as
// 'all chains'; a '*' for the number is returned as kResidueNrWildcard.
// An optional atom name after the number is parsed but ignored.
tuple<string,int> TLSSelectionParserImplBuster::ParseAtom()
{
	// the values of a token must be read before Match consumes it
	string chain = m_value_s;
	int seqNr = kResidueNrWildcard;

	if (m_lookahead == '*')
	{
		// m_value_s is empty for a '*' token and an empty chain id would
		// not match any residue, so pass the wildcard on explicitly
		chain = "*";
		Match('*');
	}
	else
		Match(bt_IDENT);

	Match('|');

	if (m_lookahead == '*')
		Match('*');
	else
	{
		seqNr = m_value_i;
		Match(bt_NUMBER);

		if (m_lookahead == ':')
		{
			Match(':');

			string atom = m_value_s;
			if (VERBOSE)
				cerr << "Warning: ignoring atom ID '" << atom << "' in TLS selection" << endl;

			Match(bt_IDENT);
		}
	}

	return make_tuple(chain, seqNr);
}
// Parse the complete selection text: exactly one group, followed by the
// end of the input.
TLSSelectionPtr TLSSelectionParserImplBuster::Parse()
{
	TLSSelectionPtr group = ParseGroup();

	// nothing may follow the closing brace
	Match(bt_EOLN);

	return group;
}
// --------------------------------------------------------------------
// Parser for the older Buster selection syntax. Besides the keyword based
// tokens this lexer also knows a combined chain + residue number token
// (pt_CHAINRESID) and a number range token (pt_RANGE).
class TLSSelectionParserImplBusterOld : public TLSSelectionParserImpl
{
  public:
	TLSSelectionParserImplBusterOld(const string& selection)
		: TLSSelectionParserImpl(selection)
	{
		// load the first token so the parse functions can inspect it
		m_lookahead = GetNextToken();
	}

	virtual TLSSelectionPtr Parse();

  private:

	TLSSelectionPtr ParseAtomSelection();
	TLSSelectionPtr ParseTerm();
	TLSSelectionPtr ParseFactor();

	TLSSelectionPtr ParseResid();
	TLSSelectionPtr ParseChainResid();

	// Token values; everything below 256 is a plain character
	enum TOKEN {
		pt_NONE		= 0,
		pt_IDENT	= 256,
		pt_CHAINRESID,
		pt_STRING,
		pt_NUMBER,
		pt_RANGE,
		pt_EOLN,

		pt_KW_ALL,
		pt_KW_CHAIN,
		pt_KW_RESSEQ,
		pt_KW_RESID,
		pt_KW_RESNAME,
		pt_KW_ELEMENT,
		pt_KW_AND,
		pt_KW_OR,
		pt_KW_NOT,
		pt_KW_PDB,
		pt_KW_ENTRY,
		pt_KW_THROUGH
	};

	virtual int GetNextToken();
	virtual string ToString(int token);

	// Values belonging to the current lookahead token, set by GetNextToken
	int		m_value_i;		// value of a number token, or the number part of a chain-resid
	string	m_value_s;		// text of an identifier or string token
	int		m_value_r[2];	// both ends of a range token
};
int TLSSelectionParserImplBusterOld::GetNextToken()
{
int result = pt_NONE;
enum STATE { st_START, st_NEGATE, st_NUM, st_RANGE, st_IDENT_1, st_IDENT, st_CHAINRESID, st_QUOTED_1, st_QUOTED_2 } state = st_START;
m_value_i = 0;
m_value_s.clear();
bool negative = false;
while (result == pt_NONE)
{
char ch = *m_p++;
if (m_p > m_end)
ch = 0;
switch (state)
{
case st_START:
if (ch == 0)
result = pt_EOLN;
else if (isspace(ch))
continue;
else if (isdigit(ch))
{
m_value_i = ch - '0';
state = st_NUM;
}
else if (isalpha(ch))
{
m_value_s = { ch };
state = st_IDENT_1;
}
else if (ch == '-')
{
state = st_NEGATE;
}
else if (ch == '\'')
{
state = st_QUOTED_1;
}
else
result = ch;
break;
case st_NEGATE:
if (isdigit(ch))
{
m_value_i = ch - '0';
state = st_NUM;
negative = true;
}
else
{
--m_p;
result = '-';
}
break;
case st_NUM:
if (isdigit(ch))
m_value_i = 10 * m_value_i + (ch - '0');
else if (ch == '-' or ch == ':')
{
if (negative)
m_value_i = -m_value_i;
m_value_r[0] = m_value_i;
m_value_r[1] = 0;
state = st_RANGE;
}
else
{
if (negative)
m_value_i = -m_value_i;
result = pt_NUMBER;
--m_p;
}
break;
case st_RANGE: // TODO: question, is "-2--1" a valid range? We do not support that, yet
if (isdigit(ch))
m_value_r[1] = 10 * m_value_r[1] + (ch - '0');
else if (m_value_r[1] != 0)
{
result = pt_RANGE;
--m_p;
}
else
{
--m_p;
--m_p;
result = pt_NUMBER;
}
break;
case st_IDENT_1:
if (isalpha(ch))
{
m_value_s += ch;
state = st_IDENT;
}
else if (isdigit(ch))
{
m_value_i = (ch - '0');
state = st_CHAINRESID;
}
else
{
--m_p;
result = pt_IDENT;
}
break;
case st_CHAINRESID:
if (isalpha(ch))
{
m_value_s += to_string(m_value_i);
m_value_s += ch;
state = st_IDENT;
}
else if (isdigit(ch))
m_value_i = 10 * m_value_i + (ch - '0');
else
{
--m_p;
result = pt_CHAINRESID;
}
break;
case st_IDENT:
if (isalnum(ch))
m_value_s += ch;
else
{
--m_p;
result = pt_IDENT;
}
break;
case st_QUOTED_1:
if (ch == '\'')
{
--m_p;
result = '\'';
}
else
{
m_value_s = { ch };
state = st_QUOTED_2;
}
break;
case st_QUOTED_2:
if (ch == '\'')
result = pt_STRING;
else if (ch == 0)
throw runtime_error("Unexpected end of selection, missing quote character?");
else
m_value_s += ch;
break;
}
}
if (result == pt_IDENT)
{
if (iequals(m_value_s, "CHAIN"))
result = pt_KW_CHAIN;
else if (iequals(m_value_s, "ALL"))
result = pt_KW_ALL;
else if (iequals(m_value_s, "AND"))
result = pt_KW_AND;
else if (iequals(m_value_s, "OR"))
result = pt_KW_OR;
else if (iequals(m_value_s, "NOT"))
result = pt_KW_NOT;
else if (iequals(m_value_s, "RESSEQ"))
result = pt_KW_RESSEQ;
else if (iequals(m_value_s, "RESID") or iequals(m_value_s, "RESI") or iequals(m_value_s, "RESIDUES"))
result = pt_KW_RESID;
else if (iequals(m_value_s, "RESNAME"))
result = pt_KW_RESNAME;
else if (iequals(m_value_s, "PDB"))
result = pt_KW_PDB;
else if (iequals(m_value_s, "ENTRY"))
result = pt_KW_ENTRY;
else if (iequals(m_value_s, "THROUGH"))
result = pt_KW_THROUGH;
}
return result;
}
// Returns a human readable description of a token, for use in messages.
//
// For tokens carrying a value the description includes the current content
// of m_value_s, m_value_i or m_value_r. Tokens below pt_IDENT are the single
// characters GetNextToken() returns as their own code; these are described
// as the quoted character. Only a code that is none of the above is treated
// as a programming error.
string TLSSelectionParserImplBusterOld::ToString(int token)
{
	switch (token)
	{
		case pt_IDENT:		return "identifier (" + m_value_s + ')';
		case pt_CHAINRESID:	return "chain and residue number (" + m_value_s + to_string(m_value_i) + ')';
		case pt_STRING:		return "string (" + m_value_s + ')';
		case pt_NUMBER:		return "number (" + to_string(m_value_i) + ')';
		case pt_RANGE:		return "range (" + to_string(m_value_r[0]) + ':' + to_string(m_value_r[1]) + ')';
		case pt_EOLN:		return "end of line";

		case pt_KW_ALL:		return "ALL";
		case pt_KW_CHAIN:	return "CHAIN";
		case pt_KW_RESSEQ:	return "RESSEQ";
		case pt_KW_RESID:	return "RESID";
		case pt_KW_RESNAME:	return "RESNAME";
		case pt_KW_ELEMENT:	return "ELEMENT";
		case pt_KW_AND:		return "AND";
		case pt_KW_OR:		return "OR";
		case pt_KW_NOT:		return "NOT";
		case pt_KW_PDB:		return "PDB";
		case pt_KW_ENTRY:	return "ENTRY";
		case pt_KW_THROUGH:	return "THROUGH";

		default:
			// Punctuation such as '(' or ',' is a perfectly valid token and
			// is exactly what shows up in "unexpected token" messages for
			// malformed input, so it must not trip the assert below.
			if (token != pt_NONE and token < pt_IDENT)
				return string("'") + static_cast<char>(token) + '\'';

			assert(false);
			return "unknown token";
	}
}
// Entry point: parses a complete selection, which has to be an atom selection
// followed by the end of the input.
//
// A selection starting with the keyword PDB is not supported: the keyword is
// consumed and a runtime_error is thrown.
TLSSelectionPtr TLSSelectionParserImplBusterOld::Parse()
{
	const bool startsWithPDB = (m_lookahead == pt_KW_PDB);

	if (startsWithPDB)
	{
		Match(pt_KW_PDB);
		// the ENTRY keyword that would follow is deliberately not matched
		throw runtime_error("Unimplemented PDB ENTRY specification");
	}

	TLSSelectionPtr selection(ParseAtomSelection());
	Match(pt_EOLN);

	return selection;
}
// atom-selection := term { OR term }
//
// Every additional term is combined with what was parsed so far into a
// TLSSelectionUnion.
TLSSelectionPtr TLSSelectionParserImplBusterOld::ParseAtomSelection()
{
	TLSSelectionPtr selection = ParseTerm();

	for (;;)
	{
		if (m_lookahead != pt_KW_OR)
			break;

		Match(pt_KW_OR);
		selection.reset(new TLSSelectionUnion(selection, ParseTerm()));
	}

	return selection;
}
// term := factor { AND factor }
//
// Every additional factor is combined with what was parsed so far into a
// TLSSelectionIntersection.
TLSSelectionPtr TLSSelectionParserImplBusterOld::ParseTerm()
{
	TLSSelectionPtr selection = ParseFactor();

	for (;;)
	{
		if (m_lookahead != pt_KW_AND)
			break;

		Match(pt_KW_AND);
		selection.reset(new TLSSelectionIntersection(selection, ParseFactor()));
	}

	return selection;
}
// factor := '(' atom-selection ')'
//         | NOT atom-selection
//         | CHAIN ( number | string | identifier )
//         | RESNAME identifier
//         | ( RESSEQ | RESID ) residue-list
//         | ALL
//         | chain-residue-list
//
// Throws runtime_error for any other token.
TLSSelectionPtr TLSSelectionParserImplBusterOld::ParseFactor()
{
	TLSSelectionPtr factor;

	if (m_lookahead == '(')
	{
		Match('(');
		factor = ParseAtomSelection();
		Match(')');
	}
	else if (m_lookahead == pt_KW_NOT)
	{
		Match(pt_KW_NOT);
		factor.reset(new TLSSelectionNot(ParseAtomSelection()));
	}
	else if (m_lookahead == pt_KW_CHAIN)
	{
		Match(pt_KW_CHAIN);

		// the value of the token has to be picked up before it is matched
		string chainID;
		if (m_lookahead == pt_NUMBER)	// sigh
		{
			chainID = to_string(m_value_i);
			Match(pt_NUMBER);
		}
		else
		{
			chainID = m_value_s;
			Match(m_lookahead == pt_STRING ? pt_STRING : pt_IDENT);
		}

		factor.reset(new TLSSelectionChain(chainID));
	}
	else if (m_lookahead == pt_KW_RESNAME)
	{
		Match(pt_KW_RESNAME);

		string residueName = m_value_s;
		Match(pt_IDENT);

		factor.reset(new TLSSelectionByName(residueName));
	}
	else if (m_lookahead == pt_KW_RESSEQ or m_lookahead == pt_KW_RESID)
	{
		// both keywords introduce the same kind of residue list
		Match(m_lookahead);
		factor = ParseResid();
	}
	else if (m_lookahead == pt_KW_ALL)
	{
		Match(pt_KW_ALL);
		factor.reset(new TLSSelectionAll());
	}
	else if (m_lookahead == pt_CHAINRESID)
		factor = ParseChainResid();
	else
		throw runtime_error("Unexpected token " + ToString(m_lookahead));

	return factor;
}
// residue-list := residue-range { ',' residue-range }
// residue-range := RANGE | NUMBER [ ( ':' | '-' | THROUGH ) NUMBER ]
//
// Each range becomes a TLSSelectionRangeSeq; a single number is a range from
// that number to itself. Multiple ranges are joined with TLSSelectionUnion.
TLSSelectionPtr TLSSelectionParserImplBusterOld::ParseResid()
{
	TLSSelectionPtr selection;
	bool more = false;

	do
	{
		int first, last;

		if (m_lookahead != pt_RANGE)
		{
			first = last = m_value_i;
			Match(pt_NUMBER);

			const bool hasSeparator =
				m_lookahead == ':' or m_lookahead == '-' or m_lookahead == pt_KW_THROUGH;

			if (hasSeparator)
			{
				Match(m_lookahead);
				last = m_value_i;
				Match(pt_NUMBER);
			}
		}
		else
		{
			// the tokenizer already delivered both ends
			first = m_value_r[0];
			last = m_value_r[1];
			Match(pt_RANGE);
		}

		TLSSelectionPtr range(new TLSSelectionRangeSeq(first, last));

		if (not selection)
			selection.swap(range);	// first range, simply take it over
		else
			selection.reset(new TLSSelectionUnion(selection, range));

		more = (m_lookahead == ',');
		if (more)
			Match(',');
	}
	while (more);

	return selection;
}
// chain-residue-list := chain-residue-range { ',' chain-residue-range }
// chain-residue-range := CHAINRESID [ '-' CHAINRESID ]
//
// Each range becomes the intersection of a TLSSelectionChain and a
// TLSSelectionRangeSeq; multiple ranges are joined with TLSSelectionUnion.
// Throws runtime_error when the value following '-' names another chain than
// the start of the range.
TLSSelectionPtr TLSSelectionParserImplBusterOld::ParseChainResid()
{
	TLSSelectionPtr selection;
	bool more = false;

	do
	{
		// pick up the values of the token before it is matched
		string chainID = m_value_s;
		int first = m_value_i;
		int last = first;

		Match(pt_CHAINRESID);

		if (m_lookahead == '-')
		{
			Match('-');

			last = m_value_i;
			if (m_value_s != chainID)
				throw runtime_error("Cannot have two different chainIDs in a range selection");

			Match(pt_CHAINRESID);
		}

		TLSSelectionPtr byChain(new TLSSelectionChain(chainID));
		TLSSelectionPtr bySeqNr(new TLSSelectionRangeSeq(first, last));
		TLSSelectionPtr range(new TLSSelectionIntersection(byChain, bySeqNr));

		if (not selection)
			selection.swap(range);	// first range, simply take it over
		else
			selection.reset(new TLSSelectionUnion(selection, range));

		more = (m_lookahead == ',');
		if (more)
			Match(',');
	}
	while (more);

	return selection;
}
// --------------------------------------------------------------------
// Interface of a selection parser: turns the text of a selection into a
// selection object.
class TLSSelectionParserBase
{
  public:
	virtual TLSSelectionPtr Parse(const string& selection) const = 0;
	virtual ~TLSSelectionParserBase() {}
};

// Selection parser using IMPL to do the actual parsing.
//
// IMPL has to be constructible from the selection string and has to provide
// a Parse() method returning a TLSSelectionPtr.
//
// Derives from TLSSelectionParserBase: Parse has the signature of that
// interface, and the base supplies the virtual destructor a class with
// virtual methods should have.
template<typename IMPL>
class TLSSelectionParser : public TLSSelectionParserBase
{
  public:
	// Parses selection with a fresh IMPL instance.
	//
	// Returns the parsed selection, or an empty pointer when IMPL reported an
	// error by throwing something derived from std::exception. In that case
	// the message is written to cerr, so callers can simply try another
	// parser.
	TLSSelectionPtr Parse(const string& selection) const override
	{
		TLSSelectionPtr result;

		try
		{
			IMPL p(selection);
			result = p.Parse();
		}
		catch (const exception& ex)
		{
			cerr << "ParseError: " << ex.what() << endl;
		}

		return result;
	}
};
// --------------------------------------------------------------------
// Parses a TLS selection using the syntax of the program that wrote it.
//
// program is searched, case insensitively, for "buster" and then "phenix" to
// decide which parser is tried first:
//   - buster:            BUSTER, old BUSTER, PHENIX
//   - phenix or unknown: PHENIX, BUSTER, old BUSTER
// The first parser that delivers a selection wins. The result is empty when
// none of them could parse the selection. In VERBOSE mode every fallback is
// reported on cerr.
TLSSelectionPtr ParseSelectionDetails(const string& program, const string& selection)
{
	TLSSelectionParser<TLSSelectionParserImplPhenix> phenix;
	TLSSelectionParser<TLSSelectionParserImplBuster> buster;
	TLSSelectionParser<TLSSelectionParserImplBusterOld> busterOld;

	if (ba::icontains(program, "buster"))
	{
		TLSSelectionPtr parsed = buster.Parse(selection);
		if (parsed)
			return parsed;

		if (VERBOSE)
			cerr << "Falling back to old BUSTER" << endl;

		parsed = busterOld.Parse(selection);
		if (parsed)
			return parsed;

		if (VERBOSE)
			cerr << "Falling back to PHENIX" << endl;

		return phenix.Parse(selection);
	}

	// PHENIX and unknown programs share the same order, only the unknown
	// case is announced
	if (not ba::icontains(program, "phenix"))
	{
		if (VERBOSE)
			cerr << "No known program specified, trying PHENIX" << endl;
	}

	TLSSelectionPtr parsed = phenix.Parse(selection);
	if (parsed)
		return parsed;

	if (VERBOSE)
		cerr << "Falling back to BUSTER" << endl;

	parsed = buster.Parse(selection);
	if (parsed)
		return parsed;

	if (VERBOSE)
		cerr << "Falling back to old BUSTER" << endl;

	return busterOld.Parse(selection);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment