Commit 701ff3d3 by Maarten L. Hekkelman

Moved DSSP code back here from libcifpp

parent 8ec1983b
......@@ -123,13 +123,19 @@ find_package(Threads)
find_package(cifpp 5.0.0 REQUIRED)
find_package(Boost COMPONENTS date_time program_options)
# The DSSP code is in a separate library, optionally to be used by others
add_library(dssp_library OBJECT ${PROJECT_SOURCE_DIR}/src/DSSP.cpp)
target_link_libraries(dssp_library cifpp::cifpp)
add_executable(mkdssp
${PROJECT_SOURCE_DIR}/src/dssp.cpp
${PROJECT_SOURCE_DIR}/src/dssp.hpp
${PROJECT_SOURCE_DIR}/src/mkdssp.cpp)
${PROJECT_SOURCE_DIR}/src/dssp_wrapper.cpp
${PROJECT_SOURCE_DIR}/src/dssp_wrapper.hpp
${PROJECT_SOURCE_DIR}/src/mkdssp.cpp
$<TARGET_OBJECTS:dssp_library>)
target_include_directories(mkdssp PRIVATE cifpp::cifpp ${CMAKE_SOURCE_DIR}/include ${CMAKE_BINARY_DIR})
target_link_libraries(mkdssp PRIVATE cifpp::cifpp Boost::date_time Boost::program_options)
target_include_directories(mkdssp PRIVATE ${CMAKE_BINARY_DIR})
target_link_libraries(mkdssp PRIVATE dssp_library cifpp::cifpp Boost::date_time Boost::program_options)
if(USE_RSRC)
mrc_target_resources(mkdssp ${CIFPP_SHARE_DIR}/mmcif_pdbx.dic)
......@@ -153,7 +159,7 @@ endif()
# test
add_executable(unit-test ${PROJECT_SOURCE_DIR}/test/unit-test.cpp ${PROJECT_SOURCE_DIR}/src/dssp.cpp)
add_executable(unit-test ${PROJECT_SOURCE_DIR}/test/unit-test.cpp ${PROJECT_SOURCE_DIR}/src/dssp_wrapper.cpp)
if(USE_RSRC)
mrc_target_resources(unit-test ${CIFPP_SHARE_DIR}/mmcif_pdbx.dic)
......@@ -164,7 +170,7 @@ target_include_directories(unit-test PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/include
)
target_link_libraries(unit-test cifpp::cifpp Boost::date_time)
target_link_libraries(unit-test dssp_library cifpp::cifpp Boost::date_time)
if(MSVC)
# Specify unwind semantics so that MSVC knowns how to handle exceptions
......
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Calculate DSSP-like secondary structure information
#include <iomanip>
#include <numeric>
#include <thread>
#include "DSSP.hpp"
// --------------------------------------------------------------------
namespace dssp
{
const double
kPI = 3.141592653589793238462643383279502884;
struct point
{
float mX, mY, mZ;
};
inline point operator-(const point &lhs, const point &rhs)
{
return { lhs.mX - rhs.mX, lhs.mY - rhs.mY, lhs.mZ - rhs.mZ };
}
inline point operator*(const point &lhs, float rhs)
{
return { lhs.mX * rhs, lhs.mY * rhs, lhs.mZ * rhs };
}
inline float distance_sq(const point &a, const point &b)
{
return (a.mX - b.mX) * (a.mX - b.mX) +
(a.mY - b.mY) * (a.mY - b.mY) +
(a.mZ - b.mZ) * (a.mZ - b.mZ);
}
inline float distance(const point &a, const point &b)
{
return std::sqrt(distance_sq(a, b));
}
inline float dot_product(const point &a, const point &b)
{
return a.mX * b.mX + a.mY * b.mY + a.mZ * b.mZ;
}
inline point cross_product(const point &a, const point &b)
{
return {
a.mY * b.mZ - b.mY * a.mZ,
a.mZ * b.mX - b.mZ * a.mX,
a.mX * b.mY - b.mX * a.mY
};
}
float dihedral_angle(const point &p1, const point &p2, const point &p3, const point &p4)
{
point v12 = p1 - p2; // vector from p2 to p1
point v43 = p4 - p3; // vector from p3 to p4
point z = p2 - p3; // vector from p3 to p2
point p = cross_product(z, v12);
point x = cross_product(z, v43);
point y = cross_product(z, x);
double u = dot_product(x, x);
double v = dot_product(y, y);
double result = 360;
if (u > 0 and v > 0)
{
u = dot_product(p, x) / std::sqrt(u);
v = dot_product(p, y) / std::sqrt(v);
if (u != 0 or v != 0)
result = std::atan2(v, u) * 180 / kPI;
}
return result;
}
float cosinus_angle(const point &p1, const point &p2, const point &p3, const point &p4)
{
point v12 = p1 - p2;
point v34 = p3 - p4;
double result = 0;
double x = dot_product(v12, v12) * dot_product(v34, v34);
if (x > 0)
result = dot_product(v12, v34) / std::sqrt(x);
return result;
}
enum residue_type
{
kUnknownResidue,
//
kAlanine, // A ala
kArginine, // R arg
kAsparagine, // N asn
kAsparticAcid, // D asp
kCysteine, // C cys
kGlutamicAcid, // E glu
kGlutamine, // Q gln
kGlycine, // G gly
kHistidine, // H his
kIsoleucine, // I ile
kLeucine, // L leu
kLysine, // K lys
kMethionine, // M met
kPhenylalanine, // F phe
kProline, // P pro
kSerine, // S ser
kThreonine, // T thr
kTryptophan, // W trp
kTyrosine, // Y tyr
kValine, // V val
kResidueTypeCount
};
struct
{
residue_type type;
char code;
char name[4];
} const kResidueInfo[] = {
{ kUnknownResidue, 'X', "UNK" },
{ kAlanine, 'A', "ALA" },
{ kArginine, 'R', "ARG" },
{ kAsparagine, 'N', "ASN" },
{ kAsparticAcid, 'D', "ASP" },
{ kCysteine, 'C', "CYS" },
{ kGlutamicAcid, 'E', "GLU" },
{ kGlutamine, 'Q', "GLN" },
{ kGlycine, 'G', "GLY" },
{ kHistidine, 'H', "HIS" },
{ kIsoleucine, 'I', "ILE" },
{ kLeucine, 'L', "LEU" },
{ kLysine, 'K', "LYS" },
{ kMethionine, 'M', "MET" },
{ kPhenylalanine, 'F', "PHE" },
{ kProline, 'P', "PRO" },
{ kSerine, 'S', "SER" },
{ kThreonine, 'T', "THR" },
{ kTryptophan, 'W', "TRP" },
{ kTyrosine, 'Y', "TYR" },
{ kValine, 'V', "VAL" }
};
residue_type MapResidue(std::string_view inName)
{
residue_type result = kUnknownResidue;
for (auto &ri : kResidueInfo)
{
if (inName == ri.name)
{
result = ri.type;
break;
}
}
return result;
}
struct HBond
{
residue *res;
double energy;
};
enum class bridge_type
{
None,
Parallel,
AntiParallel
};
struct bridge
{
bridge_type type;
uint32_t sheet, ladder;
std::set<bridge *> link;
std::deque<uint32_t> i, j;
std::string chainI, chainJ;
bool operator<(const bridge &b) const { return chainI < b.chainI or (chainI == b.chainI and i.front() < b.i.front()); }
};
struct bridge_partner
{
residue *m_residue;
uint32_t ladder;
bool parallel;
};
// --------------------------------------------------------------------
const float
// kSSBridgeDistance = 3.0f,
kMinimalDistance = 0.5f,
kMinimalCADistance = 9.0f,
kMinHBondEnergy = -9.9f,
kMaxHBondEnergy = -0.5f,
kCouplingConstant = -27.888f, // = -332 * 0.42 * 0.2
kMaxPeptideBondLength = 2.5f;
const float
kRadiusN = 1.65f,
kRadiusCA = 1.87f,
kRadiusC = 1.76f,
kRadiusO = 1.4f,
kRadiusSideAtom = 1.8f,
kRadiusWater = 1.4f;
struct residue
{
residue(int model_nr,
std::string_view pdb_strand_id, int pdb_seq_num, std::string_view pdb_ins_code,
const std::vector<cif::row_handle> &atoms)
: mPDBStrandID(pdb_strand_id)
, mPDBSeqNum(pdb_seq_num)
, mPDBInsCode(pdb_ins_code)
, mChainBreak(chain_break_type::None)
{
// update the box containing all atoms
mBox[0].mX = mBox[0].mY = mBox[0].mZ = std::numeric_limits<float>::max();
mBox[1].mX = mBox[1].mY = mBox[1].mZ = -std::numeric_limits<float>::max();
mH = point{ std::numeric_limits<float>::max(), std::numeric_limits<float>::max(), std::numeric_limits<float>::max() };
int seen = 0;
for (auto atom : atoms)
{
std::string asymID, compID, atomID, type, authAsymID;
std::optional<std::string> altID;
int seqID, authSeqID;
std::optional<int> model;
float x, y, z;
cif::tie(asymID, compID, atomID, altID, type, seqID, model, x, y, z, authAsymID, authSeqID) =
atom.get("label_asym_id", "label_comp_id", "label_atom_id", "label_alt_id", "type_symbol", "label_seq_id",
"pdbx_PDB_model_num", "Cartn_x", "Cartn_y", "Cartn_z",
"auth_asym_id", "auth_seq_id");
if (model and model != model_nr)
continue;
if (seen == 0)
{
mAsymID = asymID;
mCompoundID = compID;
mSeqID = seqID;
mAuthSeqID = authSeqID;
mAuthAsymID = authAsymID;
mType = MapResidue(mCompoundID);
if (altID)
mAltID = *altID;
}
if (atomID == "CA")
{
seen |= 1;
mCAlpha = { x, y, z };
ExtendBox(mCAlpha, kRadiusCA + 2 * kRadiusWater);
}
else if (atomID == "C")
{
seen |= 2;
mC = { x, y, z };
ExtendBox(mC, kRadiusC + 2 * kRadiusWater);
}
else if (atomID == "N")
{
seen |= 4;
mH = mN = { x, y, z };
ExtendBox(mN, kRadiusN + 2 * kRadiusWater);
}
else if (atomID == "O")
{
seen |= 8;
mO = { x, y, z };
ExtendBox(mO, kRadiusO + 2 * kRadiusWater);
}
else if (type != "H")
{
mSideChain.emplace_back(point { x, y, z });
ExtendBox(mSideChain.back(), kRadiusSideAtom + 2 * kRadiusWater);
}
}
mComplete = seen == (1 | 2 | 4 | 8);
mRadius = mBox[1].mX - mBox[0].mX;
if (mRadius < mBox[1].mY - mBox[0].mY)
mRadius = mBox[1].mY - mBox[0].mY;
if (mRadius < mBox[1].mZ - mBox[0].mZ)
mRadius = mBox[1].mZ - mBox[0].mZ;
mCenter.mX = (mBox[0].mX + mBox[1].mX) / 2;
mCenter.mY = (mBox[0].mY + mBox[1].mY) / 2;
mCenter.mZ = (mBox[0].mZ + mBox[1].mZ) / 2;
}
void assignHydrogen()
{
// assign the Hydrogen
mH = mN;
if (mType != kProline and mPrev != nullptr)
{
auto pc = mPrev->mC;
auto po = mPrev->mO;
float CODistance = static_cast<float>(distance(pc, po));
mH.mX += (pc.mX - po.mX) / CODistance;
mH.mY += (pc.mY - po.mY) / CODistance;
mH.mZ += (pc.mZ - po.mZ) / CODistance;
}
}
void SetSecondaryStructure(structure_type inSS) { mSecondaryStructure = inSS; }
structure_type GetSecondaryStructure() const { return mSecondaryStructure; }
void SetBetaPartner(uint32_t n, residue &inResidue, uint32_t inLadder, bool inParallel)
{
assert(n == 0 or n == 1);
mBetaPartner[n].m_residue = &inResidue;
mBetaPartner[n].ladder = inLadder;
mBetaPartner[n].parallel = inParallel;
}
bridge_partner GetBetaPartner(uint32_t n) const
{
assert(n == 0 or n == 1);
return mBetaPartner[n];
}
void SetSheet(uint32_t inSheet) { mSheet = inSheet; }
uint32_t GetSheet() const { return mSheet; }
bool IsBend() const { return mBend; }
void SetBend(bool inBend) { mBend = inBend; }
helix_position_type GetHelixFlag(helix_type helixType) const
{
size_t stride = static_cast<size_t>(helixType);
assert(stride < 4);
return mHelixFlags[stride];
}
bool IsHelixStart(helix_type helixType) const
{
size_t stride = static_cast<size_t>(helixType);
assert(stride < 4);
return mHelixFlags[stride] == helix_position_type::Start or mHelixFlags[stride] == helix_position_type::StartAndEnd;
}
void SetHelixFlag(helix_type helixType, helix_position_type inHelixFlag)
{
size_t stride = static_cast<size_t>(helixType);
assert(stride < 4);
mHelixFlags[stride] = inHelixFlag;
}
void SetSSBridgeNr(uint8_t inBridgeNr)
{
if (mType != kCysteine)
throw std::runtime_error("Only cysteine residues can form sulphur bridges");
mSSBridgeNr = inBridgeNr;
}
uint8_t GetSSBridgeNr() const
{
if (mType != kCysteine)
throw std::runtime_error("Only cysteine residues can form sulphur bridges");
return mSSBridgeNr;
}
double CalculateSurface(const std::vector<residue> &inResidues);
double CalculateSurface(const point &inAtom, float inRadius, const std::vector<residue *> &inNeighbours);
bool AtomIntersectsBox(const point &atom, float inRadius) const
{
return atom.mX + inRadius >= mBox[0].mX and
atom.mX - inRadius <= mBox[1].mX and
atom.mY + inRadius >= mBox[0].mY and
atom.mY - inRadius <= mBox[1].mY and
atom.mZ + inRadius >= mBox[0].mZ and
atom.mZ - inRadius <= mBox[1].mZ;
}
void ExtendBox(const point &atom, float inRadius)
{
if (mBox[0].mX > atom.mX - inRadius)
mBox[0].mX = atom.mX - inRadius;
if (mBox[0].mY > atom.mY - inRadius)
mBox[0].mY = atom.mY - inRadius;
if (mBox[0].mZ > atom.mZ - inRadius)
mBox[0].mZ = atom.mZ - inRadius;
if (mBox[1].mX < atom.mX + inRadius)
mBox[1].mX = atom.mX + inRadius;
if (mBox[1].mY < atom.mY + inRadius)
mBox[1].mY = atom.mY + inRadius;
if (mBox[1].mZ < atom.mZ + inRadius)
mBox[1].mZ = atom.mZ + inRadius;
}
residue *mNext = nullptr;
residue *mPrev = nullptr;
// const Monomer &mM;
std::string mAsymID;
int mSeqID;
std::string mCompoundID;
std::string mAltID;
int mNumber;
bool mComplete = false;
std::string mAuthAsymID;
int mAuthSeqID;
std::string mPDBStrandID;
int mPDBSeqNum;
std::string mPDBInsCode;
point mCAlpha, mC, mN, mO, mH;
point mBox[2] = {};
float mRadius;
point mCenter;
std::vector<point> mSideChain;
double mAccessibility = 0;
double mAlpha = 360, mKappa = 360, mPhi = 360, mPsi = 360, mTCO = 0;
residue_type mType;
uint8_t mSSBridgeNr = 0;
structure_type mSecondaryStructure = structure_type::Loop;
HBond mHBondDonor[2] = {}, mHBondAcceptor[2] = {};
bridge_partner mBetaPartner[2] = {};
uint32_t mSheet = 0;
helix_position_type mHelixFlags[4] = { helix_position_type::None, helix_position_type::None, helix_position_type::None, helix_position_type::None }; //
bool mBend = false;
chain_break_type mChainBreak = chain_break_type::None;
};
// --------------------------------------------------------------------
class accumulator
{
public:
struct candidate
{
point location;
double radius;
double distance;
bool operator<(const candidate &rhs) const
{
return distance < rhs.distance;
}
};
void operator()(const point &a, const point &b, double d, double r)
{
double distance = distance_sq(a, b);
d += kRadiusWater;
r += kRadiusWater;
double test = d + r;
test *= test;
if (distance < test and distance > 0.0001)
{
candidate c = { b - a, r * r, distance };
m_x.push_back(c);
push_heap(m_x.begin(), m_x.end());
}
}
void sort()
{
sort_heap(m_x.begin(), m_x.end());
}
std::vector<candidate> m_x;
};
// we use a fibonacci sphere to calculate the even distribution of the dots
class MSurfaceDots
{
public:
static MSurfaceDots &Instance();
size_t size() const { return mPoints.size(); }
const point &operator[](size_t inIx) const { return mPoints[inIx]; }
double weight() const { return mWeight; }
private:
MSurfaceDots(int32_t inN);
std::vector<point> mPoints;
double mWeight;
};
MSurfaceDots &MSurfaceDots::Instance()
{
const int32_t kN = 200;
static MSurfaceDots sInstance(kN);
return sInstance;
}
MSurfaceDots::MSurfaceDots(int32_t N)
{
auto P = 2 * N + 1;
const float kGoldenRatio = (1 + std::sqrt(5.0f)) / 2;
mWeight = (4 * kPI) / P;
for (auto i = -N; i <= N; ++i)
{
float lat = std::asin((2.0f * i) / P);
float lon = static_cast<float>(std::fmod(i, kGoldenRatio) * 2 * kPI / kGoldenRatio);
mPoints.emplace_back(point { std::sin(lon) * std::cos(lat), std::cos(lon) * std::cos(lat), std::sin(lat) });
}
}
double residue::CalculateSurface(const point &inAtom, float inRadius, const std::vector<residue *> &inNeighbours)
{
accumulator accumulate;
for (auto r : inNeighbours)
{
if (r->AtomIntersectsBox(inAtom, inRadius))
{
accumulate(inAtom, r->mN, inRadius, kRadiusN);
accumulate(inAtom, r->mCAlpha, inRadius, kRadiusCA);
accumulate(inAtom, r->mC, inRadius, kRadiusC);
accumulate(inAtom, r->mO, inRadius, kRadiusO);
for (auto &atom : r->mSideChain)
accumulate(inAtom, atom, inRadius, kRadiusSideAtom);
}
}
accumulate.sort();
float radius = inRadius + kRadiusWater;
double surface = 0;
MSurfaceDots &surfaceDots = MSurfaceDots::Instance();
for (size_t i = 0; i < surfaceDots.size(); ++i)
{
point xx = surfaceDots[i] * radius;
bool free = true;
for (size_t k = 0; free and k < accumulate.m_x.size(); ++k)
free = accumulate.m_x[k].radius < distance_sq(xx, accumulate.m_x[k].location);
if (free)
surface += surfaceDots.weight();
}
return surface * radius * radius;
}
double residue::CalculateSurface(const std::vector<residue> &inResidues)
{
std::vector<residue *> neighbours;
for (auto &r : inResidues)
{
point center = r.mCenter;
double radius = r.mRadius;
if (distance(mCenter, center) < mRadius + radius)
neighbours.push_back(const_cast<residue *>(&r));
}
mAccessibility = CalculateSurface(mN, kRadiusN, neighbours) +
CalculateSurface(mCAlpha, kRadiusCA, neighbours) +
CalculateSurface(mC, kRadiusC, neighbours) +
CalculateSurface(mO, kRadiusO, neighbours);
for (auto &atom : mSideChain)
mAccessibility += CalculateSurface(atom, kRadiusSideAtom, neighbours);
return mAccessibility;
}
void CalculateAccessibilities(std::vector<residue> &inResidues, statistics &stats)
{
stats.accessible_surface = 0;
for (auto &residue : inResidues)
stats.accessible_surface += residue.CalculateSurface(inResidues);
}
// --------------------------------------------------------------------
// TODO: use the angle to improve bond energy calculation.
double CalculateHBondEnergy(residue &inDonor, residue &inAcceptor)
{
double result = 0;
if (inDonor.mType != kProline)
{
double distanceHO = distance(inDonor.mH, inAcceptor.mO);
double distanceHC = distance(inDonor.mH, inAcceptor.mC);
double distanceNC = distance(inDonor.mN, inAcceptor.mC);
double distanceNO = distance(inDonor.mN, inAcceptor.mO);
if (distanceHO < kMinimalDistance or distanceHC < kMinimalDistance or distanceNC < kMinimalDistance or distanceNO < kMinimalDistance)
result = kMinHBondEnergy;
else
result = kCouplingConstant / distanceHO - kCouplingConstant / distanceHC + kCouplingConstant / distanceNC - kCouplingConstant / distanceNO;
// DSSP compatibility mode:
result = std::round(result * 1000) / 1000;
if (result < kMinHBondEnergy)
result = kMinHBondEnergy;
}
// update donor
if (result < inDonor.mHBondAcceptor[0].energy)
{
inDonor.mHBondAcceptor[1] = inDonor.mHBondAcceptor[0];
inDonor.mHBondAcceptor[0].res = &inAcceptor;
inDonor.mHBondAcceptor[0].energy = result;
}
else if (result < inDonor.mHBondAcceptor[1].energy)
{
inDonor.mHBondAcceptor[1].res = &inAcceptor;
inDonor.mHBondAcceptor[1].energy = result;
}
// and acceptor
if (result < inAcceptor.mHBondDonor[0].energy)
{
inAcceptor.mHBondDonor[1] = inAcceptor.mHBondDonor[0];
inAcceptor.mHBondDonor[0].res = &inDonor;
inAcceptor.mHBondDonor[0].energy = result;
}
else if (result < inAcceptor.mHBondDonor[1].energy)
{
inAcceptor.mHBondDonor[1].res = &inDonor;
inAcceptor.mHBondDonor[1].energy = result;
}
return result;
}
// --------------------------------------------------------------------
void CalculateHBondEnergies(std::vector<residue> &inResidues)
{
// Calculate the HBond energies
for (uint32_t i = 0; i + 1 < inResidues.size(); ++i)
{
auto &ri = inResidues[i];
for (uint32_t j = i + 1; j < inResidues.size(); ++j)
{
auto &rj = inResidues[j];
if (distance(ri.mCAlpha, rj.mCAlpha) < kMinimalCADistance)
{
CalculateHBondEnergy(ri, rj);
if (j != i + 1)
CalculateHBondEnergy(rj, ri);
}
}
}
}
// --------------------------------------------------------------------
bool NoChainBreak(const residue *a, const residue *b)
{
bool result = a->mAsymID == b->mAsymID;
for (auto r = a; result and r != b; r = r->mNext)
{
auto next = r->mNext;
if (next == nullptr)
result = false;
else
result = next->mNumber == r->mNumber + 1;
}
return result;
}
bool NoChainBreak(const residue &a, const residue &b)
{
return NoChainBreak(&a, &b);
}
// --------------------------------------------------------------------
bool TestBond(const residue *a, const residue *b)
{
return (a->mHBondAcceptor[0].res == b and a->mHBondAcceptor[0].energy < kMaxHBondEnergy) or
(a->mHBondAcceptor[1].res == b and a->mHBondAcceptor[1].energy < kMaxHBondEnergy);
}
bool test_bond(DSSP::residue_info const &a, DSSP::residue_info const &b)
{
return a and b and TestBond(a.m_impl, b.m_impl);
}
// --------------------------------------------------------------------
bridge_type TestBridge(const residue &r1, const residue &r2)
{ // I. a d II. a d parallel
auto a = r1.mPrev; // \ /
auto b = &r1; // b e b e
auto c = r1.mNext; // / \ ..
auto d = r2.mPrev; // c f c f
auto e = &r2; //
auto f = r2.mNext; // III. a <- f IV. a f antiparallel
//
bridge_type result = bridge_type::None; // b e b <-> e
//
// c -> d c d
if (a and c and NoChainBreak(a, c) and d and f and NoChainBreak(d, f))
{
if ((TestBond(c, e) and TestBond(e, a)) or (TestBond(f, b) and TestBond(b, d)))
result = bridge_type::Parallel;
else if ((TestBond(c, d) and TestBond(f, a)) or (TestBond(e, b) and TestBond(b, e)))
result = bridge_type::AntiParallel;
}
return result;
}
// --------------------------------------------------------------------
// return true if any of the residues in bridge a is identical to any of the residues in bridge b
bool Linked(const bridge &a, const bridge &b)
{
return find_first_of(a.i.begin(), a.i.end(), b.i.begin(), b.i.end()) != a.i.end() or
find_first_of(a.i.begin(), a.i.end(), b.j.begin(), b.j.end()) != a.i.end() or
find_first_of(a.j.begin(), a.j.end(), b.i.begin(), b.i.end()) != a.j.end() or
find_first_of(a.j.begin(), a.j.end(), b.j.begin(), b.j.end()) != a.j.end();
}
// --------------------------------------------------------------------
void CalculateBetaSheets(std::vector<residue> &inResidues, statistics &stats)
{
// Calculate Bridges
std::vector<bridge> bridges;
for (uint32_t i = 1; i + 4 < inResidues.size(); ++i)
{
auto &ri = inResidues[i];
for (uint32_t j = i + 3; j + 1 < inResidues.size(); ++j)
{
auto &rj = inResidues[j];
bridge_type type = TestBridge(ri, rj);
if (type == bridge_type::None)
continue;
bool found = false;
for (bridge &bridge : bridges)
{
if (type != bridge.type or i != bridge.i.back() + 1)
continue;
if (type == bridge_type::Parallel and bridge.j.back() + 1 == j)
{
bridge.i.push_back(i);
bridge.j.push_back(j);
found = true;
break;
}
if (type == bridge_type::AntiParallel and bridge.j.front() - 1 == j)
{
bridge.i.push_back(i);
bridge.j.push_front(j);
found = true;
break;
}
}
if (not found)
{
bridge bridge = {};
bridge.type = type;
bridge.i.push_back(i);
bridge.chainI = ri.mAsymID;
bridge.j.push_back(j);
bridge.chainJ = rj.mAsymID;
bridges.push_back(bridge);
}
}
}
// extend ladders
std::sort(bridges.begin(), bridges.end());
for (uint32_t i = 0; i < bridges.size(); ++i)
{
for (uint32_t j = i + 1; j < bridges.size(); ++j)
{
uint32_t ibi = bridges[i].i.front();
uint32_t iei = bridges[i].i.back();
uint32_t jbi = bridges[i].j.front();
uint32_t jei = bridges[i].j.back();
uint32_t ibj = bridges[j].i.front();
uint32_t iej = bridges[j].i.back();
uint32_t jbj = bridges[j].j.front();
uint32_t jej = bridges[j].j.back();
if (bridges[i].type != bridges[j].type or
NoChainBreak(inResidues[std::min(ibi, ibj)], inResidues[std::max(iei, iej)]) == false or
NoChainBreak(inResidues[std::min(jbi, jbj)], inResidues[std::max(jei, jej)]) == false or
ibj - iei >= 6 or
(iei >= ibj and ibi <= iej))
{
continue;
}
bool bulge;
if (bridges[i].type == bridge_type::Parallel)
bulge = ((jbj - jei < 6 and ibj - iei < 3) or (jbj - jei < 3));
else
bulge = ((jbi - jej < 6 and ibj - iei < 3) or (jbi - jej < 3));
if (bulge)
{
bridges[i].i.insert(bridges[i].i.end(), bridges[j].i.begin(), bridges[j].i.end());
if (bridges[i].type == bridge_type::Parallel)
bridges[i].j.insert(bridges[i].j.end(), bridges[j].j.begin(), bridges[j].j.end());
else
bridges[i].j.insert(bridges[i].j.begin(), bridges[j].j.begin(), bridges[j].j.end());
bridges.erase(bridges.begin() + j);
--j;
}
}
}
// Sheet
std::set<bridge *> ladderset;
for (bridge &bridge : bridges)
{
ladderset.insert(&bridge);
size_t n = bridge.i.size();
if (n > kHistogramSize)
n = kHistogramSize;
if (bridge.type == bridge_type::Parallel)
stats.histogram.parallel_bridges_per_ladder[n - 1] += 1;
else
stats.histogram.antiparallel_bridges_per_ladder[n - 1] += 1;
}
uint32_t sheet = 1, ladder = 0;
while (not ladderset.empty())
{
std::set<bridge *> sheetset;
sheetset.insert(*ladderset.begin());
ladderset.erase(ladderset.begin());
bool done = false;
while (not done)
{
done = true;
for (bridge *a : sheetset)
{
for (bridge *b : ladderset)
{
if (Linked(*a, *b))
{
sheetset.insert(b);
ladderset.erase(b);
done = false;
break;
}
}
if (not done)
break;
}
}
for (bridge *bridge : sheetset)
{
bridge->ladder = ladder;
bridge->sheet = sheet;
bridge->link = sheetset;
++ladder;
}
size_t nrOfLaddersPerSheet = sheetset.size();
if (nrOfLaddersPerSheet > kHistogramSize)
nrOfLaddersPerSheet = kHistogramSize;
if (nrOfLaddersPerSheet == 1 and (*sheetset.begin())->i.size() > 1)
stats.histogram.ladders_per_sheet[0] += 1;
else if (nrOfLaddersPerSheet > 1)
stats.histogram.ladders_per_sheet[nrOfLaddersPerSheet - 1] += 1;
++sheet;
}
for (bridge &bridge : bridges)
{
// find out if any of the i and j set members already have
// a bridge assigned, if so, we're assigning bridge 2
uint32_t betai = 0, betaj = 0;
for (uint32_t l : bridge.i)
{
if (inResidues[l].GetBetaPartner(0).m_residue != nullptr)
{
betai = 1;
break;
}
}
for (uint32_t l : bridge.j)
{
if (inResidues[l].GetBetaPartner(0).m_residue != nullptr)
{
betaj = 1;
break;
}
}
structure_type ss = structure_type::Betabridge;
if (bridge.i.size() > 1)
ss = structure_type::Strand;
if (bridge.type == bridge_type::Parallel)
{
stats.count.H_bonds_in_parallel_bridges += bridge.i.back() - bridge.i.front() + 2;
std::deque<uint32_t>::iterator j = bridge.j.begin();
for (uint32_t i : bridge.i)
inResidues[i].SetBetaPartner(betai, inResidues[*j++], bridge.ladder, true);
j = bridge.i.begin();
for (uint32_t i : bridge.j)
inResidues[i].SetBetaPartner(betaj, inResidues[*j++], bridge.ladder, true);
}
else
{
stats.count.H_bonds_in_antiparallel_bridges += bridge.i.back() - bridge.i.front() + 2;
std::deque<uint32_t>::reverse_iterator j = bridge.j.rbegin();
for (uint32_t i : bridge.i)
inResidues[i].SetBetaPartner(betai, inResidues[*j++], bridge.ladder, false);
j = bridge.i.rbegin();
for (uint32_t i : bridge.j)
inResidues[i].SetBetaPartner(betaj, inResidues[*j++], bridge.ladder, false);
}
for (uint32_t i = bridge.i.front(); i <= bridge.i.back(); ++i)
{
if (inResidues[i].GetSecondaryStructure() != structure_type::Strand)
inResidues[i].SetSecondaryStructure(ss);
inResidues[i].SetSheet(bridge.sheet);
}
for (uint32_t i = bridge.j.front(); i <= bridge.j.back(); ++i)
{
if (inResidues[i].GetSecondaryStructure() != structure_type::Strand)
inResidues[i].SetSecondaryStructure(ss);
inResidues[i].SetSheet(bridge.sheet);
}
}
}
// --------------------------------------------------------------------
void CalculateAlphaHelices(std::vector<residue> &inResidues, statistics &stats, bool inPreferPiHelices = true)
{
// Helix and Turn
for (helix_type helixType : { helix_type::_3_10, helix_type::alpha, helix_type::pi })
{
uint32_t stride = static_cast<uint32_t>(helixType) + 3;
for (uint32_t i = 0; i + stride < inResidues.size(); ++i)
{
if (NoChainBreak(inResidues[i], inResidues[i + stride]) and TestBond(&inResidues[i + stride], &inResidues[i]))
{
inResidues[i + stride].SetHelixFlag(helixType, helix_position_type::End);
for (uint32_t j = i + 1; j < i + stride; ++j)
{
if (inResidues[j].GetHelixFlag(helixType) == helix_position_type::None)
inResidues[j].SetHelixFlag(helixType, helix_position_type::Middle);
}
if (inResidues[i].GetHelixFlag(helixType) == helix_position_type::End)
inResidues[i].SetHelixFlag(helixType, helix_position_type::StartAndEnd);
else
inResidues[i].SetHelixFlag(helixType, helix_position_type::Start);
}
}
}
for (auto &r : inResidues)
{
double kappa = r.mKappa;
r.SetBend(kappa != 360 and kappa > 70);
}
for (uint32_t i = 1; i + 4 < inResidues.size(); ++i)
{
if (inResidues[i].IsHelixStart(helix_type::alpha) and inResidues[i - 1].IsHelixStart(helix_type::alpha))
{
for (uint32_t j = i; j <= i + 3; ++j)
inResidues[j].SetSecondaryStructure(structure_type::Alphahelix);
}
}
for (uint32_t i = 1; i + 3 < inResidues.size(); ++i)
{
if (inResidues[i].IsHelixStart(helix_type::_3_10) and inResidues[i - 1].IsHelixStart(helix_type::_3_10))
{
bool empty = true;
for (uint32_t j = i; empty and j <= i + 2; ++j)
empty = inResidues[j].GetSecondaryStructure() == structure_type::Loop or inResidues[j].GetSecondaryStructure() == structure_type::Helix_3;
if (empty)
{
for (uint32_t j = i; j <= i + 2; ++j)
inResidues[j].SetSecondaryStructure(structure_type::Helix_3);
}
}
}
for (uint32_t i = 1; i + 5 < inResidues.size(); ++i)
{
if (inResidues[i].IsHelixStart(helix_type::pi) and inResidues[i - 1].IsHelixStart(helix_type::pi))
{
bool empty = true;
for (uint32_t j = i; empty and j <= i + 4; ++j)
empty = inResidues[j].GetSecondaryStructure() == structure_type::Loop or inResidues[j].GetSecondaryStructure() == structure_type::Helix_5 or
(inPreferPiHelices and inResidues[j].GetSecondaryStructure() == structure_type::Alphahelix);
if (empty)
{
for (uint32_t j = i; j <= i + 4; ++j)
inResidues[j].SetSecondaryStructure(structure_type::Helix_5);
}
}
}
for (uint32_t i = 1; i + 1 < inResidues.size(); ++i)
{
if (inResidues[i].GetSecondaryStructure() == structure_type::Loop)
{
bool isTurn = false;
for (helix_type helixType : { helix_type::_3_10, helix_type::alpha, helix_type::pi })
{
uint32_t stride = 3 + static_cast<uint32_t>(helixType);
for (uint32_t k = 1; k < stride and not isTurn; ++k)
isTurn = (i >= k) and inResidues[i - k].IsHelixStart(helixType);
}
if (isTurn)
inResidues[i].SetSecondaryStructure(structure_type::Turn);
else if (inResidues[i].IsBend())
inResidues[i].SetSecondaryStructure(structure_type::Bend);
}
}
std::string asym;
size_t helixLength = 0;
for (auto r : inResidues)
{
if (r.mAsymID != asym)
{
helixLength = 0;
asym = r.mAsymID;
}
if (r.GetSecondaryStructure() == structure_type::Alphahelix)
++helixLength;
else if (helixLength > 0)
{
if (helixLength > kHistogramSize)
helixLength = kHistogramSize;
stats.histogram.residues_per_alpha_helix[helixLength - 1] += 1;
helixLength = 0;
}
}
}
// --------------------------------------------------------------------
void CalculatePPHelices(std::vector<residue> &inResidues, statistics &stats, int stretch_length)
{
size_t N = inResidues.size();
const float epsilon = 29;
const float phi_min = -75 - epsilon;
const float phi_max = -75 + epsilon;
const float psi_min = 145 - epsilon;
const float psi_max = 145 + epsilon;
std::vector<float> phi(N), psi(N);
for (uint32_t i = 1; i + 1 < inResidues.size(); ++i)
{
phi[i] = inResidues[i].mPhi;
psi[i] = inResidues[i].mPsi;
}
for (uint32_t i = 1; i + 3 < inResidues.size(); ++i)
{
switch (stretch_length)
{
case 2:
{
if (phi_min > phi[i + 0] or phi[i + 0] > phi_max or
phi_min > phi[i + 1] or phi[i + 1] > phi_max)
continue;
if (psi_min > psi[i + 0] or psi[i + 0] > psi_max or
psi_min > psi[i + 1] or psi[i + 1] > psi_max)
continue;
// auto phi_avg = (phi[i + 0] + phi[i + 1]) / 2;
// auto phi_sq = (phi[i + 0] - phi_avg) * (phi[i + 0] - phi_avg) +
// (phi[i + 1] - phi_avg) * (phi[i + 1] - phi_avg);
// if (phi_sq >= 200)
// continue;
// auto psi_avg = (psi[i + 0] + psi[i + 1]) / 2;
// auto psi_sq = (psi[i + 0] - psi_avg) * (psi[i + 0] - psi_avg) +
// (psi[i + 1] - psi_avg) * (psi[i + 1] - psi_avg);
// if (psi_sq >= 200)
// continue;
switch (inResidues[i].GetHelixFlag(helix_type::pp))
{
case helix_position_type::None:
inResidues[i].SetHelixFlag(helix_type::pp, helix_position_type::Start);
break;
case helix_position_type::End:
inResidues[i].SetHelixFlag(helix_type::pp, helix_position_type::Middle);
break;
default:
break;
}
inResidues[i + 1].SetHelixFlag(helix_type::pp, helix_position_type::End);
if (inResidues[i].GetSecondaryStructure() == structure_type::Loop)
inResidues[i].SetSecondaryStructure(structure_type::Helix_PPII);
if (inResidues[i + 1].GetSecondaryStructure() == structure_type::Loop)
inResidues[i + 1].SetSecondaryStructure(structure_type::Helix_PPII);
}
break;
case 3:
{
if (phi_min > phi[i + 0] or phi[i + 0] > phi_max or
phi_min > phi[i + 1] or phi[i + 1] > phi_max or
phi_min > phi[i + 2] or phi[i + 2] > phi_max)
continue;
if (psi_min > psi[i + 0] or psi[i + 0] > psi_max or
psi_min > psi[i + 1] or psi[i + 1] > psi_max or
psi_min > psi[i + 2] or psi[i + 2] > psi_max)
continue;
// auto phi_avg = (phi[i + 0] + phi[i + 1] + phi[i + 2]) / 3;
// auto phi_sq = (phi[i + 0] - phi_avg) * (phi[i + 0] - phi_avg) +
// (phi[i + 1] - phi_avg) * (phi[i + 1] - phi_avg) +
// (phi[i + 2] - phi_avg) * (phi[i + 2] - phi_avg);
// if (phi_sq >= 300)
// continue;
// auto psi_avg = (psi[i + 0] + psi[i + 1] + psi[i + 2]) / 3;
// auto psi_sq = (psi[i + 0] - psi_avg) * (psi[i + 0] - psi_avg) +
// (psi[i + 1] - psi_avg) * (psi[i + 1] - psi_avg) +
// (psi[i + 2] - psi_avg) * (psi[i + 2] - psi_avg);
// if (psi_sq >= 300)
// continue;
switch (inResidues[i].GetHelixFlag(helix_type::pp))
{
case helix_position_type::None:
inResidues[i].SetHelixFlag(helix_type::pp, helix_position_type::Start);
break;
case helix_position_type::End:
inResidues[i].SetHelixFlag(helix_type::pp, helix_position_type::StartAndEnd);
break;
default:
break;
}
inResidues[i + 1].SetHelixFlag(helix_type::pp, helix_position_type::Middle);
inResidues[i + 2].SetHelixFlag(helix_type::pp, helix_position_type::End);
if (inResidues[i + 0].GetSecondaryStructure() == structure_type::Loop)
inResidues[i + 0].SetSecondaryStructure(structure_type::Helix_PPII);
if (inResidues[i + 1].GetSecondaryStructure() == structure_type::Loop)
inResidues[i + 1].SetSecondaryStructure(structure_type::Helix_PPII);
if (inResidues[i + 2].GetSecondaryStructure() == structure_type::Loop)
inResidues[i + 2].SetSecondaryStructure(structure_type::Helix_PPII);
break;
}
default:
throw std::runtime_error("Unsupported stretch length");
}
}
}
// --------------------------------------------------------------------
struct DSSP_impl
{
DSSP_impl(const cif::datablock &db, int model_nr, int min_poly_proline_stretch_length);
auto findRes(const std::string &asymID, int seqID)
{
return std::find_if(mResidues.begin(), mResidues.end(), [&](auto &r)
{ return r.mAsymID == asymID and r.mSeqID == seqID; });
}
void calculateSurface();
void calculateSecondaryStructure();
std::string GetPDBHEADERLine();
std::string GetPDBCOMPNDLine();
std::string GetPDBSOURCELine();
std::string GetPDBAUTHORLine();
const cif::datablock &mDB;
std::vector<residue> mResidues;
std::vector<std::pair<residue *, residue *>> mSSBonds;
int m_min_poly_proline_stretch_length;
statistics mStats = {};
};
// --------------------------------------------------------------------
DSSP_impl::DSSP_impl(const cif::datablock &db, int model_nr, int min_poly_proline_stretch_length)
: mDB(db)
, m_min_poly_proline_stretch_length(min_poly_proline_stretch_length)
{
int resNumber = 0;
auto &pdbx_poly_seq_scheme = mDB["pdbx_poly_seq_scheme"];
auto &atom_site = mDB["atom_site"];
for (auto r : pdbx_poly_seq_scheme)
{
std::string pdb_strand_id, pdb_ins_code;
int pdb_seq_num;
cif::tie(pdb_strand_id, pdb_seq_num, pdb_ins_code) = r.get("pdb_strand_id", "pdb_seq_num", "pdb_ins_code");
chain_break_type brk = chain_break_type::NewChain;
residue residue(model_nr, pdb_strand_id, pdb_seq_num, pdb_ins_code, pdbx_poly_seq_scheme.get_children(r, atom_site));
if (not residue.mComplete)
continue;
++resNumber;
if (mResidues.empty())
mStats.count.chains = 1;
else
{
if (mResidues.back().mAsymID != residue.mAsymID)
++mStats.count.chains;
if (distance(mResidues.back().mC, residue.mN) > kMaxPeptideBondLength)
{
if (mResidues.back().mAsymID == residue.mAsymID)
{
++mStats.count.chains;
brk = chain_break_type::Gap;
}
++resNumber;
}
}
residue.mChainBreak = brk;
residue.mNumber = resNumber;
mResidues.emplace_back(std::move(residue));
brk = chain_break_type::None;
}
mStats.count.residues = static_cast<uint32_t>(mResidues.size());
for (size_t i = 0; i + 1 < mResidues.size(); ++i)
{
auto &cur = mResidues[i];
auto &next = mResidues[i + 1];
next.mPrev = &cur;
cur.mNext = &next;
}
for (size_t i = 0; i < mResidues.size(); ++i)
{
auto &cur = mResidues[i];
if (i >= 2 and i + 2 < mResidues.size())
{
auto &prevPrev = mResidues[i - 2];
auto &nextNext = mResidues[i + 2];
if (NoChainBreak(prevPrev, nextNext) and prevPrev.mSeqID + 4 == nextNext.mSeqID)
{
double ckap = cosinus_angle(
cur.mCAlpha,
prevPrev.mCAlpha,
nextNext.mCAlpha,
cur.mCAlpha);
double skap = std::sqrt(1 - ckap * ckap);
cur.mKappa = std::atan2(skap, ckap) * 180 / kPI;
}
}
if (i + 1 < mResidues.size())
{
auto &next = mResidues[i + 1];
next.assignHydrogen();
if (NoChainBreak(cur, next))
cur.mPsi = dihedral_angle(cur.mN, cur.mCAlpha, cur.mC, next.mN);
}
if (i > 0)
{
auto &prev = mResidues[i - 1];
if (NoChainBreak(prev, cur))
{
cur.mTCO = cosinus_angle(cur.mC, cur.mO, prev.mC, prev.mO);
cur.mPhi = dihedral_angle(prev.mC, cur.mN, cur.mCAlpha, cur.mC);
}
}
if (i >= 1 and i + 2 < mResidues.size())
{
auto &prev = mResidues[i - 1];
auto &next = mResidues[i + 1];
auto &nextNext = mResidues[i + 2];
if (NoChainBreak(prev, nextNext))
cur.mAlpha = dihedral_angle(prev.mCAlpha, cur.mCAlpha, next.mCAlpha, nextNext.mCAlpha);
}
}
}
void DSSP_impl::calculateSecondaryStructure()
{
using namespace cif::literals;
for (auto [asym1, seq1, asym2, seq2] : mDB["struct_conn"].find<std::string, int, std::string, int>("conn_type_id"_key == "disulf",
"ptnr1_label_asym_id", "ptnr1_label_seq_id", "ptnr2_label_asym_id", "ptnr2_label_seq_id"))
{
auto r1 = findRes(asym1, seq1);
if (r1 == mResidues.end())
{
if (cif::VERBOSE > 0)
std::cerr << "Missing (incomplete?) residue for SS bond when trying to find " << asym1 << '/' << seq1 << std::endl;
continue;
// throw std::runtime_error("Invalid file, missing residue for SS bond");
}
auto r2 = findRes(asym2, seq2);
if (r2 == mResidues.end())
{
if (cif::VERBOSE > 0)
std::cerr << "Missing (incomplete?) residue for SS bond when trying to find " << asym2 << '/' << seq2 << std::endl;
continue;
// throw std::runtime_error("Invalid file, missing residue for SS bond");
}
mSSBonds.emplace_back(&*r1, &*r2);
}
CalculateHBondEnergies(mResidues);
CalculateBetaSheets(mResidues, mStats);
CalculateAlphaHelices(mResidues, mStats);
CalculatePPHelices(mResidues, mStats, m_min_poly_proline_stretch_length);
if (cif::VERBOSE > 1)
{
for (auto &r : mResidues)
{
char helix[5] = {};
for (helix_type helixType : { helix_type::_3_10, helix_type::alpha, helix_type::pi, helix_type::pp })
{
switch (r.GetHelixFlag(helixType))
{
case helix_position_type::Start: helix[static_cast<int>(helixType)] = '>'; break;
case helix_position_type::Middle: helix[static_cast<int>(helixType)] = helixType == helix_type::pp ? 'P' : '3' + static_cast<char>(helixType); break;
case helix_position_type::StartAndEnd: helix[static_cast<int>(helixType)] = 'X'; break;
case helix_position_type::End: helix[static_cast<int>(helixType)] = '<'; break;
case helix_position_type::None: helix[static_cast<int>(helixType)] = ' '; break;
}
}
auto id = r.mAsymID + ':' + std::to_string(r.mSeqID) + '/' + r.mCompoundID;
std::cerr << id << std::string(12 - id.length(), ' ')
<< char(r.mSecondaryStructure) << ' '
<< helix
<< std::endl;
}
}
// finish statistics
mStats.count.SS_bridges = static_cast<uint32_t>(mSSBonds.size());
mStats.count.intra_chain_SS_bridges = 0;
uint8_t ssBondNr = 0;
for (const auto &[a, b] : mSSBonds)
{
if (a == b)
{
if (cif::VERBOSE > 0)
std::cerr << "In the SS bonds list, the residue " << a->mAsymID << ':' << a->mSeqID << " is bonded to itself" << std::endl;
continue;
}
if (a->mAsymID == b->mAsymID and NoChainBreak(a, b))
++mStats.count.intra_chain_SS_bridges;
a->mSSBridgeNr = b->mSSBridgeNr = ++ssBondNr;
}
mStats.count.H_bonds = 0;
for (auto &r : mResidues)
{
auto donor = r.mHBondDonor;
for (int i = 0; i < 2; ++i)
{
if (donor[i].res != nullptr and donor[i].energy < kMaxHBondEnergy)
{
++mStats.count.H_bonds;
auto k = donor[i].res->mNumber - r.mNumber;
if (k >= -5 and k <= 5)
mStats.count.H_Bonds_per_distance[k + 5] += 1;
}
}
}
}
void DSSP_impl::calculateSurface()
{
CalculateAccessibilities(mResidues, mStats);
}
// --------------------------------------------------------------------
// Truncate lines in pseudo PDB format to this length
const int kTruncateAt = 127;
std::string FixStringLength(std::string s, std::string::size_type l = kTruncateAt)
{
if (s.length() > l)
s = s.substr(0, l - 4) + "... ";
else if (s.length() < l)
s.append(l - s.length(), ' ');
return s;
}
std::string cif2pdbDate(const std::string &d)
{
const std::regex rx(R"((\d{4})-(\d{2})(?:-(\d{2}))?)");
const char *kMonths[12] = {
"JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
};
std::smatch m;
std::ostringstream os;
if (std::regex_match(d, m, rx))
{
int year = std::stoi(m[1].str());
int month = std::stoi(m[2].str());
if (m[3].matched)
os << std::setw(2) << std::setfill('0') << stoi(m[3].str()) << '-';
os << kMonths[month - 1] << '-' << std::setw(2) << std::setfill('0') << (year % 100);
}
return os.str();
}
std::string cif2pdbAuth(std::string name)
{
const std::regex rx(R"(([^,]+), (\S+))");
std::smatch m;
if (std::regex_match(name, m, rx))
name = m[2].str() + m[1].str();
return name;
}
std::string DSSP_impl::GetPDBHEADERLine()
{
std::string keywords;
auto &cat1 = mDB["struct_keywords"];
for (auto r : cat1)
{
keywords = FixStringLength(r["pdbx_keywords"].as<std::string>(), 40);
break;
}
std::string date;
for (auto r : mDB["pdbx_database_status"])
{
date = r["recvd_initial_deposition_date"].as<std::string>();
if (date.empty())
continue;
date = cif2pdbDate(date);
break;
}
if (date.empty())
{
for (auto r : mDB["database_PDB_rev"])
{
date = r["date_original"].as<std::string>();
if (date.empty())
continue;
date = cif2pdbDate(date);
break;
}
}
date = FixStringLength(date, 9);
// 0 1 2 3 4 5 6 7 8
// HEADER xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxDDDDDDDDD IIII
char header[] =
"HEADER xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxDDDDDDDDD IIII";
std::copy(keywords.begin(), keywords.end(), header + 10);
std::copy(date.begin(), date.end(), header + 50);
std::string id = mDB.name();
if (id.length() < 4)
id.insert(id.end(), 4 - id.length(), ' ');
else if (id.length() > 4)
id.erase(id.begin() + 4, id.end());
std::copy(id.begin(), id.end(), header + 62);
return FixStringLength(header);
}
std::string DSSP_impl::GetPDBCOMPNDLine()
{
// COMPND
using namespace std::placeholders;
using namespace cif::literals;
int molID = 0;
std::vector<std::string> cmpnd;
for (auto r : mDB["entity"].find("type"_key == "polymer"))
{
std::string entityID = r["id"].as<std::string>();
++molID;
cmpnd.push_back("MOL_ID: " + std::to_string(molID));
std::string molecule = r["pdbx_description"].as<std::string>();
cmpnd.push_back("MOLECULE: " + molecule);
auto poly = mDB["entity_poly"].find("entity_id"_key == entityID);
if (not poly.empty())
{
std::string chains = poly.front()["pdbx_strand_id"].as<std::string>();
cif::replace_all(chains, ",", ", ");
cmpnd.push_back("CHAIN: " + chains);
}
std::string fragment = r["pdbx_fragment"].as<std::string>();
if (not fragment.empty())
cmpnd.push_back("FRAGMENT: " + fragment);
for (auto sr : mDB["entity_name_com"].find("entity_id"_key == entityID))
{
std::string syn = sr["name"].as<std::string>();
if (not syn.empty())
cmpnd.push_back("SYNONYM: " + syn);
}
std::string mutation = r["pdbx_mutation"].as<std::string>();
if (not mutation.empty())
cmpnd.push_back("MUTATION: " + mutation);
std::string ec = r["pdbx_ec"].as<std::string>();
if (not ec.empty())
cmpnd.push_back("EC: " + ec);
if (r["src_method"] == "man" or r["src_method"] == "syn")
cmpnd.push_back("ENGINEERED: YES");
std::string details = r["details"].as<std::string>();
if (not details.empty())
cmpnd.push_back("OTHER_DETAILS: " + details);
}
return FixStringLength("COMPND " + cif::join(cmpnd, "; "), kTruncateAt);
}
std::string DSSP_impl::GetPDBSOURCELine()
{
// SOURCE
using namespace cif::literals;
int molID = 0;
std::vector<std::string> source;
for (auto r : mDB["entity"])
{
if (r["type"] != "polymer")
continue;
std::string entityID = r["id"].as<std::string>();
++molID;
source.push_back("MOL_ID: " + std::to_string(molID));
if (r["src_method"] == "syn")
source.push_back("SYNTHETIC: YES");
auto &gen = mDB["entity_src_gen"];
const std::pair<const char *, const char *> kGenSourceMapping[] = {
{ "gene_src_common_name", "ORGANISM_COMMON" },
{ "pdbx_gene_src_gene", "GENE" },
{ "gene_src_strain", "STRAIN" },
{ "pdbx_gene_src_cell_line", "CELL_LINE" },
{ "pdbx_gene_src_organelle", "ORGANELLE" },
{ "pdbx_gene_src_cellular_location", "CELLULAR_LOCATION" },
{ "pdbx_gene_src_scientific_name", "ORGANISM_SCIENTIFIC" },
{ "pdbx_gene_src_ncbi_taxonomy_id", "ORGANISM_TAXID" },
{ "pdbx_host_org_scientific_name", "EXPRESSION_SYSTEM" },
{ "pdbx_host_org_ncbi_taxonomy_id", "EXPRESSION_SYSTEM_TAXID" },
{ "pdbx_host_org_strain", "EXPRESSION_SYSTEM_STRAIN" },
{ "pdbx_host_org_variant", "EXPRESSION_SYSTEM_VARIANT" },
{ "pdbx_host_org_cellular_location", "EXPRESSION_SYSTEM_CELLULAR_LOCATION" },
{ "pdbx_host_org_vector_type", "EXPRESSION_SYSTEM_VECTOR_TYPE" },
{ "pdbx_host_org_vector", "EXPRESSION_SYSTEM_VECTOR" },
{ "pdbx_host_org_gene", "EXPRESSION_SYSTEM_GENE" },
{ "plasmid_name", "EXPRESSION_SYSTEM_PLASMID" }
};
for (auto gr : gen.find("entity_id"_key == entityID))
{
for (auto m : kGenSourceMapping)
{
std::string cname, sname;
tie(cname, sname) = m;
std::string s = gr[cname].as<std::string>();
if (not s.empty())
source.push_back(sname + ": " + s);
}
}
auto &nat = mDB["entity_src_nat"];
const std::pair<const char *, const char *> kNatSourceMapping[] = {
{ "common_name", "ORGANISM_COMMON" },
{ "strain", "STRAIN" },
{ "pdbx_organism_scientific", "ORGANISM_SCIENTIFIC" },
{ "pdbx_ncbi_taxonomy_id", "ORGANISM_TAXID" },
{ "pdbx_cellular_location", "CELLULAR_LOCATION" },
{ "pdbx_plasmid_name", "PLASMID" },
{ "pdbx_organ", "ORGAN" },
{ "details", "OTHER_DETAILS" }
};
for (auto nr : nat.find("entity_id"_key == entityID))
{
for (auto m : kNatSourceMapping)
{
std::string cname, sname;
tie(cname, sname) = m;
std::string s = nr[cname].as<std::string>();
if (not s.empty())
source.push_back(sname + ": " + s);
}
}
}
return FixStringLength("SOURCE " + cif::join(source, "; "), kTruncateAt);
}
std::string DSSP_impl::GetPDBAUTHORLine()
{
// AUTHOR
std::vector<std::string> author;
for (auto r : mDB["audit_author"])
author.push_back(cif2pdbAuth(r["name"].as<std::string>()));
return FixStringLength("AUTHOR " + cif::join(author, "; "), kTruncateAt);
}
// --------------------------------------------------------------------
std::string DSSP::residue_info::asym_id() const
{
return m_impl->mAsymID;
}
std::string DSSP::residue_info::compound_id() const
{
return m_impl->mCompoundID;
}
int DSSP::residue_info::seq_id() const
{
return m_impl->mSeqID;
}
std::string DSSP::residue_info::alt_id() const
{
return m_impl->mAltID;
}
std::string DSSP::residue_info::auth_asym_id() const
{
return m_impl->mAuthAsymID;
}
int DSSP::residue_info::auth_seq_id() const
{
return m_impl->mAuthSeqID;
}
std::string DSSP::residue_info::pdb_strand_id() const
{
return m_impl->mPDBStrandID;
}
int DSSP::residue_info::pdb_seq_num() const
{
return m_impl->mPDBSeqNum;
}
std::string DSSP::residue_info::pdb_ins_code() const
{
return m_impl->mPDBInsCode;
}
float DSSP::residue_info::alpha() const
{
return m_impl->mAlpha;
}
float DSSP::residue_info::kappa() const
{
return m_impl->mKappa;
}
float DSSP::residue_info::phi() const
{
return m_impl->mPhi;
}
float DSSP::residue_info::psi() const
{
return m_impl->mPsi;
}
float DSSP::residue_info::tco() const
{
return m_impl->mTCO;
}
std::tuple<float, float, float> DSSP::residue_info::ca_location() const
{
return { m_impl->mCAlpha.mX, m_impl->mCAlpha.mY, m_impl->mCAlpha.mZ };
}
chain_break_type DSSP::residue_info::chain_break() const
{
return m_impl->mChainBreak;
}
int DSSP::residue_info::nr() const
{
return m_impl->mNumber;
}
structure_type DSSP::residue_info::type() const
{
return m_impl->mSecondaryStructure;
}
int DSSP::residue_info::ssBridgeNr() const
{
return m_impl->mSSBridgeNr;
}
helix_position_type DSSP::residue_info::helix(helix_type helixType) const
{
return m_impl->GetHelixFlag(helixType);
}
bool DSSP::residue_info::is_alpha_helix_end_before_start() const
{
bool result = false;
if (m_impl->mNext != nullptr)
result = m_impl->GetHelixFlag(helix_type::alpha) == helix_position_type::End and m_impl->mNext->GetHelixFlag(helix_type::alpha) == helix_position_type::Start;
return result;
}
bool DSSP::residue_info::bend() const
{
return m_impl->IsBend();
}
double DSSP::residue_info::accessibility() const
{
return m_impl->mAccessibility;
}
std::tuple<DSSP::residue_info, int, bool> DSSP::residue_info::bridge_partner(int i) const
{
auto bp = m_impl->GetBetaPartner(i);
residue_info ri(bp.m_residue);
return std::make_tuple(std::move(ri), bp.ladder, bp.parallel);
}
int DSSP::residue_info::sheet() const
{
return m_impl->GetSheet();
}
std::tuple<DSSP::residue_info, double> DSSP::residue_info::acceptor(int i) const
{
auto &a = m_impl->mHBondAcceptor[i];
return { residue_info(a.res), a.energy };
}
std::tuple<DSSP::residue_info, double> DSSP::residue_info::donor(int i) const
{
auto &d = m_impl->mHBondDonor[i];
return { residue_info(d.res), d.energy };
}
// --------------------------------------------------------------------
DSSP::iterator::iterator(residue *res)
: m_current(res)
{
}
DSSP::iterator &DSSP::iterator::operator++()
{
++m_current.m_impl;
return *this;
}
DSSP::iterator &DSSP::iterator::operator--()
{
--m_current.m_impl;
return *this;
}
// --------------------------------------------------------------------
DSSP::DSSP(const cif::datablock &db, int model_nr, int min_poly_proline_stretch, bool calculateSurfaceAccessibility)
: m_impl(new DSSP_impl(db, model_nr, min_poly_proline_stretch))
{
if (calculateSurfaceAccessibility)
{
std::thread t(std::bind(&DSSP_impl::calculateSurface, m_impl));
m_impl->calculateSecondaryStructure();
t.join();
}
else
m_impl->calculateSecondaryStructure();
}
DSSP::~DSSP()
{
delete m_impl;
}
DSSP::iterator DSSP::begin() const
{
return iterator(m_impl->mResidues.empty() ? nullptr : m_impl->mResidues.data());
}
DSSP::iterator DSSP::end() const
{
// careful now, MSVC is picky when it comes to dereferencing iterators that are at the end.
residue *res = nullptr;
if (not m_impl->mResidues.empty())
{
res = m_impl->mResidues.data();
res += m_impl->mResidues.size();
}
return iterator(res);
}
statistics DSSP::get_statistics() const
{
return m_impl->mStats;
}
std::string DSSP::get_pdb_header_line(pdb_record_type pdb_record) const
{
switch (pdb_record)
{
case pdb_record_type::HEADER:
return m_impl->GetPDBHEADERLine();
case pdb_record_type::COMPND:
return m_impl->GetPDBCOMPNDLine();
case pdb_record_type::SOURCE:
return m_impl->GetPDBSOURCELine();
case pdb_record_type::AUTHOR:
return m_impl->GetPDBAUTHORLine();
default:
return {};
}
}
} // namespace dssp
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/// \file DSSP.hpp
/// Calculate DSSP-like secondary structure information.
#pragma once
#include <cif++/cif.hpp>
namespace dssp
{
struct residue;
enum class structure_type : char
{
Loop = ' ',
Alphahelix = 'H',
Betabridge = 'B',
Strand = 'E',
Helix_3 = 'G',
Helix_5 = 'I',
Helix_PPII = 'P',
Turn = 'T',
Bend = 'S'
};
enum class helix_type
{
_3_10,
alpha,
pi,
pp
};
enum class helix_position_type
{
None,
Start,
End,
StartAndEnd,
Middle
};
const size_t
kHistogramSize = 30;
struct statistics
{
struct
{
uint32_t residues, chains, SS_bridges, intra_chain_SS_bridges, H_bonds;
uint32_t H_bonds_in_antiparallel_bridges, H_bonds_in_parallel_bridges;
uint32_t H_Bonds_per_distance[11];
} count;
double accessible_surface;
struct
{
uint32_t residues_per_alpha_helix[kHistogramSize];
uint32_t parallel_bridges_per_ladder[kHistogramSize];
uint32_t antiparallel_bridges_per_ladder[kHistogramSize];
uint32_t ladders_per_sheet[kHistogramSize];
} histogram;
};
enum class chain_break_type
{
None,
NewChain,
Gap
};
class DSSP
{
public:
DSSP(const cif::datablock &db, int model_nr, int min_poly_proline_stretch_length, bool calculateSurfaceAccessibility);
~DSSP();
DSSP(const DSSP &) = delete;
DSSP &operator=(const DSSP &) = delete;
statistics get_statistics() const;
class iterator;
using res_iterator = typename std::vector<residue>::iterator;
class residue_info
{
public:
friend class iterator;
residue_info() = default;
residue_info(const residue_info &rhs) = default;
residue_info &operator=(const residue_info &rhs) = default;
explicit operator bool() const { return not empty(); }
bool empty() const { return m_impl == nullptr; }
std::string asym_id() const;
int seq_id() const;
std::string alt_id() const;
std::string compound_id() const;
std::string auth_asym_id() const;
int auth_seq_id() const;
std::string pdb_strand_id() const;
int pdb_seq_num() const;
std::string pdb_ins_code() const;
float alpha() const;
float kappa() const;
float phi() const;
float psi() const;
float tco() const;
std::tuple<float, float, float> ca_location() const;
chain_break_type chain_break() const;
/// \brief the internal number in DSSP
int nr() const;
structure_type type() const;
int ssBridgeNr() const;
helix_position_type helix(helix_type helixType) const;
bool is_alpha_helix_end_before_start() const;
bool bend() const;
double accessibility() const;
/// \brief returns resinfo, ladder and parallel
std::tuple<residue_info, int, bool> bridge_partner(int i) const;
int sheet() const;
/// \brief return resinfo and the energy of the bond
std::tuple<residue_info, double> acceptor(int i) const;
std::tuple<residue_info, double> donor(int i) const;
/// \brief Simple compare equals
bool operator==(const residue_info &rhs) const
{
return m_impl == rhs.m_impl;
}
/// \brief Returns \result true if there is a bond between two residues
friend bool test_bond(residue_info const &a, residue_info const &b);
private:
residue_info(residue *res)
: m_impl(res)
{
}
residue *m_impl = nullptr;
};
class iterator
{
public:
using iterator_category = std::bidirectional_iterator_tag;
using value_type = residue_info;
using difference_type = std::ptrdiff_t;
using pointer = value_type *;
using reference = value_type &;
iterator(const iterator &i) = default;
iterator(residue *res);
iterator &operator=(const iterator &i) = default;
reference operator*() { return m_current; }
pointer operator->() { return &m_current; }
iterator &operator++();
iterator operator++(int)
{
auto tmp(*this);
this->operator++();
return tmp;
}
iterator &operator--();
iterator operator--(int)
{
auto tmp(*this);
this->operator--();
return tmp;
}
bool operator==(const iterator &rhs) const { return m_current.m_impl == rhs.m_current.m_impl; }
bool operator!=(const iterator &rhs) const { return m_current.m_impl != rhs.m_current.m_impl; }
private:
residue_info m_current;
};
using value_type = residue_info;
// To access residue info by key, i.e. LabelAsymID and LabelSeqID
using key_type = std::tuple<std::string,int>;
iterator begin() const;
iterator end() const;
residue_info operator[](const key_type &key) const;
bool empty() const { return begin() == end(); }
// convenience method, when creating old style DSSP files
enum class pdb_record_type { HEADER, COMPND, SOURCE, AUTHOR };
std::string get_pdb_header_line(pdb_record_type pdb_record) const;
private:
struct DSSP_impl *m_impl;
};
} // namespace dssp
......@@ -39,7 +39,7 @@
#include <cif++/structure/Compound.hpp>
#include <cif++/utilities.hpp>
#include "dssp.hpp"
#include "dssp_wrapper.hpp"
#include "revision.hpp"
// --------------------------------------------------------------------
......
......@@ -38,12 +38,11 @@
#include <boost/format.hpp>
#include <boost/date_time/gregorian/formatters.hpp>
#include <cif++/dssp/DSSP.hpp>
// #include <cif++/structure/Compound.hpp>
#include "DSSP.hpp"
#include <boost/program_options.hpp>
#include "dssp.hpp"
#include "dssp_wrapper.hpp"
#include "revision.hpp"
namespace fs = std::filesystem;
......
......@@ -30,9 +30,9 @@
#include <stdexcept>
#include <cif++/dssp/DSSP.hpp>
#include "DSSP.hpp"
#include "dssp.hpp"
#include "dssp_wrapper.hpp"
namespace ba = boost::algorithm;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment