Commit ca881b82 by maarten

reshuffled files

git-svn-id: svn+ssh://gitlab/srv/svn-repos/pdb-redo/trunk@169 a1961a4f-ab94-4bcc-80e8-33b5a54de466
parent 58666318
// Lib for working with structures as contained in mmCIF and PDB files
#pragma once
#include "libcif/config.h"
#include <boost/filesystem/operations.hpp>
#include <boost/math/quaternion.hpp>
namespace libcif
{
// Chemical element identifiers. Each enumerator's value is the atomic number
// of the element named in its comment; Nn (0) is the placeholder for an
// unknown element. Note that the enumerators are not listed in ascending
// order: Ce..Lu (58-71) and Th..Lr (90-103) follow Og (118).
enum atom_type : uint8
{
Nn = 0, // Unknown
H = 1, // Hydrogen
He = 2, // Helium
Li = 3, // Lithium
Be = 4, // Beryllium
B = 5, // Boron
C = 6, // Carbon
N = 7, // Nitrogen
O = 8, // Oxygen
F = 9, // Fluorine
Ne = 10, // Neon
Na = 11, // Sodium
Mg = 12, // Magnesium
Al = 13, // Aluminium
Si = 14, // Silicon
P = 15, // Phosphorus
S = 16, // Sulfur
Cl = 17, // Chlorine
Ar = 18, // Argon
K = 19, // Potassium
Ca = 20, // Calcium
Sc = 21, // Scandium
Ti = 22, // Titanium
V = 23, // Vanadium
Cr = 24, // Chromium
Mn = 25, // Manganese
Fe = 26, // Iron
Co = 27, // Cobalt
Ni = 28, // Nickel
Cu = 29, // Copper
Zn = 30, // Zinc
Ga = 31, // Gallium
Ge = 32, // Germanium
As = 33, // Arsenic
Se = 34, // Selenium
Br = 35, // Bromine
Kr = 36, // Krypton
Rb = 37, // Rubidium
Sr = 38, // Strontium
Y = 39, // Yttrium
Zr = 40, // Zirconium
Nb = 41, // Niobium
Mo = 42, // Molybdenum
Tc = 43, // Technetium
Ru = 44, // Ruthenium
Rh = 45, // Rhodium
Pd = 46, // Palladium
Ag = 47, // Silver
Cd = 48, // Cadmium
In = 49, // Indium
Sn = 50, // Tin
Sb = 51, // Antimony
Te = 52, // Tellurium
I = 53, // Iodine
Xe = 54, // Xenon
Cs = 55, // Caesium
Ba = 56, // Barium
La = 57, // Lanthanum
Hf = 72, // Hafnium
Ta = 73, // Tantalum
W = 74, // Tungsten
Re = 75, // Rhenium
Os = 76, // Osmium
Ir = 77, // Iridium
Pt = 78, // Platinum
Au = 79, // Gold
Hg = 80, // Mercury
Tl = 81, // Thallium
Pb = 82, // Lead
Bi = 83, // Bismuth
Po = 84, // Polonium
At = 85, // Astatine
Rn = 86, // Radon
Fr = 87, // Francium
Ra = 88, // Radium
Ac = 89, // Actinium
Rf = 104, // Rutherfordium
Db = 105, // Dubnium
Sg = 106, // Seaborgium
Bh = 107, // Bohrium
Hs = 108, // Hassium
Mt = 109, // Meitnerium
Ds = 110, // Darmstadtium
Rg = 111, // Roentgenium
Cn = 112, // Copernicium
Nh = 113, // Nihonium
Fl = 114, // Flerovium
Mc = 115, // Moscovium
Lv = 116, // Livermorium
Ts = 117, // Tennessine
Og = 118, // Oganesson
Ce = 58, // Cerium
Pr = 59, // Praseodymium
Nd = 60, // Neodymium
Pm = 61, // Promethium
Sm = 62, // Samarium
Eu = 63, // Europium
Gd = 64, // Gadolinium
Tb = 65, // Terbium
Dy = 66, // Dysprosium
Ho = 67, // Holmium
Er = 68, // Erbium
Tm = 69, // Thulium
Yb = 70, // Ytterbium
Lu = 71, // Lutetium
Th = 90, // Thorium
Pa = 91, // Protactinium
U = 92, // Uranium
Np = 93, // Neptunium
Pu = 94, // Plutonium
Am = 95, // Americium
Cm = 96, // Curium
Bk = 97, // Berkelium
Cf = 98, // Californium
Es = 99, // Einsteinium
Fm = 100, // Fermium
Md = 101, // Mendelevium
No = 102, // Nobelium
Lr = 103, // Lawrencium
};
// --------------------------------------------------------------------
// atom_type_info
// The kinds of atomic radius stored per element. The enumerators double as
// indices into atom_type_info::radii, so they must stay contiguous from 0.
enum radius_type
{
    eRadiusCalculated = 0,
    eRadiusEmpirical = 1,
    eRadiusCovalentEmpirical = 2,
    eRadiusSingleBond = 3,
    eRadiusDoubleBond = 4,
    eRadiusTripleBond = 5,
    eRadiusVanderWaals = 6,

    // Not a radius kind: the number of kinds above (keep it last).
    eRadiusTypeCount
};
// Static description of one element; kKnownAtoms is declared as an array of
// these. Member order matters for any aggregate initialisation of that table.
struct atom_type_info
{
// the element this record describes
atom_type type;
// element name
std::string name;
// element symbol
std::string symbol;
// atomic weight (NOTE(review): unit not visible here, presumably Dalton)
float weight;
// true when the element is flagged as a metal
bool metal;
// one radius per radius_type, indexed by the enumerator value;
// atom_type_traits::radius() divides the stored value by 100 before returning it
float radii[eRadiusTypeCount];
};
extern const atom_type_info kKnownAtoms[];
// --------------------------------------------------------------------
// atom_type_traits
class atom_type_traits
{
public:
atom_type_traits(atom_type a);
atom_type_traits(const std::string& symbol);
atom_type type() const { return m_info->type; }
std::string name() const { return m_info->name; }
std::string symbol() const { return m_info->symbol; }
float weight() const { return m_info->weight; }
bool is_metal() const { return m_info->metal; }
static bool is_element(const std::string& symbol);
static bool is_metal(const std::string& symbol);
float radius(radius_type type = eRadiusSingleBond) const
{
if (type >= eRadiusTypeCount)
throw std::invalid_argument("invalid radius requested");
return m_info->radii[type] / 100.f;
}
private:
const struct atom_type_info* m_info;
};
}
// cif parsing library
#pragma once
#include "libcif/config.h"
#include <regex>
#include <iostream>
#include <set>
#include <boost/lexical_cast.hpp>
#include <boost/any.hpp>
#include "cif-utils.h"
extern int VERBOSE;
/*
Simple C++ interface to CIF files.
Assumptions: a file contains one or more datablocks modelled by the class datablock.
Each datablock contains categories. These map to the original tables used to fill
the mmCIF file. Each category can contain multiple items, the columns in the table.
Values are stored as character strings internally.
Synopsis:
// create a cif file
cif::datablock e("1MVE");
e.append(cif::category{"_entry", { "id", "1MVE" } });
cif::category atom_site("atom_site");
size_t nr{};
for (auto& my_atom: atoms)
{
atom_site.push_back({
{ "group_PDB", "ATOM" },
{ "id", ++nr },
{ "type_symbol", my_atom.type.str() },
...
});
}
e.append(move(atom_site));
cif::file f;
f.append(e);
ofstream os("1mve.cif");
f.write(os);
// read
f.read(ifstream{"1mve.cif"});
auto& e = f.first_datablock();
cout << "ID of datablock: " << e.id() << endl;
auto& atoms = e["atom_site"];
for (auto& atom: atoms)
{
cout << atom["group_PDB"] << ", "
<< atom["id"] << ", "
...
float x, y, z;
cif::tie(x, y, z) = atom.get("Cartn_x", "Cartn_y", "Cartn_z");
...
}
Another way of querying a category is by using this construct:
auto& cat = e["atom_site"];
auto rows = cat.find(key("label_asym_id") == "A" and key("label_seq_id") == 1);
*/
namespace cif
{
using std::string;
using std::vector;
// mmCIF mapping
// A CIF data file in this case contains entries (data blocks) which can contain
// one or more category objects. Each category object contains arrays of items.
// Better, you can consider the categories as tables containing columns which
// are the items.
class file;
class datablock;
class category;
class row; // a flyweight class that references data in categories
class item;
class validator;
struct validate_item;
struct validate_category;
struct item_column;
struct item_row;
struct item_value;
// --------------------------------------------------------------------
// class item
//
// This class is only transient, it is used to construct new rows.
// Access to already stored data is through an item_reference object.
// A transient name/value pair used to construct new rows. The value is
// always kept as a string; the templated constructor (defined out of line)
// converts other types to text.
class item
{
  public:
    typedef enum { not_applicable, not_defined, text, number } item_content_type;

    item() {}

    // Builds an item called name whose value is the textual form of value.
    template<typename T>
    item(const std::string& name, const T& value);

    // Member-wise copies are exactly what the defaults provide.
    item(const item& rhs) = default;
    item& operator=(const item& rhs) = default;

    // Moves are noexcept so that containers such as std::vector<item> move
    // (rather than copy) their elements when they reallocate.
    item(item&& rhs) noexcept
        : m_name(std::move(rhs.m_name))
        , m_value(std::move(rhs.m_value))
    {
    }

    item& operator=(item&& rhs) noexcept
    {
        // self-move is kept as a no-op
        if (this != &rhs)
        {
            m_name = std::move(rhs.m_name);
            m_value = std::move(rhs.m_value);
        }
        return *this;
    }

    const std::string& name() const { return m_name; }
    const std::string& value() const { return m_value; }

    // Replaces the stored value.
    void value(const std::string& v) { m_value = v; }

    // The following all refer to the value string, not the name.
    bool empty() const { return m_value.empty(); }
    size_t length() const { return m_value.length(); }
    const char* c_str() const { return m_value.c_str(); }

  private:
    std::string m_name;
    std::string m_value;
};
// Generic constructor: the value is stored as the text produced by
// boost::lexical_cast.
template<typename T>
inline item::item(const string& name, const T& value)
    : m_name(name)
    , m_value(boost::lexical_cast<string>(value))
{
}
// String values are stored as they are, without going through lexical_cast.
template<>
inline item::item(const string& name, const string& value)
    : m_name(name)
    , m_value(value)
{
}
// --------------------------------------------------------------------
// class datablock acts as an STL container for category objects
// A named block of a CIF file. It holds its categories in a std::list and
// exposes them through begin()/end(), so it can be iterated like an STL
// container. Copying is disabled. class file is a friend and is the only
// outside user of the private write() overloads and of m_next.
class datablock
{
public:
friend class file;
typedef std::list<category> category_list;
typedef category_list::iterator iterator;
typedef category_list::const_iterator const_iterator;
datablock(const string& name);
~datablock();
datablock(const datablock&) = delete;
datablock& operator=(const datablock&) = delete;
// name of this datablock (returned by value)
string name() const { return m_name; }
void set_name(const string& n) { m_name = n; }
// NOTE(review): presumably returns the first value stored for the given tag; confirm in the implementation
string first_item(const string& tag) const;
iterator begin() { return m_categories.begin(); }
iterator end() { return m_categories.end(); }
const_iterator begin() const { return m_categories.begin(); }
const_iterator end() const { return m_categories.end(); }
// access a category by name (returns a reference, unlike get() below)
category& operator[](const string& name);
// returns an iterator to a category plus a bool; NOTE(review): the bool presumably tells whether the category was newly created
std::tuple<iterator,bool> emplace(const std::string& name);
void validate();
void set_validator(validator* v);
// this one only looks up a category, returns nullptr if it does not exist
category* get(const string& name);
// fills tags; the exact ordering is decided by the implementation
void get_tag_order(vector<string>& tags) const;
private:
void write(std::ostream& os);
void write(std::ostream& os, const vector<string>& order);
std::list<category> m_categories;
string m_name;
// not initialised here; set by the constructor or set_validator()
validator* m_validator;
// NOTE(review): looks like the link to the next datablock of the owning file; confirm
datablock* m_next;
};
// --------------------------------------------------------------------
// class row acts as a container for item objects, It has a more useful
// interface for accessing the contained columns. The get() method
// returns a row_result object that can be used to access only a subset
// of column values by index or by name.
namespace detail
{
// item_reference is a helper class
// item_reference is a helper class: a small handle naming one item (column)
// of one stored row. It is an aggregate (created with brace initialisation),
// so the two data members must stay first and in this order.
struct item_reference
{
    const char* m_name;
    item_row* m_row;

    // Stores the textual form of value; the string overload is specialised
    // out of line and does the actual work.
    template<typename T>
    item_reference& operator=(const T& value)
    {
        this->operator=(boost::lexical_cast<string>(value));
        return *this;
    }

//  operator string() const { return c_str(); }

    // Converts the stored text to T. An empty item yields T initialised
    // from 0; a non-empty item that cannot be converted makes
    // boost::lexical_cast throw.
    template<typename T>
    T as() const
    {
        T result = 0;
        if (not empty())
            result = boost::lexical_cast<T>(c_str());
        return result;
    }

    // Three-way comparison of the stored value, converted to T, with value.
    // Returns -1, 0 or 1. When the conversion fails the result is 1, and a
    // message is written to std::cerr if VERBOSE is set.
    template<typename T>
    int compare(const T& value) const
    {
        int result = 0;
        try
        {
            // Compare as T itself. Going through double, as this code used
            // to, loses precision for 64-bit integers beyond 2^53 and could
            // report distinct keys as equal.
            const T v = boost::lexical_cast<T>(c_str());
            if (v < value)
                result = -1;
            else if (v > value)
                result = 1;
        }
        catch (...)
        {
            if (VERBOSE)
                std::cerr << "conversion error in compare for '" << c_str() << '\'' << std::endl;
            result = 1;
        }
        return result;
    }

    bool empty() const;
//  bool unapplicable() const;
    const char* c_str() const;

    // Exact (case-sensitive) comparison with the stored text.
    bool operator!=(const string& s) const { return s != c_str(); }
    bool operator==(const string& s) const { return s == c_str(); }
};
// Strings need no conversion: copy the stored text.
template<>
inline string item_reference::as<string>() const
{
    const char* text = c_str();
    return string(text);
}
// Hands out the pointer returned by c_str() directly, without copying.
template<>
inline const char* item_reference::as<const char*>() const
{
    const char* text = c_str();
    return text;
}
// Text is compared with icompare rather than by numeric conversion.
template<>
inline int item_reference::compare<string>(const string& value) const
{
    const char* stored = c_str();
    return icompare(stored, value.c_str());
}
// C-string flavour of the text comparison, also using icompare.
template<>
inline int item_reference::compare(const char* const& value) const
{
    const char* stored = c_str();
    return cif::icompare(stored, value);
}
// Streams the stored text of the referenced item.
inline std::ostream& operator<<(std::ostream& os, const item_reference& rhs)
{
    return os << rhs.c_str();
}
// Declaration only: the string overload of the assignment is specialised
// and defined out of line; the generic operator= forwards to it.
template<>
item_reference& item_reference::operator=(const string& value);
// some helper classes to help create tuple result types
// Compile-time concatenation of std::tuple types:
// tuple_catter<tuple<A>, tuple<B, C>, ...>::type is tuple<A, B, C, ...>.
template<typename...>
struct tuple_catter;

// End of the recursion: a single tuple is its own result.
template<typename... Ts>
struct tuple_catter<std::tuple<Ts...>>
{
    using type = std::tuple<Ts...>;
};

// Merge the first two tuples and recurse on what is left.
template<typename... T1s, typename... T2s, typename... Rem>
struct tuple_catter<std::tuple<T1s...>, std::tuple<T2s...>, Rem...>
{
    using type = typename tuple_catter<std::tuple<T1s..., T2s...>, Rem...>::type;
};
// Builds a tuple holding one item_reference per requested column. Res is
// expected to provide a constant N (the number of columns) and an
// operator[](size_t) yielding an item_reference.
template<typename...>
struct col_getter;

// Last column: index N - 1.
template<typename T>
struct col_getter<T>
{
    using type = std::tuple<const item_reference>;

    template<typename Res>
    static type get(Res& rs)
    {
        size_t column = Res::N - 1;
        return type{ rs[column] };
    }
};

// Column with sizeof...(Ts) columns still to its right.
template<typename T, typename... Ts>
struct col_getter<T, Ts...>
{
    using next = col_getter<Ts...>;
    using type = typename tuple_catter<std::tuple<const item_reference>, typename next::type>::type;

    template<typename Res>
    static type get(Res& rs)
    {
        size_t column = Res::N - 1 - sizeof...(Ts);
        std::tuple<const item_reference> head{ rs[column] };
        return std::tuple_cat(head, next::get(rs));
    }
};
// Result of row::get(): remembers the row and the requested column names so
// the values can be fetched by position afterwards.
template<typename... C>
struct get_row_result
{
    enum { N = sizeof...(C) };      // number of requested columns

    using tuple_type = typename col_getter<C...>::type;

    get_row_result(row& r, C... columns)
        : m_row(r)
        , m_columns({{columns...}})
    {
    }

//  const item_reference operator[](const string& col) const
//  {
//      return m_row[col];
//  }

    // Value of the ix-th requested column.
    const item_reference operator[](size_t ix) const
    {
        const char* column = m_columns[ix];
        return m_row[column];
    }

    row& m_row;
    std::array<const char*, N> m_columns;
};
// we want to be able to tie some variables to a row_result, for this we use tiewraps
template<int IX, typename... Ts>
struct tie_wrap;
template<int IX, typename T>
struct tie_wrap<IX,T>
{
tie_wrap(T& t)
: m_val(t) {}
template<typename Res>
void operator=(const Res& rr)
{
typedef typename std::remove_reference<T>::type basic_type;
const item_reference v = rr[IX];
basic_type tv = v.as<basic_type>();
m_val = tv;
}
T& m_val;
};
template<int IX, typename T, typename... Ts>
struct tie_wrap<IX, T, Ts...>
{
typedef tie_wrap<IX + 1, Ts...> next;
tie_wrap(T& t, Ts&... ts)
: m_val(t), m_next(ts...) {}
template<typename Res>
void operator=(const Res& rr)
{
typedef typename std::remove_reference<T>::type basic_type;
const item_reference v = rr[IX];
basic_type tv = v.as<basic_type>();
m_val = tv;
m_next.operator=(rr);
}
T& m_val;
next m_next;
};
}
// Binds the given variables so that a row result can be assigned to them:
//   cif::tie(x, y, z) = row.get("Cartn_x", "Cartn_y", "Cartn_z");
template<typename... Ts>
detail::tie_wrap<0, Ts...> tie(Ts&... v)
{
    typedef detail::tie_wrap<0, Ts...> wrapper_type;
    return wrapper_type(v...);
}
// A lightweight handle on one stored row: the only data member is a pointer
// to the underlying item_row, so copying or swapping a row copies or swaps
// that pointer. Items are reached by tag through operator[] or, several at
// a time, through get().
class row
{
public:
friend class category;
friend class cat_index;
friend class row_comparator;
friend struct detail::item_reference;
// a default constructed row refers to nothing and converts to false
row(item_row* data = nullptr) : m_data(data) {}
row(const row& rhs);
row& operator=(const row& rhs);
// Forward iterator over the items of a row. Dereferencing yields m_current,
// an item object owned by the iterator itself.
// NOTE(review): the base class is declared with 'const item' but base_type
// (from which pointer and reference are taken) uses plain 'item', so this
// const_iterator hands out non-const references -- confirm this is intended.
struct const_iterator : public std::iterator<std::forward_iterator_tag, const item>
{
typedef std::iterator<std::forward_iterator_tag, item> base_type;
typedef typename base_type::pointer pointer;
typedef typename base_type::reference reference;
const_iterator(item_row* data, item_value* ptr);
reference operator*() { return m_current; }
pointer operator->() { return &m_current; }
const_iterator& operator++();
const_iterator operator++(int) { const_iterator result(*this); this->operator++(); return result; }
// iterators compare equal when they point at the same item_value
bool operator==(const const_iterator& rhs) const { return m_ptr == rhs.m_ptr; }
bool operator!=(const const_iterator& rhs) const { return m_ptr != rhs.m_ptr; }
private:
void fetch();
item_row* m_data;
item_value* m_ptr;
item m_current;
};
// checks for an initialized row:
operator bool() const { return m_data != nullptr; }
bool empty() const;
const_iterator begin() const;
const_iterator end() const;
// TODO: implement real const version?
// All four operator[] overloads build an item_reference from the tag and
// this row's data pointer. The reference keeps the tag as a raw const char*,
// so the tag must outlive the returned item_reference.
const detail::item_reference operator[](const char* item_tag) const
{
return detail::item_reference{item_tag, m_data};
}
detail::item_reference operator[](const char* item_tag)
{
return detail::item_reference{item_tag, m_data};
}
const detail::item_reference operator[](const string& item_tag) const
{
return detail::item_reference{item_tag.c_str(), m_data};
}
detail::item_reference operator[](const string& item_tag)
{
return detail::item_reference{item_tag.c_str(), m_data};
}
// returns a result object for the given columns, typically assigned to cif::tie(...)
template<typename... C>
auto get(C... columns) -> detail::get_row_result<C...>
{
return detail::get_row_result<C...>(*this, columns...);
}
// two rows are equal when they refer to the same item_row
bool operator==(const row& rhs) const
{
return m_data == rhs.m_data;
}
item_row* data() const { return m_data; }
void swap(row& rhs)
{
std::swap(m_data, rhs.m_data);
}
private:
void assign(const string& name, const string& value, bool emplacing);
void assign(const item& i, bool emplacing);
item_row* m_data;
};
// swap for rows is defined below
// --------------------------------------------------------------------
// some more templates to be able to do querying
namespace detail
{
// Abstract base of every query predicate.
struct condition_impl
{
    virtual ~condition_impl() {}

    // true when row r, which belongs to category c, satisfies the predicate
    virtual bool test(const category& c, const row& r) const = 0;

    // human readable rendering of the predicate
    virtual std::string str() const = 0;
};
}
// Move-only owner of a detail::condition_impl. A condition is what the
// key/any operators produce and what category::find() and friends consume.
// After being moved from, m_impl is null and the object may only be
// destroyed or assigned to.
struct condition
{
    // Takes ownership of impl; it is deleted by the destructor.
    condition(detail::condition_impl* impl) : m_impl(impl) {}

    condition(const condition&) = delete;
    condition& operator=(const condition&) = delete;

    // Moves are noexcept so conditions can be stored in standard containers
    // without falling back to the (deleted) copy operations.
    condition(condition&& rhs) noexcept
        : m_impl(rhs.m_impl)
    {
        rhs.m_impl = nullptr;
    }

    condition& operator=(condition&& rhs) noexcept
    {
        // rhs ends up with our previous impl and releases it on destruction
        std::swap(m_impl, rhs.m_impl);
        return *this;
    }

    ~condition()
    {
        delete m_impl;
    }

    // Evaluates the predicate for row r of category c.
    bool operator()(const category& c, const row& r) const
    {
        assert(m_impl);
        return m_impl->test(c, r);
    }

    // Textual form of the predicate. Checked the same way as operator():
    // calling it on a moved-from condition is a programming error.
    std::string str() const
    {
        assert(m_impl);
        return m_impl->str();
    }

    detail::condition_impl* m_impl;
};
namespace detail
{
// Predicate: the item called m_item_tag compares equal to m_value.
template<typename T>
struct key_is_condition_impl : public condition_impl
{
    using value_type = T;

    key_is_condition_impl(const string& item_tag, const value_type& value)
        : m_item_tag(item_tag)
        , m_value(value)
    {
    }

    // the category argument is not used by this predicate
    virtual bool test(const category& c, const row& r) const
    {
        const int order = r[m_item_tag].template compare<value_type>(m_value);
        return order == 0;
    }

    virtual std::string str() const
    {
        std::string text = m_item_tag;
        text += " == ";
        text += boost::lexical_cast<std::string>(m_value);
        return text;
    }

    string m_item_tag;
    value_type m_value;
};
// Predicate: the item called m_item_tag does not compare equal to m_value.
template<typename T>
struct key_is_not_condition_impl : public condition_impl
{
    using value_type = T;

    key_is_not_condition_impl(const string& item_tag, const value_type& value)
        : m_item_tag(item_tag)
        , m_value(value)
    {
    }

    // the category argument is not used by this predicate
    virtual bool test(const category& c, const row& r) const
    {
        const int order = r[m_item_tag].template compare<value_type>(m_value);
        return order != 0;
    }

    virtual std::string str() const
    {
        std::string text = m_item_tag;
        text += " != ";
        text += boost::lexical_cast<std::string>(m_value);
        return text;
    }

    string m_item_tag;
    value_type m_value;
};
// Predicate that delegates the decision to a callable taking
// (const category&, const row&). The tag is kept only for str().
template<typename COMP>
struct key_compare_condition_impl : public condition_impl
{
    key_compare_condition_impl(const string& item_tag, COMP&& comp)
        : m_item_tag(item_tag)
        , m_comp(std::move(comp))
    {
    }

    virtual bool test(const category& c, const row& r) const
    {
        const bool accepted = m_comp(c, r);
        return accepted;
    }

    // the compared value is not part of the text
    virtual std::string str() const
    {
        std::string text = m_item_tag;
        text += " compare ";
        return text;
    }

    string m_item_tag;
    COMP m_comp;
};
struct key_matches_condition_impl : public condition_impl
{
key_matches_condition_impl(const string& item_tag, const std::regex& rx)
: m_item_tag(item_tag), m_rx(rx) {}
virtual bool test(const category& c, const row& r) const
{
return std::regex_match(r[m_item_tag].as<string>(), m_rx);
}
virtual std::string str() const
{
return m_item_tag + " ~= " + "<rx>";
}
string m_item_tag;
std::regex m_rx;
};
// Predicate: some field of the row equals m_value. test() is defined
// after class category, whose fields() it needs.
template<typename T>
struct any_is_condition_impl : public condition_impl
{
    using value_type = T;

    any_is_condition_impl(const value_type& value)
        : m_value(value)
    {
    }

    virtual bool test(const category& c, const row& r) const;

    virtual std::string str() const
    {
        std::string text = "any == ";
        text += boost::lexical_cast<std::string>(m_value);
        return text;
    }

    value_type m_value;
};
// Predicate: some field of the row matches the regular expression. test()
// is defined after class category, whose fields() it needs.
struct any_matches_condition_impl : public condition_impl
{
    any_matches_condition_impl(const std::regex& rx)
        : m_rx(rx)
    {
    }

    virtual bool test(const category& c, const row& r) const;

    // the expression itself is not printed, only a placeholder
    virtual std::string str() const
    {
        return std::string("any ~= <rx>");
    }

    std::regex m_rx;
};
// Conjunction of two predicates. The implementations are taken out of the
// two conditions passed in (leaving those empty) and are owned, and
// eventually deleted, by this object.
struct and_condition_impl : public condition_impl
{
    and_condition_impl(condition&& a, condition&& b)
        : m_a(nullptr), m_b(nullptr)
    {
        std::swap(m_a, a.m_impl);
        std::swap(m_b, b.m_impl);
    }

    // The raw pointers are owned: a member-wise copy would make two objects
    // delete the same implementations, so copying is not allowed.
    and_condition_impl(const and_condition_impl&) = delete;
    and_condition_impl& operator=(const and_condition_impl&) = delete;

    ~and_condition_impl()
    {
        delete m_a;
        delete m_b;
    }

    // m_b is only evaluated when m_a holds
    virtual bool test(const category& c, const row& r) const
    {
        return m_a->test(c, r) and m_b->test(c, r);
    }

    virtual std::string str() const
    {
        return "(" + m_a->str() + ") and (" + m_b->str() + ")";
    }

    condition_impl* m_a;
    condition_impl* m_b;
};
// Disjunction of two predicates. The implementations are taken out of the
// two conditions passed in (leaving those empty) and are owned, and
// eventually deleted, by this object.
struct or_condition_impl : public condition_impl
{
    or_condition_impl(condition&& a, condition&& b)
        : m_a(nullptr), m_b(nullptr)
    {
        std::swap(m_a, a.m_impl);
        std::swap(m_b, b.m_impl);
    }

    // The raw pointers are owned: a member-wise copy would make two objects
    // delete the same implementations, so copying is not allowed.
    or_condition_impl(const or_condition_impl&) = delete;
    or_condition_impl& operator=(const or_condition_impl&) = delete;

    ~or_condition_impl()
    {
        delete m_a;
        delete m_b;
    }

    // m_b is only evaluated when m_a does not hold
    virtual bool test(const category& c, const row& r) const
    {
        return m_a->test(c, r) or m_b->test(c, r);
    }

    virtual std::string str() const
    {
        return "(" + m_a->str() + ") or (" + m_b->str() + ")";
    }

    condition_impl* m_a;
    condition_impl* m_b;
};
}
// Combines two conditions into one that holds when both hold. Both
// operands are consumed.
inline condition operator&&(condition&& a, condition&& b)
{
    detail::condition_impl* both = new detail::and_condition_impl(std::move(a), std::move(b));
    return condition(both);
}
// Combines two conditions into one that holds when at least one of them
// holds. Both operands are consumed.
inline condition operator||(condition&& a, condition&& b)
{
    detail::condition_impl* either = new detail::or_condition_impl(std::move(a), std::move(b));
    return condition(either);
}
// Names an item (column) in a query expression. Comparing a key with a
// value yields a condition, e.g.
//   cat.find(key("label_asym_id") == "A" and key("label_seq_id") == 1);
//
// The returned condition never refers back to the key object. The ordering
// operators used to capture 'this' in the stored lambda, which dangled as
// soon as a condition outlived the (usually temporary) key; they now
// capture a copy of the tag instead.
struct key
{
    key(const string& item_tag) : m_item_tag(item_tag) {}
    key(const char* item_tag) : m_item_tag(item_tag) {}

    // Equality, using item_reference::compare<T>.
    template<typename T>
    condition operator==(const T& v) const
    {
        return condition(new detail::key_is_condition_impl<T>(m_item_tag, v));
    }

    // C strings are compared as std::string; a null pointer counts as "".
    condition operator==(const char* v) const
    {
        string value(v ? v : "");
        return condition(new detail::key_is_condition_impl<std::string>(m_item_tag, value));
    }

    // Inequality, using item_reference::compare<T>.
    template<typename T>
    condition operator!=(const T& v) const
    {
        return condition(new detail::key_is_not_condition_impl<T>(m_item_tag, v));
    }

    // C strings are compared as std::string; a null pointer counts as "".
    condition operator!=(const char* v) const
    {
        string value(v ? v : "");
        return condition(new detail::key_is_not_condition_impl<std::string>(m_item_tag, value));
    }

    // The four ordering operators convert the stored text with as<T>() and
    // compare the result with a copy of v.
    template<typename T>
    condition operator>(const T& v) const
    {
        string tag = m_item_tag;    // captured by value, see above
        auto comp = [tag, v](const category& c, const row& r) -> bool { return r[tag].as<T>() > v; };
        return condition(new detail::key_compare_condition_impl<decltype(comp)>(m_item_tag, std::move(comp)));
    }

    template<typename T>
    condition operator>=(const T& v) const
    {
        string tag = m_item_tag;
        auto comp = [tag, v](const category& c, const row& r) -> bool { return r[tag].as<T>() >= v; };
        return condition(new detail::key_compare_condition_impl<decltype(comp)>(m_item_tag, std::move(comp)));
    }

    template<typename T>
    condition operator<(const T& v) const
    {
        string tag = m_item_tag;
        auto comp = [tag, v](const category& c, const row& r) -> bool { return r[tag].as<T>() < v; };
        return condition(new detail::key_compare_condition_impl<decltype(comp)>(m_item_tag, std::move(comp)));
    }

    template<typename T>
    condition operator<=(const T& v) const
    {
        string tag = m_item_tag;
        auto comp = [tag, v](const category& c, const row& r) -> bool { return r[tag].as<T>() <= v; };
        return condition(new detail::key_compare_condition_impl<decltype(comp)>(m_item_tag, std::move(comp)));
    }

    string m_item_tag;
};
// Comparing a key with a std::regex produces a full-match condition on
// that item instead of an equality test.
template<>
inline condition key::operator==(const std::regex& rx) const
{
    detail::condition_impl* matcher = new detail::key_matches_condition_impl(m_item_tag, rx);
    return condition(matcher);
}
// Stand-in for "whatever item" in a query: any() == v holds for rows in
// which some field equals v.
struct any
{
    template<typename T>
    condition operator==(const T& v) const
    {
        detail::condition_impl* impl = new detail::any_is_condition_impl<T>(v);
        return condition(impl);
    }
};
// Comparing any() with a std::regex holds for rows in which some field
// matches the expression completely.
template<>
inline condition any::operator==(const std::regex& rx) const
{
    detail::condition_impl* matcher = new detail::any_matches_condition_impl(rx);
    return condition(matcher);
}
// --------------------------------------------------------------------
// class rowset is used to return find results. Use it to re-order the results
// or to group them
// A vector of rows, all belonging to one category, as returned by find().
// orderBy() returns the rowset itself so calls can be chained.
class rowset : public vector<row>
{
  public:
    rowset(category& cat);

    // Convenience form for a single item; forwards to the list overload.
    rowset& orderBy(const string& item)
    {
        return orderBy({ item });
    }

    rowset& orderBy(std::initializer_list<string> items);

  private:
    category& m_cat;
};
// --------------------------------------------------------------------
// class category acts as an STL container for row objects
// One table of a datablock: a set of named columns plus a chain of rows
// reached through m_head / m_tail. The class behaves like a container of
// row objects and offers condition based lookup (operator[], find, exists,
// erase). Copying is disabled.
class category
{
public:
friend class datablock;
friend class row;
friend struct detail::item_reference;
category(datablock& db, const string& name, validator* validator);
category(const category&) = delete;
category& operator=(const category&) = delete;
~category();
// name of the category (returned by value)
const string name() const { return m_name; }
// NOTE(review): presumably the named item of the first row; confirm in the implementation
const detail::item_reference get_first_item(const char* item_name) const;
// Forward iterator over the rows. It stores a row object and hands out
// references to it; two iterators are equal when their rows refer to the
// same item_row.
struct iterator : public std::iterator<std::forward_iterator_tag, row>
{
friend class category;
typedef std::iterator<std::forward_iterator_tag, row> base_type;
typedef typename base_type::pointer pointer;
typedef typename base_type::reference reference;
iterator(item_row* data) : m_current(data) {}
reference operator*() { return m_current; }
pointer operator->() { return &m_current; }
iterator& operator++();
iterator operator++(int) { iterator result(*this); this->operator++(); return result; }
bool operator==(const iterator& rhs) const { return m_current == rhs.m_current; }
bool operator!=(const iterator& rhs) const { return not (m_current == rhs.m_current); }
private:
row m_current;
};
iterator begin();
iterator end();
bool empty() const;
size_t size() const;
void clear();
// first and last row, built directly from m_head / m_tail
row front() { return row(m_head); }
row back() { return row(m_tail); }
// condition based access; conditions are consumed (passed as rvalues)
row operator[](condition&& cond);
rowset find(condition&& cond);
bool exists(condition&& cond);
// single item convenience form, forwards to the list overload
rowset orderBy(const string& item)
{ return orderBy({ item }); }
rowset orderBy(std::initializer_list<string> items);
// The emplace overloads return the row together with a bool.
// NOTE(review): the bool presumably tells whether a new row was inserted; confirm
std::tuple<row,bool> emplace(item value) { return emplace({ value }); }
std::tuple<row,bool> emplace(std::initializer_list<item> values)
{ return emplace(values.begin(), values.end()); }
std::tuple<row,bool> emplace(row r);
template<class Iter>
std::tuple<row,bool> emplace(Iter b, Iter e);
void erase(condition&& cond);
void erase(row r);
void erase(iterator ri);
void validate();
const validator& get_validator() const;
const validate_category* get_cat_validator() const { return m_cat_validator; }
void set_validator(validator* v);
// sets of field names; iset orders and compares them case-insensitively
iset fields() const;
iset mandatory_fields() const;
iset key_fields() const;
void drop(const string& field);
void get_tag_order(vector<string>& tags) const;
// return index for known column, or the next available column index
size_t get_column_index(const string& name) const;
const string& get_column_name(size_t column_index) const;
void reorderByIndex();
private:
void write(std::ostream& os);
void write(std::ostream& os, const vector<string>& order);
void write(std::ostream& os, const vector<int>& order, bool includeEmptyColumns);
size_t add_column(const string& name);
datablock& m_db;
string m_name;
validator* m_validator;
const validate_category* m_cat_validator = nullptr;
vector<item_column> m_columns;
// first and last row of the category (see front() and back())
item_row* m_head;
item_row* m_tail;
class cat_index* m_index;
};
// --------------------------------------------------------------------
// A CIF file: a chain of datablocks starting at m_head, plus an optional
// validator. Movable but not copyable.
class file
{
public:
friend class parser;
friend class validator;
file();
file(std::istream& is, bool validate = false);
file(file&& rhs);
file(const file& rhs) = delete;
file& operator=(const file& rhs) = delete;
~file();
void load(std::istream& is);
void save(std::ostream& os);
// same as write() with the given tag order
void save(std::ostream& os, const vector<string>& order) { write(os, order); }
void write(std::ostream& os, const vector<string>& order);
void load_dictionary(); // load the default dictionary, that is mmcif_ddl in this case
void load_dictionary(const char* dict); // load one of the compiled in dictionaries
void load_dictionary(std::istream& is); // load dictionary from input stream
void validate();
// dereferences m_head without a check, so the file must contain at least one datablock
datablock& first_datablock() { return *m_head; }
// NOTE(review): takes a raw pointer; the file presumably takes ownership -- confirm
void append(datablock* e);
datablock& operator[](const string& name);
// Forward iterator over the datablocks; it wraps a datablock pointer and
// two iterators are equal when they hold the same pointer.
struct iterator : public std::iterator<std::forward_iterator_tag, datablock>
{
typedef std::iterator<std::forward_iterator_tag, datablock> base_type;
typedef typename base_type::pointer pointer;
typedef typename base_type::reference reference;
iterator(datablock* db) : m_current(db) {}
reference operator*() { return *m_current; }
pointer operator->() { return m_current; }
iterator& operator++();
iterator operator++(int) { iterator result(*this); this->operator++(); return result; }
bool operator==(const iterator& rhs) const { return m_current == rhs.m_current; }
bool operator!=(const iterator& rhs) const { return not (m_current == rhs.m_current); }
private:
datablock* m_current;
};
iterator begin() const;
iterator end() const;
const validator& get_validator() const;
void get_tag_order(vector<string>& tags) const;
private:
void set_validator(validator* v);
// first datablock of the file
datablock* m_head;
validator* m_validator;
};
// --------------------------------------------------------------------
// some postponed inlines
namespace detail
{
// Tries every field of the category: the row passes as soon as one field,
// converted to value_type, equals m_value. Fields whose value cannot be
// converted are silently skipped.
template<typename T>
inline bool any_is_condition_impl<T>::test(const category& c, const row& r) const
{
    for (auto& field: c.fields())
    {
        try
        {
            if (r[field].as<value_type>() == m_value)
                return true;
        }
        catch (...)
        {
            // not convertible, so it cannot be equal: try the next field
        }
    }

    return false;
}
// Tries every field of the category: the row passes as soon as the value
// of one field matches the regular expression completely. Exceptions
// raised while testing a field are swallowed and the field is skipped.
inline bool any_matches_condition_impl::test(const category& c, const row& r) const
{
    for (auto& field: c.fields())
    {
        try
        {
            if (std::regex_match(r[field].as<string>(), m_rx))
                return true;
        }
        catch (...)
        {
            // ignore this field and carry on with the next one
        }
    }

    return false;
}
}
}
// Specialisation of std::swap for cif::row: forwards to row::swap, which
// exchanges the data pointers of the two rows.
namespace std
{
template<>
inline void swap(cif::row& a, cif::row& b)
{
a.swap(b);
}
}
// CIF parser
#include "libcif/cif++.h"
#include <stack>
namespace cif
{
// --------------------------------------------------------------------
// Exception type of the CIF parser, derived from std::runtime_error. It is
// constructed from a line number and a message; how the two are combined
// into what() is decided by the out-of-line constructor.
class cif_parser_error : public std::runtime_error
{
public:
cif_parser_error(uint32 line_nr, const std::string& message);
};
// --------------------------------------------------------------------
extern const uint32 kMaxLineLength;
extern const uint8 kCharTraitsTable[128];
// Bit flags stored in kCharTraitsTable, one bit per character class.
enum CharTraitsMask: uint8
{
    kOrdinaryMask = 0x01,   // bit 0, tested by is_ordinary
    kNonBlankMask = 0x02,   // bit 1, tested by is_non_blank
    kTextLeadMask = 0x04,   // bit 2, tested by is_text_lead
    kAnyPrintMask = 0x08    // bit 3, tested by is_any_print
};
// True for whitespace as defined by std::isspace and for '#'.
inline bool is_white(int ch)
{
    if (ch == '#')
        return true;

    return std::isspace(ch) != 0;
}
// True when ch lies in 0x20..0x7f and its kCharTraitsTable entry has the
// ordinary bit set. The table is indexed with ch - 0x20.
inline bool is_ordinary(int ch)
{
    if (ch < 0x20 or ch > 0x7f)
        return false;

    return (kCharTraitsTable[ch - 0x20] & kOrdinaryMask) != 0;
}
// True when ch lies in 0x21..0x7f (the space itself is excluded) and its
// kCharTraitsTable entry has the non-blank bit set.
inline bool is_non_blank(int ch)
{
    if (ch <= 0x20 or ch > 0x7f)
        return false;

    return (kCharTraitsTable[ch - 0x20] & kNonBlankMask) != 0;
}
// True when ch lies in 0x20..0x7f and its kCharTraitsTable entry has the
// text-lead bit set.
inline bool is_text_lead(int ch)
{
    if (ch < 0x20 or ch > 0x7f)
        return false;

    return (kCharTraitsTable[ch - 0x20] & kTextLeadMask) != 0;
}
// True for a tab, and for any ch in 0x20..0x7f whose kCharTraitsTable
// entry has the any-print bit set.
inline bool is_any_print(int ch)
{
    if (ch == '\t')
        return true;

    if (ch < 0x20 or ch > 0x7f)
        return false;

    return (kCharTraitsTable[ch - 0x20] & kAnyPrintMask) != 0;
}
// True when the zero terminated string s starts with an ordinary character
// and all characters after it are non-blank. An empty string yields false,
// since the terminating zero is not an ordinary character.
inline bool is_unquoted_string(const char* s)
{
    if (not is_ordinary(*s))
        return false;

    for (const char* p = s + 1; *p != 0; ++p)
    {
        if (not is_non_blank(*p))
            return false;
    }

    return true;
}
// --------------------------------------------------------------------
std::tuple<std::string,std::string> split_tag_name(const std::string& tag);
// --------------------------------------------------------------------
// sac parser, analogous to SAX parser (simple api for xml)
// Event driven CIF parser: it reads from the stream given to the
// constructor and reports what it finds through the pure virtual produce_*
// methods, which a derived class must implement.
class sac_parser
{
public:
sac_parser(std::istream& is);
virtual ~sac_parser() {}
// token kinds returned by get_next_token()
enum CIFToken
{
eCIFTokenUnknown,
eCIFTokenEOF,
eCIFTokenDATA,
eCIFTokenLOOP,
eCIFTokenGLOBAL,
eCIFTokenSAVE,
eCIFTokenSTOP,
eCIFTokenTag,
eCIFTokenValue,
};
// NOTE(review): presumably printable names indexed by CIFToken; the definition is not visible here
static const char* kTokenName[];
// kind of value found for a value token (stored in m_token_type)
enum CIFValueType
{
eCIFValueInt,
eCIFValueFloat,
eCIFValueNumeric,
eCIFValueString,
eCIFValueTextField,
eCIFValueInapplicable,
eCIFValueUnknown
};
// NOTE(review): presumably printable names indexed by CIFValueType; the definition is not visible here
static const char* kValueName[];
// character level input and tokenizer helpers
int get_next_char();
void retract();
void restart();
CIFToken get_next_token();
void match(CIFToken token);
// grammar level entry points; parse_save_frame() can be overridden
void parse_file();
void parse_global();
void parse_data_block();
virtual void parse_save_frame();
void parse_dictionary();
void error(const std::string& msg);
// production methods, these are pure virtual here
virtual void produce_datablock(const std::string& name) = 0;
virtual void produce_category(const std::string& name) = 0;
virtual void produce_row() = 0;
virtual void produce_item(const std::string& category, const std::string& item, const string& value) = 0;
protected:
// tokenizer states; the states from eStateFloat onwards have explicit,
// widely spaced values
enum State
{
eStateStart,
eStateWhite,
eStateComment,
eStateQuestionMark,
eStateDot,
eStateQuotedString,
eStateQuotedStringQuote,
eStateUnquotedString,
eStateTag,
eStateTextField,
eStateFloat = 100,
eStateInt = 110,
// eStateNumericSuffix = 200,
eStateValue = 300
};
// the input stream; held by reference, so it must outlive the parser
std::istream& m_data;
// parser state
bool m_validate;
uint32 m_line_nr;
bool m_bol;
int m_state, m_start;
CIFToken m_lookahead;
std::string m_token_value;
CIFValueType m_token_type;
std::stack<int> m_buffer;
};
// --------------------------------------------------------------------
// sac_parser that implements the four produce_* callbacks on top of a
// cif::file; the file is held by reference and must outlive the parser.
class parser : public sac_parser
{
public:
parser(std::istream& is, file& f);
virtual void produce_datablock(const std::string& name);
virtual void produce_category(const std::string& name);
virtual void produce_row();
virtual void produce_item(const std::string& category, const std::string& item, const std::string& value);
protected:
file& m_file;
// NOTE(review): these three look like the datablock, category and row currently being filled; confirm
datablock* m_data_block;
datablock::iterator m_cat;
row m_row;
};
// --------------------------------------------------------------------
// Parser for dictionary files. It overrides parse_save_frame() and works
// with the validator passed to the constructor (held by reference).
class dict_parser : public parser
{
public:
dict_parser(validator& validator, std::istream& is);
~dict_parser();
void load_dictionary();
private:
virtual void parse_save_frame();
bool collect_item_types();
void link_items();
validator& m_validator;
// a file owned by this parser; note that it hides the m_file reference inherited from parser
file m_file;
// implementation details, hidden behind a pointer to a struct defined elsewhere
struct dict_parser_data_impl* m_impl;
bool m_collected_item_types = false;
};
}
// cif parsing library
#pragma once
#include <vector>
#include <set>
#include "libcif/config.h"
namespace cif
{
// some basic utilities: Since we're using ASCII input only, we define for optimisation
// our own case conversion routines.
// Case-insensitive equality test for two strings (defined elsewhere).
bool iequals(const std::string& a, const std::string& b);
// Case-insensitive comparison; callers in this code base test the result
// against zero (result < 0 is used as "a orders before b").
int icompare(const std::string& a, const std::string& b);
// Overloads of the above taking C strings.
bool iequals(const char* a, const char* b);
int icompare(const char* a, const char* b);
// Lower-case s in place.
void to_lower(std::string& s);
// Return a lower-cased copy of s.
std::string to_lower_copy(const std::string& s);
// To make life easier, we also define iless (based on icompare) and iset
// Strict weak ordering functor for strings that ignores case; a orders
// before b when icompare reports a negative value.
struct iless
{
	bool operator()(const std::string& a, const std::string& b) const
	{
		const int order = icompare(a, b);
		return order < 0;
	}
};
typedef std::set<std::string, iless> iset;
// --------------------------------------------------------------------
// This really makes a difference, having our own tolower routines
extern const uint8 kCharToLowerMap[256];
inline char tolower(char ch)
{
return static_cast<char>(kCharToLowerMap[static_cast<uint8>(ch)]);
}
// --------------------------------------------------------------------
std::tuple<std::string,std::string> split_tag_name(const std::string& tag);
// --------------------------------------------------------------------
// custom wordwrapping routine
std::vector<std::string> word_wrap(const std::string& text, unsigned int width);
}
// cif parsing library
#include "libcif/cif++.h"
#include <boost/filesystem/path.hpp>
// the std regex of gcc is crashing....
#include <boost/regex.hpp>
#include <set>
namespace cif
{
struct validate_category;
// --------------------------------------------------------------------
// Exception type carrying a validation message.
class validation_error : public std::exception
{
  public:
	// Stores a copy of msg; it is what what() returns.
	validation_error(const std::string& msg) : m_msg(msg) {}

	// Overrides std::exception::what(). The returned pointer refers to
	// m_msg and is therefore only valid as long as this object lives and
	// m_msg is not modified.
	const char* what() const noexcept override { return m_msg.c_str(); }

	std::string m_msg;
};
// --------------------------------------------------------------------
// Primitive type classes a dictionary type can map to.
// NOTE(review): presumably ptChar = case sensitive text, ptUChar = case
// insensitive text, ptNumb = numeric -- confirm against the dictionary
// definition language and the implementation of validate_type::compare.
enum DDL_PrimitiveType
{
ptChar, ptUChar, ptNumb
};
// Translate the textual name of a primitive type into the enum value
// (defined elsewhere; behaviour for unknown names is not visible here).
DDL_PrimitiveType map_to_primitive_type(const std::string& s);
// Description of one data type: its name, the primitive type it maps to
// and a regular expression object. Instances order by name, ignoring case.
struct validate_type
{
	std::string			m_name;
	DDL_PrimitiveType	m_primitive_type;
	boost::regex		m_rx;

	// Case-insensitive ordering on the type name, so that instances can
	// be stored in a std::set.
	bool operator<(const validate_type& rhs) const
	{
		const int order = icompare(m_name, rhs.m_name);
		return order < 0;
	}

	// compare values based on type
//	int compare(const std::string& a, const std::string& b) const
//	{
//		return compare(a.c_str(), b.c_str());
//	}

	int compare(const char* a, const char* b) const;
};
// Validation data for a single item (tag). Items are identified and
// ordered by their tag, ignoring case.
struct validate_item
{
	std::string					m_tag;
	bool						m_mandatory;
	const validate_type*		m_type;
	cif::iset					m_enums;
	validate_item*				m_parent = nullptr;
	std::set<validate_item*>	m_children;
	validate_category*			m_category = nullptr;
	std::set<validate_item*>	m_foreign_keys;

	void set_parent(validate_item* parent);

	// Case-insensitive ordering on the tag
	bool operator<(const validate_item& rhs) const
	{
		const int order = icompare(m_tag, rhs.m_tag);
		return order < 0;
	}

	// Two items are the same when their tags match, ignoring case
	bool operator==(const validate_item& rhs) const
	{
		const bool same_tag = iequals(m_tag, rhs.m_tag);
		return same_tag;
	}

	// Validate value against this item (defined elsewhere)
	void operator()(std::string value) const;
};
// Validation data for one category: its name, key fields, groups,
// mandatory fields and the validators for its items. Categories order
// by name, ignoring case.
struct validate_category
{
	std::string					m_name;
	// Fully qualified std::string: a header must not rely on the includer
	// having a using-directive for namespace std in effect.
	std::vector<std::string>	m_keys;
	cif::iset					m_groups;
	cif::iset					m_mandatory_fields;
	std::set<validate_item>		m_item_validators;

	// Case-insensitive ordering on the category name
	bool operator<(const validate_category& rhs) const
	{
		return icompare(m_name, rhs.m_name) < 0;
	}

	void add_item_validator(validate_item&& v);

	const validate_item* get_validator_for_item(std::string tag) const;

	// Read-only access to the full set of item validators
	const std::set<validate_item>& item_validators() const
	{
		return m_item_validators;
	}
};
// --------------------------------------------------------------------
// Holds the type and category validators of one dictionary, together
// with the dictionary name and version. Movable but not copyable.
class validator
{
public:
// dict_parser needs access to the private item lookup below
friend class dict_parser;
validator();
~validator();
// copying is disabled, moving is supported
validator(const validator& rhs) = delete;
validator& operator=(const validator& rhs) = delete;
validator(validator&& rhs);
validator& operator=(validator&& rhs);
// registration and lookup of type validators
void add_type_validator(validate_type&& v);
const validate_type* get_validator_for_type(std::string type_code) const;
// registration and lookup of category validators
void add_category_validator(validate_category&& v);
const validate_category* get_validator_for_category(std::string category) const;
// NOTE(review): presumably throws or only logs depending on m_strict -- confirm in the implementation
void report_error(const std::string& msg);
// accessors for the dictionary name and version
std::string dict_name() const { return m_name; }
void dict_name(const std::string& name) { m_name = name; }
std::string dict_version() const { return m_version; }
void dict_version(const std::string& version) { m_version = version; }
private:
// name is fully qualified here:
validate_item* get_validator_for_item(std::string name) const;
std::string m_name;
std::string m_version;
bool m_strict = false;
// std::set<uint32> m_sub_categories;
std::set<validate_type> m_type_validators;
std::set<validate_category> m_category_validators;
};
}
#pragma once
#include "cif++.h"
void WritePDBFile(std::ostream& pdbFile, cif::file& cifFile);
// Lib for working with structures as contained in mmCIF and PDB files
#pragma once
#include <set>
#include <tuple>
#include <vector>
#include <map>
#include "libcif/atom_type.h"
namespace libcif
{
// --------------------------------------------------------------------
// The chemical composition of the structure in an mmCIF file is
// defined in the class composition. A composition consists of
// entities. Each entity can be either a polymer, a non-polymer,
// a macrolide or a water molecule.
// Entities themselves are made up of compounds. And compounds
// contain comp_atom records for each atom.
class composition;
class entity;
class compound;
struct comp_atom;
// --------------------------------------------------------------------
// struct containing information about an atom in a chemical compound
// This information comes from the CCP4 monomer library.
// Plain record describing one atom of a compound. None of the members
// has a default initialiser, so partial_charge and type_symbol are
// indeterminate unless the creator sets them.
struct comp_atom
{
// identifier of the atom within its compound
std::string id;
// element of the atom
atom_type type_symbol;
// NOTE(review): presumably the energy type from the monomer library -- confirm
std::string type_energy;
float partial_charge;
};
// --------------------------------------------------------------------
// a class that contains information about a chemical compound.
// This information is derived from the ccp4 monomer library by default.
// To create compounds, you'd best use the factory method.
// Value-like description of a chemical compound: id, name, group, the
// list of atoms and a map from an atom id pair to a float bond value.
class compound
{
public:
// Construct from already collected data; the atom list and the bond map
// are moved into the new object.
compound(const std::string& id, const std::string& name,
const std::string& group, std::vector<comp_atom>&& atoms,
std::map<std::tuple<std::string,std::string>,float>&& bonds)
: m_id(id), m_name(name), m_group(group)
, m_atoms(std::move(atoms)), m_bonds(std::move(bonds))
{
}
~compound();
// factory method, create a compound based on the three letter code
// (for amino acids) or the one-letter code (for bases) or the
// code as it is known in the CCP4 monomer library.
// NOTE(review): ownership of the returned pointer is not visible here -- confirm before deleting it
static const compound* create(const std::string& id);
// this second factory method can create a compound even if it is not
// recorded in the library. It will take the values from the CCP4 lib
// unless the value passed to this function is not empty.
static const compound* create(const std::string& id, const std::string& name,
const std::string& type, const std::string& formula);
// add an additional path to the monomer library.
static void add_monomer_library_path(const std::string& dir);
// accessors
// (id, name and atoms return copies of the stored members)
std::string id() const { return m_id; }
std::string name() const { return m_name; }
std::string type() const;
// std::string group() const { return m_group; }
std::vector<comp_atom> atoms() const { return m_atoms; }
// lookup of a single atom and of bond information by atom id
// NOTE(review): behaviour for unknown ids is defined elsewhere -- confirm
comp_atom get_atom_by_id(const std::string& atom_id) const;
bool atoms_bonded(const std::string& atom_id_1, const std::string& atom_id_2) const;
float atom_bond_value(const std::string& atom_id_1, const std::string& atom_id_2) const;
// derived properties, all computed in the implementation file
std::string formula() const;
float formula_weight() const;
int charge() const;
bool is_water() const;
private:
// entity& m_entity;
std::string m_id;
std::string m_name;
std::string m_group;
std::vector<comp_atom> m_atoms;
// keyed on a pair of atom ids
std::map<std::tuple<std::string,std::string>,float> m_bonds;
};
// --------------------------------------------------------------------
// an entity. This is a base class for polymer_entity and non_poly_entity
// The latter can be either a regular non-polymer (residue), a macrolide or
// water.
// Abstract base: stores an id, a type and a description and requires
// derived classes to implement formula_weight().
class entity
{
public:
entity(const std::string& id, const std::string& type, const std::string& description);
// virtual, since entities are meant to be used through this base class
virtual ~entity();
// accessors, defined in the implementation file
std::string id() const;
std::string type() const;
std::string description() const;
// must be provided by each concrete entity kind
virtual float formula_weight() const = 0;
private:
std::string m_id;
std::string m_type;
std::string m_description;
};
// --------------------------------------------------------------------
// A polymer entity
// Entity made up of a sequence of monomers. The monomers form a singly
// linked list (m_next) that can be walked with the forward iterator
// declared below; end() is represented by a null cursor.
class polymer_entity : public entity
{
  public:
	polymer_entity(const std::string& id, const std::string& description);
	~polymer_entity();

	std::string seq_one_letter_code(bool cannonical) const;
	std::string pdbx_strand_id() const;
	virtual float formula_weight() const;

	// One position in the sequence
	class monomer
	{
	  public:
		friend class polymer_entity;

		size_t num() const;								// sequence number
		bool hetero() const;							// whether this position contains alternate compounds
		const compound& comp(size_t alt_nr) const;		// the chemical compound of this monomer

	  private:
		monomer*	m_next;
		monomer*	m_alt;
		size_t		m_num;
		compound*	m_comp;
	};

	// Forward iterator over the linked monomers, yielding const monomer
	class iterator : public std::iterator<std::forward_iterator_tag, const monomer>
	{
	  public:
		typedef std::iterator<std::forward_iterator_tag, const monomer>	base_type;
		typedef base_type::reference									reference;
		typedef base_type::pointer										pointer;

		iterator(monomer* monomer = nullptr)
			: m_cursor(monomer)
		{
		}

		iterator(const iterator& rhs)
			: m_cursor(rhs.m_cursor) {}

		iterator& operator=(const iterator& rhs)
		{
			m_cursor = rhs.m_cursor;
			return *this;
		}

		reference operator*()
		{
			return *m_cursor;
		}

		pointer operator->()
		{
			return m_cursor;
		}

		// step to the next monomer in the list
		iterator& operator++()
		{
			m_cursor = m_cursor->m_next;
			return *this;
		}

		// post-increment: advance, but hand back the position we had
		iterator operator++(int)
		{
			iterator before(*this);
			++(*this);
			return before;
		}

		bool operator==(const iterator& rhs) const
		{
			return m_cursor == rhs.m_cursor;
		}

		bool operator!=(const iterator& rhs) const
		{
			return not (m_cursor == rhs.m_cursor);
		}

	  private:
		monomer*	m_cursor;
	};

	iterator begin() const
	{
		return iterator(m_seq);
	}

	iterator end() const
	{
		return iterator();
	}

	const monomer& operator[](size_t index) const;

  private:
	entity&		m_entity;
	monomer*	m_seq;
};
// --------------------------------------------------------------------
// non_poly entity
// Entity that refers to a single compound through a pointer.
class non_poly_entity : public entity
{
public:
non_poly_entity(const std::string& id, const std::string& type, const std::string& description);
~non_poly_entity();
// NOTE(review): returns a non-const reference from a const member; presumably dereferences m_compound -- confirm it can never be null
compound& comp() const;
// implements the pure virtual of entity
virtual float formula_weight() const;
private:
// NOTE(review): ownership of the pointee is not visible from this header
compound* m_compound;
};
}
// Lib for working with structures as contained in mmCIF and PDB files
#pragma once
#include <string>
// Feature switches, compiler specific settings and the short fixed width
// integer aliases (int8 .. uint64) used throughout the library.
#define HAVE_CPP0X_TEMPLATE_ALIASES 1
#define HAVE_CPP0X_VARIADIC_TEMPLATES 1
#define HAVE_CPP0X_INITIALIZER_LISTS 1

#if defined(_MSC_VER)
// These are Microsoft Visual C++ special settings
// the iso646 file contains the C++ keywords that are
// otherwise not recognized.
#include <ciso646>

// Only Visual C++ versions before 2015 (_MSC_VER < 1900) lack snprintf.
// Newer versions provide a conforming snprintf and their <stdio.h>
// rejects a macro with that name with an #error, so the mapping to
// _snprintf must be limited to the old compilers.
#if _MSC_VER < 1900
#define snprintf _snprintf
#endif

// Disable some warnings
#pragma warning (disable : 4996)
#pragma warning (disable : 4355)
#endif

#include <boost/version.hpp>
#include <boost/cstdint.hpp>

typedef int8_t		int8;
typedef uint8_t		uint8;
typedef int16_t		int16;
typedef uint16_t	uint16;
typedef int32_t		int32;
typedef uint32_t	uint32;
typedef int64_t		int64;
typedef uint64_t	uint64;
#pragma once
#include "pdb2cif.h"
// --------------------------------------------------------------------
struct TemplateLine;
// Base class for parsers of REMARK 3 records. Instances can only be
// created by derived classes (protected constructor); clients use the
// static Parse entry point.
class Remark3Parser
{
public:
virtual ~Remark3Parser() {}
// Entry point: parse the records starting at r for the given
// experimental method, with db as the datablock passed in by the caller.
// NOTE(review): the meaning of the bool result is defined in the implementation -- presumably "something was parsed"
static bool Parse(const std::string& expMethod, PDBRecord* r, cif::datablock& db);
virtual std::string Program();
virtual std::string Version();
protected:
// templatelines/templateLineCount describe the template this parser
// matches against; program_version is a regular expression.
Remark3Parser(const std::string& name, const std::string& expMethod, PDBRecord* r, cif::datablock& db,
const TemplateLine templatelines[], uint32 templateLineCount, std::regex program_version);
// NOTE(review): the float result is presumably a score for how well the template matched -- confirm
virtual float Parse();
std::string NextLine();
bool Match(const char* expr, int nextState);
// helpers to store captured values
void StoreCapture(const char* category, std::initializer_list<const char*> items, bool createNew = false);
void StoreRefineLsRestr(const char* type, std::initializer_list<const char*> values);
void UpdateRefineLsRestr(const char* type, std::initializer_list<const char*> values);
// hook for derived classes, does nothing by default
virtual void Fixup() {}
std::string m_name;
std::string m_expMethod;
PDBRecord* m_rec;
// note: held by value, whereas the constructor receives a reference
cif::datablock m_db;
std::string m_line;
std::smatch m_m;
uint32 m_state;
const TemplateLine* m_template;
uint32 m_templateCount;
std::regex m_program_version;
};
#pragma once
#include "cif++.h"
// --------------------------------------------------------------------
// One record (line) of a PDB file, stored as a node of a singly linked
// list. The value text lives directly behind the struct in m_value, which
// is why custom operator new/delete are declared.
struct PDBRecord
{
PDBRecord* m_next;
uint32 m_line_nr;
// record name, at most 10 characters plus terminator
char m_name[11];
size_t m_vlen;
// zero length array (compiler extension); the storage is presumably
// provided by operator new(size, v_len) -- confirm in the implementation
char m_value[0];
PDBRecord(uint32 line_nr, const std::string& name, const std::string& value);
~PDBRecord();
void* operator new(size_t);
void* operator new(size_t size, size_t v_len);
void operator delete(void* p);
// test whether this record has the given name
bool is(const char* name) const;
// Column based accessors for the value.
// NOTE(review): columns are presumably 1-based as in the PDB format
// description -- confirm. Note that v_i takes int columns while the
// others take size_t.
char v_c(size_t column);
std::string v_s(size_t column_first, size_t column_last = std::numeric_limits<size_t>::max());
int v_i(int column_first, int column_last);
std::string v_f(size_t column_first, size_t column_last);
};
// --------------------------------------------------------------------
void ReadPDBFile(std::istream& pdbFile, cif::file& cifFile);
// Lib for working with structures as contained in mmCIF and PDB files
#pragma once
#include <libcif/config.h>
#include <boost/filesystem/operations.hpp>
#include <boost/math/quaternion.hpp>
#include "clipper/core/coords.h"
namespace libcif
{
typedef boost::math::quaternion<float> quaternion;
const long double
kPI = 3.141592653589793238462643383279502884L;
// --------------------------------------------------------------------
// point, a location with x, y and z coordinates as float.
// This one is derived from a tuple<float,float,float> so
// you can do things like:
//
// float x, y, z;
// tie(x, y, z) = atom.loc();
// A location in 3d space with float x, y and z coordinates. Derived from
// std::tuple<float,float,float> so that tie(x, y, z) = pt works, and
// convertible from and to clipper::Coord_orth.
struct point : public std::tuple<float,float,float>
{
	typedef std::tuple<float,float,float>	base_type;

	// the default constructed point is the origin
	point() : base_type(0.f, 0.f, 0.f) {}
	point(float x, float y, float z) : base_type(x, y, z) {}
	point(const clipper::Coord_orth& pt): base_type(pt[0], pt[1], pt[2]) {}

	point& operator=(const clipper::Coord_orth& rhs)
	{
		x(rhs[0]);
		y(rhs[1]);
		z(rhs[2]);
		return *this;
	}

	// For each coordinate: mutable reference, value getter and setter
	float& x()			{ return std::get<0>(*this); }
	float x() const		{ return std::get<0>(*this); }
	void x(float x)		{ std::get<0>(*this) = x; }

	float& y()			{ return std::get<1>(*this); }
	float y() const		{ return std::get<1>(*this); }
	void y(float y)		{ std::get<1>(*this) = y; }

	float& z()			{ return std::get<2>(*this); }
	float z() const		{ return std::get<2>(*this); }
	void z(float z)		{ std::get<2>(*this) = z; }

	// component wise addition
	point& operator+=(const point& rhs)
	{
		x() += rhs.x();
		y() += rhs.y();
		z() += rhs.z();
		return *this;
	}

	// component wise subtraction
	point& operator-=(const point& rhs)
	{
		x() -= rhs.x();
		y() -= rhs.y();
		z() -= rhs.z();
		return *this;
	}

	// scale all coordinates by rhs
	point& operator*=(float rhs)
	{
		x() *= rhs;
		y() *= rhs;
		z() *= rhs;
		return *this;
	}

	// Divide all coordinates by rhs. This used to multiply, which made it
	// a duplicate of operator*= and broke normalize().
	point& operator/=(float rhs)
	{
		x() /= rhs;
		y() /= rhs;
		z() /= rhs;
		return *this;
	}

	// Scale this point to unit length and return the length it had
	// before. A zero length point is left untouched and 0 is returned.
	float normalize()
	{
		auto length = x() * x() + y() * y() + z() * z();
		if (length > 0)
		{
			length = std::sqrt(length);
			operator/=(length);
		}
		return length;
	}

	// Rotate this point using quaternion q, computed as q * p * conj(q)
	// with p the pure quaternion (0, x, y, z).
	void rotate(const boost::math::quaternion<float>& q)
	{
		boost::math::quaternion<float> p(0, x(), y(), z());

		p = q * p * boost::math::conj(q);

		x() = p.R_component_2();
		y() = p.R_component_3();
		z() = p.R_component_4();
	}

	operator clipper::Coord_orth() const
	{
		return clipper::Coord_orth(x(), y(), z());
	}
};
// Write pt as "(x,y,z)" to os
inline std::ostream& operator<<(std::ostream& os, const point& pt)
{
	return os << '(' << pt.x() << ',' << pt.y() << ',' << pt.z() << ')';
}
// Component wise sum of two points
inline point operator+(const point& lhs, const point& rhs)
{
	point sum(lhs);
	sum += rhs;
	return sum;
}
// Component wise difference of two points
inline point operator-(const point& lhs, const point& rhs)
{
	point difference(lhs);
	difference -= rhs;
	return difference;
}
// Negation: every coordinate changes sign
inline point operator-(const point& pt)
{
	const float nx = -pt.x();
	const float ny = -pt.y();
	const float nz = -pt.z();
	return point(nx, ny, nz);
}
// Scale all coordinates of pt by f
inline point operator*(const point& pt, float f)
{
	point scaled(pt);
	scaled *= f;
	return scaled;
}
// Divide all coordinates of pt by f
inline point operator/(const point& pt, float f)
{
	const float qx = pt.x() / f;
	const float qy = pt.y() / f;
	const float qz = pt.z() / f;
	return point(qx, qy, qz);
}
// --------------------------------------------------------------------
// several standard 3d operations
// Squared euclidean distance between a and b. The arithmetic is done in
// float, only the result is widened to double.
inline double DistanceSquared(const point& a, const point& b)
{
	const float dx = a.x() - b.x();
	const float dy = a.y() - b.y();
	const float dz = a.z() - b.z();

	return dx * dx + dy * dy + dz * dz;
}
// Euclidean distance between a and b: the square root of the sum of the
// squared coordinate differences (summed in float).
inline double Distance(const point& a, const point& b)
{
	const float dx = a.x() - b.x();
	const float dy = a.y() - b.y();
	const float dz = a.z() - b.z();

	return sqrt(dx * dx + dy * dy + dz * dz);
}
// Dot (inner) product of a and b
inline float DotProduct(const point& a, const point& b)
{
	const float xx = a.x() * b.x();
	const float yy = a.y() * b.y();
	const float zz = a.z() * b.z();

	return xx + yy + zz;
}
// Cross product a x b
inline point CrossProduct(const point& a, const point& b)
{
	const float cx = a.y() * b.z() - b.y() * a.z();
	const float cy = a.z() * b.x() - b.z() * a.x();
	const float cz = a.x() * b.y() - b.x() * a.y();

	return point(cx, cy, cz);
}
float DihedralAngle(const point& p1, const point& p2, const point& p3, const point& p4);
float CosinusAngle(const point& p1, const point& p2, const point& p3, const point& p4);
// --------------------------------------------------------------------
// We use quaternions to do rotations in 3d space
quaternion Normalize(quaternion q);
//std::tuple<double,point> QuaternionToAngleAxis(quaternion q);
point Centroid(std::vector<point>& points);
point CenterPoints(std::vector<point>& points);
quaternion AlignPoints(const std::vector<point>& a, const std::vector<point>& b);
double RMSd(const std::vector<point>& a, const std::vector<point>& b);
// --------------------------------------------------------------------
// Helper class to generate evenly divided points on a sphere
// we use a fibonacci sphere to calculate even distribution of the dots
template<int N>
class spherical_dots
{
public:
enum { P = 2 * N + 1 };
typedef typename std::array<point,P> array_type;
typedef typename array_type::const_iterator iterator;
static spherical_dots& instance()
{
static spherical_dots s_instance;
return s_instance;
}
size_t size() const { return m_points.size(); }
const point operator[](uint32 inIx) const { return m_points[inIx]; }
iterator begin() const { return m_points.begin(); }
iterator end() const { return m_points.end(); }
double weight() const { return m_weight; }
spherical_dots()
{
using namespace std;
const double
kGoldenRatio = (1 + std::sqrt(5.0)) / 2;
m_weight = (4 * kPI) / P;
auto p = m_points.begin();
for (int32 i = -N; i <= N; ++i)
{
double lat = std::asin((2.0 * i) / P);
double lon = std::fmod(i, kGoldenRatio) * 2 * kPI / kGoldenRatio;
p->x(sin(lon) * cos(lat));
p->y(cos(lon) * cos(lat));
p->z( sin(lat));
++p;
}
}
private:
array_type m_points;
double m_weight;
};
typedef spherical_dots<50> spherical_dots_50;
}
// Lib for working with structures as contained in mmCIF and PDB files
#pragma once
#include <boost/filesystem/operations.hpp>
#include <boost/math/quaternion.hpp>
#include <boost/any.hpp>
#include "libcif/atom_type.h"
#include "libcif/point.h"
#include "libcif/compound.h"
/*
To modify a structure, you will have to use actions.
The currently supported actions are:
// - Move atom to new location
- Remove atom
// - Add new atom that was formerly missing
// - Add alternate residue
-
Other important design principles:
- all objects here are references to the actual data. Not models of
the data itself. That means that if you copy an atom, you copy the
reference to an atom in the structure. You're not creating a new
atom. This may sound obvious, but it is not if you are used to
copy semantics in the C++ world.
*/
// forward declaration
namespace cif
{
class datablock;
};
namespace libcif
{
class atom;
class residue;
class monomer;
class polymer;
class structure;
class file;
// --------------------------------------------------------------------
// We do not want to introduce a dependency on cif++ here, we might want
// to change the backend storage in the future.
// So, in order to access the data we use properties based on boost::any
// Eventually this should be moved to std::variant, but that's only when
// c++17 is acceptable.
// A named value of arbitrary type
struct property
{
	std::string	name;
	boost::any	value;

	// empty name, empty value
	property() {}

	property(const std::string& name, const boost::any& value)
		: name(name)
		, value(value)
	{
	}
};
typedef std::vector<property> property_list;
// --------------------------------------------------------------------
// Handle to an atom. The only data member is a pointer to an atom_impl;
// see the design notes at the top of this header: copying an atom copies
// the reference, it does not create a new atom in the structure.
class atom
{
public:
// atom(const structure& s, const std::string& id);
// NOTE(review): whether the atom takes (shared) ownership of impl is not visible here -- confirm
atom(struct atom_impl* impl);
atom(const file& f, const std::string& id);
atom(const atom& rhs);
~atom();
atom& operator=(const atom& rhs);
// basic information about the atom
std::string id() const;
atom_type type() const;
point location() const;
// the compound and entity this atom belongs to
const compound& comp() const;
const entity& ent() const;
bool is_water() const;
int charge() const;
// generic access to named values: getter and setter
boost::any property(const std::string& name) const;
void property(const std::string& name, const boost::any& value);
// specifications
// (named after the label_* and auth_* identifiers)
std::string label_atom_id() const;
std::string label_comp_id() const;
std::string label_asym_id() const;
int label_seq_id() const;
std::string label_alt_id() const;
std::string auth_atom_id() const;
std::string auth_comp_id() const;
std::string auth_asym_id() const;
int auth_seq_id() const;
std::string pdbx_auth_ins_code() const;
std::string auth_alt_id() const;
// NOTE(review): presumably compares the referenced atom, not the coordinates -- confirm
bool operator==(const atom& rhs) const;
const file& get_file() const;
private:
struct atom_impl* m_impl;
};
typedef std::vector<atom> atom_view;
// --------------------------------------------------------------------
// A residue: refers to the compound it is an instance of and can list
// its atoms. Stores only a reference to the compound, so the compound
// must outlive the residue.
class residue : public std::enable_shared_from_this<residue>
{
  public:
	residue(const compound& cmp) : m_compound(cmp) {}

	// atoms() is virtual, so this class is meant to be derived from;
	// the virtual destructor makes deleting through a residue pointer
	// well defined.
	virtual ~residue() = default;

	const compound& comp() const		{ return m_compound; }

	virtual atom_view atoms();

  private:
	const compound& m_compound;
};
//// --------------------------------------------------------------------
//// a monomer models a single residue in a protein chain
//
//class monomer : public residue
//{
// public:
// monomer(polymer& polymer, size_t seq_id, const std::string& comp_id,
// const std::string& alt_id);
//
// int num() const { return m_num; }
//// polymer& get_polymer();
//
//// std::vector<monomer_ptr> alternates();
//
// private:
// polymer_ptr m_polymer;
// int m_num;
//};
//
//// --------------------------------------------------------------------
//
//class polymer : public std::enable_shared_from_this<polymer>
//{
// public:
// polymer(const polymer_entity& pe, const std::string& asym_id);
//
// struct iterator : public std::iterator<std::random_access_iterator_tag, monomer>
// {
// typedef std::iterator<std::bidirectional_iterator_tag, monomer> base_type;
// typedef base_type::reference reference;
// typedef base_type::pointer pointer;
//
// iterator(polymer& list, uint32 index);
// iterator(iterator&& rhs);
// iterator(const iterator& rhs);
// iterator& operator=(const iterator& rhs);
// iterator& operator=(iterator&& rhs);
//
// reference operator*();
// pointer operator->();
//
// iterator& operator++();
// iterator operator++(int);
//
// iterator& operator--();
// iterator operator--(int);
//
// bool operator==(const iterator& rhs) const;
// bool operator!=(const iterator& rhs) const;
// };
//
// iterator begin();
// iterator end();
//
// private:
// polymer_entity m_entity;
// std::string m_asym_id;
// std::vector<residue_ptr> m_monomers;
//};
// --------------------------------------------------------------------
// file is a reference to the data stored in e.g. the cif file.
// This object is not copyable.
// Front end for the stored data; all state lives in a file_impl reached
// through m_impl. Copying is disabled.
class file : public std::enable_shared_from_this<file>
{
public:
file();
// NOTE(review): presumably loads p right away -- confirm in the implementation
file(boost::filesystem::path p);
~file();
file(const file&) = delete;
file& operator=(const file&) = delete;
// read from / write to the given path
void load(boost::filesystem::path p);
void save(boost::filesystem::path p);
// NOTE(review): ownership of the returned structure is not visible here; nr defaults to 1
structure* model(size_t nr = 1);
// direct access to the implementation object; dereferences m_impl without a null check
struct file_impl& impl() const { return *m_impl; }
std::vector<const entity*> entities();
cif::datablock& data();
private:
struct file_impl* m_impl;
};
// --------------------------------------------------------------------
// One model of a file: gives access to its atoms, lookup of atoms by id,
// location or label/auth specification, mapping between the different
// numbering schemes and the editing actions.
class structure
{
public:
// model_nr defaults to the first model
structure(file& p, uint32 model_nr = 1);
structure(const structure&);
structure& operator=(const structure&);
~structure();
file& get_file() const;
// views on the atoms; an atom_view is a std::vector<atom>
atom_view atoms() const;
atom_view waters() const;
// lookup of a single atom
// NOTE(review): behaviour when no atom matches is defined elsewhere -- confirm
atom get_atom_by_id(std::string id) const;
atom get_atom_by_location(point pt, float max_distance) const;
atom get_atom_for_label(const std::string& atom_id, const std::string& asym_id,
const std::string& comp_id, int seq_id, const std::string& alt_id = "");
atom get_atom_for_auth(const std::string& atom_id, const std::string& asym_id,
const std::string& comp_id, int seq_id, const std::string& alt_id = "",
const std::string& pdbx_auth_ins_code = "");
// map between auth and label locations
// NOTE(review): the order of the tuple members is only documented for
// some of the overloads below -- check the implementation for the rest
std::tuple<std::string,int,std::string> MapAuthToLabel(const std::string& asym_id,
const std::string& seq_id, const std::string& comp_id, const std::string& ins_code = "");
std::tuple<std::string,std::string,std::string,std::string> MapLabelToAuth(
const std::string& asym_id, int seq_id, const std::string& comp_id);
// returns chain, seqnr
std::tuple<std::string,std::string> MapLabelToAuth(
const std::string& asym_id, int seq_id);
// returns chain,seqnr,comp,iCode
std::tuple<std::string,int,std::string,std::string> MapLabelToPDB(
const std::string& asym_id, int seq_id, const std::string& comp_id);
std::tuple<std::string,int,std::string,std::string> MapPDBToLabel(
const std::string& asym_id, int seq_id, const std::string& comp_id, const std::string& iCode);
// Actions
void remove_atom(atom& a);
private:
friend class action;
struct structure_impl* m_impl;
};
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
// Lib for working with structures as contained in mmCIF and PDB files
#include "libcif/atom_type.h"
#include "libcif/cif++.h"
using namespace std;
namespace libcif
{
// Marker for "value not available" in the radius columns below
const float kNA = nan("1");

// Table with the known elements. Each row holds: the atom_type, the full
// name, the symbol, the atomic weight, a flag telling whether the element
// is a metal and seven radius values (kNA where not available).
//
// The rows MUST be ordered so that the row for an element is found at the
// index equal to its atom_type value, since atom_type_traits(atom_type)
// uses the enum value directly as index into this table. The enum values
// are the atomic numbers (this assumes the lanthanides are 58..71 and the
// actinides 90..103, as the gaps in the visible part of the enum
// indicate). The lanthanides and actinides used to be appended at the end
// of the table, which made the lookup return the wrong element for
// everything above La.
//
// The names used to contain invisible soft hyphen characters (U+00AD);
// these have been removed so that the names are plain ASCII.
const atom_type_info kKnownAtoms[] =
{
	{ Nn, "Unknown", "Nn", 0, false, { kNA, kNA, kNA, kNA, kNA, kNA, kNA } },	// 0
	{ H, "Hydrogen", "H", 1.008, false, { 53, 25, 37, 32, kNA, kNA, 120 } },	// 1
	{ He, "Helium", "He", 4.0026, false, { 31, kNA, 32, 46, kNA, kNA, 140 } },	// 2
	{ Li, "Lithium", "Li", 6.94, true, { 167, 145, 134, 133, 124, kNA, 182 } },	// 3
	{ Be, "Beryllium", "Be", 9.0122, true, { 112, 105, 90, 102, 90, 85, kNA } },	// 4
	{ B, "Boron", "B", 10.81, true, { 87, 85, 82, 85, 78, 73, kNA } },	// 5
	{ C, "Carbon", "C", 12.011, false, { 67, 70, 77, 75, 67, 60, 170 } },	// 6
	{ N, "Nitrogen", "N", 14.007, false, { 56, 65, 75, 71, 60, 54, 155 } },	// 7
	{ O, "Oxygen", "O", 15.999, false, { 48, 60, 73, 63, 57, 53, 152 } },	// 8
	{ F, "Fluorine", "F", 18.998, false, { 42, 50, 71, 64, 59, 53, 147 } },	// 9
	{ Ne, "Neon", "Ne", 20.180, false, { 38, kNA, 69, 67, 96, kNA, 154 } },	// 10
	{ Na, "Sodium", "Na", 22.990, true, { 190, 180, 154, 155, 160, kNA, 227 } },	// 11
	{ Mg, "Magnesium", "Mg", 24.305, true, { 145, 150, 130, 139, 132, 127, 173 } },	// 12
	{ Al, "Aluminium", "Al", 26.982, true, { 118, 125, 118, 126, 113, 111, kNA } },	// 13
	{ Si, "Silicon", "Si", 28.085, true, { 111, 110, 111, 116, 107, 102, 210 } },	// 14
	{ P, "Phosphorus", "P", 30.974, false, { 98, 100, 106, 111, 102, 94, 180 } },	// 15
	{ S, "Sulfur", "S", 32.06, false, { 88, 100, 102, 103, 94, 95, 180 } },	// 16
	{ Cl, "Chlorine", "Cl", 35.45, false, { 79, 100, 99, 99, 95, 93, 175 } },	// 17
	{ Ar, "Argon", "Ar", 39.948, false, { 71, kNA, 97, 96, 107, 96, 188 } },	// 18
	{ K, "Potassium", "K", 39.098, true, { 243, 220, 196, 196, 193, kNA, 275 } },	// 19
	{ Ca, "Calcium", "Ca", 40.078, true, { 194, 180, 174, 171, 147, 133, kNA } },	// 20
	{ Sc, "Scandium", "Sc", 44.956, true, { 184, 160, 144, 148, 116, 114, kNA } },	// 21
	{ Ti, "Titanium", "Ti", 47.867, true, { 176, 140, 136, 136, 117, 108, kNA } },	// 22
	{ V, "Vanadium", "V", 50.942, true, { 171, 135, 125, 134, 112, 106, kNA } },	// 23
	{ Cr, "Chromium", "Cr", 51.996, true, { 166, 140, 127, 122, 111, 103, kNA } },	// 24
	{ Mn, "Manganese", "Mn", 54.938, true, { 161, 140, 139, 119, 105, 103, kNA } },	// 25
	{ Fe, "Iron", "Fe", 55.845, true, { 156, 140, 125, 116, 109, 102, kNA } },	// 26
	{ Co, "Cobalt", "Co", 58.933, true, { 152, 135, 126, 111, 103, 96, kNA } },	// 27
	{ Ni, "Nickel", "Ni", 58.693, true, { 149, 135, 121, 110, 101, 101, 163 } },	// 28
	{ Cu, "Copper", "Cu", 63.546, true, { 145, 135, 138, 112, 115, 120, 140 } },	// 29
	{ Zn, "Zinc", "Zn", 65.38, true, { 142, 135, 131, 118, 120, kNA, 139 } },	// 30
	{ Ga, "Gallium", "Ga", 69.723, true, { 136, 130, 126, 124, 117, 121, 187 } },	// 31
	{ Ge, "Germanium", "Ge", 72.630, true, { 125, 125, 122, 121, 111, 114, kNA } },	// 32
	{ As, "Arsenic", "As", 74.922, true, { 114, 115, 119, 121, 114, 106, 185 } },	// 33
	{ Se, "Selenium", "Se", 78.971, false, { 103, 115, 116, 116, 107, 107, 190 } },	// 34
	{ Br, "Bromine", "Br", 79.904, false, { 94, 115, 114, 114, 109, 110, 185 } },	// 35
	{ Kr, "Krypton", "Kr", 83.798, false, { 88, kNA, 110, 117, 121, 108, 202 } },	// 36
	{ Rb, "Rubidium", "Rb", 85.468, true, { 265, 235, 211, 210, 202, kNA, kNA } },	// 37
	{ Sr, "Strontium", "Sr", 87.62, true, { 219, 200, 192, 185, 157, 139, kNA } },	// 38
	{ Y, "Yttrium", "Y", 88.906, true, { 212, 180, 162, 163, 130, 124, kNA } },	// 39
	{ Zr, "Zirconium", "Zr", 91.224, true, { 206, 155, 148, 154, 127, 121, kNA } },	// 40
	{ Nb, "Niobium", "Nb", 92.906, true, { 198, 145, 137, 147, 125, 116, kNA } },	// 41
	{ Mo, "Molybdenum", "Mo", 95.95, true, { 190, 145, 145, 138, 121, 113, kNA } },	// 42
	{ Tc, "Technetium", "Tc", 98, true, { 183, 135, 156, 128, 120, 110, kNA } },	// 43
	{ Ru, "Ruthenium", "Ru", 101.07, true, { 178, 130, 126, 125, 114, 103, kNA } },	// 44
	{ Rh, "Rhodium", "Rh", 102.91, true, { 173, 135, 135, 125, 110, 106, kNA } },	// 45
	{ Pd, "Palladium", "Pd", 106.42, true, { 169, 140, 131, 120, 117, 112, 163 } },	// 46
	{ Ag, "Silver", "Ag", 107.87, true, { 165, 160, 153, 128, 139, 137, 172 } },	// 47
	{ Cd, "Cadmium", "Cd", 112.41, true, { 161, 155, 148, 136, 144, kNA, 158 } },	// 48
	{ In, "Indium", "In", 114.82, true, { 156, 155, 144, 142, 136, 146, 193 } },	// 49
	{ Sn, "Tin", "Sn", 118.71, true, { 145, 145, 141, 140, 130, 132, 217 } },	// 50
	{ Sb, "Antimony", "Sb", 121.76, false, { 133, 145, 138, 140, 133, 127, kNA } },	// 51
	{ Te, "Tellurium", "Te", 127.60, false, { 123, 140, 135, 136, 128, 121, 206 } },	// 52
	{ I, "Iodine", "I", 126.90, false, { 115, 140, 133, 133, 129, 125, 198 } },	// 53
	{ Xe, "Xenon", "Xe", 131.29, false, { 108, kNA, 130, 131, 135, 122, 216 } },	// 54
	{ Cs, "Caesium", "Cs", 132.91, true, { 298, 260, 225, 232, 209, kNA, kNA } },	// 55
	{ Ba, "Barium", "Ba", 137.33, true, { 253, 215, 198, 196, 161, 149, kNA } },	// 56
	{ La, "Lanthanum", "La", 138.91, true, { kNA, 195, 169, 180, 139, 139, kNA } },	// 57
	{ Ce, "Cerium", "Ce", 140.12, true, { kNA, 185, kNA, 163, 137, 131, kNA } },	// 58
	{ Pr, "Praseodymium", "Pr", 140.91, true, { 247, 185, kNA, 176, 138, 128, kNA } },	// 59
	{ Nd, "Neodymium", "Nd", 144.24, true, { 206, 185, kNA, 174, 137, kNA, kNA } },	// 60
	{ Pm, "Promethium", "Pm", 145, true, { 205, 185, kNA, 173, 135, kNA, kNA } },	// 61
	{ Sm, "Samarium", "Sm", 150.36, true, { 238, 185, kNA, 172, 134, kNA, kNA } },	// 62
	{ Eu, "Europium", "Eu", 151.96, true, { 231, 185, kNA, 168, 134, kNA, kNA } },	// 63
	{ Gd, "Gadolinium", "Gd", 157.25, true, { 233, 180, kNA, 169, 135, 132, kNA } },	// 64
	{ Tb, "Terbium", "Tb", 158.93, true, { 225, 175, kNA, 168, 135, kNA, kNA } },	// 65
	{ Dy, "Dysprosium", "Dy", 162.50, true, { 228, 175, kNA, 167, 133, kNA, kNA } },	// 66
	{ Ho, "Holmium", "Ho", 164.93, true, { 226, 175, kNA, 166, 133, kNA, kNA } },	// 67
	{ Er, "Erbium", "Er", 167.26, true, { 226, 175, kNA, 165, 133, kNA, kNA } },	// 68
	{ Tm, "Thulium", "Tm", 168.93, true, { 222, 175, kNA, 164, 131, kNA, kNA } },	// 69
	{ Yb, "Ytterbium", "Yb", 173.05, true, { 222, 175, kNA, 170, 129, kNA, kNA } },	// 70
	{ Lu, "Lutetium", "Lu", 174.97, true, { 217, 175, 160, 162, 131, 131, kNA } },	// 71
	{ Hf, "Hafnium", "Hf", 178.49, true, { 208, 155, 150, 152, 128, 122, kNA } },	// 72
	{ Ta, "Tantalum", "Ta", 180.95, true, { 200, 145, 138, 146, 126, 119, kNA } },	// 73
	{ W, "Tungsten", "W", 183.84, true, { 193, 135, 146, 137, 120, 115, kNA } },	// 74
	{ Re, "Rhenium", "Re", 186.21, true, { 188, 135, 159, 131, 119, 110, kNA } },	// 75
	{ Os, "Osmium", "Os", 190.23, true, { 185, 130, 128, 129, 116, 109, kNA } },	// 76
	{ Ir, "Iridium", "Ir", 192.22, true, { 180, 135, 137, 122, 115, 107, kNA } },	// 77
	{ Pt, "Platinum", "Pt", 195.08, true, { 177, 135, 128, 123, 112, 110, 175 } },	// 78
	{ Au, "Gold", "Au", 196.97, true, { 174, 135, 144, 124, 121, 123, 166 } },	// 79
	{ Hg, "Mercury", "Hg", 200.59, true, { 171, 150, 149, 133, 142, kNA, 155 } },	// 80
	{ Tl, "Thallium", "Tl", 204.38, true, { 156, 190, 148, 144, 142, 150, 196 } },	// 81
	{ Pb, "Lead", "Pb", 207.2, true, { 154, 180, 147, 144, 135, 137, 202 } },	// 82
	{ Bi, "Bismuth", "Bi", 208.98, true, { 143, 160, 146, 151, 141, 135, kNA } },	// 83
	{ Po, "Polonium", "Po", 209, true, { 135, 190, kNA, 145, 135, 129, kNA } },	// 84
	{ At, "Astatine", "At", 210, false, { 127, kNA, kNA, 147, 138, 138, kNA } },	// 85
	{ Rn, "Radon", "Rn", 222, false, { 120, kNA, 145, 142, 145, 133, kNA } },	// 86
	{ Fr, "Francium", "Fr", 223, true, { kNA, kNA, kNA, 223, 218, kNA, kNA } },	// 87
	{ Ra, "Radium", "Ra", 226, true, { kNA, 215, kNA, 201, 173, 159, kNA } },	// 88
	{ Ac, "Actinium", "Ac", 227, true, { kNA, 195, kNA, 186, 153, 140, kNA } },	// 89
	{ Th, "Thorium", "Th", 232.04, true, { kNA, 180, kNA, 175, 143, 136, kNA } },	// 90
	{ Pa, "Protactinium", "Pa", 231.04, true, { kNA, 180, kNA, 169, 138, 129, kNA } },	// 91
	{ U, "Uranium", "U", 238.03, true, { kNA, 175, kNA, 170, 134, 118, 186 } },	// 92
	{ Np, "Neptunium", "Np", 237, true, { kNA, 175, kNA, 171, 136, 116, kNA } },	// 93
	{ Pu, "Plutonium", "Pu", 244, true, { kNA, 175, kNA, 172, 135, kNA, kNA } },	// 94
	{ Am, "Americium", "Am", 243, true, { kNA, 175, kNA, 166, 135, kNA, kNA } },	// 95
	{ Cm, "Curium", "Cm", 247, true, { kNA, kNA, kNA, 166, 136, kNA, kNA } },	// 96
	{ Bk, "Berkelium", "Bk", 247, true, { kNA, kNA, kNA, 168, 139, kNA, kNA } },	// 97
	{ Cf, "Californium", "Cf", 251, true, { kNA, kNA, kNA, 168, 140, kNA, kNA } },	// 98
	{ Es, "Einsteinium", "Es", 252, true, { kNA, kNA, kNA, 165, 140, kNA, kNA } },	// 99
	{ Fm, "Fermium", "Fm", 257, true, { kNA, kNA, kNA, 167, kNA, kNA, kNA } },	// 100
	{ Md, "Mendelevium", "Md", 258, true, { kNA, kNA, kNA, 173, 139, kNA, kNA } },	// 101
	{ No, "Nobelium", "No", 259, true, { kNA, kNA, kNA, 176, kNA, kNA, kNA } },	// 102
	{ Lr, "Lawrencium", "Lr", 266, true, { kNA, kNA, kNA, 161, 141, kNA, kNA } },	// 103
	{ Rf, "Rutherfordium", "Rf", 267, true, { kNA, kNA, kNA, 157, 140, 131, kNA } },	// 104
	{ Db, "Dubnium", "Db", 268, true, { kNA, kNA, kNA, 149, 136, 126, kNA } },	// 105
	{ Sg, "Seaborgium", "Sg", 269, true, { kNA, kNA, kNA, 143, 128, 121, kNA } },	// 106
	{ Bh, "Bohrium", "Bh", 270, true, { kNA, kNA, kNA, 141, 128, 119, kNA } },	// 107
	{ Hs, "Hassium", "Hs", 277, true, { kNA, kNA, kNA, 134, 125, 118, kNA } },	// 108
	{ Mt, "Meitnerium", "Mt", 278, true, { kNA, kNA, kNA, 129, 125, 113, kNA } },	// 109
	{ Ds, "Darmstadtium", "Ds", 281, true, { kNA, kNA, kNA, 128, 116, 112, kNA } },	// 110
	{ Rg, "Roentgenium", "Rg", 282, true, { kNA, kNA, kNA, 121, 116, 118, kNA } },	// 111
	{ Cn, "Copernicium", "Cn", 285, true, { kNA, kNA, kNA, 122, 137, 130, kNA } },	// 112
	{ Nh, "Nihonium", "Nh", 286, true, { kNA, kNA, kNA, 136, kNA, kNA, kNA } },	// 113
	{ Fl, "Flerovium", "Fl", 289, true, { kNA, kNA, kNA, 143, kNA, kNA, kNA } },	// 114
	{ Mc, "Moscovium", "Mc", 290, true, { kNA, kNA, kNA, 162, kNA, kNA, kNA } },	// 115
	{ Lv, "Livermorium", "Lv", 293, true, { kNA, kNA, kNA, 175, kNA, kNA, kNA } },	// 116
	{ Ts, "Tennessine", "Ts", 294, true, { kNA, kNA, kNA, 165, kNA, kNA, kNA } },	// 117
	{ Og, "Oganesson", "Og", 294, true, { kNA, kNA, kNA, 157, kNA, kNA, kNA } }	// 118
};

// Number of rows in kKnownAtoms
uint32 kKnownAtomsCount = sizeof(kKnownAtoms) / sizeof(atom_type_info);
// --------------------------------------------------------------------
// atom_type_traits
// Look up the traits of the element whose symbol equals `symbol`, compared
// with cif::iequals. The first matching entry of kKnownAtoms is used.
// Throws invalid_argument when no entry has this symbol.
atom_type_traits::atom_type_traits(const string& symbol)
	: m_info(nullptr)
{
	for (auto& candidate: kKnownAtoms)
	{
		if (not cif::iequals(candidate.symbol, symbol))
			continue;

		m_info = &candidate;
		break;
	}

	if (m_info == nullptr)
		throw invalid_argument("Not a known element: " + symbol);
}
// Look up the traits for atom type `t` by using it as an index into
// kKnownAtoms. Throws invalid_argument when t lies outside [H, Lr].
// NOTE(review): this only yields the right entry when entry number t of
// kKnownAtoms describes atom type t -- confirm against the table's ordering.
atom_type_traits::atom_type_traits(atom_type t)
{
	const bool in_range = (t >= H and t <= Lr);
	if (not in_range)
		throw invalid_argument("atom_type out of range");

	m_info = &kKnownAtoms[t];
}
// Returns true when `symbol` equals (cif::iequals) the symbol of one of the
// entries in kKnownAtoms, false otherwise.
bool atom_type_traits::is_element(const string& symbol)
{
	for (auto& candidate: kKnownAtoms)
	{
		if (cif::iequals(candidate.symbol, symbol))
			return true;
	}

	return false;
}
// Returns the `metal` flag of the first entry in kKnownAtoms whose symbol
// equals (cif::iequals) `symbol`, or false when there is no such entry.
bool atom_type_traits::is_metal(const std::string& symbol)
{
	for (auto& candidate: kKnownAtoms)
	{
		if (cif::iequals(candidate.symbol, symbol))
			return candidate.metal;
	}

	return false;
}
}
// cif parsing library
#include <cassert>
#include <new>
#include <regex>
#include <set>
#include <stack>
#include <tuple>
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include <boost/filesystem/operations.hpp>
#include <boost/filesystem/fstream.hpp>
#if defined(USE_RSRC)
#include "mrsrc.h"
#endif
#include "cif++.h"
#include "cif-parser.h"
#include "cif-validator.h"
#include "cif-utils.h"
using namespace std;
namespace ba = boost::algorithm;
namespace fs = boost::filesystem;
extern int VERBOSE;
namespace cif
{
// Shared empty C string that is returned when a row has no value for a
// column. item_reference::empty() compares against this very pointer.
static const char* kEmptyResult = "";
// --------------------------------------------------------------------
// most internal data structures are stored as linked lists
// item values are stored in a simple struct. They should be const anyway
// One value of a row: a node in a singly linked list that stores the index
// of the column it belongs to, followed directly by the text of the value.
struct item_value
{
// next value in the same row, nullptr for the last one
item_value* m_next;
// index of the column this value belongs to
uint32 m_column_index;
// zero-length trailing array: the text lives in the extra bytes that
// operator new allocates behind the struct, so this must stay the last member
char m_text[0];
// copies v (including the terminating NUL) into m_text
item_value(const char* v, uint32 column_index);
// deletes the values chained behind this one as well
~item_value();
// data_size is the length of the text that will be copied into m_text
void* operator new(size_t size, size_t data_size);
void operator delete(void* p);
};
// --------------------------------------------------------------------
// Create an unlinked value for column `column_index` and copy `value`,
// including its terminating NUL, into the trailing text buffer.
item_value::item_value(const char* value, uint32 column_index)
	: m_next(nullptr)
	, m_column_index(column_index)
{
	strcpy(m_text, value);
}
// Destroy this value and every value chained behind it. The chain is taken
// apart iteratively: each successor is unlinked before it is deleted, so
// its own destructor has nothing left to delete and no recursion occurs.
// A node that points to itself is left alone.
item_value::~item_value()
{
	for (item_value* successor = m_next;
		successor != nullptr and successor != this;
		successor = m_next)
	{
		m_next = successor->m_next;
		successor->m_next = nullptr;
		delete successor;
	}
}
void* item_value::operator new(size_t size, size_t data_size)
{
return malloc(size + data_size + 1);
}
// Release storage that was obtained with the malloc based operator new.
void item_value::operator delete(void* p)
{
	if (p != nullptr)
		free(p);
}
// --------------------------------------------------------------------
// item_column contains info about a column or field in a category
// Description of one column (field) of a category. Instances are created
// with positional aggregate initialisation, so the member order matters.
struct item_column
{
string m_name; // store lower-case, for optimization
// validator for the values in this column, nullptr when there is none
const validate_item* m_validator;
};
// item_row contains the actual values for a row in a category
// item_row holds the values of a single row in a category. Rows are chained
// through m_next, the values of a row are chained starting at m_values.
struct item_row
{
	// deletes the rows chained behind this one and all values of this row
	~item_row();

	// remove the value stored for column_ix, if there is one
	void drop(uint32 column_ix);

	// text stored for column_ix, or an empty string when there is none
	const char* c_str(uint32 column_ix) const;

	// render the row as {column:value, column:value}
	string str() const
	{
		stringstream text;
		const char* separator = "";

		text << '{';
		for (const item_value* value = m_values; value != nullptr; value = value->m_next)
		{
			text << separator
				 << m_category->get_column_name(value->m_column_index)
				 << ':'
				 << value->m_text;
			separator = ", ";
		}
		text << '}';

		return text.str();
	}

	item_row* m_next;
	category* m_category;
	item_value* m_values;
};
// Write a row as category_name[value,value,...]
ostream& operator<<(ostream& os, const item_row& r)
{
	os << r.m_category->name() << '[';

	const char* separator = "";
	for (const item_value* value = r.m_values; value != nullptr; value = value->m_next)
	{
		os << separator << value->m_text;
		separator = ",";
	}

	os << ']';
	return os;
}
// --------------------------------------------------------------------
// Destroy this row, the rows chained behind it and the values of this row.
// Successors are unlinked one by one before they are deleted, which keeps
// the destruction of a long chain iterative instead of recursive.
item_row::~item_row()
{
	for (item_row* successor = m_next;
		successor != nullptr and successor != this;
		successor = m_next)
	{
		m_next = successor->m_next;
		successor->m_next = nullptr;
		delete successor;
	}

	delete m_values;
}
// Remove the value stored for column `column_ix` from this row, if any.
// At most one value (the first one found) is removed. A row without any
// values is handled as well; walking the list through a pointer to the
// link that refers to the current node covers the empty list, the head
// and all other positions with the same code.
void item_row::drop(uint32 column_ix)
{
	for (item_value** link = &m_values; *link != nullptr; link = &(*link)->m_next)
	{
		item_value* v = *link;
		if (v->m_column_index != column_ix)
			continue;

		*link = v->m_next;
		v->m_next = nullptr;	// the destructor deletes whatever is still chained
		delete v;
		break;
	}
#if DEBUG
	for (auto iv = m_values; iv != nullptr; iv = iv->m_next)
		assert(iv != iv->m_next and (iv->m_next == nullptr or iv != iv->m_next->m_next));
#endif
}
// Return the text stored for column `column_ix`, or kEmptyResult when this
// row has no value for that column.
const char* item_row::c_str(uint32 column_ix) const
{
	for (const item_value* value = m_values; value != nullptr; value = value->m_next)
	{
		if (value->m_column_index == column_ix)
			return value->m_text;
	}

	return kEmptyResult;
}
// --------------------------------------------------------------------
namespace detail
{
// Assign a string value to the referenced item; this is an update of an
// existing row (the emplacing flag passed to row::assign is false).
template<>
item_reference& item_reference::operator=(const string& value)
{
	row target(m_row);
	target.assign(m_name, value, false);
	return *this;
}
// Return the text of the referenced item. kEmptyResult is returned when
// there is no row, when the row has no value for the item, or when the
// stored value is the single character '.'.
const char*
item_reference::c_str() const
{
	if (m_row == nullptr)
		return kEmptyResult;

	const auto cix = m_row->m_category->get_column_index(m_name);

	auto value = m_row->m_values;
	while (value != nullptr and value->m_column_index != cix)
		value = value->m_next;

	if (value == nullptr)
		return kEmptyResult;

	const bool is_single_dot = (value->m_text[0] == '.' and value->m_text[1] == 0);
	return is_single_dot ? kEmptyResult : value->m_text;
}
// An item is empty when c_str() hands out the shared kEmptyResult pointer;
// note that this compares pointers, not the characters.
bool item_reference::empty() const
{
	const char* text = c_str();
	return text == kEmptyResult;
}
}
// --------------------------------------------------------------------
// datablock implementation
// Create an empty datablock called `name`, without validator and without
// a following datablock.
datablock::datablock(const string& name)
	: m_name(name)
	, m_validator(nullptr)
	, m_next(nullptr)
{
}
// Destroying a datablock also destroys the datablock linked through m_next
// (deleting a null pointer is a no-op).
datablock::~datablock()
{
	delete m_next;
}
// Return, as a string, the item named by `tag` taken from the first
// category whose name matches the category part of the tag (compared
// case-insensitively). Returns an empty string when no category matches.
string datablock::first_item(const string& tag) const
{
	string cat_name, item_name;
	std::tie(cat_name, item_name) = split_tag_name(tag);

	for (auto& cat: m_categories)
	{
		if (not iequals(cat.name(), cat_name))
			continue;

		return cat.get_first_item(item_name.c_str()).as<string>();
	}

	return string();
}
// Find the category called `name` (case-insensitive) and create it at the
// end of the list when it does not exist yet. Returns the position of the
// category and a flag that is true when the category was newly created.
auto datablock::emplace(const string& name) -> tuple<iterator,bool>
{
	iterator pos = begin();
	while (pos != end() and not iequals(pos->name(), name))
		++pos;

	const bool created = (pos == end());
	if (created)
		pos = m_categories.emplace(end(), *this, name, m_validator);

	return make_tuple(pos, created);
}
// Return the category called `name`, creating it first when needed.
category& datablock::operator[](const string& name)
{
	return *std::get<0>(emplace(name));
}
// Return a pointer to the category called `name` (case-insensitive), or
// nullptr when this datablock has no such category.
category* datablock::get(const string& name)
{
	for (auto& cat: *this)
	{
		if (iequals(cat.name(), name))
			return &cat;
	}

	return nullptr;
}
// Validate all categories in this datablock.
// Throws runtime_error when no validator has been set.
void datablock::validate()
{
	if (m_validator != nullptr)
	{
		for (auto& cat: *this)
			cat.validate();
		return;
	}

	throw runtime_error("validator not specified");
}
// Store the validator and pass it on to every category in this datablock.
void datablock::set_validator(validator* v)
{
	m_validator = v;

	for (auto cat = begin(); cat != end(); ++cat)
		cat->set_validator(v);
}
// Append the tags of all categories, in category order, to `tags`.
void datablock::get_tag_order(vector<string>& tags) const
{
	for (auto cat = begin(); cat != end(); ++cat)
		cat->get_tag_order(tags);
}
// Write this datablock to `os`.
// mmcif support, sort of: the 'entry' category is written first and, if it
// exists _AND_ we have a validator, it is followed by a freshly generated
// audit_conform record that names the dictionary. All other categories
// follow; an audit_conform category stored in the datablock is not written.
void datablock::write(ostream& os)
{
	os << "data_" << m_name << endl
	   << "# " << endl;

	for (auto& cat: m_categories)
	{
		if (cat.name() != "entry")
			continue;

		cat.write(os);

		if (m_validator != nullptr)
		{
			category audit_conform(*this, "audit_conform", nullptr);
			audit_conform.emplace({
				{ "dict_name", m_validator->dict_name() },
				{ "dict_version", m_validator->dict_version() }
			});
			audit_conform.write(os);
		}

		break;
	}

	for (auto& cat: m_categories)
	{
		const bool special = (cat.name() == "entry" or cat.name() == "audit_conform");
		if (not special)
			cat.write(os);
	}
}
// Write this datablock to `os` using the tag order given in `order`.
//
// The categories are written in the order in which they first appear in
// `order`, and the items of each category in the order of their tags.
// Categories that are not mentioned in `order` are written afterwards in
// their stored order. Category names are compared case-insensitively, both
// when collecting the categories and when collecting the items of a
// category, so tags that differ only in case end up in the same category.
void datablock::write(ostream& os, const vector<string>& order)
{
	os << "data_" << m_name << endl
	   << "# " << endl;

	// collect the distinct category names in order of first appearance
	vector<string> catOrder;
	for (auto& o: order)
	{
		string cat, item;
		std::tie(cat, item) = split_tag_name(o);

		if (find_if(catOrder.rbegin(), catOrder.rend(), [&cat](const string& s) -> bool { return iequals(cat, s); }) == catOrder.rend())
			catOrder.push_back(cat);
	}

	for (auto& c: catOrder)
	{
		auto cat = get(c);
		if (cat == nullptr)
			continue;

		// the items requested for this category, in the requested order
		vector<string> items;
		for (auto& o: order)
		{
			string cat_name, item;
			std::tie(cat_name, item) = split_tag_name(o);

			if (iequals(cat_name, c))
				items.push_back(item);
		}

		cat->write(os, items);
	}

	// for any category we missed in the catOrder
	for (auto& cat: m_categories)
	{
		if (find_if(catOrder.begin(), catOrder.end(), [&](const string& s) -> bool { return iequals(cat.name(), s); }) != catOrder.end())
			continue;

		cat.write(os);
	}
}
// --------------------------------------------------------------------
//
// class to compare two rows based on their keys.
// Three-way comparison of two rows, column by column. The result is
// negative, zero or positive, it is not a less-than predicate.
class row_comparator
{
  public:

	// compare on the key fields listed by the category's validator
	row_comparator(category* cat)
		: row_comparator(cat, cat->get_cat_validator()->m_keys.begin(), cat->get_cat_validator()->m_keys.end())
	{
	}

	// compare on the fields named in the range [b, e)
	template<typename KeyIter>
	row_comparator(category* cat, KeyIter b, KeyIter e);

	int operator()(const item_row* a, const item_row* b) const;

	int operator()(const row& a, const row& b) const
	{
		return (*this)(a.m_data, b.m_data);
	}

  private:

	using compare_func = function<int(const char*,const char*)>;

	// column index and the function used to compare values in that column
	using key_comp = tuple<size_t,compare_func>;

	vector<key_comp> m_comp;
};
// Build a comparator for the fields named in [b, e). For each field the
// column index in `cat` is looked up once and stored together with the
// compare function of the field's type validator.
//
// Throws runtime_error when a field has to be compared but the category has
// no category validator, when the dictionary has no item validator for a
// field, or when that item validator has no type validator.
template<typename KeyIter>
row_comparator::row_comparator(category* cat, KeyIter b, KeyIter e)
{
	auto cv = cat->get_cat_validator();

	for (auto ki = b; ki != e; ++ki)
	{
		string k = *ki;

		// without a category validator there is no way to find out how to
		// compare values; report this instead of dereferencing a null pointer
		if (cv == nullptr)
			throw runtime_error("Incomplete dictionary, no category validator for " + cat->name());

		size_t ix = cat->get_column_index(k);

		auto iv = cv->get_validator_for_item(k);
		if (iv == nullptr)
			throw runtime_error("Incomplete dictionary, no item validator for key " + k);

		auto tv = iv->m_type;
		if (tv == nullptr)
			throw runtime_error("Incomplete dictionary, no type validator for item " + k);

		using namespace placeholders;
		m_comp.emplace_back(ix, bind(&validate_type::compare, tv, _1, _2));
	}
}
// Compare rows a and b on each configured column in turn and return the
// result of the first comparison that is not zero, or zero when the rows
// are equal on all columns.
int row_comparator::operator()(const item_row* a, const item_row* b) const
{
	assert(a);
	assert(b);

	for (auto& c: m_comp)
	{
		const size_t column = std::get<0>(c);
		const compare_func& compare = std::get<1>(c);

		const char* ka = a->c_str(column);
		const char* kb = b->c_str(column);

		const int d = compare(ka, kb);
		if (d != 0)
			return d;
	}

	return 0;
}
// --------------------------------------------------------------------
//
// class to keep an index on the keys of a category. This is a red/black
// tree implementation.
// Index on the key fields of a category, implemented as a red/black tree
// of entries that each point to a row. Rows are ordered with a
// row_comparator built for the category. The index does not own the rows.
class cat_index
{
public:
cat_index(category* cat);
~cat_index();
// returns the row in the index that compares equal to k, or nullptr
item_row* find(item_row* k) const;
// add row r; throws runtime_error when an equal row is already present
void insert(item_row* r);
// remove the entry that compares equal to r
void erase(item_row* r);
// batch create
void reconstruct();
// reorder the item_row's and returns new head and tail
// (both are nullptr when the index is empty)
tuple<item_row*,item_row*> reorder()
{
tuple<item_row*,item_row*> result = make_tuple(nullptr, nullptr);
if (m_root != nullptr)
{
entry* head = findMin(m_root);
entry* tail = reorder(m_root);
// terminate the relinked list
tail->m_row->m_next = nullptr;
result = make_tuple(head->m_row, tail->m_row);
}
return result;
}
// number of entries in the tree
size_t size() const;
// checks the tree invariants using assert
void validate() const;
private:
// A node in the tree. Deleting an entry deletes both its subtrees,
// the row it points to is left alone.
struct entry
{
// new entries start out red and without children
entry(item_row* r)
: m_row(r), m_left(nullptr), m_right(nullptr), m_red(true) {}
~entry()
{
delete m_left;
delete m_right;
}
item_row* m_row;
entry* m_left;
entry* m_right;
bool m_red;
};
// recursive workers, both return the new root of the subtree h
entry* insert(entry* h, item_row* v);
entry* erase(entry* h, item_row* k);
void validate(entry* h, bool isParentRed, uint32 blackDepth, uint32& minBlack, uint32& maxBlack) const;
// Make the right child of h the root of this subtree. The new root takes
// over the colour of h and h itself becomes red.
entry* rotateLeft(entry* h)
{
entry* x = h->m_right;
h->m_right = x->m_left;
x->m_left = h;
x->m_red = h->m_red;
h->m_red = true;
return x;
}
// Mirror image of rotateLeft: the left child of h becomes the root.
entry* rotateRight(entry* h)
{
entry* x = h->m_left;
h->m_left = x->m_right;
x->m_right = h;
x->m_red = h->m_red;
h->m_red = true;
return x;
}
// Invert the colour of h and of each child that exists.
void flipColour(entry* h)
{
h->m_red = not h->m_red;
if (h->m_left != nullptr)
h->m_left->m_red = not h->m_left->m_red;
if (h->m_right != nullptr)
h->m_right->m_red = not h->m_right->m_red;
}
// a missing entry counts as black
bool isRed(entry* h) const
{
return h != nullptr and h->m_red;
}
// Used while erasing, before descending to the left: flips the colours
// at h and, when the right child has a red left child, rotates that
// red entry up and flips the colours again.
entry* moveRedLeft(entry* h)
{
flipColour(h);
if (h->m_right != nullptr and isRed(h->m_right->m_left))
{
h->m_right = rotateRight(h->m_right);
h = rotateLeft(h);
flipColour(h);
}
return h;
}
// Used while erasing, before descending to the right: flips the colours
// at h and, when the left child has a red left child, rotates right and
// flips the colours again.
entry* moveRedRight(entry* h)
{
flipColour(h);
if (h->m_left != nullptr and isRed(h->m_left->m_left))
{
h = rotateRight(h);
flipColour(h);
}
return h;
}
// Restore the colour rules at h on the way back up after erasing:
// rotate a red right child to the left, rotate two reds in a row on the
// left to the right and flip colours when both children are red.
entry* fixUp(entry* h)
{
if (isRed(h->m_right))
h = rotateLeft(h);
if (isRed(h->m_left) and isRed(h->m_left->m_left))
h = rotateRight(h);
if (isRed(h->m_left) and isRed(h->m_right))
flipColour(h);
return h;
}
// left-most entry of the subtree h, h must not be nullptr
entry* findMin(entry* h)
{
while (h->m_left != nullptr)
h = h->m_left;
return h;
}
// Remove the left-most entry of the subtree h and return the new root of
// the subtree. NOTE(review): the entry without left child is deleted
// together with whatever hangs off its m_right -- presumably that is
// always nullptr at this point, confirm against the tree invariants.
entry* eraseMin(entry* h)
{
if (h->m_left == nullptr)
{
delete h;
h = nullptr;
}
else
{
if (not isRed(h->m_left) and not isRed(h->m_left->m_left))
h = moveRedLeft(h);
h->m_left = eraseMin(h->m_left);
h = fixUp(h);
}
return h;
}
// Fix m_next fields for rows in order of this index
// Returns the last (right-most) entry of the subtree e. The m_next of the
// row of that last entry is not touched here.
entry* reorder(entry* e)
{
auto result = e;
if (e->m_left != nullptr)
{
// the last row of the left subtree is followed by the row of e
auto l = reorder(e->m_left);
l->m_row->m_next = e->m_row;
}
if (e->m_right != nullptr)
{
// the row of e is followed by the first row of the right subtree
auto mr = findMin(e->m_right);
e->m_row->m_next = mr->m_row;
result = reorder(e->m_right);
}
return result;
}
category& m_cat;
row_comparator m_comp;
entry* m_root;
};
// Create an empty index for category `cat`, comparing rows on the key
// fields of that category. `cat` must not be nullptr.
cat_index::cat_index(category* cat)
	: m_cat(*cat)
	, m_comp(cat)
	, m_root(nullptr)
{
}
// Deleting the root entry deletes the entire tree; the rows themselves are
// not deleted.
cat_index::~cat_index()
{
	delete m_root;
}
// Search the tree for a row that compares equal to `k`. Returns the row
// stored in the index, or nullptr when there is no such row.
item_row* cat_index::find(item_row* k) const
{
	for (const entry* node = m_root; node != nullptr; )
	{
		const int d = m_comp(k, node->m_row);

		if (d == 0)
			return node->m_row;

		node = (d < 0) ? node->m_left : node->m_right;
	}

	return nullptr;
}
// Add row `k` to the index. The root of the tree is always coloured black
// afterwards. A duplicate key makes the recursive insert throw, in which
// case the root is left as it was.
void cat_index::insert(item_row* k)
{
	entry* root = insert(m_root, k);
	root->m_red = false;
	m_root = root;
}
// Recursive worker for insert: add row `v` to the subtree rooted at `h` and
// return the new root of that subtree. On the way back up the subtree is
// rebalanced with rotations and colour flips.
// Throws runtime_error when a row with an equal key is already present.
cat_index::entry* cat_index::insert(entry* h, item_row* v)
{
	if (h == nullptr)
		return new entry(v);

	const int d = m_comp(v, h->m_row);

	if (d == 0)
		throw runtime_error("Duplicate key violation, cat: " + m_cat.name() + " values: " + v->str());

	if (d < 0)
		h->m_left = insert(h->m_left, v);
	else
		h->m_right = insert(h->m_right, v);

	// a red link may only lean to the left
	if (isRed(h->m_right) and not isRed(h->m_left))
		h = rotateLeft(h);

	// two red links in a row on the left
	if (isRed(h->m_left) and isRed(h->m_left->m_left))
		h = rotateRight(h);

	// both children red
	if (isRed(h->m_left) and isRed(h->m_right))
		flipColour(h);

	return h;
}
// Remove the entry that compares equal to row `k` from the index and
// colour the (possibly new) root black. Erasing from an empty index is a
// no-op; the recursive worker must not be called with a null root since it
// dereferences the entry it is given.
void cat_index::erase(item_row* k)
{
	if (m_root == nullptr)
		return;

	m_root = erase(m_root, k);
	if (m_root != nullptr)
		m_root->m_red = false;
}
// Recursive worker for erase: remove the entry whose row compares equal to
// k from the subtree rooted at h and return the new root of that subtree
// (nullptr when the subtree is empty afterwards). h must not be nullptr.
cat_index::entry* cat_index::erase(entry* h, item_row* k)
{
if (m_comp(k, h->m_row) < 0)
{
// k sorts before h: continue in the left subtree, if there is one
if (h->m_left != nullptr)
{
if (not isRed(h->m_left) and not isRed(h->m_left->m_left))
h = moveRedLeft(h);
h->m_left = erase(h->m_left, k);
}
}
else
{
// k is equal to h or sorts after it
if (isRed(h->m_left))
h = rotateRight(h);
// found k in an entry without right subtree: remove this entry
if (m_comp(k, h->m_row) == 0 and h->m_right == nullptr)
{
delete h;
return nullptr;
}
if (h->m_right != nullptr)
{
if (not isRed(h->m_right) and not isRed(h->m_right->m_left))
h = moveRedRight(h);
if (m_comp(k, h->m_row) == 0)
{
// found k in an entry with a right subtree: take over the row of the
// smallest entry on the right and remove that entry instead
h->m_row = findMin(h->m_right)->m_row;
h->m_right = eraseMin(h->m_right);
}
else
h->m_right = erase(h->m_right, k);
}
}
// restore the colour rules on the way back up
return fixUp(h);
}
// Throw away the current tree and build a new one by inserting every row
// of the category, one at a time.
void cat_index::reconstruct()
{
delete m_root;
m_root = nullptr;
for (auto r: m_cat)
insert(r.m_data);
// Maybe reconstruction can be done quicker by using the following commented code.
// However, I've not had the time to think of a way to set the red/black flag correctly in that case.
// vector<item_row*> rows;
// transform(m_cat.begin(), m_cat.end(), back_inserter(rows),
// [](row r) -> item_row* { assert(r.m_data); return r.m_data; });
//
// assert(std::find(rows.begin(), rows.end(), nullptr) == rows.end());
//
// // don't use sort here, it will run out of stack or something.
// // quicksort is notorious for using excessive recursion.
// // Besides, most of the time, the data is ordered already anyway.
//
// stable_sort(rows.begin(), rows.end(), [this](item_row* a, item_row* b) -> bool { return this->m_comp(a, b) < 0; });
//
// for (size_t i = 0; i < rows.size() - 1; ++i)
// assert(m_comp(rows[i], rows[i + 1]) < 0);
//
// deque<entry*> e;
// transform(rows.begin(), rows.end(), back_inserter(e),
// [](item_row* r) -> entry* { return new entry(r); });
//
// while (e.size() > 1)
// {
// deque<entry*> ne;
//
// while (not e.empty())
// {
// entry* a = e.front();
// e.pop_front();
//
// if (e.empty())
// ne.push_back(a);
// else
// {
// entry* b = e.front();
// b->m_left = a;
//
// assert(m_comp(a->m_row, b->m_row) < 0);
//
// e.pop_front();
//
// if (not e.empty())
// {
// entry* c = e.front();
// e.pop_front();
//
// assert(m_comp(b->m_row, c->m_row) < 0);
//
// b->m_right = c;
// }
//
// ne.push_back(b);
//
// if (not e.empty())
// {
// ne.push_back(e.front());
// e.pop_front();
// }
// }
// }
//
// swap (e, ne);
// }
//
// assert(e.size() == 1);
// m_root = e.front();
}
size_t cat_index::size() const
{
stack<entry*> s;
s.push(m_root);
size_t result = 0;
while (not s.empty())
{
entry* e = s.top();
s.pop();
if (e == nullptr)
continue;
++result;
s.push(e->m_left);
s.push(e->m_right);
}
return result;
}
void cat_index::validate() const
{
if (m_root != nullptr)
{
uint32 minBlack = numeric_limits<uint32>::max();
uint32 maxBlack = 0;
assert(not m_root->m_red);
validate(m_root, false, 0, minBlack, maxBlack);
assert(minBlack == maxBlack);
}
}
// Recursive worker for validate. Checks, using assert, that
// - a red entry does not have a red parent,
// - an entry with two children does not have two red children,
// - the row of the left child sorts before and the row of the right child
//   sorts after the row of h.
// blackDepth is the number of black entries on the path from the root to
// the parent of h; minBlack and maxBlack collect the smallest and largest
// number of black entries seen on a path that ends at a missing child.
//
// isParentRed must describe the colour of h's parent, so both recursive
// calls pass h->m_red.
void cat_index::validate(entry* h, bool isParentRed, uint32 blackDepth, uint32& minBlack, uint32& maxBlack) const
{
	if (h->m_red)
		assert(not isParentRed);
	else
		++blackDepth;

	if (isParentRed)
		assert(not h->m_red);

	if (h->m_left != nullptr and h->m_right != nullptr)
	{
		if (isRed(h->m_left))
			assert(not isRed(h->m_right));
		if (isRed(h->m_right))
			assert(not isRed(h->m_left));
	}

	// a path ends here, record the number of black entries on it
	auto recordPathEnd = [&]()
	{
		if (minBlack > blackDepth)
			minBlack = blackDepth;
		if (maxBlack < blackDepth)
			maxBlack = blackDepth;
	};

	if (h->m_left != nullptr)
	{
		assert(m_comp(h->m_left->m_row, h->m_row) < 0);
		validate(h->m_left, h->m_red, blackDepth, minBlack, maxBlack);
	}
	else
		recordPathEnd();

	if (h->m_right != nullptr)
	{
		assert(m_comp(h->m_right->m_row, h->m_row) > 0);
		validate(h->m_right, h->m_red, blackDepth, minBlack, maxBlack);
	}
	else
		recordPathEnd();
}
// --------------------------------------------------------------------
// A rowset refers to the category its rows are taken from.
rowset::rowset(category& cat)
	: m_cat(cat)
{
}
// Sort the rows in this set on the fields named in `items`, keeping the
// relative order of rows that compare equal. Returns *this.
//
// row_comparator is a three-way comparison returning an int, whereas
// stable_sort needs a less-than predicate; the lambda translates one into
// the other. Passing the comparator directly would treat 'greater' as
// 'less' as well.
rowset& rowset::orderBy(initializer_list<string> items)
{
	row_comparator c(&m_cat, items.begin(), items.end());

	stable_sort(begin(), end(),
		[&c](const row& a, const row& b) -> bool { return c(a, b) < 0; });

	return *this;
}
// --------------------------------------------------------------------
// Create an empty category called `name` in datablock `db`.
//
// When a validator is given and it knows this category, the key fields and
// the mandatory fields are added as columns right away and an index on the
// key fields is created.
//
// Throws validation_error when `name` is empty.
category::category(datablock& db, const string& name, validator* validator)
	: m_db(db), m_name(name), m_validator(validator)
	, m_head(nullptr), m_tail(nullptr), m_index(nullptr)
{
	// m_cat_validator is not part of the initialiser list above; make sure
	// it has a defined value when there is no validator, since it is
	// compared with nullptr by other members later on
	m_cat_validator = nullptr;

	if (m_name.empty())
		throw validation_error("invalid empty name for category");

	if (m_validator == nullptr)
		return;

	m_cat_validator = m_validator->get_validator_for_category(m_name);
	if (m_cat_validator == nullptr)
		return;

	// make sure all required columns are added
	for (auto& k: m_cat_validator->m_keys)
		add_column(k);

	for (auto& k: m_cat_validator->m_mandatory_fields)
		add_column(k);

	m_index = new cat_index(this);
}
// Deleting the head row deletes all rows chained behind it as well. The
// index does not own the rows, so the order of these two does not matter.
category::~category()
{
	delete m_index;
	delete m_head;
}
// Replace the validator of this category. The existing index is always
// discarded; when the new validator knows this category a new index is
// created and filled with the rows currently in the category.
void category::set_validator(validator* v)
{
	m_validator = v;

	// deleting a null pointer is a no-op
	delete m_index;
	m_index = nullptr;

	if (m_validator == nullptr)
	{
		m_cat_validator = nullptr;
		return;
	}

	m_cat_validator = m_validator->get_validator_for_category(m_name);
	if (m_cat_validator == nullptr)
		return;

	m_index = new cat_index(this);
	m_index->reconstruct();
#if DEBUG
	assert(m_index->size() == size());
	m_index->validate();
#endif
}
// Return the index of the column called `name` (case-insensitive). When
// there is no such column the number of columns is returned.
size_t category::get_column_index(const string& name) const
{
	size_t ix = 0;

	while (ix < m_columns.size() and not iequals(name, m_columns[ix].m_name))
		++ix;

	return ix;
}
// Return the name of the column at index `column_ix`; the bounds-checked
// at() of the column container is used for the lookup.
const string& category::get_column_name(size_t column_ix) const
{
	const item_column& column = m_columns.at(column_ix);
	return column.m_name;
}
// Return the index of the column called `name`, adding the column at the
// end when it does not exist yet. When the category has a validator, the
// item validator for the new column is looked up; a column that is unknown
// to the validator is reported as an error but is added nevertheless if
// report_error returns.
size_t category::add_column(const string& name)
{
	const size_t ix = get_column_index(name);

	if (ix != m_columns.size())
		return ix;		// already there

	const validate_item* item_validator = nullptr;

	if (m_cat_validator != nullptr)
	{
		item_validator = m_cat_validator->get_validator_for_item(name);
		if (item_validator == nullptr)
			m_validator->report_error("tag " + name + " not allowed in category " + m_name);
	}

	m_columns.push_back({name, item_validator});

	return ix;
}
// Relink the rows in the order of the index and pick up the new head and
// tail. Does nothing when this category has no index.
void category::reorderByIndex()
{
	if (m_index == nullptr)
		return;

	std::tie(m_head, m_tail) = m_index->reorder();
}
// Number of rows, obtained by walking the list of rows.
size_t category::size() const
{
	size_t count = 0;

	auto cursor = m_head;
	while (cursor != nullptr)
	{
		++count;
		cursor = cursor->m_next;
	}

	return count;
}
// A category is empty when it has no rows, or when its first row has no
// values.
bool category::empty() const
{
	const bool has_data = (m_head != nullptr and m_head->m_values != nullptr);
	return not has_data;
}
void category::drop(const string& field)
{
using namespace placeholders;
auto ci = find_if(m_columns.begin(), m_columns.end(),
[field](item_column& c) -> bool { return iequals(c.m_name, field); });
if (ci != m_columns.end())
{
uint32 column_ix = ci - m_columns.begin();
for (auto pi = m_head; pi != nullptr; pi = pi->m_next)
pi->drop(column_ix);
m_columns.erase(ci);
}
}
// Return the first row for which `cond` holds, or a default constructed
// row when there is no such row.
row category::operator[](condition&& cond)
{
	for (auto r: *this)
	{
		if (cond(*this, r))
			return r;
	}

	return row();
}
// Collect all rows for which `cond` holds, in the order of the category.
rowset category::find(condition&& cond)
{
	rowset matches(*this);

	for (auto r: *this)
	{
		if (not cond(*this, r))
			continue;

		matches.push_back(r);
	}

	return matches;
}
// Returns true when `cond` holds for at least one row. The search stops at
// the first match.
bool category::exists(condition&& cond)
{
	for (auto r: *this)
	{
		if (cond(*this, r))
			return true;
	}

	return false;
}
// Return all rows of this category as a rowset ordered on `items`.
rowset category::orderBy(std::initializer_list<string> items)
{
	rowset all(*this);
	all.insert(all.begin(), begin(), end());

	rowset& sorted = all.orderBy(items);
	return sorted;
}
// Remove all rows. When the category has an index it is replaced by a
// new, empty one.
void category::clear()
{
	delete m_head;
	m_head = nullptr;
	m_tail = nullptr;

	if (m_index == nullptr)
		return;

	delete m_index;
	m_index = new cat_index(this);
}
// Add a new row with the items in [b, e) to the end of this category.
//
// When the category has a validator and items are given, all mandatory
// columns must be among the items, otherwise a runtime_error is thrown.
// When there is an index as well, a row with the same key values that
// already exists is returned instead of creating a new one.
//
// Returns the row and a flag that is true when the row was newly created.
template<class Iter>
tuple<row,bool> category::emplace(Iter b, Iter e)
{
	if (m_cat_validator != nullptr and b != e)
	{
		// First, make sure all mandatory fields are supplied
		for (auto& col: m_columns)
		{
			auto iv = m_cat_validator->get_validator_for_item(col.m_name);
			if (iv == nullptr)
				continue;

			bool seen = false;
			for (auto v = b; v != e; ++v)
			{
				if (not iequals(v->name(), col.m_name))
					continue;

				seen = true;
				break;
			}

			if (not seen and iv->m_mandatory)
				throw runtime_error("missing mandatory field " + col.m_name + " for category " + m_name);
		}

		if (m_index != nullptr)
		{
			// build a temporary row holding only the key fields and use
			// that to look for an existing row
			unique_ptr<item_row> probe(new item_row{nullptr, this, nullptr});
			row probe_row(probe.get());

			auto keys = key_fields();
			for (auto v = b; v != e; ++v)
			{
				if (keys.count(v->name()))
					probe_row.assign(v->name(), v->value(), true);
			}

			auto existing = m_index->find(probe.get());
			if (existing != nullptr)
			{
				if (VERBOSE > 1)
					cerr << "Not inserting new record in " << m_name << " (duplicate key)" << endl;

				return make_tuple(row(existing), false);
			}
		}
	}

	// append a new, empty row to the list
	auto nr = new item_row{nullptr, this, nullptr};

	if (m_head != nullptr)
	{
		assert(m_tail != nullptr);
		assert(m_head != nullptr);
		m_tail->m_next = nr;
		m_tail = nr;
	}
	else
	{
		assert(m_tail == nullptr);
		m_head = m_tail = nr;
	}

	// and fill it
	row r(nr);
	for (auto v = b; v != e; ++v)
		r.assign(*v, true);

	if (m_index != nullptr)
		m_index->insert(nr);

	return make_tuple(r, true);
}
// Add a new row that has the same items as row `r`.
tuple<row,bool> category::emplace(row r)
{
	auto first = r.begin();
	auto last = r.end();
	return emplace(first, last);
}
// Erase all rows for which `cond` holds. The rows are collected first and
// erased afterwards, so the list is not modified while it is walked.
void category::erase(condition&& cond)
{
	rowset doomed(*this);

	for (auto r: *this)
	{
		if (not cond(*this, r))
			continue;

		doomed.push_back(r);
	}

	for (auto r: doomed)
		erase(r);
}
// Erase the row that iterator `p` points at.
void category::erase(iterator p)
{
	row target = *p;
	erase(target);
}
// Erase row `r` from this category.
//
// For every key column that has child items according to its validator,
// the rows in the child categories that refer to the value of `r` are
// erased first. Then the row is removed from the index, unlinked from the
// list of rows and deleted. m_tail is kept up to date when the last row of
// the list is the one being erased.
//
// Throws runtime_error when the category has no rows at all.
void category::erase(row r)
{
	iset keys;
	if (m_cat_validator)
		keys = iset(m_cat_validator->m_keys.begin(), m_cat_validator->m_keys.end());

	for (auto& col: m_columns)
	{
		auto iv = col.m_validator;
		if (iv == nullptr or iv->m_children.empty())
			continue;

		if (not keys.count(col.m_name))
			continue;

		const char* value = r[col.m_name].c_str();

		for (auto child: iv->m_children)
		{
			if (child->m_category == nullptr)
				continue;

			auto child_cat = m_db.get(child->m_category->m_name);
			if (child_cat == nullptr)
				continue;

			auto rows = child_cat->find(key(child->m_tag) == value);
			for (auto& cr: rows)
				child_cat->erase(cr);
		}
	}

	if (m_head == nullptr)
		throw runtime_error("erase");

	if (m_index != nullptr)
		m_index->erase(r.m_data);

	if (r == m_head)
	{
		m_head = m_head->m_next;

		// that was the only row, there is no tail anymore
		if (m_head == nullptr)
			m_tail = nullptr;

		r.m_data->m_next = nullptr;	// keep the destructor from deleting the rest
		delete r.m_data;
	}
	else
	{
		for (auto pi = m_head; pi != nullptr; pi = pi->m_next)
		{
			if (pi->m_next != r.m_data)
				continue;

			pi->m_next = r.m_data->m_next;

			// the last row is going away, its predecessor is the new tail
			if (m_tail == r.m_data)
				m_tail = pi;

			r.m_data->m_next = nullptr;
			delete r.m_data;
			break;
		}
	}
}
// Append the full tag (_category.item) of each column to `tags`, in column
// order.
void category::get_tag_order(vector<string>& tags) const
{
	for (size_t ix = 0; ix < m_columns.size(); ++ix)
		tags.push_back("_" + m_name + "." + m_columns[ix].m_name);
}
// Return a reference to item `item_name` in the first row of this category
// (m_head, which is nullptr when there are no rows).
const detail::item_reference category::get_first_item(const char* item_name) const
{
	return detail::item_reference{ item_name, m_head };
}
// Iterator positioned at the first row.
category::iterator category::begin()
{
	return iterator(m_head);
}
// The end iterator is an iterator holding a null row pointer.
category::iterator category::end()
{
	return iterator(nullptr);
}
// Validate this category against the dictionary:
// - the category itself must be known,
// - every column must be a valid field and all mandatory fields must be
//   present as column,
// - in debug builds the index is checked as well,
// - every value must pass its item validator and each row must have a
//   value for every mandatory field.
// Problems are passed to the validator's report_error. Empty categories
// are skipped. Throws runtime_error when there is no validator.
void category::validate()
{
	if (m_validator == nullptr)
		throw runtime_error("no validator specified");

	if (empty())
	{
		if (VERBOSE > 2)
			cerr << "Skipping validation of empty category " << m_name << endl;
		return;
	}

	if (m_cat_validator == nullptr)
	{
		m_validator->report_error("undefined category " + m_name);
		return;
	}

	// check the columns, and refresh the item validator stored with each
	auto mandatory = m_cat_validator->m_mandatory_fields;

	for (auto& col: m_columns)
	{
		auto iv = m_cat_validator->get_validator_for_item(col.m_name);
		if (iv == nullptr)
			m_validator->report_error("Field " + col.m_name + " is not valid in category " + m_name);

		col.m_validator = iv;
		mandatory.erase(col.m_name);
	}

	if (not mandatory.empty())
		m_validator->report_error("In category " + m_name + " the following mandatory fields are missing: " + ba::join(mandatory, ", "));

	// check index?
	if (m_index)
	{
#if not defined(NDEBUG)
		m_index->validate();
		for (auto r: *this)
		{
			if (m_index->find(r.m_data) != r.m_data)
				m_validator->report_error("Key not found in index for category " + m_name);
		}
#endif
	}

	// validate all values
	mandatory = m_cat_validator->m_mandatory_fields;	// not used any further below

	for (auto ri = m_head; ri != nullptr; ri = ri->m_next)
	{
		for (size_t cix = 0; cix < m_columns.size(); ++cix)
		{
			auto& col = m_columns[cix];

			auto iv = col.m_validator;
			if (iv == nullptr)
			{
				m_validator->report_error("invalid field " + col.m_name + " for category " + m_name);
				continue;
			}

			bool seen = false;
			for (auto vi = ri->m_values; vi != nullptr; vi = vi->m_next)
			{
				if (vi->m_column_index != cix)
					continue;

				seen = true;
				(*iv)(vi->m_text);
			}

			if (not seen and iv->m_mandatory)
				m_validator->report_error("missing mandatory field " + col.m_name + " for category " + m_name);
		}
	}
}
// Return the validator of this category.
// Throws runtime_error when none has been set.
const validator& category::get_validator() const
{
	if (m_validator != nullptr)
		return *m_validator;

	throw runtime_error("no validator defined yet");
}
// Return the tags of all items the dictionary defines for this category.
// Throws runtime_error when there is no validator. A category that is not
// known to the validator is reported with report_error; other code in this
// file continues after calling report_error, so it may return, and in that
// case an empty set is returned instead of dereferencing a null pointer.
iset category::fields() const
{
	if (m_validator == nullptr)
		throw runtime_error("No validator specified");

	iset result;

	if (m_cat_validator == nullptr)
	{
		m_validator->report_error("undefined category");
		return result;
	}

	for (auto& iv: m_cat_validator->m_item_validators)
		result.insert(iv.m_tag);

	return result;
}
// Return the mandatory fields the dictionary defines for this category.
// Throws runtime_error when there is no validator. A category that is not
// known to the validator is reported with report_error; other code in this
// file continues after calling report_error, so it may return, and in that
// case an empty set is returned instead of dereferencing a null pointer.
iset category::mandatory_fields() const
{
	if (m_validator == nullptr)
		throw runtime_error("No validator specified");

	if (m_cat_validator == nullptr)
	{
		m_validator->report_error("undefined category");
		return iset();
	}

	return m_cat_validator->m_mandatory_fields;
}
// Return the key fields the dictionary defines for this category.
// Throws runtime_error when there is no validator. A category that is not
// known to the validator is reported with report_error; other code in this
// file continues after calling report_error, so it may return, and in that
// case an empty set is returned instead of dereferencing a null pointer.
iset category::key_fields() const
{
	if (m_validator == nullptr)
		throw runtime_error("No validator specified");

	if (m_cat_validator == nullptr)
	{
		m_validator->report_error("undefined category");
		return iset();
	}

	return iset{ m_cat_validator->m_keys.begin(), m_cat_validator->m_keys.end() };
}
// Advance to the next row in the list. The iterator must refer to a row,
// the end iterator cannot be incremented.
auto category::iterator::operator++() -> iterator&
{
	auto next = m_current.data()->m_next;
	m_current = row(next);
	return *this;
}
namespace detail
{
// Write a single value to `os` and return the new offset in the line.
//
// value   the text to write
// offset  the number of characters already written on the current line
// width   the width of the column; shorter values are padded with spaces
//         up to this width, a width of zero forces a text field
//
// - A value that contains a newline, is 132 characters or longer, or is
//   written with width zero becomes a text field: delimited by lines that
//   start with a semicolon. The returned offset is zero.
// - A value that is_unquoted_string accepts is written as is.
// - Any other value is written between single or double quotes, using the
//   first of the two for which no occurrence inside the value is followed
//   by a blank or by the end of the value. When neither can be used the
//   value is written as a text field.
//
// For a quoted value the offset counts both quote characters as well as
// the separating space, i.e. the length of the value plus three.
size_t write_value(ostream& os, string value, size_t offset, size_t width)
{
	if (value.find('\n') != string::npos or width == 0 or value.length() >= 132)	// write as text field
	{
		// a semicolon at the start of a line would end the text field
		ba::replace_all(value, "\n;", "\n\\;");

		if (offset > 0)
			os << endl;
		os << ';' << value;
		if (not ba::ends_with(value, "\n"))
			os << endl;
		os << ';' << endl;
		offset = 0;
	}
	else if (is_unquoted_string(value.c_str()))
	{
		os << value;

		if (value.length() < width)
		{
			os << string(width - value.length(), ' ');
			offset += width;
		}
		else
		{
			os << ' ';
			offset += value.length() + 1;
		}
	}
	else
	{
		bool done = false;
		for (char q: { '\'', '"'})
		{
			auto p = value.find(q);	// see if we can use the quote character
			while (p != string::npos and is_non_blank(value[p + 1]) and value[p + 1] != q)
				p = value.find(q, p + 1);

			if (p != string::npos)
				continue;

			os << q << value << q;

			if (value.length() + 2 < width)
			{
				os << string(width - value.length() - 2, ' ');
				offset += width;
			}
			else
			{
				os << ' ';
				offset += value.length() + 3;	// two quotes and a space
			}

			done = true;
			break;
		}

		if (not done)
		{
			if (offset > 0)
				os << endl;
			os << ';' << value << endl
			   << ';' << endl;
			offset = 0;
		}
	}

	return offset;
}
}
// Write this category to `os`, with the columns in the order given by
// `order` (a list of column indices). Nothing is written for an empty
// category.
//
// A category with more than one row is written as a loop_: first the tags,
// then one line (or more, lines are broken before they reach 132
// characters) per row with the values aligned in columns. A category with
// a single row is written as tag/value pairs with the values aligned.
// Missing values are written as '?'. The output ends with a "# " line.
//
// includeEmptyColumns is not used by this implementation.
void category::write(ostream& os, const vector<int>& order, bool includeEmptyColumns)
{
	if (empty())
		return;

	// If the first row has a next, we need a loop_
	bool need_loop = (m_head->m_next != nullptr);

	if (need_loop)
	{
		os << "loop_" << endl;

		for (auto cix: order)
		{
			auto& col = m_columns[cix];
			os << '_' << m_name << '.' << col.m_name << ' ' << endl;
		}

		// The widths are looked up by column index, so there must be one
		// for every column regardless of what `order` contains.
		vector<size_t> column_widths(m_columns.size(), 2);

		for (auto row = m_head; row != nullptr; row = row->m_next)
		{
			for (auto v = row->m_values; v != nullptr; v = v->m_next)
			{
				// be safe, never write outside the vector
				if (v->m_column_index >= column_widths.size())
					continue;

				if (strchr(v->m_text, '\n') == nullptr)
				{
					size_t l = strlen(v->m_text);

					if (not is_unquoted_string(v->m_text))
						l += 2;

					// too long, will be written as text field
					if (l >= 132)
						continue;

					if (column_widths[v->m_column_index] < l + 1)
						column_widths[v->m_column_index] = l + 1;
				}
			}
		}

		for (auto row = m_head; row != nullptr; row = row->m_next)	// loop over rows
		{
			size_t offset = 0;

			for (size_t cix: order)
			{
				size_t w = column_widths[cix];

				string s;
				for (auto iv = row->m_values; iv != nullptr; iv = iv->m_next)
				{
					if (iv->m_column_index == cix)
					{
						s = iv->m_text;
						break;
					}
				}

				if (s.empty())
					s = "?";

				size_t l = s.length();
				if (not is_unquoted_string(s.c_str()))
					l += 2;
				if (l < w)
					l = w;

				// start a new line when this value would not fit
				if (offset + l >= 132 and offset > 0)
				{
					os << endl;
					offset = 0;
				}

				offset = detail::write_value(os, s, offset, w);

				if (offset >= 132)
				{
					os << endl;
					offset = 0;
				}
			}

			if (offset > 0)
				os << endl;
		}
	}
	else
	{
		// first find the indent level
		size_t l = 0;

		for (auto& col: m_columns)
		{
			string tag = '_' + m_name + '.' + col.m_name;

			if (l < tag.length())
				l = tag.length();
		}

		l += 3;

		for (size_t cix: order)
		{
			auto& col = m_columns[cix];

			// the tag, padded to the indent level
			os << '_' << m_name << '.' << col.m_name << string(l - col.m_name.length() - m_name.length() - 2, ' ');

			string s;
			for (auto iv = m_head->m_values; iv != nullptr; iv = iv->m_next)
			{
				if (iv->m_column_index == cix)
				{
					s = iv->m_text;
					break;
				}
			}

			if (s.empty())
				s = "?";

			size_t offset = l;
			if (s.length() + l >= kMaxLineLength)
			{
				os << endl;
				offset = 0;
			}

			if (detail::write_value(os, s, offset, 1) != 0)
				os << endl;
		}
	}

	os << "# " << endl;
}
// Write this category with the columns in their stored order.
void category::write(ostream& os)
{
	vector<int> order;
	order.reserve(m_columns.size());

	for (size_t ix = 0; ix < m_columns.size(); ++ix)
		order.push_back(ix);

	write(os, order, false);
}
// Write this category with the columns named in `columns` first, in that
// order, followed by the remaining columns in their stored order. Columns
// that do not exist yet are added to the category.
void category::write(ostream& os, const vector<string>& columns)
{
	vector<int> order;

	// make sure all columns are present, add_column returns the index of
	// the (possibly new) column
	for (auto& c: columns)
		order.push_back(add_column(c));

	// keep track of the columns that are in the order already
	vector<bool> listed(m_columns.size(), false);
	for (auto ix: order)
		listed[ix] = true;

	for (size_t ix = 0; ix < m_columns.size(); ++ix)
	{
		if (not listed[ix])
			order.push_back(ix);
	}

	write(os, order, true);
}
// --------------------------------------------------------------------
// A row is a handle; copying it yields a second handle to the same data.
row::row(const row& rhs)
	: m_data(rhs.m_data)
{
}
// Make this handle refer to the same data as `rhs`; no values are copied.
row& row::operator=(const row& rhs)
{
	auto data = rhs.m_data;
	m_data = data;
	return *this;
}
// Set the value of the item called `name` in this row to `value`.
//
// The column is created in the owning category when it does not exist yet.
// When the stored text already equals `value` nothing happens. Otherwise the
// value is passed to the column's validator (if any), the old list node for
// this column is removed and, unless `value` is empty, a new node is appended
// at the end of the row's value list.
//
// `emplacing` is true when the row is being filled for the first time; only
// when it is false is the category index consulted and, for key fields,
// the row taken out of the index before the change and put back afterwards.
//
// Throws logic_error when the row has no data.
void row::assign(const string& name, const string& value, bool emplacing)
{
if (m_data == nullptr)
throw logic_error("invalid row, no data");
auto cat = m_data->m_category;
auto cix = cat->add_column(name);
auto& col = cat->m_columns[cix];
// auto& db = cat->m_db;
// look up the text currently stored for this column, if any
const char* oldValue = nullptr;
for (auto iv = m_data->m_values; iv != nullptr; iv = iv->m_next)
{
// sanity check: the list must not contain a one- or two-node cycle
assert(iv != iv->m_next and (iv->m_next == nullptr or iv != iv->m_next->m_next));
if (iv->m_column_index == cix)
{
oldValue = iv->m_text;
break;
}
}
if (oldValue != nullptr and value == oldValue) // no need to update
return;
// check the value
if (col.m_validator)
(*col.m_validator)(value);
// If the field is part of the key for this category, remove it from the index
// before updating
bool reinsert = false;
if (not emplacing) // an update of an item's value
{
////#if DEBUG
//// if (VERBOSE)
//// cerr << "reassigning the value of key field _" << cat->m_name << '.' << name << endl;
////#endif
// // see if we need to update any child categories that depend on this value
// auto iv = col.m_validator;
// if (iv != nullptr and not iv->m_children.empty())
// {
// for (auto child: iv->m_children)
// {
// if (child->m_category == nullptr)
// continue;
//
// auto child_cat = db.get(child->m_category->m_name);
// if (child_cat == nullptr)
// continue;
//
// auto rows = child_cat->find(key(child->m_tag) == oldValue);
// for (auto& cr: rows)
// cr.assign(child->m_tag, value, false);
// }
// }
if (cat->m_index != nullptr and cat->key_fields().count(name))
{
// only rows that are actually present in the index are re-inserted later
reinsert = cat->m_index->find(m_data);
if (reinsert)
cat->m_index->erase(m_data);
}
}
// first remove old value with cix
if (m_data->m_values == nullptr)
; // nothing to do
else if (m_data->m_values->m_column_index == cix)
{
// the node to remove is the head of the list
auto iv = m_data->m_values;
m_data->m_values = iv->m_next;
// detach before deleting so only this single node is passed to delete
iv->m_next = nullptr;
delete iv;
}
else
{
// the node to remove, if present, is somewhere behind the head
for (auto iv = m_data->m_values; iv->m_next != nullptr; iv = iv->m_next)
{
if (iv->m_next->m_column_index == cix)
{
auto nv = iv->m_next;
iv->m_next = nv->m_next;
nv->m_next = nullptr;
delete nv;
break;
}
}
}
#if DEBUG
for (auto iv = m_data->m_values; iv != nullptr; iv = iv->m_next)
assert(iv != iv->m_next and (iv->m_next == nullptr or iv != iv->m_next->m_next));
#endif
// an empty value is represented by the absence of a node
if (not value.empty())
{
// item_value is allocated with a placement-style new that receives the text length
auto nv = new(value.length()) item_value(value.c_str(), cix);
if (m_data->m_values == nullptr)
m_data->m_values = nv;
else
{
// append at the tail of the list
auto iv = m_data->m_values;
while (iv->m_next != nullptr)
iv = iv->m_next;
iv->m_next = nv;
}
}
#if DEBUG
for (auto iv = m_data->m_values; iv != nullptr; iv = iv->m_next)
assert(iv != iv->m_next and (iv->m_next == nullptr or iv != iv->m_next->m_next));
#endif
if (reinsert)
cat->m_index->insert(m_data);
}
// Convenience overload: assign the name/value pair carried by an item.
void row::assign(const item& value, bool emplacing)
{
    const auto& item_name = value.name();
    const auto& item_text = value.value();
    assign(item_name, item_text, emplacing);
}
// A row is empty when it has no data at all or its value list has no nodes.
bool row::empty() const
{
    if (m_data == nullptr)
        return true;
    return m_data->m_values == nullptr;
}
// Return an iterator to the first item in this row.
//
// A row without data (m_data == nullptr) is a state empty() explicitly
// accepts, so do not dereference m_data in that case; hand the iterator a
// null value pointer instead, exactly as end() does.
auto row::begin() const -> const_iterator
{
    return const_iterator(m_data, m_data != nullptr ? m_data->m_values : nullptr);
}
// Return the past-the-end iterator: same row data, null value pointer.
auto row::end() const -> const_iterator
{
    const_iterator past_the_end(m_data, nullptr);
    return past_the_end;
}
// Construct an iterator over the values of `data`, positioned at `ptr`.
// The current item is only loaded when `ptr` points at a value.
row::const_iterator::const_iterator(item_row* data, item_value* ptr)
    : m_data(data)
    , m_ptr(ptr)
{
    const bool at_end = (m_ptr == nullptr);
    if (not at_end)
        fetch();
}
// Pre-increment: step to the next value node and load it as the current
// item. Incrementing an iterator that is already at the end is a no-op.
row::const_iterator& row::const_iterator::operator++()
{
    if (m_ptr == nullptr)
        return *this;

    m_ptr = m_ptr->m_next;
    if (m_ptr != nullptr)
        fetch();

    return *this;
}
// Rebuild m_current from the value node m_ptr points at: the column name
// is looked up in the owning category, the text is taken from the node.
void row::const_iterator::fetch()
{
    auto owner = m_data->m_category;
    m_current = item(owner->get_column_name(m_ptr->m_column_index), m_ptr->m_text);
}
// --------------------------------------------------------------------
// Create an empty file: no datablocks and no validator.
file::file()
    : m_head(nullptr), m_validator(nullptr)
{
}
// Create a file and fill it by parsing the contents of `is`.
// Note that the `validate` argument is not consulted by this constructor;
// all work is delegated to load().
file::file(istream& is, bool validate)
    : file()
{
    this->load(is);
}
// Move constructor: take over the datablock list and the validator of rhs
// and leave rhs with null pointers for both.
file::file(file&& rhs)
    : m_head(rhs.m_head), m_validator(rhs.m_validator)
{
    rhs.m_head = nullptr;
    rhs.m_validator = nullptr;
}
// Destructor: release the first datablock and then the validator.
file::~file()
{
    delete this->m_head;
    delete this->m_validator;
}
// Append datablock `e` at the end of this file's list of datablocks.
// The file's validator is handed to the datablock first. Throws
// validation_error when a datablock with the same name (compared case
// insensitively) is already present.
void file::append(datablock* e)
{
    e->set_validator(m_validator);

    // walk the chain of 'next' links until we reach the empty slot at the end
    datablock** slot = &m_head;
    while (*slot != nullptr)
    {
        datablock* existing = *slot;
        if (iequals(existing->name(), e->name()))
            throw validation_error("datablock " + e->name() + " already defined in file");
        slot = &existing->m_next;
    }

    *slot = e;
}
// Parse the contents of `is` and add the datablocks found to this file.
//
// The validator is detached while parsing and re-attached afterwards; when
// a validator was set, the whole file is validated after parsing.
//
// If parsing throws, the validator is re-attached before the exception is
// propagated. Without this the saved pointer would be lost (leaked) and the
// file would be left without a validator.
void file::load(istream& is)
{
    validator* saved = m_validator;
    set_validator(nullptr);

    try
    {
        parser p(is, *this);
        p.parse_file();
    }
    catch (...)
    {
        // put the validator back so this file still owns it
        set_validator(saved);
        throw;
    }

    if (saved != nullptr)
    {
        set_validator(saved);
        validate();
    }
}
// Write every datablock in this file to os, in list order.
void file::save(ostream& os)
{
    for (datablock* db = m_head; db != nullptr; db = db->m_next)
        db->write(os);
}
// Write every datablock in this file to os, passing `order` on to each
// datablock's write().
void file::write(ostream& os, const vector<string>& order)
{
    for (datablock* db = m_head; db != nullptr; db = db->m_next)
        db->write(os, order);
}
// Return the datablock whose name equals `name` (case insensitive).
// Throws runtime_error when there is no such datablock.
datablock& file::operator[](const string& name)
{
    for (datablock* db = m_head; db != nullptr; db = db->m_next)
    {
        if (iequals(db->m_name, name))
            return *db;
    }

    throw runtime_error("datablock " + name + " does not exist");
}
void file::validate()
{
if (m_validator == nullptr)
{
if (VERBOSE)
cerr << "No dictionary loaded explicitly, loading default" << endl;
load_dictionary();
}
for (auto d = m_head; d != nullptr; d = d->m_next)
d->validate();
}
// Return the validator of this file.
// Throws runtime_error when no validator has been set.
const validator& file::get_validator() const
{
    if (m_validator != nullptr)
        return *m_validator;

    throw runtime_error("no validator defined yet");
}
// Load the default dictionary, which is the one named "mmcif_ddl".
void file::load_dictionary()
{
    const char* default_dictionary = "mmcif_ddl";
    load_dictionary(default_dictionary);
}
void file::load_dictionary(const char* dict)
{
fs::path dict_file = string("dictionaries/") + dict + ".dic";
#if defined(USE_RSRC)
mrsrc::rsrc dict_data(dict_file.string());
if (not dict_data)
throw invalid_argument("no such dictionary");
struct membuf : public streambuf
{
membuf(char* dict, size_t length)
{
this->setg(dict, dict, dict + length);
}
} buffer(const_cast<char*>(dict_data.data()), dict_data.size());
istream is(&buffer);
#else
if (not fs::exists(dict_file))
throw runtime_error("Dictionary not found (" + dict_file.string() + ")");
fs::ifstream is(dict_file);
#endif
load_dictionary(is);
}
// Parse a dictionary from `is` into a fresh validator and install that
// validator on this file and all its datablocks.
// NOTE(review): a validator that was installed earlier is replaced here
// without being deleted -- confirm whether that is intended.
void file::load_dictionary(istream& is)
{
    unique_ptr<validator> fresh(new validator());

    dict_parser dictionary_parser(*fresh, is);
    dictionary_parser.load_dictionary();

    // ownership moves from the unique_ptr to this file
    validator* installed = fresh.release();
    set_validator(installed);
}
// Store `v` as this file's validator and pass it on to every datablock.
void file::set_validator(validator* v)
{
    m_validator = v;

    datablock* db = m_head;
    while (db != nullptr)
    {
        db->set_validator(m_validator);
        db = db->m_next;
    }
}
// Let every datablock, in list order, contribute to `tags` through its
// own get_tag_order().
void file::get_tag_order(vector<string>& tags) const
{
    auto db = m_head;
    while (db != nullptr)
    {
        db->get_tag_order(tags);
        db = db->m_next;
    }
}
// Pre-increment: move on to the datablock that follows the current one.
auto file::iterator::operator++() -> iterator&
{
    auto following = m_current->m_next;
    m_current = following;
    return *this;
}
// Iterator positioned at the first datablock of this file.
auto file::begin() const -> iterator
{
    iterator first(m_head);
    return first;
}
// Past-the-end iterator: wraps a null datablock pointer.
auto file::end() const -> iterator
{
    iterator past_the_end(nullptr);
    return past_the_end;
}
}
// cif parsing library
#include <set>
#include <boost/algorithm/string.hpp>
#include "libcif/cif++.h"
#include "libcif/cif-parser.h"
#include "libcif/cif-validator.h"
using namespace std;
namespace ba = boost::algorithm;
extern int VERBOSE;
namespace cif
{
// Maximum length of a line in the output.
const uint32 kMaxLineLength = 132;
// Per-character trait values. Only 6 rows of 16 entries (96 values) are
// listed; the remaining entries of the 128-element array are
// zero-initialised.
// NOTE(review): the row labels 2..7 suggest the table is indexed by
// (character - 0x20) and that each value is a set of trait flags -- confirm
// against the is_* helpers that read this table.
const uint8 kCharTraitsTable[128] = {
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
14, 15, 14, 14, 14, 15, 15, 14, 15, 15, 15, 15, 15, 15, 15, 15, // 2
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 10, 15, 15, 15, 15, // 3
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, // 4
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 15, 14, 15, 14, // 5
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, // 6
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 0, // 7
};
// --------------------------------------------------------------------
// Exception for parse failures; the what() text is
// "parse error at line <line_nr>: <message>".
cif_parser_error::cif_parser_error(uint32 line_nr, const string& message)
    : runtime_error(
        "parse error at line " + to_string(line_nr) + ": " + message)
{
}
// --------------------------------------------------------------------
// Printable names for tokens, used in error and trace messages. The array
// is indexed with CIFToken values (see match() and get_next_token()), so
// the order here has to follow the order of that enum.
const char* sac_parser::kTokenName[] = {
"unknown",
"EOF",
"DATA",
"LOOP",
"GLOBAL",
"SAVE",
"STOP",
"Tag",
"Value"
};
// Printable names for value types, indexed with the token type
// (m_token_type) in the trace output of get_next_token().
const char* sac_parser::kValueName[] = {
"Int",
"Float",
"Numeric",
"String",
"TextField",
"Inapplicable",
"Unknown"
};
// --------------------------------------------------------------------
// Set up a parser reading from `is` and read the first token so that
// m_lookahead is valid as soon as construction is done.
sac_parser::sac_parser(std::istream& is)
    : m_data(is)
{
    m_line_nr = 1;
    m_bol = true;
    m_validate = true;

    // must come last: the tokenizer uses the state initialised above
    m_lookahead = get_next_token();
}
// Report a parse error: always throws cif_parser_error carrying the
// current line number and `msg`.
void sac_parser::error(const string& msg)
{
    const auto where = m_line_nr;
    throw cif_parser_error(where, msg);
}
// Return the next input character. Characters that were pushed back are
// served first, otherwise the istream is read. A CR, optionally followed
// by a LF, is reported as a single LF. The character is appended to
// m_token_value and the line counter is advanced on every LF.
int sac_parser::get_next_char()
{
    int ch;

    if (not m_buffer.empty())
    {
        ch = m_buffer.top();
        m_buffer.pop();
    }
    else
        ch = m_data.get();

    // very simple CR/LF translation into LF
    if (ch == '\r')
    {
        int peeked = m_data.get();
        if (peeked != '\n')
            m_buffer.push(peeked);    // not part of the line end, keep it for later
        ch = '\n';
    }

    m_token_value += static_cast<char>(ch);

    if (ch == '\n')
        ++m_line_nr;

    if (VERBOSE >= 6)
    {
        cerr << "get_next_char => ";
        if (not iscntrl(ch) and isprint(ch))
            cerr << char(ch) << endl;
        else
            cerr << int(ch) << endl;
    }

    return ch;
}
// Undo the most recent get_next_char(): the last character of
// m_token_value is removed and pushed back for re-reading, and the line
// counter is decremented when that character is a newline.
void sac_parser::retract()
{
    assert(not m_token_value.empty());

    const char last = m_token_value.back();
    m_token_value.pop_back();

    if (last == '\n')
        --m_line_nr;

    m_buffer.push(last);
}
// Give up on the current attempt to recognise a token: push back all
// characters read so far and continue with the next candidate start state
// (Start -> Float -> Int -> Value). Reports an error when there is no
// further candidate.
void sac_parser::restart()
{
    while (not m_token_value.empty())
        retract();

    if (m_start == eStateStart)
        m_state = m_start = eStateFloat;
    else if (m_start == eStateFloat)
        m_state = m_start = eStateInt;
    else if (m_start == eStateInt)
        m_state = m_start = eStateValue;
    else
        error("Invalid state in sac_parser");

    m_bol = false;
}
// Require the lookahead token to be `t` and advance to the next token.
// Reports an error naming both the expected and the actual token otherwise.
void sac_parser::match(sac_parser::CIFToken t)
{
    if (m_lookahead != t)
    {
        string complaint = string("Unexpected token, expected ") + kTokenName[t] + " but found " + kTokenName[m_lookahead];
        error(complaint);
    }

    m_lookahead = get_next_token();
}
// Read characters until a complete token has been recognised and return
// its kind. The text of the token is left in m_token_value and, for value
// tokens, the kind of value in m_token_type.
//
// This is a hand written state machine. Tokens that start with a
// distinctive character (tag, quoted string, text field, '.', '?',
// comment, white space) are handled directly. Anything else is tried as a
// float first, then as an integer and finally as a plain value; restart()
// rewinds the input and switches to the next of these alternatives.
sac_parser::CIFToken sac_parser::get_next_token()
{
const auto kEOF = char_traits<char>::eof();
CIFToken result = eCIFTokenUnknown;
int quoteChar = 0;
m_state = m_start = eStateStart;
m_bol = false;
m_token_value.clear();
m_token_type = eCIFValueUnknown;
while (result == eCIFTokenUnknown)
{
auto ch = get_next_char();
switch (m_state)
{
// first character of a token: decide what kind of token this can be
case eStateStart:
if (ch == kEOF)
result = eCIFTokenEOF;
else if (ch == '\n')
{
m_bol = true;
m_state = eStateWhite;
}
else if (ch == ' ' or ch == '\t')
m_state = eStateWhite;
else if (ch == '#')
m_state = eStateComment;
else if (ch == '.')
m_state = eStateDot;
else if (ch == '_')
m_state = eStateTag;
// a semicolon only opens a text field at the beginning of a line
else if (ch == ';' and m_bol)
m_state = eStateTextField;
else if (ch == '\'' or ch == '"')
{
quoteChar = ch;
m_state = eStateQuotedString;
}
else if (ch == '?')
m_state = eStateQuestionMark;
else
restart();
break;
// skipping white space, m_bol tracks whether the last character was a newline
case eStateWhite:
if (ch == kEOF)
result = eCIFTokenEOF;
else if (not isspace(ch))
{
m_state = eStateStart;
retract();
m_token_value.clear();
}
else
m_bol = (ch == '\n');
break;
// inside a comment, runs up to the end of the line
case eStateComment:
if (ch == '\n')
{
m_state = eStateStart;
m_bol = true;
m_token_value.clear();
}
else if (ch == kEOF)
result = eCIFTokenEOF;
else if (not is_any_print(ch))
error("invalid character in comment");
break;
// seen a '?': a lone question mark is a value of type Unknown with empty text
case eStateQuestionMark:
if (is_non_blank(ch))
m_state = eStateValue;
else
{
retract();
result = eCIFTokenValue;
m_token_value.clear();
m_token_type = eCIFValueUnknown;
}
break;
// seen a '.': either the start of a float, a lone dot (Inapplicable) or a plain value
case eStateDot:
if (isdigit(ch))
m_state = eStateFloat + 2;
else if (isspace(ch))
{
retract();
result = eCIFTokenValue;
m_token_type = eCIFValueInapplicable;
}
else
m_state = eStateValue;
break;
// inside a text field, somewhere in a line
case eStateTextField:
if (ch == '\n')
m_state = eStateTextField + 1;
else if (ch == kEOF)
error("unterminated textfield");
else if (not is_any_print(ch))
// error("invalid character in text field '" + string({ static_cast<char>(ch) }) + "' (" + to_string((int)ch) + ")");
cerr << "invalid character in text field '" << string({ static_cast<char>(ch) }) << "' (" << ch << ") line: " << m_line_nr << endl;
break;
// inside a text field, at the start of a line: a semicolon here ends the field
case eStateTextField + 1:
if (is_text_lead(ch) or ch == ' ' or ch == '\t')
m_state = eStateTextField;
else if (ch == ';')
{
assert(m_token_value.length() >= 2);
// drop the first character and the last two characters of what was read
m_token_value = m_token_value.substr(1, m_token_value.length() - 3);
m_token_type = eCIFValueTextField;
result = eCIFTokenValue;
}
else if (ch == kEOF)
error("unterminated textfield");
else if (ch != '\n')
error("invalid character in text field");
break;
// inside a quoted string, waiting for the matching quote character
case eStateQuotedString:
if (ch == kEOF)
error("unterminated quoted string");
else if (ch == quoteChar)
m_state = eStateQuotedStringQuote;
else if (not is_any_print(ch))
error("invalid character in quoted string");
break;
// seen the quote character inside a quoted string: it only closes the
// string when it is followed by white space
case eStateQuotedStringQuote:
if (is_white(ch))
{
retract();
result = eCIFTokenValue;
m_token_type = eCIFValueString;
assert(m_token_value.length() >= 3);
// strip the surrounding quote characters
m_token_value = m_token_value.substr(1, m_token_value.length() - 2);
}
else if (ch == quoteChar)
;
else if (is_any_print(ch))
m_state = eStateQuotedString;
else if (ch == kEOF)
error("unterminated quoted string");
else
error("invalid character in quoted string");
break;
// inside a tag, ends at the first blank character
case eStateTag:
if (not is_non_blank(ch))
{
retract();
result = eCIFTokenTag;
}
break;
// first attempt for an unclassified token: a floating point number
case eStateFloat:
if (ch == '+' or ch == '-')
{
m_state = eStateFloat + 1;
}
else if (isdigit(ch))
m_state = eStateFloat + 1;
else
restart();
break;
case eStateFloat + 1:
// if (ch == '(') // numeric???
// m_state = eStateNumericSuffix;
// else
if (ch == '.')
m_state = eStateFloat + 2;
else if (tolower(ch) == 'e')
m_state = eStateFloat + 3;
else if (is_white(ch) or ch == kEOF)
{
retract();
result = eCIFTokenValue;
m_token_type = eCIFValueInt;
}
else
restart();
break;
// parsed '.'
case eStateFloat + 2:
// if (ch == '(') // numeric???
// m_state = eStateNumericSuffix;
// else
if (tolower(ch) == 'e')
m_state = eStateFloat + 3;
else if (is_white(ch) or ch == kEOF)
{
retract();
result = eCIFTokenValue;
m_token_type = eCIFValueFloat;
}
else
restart();
break;
// parsed 'e'
case eStateFloat + 3:
if (ch == '-' or ch == '+')
m_state = eStateFloat + 4;
else if (isdigit(ch))
m_state = eStateFloat + 5;
else
restart();
break;
// parsed the sign of the exponent
case eStateFloat + 4:
if (isdigit(ch))
m_state = eStateFloat + 5;
else
restart();
break;
// parsed a digit of the exponent
case eStateFloat + 5:
// if (ch == '(')
// m_state = eStateNumericSuffix;
// else
if (is_white(ch) or ch == kEOF)
{
retract();
result = eCIFTokenValue;
m_token_type = eCIFValueFloat;
}
else
restart();
break;
// second attempt: an integer
case eStateInt:
if (isdigit(ch) or ch == '+' or ch == '-')
m_state = eStateInt + 1;
else
restart();
break;
case eStateInt + 1:
if (is_white(ch) or ch == kEOF)
{
retract();
result = eCIFTokenValue;
m_token_type = eCIFValueInt;
}
else
restart();
break;
// case eStateNumericSuffix:
// if (isdigit(ch))
// m_state = eStateNumericSuffix + 1;
// else
// restart();
// break;
//
// case eStateNumericSuffix + 1:
// if (ch == ')')
// {
// result = eCIFTokenValue;
// m_token_type = eCIFValueNumeric;
// }
// else if (not isdigit(ch))
// restart();
// break;
// last attempt: a plain, unquoted value or one of the reserved words
case eStateValue:
if (is_non_blank(ch))
m_state = eStateValue + 1;
else
error("invalid character at this position");
break;
case eStateValue + 1:
if (ch == '_') // first _, check for keywords
{
string s = to_lower_copy(m_token_value);
if (s == "global_")
result = eCIFTokenGLOBAL;
else if (s == "stop_")
result = eCIFTokenSTOP;
else if (s == "loop_")
result = eCIFTokenLOOP;
else if (s == "data_" or s == "save_")
m_state = eStateValue + 2;
}
else if (not is_non_blank(ch))
{
retract();
result = eCIFTokenValue;
m_token_type = eCIFValueString;
}
break;
// reading the name that follows data_ or save_
case eStateValue + 2:
if (not is_non_blank(ch))
{
retract();
if (tolower(m_token_value[0]) == 'd')
result = eCIFTokenDATA;
else
result = eCIFTokenSAVE;
// remove the 5 character prefix (data_ or save_), the name remains
m_token_value.erase(m_token_value.begin(), m_token_value.begin() + 5);
}
break;
default:
assert(false);
error("Invalid state in get_next_token");
break;
}
}
// trace the token when running very verbose
if (VERBOSE >= 5)
{
cerr << kTokenName[result];
if (m_token_type != eCIFValueUnknown)
cerr << ' ' << kValueName[m_token_type];
if (result != eCIFTokenEOF)
cerr << " '" << m_token_value << '\'';
cerr << endl;
}
return result;
}
// Parse the whole input: a sequence of global_ sections and data blocks.
// Any other token at this level is reported as an error. Every exception
// raised while parsing is reported again through error(), with the
// original message embedded.
void sac_parser::parse_file()
{
    try
    {
        while (m_lookahead != eCIFTokenEOF)
        {
            if (m_lookahead == eCIFTokenGLOBAL)
                parse_global();
            else if (m_lookahead == eCIFTokenDATA)
            {
                // the token text is the name of the data block
                produce_datablock(m_token_value);
                match(eCIFTokenDATA);
                parse_data_block();
            }
            else
                error("This file does not seem to be an mmCIF file");
        }
    }
    catch (const exception& ex)
    {
        error(string("Error parsing file: '") + ex.what() + "'");
    }
}
// Parse a global_ section: the keyword followed by any number of
// tag/value pairs. The pairs are consumed but not stored.
void sac_parser::parse_global()
{
    match(eCIFTokenGLOBAL);

    for (;;)
    {
        if (m_lookahead != eCIFTokenTag)
            break;

        match(eCIFTokenTag);
        match(eCIFTokenValue);
    }
}
void sac_parser::parse_data_block()
{
string cat;
while (m_lookahead == eCIFTokenLOOP or m_lookahead == eCIFTokenTag or m_lookahead == eCIFTokenSAVE)
{
switch (m_lookahead)
{
case eCIFTokenLOOP:
{
cat.clear(); // should start a new category
match(eCIFTokenLOOP);
vector<string> tags;
while (m_lookahead == eCIFTokenTag)
{
string cat_name, item_name;
std::tie(cat_name, item_name) = split_tag_name(m_token_value);
if (cat.empty())
{
produce_category(cat_name);
cat = cat_name;
}
else if (not iequals(cat, cat_name))
error("inconsistent categories in loop_");
tags.push_back(item_name);
match(eCIFTokenTag);
}
while (m_lookahead == eCIFTokenValue)
{
produce_row();
for (auto tag: tags)
{
produce_item(cat, tag, m_token_value);
match(eCIFTokenValue);
}
}
cat.clear();
break;
}
case eCIFTokenTag:
{
string cat_name, item_name;
std::tie(cat_name, item_name) = split_tag_name(m_token_value);
if (not iequals(cat, cat_name))
{
produce_category(cat_name);
cat = cat_name;
produce_row();
}
match(eCIFTokenTag);
produce_item(cat, item_name, m_token_value);
match(eCIFTokenValue);
break;
}
case eCIFTokenSAVE:
parse_save_frame();
break;
default:
assert(false);
break;
}
}
}
void sac_parser::parse_save_frame()
{
error("A regular CIF file should not contain a save frame");
}
// --------------------------------------------------------------------
// A parser that reads from `is` and stores what it finds in file `f`.
// There is no current data block until produce_datablock() has been called.
parser::parser(std::istream& is, file& f)
    : sac_parser(is)
    , m_file(f)
    , m_data_block(nullptr)
{
}
void parser::produce_datablock(const string& name)
{
m_data_block = new datablock(name);
m_file.append(m_data_block);
}
// Make the category called `name` in the current data block the current
// category, using the data block's emplace().
void parser::produce_category(const string& name)
{
    const bool trace = (VERBOSE >= 4);
    if (trace)
        cerr << "producing category " << name << endl;

    std::tie(m_cat, ignore) = m_data_block->emplace(name);
}
// Add an empty row to the current category and make it the current row.
void parser::produce_row()
{
    const bool trace = (VERBOSE >= 4);
    if (trace)
        cerr << "producing row for category " << m_cat->name() << endl;

    m_cat->emplace({});
    m_row = m_cat->back();
}
// Store `value` as item `item` in the current row.
//
// `category` has to be the name of the current category (compared case
// insensitively), otherwise an error is reported.
//
// The value stored is the `value` argument; the original code ignored the
// argument and read the parser's m_token_value instead, which only worked
// because every caller happens to pass exactly that string.
void parser::produce_item(const string& category, const string& item, const string& value)
{
    if (VERBOSE >= 4)
        cerr << "producing _" << category << '.' << item << " -> " << value << endl;

    if (not iequals(category, m_cat->name()))
        error("inconsistent categories in loop_");

    m_row[item] = value;
}
// --------------------------------------------------------------------
// Private data of dict_parser: the validators collected while the save
// frames of a dictionary are parsed. dict_parser::load_dictionary() hands
// them over to the validator and clears both containers.
struct dict_parser_data_impl
{
// temporary values for constructing dictionaries
// one entry per category save frame
vector<validate_category> m_category_validators;
// item validators, keyed by the name of the category they belong to
map<string,vector<validate_item>> m_item_validators;
};
// A parser that reads a dictionary from `is` and fills `validator` with
// what it finds.
// NOTE(review): the base class is handed m_file before any member of this
// class has been constructed -- confirm the base only stores the reference.
dict_parser::dict_parser(validator& validator, std::istream& is)
    : parser(is, m_file)
    , m_validator(validator)
    , m_impl(new dict_parser_data_impl)
{
}
// Release the private data allocated in the constructor.
dict_parser::~dict_parser()
{
    delete this->m_impl;
}
void dict_parser::parse_save_frame()
{
if (not m_collected_item_types)
m_collected_item_types = collect_item_types();
string saveFrameName = m_token_value;
if (saveFrameName.empty())
error("Invalid save frame, should contain more than just 'save_' here");
bool isCategorySaveFrame = m_token_value[0] != '_';
datablock dict(m_token_value);
datablock::iterator cat = dict.end();
match(eCIFTokenSAVE);
while (m_lookahead == eCIFTokenLOOP or m_lookahead == eCIFTokenTag)
{
if (m_lookahead == eCIFTokenLOOP)
{
cat = dict.end(); // should start a new category
match(eCIFTokenLOOP);
vector<string> tags;
while (m_lookahead == eCIFTokenTag)
{
string cat_name, item_name;
std::tie(cat_name, item_name) = split_tag_name(m_token_value);
if (cat == dict.end())
std::tie(cat, ignore) = dict.emplace(cat_name);
else if (not iequals(cat->name(), cat_name))
error("inconsistent categories in loop_");
tags.push_back(item_name);
match(eCIFTokenTag);
}
while (m_lookahead == eCIFTokenValue)
{
cat->emplace({});
auto row = cat->back();
for (auto tag: tags)
{
row[tag] = m_token_value;
match(eCIFTokenValue);
}
}
cat = dict.end();
}
else
{
string cat_name, item_name;
std::tie(cat_name, item_name) = split_tag_name(m_token_value);
if (cat == dict.end() or not iequals(cat->name(), cat_name))
std::tie(cat, ignore) = dict.emplace(cat_name);
match(eCIFTokenTag);
if (cat->empty())
cat->emplace({});
cat->back()[item_name] = m_token_value;
match(eCIFTokenValue);
}
}
match(eCIFTokenSAVE);
if (isCategorySaveFrame)
{
string category = dict.first_item("_category.id");
vector<string> keys;
for (auto k: dict["category_key"])
keys.push_back(get<1>(split_tag_name(k["name"].as<string>())));
iset groups;
for (auto g: dict["category_group"])
groups.insert(g["id"].as<string>());
m_impl->m_category_validators.push_back(validate_category{category, keys, groups});
}
else
{
// if the type code is missing, this must be a pointer, just skip it
string type_code = dict.first_item("_item_type.code");
const validate_type* tv = nullptr;
if (not (type_code.empty() or type_code == "?"))
tv = m_validator.get_validator_for_type(type_code);
iset ess;
for (auto e: dict["item_enumeration"])
ess.insert(e["value"].as<string>());
// collect the dict from our data_block and construct validators
for (auto i: dict["item"])
{
string tag_name, category, mandatory;
cif::tie(tag_name, category, mandatory) = i.get("name", "category_id", "mandatory_code");
string cat_name, item_name;
std::tie(cat_name, item_name) = split_tag_name(tag_name);
if (cat_name.empty() or item_name.empty())
error("Invalid tag name in _item.name " + tag_name);
if (not iequals(category, cat_name) and not (category.empty() or category == "?"))
error("specified category id does match the implicit category name for tag '" + tag_name + '\'');
else
category = cat_name;
auto& ivs = m_impl->m_item_validators[category];
auto vi = find(ivs.begin(), ivs.end(), validate_item{item_name});
if (vi == ivs.end())
ivs.push_back(validate_item{item_name, iequals(mandatory, "yes"), tv, ess});
else
{
// need to update the item_validator?
if (vi->m_mandatory != (iequals(mandatory, "yes")))
{
if (VERBOSE > 2)
{
cerr << "inconsistent mandatory value for " << tag_name << " in dictionary" << endl;
if (iequals(tag_name, saveFrameName))
cerr << "choosing " << mandatory << endl;
else
cerr << "choosing " << (vi->m_mandatory ? "Y" : "N") << endl;
}
if (iequals(tag_name, saveFrameName))
vi->m_mandatory = (iequals(mandatory, "yes"));
}
if (vi->m_type != nullptr and tv != nullptr and vi->m_type != tv)
{
if (VERBOSE > 1)
cerr << "inconsistent type for " << tag_name << " in dictionary" << endl;
}
// vi->m_mandatory = (iequals(mandatory, "yes"));
if (vi->m_type == nullptr)
vi->m_type = tv;
vi->m_enums.insert(ess.begin(), ess.end());
// anything else yet?
// ...
}
}
}
}
void dict_parser::link_items()
{
if (not m_data_block)
error("no datablock");
auto& dict = *m_data_block;
for (auto gl: dict["pdbx_item_linked_group_list"])
{
string child, parent;
cif::tie(child, parent) = gl.get("child_name", "parent_name");
auto civ = m_validator.get_validator_for_item(child);
if (civ == nullptr)
error("in pdbx_item_linked_group_list, item '" + child + "' is not specified");
auto piv = m_validator.get_validator_for_item(parent);
if (piv == nullptr)
error("in pdbx_item_linked_group_list, item '" + parent + "' is not specified");
civ->set_parent(piv);
}
// now make sure the item_type is specified for all item_validators
for (auto& cv: m_validator.m_category_validators)
{
for (auto& iv: cv.m_item_validators)
{
if (iv.m_type == nullptr)
cerr << "Missing item_type for " << iv.m_tag << endl;
}
}
}
void dict_parser::load_dictionary()
{
unique_ptr<datablock> dict;
datablock* saved_datablock = m_data_block;
try
{
while (m_lookahead != eCIFTokenEOF)
{
switch (m_lookahead)
{
case eCIFTokenGLOBAL:
parse_global();
break;
default:
{
dict.reset(new datablock(m_token_value)); // dummy datablock, for constructing the validator only
m_data_block = dict.get();
match(eCIFTokenDATA);
parse_data_block();
break;
}
}
}
}
catch (const exception& ex)
{
if (VERBOSE)
cerr << "Error parsing dictionary: '" << ex.what() << "'" << endl;
}
// store all validators
for (auto& ic: m_impl->m_category_validators)
m_validator.add_category_validator(move(ic));
m_impl->m_category_validators.clear();
for (auto& iv: m_impl->m_item_validators)
{
auto cv = m_validator.get_validator_for_category(iv.first);
if (cv == nullptr)
error("Undefined category '" + iv.first);
for (auto& v: iv.second)
const_cast<validate_category*>(cv)->add_item_validator(move(v));
}
// check all item validators for having a type_validator
if (dict)
link_items();
// store meta information
datablock::iterator info;
bool n;
std::tie(info, n) = m_data_block->emplace("dictionary");
if (n)
{
auto r = info->front();
m_validator.dict_name(r["title"].as<string>());
m_validator.dict_version(r["version"].as<string>());
}
m_data_block = saved_datablock;
m_impl->m_item_validators.clear();
}
bool dict_parser::collect_item_types()
{
bool result = false;
if (not m_data_block)
error("no datablock");
auto& dict = *m_data_block;
for (auto& t: dict["item_type_list"])
{
auto ts = t.get("code", "primitive_code", "construct");
string code, primitive_code, construct;
cif::tie(code, primitive_code, construct) = ts;
ba::replace_all(construct, "\\n", "\n");
ba::replace_all(construct, "\\t", "\t");
ba::replace_all(construct, "\\\n", "");
validate_type v = {
code, map_to_primitive_type(primitive_code), boost::regex(construct, boost::regex::egrep)
};
// Do not replace an already defined type validator, this won't work with pdbx_v40
// as it has a name that is too strict for its own names :-)
// if (m_file_impl.m_type_validators.count(v))
// m_file_impl.m_type_validators.erase(v);
m_validator.add_type_validator(move(v));
if (VERBOSE >= 5)
cerr << "Added type " << code << " (" << primitive_code << ") => " << construct << endl;
result = true;
}
return result;
}
}
// CIF parser
#include "libcif/config.h"
#include <tuple>
#include <iostream>
#include <boost/algorithm/string.hpp>
#include "libcif/cif-utils.h"
using namespace std;
namespace ba = boost::algorithm;
namespace cif
{
// --------------------------------------------------------------------
// This really makes a difference, having our own tolower routines
// Lookup table mapping every byte value to its lower case equivalent.
// The entries 0x41..0x5a ('A'..'Z') hold 0x61..0x7a ('a'..'z'), every
// other entry holds its own index. Each row below covers 16 values.
const uint8 kCharToLowerMap[256] =
{
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
// --------------------------------------------------------------------
// Case insensitive comparison of two strings: true when both have the
// same length and all characters are equal after conversion to lower case.
bool iequals(const string& a, const string& b)
{
    if (a.length() != b.length())
        return false;

    for (size_t ix = 0; ix < a.length(); ++ix)
    {
        if (tolower(a[ix]) != tolower(b[ix]))
            return false;
    }

    return true;
}
// Case insensitive comparison of two zero terminated strings: true when
// both end at the same position and all characters are equal after
// conversion to lower case.
bool iequals(const char* a, const char* b)
{
    while (*a and *b)
    {
        if (tolower(*a) != tolower(*b))
            return false;
        ++a;
        ++b;
    }

    // equal only when both strings ended here
    return *a == *b;
}
// Case insensitive three-way comparison of two strings. The result is the
// difference of the first pair of characters that differ after conversion
// to lower case. When one string is a prefix of the other, the result is
// 1 if `a` is the longer one and -1 if `b` is. Equal strings give 0.
int icompare(const string& a, const string& b)
{
    auto pa = a.begin();
    auto pb = b.begin();

    while (pa != a.end() and pb != b.end())
    {
        const int diff = tolower(*pa) - tolower(*pb);
        if (diff != 0)
            return diff;
        ++pa;
        ++pb;
    }

    if (pa != a.end())
        return 1;
    if (pb != b.end())
        return -1;
    return 0;
}
// Case insensitive three-way comparison of two zero terminated strings.
// The result is the difference of the first pair of characters that differ
// after conversion to lower case. When one string is a prefix of the
// other, the result is 1 if `a` is the longer one and -1 if `b` is.
// Equal strings give 0.
int icompare(const char* a, const char* b)
{
    while (*a != 0 and *b != 0)
    {
        const int diff = tolower(*a) - tolower(*b);
        if (diff != 0)
            return diff;
        ++a;
        ++b;
    }

    if (*a != 0)
        return 1;
    if (*b != 0)
        return -1;
    return 0;
}
// Convert all characters of s to lower case, in place.
void to_lower(string& s)
{
    for (size_t ix = 0; ix < s.length(); ++ix)
        s[ix] = tolower(s[ix]);
}
// Return a copy of s with all characters converted to lower case.
string to_lower_copy(const string& s)
{
    string lowered(s);

    for (auto pos = lowered.begin(); pos != lowered.end(); ++pos)
        *pos = tolower(*pos);

    return lowered;
}
// --------------------------------------------------------------------
// Split a tag of the form _category.item into its category name and its
// item name. The leading underscore and the first dot are not part of the
// result. Throws runtime_error when the tag is empty, does not start with
// an underscore or does not contain a dot.
tuple<string,string> split_tag_name(const string& tag)
{
    if (tag.empty())
        throw runtime_error("empty tag");

    if (tag[0] != '_')
        throw runtime_error("tag does not start with underscore");

    const auto dot = tag.find('.');
    if (dot == string::npos)
        throw runtime_error("tag does not contain dot");

    string category_name = tag.substr(1, dot - 1);
    string item_name = tag.substr(dot + 1);

    return tuple<string,string>{ category_name, item_name };
}
// --------------------------------------------------------------------
// Simplified line breaking code taken from a decent text editor.
// In this case, simplified means it only supports ASCII.
// The line breaking class of a character, used by the line breaking code
// below. kASCII_LBTable assigns one of these classes to each ASCII
// character.
// The spelling of some enumerators is unusual (Exlamation, Inseperable,
// BreakBefor, Contigent); it is kept as is because the names are
// referenced elsewhere in this file.
enum LineBreakClass
{
kLBC_OpenPunctuation,
kLBC_ClosePunctuation,
kLBC_CloseParenthesis,
kLBC_Quotation,
kLBC_NonBreaking,
kLBC_Nonstarter,
kLBC_Exlamation,
kLBC_SymbolAllowingBreakAfter,
kLBC_InfixNumericSeparator,
kLBC_PrefixNumeric,
kLBC_PostfixNumeric,
kLBC_Numeric,
kLBC_Alphabetic,
kLBC_Ideographic,
kLBC_Inseperable,
kLBC_Hyphen,
kLBC_BreakAfter,
kLBC_BreakBefor,
kLBC_BreakOpportunityBeforeAndAfter,
kLBC_ZeroWidthSpace,
kLBC_CombiningMark,
kLBC_WordJoiner,
kLBC_HangulLVSyllable,
kLBC_HangulLVTSyllable,
kLBC_HangulLJamo,
kLBC_HangulVJamo,
kLBC_HangulTJamo,
kLBC_MandatoryBreak,
kLBC_CarriageReturn,
kLBC_LineFeed,
kLBC_NextLine,
kLBC_Surrogate,
kLBC_Space,
kLBC_ContigentBreakOpportunity,
kLBC_Ambiguous,
kLBC_ComplexContext,
kLBC_Unknown
};
// The line breaking class for each of the 128 ASCII characters, indexed by
// character code. Apart from the split row for 0x28..0x2f, each row below
// covers 8 consecutive character codes.
const LineBreakClass kASCII_LBTable[128] =
{
// 0x00 - 0x1f, control characters
kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark,
kLBC_CombiningMark, kLBC_BreakAfter, kLBC_LineFeed, kLBC_MandatoryBreak, kLBC_MandatoryBreak, kLBC_CarriageReturn, kLBC_CombiningMark, kLBC_CombiningMark,
kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark,
kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark, kLBC_CombiningMark,
// 0x20 - 0x27: space ! " # $ % & '
kLBC_Space, kLBC_Exlamation, kLBC_Quotation, kLBC_Alphabetic, kLBC_PrefixNumeric, kLBC_PostfixNumeric, kLBC_Alphabetic, kLBC_Quotation,
// 0x28 - 0x2f: ( ) * + , - . /
kLBC_OpenPunctuation, kLBC_CloseParenthesis, kLBC_Alphabetic, kLBC_PrefixNumeric,
// comma treated differently here, it is not a numeric separator in PDB
kLBC_SymbolAllowingBreakAfter/* kLBC_InfixNumericSeparator */,
kLBC_Hyphen, kLBC_InfixNumericSeparator, kLBC_SymbolAllowingBreakAfter,
// 0x30 - 0x3f: digits, then : ; < = > ?
kLBC_Numeric, kLBC_Numeric, kLBC_Numeric, kLBC_Numeric, kLBC_Numeric, kLBC_Numeric, kLBC_Numeric, kLBC_Numeric,
kLBC_Numeric, kLBC_Numeric, kLBC_InfixNumericSeparator, kLBC_InfixNumericSeparator, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Exlamation,
// 0x40 - 0x5f: @, upper case letters, then [ \ ] ^ _
kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic,
kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic,
kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic,
kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_OpenPunctuation, kLBC_PrefixNumeric, kLBC_CloseParenthesis, kLBC_Alphabetic, kLBC_Alphabetic,
// 0x60 - 0x7f: `, lower case letters, then { | } ~ DEL
kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic,
kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic,
kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic,
kLBC_Alphabetic, kLBC_Alphabetic, kLBC_Alphabetic, kLBC_OpenPunctuation, kLBC_BreakAfter, kLBC_ClosePunctuation, kLBC_Alphabetic, kLBC_CombiningMark
};
// Locate the end of the first fragment of [text, end) after which a line may
// be broken. Returns an iterator one past that fragment; returns `text`
// itself when the range is empty. A '\n' always terminates the fragment and
// is included in it.
//
// The decision is driven by a 27x27 pair table indexed by the line break
// class of the previous and the next character.
string::const_iterator next_line_break(string::const_iterator text, string::const_iterator end)
{
	if (text == end)
		return text;

	enum break_action
	{
		DBK = 0, // direct break (blank in table)
		IBK, // indirect break (% in table)
		PBK, // prohibited break (^ in table)
		CIB, // combining indirect break
		CPB // combining prohibited break
	};

	// static: the table is constant, no need to rebuild it on every call
	static const break_action brkTable[27][27] = {
		// OP CL CP QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT
		/* OP */ { PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, CPB, PBK, PBK, PBK, PBK, PBK, PBK },
		/* CL */ { DBK, PBK, PBK, IBK, IBK, PBK, PBK, PBK, PBK, IBK, IBK, DBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* CP */ { DBK, PBK, PBK, IBK, IBK, PBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* QU */ { PBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, IBK },
		/* GL */ { IBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, IBK },
		/* NS */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, DBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* EX */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, DBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* SY */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* IS */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, IBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* PR */ { IBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, IBK, IBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, IBK },
		/* PO */ { IBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, IBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* NU */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* AL */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, IBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* ID */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* IN */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* HY */ { DBK, PBK, PBK, IBK, DBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* BA */ { DBK, PBK, PBK, IBK, DBK, IBK, PBK, PBK, PBK, DBK, DBK, DBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* BB */ { IBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, IBK },
		/* B2 */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, DBK, DBK, DBK, DBK, IBK, IBK, DBK, PBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* ZW */ { DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, PBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK },
		/* CM */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, IBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK },
		/* WJ */ { IBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, IBK },
		/* H2 */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, IBK, IBK },
		/* H3 */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, IBK },
		/* JL */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, DBK },
		/* JV */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, IBK, IBK },
		/* JT */ { DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, IBK },
	};

	// Classify a single byte. The ASCII table is only consulted for bytes
	// below 128; anything else, and every class beyond kLBC_MandatoryBreak
	// other than kLBC_Space, is treated as a plain letter so that the result
	// can be used as an index into brkTable.
	// NOTE(review): this assumes the LineBreakClass enumerators are declared
	// in the order of the brkTable header (OP..JT) - confirm at the enum.
	auto classify = [](uint8 c) -> LineBreakClass
	{
		LineBreakClass result = kLBC_Alphabetic;
		if (c < 128)
		{
			result = kASCII_LBTable[c];
			if (result > kLBC_MandatoryBreak and result != kLBC_Space) // duh...
				result = kLBC_Alphabetic;
		}
		return result;
	};

	uint8 ch = static_cast<uint8>(*text);

	LineBreakClass cls = ch == '\n' ? kLBC_MandatoryBreak : classify(ch);

	// a leading space behaves like a word joiner
	if (cls == kLBC_Space)
		cls = kLBC_WordJoiner;

	LineBreakClass ncls = cls;

	while (++text != end and cls != kLBC_MandatoryBreak)
	{
		ch = static_cast<uint8>(*text);

		// class of the previous character, before any space was skipped
		LineBreakClass lcls = ncls;

		if (ch == '\n')
		{
			++text;
			break;
		}

		ncls = classify(ch);

		// spaces never change cls; they only influence the IBK test below
		if (ncls == kLBC_Space)
			continue;

		break_action brk = brkTable[cls][ncls];

		if (brk == DBK or (brk == IBK and lcls == kLBC_Space))
			break;

		cls = ncls;
	}

	return text;
}
// Break a single paragraph (no embedded newlines expected) into lines of at
// most `width` characters, choosing the break positions that minimise the sum
// of squared trailing slack over all lines except the last.
//
// A fragment that is wider than `width` by itself cannot be split; it is put
// on a line of its own instead of disabling wrapping for the whole paragraph.
//
// Returns the lines in order; an empty text yields an empty vector.
vector<string> wrap_line(const string& text, unsigned int width)
{
	vector<string> result;

	// offsets[k] is the start of fragment k, the final entry is text.length()
	vector<size_t> offsets = { 0 };

	auto b = text.begin();
	while (b != text.end())
	{
		auto e = next_line_break(b, text.end());

		offsets.push_back(e - text.begin());
		b = e;
	}

	const size_t count = offsets.size() - 1;

	// marker for "no layout found yet"; larger than any real cost
	const size_t kUnreachable = static_cast<size_t>(-1);

	// minima[j]: lowest cost of laying out fragments [0, j)
	// breaks[j]: index of the fragment that starts the last line of that layout
	vector<size_t> minima(count + 1, kUnreachable);
	minima[0] = 0;
	vector<size_t> breaks(count + 1, 0);

	for (size_t i = 0; i < count; ++i)
	{
		if (minima[i] == kUnreachable)
			continue;

		for (size_t j = i + 1; j <= count; ++j)
		{
			size_t w = offsets[j] - offsets[i];

			// too wide: stop extending this line, unless it holds only a
			// single fragment which has to go somewhere
			if (w > width and j > i + 1)
				break;

			// trailing white space does not count for the line width
			while (w > 0 and isspace(static_cast<unsigned char>(text[offsets[i] + w - 1])))
				--w;

			size_t cost = minima[i];
			if (j < count and w < width) // last line may be shorter
				cost += (width - w) * (width - w);

			if (cost < minima[j])
			{
				minima[j] = cost;
				breaks[j] = i;
			}
		}
	}

	// walk the chosen breaks back to front, then restore reading order
	size_t j = count;
	while (j > 0)
	{
		size_t i = breaks[j];
		result.push_back(text.substr(offsets[i], offsets[j] - offsets[i]));
		j = i;
	}

	reverse(result.begin(), result.end());

	return result;
}
// Wrap a text of one or more newline separated paragraphs to `width` columns.
// Every paragraph is wrapped separately with wrap_line; an empty paragraph
// contributes a single empty line to the result.
vector<string> word_wrap(const string& text, unsigned int width)
{
	vector<string> paragraphs;
	ba::split(paragraphs, text, ba::is_any_of("\n"));

	vector<string> wrapped;

	for (const string& paragraph: paragraphs)
	{
		if (not paragraph.empty())
		{
			const vector<string> lines = wrap_line(paragraph, width);
			wrapped.insert(wrapped.end(), lines.begin(), lines.end());
		}
		else
			wrapped.push_back("");
	}

	return wrapped;
}
}
// cif parsing library
#include <boost/algorithm/string.hpp>
// since gcc's regex is crashing....
#include <boost/regex.hpp>
#include "libcif/cif++.h"
#include "libcif/cif-parser.h"
#include "libcif/cif-validator.h"
using namespace std;
namespace ba = boost::algorithm;
extern int VERBOSE;
namespace cif
{
// Translate the textual primitive type code ("char", "uchar" or "numb",
// compared with iequals) into the corresponding DDL_PrimitiveType value.
// Throws validation_error for any other string.
DDL_PrimitiveType map_to_primitive_type(const string& s)
{
	if (iequals(s, "char"))
		return ptChar;

	if (iequals(s, "uchar"))
		return ptUChar;

	if (iequals(s, "numb"))
		return ptNumb;

	throw validation_error("Not a known primitive type");
}
// --------------------------------------------------------------------
// Three-way comparison of two values of this type.
//
// Returns 0 when a and b are considered equal, a negative value when a sorts
// before b and a positive value otherwise. An empty string sorts before any
// non-empty string.
//
// For ptNumb both strings are converted with strtod and differences no larger
// than numeric_limits<double>::epsilon() count as equal. For ptChar and
// ptUChar the comparison ignores case and treats a run of spaces as a single
// space.
int validate_type::compare(const char* a, const char* b) const
{
	int result = 0;

	if (*a == 0)
		result = *b == 0 ? 0 : -1;
	else if (*b == 0)
		result = +1; // a is known to be non-empty here
	else
	{
		switch (m_primitive_type)
		{
			case ptNumb:
			{
				double da = strtod(a, nullptr);
				double db = strtod(b, nullptr);

				auto d = da - db;
				if (abs(d) > numeric_limits<double>::epsilon())
				{
					if (d > 0)
						result = 1;
					else if (d < 0)
						result = -1;
				}
				break;
			}

			case ptUChar:
			case ptChar:
			{
				// CIF is guaranteed to have ascii only, therefore this primitive code will do
				// also, we're collapsing spaces

				auto ai = a, bi = b;
				for (;;)
				{
					if (*ai == 0)
					{
						if (*bi != 0)
							result = -1;
						break;
					}
					else if (*bi == 0)
					{
						result = 1;
						break;
					}

					// toupper requires a value representable as unsigned char
					char ca = toupper(static_cast<unsigned char>(*ai));
					char cb = toupper(static_cast<unsigned char>(*bi));

					result = ca - cb;

					if (result != 0)
						break;

					// both sides are at a space: skip to the last one of the run
					if (ca == ' ')
					{
						while (ai[1] == ' ')
							++ai;
						while (bi[1] == ' ')
							++bi;
					}

					++ai;
					++bi;
				}

				break;
			}
		}
	}

	return result;
}
// --------------------------------------------------------------------
// Link this item validator to `parent` (which may be nullptr).
// With a non-null parent: a missing type is taken over from the parent, this
// item is added to the parent's children and, when the tag of this item is
// the one and only key of its category, to the parent's foreign keys as well.
void validate_item::set_parent(validate_item* parent)
{
	m_parent = parent;

	if (m_parent == nullptr)
		return;

	if (m_type == nullptr)
		m_type = m_parent->m_type;

	m_parent->m_children.insert(this);

	const vector<string> single_key{ m_tag };
	if (m_category->m_keys == single_key)
		m_parent->m_foreign_keys.insert(this);
}
// Validate `value` for this item.
// Values that are empty, "?" or "." are not checked. Any other value must
// match the regular expression of the item's type (when a type is set) and,
// when the item has a list of enumerated values, be a member of that list.
// Throws validation_error when a check fails.
void validate_item::operator()(string value) const
{
	if (VERBOSE >= 4)
		cout << "validating '" << value << "' for '" << m_tag << "'" << endl;

	if (value.empty() or value == "?" or value == ".")
		return;

	if (m_type != nullptr and not boost::regex_match(value, m_type->m_rx))
		throw validation_error("Value '" + value + "' does not match type expression for type " + m_type->m_name + " in item " + m_tag);

	if (not m_enums.empty() and m_enums.count(value) == 0)
		throw validation_error("Value '" + value + "' is not in the list of allowed values for item " + m_tag);
}
// --------------------------------------------------------------------
// Register the item validator `v` with this category.
// A mandatory item has its tag recorded in m_mandatory_fields, and v is
// pointed back at this category before it is stored. When an equivalent
// validator is already present the new one is dropped, which is reported on
// cout if VERBOSE >= 4.
void validate_category::add_item_validator(validate_item&& v)
{
	if (v.m_mandatory)
		m_mandatory_fields.insert(v.m_tag);

	v.m_category = this;

	// keep the tag for the diagnostic: v must not be read after the move
	const string tag = v.m_tag;

	auto r = m_item_validators.insert(move(v));
	if (not r.second and VERBOSE >= 4)
		cout << "Could not add validator for item " << tag << " to category " << m_name << endl;
}
// Look up the validator registered for item `tag` in this category.
// Returns nullptr when there is none; that case is reported on cout if
// VERBOSE > 4.
const validate_item* validate_category::get_validator_for_item(string tag) const
{
	auto pos = m_item_validators.find(validate_item{tag});

	if (pos != m_item_validators.end())
		return &*pos;

	if (VERBOSE > 4)
		cout << "No validator for tag " << tag << endl;

	return nullptr;
}
// --------------------------------------------------------------------
// Constructor; the body is empty, no work is done here.
validator::validator()
{
}
// Destructor; the body is empty, no explicit cleanup is done here.
validator::~validator()
{
}
// Register the type validator `v`.
// When an equivalent validator is already present the new one is dropped,
// which is reported on cout if VERBOSE > 4.
void validator::add_type_validator(validate_type&& v)
{
	// keep the name for the diagnostic: v must not be read after the move
	const string name = v.m_name;

	auto r = m_type_validators.insert(move(v));
	if (not r.second and VERBOSE > 4)
		cout << "Could not add validator for type " << name << endl;
}
// Look up the type validator registered under `type_code`.
// Returns nullptr when there is none; that case is reported on cout if
// VERBOSE > 4.
const validate_type* validator::get_validator_for_type(string type_code) const
{
	// the primitive type and regex of the probe are placeholders
	auto pos = m_type_validators.find(validate_type{ type_code, ptChar, boost::regex() });

	if (pos != m_type_validators.end())
		return &*pos;

	if (VERBOSE > 4)
		cout << "No validator for type " << type_code << endl;

	return nullptr;
}
// Register the category validator `v`.
// When an equivalent validator is already present the new one is dropped,
// which is reported on cout if VERBOSE > 4.
void validator::add_category_validator(validate_category&& v)
{
	// keep the name for the diagnostic: v must not be read after the move
	const string name = v.m_name;

	auto r = m_category_validators.insert(move(v));
	if (not r.second and VERBOSE > 4)
		cout << "Could not add validator for category " << name << endl;
}
// Look up the validator registered for `category`.
// Returns nullptr when there is none; that case is reported on cout if
// VERBOSE > 4.
const validate_category* validator::get_validator_for_category(string category) const
{
	auto pos = m_category_validators.find(validate_category{category});

	if (pos != m_category_validators.end())
		return &*pos;

	if (VERBOSE > 4)
		cout << "No validator for category " << category << endl;

	return nullptr;
}
// Look up the item validator for a full tag name.
// The tag is split with split_tag_name into a category and an item part, the
// category validator is located and asked for the item. Returns nullptr when
// either lookup fails; that case is reported on cout if VERBOSE > 4.
validate_item* validator::get_validator_for_item(string tag) const
{
	string cat, item;
	std::tie(cat, item) = split_tag_name(tag);

	validate_item* found = nullptr;

	const validate_category* cv = get_validator_for_category(cat);
	if (cv != nullptr)
		found = const_cast<validate_item*>(cv->get_validator_for_item(item));

	if (found == nullptr and VERBOSE > 4)
		cout << "No validator for item " << tag << endl;

	return found;
}
// Report a validation problem.
// When m_strict is set a validation_error carrying `msg` is thrown; otherwise
// the message is written to cerr, but only if VERBOSE is non-zero.
void validator::report_error(const string& msg)
{
	if (m_strict)
		throw validation_error(msg);

	if (VERBOSE)
		cerr << msg << endl;
}
}
This source diff could not be displayed because it is too large. You can view the blob instead.
// Lib for working with structures as contained in mmCIF and PDB files
#include "libcif/config.h"
#include <map>
#include <boost/algorithm/string.hpp>
#include <boost/filesystem/operations.hpp>
#include <boost/filesystem/fstream.hpp>
#include "libcif/compound.h"
#include "libcif/cif++.h"
using namespace std;
namespace ba = boost::algorithm;
namespace fs = boost::filesystem;
namespace libcif
{
// Creates compound objects from the per-compound cif files found below the
// directory named by the CLIBD_MON environment variable, and keeps the
// compounds created so far so that each one is loaded only once.
// Constructor and destructor are private; the one object is reached through
// instance().
class compound_factory
{
public:
// Returns the shared factory object, creating it on first use.
static compound_factory& instance();
// Returns the compound with the given id, or nullptr when its file cannot be opened.
const compound* create(string id);
private:
compound_factory();
~compound_factory();
// The shared factory object, nullptr until instance() is called.
static compound_factory* sInstance;
// Directory taken from the CLIBD_MON environment variable.
fs::path m_clibd_mon;
// Compounds created so far, searched by id in create().
vector<compound*> m_compounds;
};
// --------------------------------------------------------------------
// compound
// Build a textual formula for this compound.
// Element symbols are counted over all atoms. When carbon is present it is
// written first, directly followed by hydrogen (if any); the remaining
// elements follow in the order of the map, separated by spaces. A count is
// only written when it is larger than one. The rounded sum of the partial
// charges is appended when it is not zero.
string compound::formula() const
{
	map<string,uint32> counts;
	float total_charge = 0;

	for (const auto& atom: m_atoms)
	{
		counts[atom_type_traits(atom.type_symbol).symbol()] += 1;
		total_charge += atom.partial_charge;
	}

	string result;

	// append a symbol, followed by its count when that is more than one
	auto append = [&result](const string& symbol, uint32 n)
	{
		result += symbol;
		if (n > 1)
			result += to_string(n);
	};

	auto carbon = counts.find("C");
	if (carbon != counts.end())
	{
		append("C", carbon->second);
		counts.erase(carbon);

		auto hydrogen = counts.find("H");
		if (hydrogen != counts.end())
		{
			append(" H", hydrogen->second);
			counts.erase(hydrogen);
		}
	}

	for (const auto& entry: counts)
	{
		if (not result.empty())
			result += ' ';
		append(entry.first, entry.second);
	}

	const int charge = lrint(total_charge);
	if (charge != 0)
		result += ' ' + to_string(charge);

	return result;
}
int compound::charge() const
{
float result = 0;
for (auto r: m_atoms)
result += r.partial_charge;
return lrint(result);
}
// Derive a type string from the id and group of this compound.
// Returns "peptide linking" for glycine, "L-peptide linking", "DNA linking"
// or "RNA linking" for the matching groups and an empty string for anything
// else. All comparisons use cif::iequals.
string compound::type() const
{
	// known groups are (counted from ccp4 monomer dictionary)
	// D-pyranose
	// DNA
	// L-PEPTIDE LINKING
	// L-SACCHARIDE
	// L-peptide
	// L-pyranose
	// M-peptide
	// NON-POLYMER
	// P-peptide
	// RNA
	// furanose
	// non-polymer
	// non_polymer
	// peptide
	// pyranose
	// saccharide

	if (cif::iequals(m_id, "gly"))
		return "peptide linking";

	if (cif::iequals(m_group, "l-peptide") or cif::iequals(m_group, "L-peptide linking") or cif::iequals(m_group, "peptide"))
		return "L-peptide linking";

	if (cif::iequals(m_group, "DNA"))
		return "DNA linking";

	if (cif::iequals(m_group, "RNA"))
		return "RNA linking";

	return string();
}
// True when the id of this compound is exactly "HOH" or "H2O".
bool compound::is_water() const
{
	if (m_id == "HOH")
		return true;

	return m_id == "H2O";
}
// Return a copy of the first atom whose id equals `atom_id`.
// Throws out_of_range when this compound has no such atom; this includes the
// case of an empty atom_id that matches nothing.
comp_atom compound::get_atom_by_id(const string& atom_id) const
{
	for (const auto& a: m_atoms)
	{
		if (a.id == atom_id)
			return a;
	}

	throw out_of_range("No atom " + atom_id + " in compound " + m_id);
}
// Obtain the compound with the given id from the shared compound_factory.
// The result is whatever compound_factory::create returns for that id.
const compound* compound::create(const string& id)
{
	compound_factory& factory = compound_factory::instance();
	return factory.create(id);
}
// --------------------------------------------------------------------
// a factory class to generate compounds
// The one factory object; stays nullptr until instance() creates it.
compound_factory* compound_factory::sInstance = nullptr;
// Read the location of the monomer library from the CLIBD_MON environment
// variable. Throws runtime_error when that variable is not set.
compound_factory::compound_factory()
{
	const char* clibd_mon = getenv("CLIBD_MON");
	if (clibd_mon == nullptr)
		throw runtime_error("Cannot locate peptide list, please source the CCP4 environment");

	m_clibd_mon = clibd_mon;
}
// Destructor; the body is empty.
// NOTE(review): the compound objects stored in m_compounds are created with
// new in create() and are not deleted here - confirm this is intended.
compound_factory::~compound_factory()
{
}
// Return the shared factory object. It is created with new on the first call;
// the constructor may throw runtime_error, in which case sInstance stays
// nullptr.
compound_factory& compound_factory::instance()
{
	if (sInstance != nullptr)
		return *sInstance;

	sInstance = new compound_factory();
	return *sInstance;
}
// Return the compound for `id`, the three letter code (converted to upper
// case before use).
//
// A compound created earlier is returned from m_compounds. Otherwise the file
// <CLIBD_MON>/<first letter of id, lower case>/<id>.cif is loaded and a new
// compound is built from its chem_comp, chem_comp_atom and chem_comp_bond
// data and remembered for later calls.
//
// Returns nullptr when the file cannot be opened. An exception thrown while
// loading the file is reported on cerr and passed on unchanged.
const compound* compound_factory::create(std::string id)
{
	ba::to_upper(id);

	compound* result = nullptr;

	for (auto cmp: m_compounds)
	{
		if (cmp->id() == id)
		{
			result = cmp;
			break;
		}
	}

	if (result == nullptr)
	{
		fs::path resFile = m_clibd_mon / ba::to_lower_copy(id.substr(0, 1)) / (id + ".cif");
		fs::ifstream file(resFile);

		if (file.is_open())
		{
			cif::file cf;

			try
			{
				cf.load(file);
			}
			catch (const exception& ex)
			{
				cerr << "Error while loading " << resFile << endl;
				// rethrow the original exception object; 'throw ex' would
				// throw a copy sliced down to std::exception
				throw;
			}

			auto& list = cf["comp_list"];
			auto row = list["chem_comp"][cif::key("id") == id];

			string name, group;
			uint32 number_atoms_all, number_atoms_nh;
			cif::tie(name, group, number_atoms_all, number_atoms_nh) =
				row.get("name", "group", "number_atoms_all", "number_atoms_nh");

			ba::trim(name);
			ba::trim(group);

			auto& comp_atoms = cf["comp_" + id]["chem_comp_atom"];

			vector<comp_atom> atoms;
			for (auto row: comp_atoms)
			{
				string atom_id, symbol, energy;
				float charge;

				cif::tie(atom_id, symbol, energy, charge) = row.get("atom_id", "type_symbol", "type_energy", "partial_charge");

				atoms.push_back({
					atom_id, atom_type_traits(symbol).type(), energy, charge
				});
			}

			auto& comp_bonds = cf["comp_" + id]["chem_comp_bond"];

			// bond order per pair of atom ids, stored in one direction only
			map<tuple<string,string>,float> bonds;
			for (auto row: comp_bonds)
			{
				string atom_id_1, atom_id_2, type;

				cif::tie(atom_id_1, atom_id_2, type) = row.get("atom_id_1", "atom_id_2", "type");

				float value = 0;
				if (type == "single") value = 1;
				else if (type == "double") value = 2;
				else if (type == "triple") value = 3;
				else if (type == "deloc" or type == "aromat")
					value = 1.5;
				else
				{
					// unknown bond types are reported and treated as single
					cerr << "Unimplemented chem_comp_bond.type " << type << " in file " << resFile << endl;
					value = 1.0;
				}

				bonds[make_tuple(atom_id_1, atom_id_2)] = value;
			}

			result = new compound(id, name, group, move(atoms), move(bonds));
			m_compounds.push_back(result);
		}
	}

	return result;
}
// True when m_bonds holds an entry for the two atom ids, in either order.
bool compound::atoms_bonded(const string& atom_id_1, const string& atom_id_2) const
{
	if (m_bonds.count(make_tuple(atom_id_1, atom_id_2)) != 0)
		return true;

	return m_bonds.count(make_tuple(atom_id_2, atom_id_1)) != 0;
}
float compound::atom_bond_value(const string& atom_id_1, const string& atom_id_2) const
{
auto i = m_bonds.find(make_tuple(atom_id_1, atom_id_2));
if (i == m_bonds.end())
i = m_bonds.find(make_tuple(atom_id_2, atom_id_1));
return i == m_bonds.end() ? 0 : i->second;
}
}
#include "libpr.h"
#include <map>
#include <set>
#include <boost/date_time/gregorian/gregorian.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/format.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include "peptidedb.h"
#include "pdb2cif.h"
#include "libcif/atom_type.h"
#include "libcif/compound.h"
#include "libcif/pdb2cif-remark3.h"
using namespace std;
namespace ba = boost::algorithm;
using cif::datablock;
using cif::category;
using cif::row;
using cif::key;
using cif::iequals;
// Terminal escape sequences: kRedOn selects attributes 37;1;41 (in ANSI SGR
// terms white, bold, red background), kRedOff resets all attributes.
static const char* kRedOn = "\033[37;1;41m";
static const char* kRedOff = "\033[0m";
// --------------------------------------------------------------------
// One entry of a REMARK 3 template table (see the k..._Template arrays).
struct TemplateLine
{
// Pattern for the text of one line; the tables supply regular expressions
// whose capture groups hold the values.
const char* rx;
// Relative step to another table entry (1 in most entries, 0 or negative to
// repeat) - presumably applied after this entry matched; confirm in
// Remark3Parser.
int next_state_offset;
// Name of the category the captured values belong to; omitted in the tables
// for lines that carry no data.
const char* category;
// Item names within the category, one per capture group of rx.
initializer_list<const char*> items;
// Restraint type name given by the refine_ls_restr entries, nullptr otherwise.
const char* ls_restr_type = nullptr;
// Presumably: whether a new row is started for this entry - confirm in
// Remark3Parser.
bool create_new;
};
// --------------------------------------------------------------------
// Template for the REMARK 3 text handled by BUSTER_TNT_Remark3Parser.
// Every entry is a TemplateLine; the number in the leading comment is the
// index of the entry. The last entry (92) has offset 78 - 92, which points
// back at entry 78 (TLS GROUP) so the TLS block can repeat.
const TemplateLine kBusterTNT_Template[] = {
/* 0 */ { R"(DATA USED IN REFINEMENT\.)", 1 },
/* 1 */ { R"(RESOLUTION RANGE HIGH \(ANGSTROMS\) :\s+(.+?))", 1, "refine", { "ls_d_res_high" } },
/* 2 */ { R"(RESOLUTION RANGE LOW \(ANGSTROMS\) :\s+(.+?))", 1, "refine", { "ls_d_res_low" } },
/* 3 */ { R"(DATA CUTOFF \(SIGMA\(F\)\) :\s+(.+?))", 1, "refine", { "pdbx_ls_sigma_F" } },
/* 4 */ { R"(COMPLETENESS FOR RANGE \(%\) :\s+(.+?))", 1, "refine", { "ls_percent_reflns_obs" } },
/* 5 */ { R"(NUMBER OF REFLECTIONS :\s+(.+?))", 1, "refine", { "ls_number_reflns_obs" } },
/* 6 */ { R"(FIT TO DATA USED IN REFINEMENT\.)", 1 },
/* 7 */ { R"(CROSS-VALIDATION METHOD :\s+(.+?))", 1, "refine", { "pdbx_ls_cross_valid_method" } },
/* 8 */ { R"(FREE R VALUE TEST SET SELECTION :\s+(.+?))", 1, "refine", { "pdbx_R_Free_selection_details" } },
/* 9 */ { R"(R VALUE \(WORKING \+ TEST SET\) :\s+(.+?))", 1, "refine", { "ls_R_factor_obs" } },
/* 10 */ { R"(R VALUE \(WORKING SET\) :\s+(.+?))", 1, "refine", { "ls_R_factor_R_work" } },
/* 11 */ { R"(FREE R VALUE :\s+(.+?))", 1, "refine", { "ls_R_factor_R_free" } },
/* 12 */ { R"(FREE R VALUE TEST SET SIZE \(%\) :\s+(.+?))", 1, "refine", { "ls_percent_reflns_R_free" } },
/* 13 */ { R"(FREE R VALUE TEST SET COUNT :\s+(.+?))", 1, "refine", { "ls_number_reflns_R_free" } },
/* 14 */ { R"(ESTIMATED ERROR OF FREE R VALUE :\s+(.+?))", 1, "refine", { "ls_R_factor_R_free_error" } },
/* 15 */ { R"(FIT IN THE HIGHEST RESOLUTION BIN\.)", 1 },
/* 16 */ { R"(TOTAL NUMBER OF BINS USED :\s+(.+?))", 1, "refine_ls_shell", { "pdbx_total_number_of_bins_used" } },
/* 17 */ { R"(BIN RESOLUTION RANGE HIGH \(ANGSTROMS\) :\s+(.+?))", 1, "refine_ls_shell", { "d_res_high" } },
/* 18 */ { R"(BIN RESOLUTION RANGE LOW \(ANGSTROMS\) :\s+(.+?))", 1, "refine_ls_shell", { "d_res_low" } },
/* 19 */ { R"(BIN COMPLETENESS \(WORKING\+TEST\) \(%\) :\s+(.+?))", 1, "refine_ls_shell", { "percent_reflns_obs" } },
/* 20 */ { R"(REFLECTIONS IN BIN \(WORKING \+ TEST SET\) :\s+(.+?))", 1, "refine_ls_shell", { "number_reflns_all" } },
/* 21 */ { R"(BIN R VALUE \(WORKING \+ TEST SET\) :\s+(.+?))", 1, "refine_ls_shell", { "R_factor_all" } },
/* 22 */ { R"(REFLECTIONS IN BIN \(WORKING SET\) :\s+(.+?))", 1, "refine_ls_shell", { "number_reflns_R_work" } },
/* 23 */ { R"(BIN R VALUE \(WORKING SET\) :\s+(.+?))", 1, "refine_ls_shell", { "R_factor_R_work" } },
/* 24 */ { R"(BIN FREE R VALUE :\s+(.+?))", 1, "refine_ls_shell", { "R_factor_R_free" } },
/* 25 */ { R"(BIN FREE R VALUE TEST SET SIZE \(%\) :\s+(.+?))", 1, "refine_ls_shell", { "percent_reflns_R_free" } },
/* 26 */ { R"(BIN FREE R VALUE TEST SET COUNT :\s+(.+?))", 1, "refine_ls_shell", { "number_reflns_R_free" } },
/* 27 */ { R"(ESTIMATED ERROR OF BIN FREE R VALUE :\s+(.+?))", 1, "refine_ls_shell", { "R_factor_R_free_error" } },
/* 28 */ { R"(NUMBER OF NON-HYDROGEN ATOMS USED IN REFINEMENT\.)", 1 },
/* 29 */ { R"(PROTEIN ATOMS :\s+(.+?))", 1, "refine_hist", { "pdbx_number_atoms_protein" } },
/* 30 */ { R"(NUCLEIC ACID ATOMS :\s+(.+?))", 1, "refine_hist", { "pdbx_number_atoms_nucleic_acid" } },
/* 31 */ { R"(HETEROGEN ATOMS :\s+(.+?))", 1, "refine_hist", { "pdbx_number_atoms_ligand" } },
/* 32 */ { R"(SOLVENT ATOMS :\s+(.+?))", 1, "refine_hist", { "number_atoms_solvent" } },
/* 33 */ { R"(B VALUES\.)", 1 },
/* 34 */ { R"(B VALUE TYPE :\s+(.+?))", 1, "refine", { "pdbx_TLS_residual_ADP_flag" } },
/* 35 */ { R"(FROM WILSON PLOT \(A\*\*2\) :\s+(.+?))", 1, "reflns", { "B_iso_Wilson_estimate" } },
/* 36 */ { R"(MEAN B VALUE \(OVERALL, A\*\*2\) :\s+(.+?))", 1, "refine", { "B_iso_mean" } },
/* 37 */ { R"(OVERALL ANISOTROPIC B VALUE\.)", 1 },
/* 38 */ { R"(B11 \(A\*\*2\) :\s+(.+?))", 1, "refine", { "aniso_B[1][1]" } },
/* 39 */ { R"(B22 \(A\*\*2\) :\s+(.+?))", 1, "refine", { "aniso_B[2][2]" } },
/* 40 */ { R"(B33 \(A\*\*2\) :\s+(.+?))", 1, "refine", { "aniso_B[3][3]" } },
/* 41 */ { R"(B12 \(A\*\*2\) :\s+(.+?))", 1, "refine", { "aniso_B[1][2]" } },
/* 42 */ { R"(B13 \(A\*\*2\) :\s+(.+?))", 1, "refine", { "aniso_B[1][3]" } },
/* 43 */ { R"(B23 \(A\*\*2\) :\s+(.+?))", 1, "refine", { "aniso_B[2][3]" } },
/* 44 */ { R"(ESTIMATED COORDINATE ERROR\.)", 1 },
/* 45 */ { R"(ESD FROM LUZZATI PLOT \(A\) :\s+(.+?))", 1, "refine_analyze", { "Luzzati_coordinate_error_obs" } },
/* 46 */ { R"(DPI \(BLOW EQ-10\) BASED ON R VALUE \(A\) :\s+(.+?))", 1, "refine", { "pdbx_overall_SU_R_Blow_DPI" } },
/* 47 */ { R"(DPI \(BLOW EQ-9\) BASED ON FREE R VALUE \(A\) :\s+(.+?))", 1, "refine", { "pdbx_overall_SU_R_free_Blow_DPI" } },
/* 48 */ { R"(DPI \(CRUICKSHANK\) BASED ON R VALUE \(A\) :\s+(.+?))", 1, "refine", { "overall_SU_R_Cruickshank_DPI" } },
/* 49 */ { R"(DPI \(CRUICKSHANK\) BASED ON FREE R VALUE \(A\) :\s+(.+?))", 1, "refine", { "pdbx_overall_SU_R_free_Cruickshank_DPI" } },
/* 50 */ { R"(REFERENCES: BLOW, D\. \(2002\) ACTA CRYST D58, 792-797 CRUICKSHANK, D\.W\.J\. \(1999\) ACTA CRYST D55, 583-601)", 1 },
/* 51 */ { R"(CORRELATION COEFFICIENTS\.)", 1 },
/* 52 */ { R"(CORRELATION COEFFICIENT FO-FC :\s+(.+?))", 1, "refine", { "correlation_coeff_Fo_to_Fc" } },
/* 53 */ { R"(CORRELATION COEFFICIENT FO-FC FREE :\s+(.+?))", 1, "refine", { "correlation_coeff_Fo_to_Fc_free" } },
/* 54 */ { R"(NUMBER OF GEOMETRIC FUNCTION TERMS DEFINED : 15)", 1 },
/* 55 */ { R"(TERM COUNT WEIGHT FUNCTION\.)", 1 },
/* 56 */ { R"(BOND LENGTHS :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_bond_d", true },
/* 57 */ { R"(BOND ANGLES :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_angle_deg", true },
/* 58 */ { R"(TORSION ANGLES :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_dihedral_angle_d", true },
/* 59 */ { R"(TRIGONAL CARBON PLANES :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_trig_c_planes", true },
/* 60 */ { R"(GENERAL PLANES :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_gen_planes", true },
/* 61 */ { R"(ISOTROPIC THERMAL FACTORS :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_it", true },
/* 62 */ { R"(BAD NON-BONDED CONTACTS :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_nbd", true },
/* 63 */ { R"(IMPROPER TORSIONS :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_improper_torsion", true },
/* 64 */ { R"(PSEUDOROTATION ANGLES :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_pseud_angle", true },
/* 65 */ { R"(CHIRAL IMPROPER TORSION :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_chiral_improper_torsion", true },
/* 66 */ { R"(SUM OF OCCUPANCIES :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_sum_occupancies", true },
/* 67 */ { R"(UTILITY DISTANCES :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_utility_distance", true },
/* 68 */ { R"(UTILITY ANGLES :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_utility_angle", true },
/* 69 */ { R"(UTILITY TORSION :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_utility_torsion", true },
/* 70 */ { R"(IDEAL-DIST CONTACT TERM :\s+(.+?);\s+(.+?);\s+(.+?))", 1, "refine_ls_restr", { "number", "weight", "pdbx_restraint_function" }, "t_ideal_dist_contact", true },
/* 71 */ { R"(RMS DEVIATIONS FROM IDEAL VALUES\.)", 1 },
/* 72 */ { R"(BOND LENGTHS \(A\) :\s+(.+?))", 1, "refine_ls_restr", { "dev_ideal" }, "t_bond_d", false },
/* 73 */ { R"(BOND ANGLES \(DEGREES\) :\s+(.+?))", 1, "refine_ls_restr", { "dev_ideal" }, "t_angle_deg", false },
/* 74 */ { R"(PEPTIDE OMEGA TORSION ANGLES \(DEGREES\) :\s+(.+?))", 1, "refine_ls_restr", { "dev_ideal" }, "t_omega_torsion", false },
/* 75 */ { R"(OTHER TORSION ANGLES \(DEGREES\) :\s+(.+?))", 1, "refine_ls_restr", { "dev_ideal" }, "t_other_torsion", false },
/* 76 */ { R"(TLS DETAILS\.?)", 1 },
/* 77 */ { R"(NUMBER OF TLS GROUPS :.+)", 1 },
/* 78 */ { R"(TLS GROUP :\s*(\d+))", 1, "pdbx_refine_tls", { "id" }, nullptr, true },
/* 79 */ { R"(SELECTION:\s+(.+?))", 1, "pdbx_refine_tls_group", { "selection_details" }, nullptr, true },
/* 80 */ { R"(ORIGIN FOR THE GROUP \(A\):\s+(.+?)\s+(.+?)\s+(.+?))", 1, "pdbx_refine_tls", { "origin_x", "origin_y", "origin_z" } },
/* 81 */ { R"(T TENSOR)", 1 },
/* 82 */ { R"(T11:\s+(.+?) T22:\s+(.+?))", 1, "pdbx_refine_tls", { "T[1][1]", "T[2][2]" } },
/* 83 */ { R"(T33:\s+(.+?) T12:\s+(.+?))", 1, "pdbx_refine_tls", { "T[3][3]", "T[1][2]" } },
/* 84 */ { R"(T13:\s+(.+?) T23:\s+(.+?))", 1, "pdbx_refine_tls", { "T[1][3]", "T[2][3]" } },
/* 85 */ { R"(L TENSOR)", 1 },
/* 86 */ { R"(L11:\s+(.+?) L22:\s+(.+?))", 1, "pdbx_refine_tls", { "L[1][1]", "L[2][2]" } },
/* 87 */ { R"(L33:\s+(.+?) L12:\s+(.+?))", 1, "pdbx_refine_tls", { "L[3][3]", "L[1][2]" } },
/* 88 */ { R"(L13:\s+(.+?) L23:\s+(.+?))", 1, "pdbx_refine_tls", { "L[1][3]", "L[2][3]" } },
/* 89 */ { R"(S TENSOR)", 1 },
/* 90 */ { R"(S11:\s+(.+?) S12:\s+(.+?) S13:\s+(.+?))", 1, "pdbx_refine_tls", { "S[1][1]", "S[1][2]", "S[1][3]" } },
/* 91 */ { R"(S21:\s+(.+?) S22:\s+(.+?) S23:\s+(.+?))", 1, "pdbx_refine_tls", { "S[2][1]", "S[2][2]", "S[2][3]" } },
/* 92 */ { R"(S31:\s+(.+?) S32:\s+(.+?) S33:\s+(.+?))", 78 - 92, "pdbx_refine_tls", { "S[3][1]", "S[3][2]", "S[3][3]" } },
};
// Remark3Parser set up with the BUSTER-TNT template table and a regular
// expression matching the program name "BUSTER" or "BUSTER-TNT", optionally
// followed by a version.
class BUSTER_TNT_Remark3Parser : public Remark3Parser
{
  public:
	BUSTER_TNT_Remark3Parser(const string& name, const string& expMethod, PDBRecord* r, cif::datablock& db)
		: Remark3Parser(
			name, expMethod, r, db,
			kBusterTNT_Template,
			sizeof(kBusterTNT_Template) / sizeof(kBusterTNT_Template[0]),
			regex(R"((BUSTER(?:-TNT)?)(?: (\d+(?:\..+)?))?)"))
	{
	}
};
// Template for the REMARK 3 text handled by CNS_Remark3Parser.
// Every entry is a TemplateLine; the number in the leading comment is the
// index of the entry. Entries 75 and 77 to 80 have their category and items
// commented out, so they carry a pattern and an offset only.
const TemplateLine kCNS_Template[] = {
/* 0 */ { R"(REFINEMENT TARGET\s*:\s*(.+))", 1, "refine", { "pdbx_stereochemistry_target_values" } },
/* 1 */ { R"(DATA USED IN REFINEMENT\.)", 1 },
/* 2 */ { R"(RESOLUTION RANGE HIGH \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_high" } },
/* 3 */ { R"(RESOLUTION RANGE LOW \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_low" } },
/* 4 */ { R"(DATA CUTOFF \(SIGMA\(F\)\)\s*:\s*(.+))", 1, "refine", { "pdbx_ls_sigma_F" } },
/* 5 */ { R"(DATA CUTOFF HIGH \(ABS\(F\)\)\s*:\s*(.+))", 1, "refine", { "pdbx_data_cutoff_high_absF" } },
/* 6 */ { R"(DATA CUTOFF LOW \(ABS\(F\)\)\s*:\s*(.+))", 1, "refine", { "pdbx_data_cutoff_low_absF" } },
/* 7 */ { R"(COMPLETENESS \(WORKING\+TEST\) \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_obs" } },
/* 8 */ { R"(NUMBER OF REFLECTIONS\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_obs" } },
/* 9 */ { R"(FIT TO DATA USED IN REFINEMENT\.)", 1 },
/* 10 */ { R"(CROSS-VALIDATION METHOD\s*:\s*(.+))", 1, "refine", { "pdbx_ls_cross_valid_method" } },
/* 11 */ { R"(FREE R VALUE TEST SET SELECTION\s*:\s*(.+))", 1, "refine", { "pdbx_R_Free_selection_details" } },
/* 12 */ { R"(R VALUE \(WORKING \+ TEST SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_obs" } },
/* 13 */ { R"(R VALUE \(WORKING SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_work" } },
/* 14 */ { R"(FREE R VALUE\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_free" } },
/* 15 */ { R"(FREE R VALUE TEST SET SIZE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_R_free" } },
/* 16 */ { R"(FREE R VALUE TEST SET COUNT\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_R_free" } },
/* 17 */ { R"(ESTIMATED ERROR OF FREE R VALUE\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_free_error" } },
/* 18 */ { R"(FIT/AGREEMENT OF MODEL WITH ALL DATA\.)", 1 },
/* 19 */ { R"(R VALUE \(WORKING \+ TEST SET, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "R_factor_all_no_cutoff" } },
/* 20 */ { R"(R VALUE \(WORKING SET, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "R_factor_obs_no_cutoff" } },
/* 21 */ { R"(FREE R VALUE \(NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_factor_no_cutoff" } },
/* 22 */ { R"(FREE R VALUE TEST SET SIZE \(%, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_val_test_set_size_perc_no_cutoff" } },
/* 23 */ { R"(FREE R VALUE TEST SET COUNT \(NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_val_test_set_ct_no_cutoff" } },
/* 24 */ { R"(ESTIMATED ERROR OF FREE R VALUE \(NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_error_no_cutoff" } },
/* 25 */ { R"(TOTAL NUMBER OF REFLECTIONS \(NO CUTOFF\)\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_all" } },
/* 26 */ { R"(FIT IN THE HIGHEST RESOLUTION BIN\.)", 1 },
/* 27 */ { R"(TOTAL NUMBER OF BINS USED\s*:\s*(.+))", 1, "refine_ls_shell", { "pdbx_total_number_of_bins_used" } },
/* 28 */ { R"(BIN RESOLUTION RANGE HIGH \(A\)\s*:\s*(.+))", 1, "refine_ls_shell", { "d_res_high" } },
/* 29 */ { R"(BIN RESOLUTION RANGE LOW \(A\)\s*:\s*(.+))", 1, "refine_ls_shell", { "d_res_low" } },
/* 30 */ { R"(BIN COMPLETENESS \(WORKING\+TEST\) \(%\)\s*:\s*(.+))", 1, "refine_ls_shell", { "percent_reflns_obs" } },
/* 31 */ { R"(REFLECTIONS IN BIN \(WORKING SET\)\s*:\s*(.+))", 1, "refine_ls_shell", { "number_reflns_R_work" } },
/* 32 */ { R"(BIN R VALUE \(WORKING SET\)\s*:\s*(.+))", 1, "refine_ls_shell", { "R_factor_R_work" } },
/* 33 */ { R"(BIN FREE R VALUE\s*:\s*(.+))", 1, "refine_ls_shell", { "R_factor_R_free" } },
/* 34 */ { R"(BIN FREE R VALUE TEST SET SIZE \(%\)\s*:\s*(.+))", 1, "refine_ls_shell", { "percent_reflns_R_free" } },
/* 35 */ { R"(BIN FREE R VALUE TEST SET COUNT\s*:\s*(.+))", 1, "refine_ls_shell", { "number_reflns_R_free" } },
/* 36 */ { R"(ESTIMATED ERROR OF BIN FREE R VALUE\s*:\s*(.+))", 1, "refine_ls_shell", { "R_factor_R_free_error" } },
/* 37 */ { R"(NUMBER OF NON-HYDROGEN ATOMS USED IN REFINEMENT\.)", 1 },
/* 38 */ { R"(PROTEIN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_protein" } },
/* 39 */ { R"(NUCLEIC ACID ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_nucleic_acid" } },
/* 40 */ { R"(HETEROGEN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_ligand" } },
/* 41 */ { R"(SOLVENT ATOMS\s*:\s*(.+))", 1, "refine_hist", { "number_atoms_solvent" } },
/* 42 */ { R"(B VALUES\.)", 1 },
/* 43 */ { R"(B VALUE TYPE\s*:\s*(.+))", 1, "refine", { "pdbx_TLS_residual_ADP_flag" } },
/* 44 */ { R"(FROM WILSON PLOT \(A\*\*2\)\s*:\s*(.+))", 1, "reflns", { "B_iso_Wilson_estimate" } },
/* 45 */ { R"(MEAN B VALUE \(OVERALL, A\*\*2\)\s*:\s*(.+))", 1, "refine", { "B_iso_mean" } },
/* 46 */ { R"(OVERALL ANISOTROPIC B VALUE\.)", 1 },
/* 47 */ { R"(B11 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][1]" } },
/* 48 */ { R"(B22 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[2][2]" } },
/* 49 */ { R"(B33 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[3][3]" } },
/* 50 */ { R"(B12 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][2]" } },
/* 51 */ { R"(B13 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][3]" } },
/* 52 */ { R"(B23 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[2][3]" } },
/* 53 */ { R"(ESTIMATED COORDINATE ERROR\.)", 1 },
/* 54 */ { R"(ESD FROM LUZZATI PLOT \(A\)\s*:\s*(.+))", 1, "refine_analyze", { "Luzzati_coordinate_error_obs" } },
/* 55 */ { R"(ESD FROM SIGMAA \(A\)\s*:\s*(.+))", 1, "refine_analyze", { "Luzzati_sigma_a_obs" } },
/* 56 */ { R"(LOW RESOLUTION CUTOFF \(A\)\s*:\s*(.+))", 1, "refine_analyze", { "Luzzati_d_res_low_obs" } },
/* 57 */ { R"(CROSS-VALIDATED ESTIMATED COORDINATE ERROR\.)", 1 },
/* 58 */ { R"(ESD FROM C-V LUZZATI PLOT \(A\)\s*:\s*(.+))", 1, "refine_analyze", { "Luzzati_coordinate_error_free" } },
/* 59 */ { R"(ESD FROM C-V SIGMAA \(A\)\s*:\s*(.+))", 1, "refine_analyze", { "Luzzati_sigma_a_free" } },
/* 60 */ { R"(RMS DEVIATIONS FROM IDEAL VALUES\.)", 1 },
/* 61 */ { R"(BOND LENGTHS \(A\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "c_bond_d", false },
/* 62 */ { R"(BOND ANGLES \(DEGREES\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "c_angle_deg", false },
/* 63 */ { R"(DIHEDRAL ANGLES \(DEGREES\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "c_dihedral_angle_d", false },
/* 64 */ { R"(IMPROPER ANGLES \(DEGREES\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "c_improper_angle_d", false },
/* 65 */ { R"(ISOTROPIC THERMAL MODEL\s*:\s*(.+))", 1, "refine", { "pdbx_isotropic_thermal_model" } },
/* 66 */ { R"(ISOTROPIC THERMAL FACTOR RESTRAINTS\. RMS SIGMA)", 1 },
/* 67 */ { R"(MAIN-CHAIN BOND \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "c_mcbond_it", false },
/* 68 */ { R"(MAIN-CHAIN ANGLE \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "c_mcangle_it", false },
/* 69 */ { R"(SIDE-CHAIN BOND \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "c_scbond_it", false },
/* 70 */ { R"(SIDE-CHAIN ANGLE \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "c_scangle_it", false },
/* 71 */ { R"(BULK SOLVENT MODELING\.)", 1 },
/* 72 */ { R"(METHOD USED\s*:\s*(.+))", 1, "refine", { "solvent_model_details" } },
/* 73 */ { R"(KSOL\s*:\s*(.+))", 1, "refine", { "solvent_model_param_ksol" } },
/* 74 */ { R"(BSOL\s*:\s*(.+))", 1, "refine", { "solvent_model_param_bsol" } },
/* 75 */ { R"(NCS MODEL\s*:\s*(.+))", 1, /* "refine_ls_restr_ncs", { "ncs_model_details" } */ },
/* 76 */ { R"(NCS RESTRAINTS\. RMS SIGMA/WEIGHT)", 1 },
/* 77 */ { R"(GROUP (\d+) POSITIONAL \(A\)\s*:\s*(.+))", 1, /* "refine_ls_restr_ncs", { "dom_id", "rms_dev_position", "weight_position" } */ },
/* 78 */ { R"(GROUP (\d+) B-FACTOR \(A\*\*2\)\s*:\s*(.+))", 1, /* "refine_ls_restr_ncs", { "dom_id", "rms_dev_B_iso", "weight_B_iso" } */ },
/* 79 */ { R"(PARAMETER FILE (\d+) :\s+(.+))", 1, /* "pdbx_xplor_file", { "serial_no", "param_file" } */ },
/* 80 */ { R"(TOPOLOGY FILE (\d+) :\s+(.+))", 1, /* "pdbx_xplor_file", { "serial_no", "topol_file" } */ },
};
// Remark 3 parser for records written by CNS or CNX. The class adds nothing
// of its own: the constructor only forwards its arguments, together with
// kCNS_Template and the program-name pattern, to the Remark3Parser base.
class CNS_Remark3Parser : public Remark3Parser
{
  public:
	CNS_Remark3Parser(const string& name, const string& expMethod, PDBRecord* r, cif::datablock& db)
		: Remark3Parser(
			name, expMethod, r, db,
			kCNS_Template, sizeof(kCNS_Template) / sizeof(TemplateLine),
			// "CNS" or "CNX", optionally followed by a space and a numeric version
			regex(R"((CN[SX])(?: (\d+(?:\.\d+)?))?)"))
	{
	}
};
// Line templates for the REMARK 3 text written by PHENIX (phenix.refine).
//
// Every entry holds, in order: a regular expression for one line of text, an
// integer step and, optionally, a category name, a list of item names, a
// refine_ls_restr type label and a boolean flag.
// NOTE(review): the integer looks like a relative offset to the entry that is
// tried next (entry 62 uses 48 - 62, which points back at entry 48,
// "TLS GROUP", so the TLS section can repeat), the item names presumably
// receive the capture groups in order, and the boolean looks like a
// "start a new row" switch -- confirm against the TemplateLine definition,
// which is not in view here.
// The /* n */ comments are the entry indices such offsets are computed from;
// keep them in step when entries are inserted or removed.
const TemplateLine kPHENIX_Template[] = {
	/* 0 */ { R"(REFINEMENT TARGET\s*:\s*(.+))", 1, "refine", { "pdbx_stereochemistry_target_values" } },
	/* 1 */ { R"(DATA USED IN REFINEMENT\.)", 1 },
	/* 2 */ { R"(RESOLUTION RANGE HIGH \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_high" } },
	/* 3 */ { R"(RESOLUTION RANGE LOW \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_low" } },
	/* 4 */ { R"(MIN\(FOBS/SIGMA_FOBS\)\s*:\s*(.+))", 1, "refine", { "pdbx_ls_sigma_F" } },
	/* 5 */ { R"(COMPLETENESS FOR RANGE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_obs" } },
	/* 6 */ { R"(NUMBER OF REFLECTIONS\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_obs" } },
	/* 7 */ { R"(FIT TO DATA USED IN REFINEMENT\.)", 1 },
	/* 8 */ { R"(R VALUE \(WORKING \+ TEST SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_obs" } },
	/* 9 */ { R"(R VALUE \(WORKING SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_work" } },
	/* 10 */ { R"(FREE R VALUE\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_free" } },
	/* 11 */ { R"(FREE R VALUE TEST SET SIZE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_R_free" } },
	/* 12 */ { R"(FREE R VALUE TEST SET COUNT\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_R_free" } },
	/* 13 */ { R"(FIT TO DATA USED IN REFINEMENT \(IN BINS\)\.)", 1 },
	/* 14 */ { R"(BIN RESOLUTION RANGE COMPL\. NWORK NFREE RWORK RFREE)", 1 },
	// One row of the per-bin table; offset 0, so this entry can match repeatedly.
	/* 15 */ { R"(\d+ (\d+(?:\.\d+)?) - (\d+(?:\.\d+)?) (\d+(?:\.\d+)?) (\d+) (\d+) (\d+(?:\.\d+)?) (\d+(?:\.\d+)?))", 0,
		"refine_ls_shell", { "d_res_low", "d_res_high", "percent_reflns_obs", "number_reflns_R_work", "number_reflns_R_free", "R_factor_R_work", "R_factor_R_free" },
		nullptr, true },
	/* 16 */ { R"(BULK SOLVENT MODELLING\.)", 1 },
	/* 17 */ { R"(METHOD USED\s*:\s*(.+))", 1, "refine", { "solvent_model_details" } },
	/* 18 */ { R"(SOLVENT RADIUS\s*:\s*(.+))", 1, "refine", { "pdbx_solvent_vdw_probe_radii" } },
	/* 19 */ { R"(SHRINKAGE RADIUS\s*:\s*(.+))", 1, "refine", { "pdbx_solvent_shrinkage_radii" } },
	/* 20 */ { R"(K_SOL\s*:\s*(.+))", 1, "refine", { "solvent_model_param_ksol" } },
	/* 21 */ { R"(B_SOL\s*:\s*(.+))", 1, "refine", { "solvent_model_param_bsol" } },
	/* 22 */ { R"(ERROR ESTIMATES\.)", 1 },
	/* 23 */ { R"(COORDINATE ERROR \(MAXIMUM-LIKELIHOOD BASED\)\s*:\s*(.+))", 1, "refine", { "overall_SU_ML" } },
	/* 24 */ { R"(PHASE ERROR \(DEGREES, MAXIMUM-LIKELIHOOD BASED\)\s*:\s*(.+))", 1, "refine", { "pdbx_overall_phase_error" } },
	/* 25 */ { R"(B VALUES\.)", 1 },
	/* 26 */ { R"(B VALUE TYPE\s*:\s*(.+))", 1, "refine", { "pdbx_TLS_residual_ADP_flag" } },
	/* 27 */ { R"(FROM WILSON PLOT \(A\*\*2\)\s*:\s*(.+))", 1, "reflns", { "B_iso_Wilson_estimate" } },
	/* 28 */ { R"(MEAN B VALUE \(OVERALL, A\*\*2\)\s*:\s*(.+))", 1, "refine", { "B_iso_mean" } },
	/* 29 */ { R"(OVERALL ANISOTROPIC B VALUE\.)", 1 },
	/* 30 */ { R"(B11 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][1]" } },
	/* 31 */ { R"(B22 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[2][2]" } },
	/* 32 */ { R"(B33 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[3][3]" } },
	/* 33 */ { R"(B12 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][2]" } },
	/* 34 */ { R"(B13 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][3]" } },
	/* 35 */ { R"(B23 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[2][3]" } },
	/* 36 */ { R"(TWINNING INFORMATION\.)", 1 },
	/* 37 */ { R"(FRACTION:\s*(.+))", 1, "pdbx_reflns_twin", { "fraction" } },
	/* 38 */ { R"(OPERATOR:\s*(.+))", 1, "pdbx_reflns_twin", { "operator" } },
	/* 39 */ { R"(DEVIATIONS FROM IDEAL VALUES\.)", 1 },
	/* 40 */ { R"(RMSD COUNT)", 1 },
	// Entries 41-45: the fractional part of the RMSD is optional (same number
	// pattern as entry 15), so a value printed without decimals still matches.
	/* 41 */ { R"(BOND\s*:\s*(\d+(?:\.\d+)?)\s+(\d+))", 1, "refine_ls_restr", { "dev_ideal", "number" }, "f_bond_d", false },
	/* 42 */ { R"(ANGLE\s*:\s*(\d+(?:\.\d+)?)\s+(\d+))", 1, "refine_ls_restr", { "dev_ideal", "number" }, "f_angle_d", false },
	/* 43 */ { R"(CHIRALITY\s*:\s*(\d+(?:\.\d+)?)\s+(\d+))", 1, "refine_ls_restr", { "dev_ideal", "number" }, "f_chiral_restr", false },
	/* 44 */ { R"(PLANARITY\s*:\s*(\d+(?:\.\d+)?)\s+(\d+))", 1, "refine_ls_restr", { "dev_ideal", "number" }, "f_plane_restr", false },
	/* 45 */ { R"(DIHEDRAL\s*:\s*(\d+(?:\.\d+)?)\s+(\d+))", 1, "refine_ls_restr", { "dev_ideal", "number" }, "f_dihedral_angle_d", false },
	/* 46 */ { R"(TLS DETAILS)", 1 },
	/* 47 */ { R"(NUMBER OF TLS GROUPS\s*:\s*(.+))", 1 },
	/* 48 */ { R"(TLS GROUP\s*:\s*(.+))", 1, "pdbx_refine_tls", { "id" }, nullptr, true },
	/* 49 */ { R"(SELECTION:\s*(.+))", 1, "pdbx_refine_tls_group", { "selection_details" }, nullptr, true },
	/* 50 */ { R"(ORIGIN FOR THE GROUP(?:\s*\(A\))?\s*:\s*(\S+)\s+(\S+)\s+(\S+))", 1, "pdbx_refine_tls", { "origin_x", "origin_y", "origin_z" } },
	/* 51 */ { R"(T TENSOR)", 1 },
	/* 52 */ { R"(T11\s*:\s*(.+) T22\s*:\s*(.+))", 1, "pdbx_refine_tls", { "T[1][1]", "T[2][2]" } },
	/* 53 */ { R"(T33\s*:\s*(.+) T12\s*:\s*(.+))", 1, "pdbx_refine_tls", { "T[3][3]", "T[1][2]" } },
	/* 54 */ { R"(T13\s*:\s*(.+) T23\s*:\s*(.+))", 1, "pdbx_refine_tls", { "T[1][3]", "T[2][3]" } },
	/* 55 */ { R"(L TENSOR)", 1 },
	/* 56 */ { R"(L11\s*:\s*(.+) L22\s*:\s*(.+))", 1, "pdbx_refine_tls", { "L[1][1]", "L[2][2]" } },
	/* 57 */ { R"(L33\s*:\s*(.+) L12\s*:\s*(.+))", 1, "pdbx_refine_tls", { "L[3][3]", "L[1][2]" } },
	/* 58 */ { R"(L13\s*:\s*(.+) L23\s*:\s*(.+))", 1, "pdbx_refine_tls", { "L[1][3]", "L[2][3]" } },
	/* 59 */ { R"(S TENSOR)", 1 },
	/* 60 */ { R"(S11\s*:\s*(.+) S12\s*:\s*(.+) S13\s*:\s*(.+))", 1, "pdbx_refine_tls", { "S[1][1]", "S[1][2]", "S[1][3]" } },
	/* 61 */ { R"(S21\s*:\s*(.+) S22\s*:\s*(.+) S23\s*:\s*(.+))", 1, "pdbx_refine_tls", { "S[2][1]", "S[2][2]", "S[2][3]" } },
	// 48 - 62 == -14: from this entry (62) back to entry 48, "TLS GROUP".
	/* 62 */ { R"(S31\s*:\s*(.+) S32\s*:\s*(.+) S33\s*:\s*(.+))", 48 - 62, "pdbx_refine_tls", { "S[3][1]", "S[3][2]", "S[3][3]" } },
	/* 63 */ { R"(NCS DETAILS)", 1 },
	/* 64 */ { R"(NUMBER OF NCS GROUPS\s*:\s*(.+))", 1 },
};
// Remark 3 parser for records written by PHENIX. The constructor forwards its
// arguments, kPHENIX_Template and the program-name pattern to the
// Remark3Parser base; the only behaviour added here is Fixup().
class PHENIX_Remark3Parser : public Remark3Parser
{
  public:
	PHENIX_Remark3Parser(const string& name, const string& expMethod, PDBRecord* r, cif::datablock& db)
		: Remark3Parser(
			name, expMethod, r, db,
			kPHENIX_Template, sizeof(kPHENIX_Template) / sizeof(kPHENIX_Template[0]),
			// "PHENIX (PHENIX.REFINE: <version>", closing parenthesis optional;
			// captures "PHENIX" and the version text
			regex(R"((PHENIX)(?: \(PHENIX\.REFINE:) (\d+(?:\.[^)]+)?)\)?)"))
	{
	}

	// Defined out of line, below the class.
	virtual void Fixup();
};
// Rewrites percent_reflns_obs in every refine_ls_shell row of m_db as the
// stored value times 100, rounded to the nearest integer.
// NOTE(review): presumably the stored value is the COMPL. column of the
// per-bin table, written by PHENIX as a fraction (e.g. 0.98) -- confirm.
//
// Best effort: any exception thrown while reading, converting or storing the
// value of a row is swallowed and the loop continues with the next row.
void PHENIX_Remark3Parser::Fixup()
{
	for (auto r: m_db["refine_ls_shell"])
	{
		try
		{
			float val = r["percent_reflns_obs"].as<float>();

			// Round rather than truncate: in float arithmetic 0.59f * 100 is
			// 58.999996, which a bare int cast would turn into 58. Adding one
			// half before the cast is correct for the non-negative values
			// expected here.
			int perc = static_cast<int>(val * 100 + 0.5f);

			r["percent_reflns_obs"] = perc;
		}
		catch (...) {}
	}
}
// Line templates for the REMARK 3 text written by PROLSQ; the same table is
// used for NUCLSQ (see the program-name pattern in PROLSQ_Remark3Parser).
//
// Every entry holds, in order: a regular expression for one line of text, an
// integer step (always 1 in this table) and, optionally, a category name, a
// list of item names, a refine_ls_restr type label ("p_...") and a boolean
// flag (always false where given in this table).
// NOTE(review): the integer is presumably a relative offset to the entry that
// is tried next, and the item names presumably receive the capture groups in
// order -- confirm against the TemplateLine definition, which is not in view
// here. The /* n */ comments are the entry indices.
const TemplateLine kPROLSQ_Template[] = {
/* 0 */ { R"(DATA USED IN REFINEMENT\.)", 1 },
/* 1 */ { R"(RESOLUTION RANGE HIGH \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_high" } },
/* 2 */ { R"(RESOLUTION RANGE LOW \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_low" } },
/* 3 */ { R"(DATA CUTOFF \(SIGMA\(F\)\)\s*:\s*(.+))", 1, "refine", { "pdbx_ls_sigma_F" } },
/* 4 */ { R"(COMPLETENESS FOR RANGE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_obs" } },
/* 5 */ { R"(NUMBER OF REFLECTIONS\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_obs" } },
/* 6 */ { R"(FIT TO DATA USED IN REFINEMENT\.)", 1 },
/* 7 */ { R"(CROSS-VALIDATION METHOD\s*:\s*(.+))", 1, "refine", { "pdbx_ls_cross_valid_method" } },
/* 8 */ { R"(FREE R VALUE TEST SET SELECTION\s*:\s*(.+))", 1, "refine", { "pdbx_R_Free_selection_details" } },
/* 9 */ { R"(R VALUE \(WORKING \+ TEST SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_obs" } },
/* 10 */ { R"(R VALUE \(WORKING SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_work" } },
/* 11 */ { R"(FREE R VALUE\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_free" } },
/* 12 */ { R"(FREE R VALUE TEST SET SIZE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_R_free" } },
/* 13 */ { R"(FREE R VALUE TEST SET COUNT\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_R_free" } },
/* 14 */ { R"(FIT/AGREEMENT OF MODEL WITH ALL DATA\.)", 1 },
/* 15 */ { R"(R VALUE \(WORKING \+ TEST SET, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "R_factor_all_no_cutoff" } },
/* 16 */ { R"(R VALUE \(WORKING SET, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "R_factor_obs_no_cutoff" } },
/* 17 */ { R"(FREE R VALUE \(NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_factor_no_cutoff" } },
/* 18 */ { R"(FREE R VALUE TEST SET SIZE \(%, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_val_test_set_size_perc_no_cutoff" } },
/* 19 */ { R"(FREE R VALUE TEST SET COUNT \(NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_val_test_set_ct_no_cutoff" } },
/* 20 */ { R"(TOTAL NUMBER OF REFLECTIONS \(NO CUTOFF\)\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_all" } },
/* 21 */ { R"(NUMBER OF NON-HYDROGEN ATOMS USED IN REFINEMENT\.)", 1 },
/* 22 */ { R"(PROTEIN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_protein" } },
/* 23 */ { R"(NUCLEIC ACID ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_nucleic_acid" } },
/* 24 */ { R"(HETEROGEN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_ligand" } },
/* 25 */ { R"(SOLVENT ATOMS\s*:\s*(.+))", 1, "refine_hist", { "number_atoms_solvent" } },
/* 26 */ { R"(B VALUES\.)", 1 },
/* 27 */ { R"(B VALUE TYPE\s*:\s*(.+))", 1, "refine", { "pdbx_TLS_residual_ADP_flag" } },
/* 28 */ { R"(FROM WILSON PLOT \(A\*\*2\)\s*:\s*(.+))", 1, "reflns", { "B_iso_Wilson_estimate" } },
/* 29 */ { R"(MEAN B VALUE \(OVERALL, A\*\*2\)\s*:\s*(.+))", 1, "refine", { "B_iso_mean" } },
/* 30 */ { R"(OVERALL ANISOTROPIC B VALUE\.)", 1 },
/* 31 */ { R"(B11 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][1]" } },
/* 32 */ { R"(B22 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[2][2]" } },
/* 33 */ { R"(B33 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[3][3]" } },
/* 34 */ { R"(B12 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][2]" } },
/* 35 */ { R"(B13 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][3]" } },
/* 36 */ { R"(B23 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[2][3]" } },
/* 37 */ { R"(ESTIMATED COORDINATE ERROR\.)", 1 },
/* 38 */ { R"(ESD FROM LUZZATI PLOT \(A\)\s*:\s*(.+))", 1, "refine_analyze", { "Luzzati_coordinate_error_obs" } },
/* 39 */ { R"(ESD FROM SIGMAA \(A\)\s*:\s*(.+))", 1, "refine_analyze", { "Luzzati_sigma_a_obs" } },
/* 40 */ { R"(LOW RESOLUTION CUTOFF \(A\)\s*:\s*(.+))", 1, "refine_analyze", { "Luzzati_d_res_low_obs" } },
/* 41 */ { R"(RMS DEVIATIONS FROM IDEAL VALUES\.)", 1 },
/* 42 */ { R"(DISTANCE RESTRAINTS\. RMS SIGMA)", 1 },
/* 43 */ { R"(BOND LENGTH \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_bond_d", false },
/* 44 */ { R"(ANGLE DISTANCE \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_angle_d", false },
/* 45 */ { R"(INTRAPLANAR 1-4 DISTANCE \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_planar_d", false },
/* 46 */ { R"(H-BOND OR METAL COORDINATION \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_hb_or_metal_coord", false },
/* 47 */ { R"(PLANE RESTRAINT \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_plane_restr", false },
/* 48 */ { R"(CHIRAL-CENTER RESTRAINT \(A\*\*3\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_chiral_restr", false },
/* 49 */ { R"(NON-BONDED CONTACT RESTRAINTS\.)", 1 },
/* 50 */ { R"(SINGLE TORSION \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_singtor_nbd", false },
/* 51 */ { R"(MULTIPLE TORSION \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_multtor_nbd", false },
/* 52 */ { R"(H-BOND \(X\.\.\.Y\) \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_xyhbond_nbd", false },
/* 53 */ { R"(H-BOND \(X-H\.\.\.Y\) \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_xhyhbond_nbd", false },
/* 54 */ { R"(CONFORMATIONAL TORSION ANGLE RESTRAINTS\.)", 1 },
/* 55 */ { R"(SPECIFIED \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_special_tor", false },
/* 56 */ { R"(PLANAR \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_planar_tor", false },
/* 57 */ { R"(STAGGERED \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_staggered_tor", false },
/* 58 */ { R"(TRANSVERSE \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_transverse_tor", false },
/* 59 */ { R"(ISOTROPIC THERMAL FACTOR RESTRAINTS\. RMS SIGMA)", 1 },
/* 60 */ { R"(MAIN-CHAIN BOND \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_mcbond_it", false },
/* 61 */ { R"(MAIN-CHAIN ANGLE \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_mcangle_it", false },
/* 62 */ { R"(SIDE-CHAIN BOND \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_scbond_it", false },
/* 63 */ { R"(SIDE-CHAIN ANGLE \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_scangle_it", false },
};
// Remark 3 parser for records written by PROLSQ or NUCLSQ. The class adds
// nothing of its own: the constructor only forwards its arguments, together
// with kPROLSQ_Template and the program-name pattern, to the Remark3Parser
// base.
class PROLSQ_Remark3Parser : public Remark3Parser
{
  public:
	PROLSQ_Remark3Parser(const string& name, const string& expMethod, PDBRecord* r, cif::datablock& db)
		: Remark3Parser(
			name, expMethod, r, db,
			kPROLSQ_Template, sizeof(kPROLSQ_Template) / sizeof(kPROLSQ_Template[0]),
			// "PROLSQ" or "NUCLSQ", optionally followed by a space and a numeric version
			regex(R"((PROLSQ|NUCLSQ)(?: (\d+(?:\.\d+)?))?)"))
	{
	}
};
// Line templates for the REMARK 3 text handled by REFMAC_Remark3Parser.
//
// Every entry holds, in order: a regular expression for one line of text, an
// integer step (always 1 in this table) and, optionally, a category name, a
// list of item names, a refine_ls_restr type label ("p_...") and a boolean
// flag (always false where given in this table).
// NOTE(review): the integer is presumably a relative offset to the entry that
// is tried next, and the item names presumably receive the capture groups in
// order -- confirm against the TemplateLine definition, which is not in view
// here.
//
// Literal periods in the section headings are escaped, so they match only a
// '.' and not an arbitrary character. The /* n */ comments are the entry
// indices and run without gaps.
const TemplateLine kREFMAC_Template[] = {
	/* 0 */ { R"(DATA USED IN REFINEMENT\.)", 1 },
	/* 1 */ { R"(RESOLUTION RANGE HIGH \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_high" } },
	/* 2 */ { R"(RESOLUTION RANGE LOW \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_low" } },
	/* 3 */ { R"(DATA CUTOFF \(SIGMA\(F\)\)\s*:\s*(.+))", 1, "refine", { "pdbx_ls_sigma_F" } },
	/* 4 */ { R"(COMPLETENESS FOR RANGE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_obs" } },
	/* 5 */ { R"(NUMBER OF REFLECTIONS\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_obs" } },
	/* 6 */ { R"(FIT TO DATA USED IN REFINEMENT\.)", 1 },
	/* 7 */ { R"(CROSS-VALIDATION METHOD\s*:\s*(.+))", 1, "refine", { "pdbx_ls_cross_valid_method" } },
	/* 8 */ { R"(FREE R VALUE TEST SET SELECTION\s*:\s*(.+))", 1, "refine", { "pdbx_R_Free_selection_details" } },
	/* 9 */ { R"(R VALUE \(WORKING \+ TEST SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_obs" } },
	/* 10 */ { R"(R VALUE \(WORKING SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_work" } },
	/* 11 */ { R"(FREE R VALUE\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_free" } },
	/* 12 */ { R"(FREE R VALUE TEST SET SIZE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_R_free" } },
	/* 13 */ { R"(FREE R VALUE TEST SET COUNT\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_R_free" } },
	/* 14 */ { R"(NUMBER OF NON-HYDROGEN ATOMS USED IN REFINEMENT\.)", 1 },
	/* 15 */ { R"(PROTEIN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_protein" } },
	/* 16 */ { R"(NUCLEIC ACID ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_nucleic_acid" } },
	/* 17 */ { R"(HETEROGEN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_ligand" } },
	/* 18 */ { R"(SOLVENT ATOMS\s*:\s*(.+))", 1, "refine_hist", { "number_atoms_solvent" } },
	/* 19 */ { R"(ALL ATOMS\s*:\s*(.+))", 1, /* "refine_hist", "pdbx_number_atoms_protein" */ },
	/* 20 */ { R"(B VALUES\..*)", 1 },
	/* 21 */ { R"(B VALUE TYPE\s*:\s*(.+))", 1, "refine", { "pdbx_TLS_residual_ADP_flag" } },
	/* 22 */ { R"(FROM WILSON PLOT \(A\*\*2\)\s*:\s*(.+))", 1, "reflns", { "B_iso_Wilson_estimate" } },
	/* 23 */ { R"(MEAN B VALUE \(OVERALL, A\*\*2\)\s*:\s*(.+))", 1, "refine", { "B_iso_mean" } },
	/* 24 */ { R"(OVERALL ANISOTROPIC B VALUE\.)", 1 },
	/* 25 */ { R"(B11 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][1]" } },
	/* 26 */ { R"(B22 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[2][2]" } },
	/* 27 */ { R"(B33 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[3][3]" } },
	/* 28 */ { R"(B12 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][2]" } },
	/* 29 */ { R"(B13 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][3]" } },
	/* 30 */ { R"(B23 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[2][3]" } },
	/* 31 */ { R"(ESTIMATED OVERALL COORDINATE ERROR\.)", 1 },
	/* 32 */ { R"(ESU BASED ON R VALUE(?:\s*\(A\))?\s*:\s*(.+))", 1, "refine", { "pdbx_overall_ESU_R" } },
	/* 33 */ { R"(ESU BASED ON FREE R VALUE(?:\s*\(A\))?\s*:\s*(.+))", 1, "refine", { "pdbx_overall_ESU_R_Free" } },
	/* 34 */ { R"(ESU BASED ON MAXIMUM LIKELIHOOD(?:\s*\(A\))?\s*:\s*(.+))", 1, "refine", { "overall_SU_ML" } },
	/* 35 */ { R"(ESU FOR B VALUES BASED ON MAXIMUM LIKELIHOOD \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "overall_SU_B" } },
	/* 36 */ { R"(RMS DEVIATIONS FROM IDEAL VALUES\.)", 1 },
	/* 37 */ { R"(DISTANCE RESTRAINTS\. RMS SIGMA)", 1 },
	/* 38 */ { R"(BOND LENGTH \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_bond_d", false },
	/* 39 */ { R"(ANGLE DISTANCE \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_angle_d", false },
	/* 40 */ { R"(INTRAPLANAR 1-4 DISTANCE \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_planar_d", false },
	/* 41 */ { R"(H-BOND OR METAL COORDINATION \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_hb_or_metal_coord", false },
	/* 42 */ { R"(PLANE RESTRAINT \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_plane_restr", false },
	/* 43 */ { R"(CHIRAL-CENTER RESTRAINT \(A\*\*3\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_chiral_restr", false },
	/* 44 */ { R"(NON-BONDED CONTACT RESTRAINTS\.)", 1 },
	/* 45 */ { R"(SINGLE TORSION \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_singtor_nbd", false },
	/* 46 */ { R"(MULTIPLE TORSION \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_multtor_nbd", false },
	/* 47 */ { R"(H-BOND \(X\.\.\.Y\) \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_xyhbond_nbd", false },
	/* 48 */ { R"(H-BOND \(X-H\.\.\.Y\) \(A\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_xhyhbond_nbd", false },
	/* 49 */ { R"(CONFORMATIONAL TORSION ANGLE RESTRAINTS\.)", 1 },
	/* 50 */ { R"(SPECIFIED \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_special_tor", false },
	/* 51 */ { R"(PLANAR \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_planar_tor", false },
	/* 52 */ { R"(STAGGERED \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_staggered_tor", false },
	/* 53 */ { R"(TRANSVERSE \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_transverse_tor", false },
	/* 54 */ { R"(ISOTROPIC THERMAL FACTOR RESTRAINTS\. RMS SIGMA)", 1 },
	/* 55 */ { R"(MAIN-CHAIN BOND \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_mcbond_it", false },
	/* 56 */ { R"(MAIN-CHAIN ANGLE \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_mcangle_it", false },
	/* 57 */ { R"(SIDE-CHAIN BOND \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_scbond_it", false },
	/* 58 */ { R"(SIDE-CHAIN ANGLE \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "p_scangle_it", false },
};
// Remark 3 parser built on kREFMAC_Template. Its program-name pattern is
// ".+", so any non-empty name is accepted; Program() and Version() return
// fixed strings ("REFMAC" and the empty string).
class REFMAC_Remark3Parser : public Remark3Parser
{
  public:
	REFMAC_Remark3Parser(const string& name, const string& expMethod, PDBRecord* r, cif::datablock& db)
		: Remark3Parser(
			name, expMethod, r, db,
			kREFMAC_Template, sizeof(kREFMAC_Template) / sizeof(kREFMAC_Template[0]),
			regex(".+"))
	{
	}

	// Always reports "REFMAC", whatever name was passed to the constructor.
	virtual string Program()
	{
		return "REFMAC";
	}

	// No version is reported by this parser.
	virtual string Version()
	{
		return "";
	}
};
const TemplateLine kREFMAC5_Template[] = {
/* 0 */ { R"(REFINEMENT TARGET\s*:\s*(.+))", 1, "refine", { "pdbx_stereochemistry_target_values" } },
/* 1 */ { R"(DATA USED IN REFINEMENT\.)", 1 },
/* 2 */ { R"(RESOLUTION RANGE HIGH \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_high" } },
/* 3 */ { R"(RESOLUTION RANGE LOW \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_low" } },
/* 4 */ { R"(DATA CUTOFF \(SIGMA\(F\)\)\s*:\s*(.+))", 1, "refine", { "pdbx_ls_sigma_F" } },
/* 5 */ { R"(COMPLETENESS FOR RANGE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_obs" } },
/* 6 */ { R"(NUMBER OF REFLECTIONS\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_obs" } },
/* 7 */ { R"(FIT TO DATA USED IN REFINEMENT.)", 1 },
/* 8 */ { R"(CROSS-VALIDATION METHOD\s*:\s*(.+))", 1, "refine", { "pdbx_ls_cross_valid_method" } },
/* 9 */ { R"(FREE R VALUE TEST SET SELECTION\s*:\s*(.+))", 1, "refine", { "pdbx_R_Free_selection_details" } },
/* 10 */ { R"(R VALUE \(WORKING \+ TEST SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_obs" } },
/* 11 */ { R"(R VALUE \(WORKING SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_work" } },
/* 12 */ { R"(FREE R VALUE\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_free" } },
/* 13 */ { R"(FREE R VALUE TEST SET SIZE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_R_free" } },
/* 14 */ { R"(FREE R VALUE TEST SET COUNT\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_R_free" } },
/* 15 */ { R"(FIT IN THE HIGHEST RESOLUTION BIN.)", 1 },
/* 16 */ { R"(TOTAL NUMBER OF BINS USED\s*:\s*(.+))", 1, "refine_ls_shell", { "pdbx_total_number_of_bins_used" } },
/* 17 */ { R"(BIN RESOLUTION RANGE HIGH(?:\s*\(A\))?\s*:\s*(.+))", 1, "refine_ls_shell", { "d_res_high" } },
/* 18 */ { R"(BIN RESOLUTION RANGE LOW(?:\s*\(A\))?\s*:\s*(.+))", 1, "refine_ls_shell", { "d_res_low" } },
/* 19 */ { R"(REFLECTION IN BIN \(WORKING SET\)\s*:\s*(.+))", 1, "refine_ls_shell", { "number_reflns_R_work" } },
/* 20 */ { R"(BIN COMPLETENESS \(WORKING\+TEST\) \(%\)\s*:\s*(.+))", 1, "refine_ls_shell", { "percent_reflns_obs" } },
/* 21 */ { R"(BIN R VALUE \(WORKING SET\)\s*:\s*(.+))", 1, "refine_ls_shell", { "R_factor_R_work" } },
/* 22 */ { R"(BIN FREE R VALUE SET COUNT\s*:\s*(.+))", 1, "refine_ls_shell", { "number_reflns_R_free" } },
/* 23 */ { R"(BIN FREE R VALUE\s*:\s*(.+))", 1, "refine_ls_shell", { "R_factor_R_free" } },
/* 24 */ { R"(NUMBER OF NON-HYDROGEN ATOMS USED IN REFINEMENT.)", 1 },
/* 25 */ { R"(PROTEIN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_protein" } },
/* 26 */ { R"(NUCLEIC ACID ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_nucleic_acid" } },
/* 27 */ { R"(HETEROGEN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_ligand" } },
/* 28 */ { R"(SOLVENT ATOMS\s*:\s*(.+))", 1, "refine_hist", { "number_atoms_solvent" } },
/* 29 */ { R"(ALL ATOMS\s*:\s*(.+))", 1, /* "refine_hist", { "pdbx_number_atoms_protein" } */ },
/* 30 */ { R"(B VALUES\..*)", 1 },
/* 31 */ { R"(B VALUE TYPE\s*:\s*(.+))", 1, "refine", { "pdbx_TLS_residual_ADP_flag" } },
/* 32 */ { R"(FROM WILSON PLOT \(A\*\*2\)\s*:\s*(.+))", 1, "reflns", { "B_iso_Wilson_estimate" } },
/* 33 */ { R"(MEAN B VALUE \(OVERALL, A\*\*2\)\s*:\s*(.+))", 1, "refine", { "B_iso_mean" } },
/* 34 */ { R"(OVERALL ANISOTROPIC B VALUE.)", 1 },
/* 35 */ { R"(B11 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][1]" } },
/* 36 */ { R"(B22 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[2][2]" } },
/* 37 */ { R"(B33 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[3][3]" } },
/* 38 */ { R"(B12 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][2]" } },
/* 39 */ { R"(B13 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[1][3]" } },
/* 40 */ { R"(B23 \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "aniso_B[2][3]" } },
/* 41 */ { R"(ESTIMATED OVERALL COORDINATE ERROR.)", 1 },
/* 42 */ { R"(ESU BASED ON R VALUE(?:\s*\(A\))?\s*:\s*(.+))", 1, "refine", { "pdbx_overall_ESU_R" } },
/* 43 */ { R"(ESU BASED ON FREE R VALUE(?:\s*\(A\))?\s*:\s*(.+))", 1, "refine", { "pdbx_overall_ESU_R_Free" } },
/* 44 */ { R"(ESU BASED ON MAXIMUM LIKELIHOOD(?:\s*\(A\))?\s*:\s*(.+))", 1, "refine", { "overall_SU_ML" } },
/* 45 */ { R"(ESU FOR B VALUES BASED ON MAXIMUM LIKELIHOOD \(A\*\*2\)\s*:\s*(.+))", 1, "refine", { "overall_SU_B" } },
/* 46 */ { R"(CORRELATION COEFFICIENTS.)", 1 },
/* 47 */ { R"(CORRELATION COEFFICIENT FO-FC\s*:\s*(.+))", 1, "refine", { "correlation_coeff_Fo_to_Fc" } },
/* 48 */ { R"(CORRELATION COEFFICIENT FO-FC FREE\s*:\s*(.+))", 1, "refine", { "correlation_coeff_Fo_to_Fc_free" } },
/* 49 */ { R"(RMS DEVIATIONS FROM IDEAL VALUES COUNT RMS WEIGHT)", 1 },
/* 50 */ { R"(BOND LENGTHS REFINED ATOMS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_bond_refined_d", false },
/* 51 */ { R"(BOND LENGTHS OTHERS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_bond_other_d", false },
/* 52 */ { R"(BOND ANGLES REFINED ATOMS \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_angle_refined_deg", false },
/* 53 */ { R"(BOND ANGLES OTHERS \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_angle_other_deg", false },
/* 54 */ { R"(TORSION ANGLES, PERIOD 1 \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_dihedral_angle_1_deg", false },
/* 55 */ { R"(TORSION ANGLES, PERIOD 2 \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_dihedral_angle_2_deg", false },
/* 56 */ { R"(TORSION ANGLES, PERIOD 3 \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_dihedral_angle_3_deg", false },
/* 57 */ { R"(TORSION ANGLES, PERIOD 4 \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_dihedral_angle_4_deg", false },
/* 58 */ { R"(CHIRAL-CENTER RESTRAINTS \(A\*\*3\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_chiral_restr", false },
/* 59 */ { R"(GENERAL PLANES REFINED ATOMS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_gen_planes_refined", false },
/* 60 */ { R"(GENERAL PLANES OTHERS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_gen_planes_other", false },
/* 61 */ { R"(NON-BONDED CONTACTS REFINED ATOMS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_nbd_refined", false },
/* 62 */ { R"(NON-BONDED CONTACTS OTHERS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_nbd_other", false },
/* 63 */ { R"(NON-BONDED TORSION REFINED ATOMS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_nbtor_refined", false },
/* 64 */ { R"(NON-BONDED TORSION OTHERS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_nbtor_other", false },
/* 65 */ { R"(H-BOND \(X...Y\) REFINED ATOMS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_xyhbond_nbd_refined", false },
/* 66 */ { R"(H-BOND \(X...Y\) OTHERS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_xyhbond_nbd_other", false },
/* 67 */ { R"(POTENTIAL METAL-ION REFINED ATOMS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_metal_ion_refined", false },
/* 68 */ { R"(POTENTIAL METAL-ION OTHERS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_metal_ion_other", false },
/* 69 */ { R"(SYMMETRY VDW REFINED ATOMS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_symmetry_vdw_refined", false },
/* 70 */ { R"(SYMMETRY VDW OTHERS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_symmetry_vdw_other", false },
/* 71 */ { R"(SYMMETRY H-BOND REFINED ATOMS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_symmetry_hbond_refined", false },
/* 72 */ { R"(SYMMETRY H-BOND OTHERS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_symmetry_hbond_other", false },
/* 73 */ { R"(SYMMETRY METAL-ION REFINED ATOMS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_symmetry_metal_ion_refined", false },
/* 74 */ { R"(SYMMETRY METAL-ION OTHERS(?:\s*\(A\))?\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_symmetry_metal_ion_other", false },
/* 75 */ { R"(ISOTROPIC THERMAL FACTOR RESTRAINTS. COUNT RMS WEIGHT)", 1 },
/* 76 */ { R"(MAIN-CHAIN BOND REFINED ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_mcbond_it", false },
/* 77 */ { R"(MAIN-CHAIN BOND OTHER ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_mcbond_other", false },
/* 78 */ { R"(MAIN-CHAIN ANGLE REFINED ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_mcangle_it", false },
/* 79 */ { R"(MAIN-CHAIN ANGLE OTHER ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_mcangle_other", false },
/* 80 */ { R"(SIDE-CHAIN BOND REFINED ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_scbond_it", false },
/* 81 */ { R"(SIDE-CHAIN BOND OTHER ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_scbond_other", false },
/* 82 */ { R"(SIDE-CHAIN ANGLE REFINED ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_scangle_it", false },
/* 83 */ { R"(SIDE-CHAIN ANGLE OTHER ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_scangle_other", false },
/* 84 */ { R"(LONG RANGE B REFINED ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_long_range_B_refined", false },
/* 85 */ { R"(LONG RANGE B OTHER ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_long_range_B_other", false },
/* 86 */ { R"(ANISOTROPIC THERMAL FACTOR RESTRAINTS. COUNT RMS WEIGHT)", 1 },
/* 87 */ { R"(RIGID-BOND RESTRAINTS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_rigid_bond_restr", false },
/* 88 */ { R"(SPHERICITY; FREE ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_sphericity_free", false },
/* 89 */ { R"(SPHERICITY; BONDED ATOMS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "number", "dev_ideal", "dev_ideal_target" }, "r_sphericity_bonded", false },
// Simply ignore NCS, you can ask Robbie why
/* 90 */ { R"(NCS RESTRAINTS STATISTICS)", 1 },
/* 91 */ { R"(NUMBER OF DIFFERENT NCS GROUPS\s*:\s*(.+))", 1 },
/* 92 */ { R"(NCS GROUP NUMBER\s*:\s*(\d+))", 1, /*"struct_ncs_dom", { "pdbx_ens_id" }*/ },
/* 93 */ { R"(CHAIN NAMES\s*:\s*(.+))", 1, /*"struct_ncs_dom", { "details" }*/ },
/* 94 */ { R"(NUMBER OF COMPONENTS NCS GROUP\s*:\s*(\d+))", 1 },
/* 95 */ { R"(COMPONENT C SSSEQI TO C SSSEQI CODE)", 1 },
//// This sucks.... The following line is fixed format
/* 97 */ { R"((\d+)\s+(.)\s+(\d+)(.)\s+(.)\s+(\d+)(.)\s+(.+))", 0 },//, "struct_ncs_dom_lim", { "pdbx_component_id", "beg_auth_asym_id", "beg_auth_seq_id", "beg_auth_icode", "end_auth_asym_id", "end_auth_seq_id", "end_auth_icode", "pdbx_refine_code" }, {}, 1 },
/* 98 */ { R"((\d+)\s+(.)\s+(\d+)\s+(.)\s+(\d+)\s+(.+))", 0 },//, "struct_ncs_dom_lim", { "pdbx_component_id", "beg_auth_asym_id", "beg_auth_seq_id", "end_auth_asym_id", "end_auth_seq_id", "pdbx_refine_code" }, {}, 1 },
/* 96 */ { R"(GROUP CHAIN COUNT RMS WEIGHT)", 1 }, /*, "refine_ls_restr_ncs", { "pdbx_type", "dom_id", "pdbx_auth_asym_id", "pdbx_number", "rms_dev_position", "weight_position", }*/
/* 99 */ { R"(TIGHT POSITIONAL\s+\d+\s+(.)\s+\(A\):\s+(\d+)\s*;\s*(\d+(?:\.\d*)?)\s*;\s*(\d+(?:\.\d*)?))", 0 },// , "refine_ls_restr_ncs", {"pdbx_auth_asym_id", "pdbx_number", "rms_dev_position", "weight_position"}, { "pdbx_type", "tight positional"}, 1 },
/* 100 */ { R"(MEDIUM POSITIONAL\s+\d+\s+(.)\s+\(A\):\s+(\d+)\s*;\s*(\d+(?:\.\d*)?)\s*;\s*(\d+(?:\.\d*)?))", 0 },// , "refine_ls_restr_ncs", {"pdbx_auth_asym_id", "pdbx_number", "rms_dev_position", "weight_position"}, { "pdbx_type", "medium positional"}, 1 },
/* 101 */ { R"(LOOSE POSITIONAL\s+\d+\s+(.)\s+\(A\):\s+(\d+)\s*;\s*(\d+(?:\.\d*)?)\s*;\s*(\d+(?:\.\d*)?))", 0 },// , "refine_ls_restr_ncs", {"pdbx_auth_asym_id", "pdbx_number", "rms_dev_position", "weight_position"}, { "pdbx_type", "loose positional"}, 1 },
/* 102 */ { R"(TIGHT THERMAL\s+\d+\s+(.)\s+\(A\*\*2\):\s+(\d+)\s*;\s*(\d+(?:\.\d*)?)\s*;\s*(\d+(?:\.\d*)?))", 0 },// , "refine_ls_restr_ncs", {"pdbx_auth_asym_id", "pdbx_number", "rms_dev_position", "weight_position"}, { "pdbx_type", "tight thermal", }, 1 },
/* 103 */ { R"(MEDIUM THERMAL\s+\d+\s+(.)\s+\(A\*\*2\):\s+(\d+)\s*;\s*(\d+(?:\.\d*)?)\s*;\s*(\d+(?:\.\d*)?))", 0 },// , "refine_ls_restr_ncs", {"pdbx_auth_asym_id", "pdbx_number", "rms_dev_position", "weight_position"}, { "pdbx_type", "medium thermal", }, 1 },
/* 104 */ { R"(LOOSE THERMAL\s+\d+\s+(.)\s+\(A\*\*2\):\s+(\d+)\s*;\s*(\d+(?:\.\d*)?)\s*;\s*(\d+(?:\.\d*)?))", 0 },// , "refine_ls_restr_ncs", {"pdbx_auth_asym_id", "pdbx_number", "rms_dev_position", "weight_position"}, { "pdbx_type", "loose thermal", }, 10 },
/* 105 */ { R"(NCS GROUP NUMBER\s*:\s*(\d+))", 93 - 105, /*"struct_ncs_dom", { "pdbx_ens_id" }*/ },
/* 106 */ { R"(TWIN DETAILS)", 1 },
/* 107 */ { R"(NUMBER OF TWIN DOMAINS\s*:\s*(\d*))", 1 },
/* 108 */ { R"(TWIN DOMAIN\s*:\s*(.+))", 1, "pdbx_reflns_twin", { "domain_id" }, nullptr, true },
/* 109 */ { R"(TWIN OPERATOR\s*:\s*(.+))", 1, "pdbx_reflns_twin", { "operator" } },
/* 110 */ { R"(TWIN FRACTION\s*:\s*(.+))", 108 - 110, "pdbx_reflns_twin", { "fraction" } },
/* 111 */ { R"(TLS DETAILS)", 1 },
/* 112 */ { R"(NUMBER OF TLS GROUPS\s*:\s*(.+))", 1 },
/* 113 */ { R"(TLS GROUP\s*:\s*(.+))", 1, "pdbx_refine_tls", { "id" }, nullptr, true },
/* 114 */ { R"(NUMBER OF COMPONENTS GROUP\s*:\s*(.+))", 1 },
/* 115 */ { R"(COMPONENTS C SSSEQI TO C SSSEQI)", 1 },
/* 116 */ { R"(RESIDUE RANGE\s*:\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+))", 0, "pdbx_refine_tls_group", { "beg_auth_asym_id", "beg_auth_seq_id", "end_auth_asym_id", "end_auth_seq_id" }, nullptr, true },
/* 117 */ { R"(ORIGIN FOR THE GROUP(?:\s*\(A\))?\s*:\s*(\S+)\s+(\S+)\s+(\S+))", 1, "pdbx_refine_tls", { "origin_x", "origin_y", "origin_z" } },
/* 118 */ { R"(T TENSOR)", 1 },
/* 119 */ { R"(T11\s*:\s*(.+) T22\s*:\s*(.+))", 1, "pdbx_refine_tls", { "T[1][1]", "T[2][2]" } },
/* 120 */ { R"(T33\s*:\s*(.+) T12\s*:\s*(.+))", 1, "pdbx_refine_tls", { "T[3][3]", "T[1][2]" } },
/* 121 */ { R"(T13\s*:\s*(.+) T23\s*:\s*(.+))", 1, "pdbx_refine_tls", { "T[1][3]", "T[2][3]" } },
/* 122 */ { R"(L TENSOR)", 1 },
/* 123 */ { R"(L11\s*:\s*(.+) L22\s*:\s*(.+))", 1, "pdbx_refine_tls", { "L[1][1]", "L[2][2]" } },
/* 124 */ { R"(L33\s*:\s*(.+) L12\s*:\s*(.+))", 1, "pdbx_refine_tls", { "L[3][3]", "L[1][2]" } },
/* 125 */ { R"(L13\s*:\s*(.+) L23\s*:\s*(.+))", 1, "pdbx_refine_tls", { "L[1][3]", "L[2][3]" } },
/* 126 */ { R"(S TENSOR)", 1 },
/* 127 */ { R"(S11\s*:\s*(.+) S12\s*:\s*(.+) S13\s*:\s*(.+))", 1, "pdbx_refine_tls", { "S[1][1]", "S[1][2]", "S[1][3]" } },
/* 128 */ { R"(S21\s*:\s*(.+) S22\s*:\s*(.+) S23\s*:\s*(.+))", 1, "pdbx_refine_tls", { "S[2][1]", "S[2][2]", "S[2][3]" } },
/* 129 */ { R"(S31\s*:\s*(.+) S32\s*:\s*(.+) S33\s*:\s*(.+))", 113 - 129, "pdbx_refine_tls", { "S[3][1]", "S[3][2]", "S[3][3]" } },
/* 130 */ { R"(BULK SOLVENT MODELLING.)", 1 },
/* 131 */ { R"(METHOD USED\s*:\s*(.+))", 1, "refine", { "solvent_model_details" } },
/* 132 */ { R"(PARAMETERS FOR MASK CALCULATION)", 1 },
/* 133 */ { R"(VDW PROBE RADIUS\s*:\s*(.+))", 1, "refine", { "pdbx_solvent_vdw_probe_radii" } },
/* 134 */ { R"(ION PROBE RADIUS\s*:\s*(.+))", 1, "refine", { "pdbx_solvent_ion_probe_radii" } },
/* 135 */ { R"(SHRINKAGE RADIUS\s*:\s*(.+))", 1, "refine", { "pdbx_solvent_shrinkage_radii" } },
};
// REMARK 3 parser for REFMAC 5 output: binds the kREFMAC5_Template line table
// to the generic Remark3Parser machinery. The regex splits the program string
// into the program name (capture 1) and an optional version (capture 2).
class REFMAC5_Remark3Parser : public Remark3Parser
{
  public:
    REFMAC5_Remark3Parser(const string& name, const string& expMethod, PDBRecord* r, cif::datablock& db)
        : Remark3Parser(
              name, expMethod, r, db,
              kREFMAC5_Template,
              sizeof(kREFMAC5_Template) / sizeof(kREFMAC5_Template[0]),
              regex(R"((REFMAC)(?: (\d+(?:\..+)?))?)"))
    {
    }
};
// Line template for the REMARK 3 block as written by SHELXL.
//
// Remark3Parser::Parse() walks this table in order: starting at the current
// state it looks for the first entry whose regex matches the whole
// (whitespace-collapsed) line, stores the captures and then moves to
// "index of the matching entry + next-state offset".
//
// The fields of each entry appear to be, in order (inferred from how Parse()
// uses rx / next_state_offset / category / items / ls_restr_type / create_new;
// the TemplateLine definition itself is not visible here):
//   regex, next-state offset, mmCIF category, item names (one per capture),
//   refine_ls_restr type, create-new-row flag.
// Entries that give only a regex and an offset are recognised but store nothing.
// The number in the leading comment is the entry's index, i.e. its state.
const TemplateLine kSHELXL_Template[] = {
// --- data used in refinement
/* 0 */ { R"(DATA USED IN REFINEMENT\.)", 1 },
/* 1 */ { R"(RESOLUTION RANGE HIGH \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_high" } },
/* 2 */ { R"(RESOLUTION RANGE LOW \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_low" } },
/* 3 */ { R"(DATA CUTOFF \(SIGMA\(F\)\)\s*:\s*(.+))", 1, "refine", { "pdbx_ls_sigma_F" } },
/* 4 */ { R"(COMPLETENESS FOR RANGE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_obs" } },
/* 5 */ { R"(CROSS-VALIDATION METHOD\s*:\s*(.+))", 1, "refine", { "pdbx_ls_cross_valid_method" } },
/* 6 */ { R"(FREE R VALUE TEST SET SELECTION\s*:\s*(.+))", 1, "refine", { "pdbx_R_Free_selection_details" } },
// --- fit statistics without a sigma cutoff
/* 7 */ { R"(FIT TO DATA USED IN REFINEMENT \(NO CUTOFF\)\.)", 1 },
/* 8 */ { R"(R VALUE \(WORKING \+ TEST SET, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "R_factor_all_no_cutoff" } },
/* 9 */ { R"(R VALUE \(WORKING SET, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "R_factor_obs_no_cutoff" } },
/* 10 */ { R"(FREE R VALUE \(NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_factor_no_cutoff" } },
/* 11 */ { R"(FREE R VALUE TEST SET SIZE \(%, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_val_test_set_size_perc_no_cutoff" } },
/* 12 */ { R"(FREE R VALUE TEST SET COUNT \(NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_val_test_set_ct_no_cutoff" } },
/* 13 */ { R"(TOTAL NUMBER OF REFLECTIONS \(NO CUTOFF\)\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_all" } },
// --- fit statistics for data with F > 4 sigma(F)
/* 14 */ { R"(FIT/AGREEMENT OF MODEL FOR DATA WITH F>4SIG\(F\)\.)", 1 },
/* 15 */ { R"(R VALUE \(WORKING \+ TEST SET, F>4SIG\(F\)\)\s*:\s*(.+))", 1, "pdbx_refine", { "R_factor_all_4sig_cutoff" } },
/* 16 */ { R"(R VALUE \(WORKING SET, F>4SIG\(F\)\)\s*:\s*(.+))", 1, "pdbx_refine", { "R_factor_obs_4sig_cutoff" } },
/* 17 */ { R"(FREE R VALUE \(F>4SIG\(F\)\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_factor_4sig_cutoff" } },
/* 18 */ { R"(FREE R VALUE TEST SET SIZE \(%, F>4SIG\(F\)\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_val_test_set_size_perc_4sig_cutoff" } },
/* 19 */ { R"(FREE R VALUE TEST SET COUNT \(F>4SIG\(F\)\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_val_test_set_ct_4sig_cutoff" } },
/* 20 */ { R"(TOTAL NUMBER OF REFLECTIONS \(F>4SIG\(F\)\)\s*:\s*(.+))", 1, "pdbx_refine", { "number_reflns_obs_4sig_cutoff" } },
// --- atom counts
/* 21 */ { R"(NUMBER OF NON-HYDROGEN ATOMS USED IN REFINEMENT\.)", 1 },
/* 22 */ { R"(PROTEIN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_protein" } },
/* 23 */ { R"(NUCLEIC ACID ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_nucleic_acid" } },
/* 24 */ { R"(HETEROGEN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_ligand" } },
/* 25 */ { R"(SOLVENT ATOMS\s*:\s*(.+))", 1, "refine_hist", { "number_atoms_solvent" } },
// --- model refinement
/* 26 */ { R"(MODEL REFINEMENT\.)", 1 },
/* 27 */ { R"(OCCUPANCY SUM OF NON-HYDROGEN ATOMS\s*:\s*(.+))", 1, "refine_analyze", { "occupancy_sum_non_hydrogen" } },
/* 28 */ { R"(OCCUPANCY SUM OF HYDROGEN ATOMS\s*:\s*(.+))", 1, "refine_analyze", { "occupancy_sum_hydrogen" } },
/* 29 */ { R"(NUMBER OF DISCRETELY DISORDERED RESIDUES\s*:\s*(.+))", 1, "refine_analyze", { "number_disordered_residues" } },
/* 30 */ { R"(NUMBER OF LEAST-SQUARES PARAMETERS\s*:\s*(.+))", 1, "refine", { "ls_number_parameters" } },
/* 31 */ { R"(NUMBER OF RESTRAINTS\s*:\s*(.+))", 1, "refine", { "ls_number_restraints" } },
// --- restraint deviations; each line updates (or creates) the refine_ls_restr row of the given s_* type
/* 32 */ { R"(RMS DEVIATIONS FROM RESTRAINT TARGET VALUES\.)", 1 },
/* 33 */ { R"(BOND LENGTHS \(A\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "s_bond_d", false },
/* 34 */ { R"(ANGLE DISTANCES \(A\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "s_angle_d", false },
/* 35 */ { R"(SIMILAR DISTANCES \(NO TARGET VALUES\) \(A\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "s_similar_dist", false },
/* 36 */ { R"(DISTANCES FROM RESTRAINT PLANES \(A\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "s_from_restr_planes", false },
/* 37 */ { R"(ZERO CHIRAL VOLUMES \(A\*\*3\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "s_zero_chiral_vol", false },
/* 38 */ { R"(NON-ZERO CHIRAL VOLUMES \(A\*\*3\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "s_non_zero_chiral_vol", false },
/* 39 */ { R"(ANTI-BUMPING DISTANCE RESTRAINTS \(A\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "s_anti_bump_dis_restr", false },
/* 40 */ { R"(RIGID-BOND ADP COMPONENTS \(A\*\*2\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "s_rigid_bond_adp_cmpnt", false },
/* 41 */ { R"(SIMILAR ADP COMPONENTS \(A\*\*2\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "s_similar_adp_cmpnt", false },
/* 42 */ { R"(APPROXIMATELY ISOTROPIC ADPS \(A\*\*2\)\s*:\s*(.+))", 1, "refine_ls_restr", { "dev_ideal" }, "s_approx_iso_adps", false },
// --- bulk solvent and stereochemistry targets
/* 43 */ { R"(BULK SOLVENT MODELING\.)", 1 },
/* 44 */ { R"(METHOD USED\s*:\s*(.+))", 1, "refine", { "solvent_model_details" } },
/* 45 */ { R"(STEREOCHEMISTRY TARGET VALUES\s*:\s*(.+))", 1, "refine", { "pdbx_stereochemistry_target_values" } },
/* 46 */ { R"(SPECIAL CASE\s*:\s*(.+))", 1, "refine", { "pdbx_stereochem_target_val_spec_case" } },
};
// REMARK 3 parser for SHELXL output: binds the kSHELXL_Template line table to
// the generic Remark3Parser machinery. The regex expects the program string in
// the form "SHELXL-<version>" (capture 1 = name, capture 2 = version).
class SHELXL_Remark3Parser : public Remark3Parser
{
  public:
    SHELXL_Remark3Parser(const string& name, const string& expMethod, PDBRecord* r, cif::datablock& db)
        : Remark3Parser(
              name, expMethod, r, db,
              kSHELXL_Template,
              sizeof(kSHELXL_Template) / sizeof(kSHELXL_Template[0]),
              regex(R"((SHELXL)(?:-(\d+(?:\..+)?)))"))
    {
    }
};
// Line template for the REMARK 3 block as written by TNT.
//
// Remark3Parser::Parse() walks this table in order: starting at the current
// state it looks for the first entry whose regex matches the whole
// (whitespace-collapsed) line, stores the captures and then moves to
// "index of the matching entry + next-state offset".
//
// The fields of each entry appear to be, in order (inferred from how Parse()
// uses rx / next_state_offset / category / items / ls_restr_type / create_new;
// the TemplateLine definition itself is not visible here):
//   regex, next-state offset, mmCIF category, item names (one per capture),
//   refine_ls_restr type, create-new-row flag.
// Entries that give only a regex and an offset are recognised but store nothing.
// The number in the leading comment is the entry's index, i.e. its state.
const TemplateLine kTNT_Template[] = {
// --- data used in refinement
/* 0 */ { R"(DATA USED IN REFINEMENT\.)", 1 },
/* 1 */ { R"(RESOLUTION RANGE HIGH \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_high" } },
/* 2 */ { R"(RESOLUTION RANGE LOW \(ANGSTROMS\)\s*:\s*(.+))", 1, "refine", { "ls_d_res_low" } },
/* 3 */ { R"(DATA CUTOFF \(SIGMA\(F\)\)\s*:\s*(.+))", 1, "refine", { "pdbx_ls_sigma_F" } },
/* 4 */ { R"(COMPLETENESS FOR RANGE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_obs" } },
/* 5 */ { R"(NUMBER OF REFLECTIONS\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_obs" } },
// --- statistics for data above the sigma cutoff
/* 6 */ { R"(USING DATA ABOVE SIGMA CUTOFF\.)", 1 },
/* 7 */ { R"(CROSS-VALIDATION METHOD\s*:\s*(.+))", 1, "refine", { "pdbx_ls_cross_valid_method" } },
/* 8 */ { R"(FREE R VALUE TEST SET SELECTION\s*:\s*(.+))", 1, "refine", { "pdbx_R_Free_selection_details" } },
/* 9 */ { R"(R VALUE \(WORKING \+ TEST SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_obs" } },
/* 10 */ { R"(R VALUE \(WORKING SET\)\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_work" } },
/* 11 */ { R"(FREE R VALUE\s*:\s*(.+))", 1, "refine", { "ls_R_factor_R_free" } },
/* 12 */ { R"(FREE R VALUE TEST SET SIZE \(%\)\s*:\s*(.+))", 1, "refine", { "ls_percent_reflns_R_free" } },
/* 13 */ { R"(FREE R VALUE TEST SET COUNT\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_R_free" } },
// --- statistics for all data, no sigma cutoff
/* 14 */ { R"(USING ALL DATA, NO SIGMA CUTOFF\.)", 1 },
/* 15 */ { R"(R VALUE \(WORKING \+ TEST SET, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "R_factor_all_no_cutoff" } },
/* 16 */ { R"(R VALUE \(WORKING SET, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "R_factor_obs_no_cutoff" } },
/* 17 */ { R"(FREE R VALUE \(NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_factor_no_cutoff" } },
/* 18 */ { R"(FREE R VALUE TEST SET SIZE \(%, NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_val_test_set_size_perc_no_cutoff" } },
/* 19 */ { R"(FREE R VALUE TEST SET COUNT \(NO CUTOFF\)\s*:\s*(.+))", 1, "pdbx_refine", { "free_R_val_test_set_ct_no_cutoff" } },
/* 20 */ { R"(TOTAL NUMBER OF REFLECTIONS \(NO CUTOFF\)\s*:\s*(.+))", 1, "refine", { "ls_number_reflns_all" } },
// --- atom counts
/* 21 */ { R"(NUMBER OF NON-HYDROGEN ATOMS USED IN REFINEMENT\.)", 1 },
/* 22 */ { R"(PROTEIN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_protein" } },
/* 23 */ { R"(NUCLEIC ACID ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_nucleic_acid" } },
/* 24 */ { R"(HETEROGEN ATOMS\s*:\s*(.+))", 1, "refine_hist", { "pdbx_number_atoms_ligand" } },
/* 25 */ { R"(SOLVENT ATOMS\s*:\s*(.+))", 1, "refine_hist", { "number_atoms_solvent" } },
/* 26 */ { R"(WILSON B VALUE \(FROM FCALC, A\*\*2\)\s*:\s*(.+))", 1, "reflns", { "B_iso_Wilson_estimate" } },
// --- restraint deviations: three ';'-separated values per line (rms ; weight ; count),
// stored in the refine_ls_restr row of the given t_* type
/* 27 */ { R"(RMS DEVIATIONS FROM IDEAL VALUES\. RMS WEIGHT COUNT)", 1 },
/* 28 */ { R"(BOND LENGTHS \(A\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "weight", "number" }, "t_bond_d", false },
/* 29 */ { R"(BOND ANGLES \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "weight", "number" }, "t_angle_deg", false },
/* 30 */ { R"(TORSION ANGLES \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "weight", "number" }, "t_dihedral_angle_d", false },
/* 31 */ { R"(PSEUDOROTATION ANGLES \(DEGREES\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "weight", "number" }, "t_pseud_angle", false },
/* 32 */ { R"(TRIGONAL CARBON PLANES \(A\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "weight", "number" }, "t_trig_c_planes", false },
/* 33 */ { R"(GENERAL PLANES \(A\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "weight", "number" }, "t_gen_planes", false },
/* 34 */ { R"(ISOTROPIC THERMAL FACTORS \(A\*\*2\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "weight", "number" }, "t_it", false },
/* 35 */ { R"(NON-BONDED CONTACTS \(A\)\s*:\s*(.+)\s*;\s*(.+)\s*;\s*(.+))", 1, "refine_ls_restr", { "dev_ideal", "weight", "number" }, "t_nbd", false },
/* 36 */ { R"(INCORRECT CHIRAL-CENTERS \(COUNT\)\s*:\s*(.+)\s*)", 1, "refine_ls_restr", { "number" }, "t_incorr_chiral_ct", false },
// --- bulk solvent model
/* 37 */ { R"(BULK SOLVENT MODELING\.)", 1 },
/* 38 */ { R"(METHOD USED\s*:\s*(.+))", 1, "refine", { "solvent_model_details" } },
/* 39 */ { R"(KSOL\s*:\s*(.+))", 1, "refine", { "solvent_model_param_ksol" } },
/* 40 */ { R"(BSOL\s*:\s*(.+))", 1, "refine", { "solvent_model_param_bsol" } },
// --- restraint libraries
/* 41 */ { R"(RESTRAINT LIBRARIES\.)", 1 },
/* 42 */ { R"(STEREOCHEMISTRY\s*:\s*(.+))", 1, "refine", { "pdbx_stereochemistry_target_values" } },
/* 43 */ { R"(ISOTROPIC THERMAL FACTOR RESTRAINTS\s*:\s*(.+))", 1, "refine", { "pdbx_isotropic_thermal_model" } },
};
// REMARK 3 parser for TNT output: binds the kTNT_Template line table to the
// generic Remark3Parser machinery. The regex yields the program name as
// capture 1 and, when a "V. <version>" suffix is present, the version as capture 2.
class TNT_Remark3Parser : public Remark3Parser
{
  public:
    TNT_Remark3Parser(const string& name, const string& expMethod, PDBRecord* r, cif::datablock& db)
        : Remark3Parser(
              name, expMethod, r, db,
              kTNT_Template,
              sizeof(kTNT_Template) / sizeof(kTNT_Template[0]),
              regex(R"((TNT)(?: V. (\d+.+)?)?)"))
    {
    }
};
// Line template for the REMARK 3 block as written by X-PLOR.
//
// Remark3Parser::Parse() walks this table in order: starting at the current
// state it looks for the first entry whose regex matches the whole
// (whitespace-collapsed) line, stores the captures and then moves to
// "index of the matching entry + next-state offset".
//
// The fields of each entry appear to be, in order (inferred from how Parse()
// uses rx / next_state_offset / category / items / ls_restr_type / create_new;
// the TemplateLine definition itself is not visible here):
//   regex, next-state offset, mmCIF category, item names (one per capture),
//   refine_ls_restr type, create-new-row flag.
// Entries that give only a regex and an offset are recognised but store nothing.
// The number in the leading comment is the entry's index, i.e. its state.
// An offset of 0 keeps the parser on the same entry so the line may repeat;
// an offset written as "a - b" jumps back from entry b to entry a.
const TemplateLine kXPLOR_Template[] = {
// --- data used in refinement
/* 0 */ { R"(DATA USED IN REFINEMENT\.)", 1 },
/* 1 */ { R"(RESOLUTION RANGE HIGH \(ANGSTROMS\) :\s+(.+))", 1, "refine", { "ls_d_res_high" } },
/* 2 */ { R"(RESOLUTION RANGE LOW \(ANGSTROMS\) :\s+(.+))", 1, "refine", { "ls_d_res_low" } },
/* 3 */ { R"(DATA CUTOFF \(SIGMA\(F\)\) :\s+(.+))", 1, "refine", { "pdbx_ls_sigma_F" } },
/* 4 */ { R"(DATA CUTOFF HIGH \(ABS\(F\)\) :\s+(.+))", 1, "refine", { "pdbx_data_cutoff_high_absF" } },
/* 5 */ { R"(DATA CUTOFF LOW \(ABS\(F\)\) :\s+(.+))", 1, "refine", { "pdbx_data_cutoff_low_absF" } },
/* 6 */ { R"(COMPLETENESS \(WORKING\+TEST\) \(%\) :\s+(.+))", 1, "refine", { "ls_percent_reflns_obs" } },
/* 7 */ { R"(NUMBER OF REFLECTIONS :\s+(.+))", 1, "refine", { "ls_number_reflns_obs" } },
// --- overall fit
/* 8 */ { R"(FIT TO DATA USED IN REFINEMENT\.)", 1 },
/* 9 */ { R"(CROSS-VALIDATION METHOD :\s+(.+))", 1, "refine", { "pdbx_ls_cross_valid_method" } },
/* 10 */ { R"(FREE R VALUE TEST SET SELECTION :\s+(.+))", 1, "refine", { "pdbx_R_Free_selection_details" } },
/* 11 */ { R"(R VALUE \(WORKING SET\) :\s+(.+))", 1, "refine", { "ls_R_factor_R_work" } },
/* 12 */ { R"(FREE R VALUE :\s+(.+))", 1, "refine", { "ls_R_factor_R_free" } },
/* 13 */ { R"(FREE R VALUE TEST SET SIZE \(%\) :\s+(.+))", 1, "refine", { "ls_percent_reflns_R_free" } },
/* 14 */ { R"(FREE R VALUE TEST SET COUNT :\s+(.+))", 1, "refine", { "ls_number_reflns_R_free" } },
/* 15 */ { R"(ESTIMATED ERROR OF FREE R VALUE :\s+(.+))", 1, "refine", { "ls_R_factor_R_free_error" } },
// --- fit in the highest resolution bin
/* 16 */ { R"(FIT IN THE HIGHEST RESOLUTION BIN\.)", 1 },
/* 17 */ { R"(TOTAL NUMBER OF BINS USED :\s+(.+))", 1, "refine_ls_shell", { "pdbx_total_number_of_bins_used" } },
/* 18 */ { R"(BIN RESOLUTION RANGE HIGH \(A\) :\s+(.+))", 1, "refine_ls_shell", { "d_res_high" } },
/* 19 */ { R"(BIN RESOLUTION RANGE LOW \(A\) :\s+(.+))", 1, "refine_ls_shell", { "d_res_low" } },
/* 20 */ { R"(BIN COMPLETENESS \(WORKING\+TEST\) \(%\) :\s+(.+))", 1, "refine_ls_shell", { "percent_reflns_obs" } },
/* 21 */ { R"(REFLECTIONS IN BIN \(WORKING SET\) :\s+(.+))", 1, "refine_ls_shell", { "number_reflns_R_work" } },
/* 22 */ { R"(BIN R VALUE \(WORKING SET\) :\s+(.+))", 1, "refine_ls_shell", { "R_factor_R_work" } },
/* 23 */ { R"(BIN FREE R VALUE :\s+(.+))", 1, "refine_ls_shell", { "R_factor_R_free" } },
/* 24 */ { R"(BIN FREE R VALUE TEST SET SIZE \(%\) :\s+(.+))", 1, "refine_ls_shell", { "percent_reflns_R_free" } },
/* 25 */ { R"(BIN FREE R VALUE TEST SET COUNT :\s+(.+))", 1, "refine_ls_shell", { "number_reflns_R_free" } },
/* 26 */ { R"(ESTIMATED ERROR OF BIN FREE R VALUE :\s+(.+))", 1, "refine_ls_shell", { "R_factor_R_free_error" } },
// --- atom counts
/* 27 */ { R"(NUMBER OF NON-HYDROGEN ATOMS USED IN REFINEMENT\.)", 1 },
/* 28 */ { R"(PROTEIN ATOMS :\s+(.+))", 1, "refine_hist", { "pdbx_number_atoms_protein" } },
/* 29 */ { R"(NUCLEIC ACID ATOMS :\s+(.+))", 1, "refine_hist", { "pdbx_number_atoms_nucleic_acid" } },
/* 30 */ { R"(HETEROGEN ATOMS :\s+(.+))", 1, "refine_hist", { "pdbx_number_atoms_ligand" } },
/* 31 */ { R"(SOLVENT ATOMS :\s+(.+))", 1, "refine_hist", { "number_atoms_solvent" } },
// --- B values
/* 32 */ { R"(B VALUES\.)", 1 },
/* 33 */ { R"(B VALUE TYPE :\s+(.+))", 1, "refine", { "pdbx_TLS_residual_ADP_flag" } },
/* 34 */ { R"(FROM WILSON PLOT \(A\*\*2\) :\s+(.+))", 1, "reflns", { "B_iso_Wilson_estimate" } },
/* 35 */ { R"(MEAN B VALUE \(OVERALL, A\*\*2\) :\s+(.+))", 1, "refine", { "B_iso_mean" } },
/* 36 */ { R"(OVERALL ANISOTROPIC B VALUE\.)", 1 },
/* 37 */ { R"(B11 \(A\*\*2\) :\s+(.+))", 1, "refine", { "aniso_B[1][1]" } },
/* 38 */ { R"(B22 \(A\*\*2\) :\s+(.+))", 1, "refine", { "aniso_B[2][2]" } },
/* 39 */ { R"(B33 \(A\*\*2\) :\s+(.+))", 1, "refine", { "aniso_B[3][3]" } },
/* 40 */ { R"(B12 \(A\*\*2\) :\s+(.+))", 1, "refine", { "aniso_B[1][2]" } },
/* 41 */ { R"(B13 \(A\*\*2\) :\s+(.+))", 1, "refine", { "aniso_B[1][3]" } },
/* 42 */ { R"(B23 \(A\*\*2\) :\s+(.+))", 1, "refine", { "aniso_B[2][3]" } },
// --- coordinate error estimates
/* 43 */ { R"(ESTIMATED COORDINATE ERROR\.)", 1 },
/* 44 */ { R"(ESD FROM LUZZATI PLOT \(A\) :\s+(.+))", 1, "refine_analyze", { "Luzzati_coordinate_error_obs" } },
/* 45 */ { R"(ESD FROM SIGMAA \(A\) :\s+(.+))", 1, "refine_analyze", { "Luzzati_sigma_a_obs" } },
/* 46 */ { R"(LOW RESOLUTION CUTOFF \(A\) :\s+(.+))", 1, "refine_analyze", { "Luzzati_d_res_low_obs" } },
/* 47 */ { R"(CROSS-VALIDATED ESTIMATED COORDINATE ERROR\.)", 1 },
/* 48 */ { R"(ESD FROM C-V LUZZATI PLOT \(A\) :\s+(.+))", 1, "refine_analyze", { "Luzzati_coordinate_error_free" } },
/* 49 */ { R"(ESD FROM C-V SIGMAA \(A\) :\s+(.+))", 1, "refine_analyze", { "Luzzati_sigma_a_free" } },
// --- restraint deviations, stored in the refine_ls_restr row of the given x_* type.
// NOTE(review): entries 51-54 list two item names but their regex has a single capture — confirm intended.
/* 50 */ { R"(RMS DEVIATIONS FROM IDEAL VALUES\..*)", 1 },
/* 51 */ { R"(BOND LENGTHS \(A\) :\s+(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "x_bond_d", false },
/* 52 */ { R"(BOND ANGLES \(DEGREES\) :\s+(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "x_angle_deg", false },
/* 53 */ { R"(DIHEDRAL ANGLES \(DEGREES\) :\s+(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "x_dihedral_angle_d", false },
/* 54 */ { R"(IMPROPER ANGLES \(DEGREES\) :\s+(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "x_improper_angle_d", false },
/* 55 */ { R"(ISOTROPIC THERMAL MODEL :\s+(.+))", 1, "refine", { "pdbx_isotropic_thermal_model" } },
/* 56 */ { R"(ISOTROPIC THERMAL FACTOR RESTRAINTS\. RMS SIGMA)", 1 },
/* 57 */ { R"(MAIN-CHAIN BOND \(A\*\*2\) :\s+(.+?);\s+(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "x_mcbond_it", false },
/* 58 */ { R"(MAIN-CHAIN ANGLE \(A\*\*2\) :\s+(.+?);\s+(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "x_mcangle_it", false },
/* 59 */ { R"(SIDE-CHAIN BOND \(A\*\*2\) :\s+(.+?);\s+(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "x_scbond_it", false },
/* 60 */ { R"(SIDE-CHAIN ANGLE \(A\*\*2\) :\s+(.+?);\s+(.+))", 1, "refine_ls_restr", { "dev_ideal", "dev_ideal_target" }, "x_scangle_it", false },
// --- NCS lines are recognised but not stored (category/items are commented out)
/* 61 */ { R"(NCS MODEL :\s+(.+))", 1, /* "refine_ls_restr_ncs", { "ncs_model_details" } */ },
/* 62 */ { R"(NCS RESTRAINTS\. RMS SIGMA/WEIGHT)", 1 },
/* 63 */ { R"(GROUP (\d+) POSITIONAL \(A\) :\s+(.+?);\s+(.+))", 1, /* "refine_ls_restr_ncs", { ":dom_id", "rms_dev_position", "weight_position" } */ },
// offset 63 - 64: go back to entry 63 so the next NCS group can be matched
/* 64 */ { R"(GROUP (\d+) B-FACTOR \(A\*\*2\) :\s+(.+?);\s+(.+))", 63 - 64, /* "refine_ls_restr_ncs", { ":dom_id", "rms_dev_B_iso", "weight_B_iso" } */ },
// --- parameter/topology file lines may repeat (offset 0) and are not stored
/* 65 */ { R"(PARAMETER FILE (\d+) :\s+(.+))", 0, /* "pdbx_xplor_file", { "serial_no", "param_file" } */ },
/* 66 */ { R"(TOPOLOGY FILE (\d+) :\s+(.+))", 0, /* "pdbx_xplor_file", { "serial_no", "topol_file" } */ },
};
// REMARK 3 parser for X-PLOR output: binds the kXPLOR_Template line table to
// the generic Remark3Parser machinery. The regex splits the program string
// into the program name (capture 1) and an optional numeric version (capture 2).
class XPLOR_Remark3Parser : public Remark3Parser
{
  public:
    XPLOR_Remark3Parser(const string& name, const string& expMethod, PDBRecord* r, cif::datablock& db)
        : Remark3Parser(
              name, expMethod, r, db,
              kXPLOR_Template,
              sizeof(kXPLOR_Template) / sizeof(kXPLOR_Template[0]),
              regex(R"((X-PLOR)(?: (\d+(?:\.\d+)?))?)"))
    {
    }
};
// --------------------------------------------------------------------
// Sets up a parser for one candidate refinement program.
//   name              program string as found on the PROGRAM line
//   expMethod         value written to pdbx_refine_id in the rows this parser creates
//   r                 first PDB record this parser will read
//   db                only its name is used: results are collected in this parser's own
//                     m_db, which is created with the same name
//   templatelines     line template table and
//   templateLineCount its number of entries
//   program_version   regex used by Program() and Version() to split 'name'
Remark3Parser::Remark3Parser(const std::string& name, const std::string& expMethod, PDBRecord* r, cif::datablock& db,
    const TemplateLine templatelines[], uint32 templateLineCount, std::regex program_version)
    : m_name(name)
    , m_expMethod(expMethod)
    , m_rec(r)
    , m_db(db.name())
    , m_template(templatelines)
    , m_templateCount(templateLineCount)
    , m_program_version(program_version)
{
}
// Reads the next non-empty REMARK 3 line into m_line and returns a copy of it.
//
// Records are consumed from m_rec. A value that is wrapped over several
// records (continuation records are indented with blanks up to the column of
// the first ':') is joined into a single line, and every run of whitespace is
// collapsed to one blank (trailing whitespace is dropped).
// Returns an empty string when there are no more REMARK 3 records.
string Remark3Parser::NextLine()
{
    m_line.clear();

    while (m_rec != nullptr and m_rec->is("REMARK 3"))
    {
        // Column of the first ':' in the record, or 0 when there is none.
        // NOTE(review): the original had a loop here that looked like it was
        // meant to skip the blanks following the colon, but it tested the
        // colon position itself and therefore never advanced. The colon
        // column is kept so the same continuation records are accepted.
        size_t valueIndent = 0;
        for (size_t i = 4; i < m_rec->m_vlen; ++i)
        {
            if (m_rec->m_value[i] == ' ')
                continue;

            if (m_rec->m_value[i] == ':')
            {
                valueIndent = i;
                break;
            }
        }

        m_line = m_rec->v_s(12);
        m_rec = m_rec->m_next;

        if (m_line.empty())
            continue;

        // concatenate value that is wrapped over multiple lines (tricky code...)
        if (valueIndent > 4)
        {
            string indent(valueIndent - 4, ' ');

            // m_rec may have become null above or in the previous iteration
            // (REMARK 3 was the last record), so test it before using it.
            while (m_rec != nullptr and m_rec->is("REMARK 3") and m_rec->m_vlen > valueIndent)
            {
                string v(m_rec->m_value + 4, m_rec->m_value + m_rec->m_vlen);
                if (not ba::starts_with(v, indent))
                    break;

                m_line += ' ';
                m_line.append(m_rec->m_value + valueIndent, m_rec->m_value + m_rec->m_vlen);
                m_rec = m_rec->m_next;
            }
        }

        // collapse multiple spaces, in place: j is the write position
        bool space = false;
        auto i = m_line.begin(), j = i;
        while (i != m_line.end())
        {
            // isspace requires a value representable as unsigned char
            bool nspace = isspace(static_cast<unsigned char>(*i)) != 0;
            if (nspace == false)
            {
                if (space)
                    *j++ = ' ';
                *j++ = *i;
            }
            space = nspace;
            ++i;
        }
        m_line.erase(j, m_line.end());

        break;
    }

    if (VERBOSE >= 2)
        cerr << "RM3: " << m_line << endl;

    return m_line;
}
bool Remark3Parser::Match(const char* expr, int nextState)
{
regex rx(expr);
bool result = regex_match(m_line, m_m, rx);
if (result)
m_state = nextState;
else if (VERBOSE >= 3)
cerr << kRedOn << "No Match:" << kRedOff << " '" << expr << '\'' << endl;
return result;
}
// Runs the template over all remaining REMARK 3 lines and stores what it
// recognises in m_db.
//
// Every line is tried against the template entries from the current state
// onward; the first matching entry stores its captures and sets the next
// state. Once the template is exhausted, an "OTHER REFINEMENT REMARKS" line
// and everything after it is collected and stored in refine.details.
//
// Returns the fraction of lines that were recognised (0..1), or 0 when no
// line could be read at all.
float Remark3Parser::Parse()
{
    int lineCount = 0, dropped = 0;
    string remarks;

    m_state = 0;

    while (m_rec != nullptr)
    {
        NextLine();

        if (m_line.empty())
            break;

        ++lineCount;

        // Skip over AUTHORS lines
        if (m_state == 0 and Match(R"(AUTHORS\s*:.+)", 0))
            continue;

        // Search forward from the current state; lines of the template that
        // do not occur in the input are simply skipped this way.
        auto state = m_state;
        for (state = m_state; state < m_templateCount; ++state)
        {
            const TemplateLine& tmpl = m_template[state];

            if (Match(tmpl.rx, state + tmpl.next_state_offset))
            {
                if (not (tmpl.category == nullptr or tmpl.items.size() == 0))
                {
                    if (tmpl.ls_restr_type == nullptr)
                        StoreCapture(tmpl.category, tmpl.items, tmpl.create_new);
                    else if (tmpl.create_new)
                        StoreRefineLsRestr(tmpl.ls_restr_type, tmpl.items);
                    else
                        UpdateRefineLsRestr(tmpl.ls_restr_type, tmpl.items);
                }

                break;
            }
        }

        if (state < m_templateCount)
            continue;

        // Nothing in the template matched: this may be the start of the free text remarks...
        if (state == m_templateCount and Match(R"(OTHER REFINEMENT REMARKS\s*:\s*(.*))", m_templateCount + 1))
        {
            remarks = m_m[1].str();
            continue;
        }

        // ...or a follow-up line of those remarks
        if (state == m_templateCount + 1)
        {
            remarks = remarks + '\n' + m_line;
            continue;
        }

        if (VERBOSE >= 2)
            cerr << kRedOn << "Dropping line:" << kRedOff << " '" << m_line << '\'' << endl;

        ++dropped;
    }

    // NOTE(review): this assumes the refine category already has a row at this point — confirm.
    if (not remarks.empty() and not iequals(remarks, "NULL"))
        m_db["refine"].front()["details"] = remarks;

    // Without any input line the ratio below would be 0/0 (NaN).
    if (lineCount == 0)
        return 0;

    return float(lineCount - dropped) / lineCount;
}
// Returns the program name: the first capture of the program/version regex
// when it matches the complete name, otherwise the name as it was given.
string Remark3Parser::Program()
{
    smatch m;

    if (not regex_match(m_name, m, m_program_version))
        return m_name;

    return m[1].str();
}
// Returns the program version: the second capture of the program/version
// regex when it matches the complete name, otherwise an empty string.
string Remark3Parser::Version()
{
    smatch m;

    if (not regex_match(m_name, m, m_program_version))
        return string();

    return m[2].str();
}
void Remark3Parser::StoreCapture(const char* category, initializer_list<const char*> items, bool createNew)
{
int capture = 0;
for (auto item: items)
{
++capture;
string value = m_m[capture].str();
ba::trim(value);
if (iequals(value, "NULL") or iequals(value, "NONE"))
continue;
if (VERBOSE >= 3)
cerr << "storing: '" << value << "' in _" << category << '.' << item << endl;
auto& cat = m_db[category];
if (cat.empty() or createNew)
{
if (iequals(category, "refine"))
cat.emplace({
{ "pdbx_refine_id", m_expMethod },
{ "entry_id", m_db.name() },
#warning("???")
{ "pdbx_diffrn_id", 1 }
});
else if (iequals(category, "refine_analyze") or iequals(category, "pdbx_refine"))
cat.emplace({
{ "pdbx_refine_id", m_expMethod },
{ "entry_id", m_db.name() },
// { "pdbx_diffrn_id", 1 }
});
else if (iequals(category, "refine_hist"))
{
string d_res_high, d_res_low;
for (auto r: m_db["refine"])
{
cif::tie(d_res_high, d_res_low) = r.get("ls_d_res_high", "ls_d_res_low");
break;
}
cat.emplace({
{ "pdbx_refine_id", m_expMethod },
{ "cycle_id", "LAST" },
{ "d_res_high", d_res_high.empty() ? "." : d_res_high },
{ "d_res_low", d_res_low.empty() ? "." : d_res_low }
});
}
else if (iequals(category, "refine_ls_shell"))
{
cat.emplace({
{ "pdbx_refine_id", m_expMethod },
});
}
else if (iequals(category, "pdbx_refine_tls_group"))
{
string tls_group_id;
if (not m_db["pdbx_refine_tls"].empty())
tls_group_id = m_db["pdbx_refine_tls"].back()["id"].as<string>();
cat.emplace({
{ "pdbx_refine_id", m_expMethod },
{ "id", tls_group_id },
{ "refine_tls_id", tls_group_id }
});
}
else if (iequals(category, "pdbx_refine_tls"))
{
cat.emplace({
{ "pdbx_refine_id", m_expMethod },
{ "method", "refined" }
});
}
// else if (iequals(category, "struct_ncs_dom"))
// {
// size_t id = cat.size() + 1;
//
// cat.emplace({
// { "id", id }
// });
// }
else if (iequals(category, "pdbx_reflns_twin"))
{
cat.emplace({
#warning("???")
{ "crystal_id", 1 },
{ "diffrn_id", 1 }
});
}
else
cat.emplace({});
createNew = false;
}
cat.back()[item] = value;
}
}
void Remark3Parser::StoreRefineLsRestr(const char* type, initializer_list<const char*> items)
{
row r;
int capture = 0;
for (auto item: items)
{
++capture;
string value = m_m[capture].str();
ba::trim(value);
if (value.empty() or iequals(value, "NULL"))
continue;
if (not r)
{
std::tie(r, std::ignore) = m_db["refine_ls_restr"].emplace({});
r["pdbx_refine_id"] = m_expMethod;
r["type"] = type;
}
r[item] = value;
}
}
// Writes the captures of the last successful Match() into the first
// refine_ls_restr row that has the given type and this parser's
// pdbx_refine_id. A NULL capture stores an empty value. When no such row
// exists yet, the work is handed to StoreRefineLsRestr.
void Remark3Parser::UpdateRefineLsRestr(const char* type, initializer_list<const char*> items)
{
    auto matches = m_db["refine_ls_restr"].find(cif::key("type") == type and cif::key("pdbx_refine_id") == m_expMethod);

    if (matches.empty())
    {
        StoreRefineLsRestr(type, items);
        return;
    }

    for (row existing : matches)
    {
        int groupNr = 0;
        for (const char* itemName : items)
        {
            string text = m_m[++groupNr].str();
            ba::trim(text);

            if (iequals(text, "NULL"))
                text.clear();

            existing[itemName] = text;
        }

        // only the first matching row is updated
        break;
    }
}
// --------------------------------------------------------------------
bool Remark3Parser::Parse(const string& expMethod, PDBRecord* r, cif::datablock& db)
{
// simple version, only for the first few lines
auto GetNextLine = [&]()
{
string result;
while (result.empty() and r != nullptr and r->is("REMARK 3"))
{
result = r->v_s(12);
r = r->m_next;
}
return result;
};
// All remark 3 records should start with the same data.
string line = GetNextLine();
if (line != "REFINEMENT.")
throw runtime_error("Unexpected data in REMARK 3");
line = GetNextLine();
regex rxp(R"(^PROGRAM\s*:\s*(.+))");
smatch m;
if (not regex_match(line, m, rxp))
throw runtime_error("Expected valid PROGRAM line in REMARK 3");
line = m[1].str();
struct program_score
{
program_score(const string& program, Remark3Parser* parser, float score)
: program(program), parser(parser), score(score) {}
string program;
unique_ptr<Remark3Parser> parser;
float score;
bool operator<(const program_score& rhs) const
{
return score > rhs.score;
}
};
vector<program_score> scores;
auto tryParser = [&](Remark3Parser* p)
{
unique_ptr<Remark3Parser> parser(p);
float score = parser->Parse();
if (VERBOSE >= 2)
cerr << "Score for " << parser->Program() << ": " << score << endl;
if (score > 0)
{
auto& software = db["software"];
string program = parser->Program();
string version = parser->Version();
software.emplace({
{ "name", program },
{ "classification", "refinement" },
{ "version", version },
{ "pdbx_ordinal", software.size() + 1 }
});
scores.emplace_back(program, parser.release(), score);
}
};
for (auto p = make_split_iterator(line, ba::first_finder(", "));
not p.eof(); ++p)
{
string program(p->begin(), p->end());
unique_ptr<Remark3Parser> parser;
if (ba::starts_with(program, "BUSTER"))
tryParser(new BUSTER_TNT_Remark3Parser(program, expMethod, r, db));
else if (ba::starts_with(program, "CNS") or ba::starts_with(program, "CNX"))
tryParser(new CNS_Remark3Parser(program, expMethod, r, db));
else if (ba::starts_with(program, "PHENIX"))
tryParser(new PHENIX_Remark3Parser(program, expMethod, r, db));
else if (ba::starts_with(program, "PROLSQ") or ba::starts_with(program, "NUCLSQ"))
tryParser(new PROLSQ_Remark3Parser(program, expMethod, r, db));
else if (ba::starts_with(program, "REFMAC"))
{
// simply try both and take the best
tryParser(new REFMAC_Remark3Parser(program, expMethod, r, db));
tryParser(new REFMAC5_Remark3Parser(program, expMethod, r, db));
}
else if (ba::starts_with(program, "SHELXL"))
tryParser(new SHELXL_Remark3Parser(program, expMethod, r, db));
else if (ba::starts_with(program, "TNT"))
tryParser(new TNT_Remark3Parser(program, expMethod, r, db));
else if (ba::starts_with(program, "X-PLOR"))
tryParser(new XPLOR_Remark3Parser(program, expMethod, r, db));
else if (VERBOSE)
cerr << "Skipping unknown program (" << program << ") in REMARK 3" << endl;
}
bool result = false;
if (not scores.empty())
{
result = true;
sort(scores.begin(), scores.end());
auto& best = scores.front();
if (VERBOSE >= 2)
cerr << "Choosing " << best.parser->Program() << " version '" << best.parser->Version() << "' as refinement program. Score = " << best.score << endl;
best.parser->Fixup();
for (auto& cat1: best.parser->m_db)
{
auto& cat2 = db[cat1.name()];
// copy only the values in the first row for the following categories
if (cat1.name() == "reflns" or cat1.name() == "refine")
{
if (cat2.empty()) // duh... this will generate a validation error anyway...
cat2.emplace({});
row r1 = cat1.front();
row r2 = cat2.front();
for (auto& i: r1)
r2[i.name()] = i.value();
}
else
{
for (auto r: cat1)
cat2.emplace(r);
}
}
}
return result;
}
This source diff could not be displayed because it is too large. You can view the blob instead.
// Lib for working with structures as contained in mmCIF and PDB files
#include "libcif/point.h"
using namespace std;
namespace libcif
{
// --------------------------------------------------------------------
// Scale q to unit length and return it. If the computed length is not
// larger than 0.001 the quaternion (1, 0, 0, 0) is returned instead.
quaternion Normalize(quaternion q)
{
	// Square the four components element-wise, in double precision.
	valarray<double> squares(4);

	squares[0] = q.R_component_1();
	squares[1] = q.R_component_2();
	squares[2] = q.R_component_3();
	squares[3] = q.R_component_4();

	squares *= squares;

	const double norm = sqrt(squares.sum());

	// Too short to scale reliably: fall back to the fixed quaternion.
	if (not (norm > 0.001))
		return quaternion(1, 0, 0, 0);

	q /= norm;
	return q;
}
// --------------------------------------------------------------------
float DihedralAngle(const point& p1, const point& p2, const point& p3, const point& p4)
{
point v12 = p1 - p2; // vector from p2 to p1
point v43 = p4 - p3; // vector from p3 to p4
point z = p2 - p3; // vector from p3 to p2
point p = CrossProduct(z, v12);
point x = CrossProduct(z, v43);
point y = CrossProduct(z, x);
double u = DotProduct(x, x);
double v = DotProduct(y, y);
double result = 360;
if (u > 0 and v > 0)
{
u = DotProduct(p, x) / sqrt(u);
v = DotProduct(p, y) / sqrt(v);
if (u != 0 or v != 0)
result = atan2(v, u) * 180 / kPI;
}
return result;
}
float CosinusAngle(const point& p1, const point& p2, const point& p3, const point& p4)
{
point v12 = p1 - p2;
point v34 = p3 - p4;
double result = 0;
double x = DotProduct(v12, v12) * DotProduct(v34, v34);
if (x > 0)
result = DotProduct(v12, v34) / sqrt(x);
return result;
}
// --------------------------------------------------------------------
// Convert quaternion q into a rotation angle and a rotation axis.
// The angle is 2 * acos(scalar part), scaled by 180 / kPI (degrees, assuming
// kPI is pi). The axis is the vector part divided by sqrt(1 - scalar^2); when
// that divisor is below 0.001 the vector part is returned undivided.
tuple<double,point> QuaternionToAngleAxis(quaternion q)
{
	// acos is only defined on [-1, 1]; normalise when the scalar part falls
	// outside that range on either side, otherwise the angle would be NaN.
	if (q.R_component_1() > 1 or q.R_component_1() < -1)
		q = Normalize(q);

	// angle:
	double angle = 2 * acos(q.R_component_1());
	angle = angle * 180 / kPI;

	// axis:
	double s = sqrt(1 - q.R_component_1() * q.R_component_1());
	if (s < 0.001)
		s = 1;	// avoid dividing by (almost) zero; direction is irrelevant for a tiny rotation

	point axis(q.R_component_2() / s, q.R_component_3() / s, q.R_component_4() / s);

	return make_tuple(angle, axis);
}
// Translate all points in place so that their average position becomes the
// origin, and return the average position they had before the translation.
// For an empty vector nothing is changed and a default-constructed point is
// returned (the accumulation below already relies on that being the origin).
point CenterPoints(vector<point>& points)
{
	point t;

	// Nothing to average; dividing by a zero count would give a non-finite centre.
	if (points.empty())
		return t;

	for (point& pt : points)
	{
		t.x() += pt.x();
		t.y() += pt.y();
		t.z() += pt.z();
	}

	t.x() /= points.size();
	t.y() /= points.size();
	t.z() /= points.size();

	for (point& pt : points)
	{
		pt.x() -= t.x();
		pt.y() -= t.y();
		pt.z() -= t.z();
	}

	return t;
}
// Return the average position of the given points; the points themselves are
// not modified. For an empty vector a default-constructed point is returned
// (the accumulation below already relies on that being the origin).
point Centroid(vector<point>& points)
{
	point result;

	// Dividing by a zero count would give a non-finite centroid.
	if (points.empty())
		return result;

	for (point& pt : points)
		result += pt;

	result /= points.size();

	return result;
}
double RMSd(const vector<point>& a, const vector<point>& b)
{
double sum = 0;
for (uint32 i = 0; i < a.size(); ++i)
{
valarray<double> d(3);
d[0] = b[i].x() - a[i].x();
d[1] = b[i].y() - a[i].y();
d[2] = b[i].z() - a[i].z();
d *= d;
sum += d.sum();
}
return sqrt(sum / a.size());
}
// The next function returns the largest solution (largest real part) of a
// depressed quartic equation, based on Ferrari's algorithm.
// A depressed quartic is of the form:
//
//   x^4 + ax^2 + bx + c = 0
//
// The calculation is done with complex values to avoid nan's as a result of
// taking the sqrt of a negative number; only the real part of the selected
// solution is returned.
// Note that b / W is evaluated without checking W for zero.
double LargestDepressedQuarticSolution(double a, double b, double c)
{
	complex<double> P = - (a * a) / 12 - c;
	complex<double> Q = - (a * a * a) / 108 + (a * c) / 3 - (b * b) / 8;
	complex<double> R = - Q / 2.0 + sqrt((Q * Q) / 4.0 + (P * P * P) / 27.0);

	complex<double> U = pow(R, 1 / 3.0);

	// y is a solution of the resolvent cubic
	complex<double> y;
	if (U == 0.0)
		y = -5.0 * a / 6.0 + U - pow(Q, 1.0 / 3.0);
	else
		y = -5.0 * a / 6.0 + U - P / (3.0 * U);

	complex<double> W = sqrt(a + 2.0 * y);

	// The four solutions of the quartic are:
	//
	//   x = ( W ± sqrt(-(3a + 2y + 2b / W))) / 2
	//   x = (-W ± sqrt(-(3a + 2y - 2b / W))) / 2
	//
	// The sign in front of W and the sign in front of 2b / W belong together;
	// mixing them yields values that are not solutions. Since sqrt returns the
	// root with a non-negative real part, the largest solution is one of the
	// two candidates that add the square root.
	const complex<double> base = 3.0 * a + 2.0 * y;
	const complex<double> shift = 2.0 * b / W;

	const double viaPlusW  = (( W + sqrt(-(base + shift))) / 2.0).real();
	const double viaMinusW = ((-W + sqrt(-(base - shift))) / 2.0).real();

	return viaPlusW > viaMinusW ? viaPlusW : viaMinusW;
}
//quaternion AlignPoints(const vector<point>& pa, const vector<point>& pb)
//{
// // First calculate M, a 3x3 matrix containing the sums of products of the coordinates of A and B
// matrix<double> M(3, 3, 0);
//
// for (uint32 i = 0; i < pa.size(); ++i)
// {
// const point& a = pa[i];
// const point& b = pb[i];
//
// M(0, 0) += a.x() * b.x(); M(0, 1) += a.x() * b.y(); M(0, 2) += a.x() * b.z();
// M(1, 0) += a.y() * b.x(); M(1, 1) += a.y() * b.y(); M(1, 2) += a.y() * b.z();
// M(2, 0) += a.z() * b.x(); M(2, 1) += a.z() * b.y(); M(2, 2) += a.z() * b.z();
// }
//
// // Now calculate N, a symmetric 4x4 matrix
// symmetric_matrix<double> N(4);
//
// N(0, 0) = M(0, 0) + M(1, 1) + M(2, 2);
// N(0, 1) = M(1, 2) - M(2, 1);
// N(0, 2) = M(2, 0) - M(0, 2);
// N(0, 3) = M(0, 1) - M(1, 0);
//
// N(1, 1) = M(0, 0) - M(1, 1) - M(2, 2);
// N(1, 2) = M(0, 1) + M(1, 0);
// N(1, 3) = M(0, 2) + M(2, 0);
//
// N(2, 2) = -M(0, 0) + M(1, 1) - M(2, 2);
// N(2, 3) = M(1, 2) + M(2, 1);
//
// N(3, 3) = -M(0, 0) - M(1, 1) + M(2, 2);
//
// // det(N - λI) = 0
// // find the largest λ (λm)
// //
// // Aλ4 + Bλ3 + Cλ2 + Dλ + E = 0
// // A = 1
// // B = 0
// // and so this is a so-called depressed quartic
// // solve it using Ferrari's algorithm
//
// double C = -2 * (
// M(0, 0) * M(0, 0) + M(0, 1) * M(0, 1) + M(0, 2) * M(0, 2) +
// M(1, 0) * M(1, 0) + M(1, 1) * M(1, 1) + M(1, 2) * M(1, 2) +
// M(2, 0) * M(2, 0) + M(2, 1) * M(2, 1) + M(2, 2) * M(2, 2));
//
// double D = 8 * (M(0, 0) * M(1, 2) * M(2, 1) +
// M(1, 1) * M(2, 0) * M(0, 2) +
// M(2, 2) * M(0, 1) * M(1, 0)) -
// 8 * (M(0, 0) * M(1, 1) * M(2, 2) +
// M(1, 2) * M(2, 0) * M(0, 1) +
// M(2, 1) * M(1, 0) * M(0, 2));
//
// double E =
// (N(0,0) * N(1,1) - N(0,1) * N(0,1)) * (N(2,2) * N(3,3) - N(2,3) * N(2,3)) +
// (N(0,1) * N(0,2) - N(0,0) * N(2,1)) * (N(2,1) * N(3,3) - N(2,3) * N(1,3)) +
// (N(0,0) * N(1,3) - N(0,1) * N(0,3)) * (N(2,1) * N(2,3) - N(2,2) * N(1,3)) +
// (N(0,1) * N(2,1) - N(1,1) * N(0,2)) * (N(0,2) * N(3,3) - N(2,3) * N(0,3)) +
// (N(1,1) * N(0,3) - N(0,1) * N(1,3)) * (N(0,2) * N(2,3) - N(2,2) * N(0,3)) +
// (N(0,2) * N(1,3) - N(2,1) * N(0,3)) * (N(0,2) * N(1,3) - N(2,1) * N(0,3));
//
// // solve quartic
// double lm = LargestDepressedQuarticSolution(C, D, E);
//
// // calculate t = (N - λI)
// matrix<double> li = identity_matrix<double>(4) * lm;
// matrix<double> t = N - li;
//
// // calculate a matrix of cofactors for t
// matrix<double> cf(4, 4);
//
// const uint32 ixs[4][3] =
// {
// { 1, 2, 3 },
// { 0, 2, 3 },
// { 0, 1, 3 },
// { 0, 1, 2 }
// };
//
// uint32 maxR = 0;
// for (uint32 r = 0; r < 4; ++r)
// {
// const uint32* ir = ixs[r];
//
// for (uint32 c = 0; c < 4; ++c)
// {
// const uint32* ic = ixs[c];
//
// cf(r, c) =
// t(ir[0], ic[0]) * t(ir[1], ic[1]) * t(ir[2], ic[2]) +
// t(ir[0], ic[1]) * t(ir[1], ic[2]) * t(ir[2], ic[0]) +
// t(ir[0], ic[2]) * t(ir[1], ic[0]) * t(ir[2], ic[1]) -
// t(ir[0], ic[2]) * t(ir[1], ic[1]) * t(ir[2], ic[0]) -
// t(ir[0], ic[1]) * t(ir[1], ic[0]) * t(ir[2], ic[2]) -
// t(ir[0], ic[0]) * t(ir[1], ic[2]) * t(ir[2], ic[1]);
// }
//
// if (r > maxR and cf(r, 0) > cf(maxR, 0))
// maxR = r;
// }
//
// // NOTE the negation of the y here, why? Maybe I swapped r/c above?
// quaternion q(cf(maxR, 0), cf(maxR, 1), -cf(maxR, 2), cf(maxR, 3));
// q = Normalize(q);
//
// return q;
//}
}
// Lib for working with structures as contained in mmCIF and PDB files
#include "libcif/structure.h"
#include <boost/algorithm/string.hpp>
#include <boost/filesystem/fstream.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/bzip2.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include "pdb2cif.h"
#include "libcif/cif-parser.h"
#include "cif2pdb.h"
using namespace std;
namespace ba = boost::algorithm;
namespace fs = boost::filesystem;
namespace io = boost::iostreams;
extern int VERBOSE;
namespace libcif
{
// --------------------------------------------------------------------
// file_impl
// Private state behind libcif::file: the parsed cif data plus a pointer to
// the datablock that is in use.
struct file_impl
{
	// Read the file at p into m_data and set m_db.
	void load(fs::path p);

	// Write m_data to the file at p.
	void save(fs::path p);

	cif::file			m_data;
	cif::datablock*		m_db = nullptr;	// stays null until load() has run
};
// Read the file at p into m_data, point m_db at its first datablock, load the
// mmcif_pdbx dictionary and validate the data.
//
// A trailing .bz2 or .gz extension selects the matching decompressor. The
// format is chosen from the remaining extension: .cif is parsed as cif, .pdb
// and .ent as PDB. Any other extension is tried as cif first and, if the cif
// parser rejects it, read again as PDB.
//
// Throws runtime_error when the file cannot be opened.
void file_impl::load(fs::path p)
{
	fs::ifstream infile(p, ios_base::in | ios_base::binary);
	if (not infile.is_open())
		throw runtime_error("No such file: " + p.string());

	io::filtering_stream<io::input> in;

	// The extension that identifies the format. For a compressed file this is
	// the extension hidden below the compression suffix (x.cif.gz -> .cif),
	// for all other files it is simply the file's own extension.
	string ext = p.extension().string();

	if (p.extension() == ".bz2")
	{
		in.push(io::bzip2_decompressor());
		ext = p.stem().extension().string();
	}
	else if (p.extension() == ".gz")
	{
		in.push(io::gzip_decompressor());
		ext = p.stem().extension().string();
	}

	in.push(infile);

	// OK, we've got the file, now create a protein
	if (ext == ".cif")
		m_data.load(in);
	else if (ext == ".pdb" or ext == ".ent")
		ReadPDBFile(in, m_data);
	else
	{
		try
		{
			if (VERBOSE)
				cerr << "unrecognized file extension, trying cif" << endl;

			m_data.load(in);
		}
		catch (const cif::cif_parser_error& e)
		{
			if (VERBOSE)
				cerr << "Not cif, trying plain old PDB" << endl;

			// pffft...
			// Start over: rebuild the filter chain on a rewound (or reopened) file.
			in.reset();

			if (infile.is_open())
			{
				// Drop any eof/fail state left by the first attempt,
				// a stream in a failed state ignores the seek.
				infile.clear();
				infile.seekg(0);
			}
			else
				infile.open(p, ios_base::in | ios::binary);

			if (p.extension() == ".bz2")
				in.push(io::bzip2_decompressor());
			else if (p.extension() == ".gz")
				in.push(io::gzip_decompressor());

			in.push(infile);

			ReadPDBFile(in, m_data);
		}
	}

	// Yes, we've parsed the data. Now locate the datablock.
	m_db = &m_data.first_datablock();

	// And validate, otherwise lots of functionality won't work
//	if (m_data.get_validator() == nullptr)
	m_data.load_dictionary("mmcif_pdbx");
	m_data.validate();
}
// Write m_data to the file at p.
//
// A trailing .gz or .bz2 extension selects the matching compressor. When the
// remaining extension is .pdb the data is written with WritePDBFile, in all
// other cases it is saved as cif.
//
// Throws runtime_error when the file cannot be opened for writing.
void file_impl::save(fs::path p)
{
	fs::ofstream outfile(p, ios_base::out | ios_base::binary);

	// Without this check all output would silently be discarded.
	if (not outfile.is_open())
		throw runtime_error("Could not open file for writing: " + p.string());

	// Declared after outfile, so it is destroyed before the file is closed.
	io::filtering_stream<io::output> out;

	if (p.extension() == ".gz")
	{
		out.push(io::gzip_compressor());
		p = p.stem();	// strip the compression suffix to expose the format extension
	}
	else if (p.extension() == ".bz2")
	{
		out.push(io::bzip2_compressor());
		p = p.stem();
	}

	out.push(outfile);

	if (p.extension() == ".pdb")
		WritePDBFile(out, m_data);
	else
		m_data.save(out);
}
// --------------------------------------------------------------------
// atom
// Reference counted implementation object shared by all copies of an atom.
// It keeps the atom_site row of the atom and caches its element type,
// location and compound.
struct atom_impl
{
	// Look up the atom with the given id in the atom_site category of f.
	atom_impl(const file& f, const string& id)
		: m_file(f), m_id(id), m_refcount(1), m_compound(nullptr)
	{
		auto& db = *m_file.impl().m_db;
		auto& cat = db["atom_site"];

		m_row = cat[cif::key("id") == m_id];

		prefetch();
	}

	// Use an atom_site row that was already located by the caller.
	atom_impl(const file& f, const string& id, cif::row row)
		: m_file(f), m_id(id), m_refcount(1), m_row(row), m_compound(nullptr)
	{
		prefetch();
	}

	// Cache the element type and the location, and try to resolve the compound.
	void prefetch()
	{
		// Prefetch some data
		string symbol;
		cif::tie(symbol) = m_row.get("type_symbol");

		m_type = atom_type_traits(symbol).type();

		float x, y, z;
		cif::tie(x, y, z) = m_row.get("Cartn_x", "Cartn_y", "Cartn_z");

		m_location = point(x, y, z);

		// Best effort only: a missing compound is reported by comp() when
		// someone actually asks for it.
		try
		{
			comp();
		}
		catch (...) {}
	}

	// Register one more owner.
	void reference()
	{
		++m_refcount;
	}

	// Drop one owner; the object deletes itself when the last owner is gone.
	// The count starts at 1 for the first owner, so the object has to go
	// once the count reaches zero. (Testing for < 0 never deleted anything.)
	void release()
	{
		if (--m_refcount <= 0)
			delete this;
	}

	// Return the compound for this atom, creating it from label_comp_id on
	// first use. Throws runtime_error when no compound could be created.
	const compound& comp()
	{
		if (m_compound == nullptr)
		{
			string comp_id;
			cif::tie(comp_id) = m_row.get("label_comp_id");

			m_compound = compound::create(comp_id);
		}

		if (m_compound == nullptr)
			throw runtime_error("no compound");

		return *m_compound;
	}

	// True only when the compound is known and says it is water.
	bool is_water() const
	{
		return m_compound != nullptr and m_compound->is_water();
	}

	const file&			m_file;
	string				m_id;
	int					m_refcount;		// number of owners of this object
	cif::row			m_row;			// the atom_site row for this atom
	const compound*		m_compound;		// resolved lazily by comp()
	point				m_location;		// cached Cartn_x/y/z
	atom_type			m_type;			// cached from type_symbol

//	const entity&		m_entity;
//	std::string			m_asym_id;
//	std::string			m_atom_id;
//	point				m_loc;
//	property_list		m_properties;
};
atom::atom(const file& f, const string& id)
: m_impl(new atom_impl(f, id))
{
}
atom::atom(atom_impl* impl)
: m_impl(impl)
{
}
// Copy constructor: share the implementation of rhs and register this
// object as an additional owner. An atom without implementation (which the
// destructor explicitly allows for) is copied as such.
atom::atom(const atom& rhs)
	: m_impl(rhs.m_impl)
{
	if (m_impl)
		m_impl->reference();
}
// Give up this object's share of the implementation, if it has one.
atom::~atom()
{
	if (m_impl != nullptr)
		m_impl->release();
}
// Copy assignment: let go of the current implementation and share the one
// of rhs. Either side may be without implementation.
atom& atom::operator=(const atom& rhs)
{
	// Nothing to do for self assignment or when both already share one impl.
	if (m_impl != rhs.m_impl)
	{
		// Take the new reference before dropping the old one.
		if (rhs.m_impl)
			rhs.m_impl->reference();

		if (m_impl)
			m_impl->release();

		m_impl = rhs.m_impl;
	}

	return *this;
}
string atom::id() const
{
return m_impl->m_id;
}
// The element type that was cached when the atom was created.
atom_type atom::type() const
{
	const atom_impl& impl = *m_impl;
	return impl.m_type;
}
// The value of pdbx_formal_charge in this atom's row.
int atom::charge() const
{
	auto& row = m_impl->m_row;

	int formal_charge;
	cif::tie(formal_charge) = row.get("pdbx_formal_charge");
	return formal_charge;
}
// The value of label_atom_id in this atom's row.
string atom::label_atom_id() const
{
	auto& row = m_impl->m_row;

	string value;
	cif::tie(value) = row.get("label_atom_id");
	return value;
}
// The value of label_comp_id in this atom's row.
string atom::label_comp_id() const
{
	auto& row = m_impl->m_row;

	string value;
	cif::tie(value) = row.get("label_comp_id");
	return value;
}
// The value of label_asym_id in this atom's row.
string atom::label_asym_id() const
{
	auto& row = m_impl->m_row;

	string value;
	cif::tie(value) = row.get("label_asym_id");
	return value;
}
// The value of label_seq_id in this atom's row.
int atom::label_seq_id() const
{
	auto& row = m_impl->m_row;

	int value;
	cif::tie(value) = row.get("label_seq_id");
	return value;
}
// The value of auth_asym_id in this atom's row.
string atom::auth_asym_id() const
{
	auto& row = m_impl->m_row;

	string value;
	cif::tie(value) = row.get("auth_asym_id");
	return value;
}
// The value of auth_seq_id in this atom's row.
int atom::auth_seq_id() const
{
	auto& row = m_impl->m_row;

	int value;
	cif::tie(value) = row.get("auth_seq_id");
	return value;
}
// The location that was cached when the atom was created.
point atom::location() const
{
	const atom_impl& impl = *m_impl;
	return impl.m_location;
}
// The compound this atom belongs to, as provided by the implementation
// object (which throws when there is no compound).
const compound& atom::comp() const
{
	atom_impl& impl = *m_impl;
	return impl.comp();
}
bool atom::is_water() const
{
return m_impl->is_water();
}
// Return the item called name from this atom's row, as a string wrapped in
// a boost::any.
boost::any atom::property(const std::string& name) const
{
	auto& row = m_impl->m_row;
	string text = row[name].as<string>();
	return boost::any(text);
}
// Two atoms are equal when they share one implementation object, or when
// they refer to the same file object and have the same id.
bool atom::operator==(const atom& rhs) const
{
	if (m_impl == rhs.m_impl)
		return true;

	const bool same_file = &m_impl->m_file == &rhs.m_impl->m_file;
	return same_file and m_impl->m_id == rhs.m_impl->m_id;
}
// The file this atom was created from. Requires an implementation object.
const file& atom::get_file() const
{
	assert(m_impl);

	const atom_impl& impl = *m_impl;
	return impl.m_file;
}
// --------------------------------------------------------------------
// residue
//atom_view residue::atoms()
//{
// assert(false);
//}
// --------------------------------------------------------------------
// monomer
// --------------------------------------------------------------------
// polymer
// --------------------------------------------------------------------
// file
file::file()
: m_impl(new file_impl)
{
}
// Create a file object and load the data from the given path.
// Whatever load() throws is passed on to the caller.
file::file(fs::path file)
	: m_impl(new file_impl)
{
	try
	{
		load(file);
	}
	catch (...)
	{
		// The destructor does not run for an object whose constructor
		// throws, so the implementation has to be cleaned up here.
		delete m_impl;
		throw;
	}
}
// Destroy the implementation object owned by this file.
file::~file()
{
	file_impl* owned = m_impl;
	delete owned;
}
// Load the data from the file at p; all work is done by file_impl::load.
void file::load(fs::path p)
{
m_impl->load(p);
// NOTE: what follows is an unfinished, disabled sketch for building
// entity, residue and atom objects from the loaded data. It refers to
// members (m_file, m_residues, m_atoms) that this class does not use here.
//
// // all data is now in m_file, construct atoms and others
//
// auto& db = m_file.first_datablock();
//
// // the entities
//
// struct entity
// {
// string id;
// string type;
// };
// vector<entity> entities;
//
// for (auto& _e: db["entity"])
// {
// string type = _e["type"];
// ba::to_lower(type);
// entities.push_back({ _e["id"], type });
// }
//
// auto& atom_sites = db["atom_site"];
// for (auto& atom_site: atom_sites)
// {
// atom_ptr ap(new atom(this, atom_site));
//
// string entity_id = atom_site["entity_id"];
//
// auto e = find_if(entities.begin(), entities.end(), [=](entity& e) -> bool { return e.id == entity_id; });
// if (e == entities.end())
// throw runtime_error("Entity " + entity_id + " is not defined");
//
// string comp_id, asym_id, seq_id;
// cif::tie(comp_id, seq_id) = atom_site.get("label_comp_id", "label_asym_id", "label_seq_id");
//
// auto r = find_if(m_residues.begin(), m_residues.end(), [=](residue_ptr& res) -> bool
// {
//// return res.entities
// return false;
// });
//
// if (e->type == "water")
// ;
// else if (e->type == "polymer")
// ;
// else
// ;
//
// m_atoms.push_back(ap);
// }
}
// Write the data to the given path; all work is done by file_impl::save.
void file::save(boost::filesystem::path file)
{
	file_impl& impl = *m_impl;
	impl.save(file);
}
// Access the datablock that was located when the file was loaded.
// Asserts, and throws runtime_error, when there is no such datablock.
cif::datablock& file::data()
{
	assert(m_impl);
	assert(m_impl->m_db);

	const bool loaded = m_impl != nullptr and m_impl->m_db != nullptr;
	if (not loaded)
		throw runtime_error("No data loaded");

	return *m_impl->m_db;
}
// --------------------------------------------------------------------
// structure
// Private state behind libcif::structure: the file it was created from, the
// model number and the atoms that belong to that model.
struct structure_impl
{
	// Collect the atoms of model model_nr from the atom_site category of f.
	// Rows without a pdbx_PDB_model_num value are accepted for every model.
	// The structure argument s is not used.
	structure_impl(structure& s, file& f, uint32 model_nr)
		: m_file(&f), m_model_nr(model_nr)
	{
		auto& atom_sites = (*m_file->impl().m_db)["atom_site"];

		for (auto& site: atom_sites)
		{
			auto model_item = site["pdbx_PDB_model_num"];

			const bool in_model =
				model_item.empty() or model_item.as<uint32>() == m_model_nr;

			if (in_model)
				m_atoms.emplace_back(new atom_impl(f, site["id"].as<string>(), site));
		}
	}

	// Remove atom a from both the atom_site category and m_atoms.
	void remove_atom(atom& a);

	file*			m_file;
	uint32			m_model_nr;
	atom_view		m_atoms;
};
// Remove atom a: erase the first atom_site row whose id equals the id of a,
// then drop every entry in m_atoms that compares equal to a.
void structure_impl::remove_atom(atom& a)
{
	auto& atom_sites = (*m_file->impl().m_db)["atom_site"];

	// The id does not change while searching, so fetch it only once.
	const string wanted = a.id();

	for (auto row = atom_sites.begin(); row != atom_sites.end(); ++row)
	{
		string row_id;
		cif::tie(row_id) = row->get("id");

		if (row_id != wanted)
			continue;

		atom_sites.erase(row);
		break;
	}

	auto obsolete = remove(m_atoms.begin(), m_atoms.end(), a);
	m_atoms.erase(obsolete, m_atoms.end());
}
structure::structure(file& f, uint32 model_nr)
: m_impl(new structure_impl(*this, f, model_nr))
{
}
// Destroy the implementation object owned by this structure.
structure::~structure()
{
	structure_impl* owned = m_impl;
	delete owned;
}
// Return a copy of the list of atoms in this structure.
atom_view structure::atoms() const
{
	const structure_impl& impl = *m_impl;
	return impl.m_atoms;
}
// Return the atoms of this structure that belong to the water entity, i.e.
// whose label_entity_id equals the id of the first entity of type "water"
// (compared case insensitively). The result is empty when the data does not
// contain a water entity.
atom_view structure::waters() const
{
	atom_view result;

	auto& db = *get_file().impl().m_db;

	// Get the entity id for water
	auto& entity_cat = db["entity"];
	string water_entity_id;
	bool have_water_entity = false;

	for (auto& e: entity_cat)
	{
		string id, type;
		cif::tie(id, type) = e.get("id", "type");
		if (ba::iequals(type, "water"))
		{
			water_entity_id = id;
			have_water_entity = true;
			break;
		}
	}

	// Without a water entity there is nothing to report. Comparing against
	// the still empty id would wrongly select atoms lacking an entity id.
	if (not have_water_entity)
		return result;

	for (auto& a: m_impl->m_atoms)
	{
		if (boost::any_cast<string>(a.property("label_entity_id")) == water_entity_id)
			result.push_back(a);
	}

	return result;
}
// Return the first atom of this structure that has the given id.
// Throws out_of_range when there is no such atom.
atom structure::get_atom_by_id(string id) const
{
	auto& atoms = m_impl->m_atoms;

	auto hit = find_if(atoms.begin(), atoms.end(),
		[&id](const atom& candidate) { return candidate.id() == id; });

	if (hit == atoms.end())
		throw out_of_range("Could not find atom with id " + id);

	return *hit;
}
// The file this structure was created from.
file& structure::get_file() const
{
	file* source = m_impl->m_file;
	return *source;
}
//tuple<string,string> structure::MapLabelToAuth(
// const string& asym_id, int seq_id)
//{
// auto& db = *get_file().impl().m_db;
//
// tuple<string,int,string,string> result;
// bool found = false;
//
// for (auto r: db["pdbx_poly_seq_scheme"].find(
// cif::key("asym_id") == asym_id and
// cif::key("seq_id") == seq_id))
// {
// string auth_asym_id, pdb_mon_id, pdb_ins_code;
// int pdb_seq_num;
//
// cif::tie(pdb_strand_id, pdb_seq_num, pdb_mon_id, pdb_ins_code) =
// r.get("pdb_strand_id", "pdb_seq_num", "pdb_mon_id", "pdb_ins_code");
//
// result = make_tuple(pdb_strand_id, pdb_seq_num, pdb_mon_id, pdb_ins_code);
//
// found = true;
// break;
// }
//
// for (auto r: db["pdbx_nonpoly_scheme"].find(
// cif::key("asym_id") == asym_id and
// cif::key("seq_id") == seq_id and
// cif::key("mon_id") == mon_id))
// {
// string pdb_strand_id, pdb_mon_id, pdb_ins_code;
// int pdb_seq_num;
//
// cif::tie(pdb_strand_id, pdb_seq_num, pdb_mon_id, pdb_ins_code) =
// r.get("pdb_strand_id", "pdb_seq_num", "pdb_mon_id", "pdb_ins_code");
//
// result = make_tuple(pdb_strand_id, pdb_seq_num, pdb_mon_id, pdb_ins_code);
//
// found = true;
// break;
// }
//
// return result;
//}
// Translate a residue given by asym_id, seq_id and mon_id into the tuple
// (pdb_strand_id, pdb_seq_num, pdb_mon_id, pdb_ins_code).
//
// pdbx_poly_seq_scheme is searched first and pdbx_nonpoly_scheme second;
// from each only the first matching row is used and a match in the second
// replaces one from the first. Without any match a default-constructed tuple
// is returned.
tuple<string,int,string,string> structure::MapLabelToPDB(
	const string& asym_id, int seq_id, const string& mon_id)
{
	auto& db = *get_file().impl().m_db;

	tuple<string,int,string,string> result;

	// Store the values of the first matching row of one scheme category.
	auto lookup = [&](const char* scheme)
	{
		for (auto r: db[scheme].find(
			cif::key("asym_id") == asym_id and
			cif::key("seq_id") == seq_id and
			cif::key("mon_id") == mon_id))
		{
			string pdb_strand_id, pdb_mon_id, pdb_ins_code;
			int pdb_seq_num;

			cif::tie(pdb_strand_id, pdb_seq_num, pdb_mon_id, pdb_ins_code) =
				r.get("pdb_strand_id", "pdb_seq_num", "pdb_mon_id", "pdb_ins_code");

			result = make_tuple(pdb_strand_id, pdb_seq_num, pdb_mon_id, pdb_ins_code);

			return;
		}
	};

	lookup("pdbx_poly_seq_scheme");
	lookup("pdbx_nonpoly_scheme");

	return result;
}
// --------------------------------------------------------------------
// actions
// Remove atom a from this structure; all work is done by
// structure_impl::remove_atom.
void structure::remove_atom(atom& a)
{
	structure_impl& impl = *m_impl;
	impl.remove_atom(a);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment