Commit 82ce81d0 by Peter Eastman

Added support for loading PDBx/mmCIF files

parent bbb8cfa8
The SEQRES records in this PDB file include residues that are missing from the atom data section. Do you want to add the missing residues? The sequence records in this PDB file include residues that are missing from the atom data section. Do you want to add the missing residues?
<p> <p>
<form id="mainform" method="post" action="/"> <form id="mainform" method="post" action="/">
<table border="1" id="table"> <table border="1" id="table">
......
...@@ -36,6 +36,7 @@ import simtk.openmm as mm ...@@ -36,6 +36,7 @@ import simtk.openmm as mm
import simtk.openmm.app as app import simtk.openmm.app as app
import simtk.unit as unit import simtk.unit as unit
from simtk.openmm.app.internal.pdbstructure import PdbStructure from simtk.openmm.app.internal.pdbstructure import PdbStructure
from simtk.openmm.app.internal.pdbx.reader.PdbxReader import PdbxReader
from simtk.openmm.app.element import hydrogen, oxygen from simtk.openmm.app.element import hydrogen, oxygen
from simtk.openmm.app.forcefield import NonbondedGenerator from simtk.openmm.app.forcefield import NonbondedGenerator
import numpy as np import numpy as np
...@@ -47,11 +48,12 @@ import math ...@@ -47,11 +48,12 @@ import math
from pkg_resources import resource_filename from pkg_resources import resource_filename
# Imports for urlopen try:
if sys.version_info >= (3,0):
from urllib.request import urlopen from urllib.request import urlopen
else: from io import StringIO
except:
from urllib2 import urlopen from urllib2 import urlopen
from cStringIO import StringIO
substitutions = { substitutions = {
'2AS':'ASP', '3AH':'HIS', '5HP':'GLU', 'ACL':'ARG', 'AGM':'ARG', 'AIB':'ALA', 'ALM':'ALA', 'ALO':'THR', 'ALY':'LYS', 'ARM':'ARG', '2AS':'ASP', '3AH':'HIS', '5HP':'GLU', 'ACL':'ARG', 'AGM':'ARG', 'AIB':'ALA', 'ALM':'ALA', 'ALO':'THR', 'ALY':'LYS', 'ARM':'ARG',
...@@ -147,27 +149,34 @@ def _findUnoccupiedDirection(point, positions): ...@@ -147,27 +149,34 @@ def _findUnoccupiedDirection(point, positions):
return direction return direction
class PDBFixer(object): class PDBFixer(object):
"""PDBFixer implements many tools for fixing problems in PDB files. """PDBFixer implements many tools for fixing problems in PDB and PDBx/mmCIF files.
""" """
def __init__(self, filename=None, pdbfile=None, url=None, pdbid=None): def __init__(self, filename=None, pdbfile=None, pdbxfile=None, url=None, pdbid=None):
"""Create a new PDBFixer instance to fix problems in a PDB file. """Create a new PDBFixer instance to fix problems in a PDB or PDBx/mmCIF file.
Parameters Parameters
---------- ----------
filename : str, optional, default=None filename : str, optional, default=None
A filename specifying the file from which the PDB file is to be read. The name of the file to read. The format is determined automatically based on the filename extension. If
it ends in either ".pdbx" or ".cif", it is assumed to be a PDBx/mmCIF file. Otherwise, it is assumed to be
a PDB file.
pdbfile : file, optional, default=None pdbfile : file, optional, default=None
A file-like object from which the PDB file is to be read. A file-like object from which the PDB file is to be read.
The file is not closed after reading. The file is not closed after reading.
pdbxfile : file, optional, default=None
A file-like object from which the PDBx/mmCIF file is to be read.
The file is not closed after reading.
url : str, optional, default=None url : str, optional, default=None
A URL specifying the internet location from which the PDB file contents should be retrieved. A URL specifying the internet location from which the file contents should be retrieved. The format is
determined automatically by looking for a filename extension. If the URL contains either ".pdbx" or ".cif",
it is assumed to be a PDBx/mmCIF file. Otherwise, it is assumed to be a PDB file.
pdbid : str, optional, default=None pdbid : str, optional, default=None
A four-letter PDB code specifying the structure to be retrieved from the RCSB. A four-letter PDB code specifying the structure to be retrieved from the RCSB.
Notes Notes
----- -----
Only one of structure, filename, pdbfile, url, or pdbid may be specified or an exception will be thrown. Only one of structure, filename, pdbfile, pdbxfile, url, or pdbid may be specified or an exception will be thrown.
Examples Examples
-------- --------
...@@ -193,47 +202,44 @@ class PDBFixer(object): ...@@ -193,47 +202,44 @@ class PDBFixer(object):
""" """
# Check to make sure only one option has been specified. # Check to make sure only one option has been specified.
if bool(filename) + bool(pdbfile) + bool(url) + bool(pdbid) != 1: if bool(filename) + bool(pdbfile) + bool(pdbxfile) + bool(url) + bool(pdbid) != 1:
raise Exception("Exactly one option [filename, pdbfile, url, pdbid] must be specified.") raise Exception("Exactly one option [filename, pdbfile, pdbxfile, url, pdbid] must be specified.")
self.source = None self.source = None
if pdbid:
# A PDB id has been specified.
url = 'http://www.rcsb.org/pdb/files/%s.pdb' % pdbid
if filename: if filename:
self.source = filename
# A local file has been specified. # A local file has been specified.
self.source = filename
file = open(filename, 'r') file = open(filename, 'r')
structure = PdbStructure(file) if filename.lower().endswith('.pdbx') or filename.lower().endswith('.cif'):
self._initializeFromPDBx(file.read())
else:
self._initializeFromPDB(file)
file.close() file.close()
elif pdbfile: elif pdbfile:
# A file-like object has been specified. # A file-like object has been specified.
structure = PdbStructure(pdbfile) self._initializeFromPDB(pdbfile)
elif pdbxfile:
# A file-like object has been specified.
self._initializeFromPDBx(pdbxfile.read())
elif url: elif url:
self.source = url
# A URL has been specified. # A URL has been specified.
file = urlopen(url)
structure = PdbStructure(file)
file.close()
elif pdbid:
# A PDB id has been specified.
url = 'http://www.rcsb.org/pdb/files/%s.pdb' % pdbid
self.source = url self.source = url
file = urlopen(url) file = urlopen(url)
# Read contents all at once and split into lines, since urlopen doesn't like it when we read one line at a time over the network.
contents = file.read().decode('utf-8') contents = file.read().decode('utf-8')
lines = contents.split('\n')
file.close() file.close()
structure = PdbStructure(lines) if '.pdbx' in url.lower() or '.cif' in url.lower():
self._initializeFromPDBx(contents)
else:
self._initializeFromPDB(StringIO(contents))
# Check the structure has some atoms in it. # Check the structure has some atoms in it.
atoms = list(structure.iter_atoms()) atoms = list(self.topology.atoms())
if len(atoms)==0: if len(atoms) == 0:
raise Exception("Structure contains no atoms.") raise Exception("Structure contains no atoms.")
pdb = app.PDBFile(structure)
self.topology = pdb.topology
self.positions = pdb.positions
self.sequences = [Sequence(s.chain_id, s.residues) for s in structure.sequences]
self.modifiedResidues = [ModifiedResidue(r.chain_id, r.number, r.residue_name, r.standard_name) for r in structure.modified_residues]
# Load the templates. # Load the templates.
self.templates = {} self.templates = {}
...@@ -243,7 +249,67 @@ class PDBFixer(object): ...@@ -243,7 +249,67 @@ class PDBFixer(object):
name = next(templatePdb.topology.residues()).name name = next(templatePdb.topology.residues()).name
self.templates[name] = templatePdb self.templates[name] = templatePdb
return def _initializeFromPDB(self, file):
"""Initialize this object by reading a PDB file."""
structure = PdbStructure(file)
pdb = app.PDBFile(structure)
self.topology = pdb.topology
self.positions = pdb.positions
self.sequences = [Sequence(s.chain_id, s.residues) for s in structure.sequences]
self.modifiedResidues = [ModifiedResidue(r.chain_id, r.number, r.residue_name, r.standard_name) for r in structure.modified_residues]
def _initializeFromPDBx(self, filecontent):
    """Initialize this object by reading a PDBx/mmCIF file.

    Parameters
    ----------
    filecontent : str
        The complete text of the PDBx/mmCIF file.  It is parsed twice: once by
        app.PDBxFile for the topology and positions, and once by PdbxReader for
        the sequence and modified residue records.

    Notes
    -----
    Sets self.topology, self.positions, self.sequences and self.modifiedResidues.
    The 'entity_poly_seq', 'struct_asym' and 'pdbx_struct_mod_residue' categories
    are treated as optional: if a category (or one of the columns needed from it)
    is missing, the corresponding list is simply left empty instead of failing.
    """
    pdbx = app.PDBxFile(StringIO(filecontent))
    self.topology = pdbx.topology
    self.positions = pdbx.positions

    # PDBxFile doesn't record the information about sequence or modified residues, so we need to read them separately.

    reader = PdbxReader(StringIO(filecontent))
    data = []
    reader.read(data)
    block = data[0]

    # Load the sequence data, grouped by entity id.
    # NOTE(review): getObj() is expected to return None for a category that is not
    # present in the file; getAttributeIndex() reports a missing column as -1.

    sequences = {}
    sequenceData = block.getObj('entity_poly_seq')
    if sequenceData is not None:
        entityIdCol = sequenceData.getAttributeIndex('entity_id')
        residueCol = sequenceData.getAttributeIndex('mon_id')
        # Without this check a missing column (-1) would silently read the last field of each row.
        if -1 not in (entityIdCol, residueCol):
            for row in sequenceData.getRowList():
                sequences.setdefault(row[entityIdCol], []).append(row[residueCol])

    # Sequences are stored by "entity".  There could be multiple chains that are all the same entity, so we need to
    # convert from entities to chains.

    self.sequences = []
    asymData = block.getObj('struct_asym')
    if asymData is not None:
        asymIdCol = asymData.getAttributeIndex('id')
        entityIdCol = asymData.getAttributeIndex('entity_id')
        if -1 not in (asymIdCol, entityIdCol):
            for row in asymData.getRowList():
                asymId = row[asymIdCol]
                entityId = row[entityIdCol]
                if entityId in sequences:
                    self.sequences.append(Sequence(asymId, sequences[entityId]))

    # Load the modified residues.

    self.modifiedResidues = []
    modData = block.getObj('pdbx_struct_mod_residue')
    if modData is not None:
        asymIdCol = modData.getAttributeIndex('label_asym_id')
        resNameCol = modData.getAttributeIndex('label_comp_id')
        resNumCol = modData.getAttributeIndex('auth_seq_id')
        standardResCol = modData.getAttributeIndex('parent_comp_id')
        if -1 not in (asymIdCol, resNameCol, resNumCol, standardResCol):
            for row in modData.getRowList():
                self.modifiedResidues.append(ModifiedResidue(row[asymIdCol], int(row[resNumCol]), row[resNameCol], row[standardResCol]))
def _addAtomsToTopology(self, heavyAtomsOnly, omitUnknownMolecules): def _addAtomsToTopology(self, heavyAtomsOnly, omitUnknownMolecules):
"""Create a new Topology in which missing atoms have been added. """Create a new Topology in which missing atoms have been added.
......
...@@ -55,8 +55,12 @@ def startPageCallback(parameters, handler): ...@@ -55,8 +55,12 @@ def startPageCallback(parameters, handler):
global fixer global fixer
if 'type' in parameters: if 'type' in parameters:
if parameters.getfirst('type') == 'local': if parameters.getfirst('type') == 'local':
filename = parameters['pdbfile'].filename
if filename.lower().endswith('.pdbx') or filename.lower().endswith('.cif'):
fixer = PDBFixer(pdbxfile=StringIO(parameters['pdbfile'].value.decode()))
else:
fixer = PDBFixer(pdbfile=parameters['pdbfile'].value.decode().splitlines()) fixer = PDBFixer(pdbfile=parameters['pdbfile'].value.decode().splitlines())
fixer.source = parameters['pdbfile'].filename fixer.source = filename
else: else:
id = parameters.getfirst('pdbid') id = parameters.getfirst('pdbid')
try: try:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment