Added support for loading PDBx/mmCIF files

82ce81d0 · Peter Eastman · bbb8cfa8 · 82ce81d0 · 82ce81d0 · 82ce81d0
Commit 82ce81d0 authored Aug 06, 2015 by Peter Eastman
Show whitespace changes
Inline Side-by-side

Showing with 105 additions and 35 deletions

pdbfixer/html/addResidues.html
+1 -1

pdbfixer/pdbfixer.py
+99 -33

pdbfixer/ui.py
+5 -1

No files found.
--- a/pdbfixer/html/addResidues.html
+++ b/pdbfixer/html/addResidues.html
-The SEQRES records in this PDB file include residues that are missing from the atom data section.  Do you want to add the missing residues?
+The sequence records in this PDB file include residues that are missing from the atom data section.  Do you want to add the missing residues?
 <p>
 <form id="mainform" method="post" action="/">
 <table border="1" id="table">

--- a/pdbfixer/pdbfixer.py
+++ b/pdbfixer/pdbfixer.py
@@ -36,6 +36,7 @@ import simtk.openmm as mm
 import simtk.openmm.app as app
 import simtk.unit as unit
 from simtk.openmm.app.internal.pdbstructure import PdbStructure
+from simtk.openmm.app.internal.pdbx.reader.PdbxReader import PdbxReader
 from simtk.openmm.app.element import hydrogen, oxygen
 from simtk.openmm.app.forcefield import NonbondedGenerator
 import numpy as np
@@ -47,11 +48,12 @@ import math
 from pkg_resources import resource_filename
-# Imports for urlopen
+try:
-if sys.version_info >= (3,0):
    from urllib.request import urlopen
-else:
+    from io import StringIO
+except:
    from urllib2 import urlopen
+    from cStringIO import StringIO
 substitutions = {
    '2AS':'ASP', '3AH':'HIS', '5HP':'GLU', 'ACL':'ARG', 'AGM':'ARG', 'AIB':'ALA', 'ALM':'ALA', 'ALO':'THR', 'ALY':'LYS', 'ARM':'ARG',
@@ -147,27 +149,34 @@ def _findUnoccupiedDirection(point, positions):
    return direction
 class PDBFixer(object):
-    """PDBFixer implements many tools for fixing problems in PDB files.
+    """PDBFixer implements many tools for fixing problems in PDB and PDBx/mmCIF files.
    """
-    def __init__(self, filename=None, pdbfile=None, url=None, pdbid=None):
+    def __init__(self, filename=None, pdbfile=None, pdbxfile=None, url=None, pdbid=None):
-        """Create a new PDBFixer instance to fix problems in a PDB file.
+        """Create a new PDBFixer instance to fix problems in a PDB or PDBx/mmCIF file.
        Parameters
        ----------
        filename : str, optional, default=None
-            A filename specifying the file from which the PDB file is to be read.
+            The name of the file to read.  The format is determined automatically based on the filename extension.  If
+            it ends in either ".pdbx" or ".cif", it is assumed to be a PDBx/mmCIF file.  Otherwise, it is assumed to be
+            a PDB file.
        pdbfile : file, optional, default=None
            A file-like object from which the PDB file is to be read.
            The file is not closed after reading.
+        pdbxfile : file, optional, default=None
+            A file-like object from which the PDBx/mmCIF file is to be read.
+            The file is not closed after reading.
        url : str, optional, default=None
-            A URL specifying the internet location from which the PDB file contents should be retrieved.
+            A URL specifying the internet location from which the file contents should be retrieved.  The format is
+            determined automatically by looking for a filename extension.  If the URL contains either ".pdbx" or ".cif",
+            it is assumed to be a PDBx/mmCIF file.  Otherwise, it is assumed to be a PDB file.
        pdbid : str, optional, default=None
            A four-letter PDB code specifying the structure to be retrieved from the RCSB.
        Notes
        -----
-        Only one of structure, filename, pdbfile, url, or pdbid may be specified or an exception will be thrown.
+        Exactly one of filename, pdbfile, pdbxfile, url, or pdbid must be specified or an exception will be thrown.
        Examples
        --------
@@ -193,47 +202,44 @@ class PDBFixer(object):
        """
        # Check to make sure only one option has been specified.
-        if bool(filename) + bool(pdbfile) + bool(url) + bool(pdbid) != 1:
+        if bool(filename) + bool(pdbfile) + bool(pdbxfile) + bool(url) + bool(pdbid) != 1:
-            raise Exception("Exactly one option [filename, pdbfile, url, pdbid] must be specified.")
+            raise Exception("Exactly one option [filename, pdbfile, pdbxfile, url, pdbid] must be specified.")
        self.source = None
+        if pdbid:
+            # A PDB id has been specified.
+            url = 'http://www.rcsb.org/pdb/files/%s.pdb' % pdbid
        if filename:
-            self.source = filename
            # A local file has been specified.
+            self.source = filename
            file = open(filename, 'r')
-            structure = PdbStructure(file)
+            if filename.lower().endswith('.pdbx') or filename.lower().endswith('.cif'):
+                self._initializeFromPDBx(file.read())
+            else:
+                self._initializeFromPDB(file)
            file.close()
        elif pdbfile:
            # A file-like object has been specified.
-            structure = PdbStructure(pdbfile)  
+            self._initializeFromPDB(pdbfile)
+        elif pdbxfile:
+            # A file-like object has been specified.
+            self._initializeFromPDBx(pdbxfile.read())
        elif url:
-            self.source = url
            # A URL has been specified.
-            file = urlopen(url)
-            structure = PdbStructure(file)
-            file.close()
-        elif pdbid:
-            # A PDB id has been specified.
-            url = 'http://www.rcsb.org/pdb/files/%s.pdb' % pdbid
            self.source = url
            file = urlopen(url)
-            # Read contents all at once and split into lines, since urlopen doesn't like it when we read one line at a time over the network.
            contents = file.read().decode('utf-8')
-            lines = contents.split('\n')
            file.close()
-            structure = PdbStructure(lines)
+            if '.pdbx' in url.lower() or '.cif' in url.lower():
+                self._initializeFromPDBx(contents)
+            else:
+                self._initializeFromPDB(StringIO(contents))
        # Check the structure has some atoms in it.
-        atoms = list(structure.iter_atoms())
+        atoms = list(self.topology.atoms())
-        if len(atoms)==0:
+        if len(atoms) == 0:
            raise Exception("Structure contains no atoms.")
-        pdb = app.PDBFile(structure)
-        self.topology = pdb.topology
-        self.positions = pdb.positions
-        self.sequences = [Sequence(s.chain_id, s.residues) for s in structure.sequences]
-        self.modifiedResidues = [ModifiedResidue(r.chain_id, r.number, r.residue_name, r.standard_name) for r in structure.modified_residues]
        # Load the templates.
        self.templates = {}
@@ -243,7 +249,67 @@ class PDBFixer(object):
            name = next(templatePdb.topology.residues()).name
            self.templates[name] = templatePdb
-        return
+    def _initializeFromPDB(self, file):
+        """Populate this object from the contents of a PDB file.
+
+        The argument is handed directly to PdbStructure.  The resulting structure supplies the topology and
+        positions (via PDBFile) as well as the sequence and modified residue records.
+        """
+        parsed = PdbStructure(file)
+        pdbFile = app.PDBFile(parsed)
+        self.topology = pdbFile.topology
+        self.positions = pdbFile.positions
+        # Build one Sequence per sequence record found in the structure.
+        chainSequences = []
+        for seq in parsed.sequences:
+            chainSequences.append(Sequence(seq.chain_id, seq.residues))
+        self.sequences = chainSequences
+        # Build one ModifiedResidue per modified residue record found in the structure.
+        modified = []
+        for res in parsed.modified_residues:
+            modified.append(ModifiedResidue(res.chain_id, res.number, res.residue_name, res.standard_name))
+        self.modifiedResidues = modified
+    def _initializeFromPDBx(self, filecontent):
+        """Initialize this object by reading a PDBx/mmCIF file.
+
+        Parameters
+        ----------
+        filecontent : str
+            The complete text of the PDBx/mmCIF file.  It is parsed twice: once by PDBxFile to obtain the topology
+            and positions, and once by PdbxReader to obtain the sequence and modified residue records.
+
+        Notes
+        -----
+        The 'entity_poly_seq', 'struct_asym' and 'pdbx_struct_mod_residue' categories are all treated as optional.
+        If a category is absent, or lacks one of the columns read here, the corresponding list is left empty
+        instead of raising an error.
+        """
+        pdbx = app.PDBxFile(StringIO(filecontent))
+        self.topology = pdbx.topology
+        self.positions = pdbx.positions
+        self.sequences = []
+        self.modifiedResidues = []
+
+        # PDBxFile doesn't record the information about sequence or modified residues, so we need to read them separately.
+
+        reader = PdbxReader(StringIO(filecontent))
+        data = []
+        reader.read(data)
+        block = data[0]
+
+        # Load the sequence data.  getObj() returns None when the category is not present in the file, and
+        # getAttributeIndex() returns -1 when a column is not present, so both must be checked before use.
+
+        sequences = {}
+        sequenceData = block.getObj('entity_poly_seq')
+        if sequenceData is not None:
+            entityIdCol = sequenceData.getAttributeIndex('entity_id')
+            residueCol = sequenceData.getAttributeIndex('mon_id')
+            if -1 not in (entityIdCol, residueCol):
+                for row in sequenceData.getRowList():
+                    sequences.setdefault(row[entityIdCol], []).append(row[residueCol])
+
+        # Sequences are stored by "entity".  There could be multiple chains that are all the same entity, so we need to
+        # convert from entities to chains.
+
+        asymData = block.getObj('struct_asym')
+        if asymData is not None:
+            asymIdCol = asymData.getAttributeIndex('id')
+            entityIdCol = asymData.getAttributeIndex('entity_id')
+            if -1 not in (asymIdCol, entityIdCol):
+                for row in asymData.getRowList():
+                    entityId = row[entityIdCol]
+                    if entityId in sequences:
+                        self.sequences.append(Sequence(row[asymIdCol], sequences[entityId]))
+
+        # Load the modified residues.
+
+        modData = block.getObj('pdbx_struct_mod_residue')
+        if modData is not None:
+            asymIdCol = modData.getAttributeIndex('label_asym_id')
+            resNameCol = modData.getAttributeIndex('label_comp_id')
+            resNumCol = modData.getAttributeIndex('auth_seq_id')
+            standardResCol = modData.getAttributeIndex('parent_comp_id')
+            if -1 not in (asymIdCol, resNameCol, resNumCol, standardResCol):
+                for row in modData.getRowList():
+                    self.modifiedResidues.append(ModifiedResidue(row[asymIdCol], int(row[resNumCol]), row[resNameCol], row[standardResCol]))
    def _addAtomsToTopology(self, heavyAtomsOnly, omitUnknownMolecules):
        """Create a new Topology in which missing atoms have been added.

--- a/pdbfixer/ui.py
+++ b/pdbfixer/ui.py
@@ -55,8 +55,12 @@ def startPageCallback(parameters, handler):
    global fixer
    if 'type' in parameters:
        if parameters.getfirst('type') == 'local':
+            filename = parameters['pdbfile'].filename
+            if filename.lower().endswith('.pdbx') or filename.lower().endswith('.cif'):
+                fixer = PDBFixer(pdbxfile=StringIO(parameters['pdbfile'].value.decode()))
+            else:
                fixer = PDBFixer(pdbfile=parameters['pdbfile'].value.decode().splitlines())
-            fixer.source = parameters['pdbfile'].filename
+            fixer.source = filename
        else:
            id = parameters.getfirst('pdbid')
            try: