Merge pull request #107 from peastman/pdbx

[WIP] Added support for loading PDBx/mmCIF files

Merge pull request #107 from peastman/pdbx
[WIP] Added support for loading PDBx/mmCIF files
ccc80f18 · peastman · bbb8cfa8 · eb82172f · ccc80f18 · ccc80f18
Commit ccc80f18 authored Aug 07, 2015 by peastman
Show whitespace changes
Inline Side-by-side

Showing with 135 additions and 41 deletions

Manual.html
+6 -4

pdbfixer/html/addResidues.html
+1 -1

pdbfixer/pdbfixer.py
+120 -33

pdbfixer/ui.py
+8 -3

No files found.
--- a/Manual.html
+++ b/Manual.html
@@ -4,11 +4,11 @@
    </head>
 <body>
 <h1 style="text-align:center">PDBFixer</h1>
-<div style="text-align:center">Copyright 2013-2014 by Peter Eastman and Stanford University</div>
+<div style="text-align:center">Copyright 2013-2015 by Peter Eastman and Stanford University</div>
 <h1>1. Introduction</h1>
-Protein Data Bank (PDB) files often have a number of problems that must be fixed before they can be used in a molecular dynamics simulation.  The details vary depending on how the file was generated.  Here are some of the most common ones:
+Protein Data Bank (PDB or PDBx/mmCIF) files often have a number of problems that must be fixed before they can be used in a molecular dynamics simulation.  The details vary depending on how the file was generated.  Here are some of the most common ones:
 <ol>
    <li>If the structure was generated by X-ray crystallography, most or all of the hydrogen atoms will usually be missing.</li>
@@ -34,7 +34,9 @@ To install PDBFixer, navigate to the root directory of the source distribution y
 This will install the PDBFixer python package as well as the command line program <tt>pdbfixer</tt>.
 <p>
-Before running PDBFixer, you must first install <a href="https://simtk.org/home/openmm">OpenMM</a> 6.0 or later.  Follow the installation instructions in the OpenMM manual.  It is also recommended that you install CUDA or OpenCL, since the performance will usually be faster than when running on the CPU platform.  PDBFixer requires that <a href="http://www.numpy.org">NumPy</a> be installed.
+Before running PDBFixer, you must first install <a href="https://simtk.org/home/openmm">OpenMM</a> 6.3 or later.  Follow the installation instructions in the OpenMM manual.  It is also recommended that you install CUDA or OpenCL, since the performance will usually be faster than when running on the CPU platform.  PDBFixer requires that <a href="http://www.numpy.org">NumPy</a> be installed.
+<p>
+Alternatively, PDBFixer is included as part of the <a href="https://omnia.md">Omnia</a> suite for molecular simulation.  If you install the suite, PDBFixer and its dependencies will be included.
 <h1>3. PDBFixer as a Desktop Application</h1>
@@ -45,7 +47,7 @@ To run PDBFixer as a desktop application, type
 <p>
 on the command line.  PDBFixer displays its user interface through a web browser, but it is still a single-user desktop application.  It should automatically launch a web browser and open a new window displaying the user interface.  If for any reason this does not happen, you can launch a web browser yourself and point it to <a href="http://localhost:8000">http://localhost:8000</a>.
 <p>
-The user interface consists of a series of pages for selecting a PDB file and choosing what changes to make to it.  Depending on the details of a particular file, some of these pages may be skipped.
+The user interface consists of a series of pages for selecting a PDB or PDBx/mmCIF file and choosing what changes to make to it.  Depending on the details of a particular file, some of these pages may be skipped.
 <h3>Load File</h3>

--- a/pdbfixer/html/addResidues.html
+++ b/pdbfixer/html/addResidues.html
-The SEQRES records in this PDB file include residues that are missing from the atom data section.  Do you want to add the missing residues?
+The sequence records in this file include residues that are missing from the atom data section.  Do you want to add the missing residues?
 <p>
 <form id="mainform" method="post" action="/">
 <table border="1" id="table">

--- a/pdbfixer/pdbfixer.py
+++ b/pdbfixer/pdbfixer.py
@@ -36,6 +36,7 @@ import simtk.openmm as mm
 import simtk.openmm.app as app
 import simtk.unit as unit
 from simtk.openmm.app.internal.pdbstructure import PdbStructure
+from simtk.openmm.app.internal.pdbx.reader.PdbxReader import PdbxReader
 from simtk.openmm.app.element import hydrogen, oxygen
 from simtk.openmm.app.forcefield import NonbondedGenerator
 import numpy as np
@@ -47,11 +48,12 @@ import math
 from pkg_resources import resource_filename
-# Imports for urlopen
+try:
-if sys.version_info >= (3,0):
    from urllib.request import urlopen
-else:
+    from io import StringIO
+except:
    from urllib2 import urlopen
+    from cStringIO import StringIO
 substitutions = {
    '2AS':'ASP', '3AH':'HIS', '5HP':'GLU', 'ACL':'ARG', 'AGM':'ARG', 'AIB':'ALA', 'ALM':'ALA', 'ALO':'THR', 'ALY':'LYS', 'ARM':'ARG',
@@ -87,6 +89,26 @@ class ModifiedResidue(object):
        self.residueName = residueName
        self.standardName = standardName
+def _guessFileFormat(file, filename):
+    """Guess whether a file is PDB or PDBx/mmCIF based on its filename and contents.
+    Parameters
+    ----------
+    file : file
+        A seekable file-like object yielding text lines.  It is only read (and then rewound to the start) when the
+        filename does not settle the question.
+    filename : str or None
+        The name or URL the file came from.  May be None or empty if no name is known.
+    Returns
+    -------
+    str
+        'pdbx' if the file looks like PDBx/mmCIF, otherwise 'pdb'.
+    """
+    # A stream or upload may have no name at all; fall through to content sniffing in that case.
+    filename = (filename or '').lower()
+    # Check for the PDBx/mmCIF extensions first, because '.pdb' is a substring of '.pdbx'.
+    if '.pdbx' in filename or '.cif' in filename:
+        return 'pdbx'
+    if '.pdb' in filename:
+        return 'pdb'
+    try:
+        for line in file:
+            if line.startswith(('data_', 'loop_')):
+                return 'pdbx'
+            if line.startswith(('HEADER', 'REMARK', 'TITLE ')):
+                return 'pdb'
+    finally:
+        # Rewind so the caller can parse the file from the beginning.
+        file.seek(0)
+    # It's certainly not a valid PDBx/mmCIF.  Guess that it's a PDB.
+    return 'pdb'
 def _overlayPoints(points1, points2):
    """Given two sets of points, determine the translation and rotation that matches them as closely as possible.
@@ -147,27 +169,33 @@ def _findUnoccupiedDirection(point, positions):
    return direction
 class PDBFixer(object):
-    """PDBFixer implements many tools for fixing problems in PDB files.
+    """PDBFixer implements many tools for fixing problems in PDB and PDBx/mmCIF files.
    """
-    def __init__(self, filename=None, pdbfile=None, url=None, pdbid=None):
+    def __init__(self, filename=None, pdbfile=None, pdbxfile=None, url=None, pdbid=None):
-        """Create a new PDBFixer instance to fix problems in a PDB file.
+        """Create a new PDBFixer instance to fix problems in a PDB or PDBx/mmCIF file.
        Parameters
        ----------
        filename : str, optional, default=None
-            A filename specifying the file from which the PDB file is to be read.
+            The name of the file to read.  The format is determined automatically based on the filename extension, or if
+            that is ambiguous, by looking at the file content.
        pdbfile : file, optional, default=None
            A file-like object from which the PDB file is to be read.
            The file is not closed after reading.
+        pdbxfile : file, optional, default=None
+            A file-like object from which the PDBx/mmCIF file is to be read.
+            The file is not closed after reading.
        url : str, optional, default=None
-            A URL specifying the internet location from which the PDB file contents should be retrieved.
+            A URL specifying the internet location from which the file contents should be retrieved.  The format is
+            determined automatically by looking for a filename extension in the URL, or if that is ambiguous, by looking
+            at the file content.
        pdbid : str, optional, default=None
            A four-letter PDB code specifying the structure to be retrieved from the RCSB.
        Notes
        -----
-        Only one of structure, filename, pdbfile, url, or pdbid may be specified or an exception will be thrown.
+        Only one of structure, filename, pdbfile, pdbxfile, url, or pdbid may be specified or an exception will be thrown.
        Examples
        --------
@@ -193,47 +221,45 @@ class PDBFixer(object):
        """
        # Check to make sure only one option has been specified.
-        if bool(filename) + bool(pdbfile) + bool(url) + bool(pdbid) != 1:
+        if bool(filename) + bool(pdbfile) + bool(pdbxfile) + bool(url) + bool(pdbid) != 1:
-            raise Exception("Exactly one option [filename, pdbfile, url, pdbid] must be specified.")
+            raise Exception("Exactly one option [filename, pdbfile, pdbxfile, url, pdbid] must be specified.")
        self.source = None
+        if pdbid:
+            # A PDB id has been specified.
+            url = 'http://www.rcsb.org/pdb/files/%s.pdb' % pdbid
        if filename:
-            self.source = filename
            # A local file has been specified.
+            self.source = filename
            file = open(filename, 'r')
-            structure = PdbStructure(file)
+            if _guessFileFormat(file, filename) == 'pdbx':
+                self._initializeFromPDBx(file.read())
+            else:
+                self._initializeFromPDB(file)
            file.close()
        elif pdbfile:
            # A file-like object has been specified.
-            structure = PdbStructure(pdbfile)  
+            self._initializeFromPDB(pdbfile)
+        elif pdbxfile:
+            # A file-like object has been specified.
+            self._initializeFromPDBx(pdbxfile)
        elif url:
-            self.source = url
            # A URL has been specified.
-            file = urlopen(url)
-            structure = PdbStructure(file)
-            file.close()
-        elif pdbid:
-            # A PDB id has been specified.
-            url = 'http://www.rcsb.org/pdb/files/%s.pdb' % pdbid
            self.source = url
            file = urlopen(url)
-            # Read contents all at once and split into lines, since urlopen doesn't like it when we read one line at a time over the network.
            contents = file.read().decode('utf-8')
-            lines = contents.split('\n')
            file.close()
-            structure = PdbStructure(lines)
+            file = StringIO(contents)
+            if _guessFileFormat(file, url) == 'pdbx':
+                self._initializeFromPDBx(contents)
+            else:
+                self._initializeFromPDB(StringIO(contents))
        # Check the structure has some atoms in it.
-        atoms = list(structure.iter_atoms())
+        atoms = list(self.topology.atoms())
-        if len(atoms)==0:
+        if len(atoms) == 0:
            raise Exception("Structure contains no atoms.")
-        pdb = app.PDBFile(structure)
-        self.topology = pdb.topology
-        self.positions = pdb.positions
-        self.sequences = [Sequence(s.chain_id, s.residues) for s in structure.sequences]
-        self.modifiedResidues = [ModifiedResidue(r.chain_id, r.number, r.residue_name, r.standard_name) for r in structure.modified_residues]
        # Load the templates.
        self.templates = {}
@@ -243,7 +269,68 @@ class PDBFixer(object):
            name = next(templatePdb.topology.residues()).name
            self.templates[name] = templatePdb
-        return
+    def _initializeFromPDB(self, file):
+        """Populate this fixer from a PDB file.
+        The input is parsed into a PdbStructure, which supplies the topology and positions (via PDBFile) as well
+        as the sequence and modified-residue records.
+        """
+        parsed = PdbStructure(file)
+        pdbFile = app.PDBFile(parsed)
+        # Collect the per-chain sequence records.
+        chainSequences = []
+        for seq in parsed.sequences:
+            chainSequences.append(Sequence(seq.chain_id, seq.residues))
+        # Collect the modified-residue records.
+        modified = []
+        for mod in parsed.modified_residues:
+            modified.append(ModifiedResidue(mod.chain_id, mod.number, mod.residue_name, mod.standard_name))
+        self.topology = pdbFile.topology
+        self.positions = pdbFile.positions
+        self.sequences = chainSequences
+        self.modifiedResidues = modified
+    def _initializeFromPDBx(self, file):
+        """Initialize this object by reading a PDBx/mmCIF file.
+        Parameters
+        ----------
+        file : file or str
+            A seekable file-like object containing the PDBx/mmCIF data, or a string holding the complete file
+            contents (not a filename).
+        """
+        if not hasattr(file, 'read'):
+            # Some callers pass the file contents as a string.  Wrap it so it can be read and rewound like a file.
+            file = StringIO(file)
+        pdbx = app.PDBxFile(file)
+        self.topology = pdbx.topology
+        self.positions = pdbx.positions
+        # PDBxFile doesn't record the information about sequence or modified residues, so we need to read them separately.
+        file.seek(0)
+        reader = PdbxReader(file)
+        data = []
+        reader.read(data)
+        block = data[0]
+        # Load the sequence data.  getObj() is expected to return None for a category the file does not contain,
+        # and getAttributeIndex() to return -1 for a missing column, so guard against both.
+        sequences = {}
+        sequenceData = block.getObj('entity_poly_seq')
+        if sequenceData is not None:
+            entityIdCol = sequenceData.getAttributeIndex('entity_id')
+            residueCol = sequenceData.getAttributeIndex('mon_id')
+            if -1 not in (entityIdCol, residueCol):
+                for row in sequenceData.getRowList():
+                    sequences.setdefault(row[entityIdCol], []).append(row[residueCol])
+        # Sequences are stored by "entity".  There could be multiple chains that are all the same entity, so we need to
+        # convert from entities to chains.
+        self.sequences = []
+        asymData = block.getObj('struct_asym')
+        if asymData is not None:
+            asymIdCol = asymData.getAttributeIndex('id')
+            entityIdCol = asymData.getAttributeIndex('entity_id')
+            if -1 not in (asymIdCol, entityIdCol):
+                for row in asymData.getRowList():
+                    entityId = row[entityIdCol]
+                    if entityId in sequences:
+                        self.sequences.append(Sequence(row[asymIdCol], sequences[entityId]))
+        # Load the modified residues.  Files without any modified residues have no such category.
+        self.modifiedResidues = []
+        modData = block.getObj('pdbx_struct_mod_residue')
+        if modData is not None:
+            asymIdCol = modData.getAttributeIndex('label_asym_id')
+            resNameCol = modData.getAttributeIndex('label_comp_id')
+            resNumCol = modData.getAttributeIndex('auth_seq_id')
+            standardResCol = modData.getAttributeIndex('parent_comp_id')
+            if -1 not in (asymIdCol, resNameCol, resNumCol, standardResCol):
+                for row in modData.getRowList():
+                    self.modifiedResidues.append(ModifiedResidue(row[asymIdCol], int(row[resNumCol]), row[resNameCol], row[standardResCol]))
    def _addAtomsToTopology(self, heavyAtomsOnly, omitUnknownMolecules):
        """Create a new Topology in which missing atoms have been added.

--- a/pdbfixer/ui.py
+++ b/pdbfixer/ui.py
@@ -7,7 +7,7 @@ import time
 import simtk.openmm.app as app
 import simtk.unit as unit
-from .pdbfixer import PDBFixer, proteinResidues, dnaResidues, rnaResidues
+from .pdbfixer import PDBFixer, proteinResidues, dnaResidues, rnaResidues, _guessFileFormat
 from . import uiserver
 try:
@@ -55,8 +55,13 @@ def startPageCallback(parameters, handler):
    global fixer
    if 'type' in parameters:
        if parameters.getfirst('type') == 'local':
-            fixer = PDBFixer(pdbfile=parameters['pdbfile'].value.decode().splitlines())
+            filename = parameters['pdbfile'].filename
-            fixer.source = parameters['pdbfile'].filename
+            file = StringIO(parameters['pdbfile'].value.decode())
+            if _guessFileFormat(file, filename) == 'pdbx':
+                fixer = PDBFixer(pdbxfile=file)
+            else:
+                fixer = PDBFixer(pdbfile=file)
+            fixer.source = filename
        else:
            id = parameters.getfirst('pdbid')
            try: