#!/bin/python
""" PDBnet is a collection of Python objects intended to model and contain
PDB protein data.
PDBnet Copyright (C) 2012 Christian Blouin
Contributions by Alex Safatli and Jose Sergio Hleap
Major Refactoring (2014) done by Alex Safatli and Jose Sergio Hleap
E-mail: cblouin@cs.dal.ca, safatli@cs.dal.ca, jshleap@dal.ca
Dependencies: Scipy, BioPython, FASTAnet (contained in LabBlouinTools) """
from mmap import mmap as memoryMappedFile
import numpy as np
import sys, FASTAnet, math
from math import sqrt
from os.path import getsize as sizeOfFile
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from scipy.spatial.distance import cdist
# Metadata
__author__ = ['Christian Blouin','Jose Sergio Hleap','Alexander Safatli']
__version__ = '1.1.0'
# Constants
PDB_LARGE_FILE_SIZE = 100000000 #bytes
# Classes
[docs]class PDBatom(object):
""" ATOM in a PDB protein structure. """
__slots__ = ('name','serial','x','y','z','occupancy','tempFactor',
'charge','symbol','parent')
def __init__(self,serial,name,x,y,z,oc,b,symbol,charge):
""" Construct an atom in a PDB protein structure. """
self.name = name
self.serial = serial
self.x = x
self.y = y
self.z = z
self.occupancy = oc
self.tempFactor = b
self.charge = charge
self.symbol = symbol
self.parent = None
[docs] def fixname(self):
""" Ensures the name of this atom fits within 4 characters. """
if len(self.name) == 4: return self.name
else: return ' '+self.name.ljust(3)
def __str__(self):
""" Express this atom as a single line in a PDB file (see PDB format). """
return "ATOM %5s %4s %3s %1s%4s %8.3f%8.3f%8.3f%6.2f%6.2f %2s%2s" \
% (self.serial,self.fixname(),self.parent.name,self.parent.chain,
self.parent.index,self.x,self.y,self.z,self.occupancy,
self.tempFactor,self.symbol,self.charge)
[docs] def GetPosition(self):
""" Get the 3-dimensional coordinate of this atom in space. """
return (self.x,self.y,self.z)
[docs] def DistanceTo(self, atom):
""" Acquire the distance from this atom to another atom. """
return ((self.x-atom.x)**2+(self.y-atom.y)**2+(self.z-atom.z)**2)**0.5
[docs]class PDBterminator(PDBatom):
""" A placeholder class that represents a terminating ATOM-like line in the PDB file. """
__slots__ = ('name','serial','x','y','z','occupancy','tempFactor',
'charge','symbol','parent','lastatom','lastresname',
'lastreschain','lastresind')
def __init__(self,chaininst):
self.lastatom = chaininst.GetAtoms()[-1]
self.lastresname = self.lastatom.parent.name
self.lastreschain = self.lastatom.parent.chain
self.lastresind = self.lastatom.parent.index
newserial = str(int(self.lastatom.serial) + 1)
super(PDBterminator,self).__init__(newserial,'',0,0,0,0,0,'','')
def __str__(self):
return 'TER %5s %4s %3s %1s%4s' % (
self.serial,'',self.lastresname,self.lastreschain,self.lastresind)
[docs]class PDBresidue:
""" A residue (collection of ATOM fields) in a PDB protein structure. """
__slots__ = ('index','name','atoms','atomsOrdered','contacts','centroid',
'chain','model')
def __init__(self,index=None,name=''):
""" Construct a residue (collection of atoms) in a PDB protein structure. """
self.index = index
self.name = name
self.atoms = {}
self.atomsOrdered = []
self.contacts = []
self.centroid = None
self.chain = ''
self.model = None
[docs] def GetAtoms(self): return [x for x in self.atomsOrdered]
[docs] def AddAtom(self, atom):
""" Add a PDBatom structure to this residue. """
self.atoms[atom.name.strip()] = atom
self.atomsOrdered.append(atom)
atom.parent = self
[docs] def GetCA(self):
""" Get the alpha-carbon found in this residue as a PDBatom. """
return self.atoms['CA']
[docs] def Centroid(self):
""" Calculate the centroid of this residue. Return this as a PDBatom. """
x = 0.0
y = 0.0
z = 0.0
for atom in self.atoms:
if atom in ['C','N','O']: continue
x += self.atoms[atom].x
y += self.atoms[atom].y
z += self.atoms[atom].z
div = float(len(self.atoms)-3)
x /= div
y /= div
z /= div
self.centroid = PDBatom(None,'centroid',x,y,z,0,0,'','')
self.centroid.parent = self
return self.centroid
def __int__(self): return int(self.index)
def __str__(self): return '\n'.join([str(x) for x in self.atomsOrdered])
[docs]class PDBchain(object):
""" A PDB chain (collection of protein residues). """
__slots__ = ('name','structure','residues','indices','parent')
def __init__(self,name):
self.name = name
self.residues = list()
self.indices = list()
self.structure = None
self.parent = None
[docs] def GetResidues(self):
""" Acquire all of the residues in this chain. Note that if this
is a large file, this will force the object to load all of these
residues into memory from the file. """
return [self.GetResidueByIndex(i) for i in self.GetIndices()]
[docs] def IterResidues(self):
""" Iteratively yield all of the residues in this chain. Note that if this
is a large file, this will force the object to load all of these
residues into memory from the file. """
for i in self.GetIndices(): yield self.GetResidueByIndex(i)
[docs] def GetResidueByIndex(self,index):
""" Alias for acquiring a residue from this instance using [] operator. """
return self.__getitem__(index)
[docs] def GetAtoms(self):
""" Get all comprising atoms in this chain in order of residues. """
atoms = []
for x in self.IterResidues():
atoms.extend(x.GetAtoms())
return atoms
[docs] def GetIndices(self):
""" Acquire all of the indices for residues present in this chain. """
return [int(x) for x in self.indices]
[docs] def AddIndexOfResidue(self,index):
""" Add an index for a residue to this chain. Will generate own residue
information if called upon (forces lazy evaluation). """
self.indices.append(str(index))
[docs] def AddResidue(self,resid):
""" Add a residue to this chain. """
resid.chain = self.name
self.residues.append(resid)
self.indices.append(str(resid.index))
[docs] def AddResidueByIndex(self,index):
""" Add a residue to this chain by its index in the PDB. The residue
object will automatically be constructed from the file. """
self.indices.append(str(index))
resid = self.GetResidueByIndex(index)
self.residues.append(resid)
[docs] def RemoveResidue(self,resid):
""" Remove a residue from this chain. """
return self.pop(resid)
[docs] def GetPrimaryPropertiesFromBioPython(self):
""" Use BioPython to populate this class with attributes. """
props=PA(self.AsFASTA())
self.amino_acid_composition= props.get_amino_acids_percent()
self.molecular_weight= props.molecular_weight()
self.aromaticity= props.aromaticity()
self.instability_index= props.instability_index()
self.flexibility= props.flexibility()
self.isoelectric_point=props.isoelectric_point()
self.secondary_structure_fraction=props.secondary_structure_fraction()
return props
[docs] def AsFASTA(self):
""" Return the string representing this chain's FASTA sequence. """
return ''.join([aa[item.name] for item in self.IterResidues()])
[docs] def WriteAsPDB(self,filename):
""" Write this single chain to a file as a PDB. """
fh = open(filename,'w')
fh.write(str(self))
fh.close()
[docs] def SortByNumericalIndices(self):
""" Sort all internal items by a numerical index. """
if not self.structure.handle.isLargeFile():
self.residues = sorted(self.residues,key=lambda d: int(d.index))
self.indices = sorted(self.indices, key=lambda d: int(d))
[docs] def pop(self,s):
""" Pop a residue. """
res = self.GetResidueByIndex(s)
if not self.structure.handle.isLargeFile():
self.residues.remove(res)
self.indices.remove(str(s))
return res
[docs] def update(self,o):
""" Add residues from another chain. """
for res in o: self.AddResidue(res)
def __str__(self):
get = lambda d: self.GetResidueByIndex(d)
out = '\n'.join([str(get(x)) for x in self.indices])
out += '\n%s\n' % (str(PDBterminator(self)))
return out
def __getitem__(self,i):
""" Force the object to load the residue if not present
in this structure. """
if self.parent == None: parent = -1
else: parent = self.parent.name
handle = self.structure.handle
if str(i) in self.indices:
if handle and ((handle.isLargeFile()) or (len(self.indices) > len(self.residues))):
if handle.hasResidue(self.name,str(i),parent):
resid = handle.readResidue(self.name,str(i),parent)
resid.chain = self.name
return resid
elif self.indices.index(str(i)) < len(self.residues):
return self.residues[self.indices.index(str(i))]
else:
raise KeyError('Could not find residue %s in %s:%s in file or structure.' % (
str(i),str(self.name),str(parent)))
return self.residues[self.indices.index(str(i))]
else: return None
def __len__(self): return len(self.indices)
def __iter__(self):
for ind in self.indices: yield int(ind)
[docs]class PDBmodel(PDBchain):
""" A PDB model (a special kind of chain). """
__slots__ = ('name','structure','residues','indices','chains','chainNames')
def __init__(self,name):
if type(name) != int: raise TypeError('Model names should be integers.')
elif name < 0: raise ValueError('Model name should be zero or a positive integer.')
super(PDBmodel,self).__init__(name)
self.chains = list()
self.chainNames = list()
[docs] def GetResidues(self):
""" Acquire all of the residues in this model. Note that if this
is a large file, this will force the object to load all of these
residues into memory from the file. """
this = [self.GetResidueByIndex(i) for i in self.GetIndices()]
for it in self.chains: this += it.GetResidues()
return this
[docs] def IterResidues(self):
""" Iteratively yield all of the residues in this model. Note that if this
is a large file, this will force the object to load all of these
residues into memory from the file. """
for i in self.GetIndices(): yield self.GetResidueByIndex(i)
for i in self.chains:
for j in i.IterResidues(): yield j
[docs] def GetChains(self): return self.chains
[docs] def GetChain(self,name):
return self.GetChainByName(name)
[docs] def GetChainByName(self,name):
for x in xrange(len(self.chainNames)):
if self.chainNames[x] == name: return self.chains[x]
[docs] def GetChainNames(self): return self.chainNames
[docs] def NewChain(self,ch):
""" Create a new PDBchain, add it to this model, and return it. """
c = PDBchain(ch)
self.AddChain(c)
return c
[docs] def AddChain(self,chain):
""" Add a PDBchain (chain) to this model. """
chain.structure = self.structure
chain.parent = self
self.chains.append(chain)
self.chainNames.append(chain.name)
[docs] def AddResidue(self,resid):
""" Add a residue to this model. """
resid.model = self.name
self.residues.append(resid)
self.indices.append(str(resid.index))
def __getitem__(self, i):
handle = self.structure.handle
if str(i) in self.indices:
if (handle and handle.isLargeFile()) or len(self.indices) > len(self.residues):
resid = handle.readResidue('',str(i),self.name)
resid.model = self.name
return resid
return self.residues[self.indices.index(str(i))]
else: return None
def __len__(self):
l = len(self.indices)
for c in self.GetChains():
l += len(c)
return l
def __str__(self):
model = 'MODEL%9s\n' % (str(self.name)) + super(
PDBmodel,self).__str__()
for ch in self.chains: model += str(ch)
return model + 'ENDMDL\n'
[docs]class PDBstructure(object):
""" A PDB protein structure (a collection of chains/models). """
__slots__ = ('chains','orderofchains','models','orderofmodels','remarks',
'filepath','organism','taxid','mutation','contactmap','handle',
'ismodel')
def __init__(self, filein=''):
# Attributes.
self.filepath = filein
self.handle = None
self.organism = None
self.taxid = None
self.mutation = False
self.contactmap = list()
self.orderofchains = list()
self.orderofmodels = list()
self.remarks = list()
self.chains = dict()
self.models = dict()
# Read a file?
if filein != '':
self.ReadFile(filein)
# Accesssors/Mutators.
[docs] def GetChainNames(self): return self.orderofchains
[docs] def GetModelNames(self): return self.orderofmodels
[docs] def GetChain(self,ch):
""" Get a chain by name. """
if ch in self.chains: return self.chains[ch]
return None
[docs] def GetModel(self,mod):
""" Get a model by name. """
if mod in self.models: return self.models[mod]
return None
[docs] def NewChain(self,name):
""" Construct and add a new chain by name to the PDB. Returns the chain. """
p = PDBchain(name)
self.AddChain(name,p)
return p
[docs] def NewModel(self,name):
""" Construct and add a new model by name to the PDB. Returns the model. """
p = PDBmodel(name)
self.AddModel(name,p)
return p
[docs] def RemoveChain(self,name):
""" Remove a chain from the structure (by name). Returns the chain. """
c = self.GetChain(name)
if not name in self.chains:
raise KeyError('Chain does not exist in structure by that name!')
del self.chains[name]
self.orderofchains.remove(name)
return c
[docs] def RemoveModel(self,name):
""" Remove a model from the structure (by name). Returns the chain. """
m = self.GetModel(name)
if not name in self.models:
raise KeyError('Model does not exist in structure by that name!')
del self.models[name]
self.orderofmodels.remove(name)
return m
[docs] def AddChain(self,chainname,chain):
""" Add a chain as a list of residues to the PDB. """
# Do chain addition operations.
if chainname in self.chains:
raise KeyError('Chain already exists in structure by that name!')
if type(chain) != PDBchain and type(chain) == PDBmodel:
# Cast into a PDBchain.
cast = PDBchain(chainname)
for i in chain: cast.AddResidue(chain[i])
chain = cast
self.chains[chainname] = chain
self.orderofchains.append(chainname)
chain.structure = self
[docs] def AddModel(self,modelname,model):
""" Add a model as a list of residues to the PDB. """
# Do chain addition operations.
if model in self.models:
raise KeyError('Model already exists in structure by that name!')
if type(model) != PDBmodel and type(model) == PDBchain:
# Cast into a PDBmodel.
cast = PDBmodel(modelname)
for i in model: cast.AddResidue(model[i])
model = cast
model.name = modelname
self.models[modelname] = model
self.orderofmodels.append(modelname)
model.structure = self
return model
[docs] def AddResidueToChain(self, chain, res):
""" Add a residue to a chain. Deprecated; use chain class function. """
ch = self.GetChain(chain)
if ch != None: ch.AddResidue(res)
[docs] def AddResidueToModel(self, model, res):
""" Add a residue to a model. Deprecated; use model class function. """
mo = self.GetModel(model)
if mo != None: mo.AddResidue(res)
[docs] def CheckComplete(self):
""" For every chain, check to see if every residue is complete (see
aa_list dictionary). """
for ch in self.orderofchains:
for re in self.chains[ch]:
res = self.chains[ch][re]
nam = res.name
if not nam in aa_lists: continue
check = set(set(aa_lists[nam])).difference(res.atoms)
if len(check) > 0:
return res
for ch in self.orderofmodels:
for res in self.models[ch]:
nam = res.name
if not nam in aa_lists: continue
check = set(set(aa_lists[nam])).difference(res.atoms)
if len(check) > 0:
return res
return True
# I/O Functionality.
def _pymol(self):
# Start pymol if not started.
import pymol
pymol.finish_launching()
return pymol
[docs] def view(self,istrajectory=False):
""" View the structure in a Pymol window. Requires an installation of Pymol. """
pym = self._pymol()
pym.cmd.read_pdbstr(str(self),'PDBnet')
if len(self.models) > 0 and not istrajectory:
pym.cmd.set('all_states','on')
elif istrajectory: pass # SERGIO
[docs] def ViewStructure(self): self.view()
[docs] def WriteFile(self, filename):
""" Write this PDB structure as a single PDB file. """
fh = open(filename,'w')
fh.write(str(self))
fh.close()
[docs] def read(self,filename):
""" Alias for ReadFile(). """
self.ReadFile(filename)
[docs] def ReadFile(self, filename):
""" Read a PDB file. Populate this PDBstructure. """
# Map to memory and read the file.
p = PDBfile(filename)
self.handle = p
self.remarks, self.organism, self.taxid, self.mutation = p.read()
largeFile = p.isLargeFile()
# Start creating high-level structures for models/chains.
if p.hasModels():
self.ismodel = True
map(self.NewModel,p.getModelNames())
for mo, ch, re in p.iterResidueData():
model = self.GetModel(mo)
if ch != '':
if not ch in model.GetChainNames(): model.NewChain(ch)
chain = model.GetChain(ch)
if largeFile: chain.AddIndexOfResidue(re)
else: chain.AddResidueByIndex(re)
else:
if largeFile: model.AddIndexOfResidue(re)
else: model.AddResidueByIndex(re)
for mod in self.GetModelNames():
model = self.GetModel(mod)
model.structure = self
model.SortByNumericalIndices()
for chain in model.GetChains():
chain.SortByNumericalIndices()
else:
self.ismodel = False
map(self.NewChain,p.getChainNames())
for _, ch, re in p.iterResidueData():
chain = self.GetChain(ch)
if largeFile: chain.AddIndexOfResidue(re)
else: chain.AddResidueByIndex(re)
for cha in self.GetChainNames():
chain = self.GetChain(cha)
chain.structure = self
chain.SortByNumericalIndices()
# Remove this component from memory if no longer necessary.
if not largeFile:
p.close()
self.handle = None
del p
[docs] def ChainAsFASTA(self, chain):
""" Return the chain as a FASTA. Deprecated; use chain class function. """
ch = self.GetChain(chain)
if ch != None: return ch.AsFASTA()
[docs] def ModelAsFASTA(self, model):
""" Return the model as a FASTA. Deprecated; use chain class function. """
mo = self.GetModel(model)
if mo != None: return mo.AsFASTA()
# Scoring Functionality
def _iterResidueAssociations(self,fasta,chains=None,fast=None):
""" PRIVATE: Use an input FASTA file to determine associations
between residues as they exist between 2+ chains. Constructs a
mapping between chain and sequence and yields all strictly
homologous positions as residues, chain-by-chain."""
ismodel=self.ismodel
if ismodel:
orderofthings = self.orderofmodels
things = self.models
else:
orderofthings = self.orderofchains
things = self.chains
if not chains: chains = orderofthings
chaininds = [orderofthings.index(x) for x in chains]
if len(chaininds) < 2:
raise ValueError('Need more than two chains (gave %d).' % (len(chaininds)))
if fast is None:
fast = FASTAnet.FASTAstructure(fasta,uniqueOnly=False)
seqs = fast.orderedSequences
if len(things) != len(seqs):
raise IOError('FASTA set length not equal to PDB length.')
posv = fast.getStrictlyUngappedPositions(chaininds)
for n in chaininds: # Go chain by chain and yield information.
seq = seqs[n].sequence
ch = orderofthings[n]
matches = self.GetFASTAIndices(things[ch],seq)
residueList = [match for match in matches]
if residueList[0] is None:
nm = seqs[n].name
chn = things[ch].name
raise IOError('No relation found between sequence %s and chain %s.' % (nm,chn))
residueList = sorted(residueList,key=lambda d:d.fstindex)
for pos in posv: # Go position by position.
yield pos, ch, residueList.pop(0) # Yield position, chain name, and residue.
[docs] def tmscore(self,fasta,chains=None,native=None,CA=True):
""" Get the TMscore between two chains. Requires a
FASTA alignment and a value for the length of the
native structure (e.g., for a pairwise alignment,
the length of the structure used as a reference
before alignment was done). The latter is computed
by assuming the first of both provided chains is the
native structure; otherwise, uses a provided chain
name (native input). """
ismodel=self.ismodel
if ismodel:
orderofthings = self.orderofmodels
things = self.models
else:
orderofthings = self.orderofchains
things = self.chains
if not chains: chains = orderofthings
if len(chains) != 2:
raise ValueError('Need exactly two chains to score.')
fas = FASTAnet.FASTAstructure(fasta,uniqueOnly=False)
posvect = fas.getStrictlyUngappedPositions()
items = self._iterResidueAssociations(fasta,chains,fas)
# Get the lengths of the respective chains.
if native == None: leN = len(things[chains[0]])
else: leN = len(things[native]) # Length of the reference structure.
leT = len(posvect) # Number of aligned positions.
# Calculate d_0 for this alignment.
cuberoot = lambda d: d**(1./3)
d_0 = 1.24 * cuberoot(leN-15) - 1.8
# Get the summation portion of the TMscore.
sumportion = 0
posmask = {}
for pos,_,residue in items:
if not pos in posmask: posmask[pos] = []
if CA: atom = residue.GetCA()
else: atom = residue.Centroid()
posmask[pos].append(atom)
for pos in posvect: # Assume chA is Native Structure.
cavect = posmask[pos]
ca1, ca2 = cavect[0], cavect[1]
d_i = sqrt((ca1.x-ca2.x)**2+
(ca1.y-ca2.y)**2+
(ca1.z-ca2.z)**2)
sumdenom = 1 + (d_i/d_0)**2
suminside = 1./(sumdenom)
sumportion += suminside
# Return the TMscore.
return (1./leN) * sumportion
[docs] def gdt(self,fasta,chains=None,distcutoffs=[1,2,4,8],CA=True):
""" Get the GDT score between two chains. Requires a
FASTA alignment. """
ismodel=self.ismodel
if ismodel:
orderofthings = self.orderofmodels
things = self.models
else:
orderofthings = self.orderofchains
things = self.chains
if len(distcutoffs) != 4:
raise ValueError('Need exactly 4 distance cutoff values.')
if not chains: chains = orderofthings
if len(chains) != 2:
raise ValueError('Need exactly two chains to score.')
items = self._iterResidueAssociations(fasta,chains)
# Get all relevant atoms.
posmask = {}
for pos,_,residue in items:
if not pos in posmask: posmask[pos] = []
if CA: atom = residue.GetCA()
else: atom = residue.Centroid()
posmask[pos].append(atom)
# Get the Euclidean distances between
# all pairs of aligned positions.
distances = {}
for pos in posmask:
cavect = posmask[pos]
ca1, ca2 = cavect[0], cavect[1]
dist = sqrt((ca1.x-ca2.x)**2+
(ca1.y-ca2.y)**2+
(ca1.z-ca2.z)**2)
distances[pos] = dist
# Calculate the counts for different cutoffs.
counts = [0,0,0,0]
poslen = float(len(distances))
A,B,C,D = distcutoffs
for pos in distances:
dist = distances[pos]
if dist < A: counts[0] += 1
if dist < B: counts[1] += 1
if dist < C: counts[2] += 1
if dist < D: counts[3] += 1
GDT_P1 = counts[0]/poslen
GDT_P2 = counts[1]/poslen
GDT_P3 = counts[2]/poslen
GDT_P4 = counts[3]/poslen
return (GDT_P1+GDT_P2+GDT_P3+GDT_P4)/4.0
[docs] def rmsd(self,fasta,chains=None,CA=True):
"""Get the RMSD between chains. Requires a FASTA alignment."""
items = self._iterResidueAssociations(fasta,chains)
# Get all relevant atoms.
posmask = {}
for pos,_,residue in items:
if not pos in posmask: posmask[pos] = []
if CA: atom = residue.GetCA()
else: atom = residue.Centroid()
posmask[pos].append(atom)
# Get the Euclidean distance squared between
# all pairs of aligned positions.
distsqs = {}
for pos in posmask:
cavect = posmask[pos]
for a in xrange(len(cavect)):
for b in xrange(a+1,len(cavect)):
if (a,b) not in distsqs: distsqs[(a,b)] = 0
ca1, ca2 = cavect[a], cavect[b]
distsq = ((ca1.x-ca2.x)**2
+(ca1.y-ca2.y)**2
+(ca1.z-ca2.z)**2)
distsqs[(a,b)] += distsq
# Calculate the average RMSD.
rmsd = 0
for d in distsqs:
d = distsqs[d]
r = sqrt((float(d)/len(posmask)))
rmsd += r
rmsd /= len(distsqs)
return rmsd
[docs] def rrmsd(self,fasta,chains=None,CA=True):
""" Get the RRMSD between chains. Requires a FASTA alignment.
See Betancourt & Skolnick, "Universal Similarity Measure for
Comparison Protein Structures". """
if self.ismodel:
orderofthings = self.orderofmodels
things = self.models
else:
orderofthings = self.orderofchains
things = self.chains
if not chains: chains = orderofthings
if len(chains) != 2:
raise ValueError('Need exactly two chains to score.')
R_A = self.RadiusOfGyration([chains[0]])
R_B = self.RadiusOfGyration([chains[1]])
avglen = (len(things[chains[0]])+
len(things[chains[1]]))/2
L_N, e = avglen-1, math.e
c = 0.42-0.05*L_N*e**(-L_N/4.7)+0.63*e**(-L_N/37)
denom = R_A**2+R_B**2-2*c*R_A*R_B
alignRMSD = self.rmsd(fasta,chains,CA)
return float(alignRMSD)/sqrt(denom)
# Other Functionality
[docs] def GetAverage(self,chains=None,newname=None):
""" Acquire a new chain or model corresponding to an average of all present
chains or models specified. """
if self.ismodel:
orderofthings = self.orderofmodels
things = self.models
if newname == None: newname = 1
output = PDBmodel(newname)
else:
orderofthings = self.orderofchains
things = self.chains
if newname == None: newname = '*'
output = PDBchain(newname)
if chains == None: chains = orderofthings
# Make sure all sequences have equal FASTAs.
fas = ''
for ch in chains:
if fas == '': fas = things[ch].AsFASTA()
if things[ch].AsFASTA() != fas:
raise AssertionError('Not all provided chains/models have equal sequences.')
def avgResidue(res_no):
# Create the dummy residue structure.
firstch = things[chains[0]]
firstres = firstch.GetResidues()[res_no]
fake = PDBresidue(firstres.index,firstres.name)
# Populate it with atoms.
atomlist = []
for a in firstres.GetAtoms():
atomlist.append(PDBatom(a.serial,a.name,0,0,0,
0,0,a.symbol,''))
# Average all properties.
count = len(chains)
for ch in chains:
res = things[ch].GetResidueByIndex(things[ch].GetIndices()[res_no])
atoms = res.GetAtoms()
for i in xrange(len(atoms)):
atomlist[i].x += atoms[i].x
atomlist[i].y += atoms[i].y
atomlist[i].z += atoms[i].z
atomlist[i].occupancy += atoms[i].occupancy
atomlist[i].tempFactor += atoms[i].tempFactor
for a in atomlist:
a.x /= float(count)
a.y /= float(count)
a.z /= float(count)
a.occupancy /= float(count)
a.tempFactor /= float(count)
for a in atomlist: fake.AddAtom(a)
return fake
res_nums = range(len(things[chains[0]]))
for i in res_nums: output.AddResidue(avgResidue(i))
return output
[docs] def RadiusOfGyration(self,chains=None):
""" Acquire the radius of the gyration of the entire, or a portion of, the
PDB protein molecule. """
if self.ismodel:
orderofthings = self.orderofmodels
things = self.models
else:
orderofthings = self.orderofchains
things = self.chains
if not chains: chains = orderofthings
# Get centre of mass of entire molecule.
molCentroid = [0.,0.,0.]
numAtoms = 0
for c in chains:
for re in things[c]:
r = things[c][re]
for at in r.atoms:
a = r.atoms[at]
molCentroid[0] += a.x
molCentroid[1] += a.y
molCentroid[2] += a.z
numAtoms += 1
m_x,m_y,m_z = molCentroid
molCentroid = [m_x/numAtoms,m_y/numAtoms,m_z/numAtoms]
# Sum all of the distances squared vs. centre of mass.
distsqr = []
m_x,m_y,m_z = molCentroid
for c in chains:
for re in things[c]:
r = things[c][re]
for at in r.atoms:
a = r.atoms[at]
diff = (a.x-m_x+a.y-m_y+a.z-m_z)
distsqr.append(diff**2)
# Return the sum of all of these, divided by number of atoms,
# square rooted.
sumdistratio = sum(distsqr)/float(numAtoms)
return sqrt(sumdistratio)
[docs] def GetAllCentroid(self, chain):
""" Populates the centroids of all residues. """
out = []
if chain:
if not chain in self.chains:
raise ValueError('Chain %s does not exist!' % (chain))
ch = self.chains[chain]
for res in ch:
res = ch[res]
res.Centroid()
out.append(res.centroid)
else:
for res in self.IterAllResidues():
res.Centroid()
out.append(res.centroid)
return out
[docs] def IndexSeq(self, chain, fst):
""" Store in residues the correct index to the fasta.
Requires a 1-to-1 correspondance at least a portion
of the way through. Deprecated; use GetFASTAIndices(). """
ismodel=self.ismodel
if ismodel: thing = self.GetModel(chain)
else: thing = self.GetChain(chain)
return self.GetFASTAIndices(thing, fst)
[docs] def GetFASTAIndices(self, thing, fst):
""" Given a PDBchain, find 1-to-1 correspondances between
it and a FASTA sequence. """
chainseq = thing.AsFASTA()
ungapped = fst.replace('-','')
success = True
if len(ungapped) != len(chainseq):
success = False
yield None
for i in xrange(0,len(chainseq)):
# See if there is a failed correspondance.
if chainseq[i] != ungapped[i]:
yield None
success = False
break
if success:
index = -1
for i in thing:
i = thing[i]
index = fst.find(aa[i.name],index+1)
i.fstindex = index
yield i
[docs] def IterResiduesFor(self,chains=None):
""" Produce an iterator to allow one to iterate over all residues for a subset of the structure. """
if self.ismodel:
orderofthings = self.orderofmodels
things = self.models
else:
orderofthings = self.orderofchains
things = self.chains
if chains == None: chains = orderofthings
for ch in chains:
chain = things[ch]
for res in chain: yield chain[res]
[docs] def IterAllResidues(self):
""" Produce an iterator to allow one to iterate over all possible residues. """
for ch in self.chains.keys():
chain = self.GetChain(ch)
for res in chain: yield chain[res]
for mo in self.models.keys():
model = self.GetModel(mo)
for res in model: yield model[res]
[docs] def gm(self,fasta,chains=None,CA=False,typeof='str'):
""" Acquire Geometric Morphometric data corresponding to the
(x,y,z) coordinates between all homologous residue positions.
Requires a FASTA alignment. Options include using alpha-carbon
positions. By default, uses centroids of residues. Returns a list
of labels and a list of coordinates as raw GM data. The typeof option
provides an option for coordinate output; they are returned
as a semicolon-delimited string (str) or as a numpy 2d array (matrix). """
if self.ismodel:
orderofthings = self.orderofmodels
things = self.models
else:
orderofthings = self.orderofchains
things = self.chains
if chains == None: chains = orderofthings
labels,coords = [],[] # Output Variables
# Set up name grabbing from FASTA file for GM labelling.
f = FASTAnet.FASTAstructure(fasta,uniqueOnly=False)
name = lambda d: f.orderedSequences[d].name
# Acquire all homologous positions as defined in FASTA.
items = self._iterResidueAssociations(fasta,chains,f)
chainsSaw = []
if typeof == 'matrix': row = []
else: row = ''
for pos,chain,res in items:
if chain not in chainsSaw:
ind = orderofthings.index(chain)
nm = name(ind).strip().split(':')[0]
labels.append('>%s:%s' % (nm,nm[:4]))
if len(chainsSaw) != 0:
coords.append(row)
chainsSaw.append(chain)
if typeof == 'matrix': row = []
else: row = ''
if CA: atom = res.GetCA()
else: atom = res.Centroid()
if isinstance(row, str): row += '%f;%f;%f;'%(atom.x,atom.y,atom.z)
elif isinstance(row,list): row.extend([atom.x,atom.y,atom.z])
coords.append(row) # Last chain.
if typeof == 'matrix': coords = np.array(coords)
return labels,coords
[docs] def WriteGM(self,fasta,gm,chains=None,CA=False):
""" Write the information present in this PDB between multiple
chains as a Geometric Morphometric text file. This file will be
formatted such that individual lines correspond to chains and semi-colons
separate the (x,y,z) coordinates between all homologous residue positions.
Requires a FASTA alignment. Options include using alpha-carbon positions.
By default, uses centroids of residues. """
fgm = open(gm,'w')
# Get raw GM data.
labels,coords = self.gm(fasta,chains,CA)
# Start writing.
for ind in xrange(len(labels)):
fgm.write(labels[ind] + ';' + coords[ind] + '\n')
# Done.
fgm.close()
[docs] def WriteLandmarks(self,fasta,lm,chains=None):
""" Write the information present in this PDB between multiple
chains as a landmark text file. This file will be formatted such that
the file is partitioned in sections starting with chain names and individual
lines in these correspond to homologous residue positions denoted by
homologous position, residue number, and residue name tab-delimited. Requires
a FASTA file. """
ismodel=self.ismodel
if ismodel:
orderofthings = self.orderofmodels
things = self.models
else:
orderofthings = self.orderofchains
things = self.chains
if chains == None: chains = orderofthings
flm = open(lm,'w')
# Set up name grabbing from FASTA file for labelling.
f = FASTAnet.FASTAstructure(fasta,uniqueOnly=False)
name = lambda d: f.orderedSequences[d].name
# Acquire all homologous positions as defined in FASTA.
items = self._iterResidueAssociations(fasta,chains,f)
# Start writing.
chainsSaw = []
ind = -1
for pos,chain,res in items:
if chain not in chainsSaw:
ind = orderofthings.index(chain)
nm = name(ind).strip().split(':')[0]
flm.write('>%s\n' % (nm))
chainsSaw.append(chain)
ind = 0
flm.write('%s\t%s\t%s\n' % (ind,res.index,res.name))
ind += 1
[docs] def FDmatrix(self,fasta,chains=None,scaled=True):
"""
Compute the form difference matrix (FDM) as explained in Claude 2008. It relates to
the identification of the most influential residue, with respect to the overall
shape/structure. If the scaled option is True, will return an scaled list
(from -1 to 1) of the of lenght equal to the number of residues. Otherwise will return
the raw FDM, rounded so it can be included in a PDB. The scaled version is better for
vizualization. By default the FDM is computed far all chains, but a subset can be passed
to the chains option.
"""
names, data = self.gm(fasta,chains=chains,typeof='matrix')
#get dimensions
n_obs = data.shape[0] # number observations/structures
n_res = data.shape[1]/3 # Number of residues
# Re-sahpe data as a 3D array
data= data.reshape(n_obs,n_res,3)
# compute meanshape
msh=data.mean(axis=0)
#compute the form matrix for the mean shape
FMmsh = cdist(msh,msh)
FMmsh = np.ma.masked_array(FMmsh,np.isnan(FMmsh))
#iterate over observations to compute the form difference matrix
vector=[]
for index in xrange(n_obs):
ndarray=data[index,:,:]
FM = cdist(ndarray,ndarray)
FDM = np.ma.masked_array(FM,np.isnan(FM))/FMmsh
utr = FDM[np.triu_indices(FDM.shape[0],k=-1)]
mfdm = np.absolute(FDM - np.median(utr[~np.isnan(utr)]))
mdat = np.ma.masked_array(mfdm,np.isnan(mfdm))
s = np.sum(mdat,axis=1).filled(np.nan)
vector.append(s)
#compute FD and scaled if required
FD = (np.array(vector).sum(axis=0))/n_obs
if not scaled:
return [round(x,2) for x in (FD)]
else:
return [round(x,3) for x in (2*(FD - min(FD))/(max(FD) - min(FD)) - 1)]
[docs] def Map2Protein(self,outname,lis,chain,fasta):
"""
Map a list of values (lis), that must have a lenght equal to that of the number
of residues in the PDB to be mapped (chain). If a list of list is provided, the
first list will be mapped as the beta factor and the second as occupancy
"""
# Check if more than one thing need to be included
if len([lis]) == 1:
lis=[lis]
# Construct empty PDBstructure.
dummy = PDBstructure()
# Reset beta and ocuppancy
if self.models:
m = self.GetModel(chain)
newm = dummy.NewModel(m.name)
for r in m.IterResidues():
for a in r.GetAtoms():
a.tempFactor=0.00
if len(lis) > 1:
a.occupancy=0.00
newm.AddResidue(r)
else:
m = self.GetChain(chain)
newm = dummy.NewChain(m.GetName())
for r in m.IterResidues():
for a in r.GetAtoms():
a.tempFactor=0.00
if len(lis) > 1:
a.occupancy=0.00
newm.AddResidue(r)
# Acquire all homologous positions as defined in FASTA.
items = self._iterResidueAssociations(fasta,None,None)
# populate new field
for pos,chainb,res in items:
if not chainb == chain:
continue
res = newm.GetResidueByIndex(res.index)
for a in res.GetAtoms():
a.tempFactor = lis[0][pos]
if len(lis) > 1:
a.occupancy = lis[1][pos]
newm.WriteAsPDB(outname)
def __iter__(self):
""" Returns all PDBchains as an iterator. """
for ch in self.chains: yield self.chains[ch]
for mo in self.models: yield self.models[mo]
def __len__(self):
""" Returns the length of the protein. """
# For all chains.
chs = self.chains
mds = self.models
return sum([len(chs[x]) for x in chs] + [len(mds[x]) for x in mds])
def _remarksToString(self):
remarkstr = ''
for it in xrange(len(self.remarks)):
remarkstr += 'REMARK%4s %s\n' % (str(it+1),self.remarks[it])
return remarkstr
def __str__(self):
""" As a string, outputs structure in the PDB format. """
out = ''
out += self._remarksToString()
for chain in self.orderofchains: out += str(self.GetChain(chain))
for model in self.orderofmodels: out += str(self.GetModel(model))
return out
[docs]class PDBfile(object):
__slots__ = ('filePath','fileHandle','memHandle','modelIndices','chains','residueIndices','size')
def __init__(self,fi):
self.filePath = fi
self.fileHandle = open(fi,'r+b')
self.size = sizeOfFile(fi)
self.memHandle = memoryMappedFile(self.fileHandle.fileno(),0)
self.residueIndices = dict()
self.modelIndices = dict()
self.chains = list()
[docs] def isLargeFile(self):
""" Return whether or on the PDB file this object represents is incredibly large. """
return (self.size >= PDB_LARGE_FILE_SIZE)
[docs] def close(self):
""" Close the file. """
if self.memHandle:
self.memHandle.close()
else: self.fileHandle.close()
[docs] def read(self):
""" Acquire all remarks, and the indices of all models and residues. Returns
remarks, and biological source information as a tuple (remarks, organism,
taxid, mutant). """
# Metadata.
remarks,organism,taxid,mutant = [],'','',False
# Cursors.
k = 0
curRes = None
pos = 0
line = self.memHandle.readline()
model = -1
# Scan the file.
while line:
recordType = line[:6].strip()
if recordType == 'MODEL':
model = int(line.strip().split()[-1])
self.modelIndices[model] = pos
elif recordType == 'ATOM':
sl = line.split()
if int(sl[1]) >= 100000: k = 1
else: k = 0
chain = line[21+k].strip()
if chain not in self.chains:
self.chains.append(chain)
residue_index = line[22+k:27+k].strip()
iCode = line[27].strip()
residue = residue_index + iCode
if (curRes == None or residue != curRes):
self.residueIndices[(model,chain,residue)] = pos
curRes = residue
elif recordType == 'REMARK':
remark = line[6:].strip('\n')
if not remark in remarks: remarks.append(remark)
elif recordType == 'SOURCE':
if 'ORGANISM_SCIENTIFIC' in line:
organism = '_'.join(line.strip().strip(';').split()[-2:])
elif 'ORGANISM_TAXID' in line:
taxid = line.strip().strip(';').split()[-1]
elif 'MUTANT' in line or 'MUTATION' in line:
mutant = True
pos = self.memHandle.tell()
line = self.memHandle.readline()
self.memHandle.seek(0)
# Return metadata.
return (remarks,organism,taxid,mutant)
[docs] def hasResidue(self,chain,res,model=-1):
""" Determine if a residue is present in the PDBfile. """
return ((int(model),chain,res) in self.residueIndices)
[docs] def readResidue(self,chain,res,model=-1):
""" Parse a residue from the PDB file and return a PDBresidue. """
resObj = None
index = self.residueIndices[(int(model),chain,res)]
self.memHandle.seek(index)
line = self.memHandle.readline()
while line:
recordType = line[:6].strip()
if recordType == 'ATOM':
sl = line.split()
if int(sl[1]) >= 100000: k = 1
else: k = 0
chain = line[21+k].strip()
residue = line[22+k:27+k].strip() + line[27].strip()
residue_id = line[17+k:20+k].strip()
if residue == res:
if not resObj:
resObj = PDBresidue(residue,residue_id)
serial = int(line[6+k:12+k])
atom_name = line[12+k:16+k].strip()
x = float(line[30+k:38+k])
y = float(line[38+k:46+k])
z = float(line[46+k:54+k])
o = float(line[54+k:60+k])
b = float(line[60+k:66+k])
sym = line[76+k:78+k].strip()
try: charge = line[79+k:80+k].strip()
except: charge=' '
atom = PDBatom(serial,atom_name,x,y,z,o,b,sym,charge)
resObj.AddAtom(atom)
else: break
line = self.memHandle.readline()
self.memHandle.seek(0)
return resObj
[docs] def hasModels(self): return (len(self.modelIndices) > 0)
[docs] def getModelNames(self): return self.modelIndices.keys()
[docs] def getChainNames(self): return self.chains
[docs] def iterResidueData(self):
""" Yield the model number, chain name, and residue number
for all residues present in this PDB file, not necessarily
in order. """
for mod,ch,res in self.residueIndices.keys():
yield mod,ch,res
[docs] def getResidueNamesInChain(self,ch):
res = []
for c,r in self.residueIndices.keys():
if c == ch: res.append(r)
return res
# Constants
aa = {'ALA':'A','CYS':'C','ASP':'D','GLU':'E',
'PHE':'F','GLY':'G','HIS':'H','ILE':'I',
'LYS':'K','LEU':'L','MET':'M','ASN':'N',
'PRO':'P','GLN':'Q','ARG':'R','SER':'S',
'THR':'T','VAL':'V','TRP':'W','TYR':'Y',
'UNK':'X'}
aa_names = {v:k for k, v in aa.items()} # Flip above dictionary.
aa_lists = {'ALA':['N','CA','C','O','CB'],\
'CYS':['N','CA','C','O','CB','SG'],\
'ASP':['N','CA','C','O','CB','CG','OD1','OD2'],\
'GLU':['N','CA','C','O','CB','CG','CD','OE1','OE2'],\
'PHE':['N','CA','C','O','CB','CG','CD1','CD2','CE1','CE2','CZ'],\
'GLY':['N','CA','C','O'],\
'HIS':['N','CA','C','O','CB','CG','ND1','CD2','CE1','NE2'],\
'ILE':['N','CA','C','O','CB','CG1','CG2','CD1'],\
'LYS':['N','CA','C','O','CB','CG','CD','CE','NZ'],\
'LEU':['N','CA','C','O','CB','CG','CD1','CD2'],\
'MET':['N','CA','C','O','CB','CG','SD','CE'],\
'ASN':['N','CA','C','O','CB','CG','OD1','ND2'],\
'PRO':['N','CA','C','O','CB','CG','CD'],\
'GLN':['N','CA','C','O','CB','CG','CD','OE1','NE2'],\
'ARG':['N','CA','C','O','CB','CG','CD','NE','CZ','NH1','NH2'],\
'SER':['N','CA','C','O','CB','OG'],\
'THR':['N','CA','C','O','CB','OG1','CG2'],\
'VAL':['N','CA','C','O','CB','CG1','CG2'],\
'TRP':['N','CA','C','O','CB','CG','CD1','CD2','NE1','CE2','CE3','CZ2','CZ3','CH2'],\
'TYR':['N','CA','C','O','CB','CG','CD1','CD2','CE1','CE2','CZ','OH'],\
'UNK':[]}
# Debugging
if __name__ == "__main__":
mystructure = PDBstructure(sys.argv[1])
if mystructure.ismodel:
names = mystructure.GetModelNames()
else:
names = mystructure.GetChainNames()
lis = mystructure.FDmatrix(sys.argv[1].strip('pdb')+'gm') # for debugging purposes only
mystructure.Map2Protein('test.pdb',lis,names[0],sys.argv[1])# for debugging purposes only
if len(sys.argv) > 2:
print 'RMSD',mystructure.rmsd(sys.argv[2])
print 'RRMSD',mystructure.rrmsd(sys.argv[2])
print 'TMscore',mystructure.tmscore(sys.argv[2])
print 'GDT',mystructure.gdt(sys.argv[2])
x = mystructure.GetAllCentroid('A')
mystructure.Contacts()
print mystructure.contactmap