Source code for labblouin.PDBnet

#!/bin/python

""" PDBnet is a collection of Python objects intended to model and contain
PDB protein data.

PDBnet Copyright (C) 2012 Christian Blouin
Contributions by Alex Safatli and Jose Sergio Hleap
Major Refactoring (2014) done by Alex Safatli and Jose Sergio Hleap

E-mail: cblouin@cs.dal.ca, safatli@cs.dal.ca, jshleap@dal.ca
Dependencies: Scipy, BioPython, FASTAnet (contained in LabBlouinTools) """

from mmap import mmap as memoryMappedFile
import numpy as np
import sys, FASTAnet, math
from math import sqrt
from os.path import getsize as sizeOfFile
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from scipy.spatial.distance import cdist

# Metadata

__author__  = ['Christian Blouin','Jose Sergio Hleap','Alexander Safatli']
__version__ = '1.1.0'

# Constants

PDB_LARGE_FILE_SIZE = 100000000 #bytes

# Classes

[docs]class PDBatom(object): """ ATOM in a PDB protein structure. """ __slots__ = ('name','serial','x','y','z','occupancy','tempFactor', 'charge','symbol','parent') def __init__(self,serial,name,x,y,z,oc,b,symbol,charge): """ Construct an atom in a PDB protein structure. """ self.name = name self.serial = serial self.x = x self.y = y self.z = z self.occupancy = oc self.tempFactor = b self.charge = charge self.symbol = symbol self.parent = None
[docs] def fixname(self): """ Ensures the name of this atom fits within 4 characters. """ if len(self.name) == 4: return self.name else: return ' '+self.name.ljust(3)
def __str__(self): """ Express this atom as a single line in a PDB file (see PDB format). """ return "ATOM %5s %4s %3s %1s%4s %8.3f%8.3f%8.3f%6.2f%6.2f %2s%2s" \ % (self.serial,self.fixname(),self.parent.name,self.parent.chain, self.parent.index,self.x,self.y,self.z,self.occupancy, self.tempFactor,self.symbol,self.charge)
[docs] def GetPosition(self): """ Get the 3-dimensional coordinate of this atom in space. """ return (self.x,self.y,self.z)
[docs] def DistanceTo(self, atom): """ Acquire the distance from this atom to another atom. """ return ((self.x-atom.x)**2+(self.y-atom.y)**2+(self.z-atom.z)**2)**0.5
[docs]class PDBterminator(PDBatom): """ A placeholder class that represents a terminating ATOM-like line in the PDB file. """ __slots__ = ('name','serial','x','y','z','occupancy','tempFactor', 'charge','symbol','parent','lastatom','lastresname', 'lastreschain','lastresind') def __init__(self,chaininst): self.lastatom = chaininst.GetAtoms()[-1] self.lastresname = self.lastatom.parent.name self.lastreschain = self.lastatom.parent.chain self.lastresind = self.lastatom.parent.index newserial = str(int(self.lastatom.serial) + 1) super(PDBterminator,self).__init__(newserial,'',0,0,0,0,0,'','') def __str__(self): return 'TER %5s %4s %3s %1s%4s' % ( self.serial,'',self.lastresname,self.lastreschain,self.lastresind)
[docs]class PDBresidue: """ A residue (collection of ATOM fields) in a PDB protein structure. """ __slots__ = ('index','name','atoms','atomsOrdered','contacts','centroid', 'chain','model') def __init__(self,index=None,name=''): """ Construct a residue (collection of atoms) in a PDB protein structure. """ self.index = index self.name = name self.atoms = {} self.atomsOrdered = [] self.contacts = [] self.centroid = None self.chain = '' self.model = None
[docs] def GetAtoms(self): return [x for x in self.atomsOrdered]
[docs] def AddAtom(self, atom): """ Add a PDBatom structure to this residue. """ self.atoms[atom.name.strip()] = atom self.atomsOrdered.append(atom) atom.parent = self
[docs] def GetCA(self): """ Get the alpha-carbon found in this residue as a PDBatom. """ return self.atoms['CA']
[docs] def Centroid(self): """ Calculate the centroid of this residue. Return this as a PDBatom. """ x = 0.0 y = 0.0 z = 0.0 for atom in self.atoms: if atom in ['C','N','O']: continue x += self.atoms[atom].x y += self.atoms[atom].y z += self.atoms[atom].z div = float(len(self.atoms)-3) x /= div y /= div z /= div self.centroid = PDBatom(None,'centroid',x,y,z,0,0,'','') self.centroid.parent = self return self.centroid
[docs] def InContactWith(self,other,thres=4.5): """ Determine if in contact with another residue. """ foundmatch = False for atomA in self.atoms: if atomA in ['C', 'N', 'O']: continue atomA = self.atoms[atomA] for atomB in other.atoms: if atomB in ['C', 'N', 'O']: continue atomB = other.atoms[atomB] if atomA.DistanceTo(atomB) <= thres: foundmatch = True if foundmatch: break if foundmatch: break return foundmatch
def __int__(self): return int(self.index) def __str__(self): return '\n'.join([str(x) for x in self.atomsOrdered])
[docs]class PDBchain(object): """ A PDB chain (collection of protein residues). """ __slots__ = ('name','structure','residues','indices','parent') def __init__(self,name): self.name = name self.residues = list() self.indices = list() self.structure = None self.parent = None
[docs] def GetResidues(self): """ Acquire all of the residues in this chain. Note that if this is a large file, this will force the object to load all of these residues into memory from the file. """ return [self.GetResidueByIndex(i) for i in self.GetIndices()]
[docs] def IterResidues(self): """ Iteratively yield all of the residues in this chain. Note that if this is a large file, this will force the object to load all of these residues into memory from the file. """ for i in self.GetIndices(): yield self.GetResidueByIndex(i)
[docs] def GetResidueByIndex(self,index): """ Alias for acquiring a residue from this instance using [] operator. """ return self.__getitem__(index)
[docs] def GetAtoms(self): """ Get all comprising atoms in this chain in order of residues. """ atoms = [] for x in self.IterResidues(): atoms.extend(x.GetAtoms()) return atoms
[docs] def GetIndices(self): """ Acquire all of the indices for residues present in this chain. """ return [int(x) for x in self.indices]
[docs] def AddIndexOfResidue(self,index): """ Add an index for a residue to this chain. Will generate own residue information if called upon (forces lazy evaluation). """ self.indices.append(str(index))
[docs] def AddResidue(self,resid): """ Add a residue to this chain. """ resid.chain = self.name self.residues.append(resid) self.indices.append(str(resid.index))
[docs] def AddResidueByIndex(self,index): """ Add a residue to this chain by its index in the PDB. The residue object will automatically be constructed from the file. """ self.indices.append(str(index)) resid = self.GetResidueByIndex(index) self.residues.append(resid)
[docs] def RemoveResidue(self,resid): """ Remove a residue from this chain. """ return self.pop(resid)
[docs] def ContactMap(self,thres=4.5): """ Compute the contact map of this chain. """ done = [] contactmap = [] for rA in self: for rB in self: resA,resB = self[rA],self[rB] if resA == resB or (resA,resB) in done: continue elif resA.InContactWith(resB,thres): if not (int(resA),int(resB)) in contactmap: contactmap.append((int(resA),int(resB))) contactmap.append((int(resB),int(resA))) done.append((resA,resB)) done.append((resB,resA)) return sorted(contactmap, key=lambda e: (e[0], e[1]))
[docs] def GetPrimaryPropertiesFromBioPython(self): """ Use BioPython to populate this class with attributes. """ props=PA(self.AsFASTA()) self.amino_acid_composition= props.get_amino_acids_percent() self.molecular_weight= props.molecular_weight() self.aromaticity= props.aromaticity() self.instability_index= props.instability_index() self.flexibility= props.flexibility() self.isoelectric_point=props.isoelectric_point() self.secondary_structure_fraction=props.secondary_structure_fraction() return props
[docs] def AsFASTA(self): """ Return the string representing this chain's FASTA sequence. """ return ''.join([aa[item.name] for item in self.IterResidues()])
[docs] def WriteAsPDB(self,filename): """ Write this single chain to a file as a PDB. """ fh = open(filename,'w') fh.write(str(self)) fh.close()
[docs] def SortByNumericalIndices(self): """ Sort all internal items by a numerical index. """ if not self.structure.handle.isLargeFile(): self.residues = sorted(self.residues,key=lambda d: int(d.index)) self.indices = sorted(self.indices, key=lambda d: int(d))
[docs] def pop(self,s): """ Pop a residue. """ res = self.GetResidueByIndex(s) if not self.structure.handle.isLargeFile(): self.residues.remove(res) self.indices.remove(str(s)) return res
[docs] def update(self,o): """ Add residues from another chain. """ for res in o: self.AddResidue(res)
def __str__(self): get = lambda d: self.GetResidueByIndex(d) out = '\n'.join([str(get(x)) for x in self.indices]) out += '\n%s\n' % (str(PDBterminator(self))) return out def __getitem__(self,i): """ Force the object to load the residue if not present in this structure. """ if self.parent == None: parent = -1 else: parent = self.parent.name handle = self.structure.handle if str(i) in self.indices: if handle and ((handle.isLargeFile()) or (len(self.indices) > len(self.residues))): if handle.hasResidue(self.name,str(i),parent): resid = handle.readResidue(self.name,str(i),parent) resid.chain = self.name return resid elif self.indices.index(str(i)) < len(self.residues): return self.residues[self.indices.index(str(i))] else: raise KeyError('Could not find residue %s in %s:%s in file or structure.' % ( str(i),str(self.name),str(parent))) return self.residues[self.indices.index(str(i))] else: return None def __len__(self): return len(self.indices) def __iter__(self): for ind in self.indices: yield int(ind)
[docs]class PDBmodel(PDBchain): """ A PDB model (a special kind of chain). """ __slots__ = ('name','structure','residues','indices','chains','chainNames') def __init__(self,name): if type(name) != int: raise TypeError('Model names should be integers.') elif name < 0: raise ValueError('Model name should be zero or a positive integer.') super(PDBmodel,self).__init__(name) self.chains = list() self.chainNames = list()
[docs] def GetResidues(self): """ Acquire all of the residues in this model. Note that if this is a large file, this will force the object to load all of these residues into memory from the file. """ this = [self.GetResidueByIndex(i) for i in self.GetIndices()] for it in self.chains: this += it.GetResidues() return this
[docs] def IterResidues(self): """ Iteratively yield all of the residues in this model. Note that if this is a large file, this will force the object to load all of these residues into memory from the file. """ for i in self.GetIndices(): yield self.GetResidueByIndex(i) for i in self.chains: for j in i.IterResidues(): yield j
[docs] def GetChains(self): return self.chains
[docs] def GetChain(self,name): return self.GetChainByName(name)
[docs] def GetChainByName(self,name): for x in xrange(len(self.chainNames)): if self.chainNames[x] == name: return self.chains[x]
[docs] def GetChainNames(self): return self.chainNames
[docs] def NewChain(self,ch): """ Create a new PDBchain, add it to this model, and return it. """ c = PDBchain(ch) self.AddChain(c) return c
[docs] def AddChain(self,chain): """ Add a PDBchain (chain) to this model. """ chain.structure = self.structure chain.parent = self self.chains.append(chain) self.chainNames.append(chain.name)
[docs] def AddResidue(self,resid): """ Add a residue to this model. """ resid.model = self.name self.residues.append(resid) self.indices.append(str(resid.index))
def __getitem__(self, i): handle = self.structure.handle if str(i) in self.indices: if (handle and handle.isLargeFile()) or len(self.indices) > len(self.residues): resid = handle.readResidue('',str(i),self.name) resid.model = self.name return resid return self.residues[self.indices.index(str(i))] else: return None def __len__(self): l = len(self.indices) for c in self.GetChains(): l += len(c) return l def __str__(self): model = 'MODEL%9s\n' % (str(self.name)) + super( PDBmodel,self).__str__() for ch in self.chains: model += str(ch) return model + 'ENDMDL\n'
[docs]class PDBstructure(object): """ A PDB protein structure (a collection of chains/models). """ __slots__ = ('chains','orderofchains','models','orderofmodels','remarks', 'filepath','organism','taxid','mutation','contactmap','handle', 'ismodel') def __init__(self, filein=''): # Attributes. self.filepath = filein self.handle = None self.organism = None self.taxid = None self.mutation = False self.contactmap = list() self.orderofchains = list() self.orderofmodels = list() self.remarks = list() self.chains = dict() self.models = dict() # Read a file? if filein != '': self.ReadFile(filein) # Accesssors/Mutators.
[docs] def GetChainNames(self): return self.orderofchains
[docs] def GetModelNames(self): return self.orderofmodels
[docs] def GetChain(self,ch): """ Get a chain by name. """ if ch in self.chains: return self.chains[ch] return None
[docs] def GetModel(self,mod): """ Get a model by name. """ if mod in self.models: return self.models[mod] return None
[docs] def NewChain(self,name): """ Construct and add a new chain by name to the PDB. Returns the chain. """ p = PDBchain(name) self.AddChain(name,p) return p
[docs] def NewModel(self,name): """ Construct and add a new model by name to the PDB. Returns the model. """ p = PDBmodel(name) self.AddModel(name,p) return p
[docs] def RemoveChain(self,name): """ Remove a chain from the structure (by name). Returns the chain. """ c = self.GetChain(name) if not name in self.chains: raise KeyError('Chain does not exist in structure by that name!') del self.chains[name] self.orderofchains.remove(name) return c
[docs] def RemoveModel(self,name): """ Remove a model from the structure (by name). Returns the chain. """ m = self.GetModel(name) if not name in self.models: raise KeyError('Model does not exist in structure by that name!') del self.models[name] self.orderofmodels.remove(name) return m
[docs] def AddChain(self,chainname,chain): """ Add a chain as a list of residues to the PDB. """ # Do chain addition operations. if chainname in self.chains: raise KeyError('Chain already exists in structure by that name!') if type(chain) != PDBchain and type(chain) == PDBmodel: # Cast into a PDBchain. cast = PDBchain(chainname) for i in chain: cast.AddResidue(chain[i]) chain = cast self.chains[chainname] = chain self.orderofchains.append(chainname) chain.structure = self
[docs] def AddModel(self,modelname,model): """ Add a model as a list of residues to the PDB. """ # Do chain addition operations. if model in self.models: raise KeyError('Model already exists in structure by that name!') if type(model) != PDBmodel and type(model) == PDBchain: # Cast into a PDBmodel. cast = PDBmodel(modelname) for i in model: cast.AddResidue(model[i]) model = cast model.name = modelname self.models[modelname] = model self.orderofmodels.append(modelname) model.structure = self return model
[docs] def AddResidueToChain(self, chain, res): """ Add a residue to a chain. Deprecated; use chain class function. """ ch = self.GetChain(chain) if ch != None: ch.AddResidue(res)
[docs] def AddResidueToModel(self, model, res): """ Add a residue to a model. Deprecated; use model class function. """ mo = self.GetModel(model) if mo != None: mo.AddResidue(res)
[docs] def AddRemark(self,remark): """ Add a remark (note/comment) to the structure/PDB file. """ if len(remark) == 0: self.remarks.append('') return for it in xrange(0,len(remark),68): self.remarks.append(remark[it:it+68])
[docs] def GetRemarks(self): """ Return all remarks from the PDB as a list of strings. """ return self.remarks
[docs] def CheckComplete(self): """ For every chain, check to see if every residue is complete (see aa_list dictionary). """ for ch in self.orderofchains: for re in self.chains[ch]: res = self.chains[ch][re] nam = res.name if not nam in aa_lists: continue check = set(set(aa_lists[nam])).difference(res.atoms) if len(check) > 0: return res for ch in self.orderofmodels: for res in self.models[ch]: nam = res.name if not nam in aa_lists: continue check = set(set(aa_lists[nam])).difference(res.atoms) if len(check) > 0: return res return True # I/O Functionality.
def _pymol(self): # Start pymol if not started. import pymol pymol.finish_launching() return pymol
[docs] def view(self,istrajectory=False): """ View the structure in a Pymol window. Requires an installation of Pymol. """ pym = self._pymol() pym.cmd.read_pdbstr(str(self),'PDBnet') if len(self.models) > 0 and not istrajectory: pym.cmd.set('all_states','on') elif istrajectory: pass # SERGIO
[docs] def ViewStructure(self): self.view()
[docs] def WriteFile(self, filename): """ Write this PDB structure as a single PDB file. """ fh = open(filename,'w') fh.write(str(self)) fh.close()
[docs] def read(self,filename): """ Alias for ReadFile(). """ self.ReadFile(filename)
[docs] def ReadFile(self, filename): """ Read a PDB file. Populate this PDBstructure. """ # Map to memory and read the file. p = PDBfile(filename) self.handle = p self.remarks, self.organism, self.taxid, self.mutation = p.read() largeFile = p.isLargeFile() # Start creating high-level structures for models/chains. if p.hasModels(): self.ismodel = True map(self.NewModel,p.getModelNames()) for mo, ch, re in p.iterResidueData(): model = self.GetModel(mo) if ch != '': if not ch in model.GetChainNames(): model.NewChain(ch) chain = model.GetChain(ch) if largeFile: chain.AddIndexOfResidue(re) else: chain.AddResidueByIndex(re) else: if largeFile: model.AddIndexOfResidue(re) else: model.AddResidueByIndex(re) for mod in self.GetModelNames(): model = self.GetModel(mod) model.structure = self model.SortByNumericalIndices() for chain in model.GetChains(): chain.SortByNumericalIndices() else: self.ismodel = False map(self.NewChain,p.getChainNames()) for _, ch, re in p.iterResidueData(): chain = self.GetChain(ch) if largeFile: chain.AddIndexOfResidue(re) else: chain.AddResidueByIndex(re) for cha in self.GetChainNames(): chain = self.GetChain(cha) chain.structure = self chain.SortByNumericalIndices() # Remove this component from memory if no longer necessary. if not largeFile: p.close() self.handle = None del p
[docs] def ChainAsFASTA(self, chain): """ Return the chain as a FASTA. Deprecated; use chain class function. """ ch = self.GetChain(chain) if ch != None: return ch.AsFASTA()
[docs] def ModelAsFASTA(self, model): """ Return the model as a FASTA. Deprecated; use chain class function. """ mo = self.GetModel(model) if mo != None: return mo.AsFASTA() # Scoring Functionality
def _iterResidueAssociations(self,fasta,chains=None,fast=None): """ PRIVATE: Use an input FASTA file to determine associations between residues as they exist between 2+ chains. Constructs a mapping between chain and sequence and yields all strictly homologous positions as residues, chain-by-chain.""" ismodel=self.ismodel if ismodel: orderofthings = self.orderofmodels things = self.models else: orderofthings = self.orderofchains things = self.chains if not chains: chains = orderofthings chaininds = [orderofthings.index(x) for x in chains] if len(chaininds) < 2: raise ValueError('Need more than two chains (gave %d).' % (len(chaininds))) if fast is None: fast = FASTAnet.FASTAstructure(fasta,uniqueOnly=False) seqs = fast.orderedSequences if len(things) != len(seqs): raise IOError('FASTA set length not equal to PDB length.') posv = fast.getStrictlyUngappedPositions(chaininds) for n in chaininds: # Go chain by chain and yield information. seq = seqs[n].sequence ch = orderofthings[n] matches = self.GetFASTAIndices(things[ch],seq) residueList = [match for match in matches] if residueList[0] is None: nm = seqs[n].name chn = things[ch].name raise IOError('No relation found between sequence %s and chain %s.' % (nm,chn)) residueList = sorted(residueList,key=lambda d:d.fstindex) for pos in posv: # Go position by position. yield pos, ch, residueList.pop(0) # Yield position, chain name, and residue.
[docs] def tmscore(self,fasta,chains=None,native=None,CA=True): """ Get the TMscore between two chains. Requires a FASTA alignment and a value for the length of the native structure (e.g., for a pairwise alignment, the length of the structure used as a reference before alignment was done). The latter is computed by assuming the first of both provided chains is the native structure; otherwise, uses a provided chain name (native input). """ ismodel=self.ismodel if ismodel: orderofthings = self.orderofmodels things = self.models else: orderofthings = self.orderofchains things = self.chains if not chains: chains = orderofthings if len(chains) != 2: raise ValueError('Need exactly two chains to score.') fas = FASTAnet.FASTAstructure(fasta,uniqueOnly=False) posvect = fas.getStrictlyUngappedPositions() items = self._iterResidueAssociations(fasta,chains,fas) # Get the lengths of the respective chains. if native == None: leN = len(things[chains[0]]) else: leN = len(things[native]) # Length of the reference structure. leT = len(posvect) # Number of aligned positions. # Calculate d_0 for this alignment. cuberoot = lambda d: d**(1./3) d_0 = 1.24 * cuberoot(leN-15) - 1.8 # Get the summation portion of the TMscore. sumportion = 0 posmask = {} for pos,_,residue in items: if not pos in posmask: posmask[pos] = [] if CA: atom = residue.GetCA() else: atom = residue.Centroid() posmask[pos].append(atom) for pos in posvect: # Assume chA is Native Structure. cavect = posmask[pos] ca1, ca2 = cavect[0], cavect[1] d_i = sqrt((ca1.x-ca2.x)**2+ (ca1.y-ca2.y)**2+ (ca1.z-ca2.z)**2) sumdenom = 1 + (d_i/d_0)**2 suminside = 1./(sumdenom) sumportion += suminside # Return the TMscore. return (1./leN) * sumportion
[docs] def gdt(self,fasta,chains=None,distcutoffs=[1,2,4,8],CA=True): """ Get the GDT score between two chains. Requires a FASTA alignment. """ ismodel=self.ismodel if ismodel: orderofthings = self.orderofmodels things = self.models else: orderofthings = self.orderofchains things = self.chains if len(distcutoffs) != 4: raise ValueError('Need exactly 4 distance cutoff values.') if not chains: chains = orderofthings if len(chains) != 2: raise ValueError('Need exactly two chains to score.') items = self._iterResidueAssociations(fasta,chains) # Get all relevant atoms. posmask = {} for pos,_,residue in items: if not pos in posmask: posmask[pos] = [] if CA: atom = residue.GetCA() else: atom = residue.Centroid() posmask[pos].append(atom) # Get the Euclidean distances between # all pairs of aligned positions. distances = {} for pos in posmask: cavect = posmask[pos] ca1, ca2 = cavect[0], cavect[1] dist = sqrt((ca1.x-ca2.x)**2+ (ca1.y-ca2.y)**2+ (ca1.z-ca2.z)**2) distances[pos] = dist # Calculate the counts for different cutoffs. counts = [0,0,0,0] poslen = float(len(distances)) A,B,C,D = distcutoffs for pos in distances: dist = distances[pos] if dist < A: counts[0] += 1 if dist < B: counts[1] += 1 if dist < C: counts[2] += 1 if dist < D: counts[3] += 1 GDT_P1 = counts[0]/poslen GDT_P2 = counts[1]/poslen GDT_P3 = counts[2]/poslen GDT_P4 = counts[3]/poslen return (GDT_P1+GDT_P2+GDT_P3+GDT_P4)/4.0
[docs] def rmsd(self,fasta,chains=None,CA=True): """Get the RMSD between chains. Requires a FASTA alignment.""" items = self._iterResidueAssociations(fasta,chains) # Get all relevant atoms. posmask = {} for pos,_,residue in items: if not pos in posmask: posmask[pos] = [] if CA: atom = residue.GetCA() else: atom = residue.Centroid() posmask[pos].append(atom) # Get the Euclidean distance squared between # all pairs of aligned positions. distsqs = {} for pos in posmask: cavect = posmask[pos] for a in xrange(len(cavect)): for b in xrange(a+1,len(cavect)): if (a,b) not in distsqs: distsqs[(a,b)] = 0 ca1, ca2 = cavect[a], cavect[b] distsq = ((ca1.x-ca2.x)**2 +(ca1.y-ca2.y)**2 +(ca1.z-ca2.z)**2) distsqs[(a,b)] += distsq # Calculate the average RMSD. rmsd = 0 for d in distsqs: d = distsqs[d] r = sqrt((float(d)/len(posmask))) rmsd += r rmsd /= len(distsqs) return rmsd
[docs] def rrmsd(self,fasta,chains=None,CA=True): """ Get the RRMSD between chains. Requires a FASTA alignment. See Betancourt & Skolnick, "Universal Similarity Measure for Comparison Protein Structures". """ if self.ismodel: orderofthings = self.orderofmodels things = self.models else: orderofthings = self.orderofchains things = self.chains if not chains: chains = orderofthings if len(chains) != 2: raise ValueError('Need exactly two chains to score.') R_A = self.RadiusOfGyration([chains[0]]) R_B = self.RadiusOfGyration([chains[1]]) avglen = (len(things[chains[0]])+ len(things[chains[1]]))/2 L_N, e = avglen-1, math.e c = 0.42-0.05*L_N*e**(-L_N/4.7)+0.63*e**(-L_N/37) denom = R_A**2+R_B**2-2*c*R_A*R_B alignRMSD = self.rmsd(fasta,chains,CA) return float(alignRMSD)/sqrt(denom) # Other Functionality
[docs] def GetAverage(self,chains=None,newname=None): """ Acquire a new chain or model corresponding to an average of all present chains or models specified. """ if self.ismodel: orderofthings = self.orderofmodels things = self.models if newname == None: newname = 1 output = PDBmodel(newname) else: orderofthings = self.orderofchains things = self.chains if newname == None: newname = '*' output = PDBchain(newname) if chains == None: chains = orderofthings # Make sure all sequences have equal FASTAs. fas = '' for ch in chains: if fas == '': fas = things[ch].AsFASTA() if things[ch].AsFASTA() != fas: raise AssertionError('Not all provided chains/models have equal sequences.') def avgResidue(res_no): # Create the dummy residue structure. firstch = things[chains[0]] firstres = firstch.GetResidues()[res_no] fake = PDBresidue(firstres.index,firstres.name) # Populate it with atoms. atomlist = [] for a in firstres.GetAtoms(): atomlist.append(PDBatom(a.serial,a.name,0,0,0, 0,0,a.symbol,'')) # Average all properties. count = len(chains) for ch in chains: res = things[ch].GetResidueByIndex(things[ch].GetIndices()[res_no]) atoms = res.GetAtoms() for i in xrange(len(atoms)): atomlist[i].x += atoms[i].x atomlist[i].y += atoms[i].y atomlist[i].z += atoms[i].z atomlist[i].occupancy += atoms[i].occupancy atomlist[i].tempFactor += atoms[i].tempFactor for a in atomlist: a.x /= float(count) a.y /= float(count) a.z /= float(count) a.occupancy /= float(count) a.tempFactor /= float(count) for a in atomlist: fake.AddAtom(a) return fake res_nums = range(len(things[chains[0]])) for i in res_nums: output.AddResidue(avgResidue(i)) return output
[docs] def RadiusOfGyration(self,chains=None): """ Acquire the radius of the gyration of the entire, or a portion of, the PDB protein molecule. """ if self.ismodel: orderofthings = self.orderofmodels things = self.models else: orderofthings = self.orderofchains things = self.chains if not chains: chains = orderofthings # Get centre of mass of entire molecule. molCentroid = [0.,0.,0.] numAtoms = 0 for c in chains: for re in things[c]: r = things[c][re] for at in r.atoms: a = r.atoms[at] molCentroid[0] += a.x molCentroid[1] += a.y molCentroid[2] += a.z numAtoms += 1 m_x,m_y,m_z = molCentroid molCentroid = [m_x/numAtoms,m_y/numAtoms,m_z/numAtoms] # Sum all of the distances squared vs. centre of mass. distsqr = [] m_x,m_y,m_z = molCentroid for c in chains: for re in things[c]: r = things[c][re] for at in r.atoms: a = r.atoms[at] diff = (a.x-m_x+a.y-m_y+a.z-m_z) distsqr.append(diff**2) # Return the sum of all of these, divided by number of atoms, # square rooted. sumdistratio = sum(distsqr)/float(numAtoms) return sqrt(sumdistratio)
[docs] def GetAllCentroid(self, chain): """ Populates the centroids of all residues. """ out = [] if chain: if not chain in self.chains: raise ValueError('Chain %s does not exist!' % (chain)) ch = self.chains[chain] for res in ch: res = ch[res] res.Centroid() out.append(res.centroid) else: for res in self.IterAllResidues(): res.Centroid() out.append(res.centroid) return out
[docs] def IndexSeq(self, chain, fst): """ Store in residues the correct index to the fasta. Requires a 1-to-1 correspondance at least a portion of the way through. Deprecated; use GetFASTAIndices(). """ ismodel=self.ismodel if ismodel: thing = self.GetModel(chain) else: thing = self.GetChain(chain) return self.GetFASTAIndices(thing, fst)
[docs] def GetFASTAIndices(self, thing, fst): """ Given a PDBchain, find 1-to-1 correspondances between it and a FASTA sequence. """ chainseq = thing.AsFASTA() ungapped = fst.replace('-','') success = True if len(ungapped) != len(chainseq): success = False yield None for i in xrange(0,len(chainseq)): # See if there is a failed correspondance. if chainseq[i] != ungapped[i]: yield None success = False break if success: index = -1 for i in thing: i = thing[i] index = fst.find(aa[i.name],index+1) i.fstindex = index yield i
[docs] def IterResiduesFor(self,chains=None): """ Produce an iterator to allow one to iterate over all residues for a subset of the structure. """ if self.ismodel: orderofthings = self.orderofmodels things = self.models else: orderofthings = self.orderofchains things = self.chains if chains == None: chains = orderofthings for ch in chains: chain = things[ch] for res in chain: yield chain[res]
[docs] def IterAllResidues(self): """ Produce an iterator to allow one to iterate over all possible residues. """ for ch in self.chains.keys(): chain = self.GetChain(ch) for res in chain: yield chain[res] for mo in self.models.keys(): model = self.GetModel(mo) for res in model: yield model[res]
[docs] def gm(self,fasta,chains=None,CA=False,typeof='str'): """ Acquire Geometric Morphometric data corresponding to the (x,y,z) coordinates between all homologous residue positions. Requires a FASTA alignment. Options include using alpha-carbon positions. By default, uses centroids of residues. Returns a list of labels and a list of coordinates as raw GM data. The typeof option provides an option for coordinate output; they are returned as a semicolon-delimited string (str) or as a numpy 2d array (matrix). """ if self.ismodel: orderofthings = self.orderofmodels things = self.models else: orderofthings = self.orderofchains things = self.chains if chains == None: chains = orderofthings labels,coords = [],[] # Output Variables # Set up name grabbing from FASTA file for GM labelling. f = FASTAnet.FASTAstructure(fasta,uniqueOnly=False) name = lambda d: f.orderedSequences[d].name # Acquire all homologous positions as defined in FASTA. items = self._iterResidueAssociations(fasta,chains,f) chainsSaw = [] if typeof == 'matrix': row = [] else: row = '' for pos,chain,res in items: if chain not in chainsSaw: ind = orderofthings.index(chain) nm = name(ind).strip().split(':')[0] labels.append('>%s:%s' % (nm,nm[:4])) if len(chainsSaw) != 0: coords.append(row) chainsSaw.append(chain) if typeof == 'matrix': row = [] else: row = '' if CA: atom = res.GetCA() else: atom = res.Centroid() if isinstance(row, str): row += '%f;%f;%f;'%(atom.x,atom.y,atom.z) elif isinstance(row,list): row.extend([atom.x,atom.y,atom.z]) coords.append(row) # Last chain. if typeof == 'matrix': coords = np.array(coords) return labels,coords
[docs] def WriteGM(self,fasta,gm,chains=None,CA=False): """ Write the information present in this PDB between multiple chains as a Geometric Morphometric text file. This file will be formatted such that individual lines correspond to chains and semi-colons separate the (x,y,z) coordinates between all homologous residue positions. Requires a FASTA alignment. Options include using alpha-carbon positions. By default, uses centroids of residues. """ fgm = open(gm,'w') # Get raw GM data. labels,coords = self.gm(fasta,chains,CA) # Start writing. for ind in xrange(len(labels)): fgm.write(labels[ind] + ';' + coords[ind] + '\n') # Done. fgm.close()
[docs] def WriteLandmarks(self,fasta,lm,chains=None): """ Write the information present in this PDB between multiple chains as a landmark text file. This file will be formatted such that the file is partitioned in sections starting with chain names and individual lines in these correspond to homologous residue positions denoted by homologous position, residue number, and residue name tab-delimited. Requires a FASTA file. """ ismodel=self.ismodel if ismodel: orderofthings = self.orderofmodels things = self.models else: orderofthings = self.orderofchains things = self.chains if chains == None: chains = orderofthings flm = open(lm,'w') # Set up name grabbing from FASTA file for labelling. f = FASTAnet.FASTAstructure(fasta,uniqueOnly=False) name = lambda d: f.orderedSequences[d].name # Acquire all homologous positions as defined in FASTA. items = self._iterResidueAssociations(fasta,chains,f) # Start writing. chainsSaw = [] ind = -1 for pos,chain,res in items: if chain not in chainsSaw: ind = orderofthings.index(chain) nm = name(ind).strip().split(':')[0] flm.write('>%s\n' % (nm)) chainsSaw.append(chain) ind = 0 flm.write('%s\t%s\t%s\n' % (ind,res.index,res.name)) ind += 1
[docs] def FDmatrix(self,fasta,chains=None,scaled=True): """ Compute the form difference matrix (FDM) as explained in Claude 2008. It relates to the identification of the most influential residue, with respect to the overall shape/structure. If the scaled option is True, will return an scaled list (from -1 to 1) of the of lenght equal to the number of residues. Otherwise will return the raw FDM, rounded so it can be included in a PDB. The scaled version is better for vizualization. By default the FDM is computed far all chains, but a subset can be passed to the chains option. """ names, data = self.gm(fasta,chains=chains,typeof='matrix') #get dimensions n_obs = data.shape[0] # number observations/structures n_res = data.shape[1]/3 # Number of residues # Re-sahpe data as a 3D array data= data.reshape(n_obs,n_res,3) # compute meanshape msh=data.mean(axis=0) #compute the form matrix for the mean shape FMmsh = cdist(msh,msh) FMmsh = np.ma.masked_array(FMmsh,np.isnan(FMmsh)) #iterate over observations to compute the form difference matrix vector=[] for index in xrange(n_obs): ndarray=data[index,:,:] FM = cdist(ndarray,ndarray) FDM = np.ma.masked_array(FM,np.isnan(FM))/FMmsh utr = FDM[np.triu_indices(FDM.shape[0],k=-1)] mfdm = np.absolute(FDM - np.median(utr[~np.isnan(utr)])) mdat = np.ma.masked_array(mfdm,np.isnan(mfdm)) s = np.sum(mdat,axis=1).filled(np.nan) vector.append(s) #compute FD and scaled if required FD = (np.array(vector).sum(axis=0))/n_obs if not scaled: return [round(x,2) for x in (FD)] else: return [round(x,3) for x in (2*(FD - min(FD))/(max(FD) - min(FD)) - 1)]
[docs] def Map2Protein(self,outname,lis,chain,fasta): """ Map a list of values (lis), that must have a lenght equal to that of the number of residues in the PDB to be mapped (chain). If a list of list is provided, the first list will be mapped as the beta factor and the second as occupancy """ # Check if more than one thing need to be included if len([lis]) == 1: lis=[lis] # Construct empty PDBstructure. dummy = PDBstructure() # Reset beta and ocuppancy if self.models: m = self.GetModel(chain) newm = dummy.NewModel(m.name) for r in m.IterResidues(): for a in r.GetAtoms(): a.tempFactor=0.00 if len(lis) > 1: a.occupancy=0.00 newm.AddResidue(r) else: m = self.GetChain(chain) newm = dummy.NewChain(m.GetName()) for r in m.IterResidues(): for a in r.GetAtoms(): a.tempFactor=0.00 if len(lis) > 1: a.occupancy=0.00 newm.AddResidue(r) # Acquire all homologous positions as defined in FASTA. items = self._iterResidueAssociations(fasta,None,None) # populate new field for pos,chainb,res in items: if not chainb == chain: continue res = newm.GetResidueByIndex(res.index) for a in res.GetAtoms(): a.tempFactor = lis[0][pos] if len(lis) > 1: a.occupancy = lis[1][pos] newm.WriteAsPDB(outname)
[docs] def Contacts(self,chain=None,thres=4.5): """ Compute the contact map of all chains or a chain. :param chain: A list of chain or model names or a single string or integer. By default, entire structure. :param thres: A threshold for distinguishing contact in Angstroms. :returns: A list of tuples of indices (integers) which correspond to chains or models and their residues. """ ismodel = self.ismodel if ismodel: orderofthings = self.orderofmodels things = self.models else: orderofthings = self.orderofchains things = self.chains if chain == None or chain == 'ALL': chain = orderofthings done = [] self.contactmap = [] if (type(chain) == int or type(chain) == str) or len(chain) == 1: if type(chain) == list or type(chain) == tuple: chain = things[chain[0]] else: chain = things[chain] self.contactmap = chain.ContactMap(thres) else: # A number of chains. iteratorA = self.IterResiduesFor(chain) iteratorB = self.IterResiduesFor(chain) for resA in iteratorA: for resB in iteratorB: if resA == resB or (resA,resB) in done: continue elif resA.InContactWith(resB,thres): if not ismodel: A = resA.index+resA.chain B = resB.index+resB.chain else: A = (resA.index,resA.model) B = (resB.index,resB.model) if not (A,B) in self.contactmap: self.contactmap.append((A,B)) self.contactmap.append((B,A)) done.append((resA,resB)) done.append((resB,resA)) self.contactmap = sorted(self.contactmap, key=lambda element: ( element[0], element[1])) return self.contactmap
[docs] def WriteContacts(self, filename): """ Write contact map. """ fout = open(filename, 'w') for a in self.contactmap: fout.write('%s\n'%(str(a))) fout.close() # Internals
def __iter__(self): """ Returns all PDBchains as an iterator. """ for ch in self.chains: yield self.chains[ch] for mo in self.models: yield self.models[mo] def __len__(self): """ Returns the length of the protein. """ # For all chains. chs = self.chains mds = self.models return sum([len(chs[x]) for x in chs] + [len(mds[x]) for x in mds]) def _remarksToString(self): remarkstr = '' for it in xrange(len(self.remarks)): remarkstr += 'REMARK%4s %s\n' % (str(it+1),self.remarks[it]) return remarkstr def __str__(self): """ As a string, outputs structure in the PDB format. """ out = '' out += self._remarksToString() for chain in self.orderofchains: out += str(self.GetChain(chain)) for model in self.orderofmodels: out += str(self.GetModel(model)) return out
[docs]class PDBfile(object): __slots__ = ('filePath','fileHandle','memHandle','modelIndices','chains','residueIndices','size') def __init__(self,fi): self.filePath = fi self.fileHandle = open(fi,'r+b') self.size = sizeOfFile(fi) self.memHandle = memoryMappedFile(self.fileHandle.fileno(),0) self.residueIndices = dict() self.modelIndices = dict() self.chains = list()
[docs] def isLargeFile(self): """ Return whether or on the PDB file this object represents is incredibly large. """ return (self.size >= PDB_LARGE_FILE_SIZE)
[docs] def close(self): """ Close the file. """ if self.memHandle: self.memHandle.close() else: self.fileHandle.close()
[docs] def read(self): """ Acquire all remarks, and the indices of all models and residues. Returns remarks, and biological source information as a tuple (remarks, organism, taxid, mutant). """ # Metadata. remarks,organism,taxid,mutant = [],'','',False # Cursors. k = 0 curRes = None pos = 0 line = self.memHandle.readline() model = -1 # Scan the file. while line: recordType = line[:6].strip() if recordType == 'MODEL': model = int(line.strip().split()[-1]) self.modelIndices[model] = pos elif recordType == 'ATOM': sl = line.split() if int(sl[1]) >= 100000: k = 1 else: k = 0 chain = line[21+k].strip() if chain not in self.chains: self.chains.append(chain) residue_index = line[22+k:27+k].strip() iCode = line[27].strip() residue = residue_index + iCode if (curRes == None or residue != curRes): self.residueIndices[(model,chain,residue)] = pos curRes = residue elif recordType == 'REMARK': remark = line[6:].strip('\n') if not remark in remarks: remarks.append(remark) elif recordType == 'SOURCE': if 'ORGANISM_SCIENTIFIC' in line: organism = '_'.join(line.strip().strip(';').split()[-2:]) elif 'ORGANISM_TAXID' in line: taxid = line.strip().strip(';').split()[-1] elif 'MUTANT' in line or 'MUTATION' in line: mutant = True pos = self.memHandle.tell() line = self.memHandle.readline() self.memHandle.seek(0) # Return metadata. return (remarks,organism,taxid,mutant)
[docs] def hasResidue(self,chain,res,model=-1): """ Determine if a residue is present in the PDBfile. """ return ((int(model),chain,res) in self.residueIndices)
[docs] def readResidue(self,chain,res,model=-1): """ Parse a residue from the PDB file and return a PDBresidue. """ resObj = None index = self.residueIndices[(int(model),chain,res)] self.memHandle.seek(index) line = self.memHandle.readline() while line: recordType = line[:6].strip() if recordType == 'ATOM': sl = line.split() if int(sl[1]) >= 100000: k = 1 else: k = 0 chain = line[21+k].strip() residue = line[22+k:27+k].strip() + line[27].strip() residue_id = line[17+k:20+k].strip() if residue == res: if not resObj: resObj = PDBresidue(residue,residue_id) serial = int(line[6+k:12+k]) atom_name = line[12+k:16+k].strip() x = float(line[30+k:38+k]) y = float(line[38+k:46+k]) z = float(line[46+k:54+k]) o = float(line[54+k:60+k]) b = float(line[60+k:66+k]) sym = line[76+k:78+k].strip() try: charge = line[79+k:80+k].strip() except: charge=' ' atom = PDBatom(serial,atom_name,x,y,z,o,b,sym,charge) resObj.AddAtom(atom) else: break line = self.memHandle.readline() self.memHandle.seek(0) return resObj
[docs] def hasModels(self): return (len(self.modelIndices) > 0)
[docs] def getModelNames(self): return self.modelIndices.keys()
[docs] def getChainNames(self): return self.chains
[docs] def iterResidueData(self): """ Yield the model number, chain name, and residue number for all residues present in this PDB file, not necessarily in order. """ for mod,ch,res in self.residueIndices.keys(): yield mod,ch,res
[docs] def getResidueNamesInChain(self,ch): res = [] for c,r in self.residueIndices.keys(): if c == ch: res.append(r) return res # Constants
aa = {'ALA':'A','CYS':'C','ASP':'D','GLU':'E', 'PHE':'F','GLY':'G','HIS':'H','ILE':'I', 'LYS':'K','LEU':'L','MET':'M','ASN':'N', 'PRO':'P','GLN':'Q','ARG':'R','SER':'S', 'THR':'T','VAL':'V','TRP':'W','TYR':'Y', 'UNK':'X'} aa_names = {v:k for k, v in aa.items()} # Flip above dictionary. aa_lists = {'ALA':['N','CA','C','O','CB'],\ 'CYS':['N','CA','C','O','CB','SG'],\ 'ASP':['N','CA','C','O','CB','CG','OD1','OD2'],\ 'GLU':['N','CA','C','O','CB','CG','CD','OE1','OE2'],\ 'PHE':['N','CA','C','O','CB','CG','CD1','CD2','CE1','CE2','CZ'],\ 'GLY':['N','CA','C','O'],\ 'HIS':['N','CA','C','O','CB','CG','ND1','CD2','CE1','NE2'],\ 'ILE':['N','CA','C','O','CB','CG1','CG2','CD1'],\ 'LYS':['N','CA','C','O','CB','CG','CD','CE','NZ'],\ 'LEU':['N','CA','C','O','CB','CG','CD1','CD2'],\ 'MET':['N','CA','C','O','CB','CG','SD','CE'],\ 'ASN':['N','CA','C','O','CB','CG','OD1','ND2'],\ 'PRO':['N','CA','C','O','CB','CG','CD'],\ 'GLN':['N','CA','C','O','CB','CG','CD','OE1','NE2'],\ 'ARG':['N','CA','C','O','CB','CG','CD','NE','CZ','NH1','NH2'],\ 'SER':['N','CA','C','O','CB','OG'],\ 'THR':['N','CA','C','O','CB','OG1','CG2'],\ 'VAL':['N','CA','C','O','CB','CG1','CG2'],\ 'TRP':['N','CA','C','O','CB','CG','CD1','CD2','NE1','CE2','CE3','CZ2','CZ3','CH2'],\ 'TYR':['N','CA','C','O','CB','CG','CD1','CD2','CE1','CE2','CZ','OH'],\ 'UNK':[]} # Debugging if __name__ == "__main__": mystructure = PDBstructure(sys.argv[1]) if mystructure.ismodel: names = mystructure.GetModelNames() else: names = mystructure.GetChainNames() lis = mystructure.FDmatrix(sys.argv[1].strip('pdb')+'gm') # for debugging purposes only mystructure.Map2Protein('test.pdb',lis,names[0],sys.argv[1])# for debugging purposes only if len(sys.argv) > 2: print 'RMSD',mystructure.rmsd(sys.argv[2]) print 'RRMSD',mystructure.rrmsd(sys.argv[2]) print 'TMscore',mystructure.tmscore(sys.argv[2]) print 'GDT',mystructure.gdt(sys.argv[2]) x = mystructure.GetAllCentroid('A') mystructure.Contacts() print mystructure.contactmap