Source code for labblouin.homology

# homology.py
# -------------------------
# May 4, 2012; Alex Safatli
# -------------------------
# Homology Modelling utilities.

import os, math, glob, tempfile, random, subprocess, warnings, sys, cStringIO
import PDBnet as PDBnet
from pfam import extractPDBChain
import __main__
__main__.pymol_argv = ['pymol', '-qc'] # Quiet, no GUI for Pymol.
from IO import *
try:
    import pymol
    from modeller import *
    from modeller.scripts import complete_pdb
except: warnings.warn("Could not locate Pymol or Modeller.",ImportWarning)

# ------------ MANUAL MODELLING -------------- #

[docs]class manualModeller: ''' Set up and run Modeller using a given folder of PDB files and a target FASTA sequence. ''' def __init__(self, outfolder, targetfile, templatefldr): self.outFolder = outfolder self.targetFile = targetfile self.templateFolder = templatefldr self.templateFiles = getFilesInFolder(self.templateFolder,'pdb') self.__prepare__() def __buildList__(self): ''' Build a string list of all PDB files. ''' out = '' for item in self.templateFiles: if (len(out) > 0): out += ", '" + getFileName(item) + "' " else: out += "'" + getFileName(item) + "' " return out def __prepare__(self): ''' Prepare input for Modeller. ''' # Make modeller folder. if not os.path.isdir(self.outFolder): makeFolder(self.outFolder) # Setup path for alignment PIR file. alnfile = os.path.join(self.outFolder,'alignment.seg') # Setup modeller python script. scriptDir = os.path.dirname(os.path.realpath(__file__)) template = os.path.join(scriptDir,'manual_model') modscript = open(template).read() modscript = modscript.replace('@@PDBDIR@@', self.templateFolder) modscript = modscript.replace('@@ALNFILE@@', 'alignment.seg') modscript = modscript.replace('@@KNOWN@@', self.__buildList__()) modscript = modscript.replace('@@SEQUENCE@@', 'target') # Write modeller python script. modout = os.path.join(self.outFolder,'manual_model.py') modfout = open(modout,'w') modfout.write(modscript) modfout.close() # Write alignment PIR file. p = writeModellerPIR(self.templateFiles,self.targetFile,alnfile) if p is None: return
[docs] def run(self): ''' Run, call Modeller. ''' orig = os.getcwd() try: os.chdir(self.outFolder) os.system('python manual_model.py') finally: os.chdir(orig)
[docs] def postProcess(self): ''' Make a copy of all found models and rename. ''' # Ideally this should rename on basis of some sort of scoring. # However, the actual implementation of this scoring by Biskit # still needs to be looked at to keep this consistent. For now, # renames sequentially. templatePDBs = glob.glob(os.path.join(self.outFolder,'target*.pdb')) i = 0 for pdb in templatePDBs: copyFile(pdb,os.path.join(self.outFolder,'model_%.2d.pdb'%(i))) i += 1 # ----------------- FASTA -------------------- #
[docs]class fasta: ''' Open, read, and organize a FASTA file as an object. ''' def __init__(self, fname, removeGaps=True, allUpper=True): self.fileName = fname self.names = [] self.sequences = [] self.read(removeGaps, allUpper)
[docs] def read(self, rgaps, allup): # Reads Fasta file. fin = open(self.fileName) temp = '' for line in fin: if line[0] == '>': self.names.append(line[1:].strip()) if temp: self.sequences.append(temp) temp = '' else: toAdd = line.strip() if rgaps: toAdd = toAdd.replace('-','').replace('.','') if allup: toAdd = toAdd.upper() temp += toAdd self.sequences.append(temp) fin.close()
[docs]def cleanFastaFile(fname, targetfolder=''): ''' Removes gaps from target FASTA file. Allows for BLAST input using this file. Creates new files for every sequence entry. Returns the filename(s) of the new file(s). ''' if not os.path.isfile(fname): return None # Collect sequences. fastaIn = fasta(fname) # Open new file to write to. numFiles = 0 fileNames = [] seqs = fastaIn.sequences names = fastaIn.names if len(seqs) < 2: # Ensures a new file is not made if only # one sequence in given FASTA file. return [fname] for x in xrange(len(seqs)): temp = ''.join([s for s in names[x] if \ s.isalpha() or s.isdigit()]) # Ensures valid filename. temp = temp[0:10] # Ensure less than 10 chars. newfilename = checkForFileCollision(\ os.path.join(targetfolder, temp + '.fasta'),'fasta') fileNames.append(newfilename) fout = open(newfilename, 'w') fout.write('>%s\n'% (names[x])) for y in range(0, len(seqs[x]), 80): fout.write(seqs[x][y:y+80] + '\n') fout.close() # Return list of filenames. return fileNames
[docs]def writeModellerPIR(pdb_files, seq, dest_file): out = "" for pdb in pdb_files: out += ">P1;%s\n" % (getFileName(pdb)) out += "structure:%s:FIRST :@:END :: : : : \n*\n" % \ (getFileName(pdb)) seqfile = fasta(seq,False,False) firstseq = seqfile.sequences[0] out += ">P1;target\nsequence: : : : : : : : :\n%s*\n" % (firstseq) fout = open(dest_file,'w') fout.write(out) fout.close() if len(out) > 0: return True else: return False
[docs]def writeAlignmentFromFasta(fasta_file, dest_file): ''' Reads a FASTA file and outputs it to a Clustal-like alignment. ''' fastaIn = fasta(fasta_file, False, False) fseqs = fastaIn.sequences fnames = fastaIn.names fout = open(dest_file, 'w') fout.write('align1\n------\n') for seq in range(len(fseqs)): seqout = fseqs[seq].upper().replace('.','-') fout.write('%s\t%s\n' % (fnames[seq], seqout)) fout.close() return dest_file
[docs]def writeSOAPSSAlignment(fasta_file, pdb1, pdb2, dest_file): ''' Reads a FASTA file and outputs to a SOAPSS alignment file. ''' # Determine what residue numbers the PDBs start at. resnum = [getFirstResidueNumber(pdb1), getFirstResidueNumber(pdb2)] # Write the alignment file from the FASTA. fastaIn = fasta(fasta_file, False, True) fseqs = fastaIn.sequences fnames = fastaIn.names if len(fseqs) != 2: raise IndexError('Require FASTA length = 2 for alignment.') fout = open(dest_file, 'w') seq1 = fseqs[0].replace('.','-') seq2 = fseqs[1].replace('.','-') seq1out, seq2out = '','' if len(seq1) != len(seq2): raise IndexError('Need same length sequences in FASTA.') for ind in xrange(len(seq1)): if seq1[ind] == '-' or seq2[ind] == '-': seq1out += seq1[ind].lower() seq2out += seq2[ind].lower() elif seq1[ind] != '-' and seq2[ind] != '-': seq1out += seq1[ind] seq2out += seq2[ind] fout.write('%d %s\n' % (resnum[0],seq1out)) fout.write('%d %s\n' % (resnum[1],seq2out)) fout.close() return seq1, seq2
[docs]def writeFirstFromFasta(fasta_file, dest_file): ''' Extracts first sequence from a FASTA and writes to a new one. ''' fastaIn = fasta(fasta_file, False, False) fseq = fastaIn.sequences[0] fname = fastaIn.names[0] fout = open(dest_file, 'w') fout.write('>%s\n'% (fname)) for y in range(0, len(fseq), 80): fout.write(fseq[y:y+80] + '\n') fout.close()
[docs]def extractRandomFasta(fasta_file, numRandom, dest_file): ''' Extracts a random number of sequences from a FASTA file. Writes to new FASTA. ''' if not os.path.isfile(fasta_file): return # Collects sequences. fastaIn = fasta(fasta_file, False, False) # Extract random sequences. fseqs = fastaIn.sequences fnames = fastaIn.names numSeqs = len(fseqs) seqs = [] names = [] indexMap = range(numSeqs) random.shuffle(indexMap) randMap = indexMap[:numRandom] for num in randMap: seqs.append(fseqs[num]) names.append(fnames[num]) # Write to file. fout = open(dest_file, 'w') for x in xrange(len(seqs)): fout.write('>%s\n'% (names[x])) for y in range(0, len(seqs[x]), 80): fout.write(seqs[x][y:y+80] + '\n') fout.close() # -------------------------------------------- #
[docs]def completePDB(pdbin,pdbout): ''' Given a PDB, clean/complete the PDB using Modeller's complete_pdb function. ''' # Squelch stdout. save_ = sys.stdout sys.stdout = cStringIO.StringIO() # Perform PDB completion. env = environ() env.libs.topology.read('${LIB}/top_heav.lib') env.libs.parameters.read('${LIB}/par.lib') m = complete_pdb(env,pdbin) m.write(file=pdbout) # Reclaim stdout. sys.stdout = save_
[docs]def getRandomPDBFragment(pdb,k=100): ''' Given a PDB, get a random fragment of length k. ''' pdb = open(pdb) dat = [x for x in pdb if x.startswith('ATOM')] pdb.close() if len(dat) < k: return [] dat = zip(*[iter(dat)]*k) random.shuffle(dat) return dat[0]
[docs]def getFirstResidueNumber(pdb): ''' Gets the FASTA residue number of the first ATOM in a PDB file. ''' pdb = PDBnet.PDBstructure(pdb) order = pdb.chains firstchain = pdb.chains.keys()[0] return int(order[firstchain][0])
[docs]def checkForFileCollision(path, ext): ''' Ensures a file is not being overwritten. Will recursively keep adding a '0' to end of base filename until a unique filename is found. Returns the path. ''' if os.path.isfile(path): newpath = os.path.join(getFolderName(path),getFileName(path) + '0.' + ext) return checkForFileCollision(newpath, ext) else: return path
[docs]def system(instruction): ''' Executes a system command line instruction. Returns list of stdout, stderr. ''' sproc = subprocess.Popen(instruction, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) return sproc.communicate()
[docs]def cleanModellerFolder(path): ''' Moves intermediate PDBs to another subfolder in the Modeller folder output from Biskit. ''' templatePDBs = glob.glob(os.path.join(path,'target*.pdb')) targetFolder = makeFolder(os.path.join(path,'intermediates')) for pdb in templatePDBs: moveFile(pdb,targetFolder)
[docs]def getModellerPDB(workflowfolder, targetfile): ''' Copies the model_00.pdb file from Modeller if present in given workflow directory from homologyWorkflow. ''' modeller_folder = os.path.join(workflowfolder, 'modeller') if os.path.isdir(modeller_folder): modeller_file = os.path.join(modeller_folder, 'model_00.pdb') if os.path.isfile(modeller_file): copyFile(modeller_file, targetfile)
[docs]def compareStructures(foldername1, foldername2, outpath, verbose=False): ''' Given 2 folders with PDB files, calculates the RMSDs with all possible pairwise combinations of PDBs; writes to file located at path determined by outpath. ''' # Running list to ensure no redundant checks. crosscheck = [] # File to write scores to. scoreFile = os.path.join(outpath,"structcomparisons.list") out = open(scoreFile, "w") # Do the actual RMSDs. for filesA in glob.glob(os.path.join(foldername1, '*.pdb')): for filesB in glob.glob(os.path.join(foldername2, '*.pdb')): if (getFileName(filesB), getFileName(filesA)) in crosscheck: continue if (verbose): print "%s, %s" % (getFileName(filesA), getFileName(filesB)) line = "%s, %s\t%f\n" % (getFileName(filesA), getFileName(filesB), rmsd(filesA,filesB)) out.write(line) crosscheck.append((getFileName(filesA), getFileName(filesB))) # Sorts output RMSDs in descending order. sortInstr = 'cat ' + outpath + '/structcomparisons.list | sort -k 3 >> ' \ + outpath + '/structcomparisons.sorted' system(sortInstr) return os.path.join(outpath,"structcomparisons.sorted")
[docs]def startPymol(): ''' Ensures Pymol has been launched. ''' pymol.finish_launching()
[docs]def radiusOfGyration(pdbFile,chain,pdb=None): ''' Returns the approximate radius of gyration of a given PDB molecule. Deprecated; now in PDBnet. ''' if not pdb: pdb = PDBnet.PDBstructure(pdbFile) # Get centre of mass of entire molecule. mol_centroid = [0,0,0] num_atoms = 0 for r in pdb.chains[chain]: resid = pdb.chains[chain][r] for at in resid.atoms: a = resid.atoms[at] mol_centroid[0] += a.x mol_centroid[1] += a.y mol_centroid[2] += a.z num_atoms += 1 m_x, m_y, m_z = mol_centroid mol_centroid = [m_x/num_atoms,m_y/num_atoms,m_z/num_atoms] # Sum all of the distances squared vs. centre of mass. distsqr = [] m_x, m_y, m_z = mol_centroid for r in pdb.chains[chain]: resid = pdb.chains[chain][r] for at in resid.atoms: a = resid.atoms[at] diff = (a.x-m_x+a.y-m_y+a.z-m_z) distsqr.append(diff**2) # Return the summation of all distance squared divided by # the number of atoms, square rooted. sumdistratio = sum(distsqr)/num_atoms return math.sqrt(sumdistratio)
[docs]def tmscore(alignPDB,alignFASTA): ''' Returns the TMscore and associated P-value. See Xu & Zhang, "How significant is a protein structure similarity with TM-score = 0.5?". ''' pdb = PDBnet.PDBstructure(alignPDB) if len(pdb.chains) != 2: raise IOError('Alignment PDB chain length != 2') if not chains: chains = [x for x in pdb.chains][:2] tmsc = pdb.tmscore(alignFASTA,chains) return tmsc, 1 - math.exp(-math.exp((0.1512-tmsc)/0.0242))
[docs]def rrmsd(alignPDB,alignFASTA,rmsd=False,chains=None): ''' Returns the rRMSD of an aligned pairwise PDB and FASTA file. See Betancourt & Skolnick, "Universal Similarity Measure for Comparison Protein Structures". Deprecated. ''' pdb = PDBnet.PDBstructure(alignPDB) if len(pdb.chains) < 2: raise IOError('Alignment PDB chain length < 2') if not chains: chains = [x for x in pdb.chains][:2] rr = pdb.rrmsd(alignFASTA,chains) if rmsd: return rr, pdb.rmsd(alignFASTA,chains) return rr
[docs]def rmsd(pdbFile1, pdbFile2): ''' Returns the RMSD of two PDB files. Requires: Pymol. ''' pymol.cmd.load(pdbFile1, 'pdb1') pymol.cmd.load(pdbFile2, 'pdb2') rmsdOut = pymol.cmd.super('pdb1', 'pdb2')[0] pymol.cmd.delete('all') return rmsdOut