Source code for labblouin.homology

# homology.py
# -------------------------
# May 4, 2012; Alex Safatli
# -------------------------
# Homology Modelling utilities.

import os, math, glob, tempfile, random, subprocess, warnings, sys, cStringIO
import PDBnet as PDBnet
from pfam import extractPDBChain
import __main__
__main__.pymol_argv = ['pymol', '-qc'] # Quiet, no GUI for Pymol.
from IO import *
try:
    import pymol
    from modeller import *
    from modeller.scripts import complete_pdb
except: warnings.warn("Could not locate Pymol or Modeller.",ImportWarning)

# ------------ MANUAL MODELLING -------------- #

[docs]class manualModeller:
    '''
    Set up and run Modeller using a given
    folder of PDB files and a target FASTA
    sequence.
    '''
    def __init__(self, outfolder, targetfile, templatefldr):
        self.outFolder = outfolder
        self.targetFile = targetfile
        self.templateFolder = templatefldr
        self.templateFiles = getFilesInFolder(self.templateFolder,'pdb')
        self.__prepare__()
        
    def __buildList__(self):
        '''
        Build a string list of all PDB files.
        '''
        out = ''
        for item in self.templateFiles:
            if (len(out) > 0): out += ", '" + getFileName(item) + "' "
            else: out += "'" + getFileName(item) + "' "
        return out
    
    def __prepare__(self):
        '''
        Prepare input for Modeller.
        '''
        # Make modeller folder.
        if not os.path.isdir(self.outFolder):
            makeFolder(self.outFolder)
            
        # Setup path for alignment PIR file.
        alnfile = os.path.join(self.outFolder,'alignment.seg')
            
        # Setup modeller python script.
        scriptDir = os.path.dirname(os.path.realpath(__file__))
        template = os.path.join(scriptDir,'manual_model')
        modscript = open(template).read()
        modscript = modscript.replace('@@PDBDIR@@', self.templateFolder)
        modscript = modscript.replace('@@ALNFILE@@', 'alignment.seg')
        modscript = modscript.replace('@@KNOWN@@', self.__buildList__())
        modscript = modscript.replace('@@SEQUENCE@@', 'target')
        
        # Write modeller python script.
        modout = os.path.join(self.outFolder,'manual_model.py')
        modfout = open(modout,'w')
        modfout.write(modscript)
        modfout.close()
        
        # Write alignment PIR file.
        p = writeModellerPIR(self.templateFiles,self.targetFile,alnfile)
        if p is None: return
        
[docs]    def run(self):
        '''
        Run, call Modeller.
        '''
        orig = os.getcwd()
        try:
            os.chdir(self.outFolder)
            os.system('python manual_model.py')
        finally:
            os.chdir(orig)
            
[docs]    def postProcess(self):
        '''
        Make a copy of all found models and rename.
        '''
        # Ideally this should rename on basis of some sort of scoring.
        # However, the actual implementation of this scoring by Biskit
        # still needs to be looked at to keep this consistent. For now,
        # renames sequentially.
        templatePDBs = glob.glob(os.path.join(self.outFolder,'target*.pdb'))
        i = 0
        for pdb in templatePDBs:
            copyFile(pdb,os.path.join(self.outFolder,'model_%.2d.pdb'%(i)))
            i += 1

# ----------------- FASTA -------------------- #

[docs]class fasta:
    '''
    Open, read, and organize a FASTA file as an
    object.
    '''
    def __init__(self, fname, removeGaps=True, allUpper=True):
        self.fileName = fname
        self.names = []
        self.sequences = []
        self.read(removeGaps, allUpper)
[docs]    def read(self, rgaps, allup):
        # Reads Fasta file.
        fin = open(self.fileName)
        temp = ''
        for line in fin:
            if line[0] == '>':
                self.names.append(line[1:].strip())
                if temp: self.sequences.append(temp)
                temp = ''
            else:
                toAdd = line.strip()
                if rgaps: toAdd = toAdd.replace('-','').replace('.','')
                if allup: toAdd = toAdd.upper()
                temp += toAdd
        self.sequences.append(temp)
        fin.close()

[docs]def cleanFastaFile(fname, targetfolder=''):
    '''
    Removes gaps from target FASTA file.
    Allows for BLAST input using this file.
    Creates new files for every sequence entry.
    Returns the filename(s) of the new file(s).
    '''    
    if not os.path.isfile(fname): return None    

    # Collect sequences.

    fastaIn = fasta(fname)

    # Open new file to write to.

    numFiles = 0
    fileNames = []
    seqs = fastaIn.sequences
    names = fastaIn.names
    
    if len(seqs) < 2:
        # Ensures a new file is not made if only
        # one sequence in given FASTA file.
        return [fname]

    for x in xrange(len(seqs)):
        temp = ''.join([s for s in names[x] if \
                        s.isalpha() or s.isdigit()]) # Ensures valid filename.
        temp = temp[0:10] # Ensure less than 10 chars.
        newfilename = checkForFileCollision(\
            os.path.join(targetfolder, temp + '.fasta'),'fasta')
        fileNames.append(newfilename)
        fout = open(newfilename, 'w')
        fout.write('>%s\n'% (names[x]))
        for y in range(0, len(seqs[x]), 80):
            fout.write(seqs[x][y:y+80] + '\n')
        fout.close()

    # Return list of filenames.

    return fileNames

[docs]def writeModellerPIR(pdb_files, seq, dest_file):
    out = ""
    for pdb in pdb_files:
        out += ">P1;%s\n" % (getFileName(pdb))
        out += "structure:%s:FIRST    :@:END  :: :  : :  \n*\n" % \
            (getFileName(pdb))
    seqfile = fasta(seq,False,False)
    firstseq = seqfile.sequences[0]
    out += ">P1;target\nsequence: : : : : : : : :\n%s*\n" % (firstseq)
    fout = open(dest_file,'w')
    fout.write(out)
    fout.close()
    if len(out) > 0: return True
    else: return False

[docs]def writeAlignmentFromFasta(fasta_file, dest_file):
    '''
    Reads a FASTA file and outputs it to a
    Clustal-like alignment.
    '''    
    fastaIn = fasta(fasta_file, False, False)
    fseqs = fastaIn.sequences
    fnames = fastaIn.names
    fout = open(dest_file, 'w')
    fout.write('align1\n------\n')
    for seq in range(len(fseqs)):
        seqout = fseqs[seq].upper().replace('.','-')
        fout.write('%s\t%s\n' % (fnames[seq], seqout))
    fout.close()
    return dest_file

[docs]def writeSOAPSSAlignment(fasta_file, pdb1, pdb2, dest_file):
    '''
    Reads a FASTA file and outputs to a SOAPSS
    alignment file.
    '''
    # Determine what residue numbers the PDBs start at.
    resnum = [getFirstResidueNumber(pdb1), getFirstResidueNumber(pdb2)]
    # Write the alignment file from the FASTA.
    fastaIn = fasta(fasta_file, False, True)
    fseqs = fastaIn.sequences
    fnames = fastaIn.names
    if len(fseqs) != 2:
        raise IndexError('Require FASTA length = 2 for alignment.')
    fout = open(dest_file, 'w')
    seq1 = fseqs[0].replace('.','-')
    seq2 = fseqs[1].replace('.','-')
    seq1out, seq2out = '',''
    if len(seq1) != len(seq2):
        raise IndexError('Need same length sequences in FASTA.')
    for ind in xrange(len(seq1)):
        if seq1[ind] == '-' or seq2[ind] == '-':
            seq1out += seq1[ind].lower()
            seq2out += seq2[ind].lower()
        elif seq1[ind] != '-' and seq2[ind] != '-':
            seq1out += seq1[ind]
            seq2out += seq2[ind]
    fout.write('%d %s\n' % (resnum[0],seq1out))
    fout.write('%d %s\n' % (resnum[1],seq2out))
    fout.close()
    return seq1, seq2

[docs]def writeFirstFromFasta(fasta_file, dest_file):
    '''
    Extracts first sequence from a FASTA and
    writes to a new one.
    '''
    fastaIn = fasta(fasta_file, False, False)
    fseq = fastaIn.sequences[0]
    fname = fastaIn.names[0]
    fout = open(dest_file, 'w')
    fout.write('>%s\n'% (fname))
    for y in range(0, len(fseq), 80):
        fout.write(fseq[y:y+80] + '\n')
    fout.close()    

[docs]def extractRandomFasta(fasta_file, numRandom, dest_file):
    '''
    Extracts a random number of sequences
    from a FASTA file. Writes to new FASTA.
    '''
    if not os.path.isfile(fasta_file):
        return

    # Collects sequences.

    fastaIn = fasta(fasta_file, False, False)

    # Extract random sequences.

    fseqs = fastaIn.sequences
    fnames = fastaIn.names
    
    numSeqs = len(fseqs)
    seqs = []
    names = []

    indexMap = range(numSeqs)
    random.shuffle(indexMap)
    randMap = indexMap[:numRandom]
    
    for num in randMap:
        seqs.append(fseqs[num])
        names.append(fnames[num])

    # Write to file.

    fout = open(dest_file, 'w')
    for x in xrange(len(seqs)):
        fout.write('>%s\n'% (names[x]))
        for y in range(0, len(seqs[x]), 80):
            fout.write(seqs[x][y:y+80] + '\n')
    fout.close()        

# -------------------------------------------- #

[docs]def completePDB(pdbin,pdbout):
    '''
    Given a PDB, clean/complete the PDB using
    Modeller's complete_pdb function.
    '''
    # Squelch stdout.
    save_ = sys.stdout
    sys.stdout = cStringIO.StringIO()
    # Perform PDB completion.
    env = environ()
    env.libs.topology.read('${LIB}/top_heav.lib')
    env.libs.parameters.read('${LIB}/par.lib')    
    m = complete_pdb(env,pdbin)
    m.write(file=pdbout)
    # Reclaim stdout.
    sys.stdout = save_

[docs]def getRandomPDBFragment(pdb,k=100):
    '''
    Given a PDB, get a random fragment of length k.
    '''
    pdb = open(pdb)
    dat = [x for x in pdb if x.startswith('ATOM')]
    pdb.close()
    if len(dat) < k: return []
    dat = zip(*[iter(dat)]*k)
    random.shuffle(dat)
    return dat[0]

[docs]def getFirstResidueNumber(pdb):
    '''
    Gets the FASTA residue number of the first
    ATOM in a PDB file.
    '''
    pdb = PDBnet.PDBstructure(pdb)
    order = pdb.chains
    firstchain = pdb.chains.keys()[0]
    return int(order[firstchain][0])

[docs]def checkForFileCollision(path, ext):
    '''
    Ensures a file is not being overwritten.
    Will recursively keep adding a '0' to end of
    base filename until a unique filename is found.
    Returns the path.
    '''
    if os.path.isfile(path):
        newpath = os.path.join(getFolderName(path),getFileName(path) + '0.' + ext)
        return checkForFileCollision(newpath, ext)
    else: return path
    
[docs]def system(instruction):
    '''
    Executes a system command line instruction. Returns
    list of stdout, stderr.
    '''
    sproc = subprocess.Popen(instruction, stdout=subprocess.PIPE, 
                                     stderr=subprocess.PIPE, shell=True)  
    return sproc.communicate()

[docs]def cleanModellerFolder(path):
    '''
    Moves intermediate PDBs to another subfolder in the
    Modeller folder output from Biskit.
    '''        
    templatePDBs = glob.glob(os.path.join(path,'target*.pdb'))
    targetFolder = makeFolder(os.path.join(path,'intermediates'))
    for pdb in templatePDBs:
        moveFile(pdb,targetFolder)

[docs]def getModellerPDB(workflowfolder, targetfile):
    '''
    Copies the model_00.pdb file from Modeller if present
    in given workflow directory from homologyWorkflow.
    '''
    modeller_folder = os.path.join(workflowfolder, 'modeller')
    if os.path.isdir(modeller_folder):
        modeller_file = os.path.join(modeller_folder, 'model_00.pdb')
        if os.path.isfile(modeller_file):
            copyFile(modeller_file, targetfile)

[docs]def compareStructures(foldername1, foldername2, outpath, verbose=False):
    '''
    Given 2 folders with PDB files, calculates the RMSDs
    with all possible pairwise combinations of PDBs; writes
    to file located at path determined by outpath.
    ''' 
    # Running list to ensure no redundant checks.
    crosscheck = []
    # File to write scores to.
    scoreFile = os.path.join(outpath,"structcomparisons.list")
    out = open(scoreFile, "w")

    # Do the actual RMSDs.
    for filesA in glob.glob(os.path.join(foldername1, '*.pdb')):
        for filesB in glob.glob(os.path.join(foldername2, '*.pdb')):
            if (getFileName(filesB), getFileName(filesA)) in crosscheck:
                continue
            if (verbose): print "%s, %s" % (getFileName(filesA), getFileName(filesB))
            line = "%s, %s\t%f\n" % (getFileName(filesA), getFileName(filesB), rmsd(filesA,filesB))
            out.write(line)
            crosscheck.append((getFileName(filesA), getFileName(filesB)))

    # Sorts output RMSDs in descending order.
    sortInstr = 'cat ' + outpath + '/structcomparisons.list | sort -k 3 >> ' \
        + outpath + '/structcomparisons.sorted'
    system(sortInstr)
    return os.path.join(outpath,"structcomparisons.sorted")

[docs]def startPymol():
    '''
    Ensures Pymol has been launched.
    '''
    pymol.finish_launching()

[docs]def radiusOfGyration(pdbFile,chain,pdb=None):
    '''
    Returns the approximate radius of gyration
    of a given PDB molecule. Deprecated; now in PDBnet.
    '''
    if not pdb: pdb = PDBnet.PDBstructure(pdbFile)
    # Get centre of mass of entire molecule.
    mol_centroid = [0,0,0]
    num_atoms = 0
    for r in pdb.chains[chain]:
        resid = pdb.chains[chain][r]
        for at in resid.atoms:
            a = resid.atoms[at]
            mol_centroid[0] += a.x
            mol_centroid[1] += a.y
            mol_centroid[2] += a.z
            num_atoms += 1
    m_x, m_y, m_z = mol_centroid
    mol_centroid = [m_x/num_atoms,m_y/num_atoms,m_z/num_atoms]
    # Sum all of the distances squared vs. centre of mass.
    distsqr = []
    m_x, m_y, m_z = mol_centroid
    for r in pdb.chains[chain]:
        resid = pdb.chains[chain][r]
        for at in resid.atoms:
            a = resid.atoms[at]
            diff = (a.x-m_x+a.y-m_y+a.z-m_z)
            distsqr.append(diff**2)
    # Return the summation of all distance squared divided by
    # the number of atoms, square rooted.
    sumdistratio = sum(distsqr)/num_atoms
    return math.sqrt(sumdistratio)

[docs]def tmscore(alignPDB,alignFASTA):
    
    ''' Returns the TMscore and associated P-value. See Xu &
    Zhang, "How significant is a protein structure similarity with
    TM-score = 0.5?". '''
    
    pdb = PDBnet.PDBstructure(alignPDB)
    if len(pdb.chains) != 2:
        raise IOError('Alignment PDB chain length != 2')
    if not chains: chains = [x for x in pdb.chains][:2]
    tmsc = pdb.tmscore(alignFASTA,chains)
    return tmsc, 1 - math.exp(-math.exp((0.1512-tmsc)/0.0242))

[docs]def rrmsd(alignPDB,alignFASTA,rmsd=False,chains=None):
    
    ''' Returns the rRMSD of an aligned pairwise PDB and 
    FASTA file. See Betancourt & Skolnick, "Universal Similarity 
    Measure for Comparison Protein Structures". Deprecated. '''
    
    pdb = PDBnet.PDBstructure(alignPDB)
    if len(pdb.chains) < 2:
        raise IOError('Alignment PDB chain length < 2')
    if not chains:
        chains = [x for x in pdb.chains][:2]

    rr = pdb.rrmsd(alignFASTA,chains)
    if rmsd: return rr, pdb.rmsd(alignFASTA,chains)
    return rr

[docs]def rmsd(pdbFile1, pdbFile2):
    '''
    Returns the RMSD of two PDB files.
    Requires: Pymol.
    '''        
    pymol.cmd.load(pdbFile1, 'pdb1')
    pymol.cmd.load(pdbFile2, 'pdb2')
    rmsdOut = pymol.cmd.super('pdb1', 'pdb2')[0]
    pymol.cmd.delete('all')
    return rmsdOut
Navigation

Source code for labblouin.homology

Quick search

Navigation