Source code for labblouin.SeqMask

#!/usr/bin/python
'''
Get the amino acid sequence from a landmark file, align this profile to more sequences if required
using muscle, and mask the membership vector into this alignment.
'''
#importing bit####################################################################################################
import sys,os, glob
# End importing###################################################################################################

#Some definitions##################################################################################################
[docs]def InferSingleLettterCode(threlettercode): ''' Convert the amino acid three letter code, into a single letter code ''' from tl.rename.case import transform_sentence_case aa = {'Gly':'G' , 'Ala':'A' , 'Val':'V' , 'Leu':'L' , 'Ile':'I' , 'Met':'M' , 'Phe':'F' , 'Trp':'W' , 'Pro':'P' , 'Ser':'S' , 'Thr':'T' , 'Cys':'C' , 'Tyr':'Y' , 'Asn':'N' , 'Gln':'Q' , 'Asp':'D' , 'Glu':'E' , 'Lys':'K' , 'Arg':'R' , 'His':'H'} singleletter = aa[transform_sentence_case([threlettercode])[0]] return singleletter
[docs]def InferSeqs_landmarks(prefix): ''' Given a landmark file, return a dictionary with the name as key and the sequence as value ''' seqs={} inf = open(prefix+'.landmarks') rf = inf.read() spl = rf.split('\n>') for i in range(len(spl)): s='' name=spl[i][:spl[i].find('\n')] bline = spl[i][spl[i].find('\n')+1:].split() for j in range(2,len(bline),3): s+= InferSingleLettterCode(bline[j]) seqs[name]=s return seqs
[docs]def dict2fasta(outfilename, dic): ''' Convert the dictionary return by InferSeqs_landmarks fuction and writes a fasta file. The dictionary should have the seq name as key and a string as value containing the sequence ''' fout=open(outfilename,'w') for k,v in dic.iteritems(): fout.write('>'+k+'\n'+v+'\n') fout.close()
[docs]def fasta2dict(fastafilename): ''' convert a fasta file into a python dictionary, being the keys the name of the sequence and the values the sequence itself, in a list ''' dic={} inf = open(fastafilename) rf = inf.read().split('\n>') for i in range(len(rf)): s='' name=rf[i][rf[i].find('\n')-1] seq = rf[i][rf[i].find('\n'):].strip() dic[name]=str2list(seq) return dic
[docs]def Mem2List2tuple(prefix): ''' Read the membership vector, and returns a list and a dictionary with tupples for each module, indicating the indexes ''' mem = open(prefix+'.graphcluster').read() mem = mem.strip().split() s = set(mem) tuples = {} for i in range(len(s)): tuples[list(s)[i]]=() for j in range(len(mem)): if mem[j] == list(s)[i]: tuples[list(s)[i]]+=(j,) return mem, tuples
[docs]def str2list(string): ''' returns a list of a string that have no spaces between elements ''' l=[] for e in string: l.append(e) return l
[docs]def MaskAln(fastafilename, modulename, mem, tuples): ''' Strip the module in an alignment (fasta) file ''' ms=[] names=[] afadic = fasta2dict(fastafilename) for k, v in afadic.iteritems(): s='' for e in tuples[modulename]: s+=v[e] names.append(k) ms.append(s) return ms, names
[docs]def modseq2Fasta(prefix,modseq,names): for k,v in modseq.iteritems(): fout=open(prefix+'_%s.fst'%(k),'w') for e in range(len(v)): fout.write('>'+names[e]+'\n'+v[e]+'\n') fout.close() # End of definitions############################################################################################### # Aplication of the code ##########################################################################################
if __name__ == "__main__": # Some defaults seqname = None phylip = False # Get commands line for arg in sys.argv[1:]: if arg == '-align=': seqname = arg[7:] elif arg == '-phylip': phylip=True prefix = sys.argv[1] #create the fasta file of the structural alignment from which modules are being analized dict2fasta(prefix+'.fas', InferSeqs_landmarks(prefix)) # if more sequences perform a profile analysis if seqname: os.system('muscle -profile -in1 %.fas -in2 %s.fa -out %s_combined.afa'%(prefix , seqname , prefix)) # Otherwise, rename the fas file and use it as the aln. else: os.system('mv %s.fas %s.afa'%(prefix,prefix)) # given the alignment and the membership vector, mask each module mem , tuples = Mem2List2tuple(prefix) modseq={} for e in list(set(mem)): if e == '?': continue else: modseq[e], names = MaskAln(prefix+'.afa', e, mem, tuples) modseq2Fasta(prefix,modseq,names) if phylip: cwd = os.getcwd() files = glob.glob(cwd+'/*.fst') for f in files: os.system('Fasta2Phylip %s %s.phy'%(f,f[:-4]))