Source code for labblouin.SeqMask

#!/usr/bin/python
'''
Get the amino acid sequence from a landmark file, align this profile to more sequences if required
using muscle, and mask the membership vector into this alignment.
'''
#importing bit####################################################################################################
import sys,os, glob
# End importing###################################################################################################

#Some definitions##################################################################################################
[docs]def InferSingleLettterCode(threlettercode):
	'''
	Convert the amino acid three letter code, into a single letter code
	'''
	from tl.rename.case import transform_sentence_case
	aa = {'Gly':'G' , 'Ala':'A' , 'Val':'V' , 'Leu':'L' , 'Ile':'I' , 'Met':'M' , 'Phe':'F' ,
	      'Trp':'W' , 'Pro':'P' , 'Ser':'S' , 'Thr':'T' , 'Cys':'C' , 'Tyr':'Y' , 'Asn':'N' , 
	      'Gln':'Q' , 'Asp':'D' , 'Glu':'E' , 'Lys':'K' , 'Arg':'R' , 'His':'H'}
	singleletter = aa[transform_sentence_case([threlettercode])[0]]
	return singleletter
	
	
[docs]def InferSeqs_landmarks(prefix):
	'''
	Given a landmark file, return a dictionary with the name as key and the
	sequence as value
	'''
	seqs={}
	inf = open(prefix+'.landmarks')
	rf = inf.read()
	spl = rf.split('\n>')
	for i in range(len(spl)):
		s=''
		name=spl[i][:spl[i].find('\n')]
		bline = spl[i][spl[i].find('\n')+1:].split()

		for j in range(2,len(bline),3):
			s+= InferSingleLettterCode(bline[j])
		seqs[name]=s
	return seqs


[docs]def dict2fasta(outfilename, dic):
	'''
	Convert the dictionary return by InferSeqs_landmarks fuction
	and writes a fasta file. The dictionary should have the seq name as
	key and a string as value containing the sequence
	'''
	fout=open(outfilename,'w')
	for k,v in dic.iteritems():
		fout.write('>'+k+'\n'+v+'\n')
	fout.close()

[docs]def fasta2dict(fastafilename):
	'''
	convert a fasta file into a python dictionary, being the keys the name of the sequence
	and the values the sequence itself, in a list
	'''
	dic={}
	inf = open(fastafilename)
	rf = inf.read().split('\n>')
	for i in range(len(rf)):
		s=''
		name=rf[i][rf[i].find('\n')-1]
		seq = rf[i][rf[i].find('\n'):].strip()
		dic[name]=str2list(seq)
	return dic
			
[docs]def Mem2List2tuple(prefix):
	'''
	Read the membership vector, and returns a list and a dictionary with tupples for
	each module, indicating the indexes
	'''
	mem = open(prefix+'.graphcluster').read()
	mem = mem.strip().split()
	s = set(mem)
	tuples = {}
	for i in range(len(s)):
		tuples[list(s)[i]]=()
		for j in range(len(mem)):
			if mem[j] == list(s)[i]:
				tuples[list(s)[i]]+=(j,)
		
	return mem, tuples

[docs]def str2list(string):
	'''
	returns a list of a string that have no spaces between elements
	'''
	l=[]
	for e in string:
		l.append(e)
	return l
	
[docs]def MaskAln(fastafilename, modulename, mem, tuples):
	'''
	Strip the module in an alignment (fasta) file
	'''
	ms=[]
	names=[]
	afadic = fasta2dict(fastafilename)
	for k, v in afadic.iteritems():
		s=''
		for e in tuples[modulename]:
			s+=v[e]
		names.append(k)
		ms.append(s)
	return ms, names
	
[docs]def modseq2Fasta(prefix,modseq,names):
	for k,v in modseq.iteritems():
		fout=open(prefix+'_%s.fst'%(k),'w')
		for e in range(len(v)):
			fout.write('>'+names[e]+'\n'+v[e]+'\n')
		fout.close()
	
# End of definitions###############################################################################################

# Aplication of the code ##########################################################################################
if __name__ == "__main__":
	# Some defaults
	seqname = None
	phylip = False
	# Get commands line
	for arg in sys.argv[1:]:
		if arg == '-align=':
			seqname = arg[7:]
		elif arg == '-phylip':
			phylip=True
	
	prefix = sys.argv[1]
	
	#create the fasta file of the structural alignment from which modules are being analized
	dict2fasta(prefix+'.fas', InferSeqs_landmarks(prefix))
	# if more sequences perform a profile analysis
	if seqname:
		os.system('muscle -profile -in1 %.fas -in2 %s.fa -out %s_combined.afa'%(prefix , seqname , prefix))
	# Otherwise, rename the fas file and use it as the aln.
	else:
		os.system('mv %s.fas %s.afa'%(prefix,prefix))
	
	# given the alignment and the membership vector, mask each module
	mem , tuples = Mem2List2tuple(prefix)
	modseq={}
	for e in list(set(mem)):
		if e == '?':
			continue
		else:
			modseq[e], names = MaskAln(prefix+'.afa', e, mem, tuples)
	
	modseq2Fasta(prefix,modseq,names)
	
	if phylip:
		cwd = os.getcwd()
		files = glob.glob(cwd+'/*.fst')
		for f in files:
			os.system('Fasta2Phylip %s %s.phy'%(f,f[:-4]))
Navigation

Source code for labblouin.SeqMask

Quick search

Navigation