# Source code for labblouin.RegExpress
#!/usr/bin/python
'''
Prosite pattern generator. Will turn an alignment into a regular expression
'''
#importing bit####################################################################################################
import sys
# End importing###################################################################################################
#some constants####################################################################################################
# One-letter amino-acid codes mapped to their three-letter abbreviations.
aa = {
    'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 'E': 'Glu', 'F': 'Phe',
    'G': 'Gly', 'H': 'His', 'I': 'Ile', 'K': 'Lys', 'L': 'Leu',
    'M': 'Met', 'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg',
    'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp', 'Y': 'Tyr',
}
# The twenty one-letter codes as a set, used for whole-alphabet comparisons.
allaa = set('ACDEFGHIKLMNPQRSTVWY')
#Some definitions##################################################################################################
def check_if_is_align(dictionary_fasta):
    '''
    Check that the sequences read from a fasta form an alignment.

    dictionary_fasta maps sequence names to sequence strings. When every
    sequence has the same length, that length is returned. Otherwise (this
    includes an empty dictionary) a message is printed and the program exits
    through sys.exit(-1).
    '''
    # values() rather than itervalues() so the function runs on Python 2 and 3
    lengths = set(len(seq) for seq in dictionary_fasta.values())
    if len(lengths) != 1:
        print('Sequences not of the same lenght. An alignment should be provided.')
        sys.exit(-1)
    # Exactly one distinct length is left, so it is the alignment length
    return lengths.pop()
def read_fasta(prefix):
    '''
    Read the file <prefix>.fasta and place its sequences in a dictionary.

    Returns a tuple (fastas, l). fastas maps each header line, without the
    leading '>', to its sequence with the line breaks removed. l is the value
    returned by check_if_is_align for that dictionary.
    '''
    fastas = {}
    # The with-block closes the file as soon as its content has been read
    with open(prefix + '.fasta') as handle:
        text = handle.read()
    # Splitting on a newline followed by '>' leaves the marker on the very
    # first header only; drop it so every name is stored the same way
    if text.startswith('>'):
        text = text[1:]
    for record in text.split('\n>'):
        if record == '':
            continue
        # partition also copes with a header that has no sequence lines after
        # it, where slicing on find() == -1 would cut the name short
        name, _, body = record.partition('\n')
        fastas[name] = body.strip().replace('\n', '')
    l = check_if_is_align(fastas)
    return fastas, l
def strip_gaps(matrix):
    '''
    Given an alignment matrix (a list of columns), strip out the gap columns.

    Every column containing '-' is removed from matrix in place. The removed
    columns are returned as a list, in their original order.
    '''
    gapc = [col for col in matrix if '-' in col]
    # Replace the contents in one step: popping from the list while looping
    # over it makes the loop skip the column that follows each removal
    matrix[:] = [col for col in matrix if '-' not in col]
    return gapc
def reg_express_col(col):
    '''
    Given a column of an alignment, return the string corresponding to the
    regular expression of such column.

    The result depends on the set of distinct symbols in col:
      * exactly the twenty symbols in allaa     -> 'x'
      * ten or more symbols (but not allaa)     -> '{...}' listing the members
                                                   of allaa absent from col
      * between two and nine symbols            -> '[...]' listing the symbols
      * a single symbol                         -> that symbol
    Listed symbols are sorted so the output is the same on every run. The gap
    character '-' gets no special treatment here; it counts as a symbol.

    Raises ValueError if col is empty.
    '''
    symbols = set(col)
    if not symbols:
        # Without this guard no branch below would produce a result
        raise ValueError('Cannot build a regular expression from an empty column.')
    if symbols == allaa:
        return 'x'
    if len(symbols) >= 10:
        # Shorter to name what is excluded than what is allowed
        return '{' + ''.join(sorted(allaa.difference(symbols))) + '}'
    if len(symbols) > 1:
        return '[' + ''.join(sorted(symbols)) + ']'
    (only,) = symbols
    return only
def reg_express(fastas, l):
    '''
    Create a matrix with the alignment, discard columns with gaps, and create
    a regular expression similar to prosite.

    fastas maps sequence names to aligned sequence strings and l is the
    number of alignment columns. Each column without a '-' is converted with
    reg_express_col, and the per-column expressions are joined with '-'.
    Returns the resulting string, which is empty when no column is kept.
    '''
    # One list per alignment position, filled sequence by sequence
    columns = [[] for _ in range(l)]
    # values() rather than itervalues() so the function runs on Python 2 and 3
    for seq in fastas.values():
        for position, residue in enumerate(seq):
            columns[position].append(residue)
    # Test the column itself for gaps: a gapped column with ten or more
    # distinct symbols is rendered as a '{...}' exclusion list that does not
    # contain '-', so inspecting the generated expression would let it through
    parts = [reg_express_col(col) for col in columns if '-' not in col]
    return '-'.join(parts)
# End of definitions###############################################################################################
# Application of the code ##########################################################################################
if __name__ == "__main__":
    # Each flag needs its own membership test: a bare '-help' string is
    # always true, which made the usage message print on every run
    help_requested = '-help' in sys.argv or '-h' in sys.argv
    if len(sys.argv) == 1 or help_requested:
        print('Usage: RegExpress.py prefix')
        # Stop here instead of falling through to sys.argv[1], which does not
        # exist when no argument was given; asking for help is not an error
        sys.exit(0 if help_requested else 1)
    prefix = sys.argv[1]
    fastas, l = read_fasta(prefix)
    RE = reg_express(fastas, l)
    print(RE)