#!/usr/bin/python
'''
Process MATT output and convert them to the GM format.
MATT2GM Copyright (C) 2012 Christian Blouin
MATT2GM Version Copyright (C) 2013 Christian Blouin - Jose Sergio Hleap - Alex Safatli
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
E-mail: cblouin@cs.dal.ca, jshleap@squalus.org, iltafas@gmail.com
Requirements:
1) Python:
a) PDBnet.py
#######################################
#PDBnet.py is a python script and have#
#to be provided with this code #
#######################################
'''
# Import section
import os
import sys
from PDBnet import PDBstructure
# Constants
aa = {'A':'ALA','C':'CYS','D':'ASP','E':'GLU','F':'PHE','G':'GLY','H':'HIS','I':'ILE','K':'LYS','L':'LEU','M':'MET','N':'ASN','P':'PRO','Q':'GLN','R':'ARG','S':'SER','T':'THR','V':'VAL','W':'TRP','Y':'TYR'}
# definition section
[docs]def LoadFasta(filename):
# Load Fasta into a dictionary of sequences indexed by chain name
data = open(filename).read().replace(':>',':####boo')
data = data.split('>')[1:]
for i in range(len(data)):
if ':####boo' in data[i]:
data[i] = data[i].replace(':####boo', ':>')
# Output data structure
out = {}
outorder = []
# Read in the sequences
for i in data:
# chain name
colon = i.find(':')
endline = i.find('\n')
chain = i[colon+1:endline]
# Sequence
seq = i[endline:]
seq = seq.replace('\n','')
out[chain] = seq
outorder.append(chain)
return out, outorder
[docs]def GetMask(A):
# Returns a string with * for all homologous sites and . for non all-homologous sites.
# Assume that no gaps can occur in such sites
# Output data structure
out = ''
# Get the lenght of any sequence (they are all the same length)
L = len(A[A.keys()[0]])
keys = A.keys()
for i in range(L):
c = '*'
for k in keys:
if A[k][i] == '-':
c = '.'
break
out += c
return out
[docs]def RemoveNonHomologous(pdb, aln, msk, mapfile):
# Delete the residues that are not homologous through the whole alignment
fout = open(mapfile,'w')
# Get all chains
keys = pdb.chains.keys()
for chain in keys:
struct = pdb.chains[chain]
seq = aln[chain]
indexlist = pdb.chains[chain].GetIndices()
index = 0
fout.write('>'+chain + '\n')
counter = 0
for i in indexlist:
if msk[struct[i].fstindex] != '*':
# Delete
del struct[i]
else:
# Keep
fout.write('%d\t%s\t%s\n'%(counter,struct[i].index.strip(),struct[i].name))
counter += 1
fout.close()
[docs]def Rename_fasta(prefix):
'''
Rename the chains in the fasta file
'''
# chain names
chains = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
'1','2','3','4','5','6','7','8','9','0','~','!','@','#','$','%','^','&','(',')','-','+','=','_','<','>',
'/',"'","\\",'?','"','[',']','{','}','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q',
'r','s','t','u','v','w', 'x', 'y','z']
# create a backup of the fasta
os.system('cp %s.fasta ./%s.fasta_backup'%(prefix,prefix))
#get the fasta file
names=[]
seqs=[]
fasta=open(prefix+'.fasta').read().split('>')
c=0
for line in fasta[1:]:
bline=line.split(':')
names.append('>'+bline[0]+':'+chains[c])
c+=1
seqs.append(bline[1][bline[1].find('\n')+1:])
#open outfile
out=open(prefix+'.fasta','w')
#write the new fastafile
for j in range(len(names)):
out.write(names[j]+'\n'+seqs[j])
out.close()
[docs]def main(prefix,alphaC=False,redundant=False):
# Form the proper file names
pdbfile = prefix + '.pdb'
fastafile = prefix + '.fasta'
mapfile = prefix + '.landmarks'
outfile = prefix + '.gm'
# Read all structures into RAM
pdbs = PDBstructure(pdbfile)
# if redundant structure fasta need to rename the chains
if redundant:
Rename_fasta(prefix)
# Get the alignment
alns, kalns = LoadFasta(fastafile)
# Check that both the alignment and the pdb have the same chain names
kpdbs= pdbs.orderofchains
newalns={}
if not len(kalns) == len(kpdbs):
print 'not the same number of chains in the PDB and FASTA files'
sys.exit(-1)
else:
for a in range(len(kalns)):
newalns[kpdbs[a]] = alns.pop(kalns[a])
alns = None
alns = newalns
for chain in pdbs.chains:
pdbs.IndexSeq(chain, alns[chain])
# Get the mask
mask = GetMask(alns)
# Delete unaliagned sites
RemoveNonHomologous(pdbs, alns, mask, mapfile)
# Write to an output
fout = open(outfile,'w')
for chain in pdbs.chains:
fout.write('>'+prefix+':'+chain+';')
for index in pdbs.chains[chain].GetIndices():
# deleted item
if not pdbs.chains[chain].GetResidueByIndex(index):
continue
# Write XYZ
pdbs.chains[chain][index].Centroid()
if alphaC:
ca= pdbs.chains[chain][index].atoms['CA']
else:
ca = pdbs.chains[chain][index].centroid
for c in [ca.x,ca.y,ca.z]:
fout.write('%f;'%(c))
# End of line
fout.write('\n')
# Application code
if __name__ == '__main__':
# Read in the file prefix to process
if len(sys.argv) < 2:
print "Usage: MATT2GM.py prefix [options: -alphaC, -redundant]"
sys.exit()
alphaC=False
redundant=False
prefix= sys.argv[1]
for arg in sys.argv[1:]:
if arg == '-alphaC':
alphaC=True
elif arg == '-redundant':
redundant=True
main(prefix,alphaC,redundant)