Source code for labblouin.MATT2GM

#!/usr/bin/python
'''
   Process MATT output and convert them to the GM format.
   
   MATT2GM Copyright (C) 2012 Christian Blouin
   MATT2GM Version Copyright (C) 2013 Christian Blouin - Jose Sergio Hleap - Alex Safatli
   
   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
   
   E-mail: cblouin@cs.dal.ca, jshleap@squalus.org, iltafas@gmail.com
   
   Requirements:
   1) Python:
	  a) PDBnet.py
	  #######################################
	  #PDBnet.py is a python script and have#
	  #to be provided with this code        #
	  #######################################
   
'''

# Import section
import os
import sys
from PDBnet import PDBstructure
# Constants
aa = {'A':'ALA','C':'CYS','D':'ASP','E':'GLU','F':'PHE','G':'GLY','H':'HIS','I':'ILE','K':'LYS','L':'LEU','M':'MET','N':'ASN','P':'PRO','Q':'GLN','R':'ARG','S':'SER','T':'THR','V':'VAL','W':'TRP','Y':'TYR'}

# definition section
[docs]def LoadFasta(filename):
	# Load Fasta into a dictionary of sequences indexed by chain name
	data = open(filename).read().replace(':>',':####boo')
	data = data.split('>')[1:]
	for i in range(len(data)):
		if ':####boo' in data[i]:
			data[i] = data[i].replace(':####boo', ':>')

	# Output data structure
	out = {}
	outorder = []

	# Read in the sequences
	for i in data:
		# chain name
		colon = i.find(':')
		endline = i.find('\n')
		chain = i[colon+1:endline]

		# Sequence
		seq = i[endline:]
		seq = seq.replace('\n','')

		out[chain] = seq
		outorder.append(chain)

	return out, outorder

[docs]def GetMask(A):
	# Returns a string with * for all homologous sites and . for non all-homologous sites.
	# Assume that no gaps can occur in such sites

	# Output data structure
	out = ''

	# Get the lenght of any sequence (they are all the same length)
	L = len(A[A.keys()[0]])

	keys = A.keys()
	for i in range(L):
		c = '*'
		for k in keys:
			if A[k][i] == '-':
				c = '.'
				break
		out += c

	return out

[docs]def RemoveNonHomologous(pdb, aln, msk, mapfile):
	# Delete the residues that are not homologous through the whole alignment

	fout = open(mapfile,'w')

	# Get all chains
	keys = pdb.chains.keys()

	for chain in keys:
		struct = pdb.chains[chain]
		seq = aln[chain]
		indexlist = pdb.chains[chain].GetIndices()
		index = 0

		fout.write('>'+chain + '\n')
		counter = 0

		for i in indexlist:
			if msk[struct[i].fstindex] != '*':
				# Delete
				del struct[i]
			else:
				# Keep
				fout.write('%d\t%s\t%s\n'%(counter,struct[i].index.strip(),struct[i].name))
				counter += 1


	fout.close()

[docs]def Rename_fasta(prefix):
	'''
	Rename the chains in the fasta file
	'''
	# chain names
	chains = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
		      '1','2','3','4','5','6','7','8','9','0','~','!','@','#','$','%','^','&','(',')','-','+','=','_','<','>',
		      '/',"'","\\",'?','"','[',']','{','}','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q',
		      'r','s','t','u','v','w', 'x', 'y','z']
	# create a backup of the fasta
	os.system('cp %s.fasta ./%s.fasta_backup'%(prefix,prefix))
	#get the fasta file
	names=[]
	seqs=[]
	fasta=open(prefix+'.fasta').read().split('>')
	c=0
	for line in fasta[1:]:
		bline=line.split(':')
		names.append('>'+bline[0]+':'+chains[c])
		c+=1
		seqs.append(bline[1][bline[1].find('\n')+1:])

	#open outfile
	out=open(prefix+'.fasta','w')
	#write the new fastafile
	for j in range(len(names)):
		out.write(names[j]+'\n'+seqs[j])

	out.close()

[docs]def main(prefix,alphaC=False,redundant=False):
	# Form the proper file names
	pdbfile = prefix + '.pdb'
	fastafile = prefix + '.fasta'
	mapfile = prefix + '.landmarks'
	outfile = prefix + '.gm'

	# Read all structures into RAM
	pdbs = PDBstructure(pdbfile)

	# if redundant structure fasta need to rename the chains
	if redundant:
		Rename_fasta(prefix)

	# Get the alignment
	alns, kalns = LoadFasta(fastafile)

	# Check that both the alignment and the pdb have the same chain names
	kpdbs= pdbs.orderofchains
	newalns={}
	if not len(kalns) == len(kpdbs):
		print 'not the same number of chains in the PDB and FASTA files'
		sys.exit(-1)
	else:
		for a in range(len(kalns)):
			newalns[kpdbs[a]] = alns.pop(kalns[a])
	alns = None
	alns = newalns
				
	for chain in pdbs.chains:
		pdbs.IndexSeq(chain, alns[chain])

	# Get the mask
	mask = GetMask(alns)

	# Delete unaliagned sites
	RemoveNonHomologous(pdbs, alns, mask, mapfile)

	# Write to an output
	fout = open(outfile,'w')
	for chain in pdbs.chains:
		fout.write('>'+prefix+':'+chain+';')
		for index in pdbs.chains[chain].GetIndices():
			# deleted item
			if not pdbs.chains[chain].GetResidueByIndex(index):
				continue
			# Write XYZ
			pdbs.chains[chain][index].Centroid()
			if alphaC:
				ca= pdbs.chains[chain][index].atoms['CA']
			else:
				ca = pdbs.chains[chain][index].centroid

			for c in [ca.x,ca.y,ca.z]:
				fout.write('%f;'%(c))
		# End of line
		fout.write('\n')
		
# Application code
if __name__ == '__main__':
	# Read in the file prefix to process
	if len(sys.argv) < 2:
		print "Usage: MATT2GM.py prefix [options: -alphaC, -redundant]"
		sys.exit()
	alphaC=False
	redundant=False
	prefix= sys.argv[1]
	for arg in sys.argv[1:]:
		if arg == '-alphaC':
			alphaC=True
		elif arg == '-redundant':
			redundant=True
	
	main(prefix,alphaC,redundant)
Navigation

Source code for labblouin.MATT2GM

Quick search

Navigation