# Source code for labblouin.FASTAnet
''' A simple parser and writer for FASTA sequence files that uses a rich object hierarchy. '''
# Date: May 15 2013
# Author: Alex Safatli
# Email: safatli@cs.dal.ca
class FASTAsequence(object):
    ''' A single FASTA record: a name plus its sequence string.

    Equality and hashing are based on the sequence only, never on the
    name. Because the in-place modifiers (removeGaps, toUpper, toLower)
    change the sequence, they also change the hash of the object; any
    dictionary or set that holds this object as a key must be rebuilt
    after calling them. '''

    # Number of sequence characters emitted per line of FASTA text.
    LINE_WIDTH = 70

    def __init__(self, name, seq):
        ''' Initialize this object. Provide a name for the sequence
        and the sequence itself as parameters. '''
        self.name = name
        self.sequence = seq

    @property
    def __fastaseq__(self):
        ''' The sequence wrapped at LINE_WIDTH characters per line.

        Computed on access instead of being cached at construction, so
        that text produced by str() always reflects the current sequence
        even after removeGaps(), toUpper() or toLower() were called. '''
        return '\n'.join(self)

    def __iter__(self):
        ''' Iterate through a sequence in a pseudo-line-by-line
        manner as if it was read in a FASTA file. '''
        s = self.sequence
        width = self.LINE_WIDTH
        # range (not xrange) keeps this working on Python 2 and Python 3.
        for i in range(0, len(s), width):
            yield s[i:i + width]

    def __hash__(self):
        ''' Hash of the sequence string (the name does not take part). '''
        return hash(self.sequence)

    def __eq__(self, o):
        ''' Two records are equal when their sequences are equal.
        Objects without a sequence attribute are simply not equal. '''
        try:
            other = o.sequence
        except AttributeError:
            # Let Python fall back to its default comparison instead of
            # crashing on e.g. FASTAsequence == 'some string'.
            return NotImplemented
        return self.sequence == other

    def __ne__(self, o):
        ''' Negation of __eq__ (needed explicitly on Python 2). '''
        result = self.__eq__(o)
        if result is NotImplemented:
            return result
        return not result

    def count(self, item):
        ''' Return the number of characters in the sequence equal to input string. '''
        return self.sequence.count(item)

    def removeGaps(self):
        ''' Modify the sequence so gaps ('-' and '.') are removed and return it. '''
        self.sequence = self.sequence.replace('-', '').replace('.', '')
        return self.sequence

    def toUpper(self):
        ''' Modify the sequence so it is uppercase and return it. '''
        self.sequence = self.sequence.upper()
        return self.sequence

    def toLower(self):
        ''' Return a lowercase version of the sequence, also changing it
        in the structure. '''
        self.sequence = self.sequence.lower()
        return self.sequence

    def __len__(self):
        ''' Return the number of characters in the sequence. '''
        return len(self.sequence)

    def __str__(self):
        ''' Return the record as FASTA text: a '>name' header line
        followed by the wrapped sequence and a trailing newline. '''
        return '>%s\n%s\n' % (self.name, self.__fastaseq__)
class FASTAstructure(object):
    ''' A collection of FASTAsequence records that can be read from and
    written to FASTA text.

    Three containers are kept in step:
      sequences        -- dict mapping a name to its record
      orderedSequences -- list of the records in output order
      sequenceNames    -- dict mapping a record to the list of names
                          (first name plus aliases) seen for it '''

    def __init__(self, filein='', uniqueOnly=True, curate=False):
        ''' A file to be read is optional. If uniqueOnly is set to
        false, multiple duplicate sequences are allowed; otherwise,
        duplicates are ignored and their aliases are recorded in
        sequenceNames. If curate is triggered, will remove special
        characters from names. '''
        self.sequences = {}
        self.orderedSequences = []
        self.sequenceNames = {}
        self.uniqueOnly = uniqueOnly
        self.curate = curate
        if filein:
            # First, try to read in a path. Otherwise, read in the
            # string comprising the content of a file. Only errors that
            # mean "this is not an openable path" trigger the fallback,
            # so KeyboardInterrupt and genuine bugs are not swallowed.
            try:
                self.readFile(filein)
            except (IOError, OSError, ValueError, TypeError):
                self.read(filein.split('\n'))

    def getSequenceNames(self):
        ''' Return a list holding, for every distinct record, the list
        of names recorded for it. '''
        return list(self.sequenceNames.values())

    def getSequences(self):
        ''' Return the list of records in their current order. '''
        return self.orderedSequences

    def getSequenceByName(self, n):
        ''' Return the record stored under name n. If n is not a primary
        name, the recorded aliases are searched as well. Raises KeyError
        when the name is unknown. '''
        if n in self.sequences:
            return self.sequences[n]
        # sequenceNames is keyed by record, so aliases need a scan.
        for seq, names in self.sequenceNames.items():
            if n in names:
                return seq
        raise KeyError(n)

    def getStrictlyUngappedPositions(self, seqInds=None):
        ''' Acquire the positions of all strictly ungapped sites. If parameter
        is set, expects a list of what sequences (by index) you are checking.
        Defaults to all sequences. A site counts as ungapped when every
        checked sequence has an alphabetic character there. Returns an
        empty list when there is nothing to check; sequences of unequal
        length are only compared up to the shortest one. '''
        seqs = self.orderedSequences
        if seqInds is None:
            seqInds = range(len(seqs))
        chosen = [seqs[n].sequence for n in seqInds]
        if not chosen:
            return []
        alnlen = min(len(s) for s in chosen)
        return [pos for pos in range(alnlen)
                if all(s[pos].isalpha() for s in chosen)]

    def readFile(self, fin):
        ''' Read a file in. Return this FASTA object. '''
        # The with-block closes the handle even if reading fails.
        with open(fin) as fi:
            fast = fi.read()
        self.read(fast.split('\n'))
        return self

    def read(self, fast):
        ''' Read the contents of a FASTA file, given as an iterable of
        lines. Records that have no name or no sequence are skipped. '''
        name, parts = '', []
        for line in fast:
            lineC = line.strip()
            if lineC.startswith('>'):
                # add last collected entry
                seq = ''.join(parts)
                if name and seq:
                    self.addSequence(name, seq)
                # NOTE: strip('>') removes '>' characters from both ends
                # of the header, not only the leading marker.
                name, parts = lineC.strip('>'), []
                if self.curate:
                    # Keep only letters and digits in curated names.
                    name = ''.join([c for c in name if c.isalnum()])
            elif lineC:
                # Collect pieces and join once; repeated += is quadratic.
                parts.append(lineC)
        seq = ''.join(parts)
        if name and seq:
            self.addSequence(name, seq)

    def writeFile(self, fout):
        ''' Write the information currently contained in the
        FASTAstructure to a file as a FASTA-formatted file. '''
        with open(fout, 'w') as f:
            f.write(str(self))

    def addSequence(self, name, seq):
        ''' Add a sequence to the FASTA object. A name that is already
        present is ignored. When uniqueOnly is set and an identical
        sequence is already stored, the name is recorded as an alias of
        that sequence instead of creating a new record. '''
        if name in self.sequences:
            return
        f = FASTAsequence(name, seq)
        # sequenceNames is keyed by record (hashed on the sequence), so
        # this duplicate check does not need to scan every record.
        if self.uniqueOnly and f in self.sequenceNames:
            for known in self.orderedSequences:
                if known == f:
                    self.sequenceNames.setdefault(known, []).append(name)
                    return
        self.sequences[name] = f
        self.orderedSequences.append(f)
        # With uniqueOnly off an equal record may already own an entry;
        # extend its name list rather than overwriting earlier names.
        self.sequenceNames.setdefault(f, []).append(name)

    def renameSequence(self, oldname, newname):
        ''' Renames a given sequence. Raises IndexError if oldname is
        not the name of a stored sequence. '''
        if oldname not in self.sequences:
            raise IndexError('Could not find that name among sequences.')
        f = self.sequences.pop(oldname)
        f.name = newname
        self.sequences[newname] = f
        # Keep the alias entry a list: swap the old name for the new one.
        names = self.sequenceNames.setdefault(f, [])
        if oldname in names:
            names[names.index(oldname)] = newname
        else:
            names.append(newname)

    def removeSequence(self, name):
        ''' Remove a sequence from the FASTA object and
        return it; or return None if it was not found. '''
        if name not in self.sequences:
            return None
        f = self.sequences.pop(name)
        # Remove by identity: list.remove() compares with ==, which
        # could drop a different record holding the same residues.
        for i, known in enumerate(self.orderedSequences):
            if known is f:
                del self.orderedSequences[i]
                break
        names = self.sequenceNames.get(f)
        if names is not None:
            if any(known == f for known in self.orderedSequences):
                # Another stored record shares this entry (uniqueOnly
                # off); only forget the name that was removed.
                if name in names:
                    names.remove(name)
            else:
                del self.sequenceNames[f]
        return f

    def reorderSequences(self, iterable):
        ''' Reorder all sequences by an iterable sequence
        of their names. Raises ValueError if the number of names does
        not match the number of sequences or a name is repeated, and
        KeyError if a name is unknown. '''
        names = list(iterable)
        if len(names) != len(self.sequences):
            raise ValueError('Mismatch of length with sequence list.')
        neworder = []
        for it in names:
            if it not in self.sequences:
                raise KeyError('Could not find %s among sequence names.' % (it,))
            neworder.append(self.sequences[it])
        if len(set(names)) != len(names):
            # A repeated name would silently drop another sequence.
            raise ValueError('Duplicate names in requested sequence order.')
        self.orderedSequences = neworder

    def _reindexNames(self):
        ''' Rebuild sequenceNames after records were modified in place.
        Records hash on their sequence, so changing the sequence leaves
        the old dictionary keyed by outdated hashes. Records that became
        equal share one entry holding all of their names. '''
        rebuilt = {}
        for seq, names in self.sequenceNames.items():
            rebuilt.setdefault(seq, []).extend(names)
        self.sequenceNames = rebuilt

    def removeGaps(self):
        ''' Remove the gaps for all sequences. '''
        for seq in self.sequences.values():
            seq.removeGaps()
        self._reindexNames()

    def allUpper(self):
        ''' Change all sequences to uppercase. '''
        for seq in self.sequences.values():
            seq.toUpper()
        self._reindexNames()

    def allLower(self):
        ''' Change all sequences to lowercase. '''
        for seq in self.sequences.values():
            seq.toLower()
        self._reindexNames()

    def __iter__(self):
        ''' Iterate through the FASTA by going through its sequences,
        in the same order used by getSequences() and str(). '''
        return iter(self.orderedSequences)

    def __len__(self):
        ''' Return the number of sequences in the FASTA object. '''
        return len(self.sequences)

    def __str__(self):
        ''' Return the FASTA object as FASTA file text content. '''
        return ''.join([str(x) for x in self.orderedSequences])