Source code for labblouin.FASTAnet

''' A simple parser and writer for FASTA sequence files that uses a rich object hierarchy. '''

# Date:   May 15 2013
# Author: Alex Safatli
# Email:  safatli@cs.dal.ca 

[docs]class FASTAsequence:

    def __init__(self,name,seq):

        ''' Initialize this object. Provide a name for the sequence
        and the sequence itself as parameters. '''

        self.name         = name
        self.sequence     = seq
        self.__fastaseq__ = '\n'.join([x for x in self])

    def __iter__(self):

        ''' Iterate through a sequence in a pseudo-line-by-line 
        manner as if it was read in a FASTA file. '''

        s = self.sequence
        for i in xrange(0,len(s),70): yield s[i:i+70]

    def __hash__(self):

        return self.sequence.__hash__()

    def __eq__(self,o):

        return self.sequence == o.sequence

    def __ne__(self,o):

        return not self.__eq__(o)

[docs]    def count(self,item):

        ''' Return the number of characters in the sequence equal to input string. '''

        return self.sequence.count(item)

[docs]    def removeGaps(self):

        ''' Modify the sequence so gaps are removed and return it. '''

        self.sequence = self.sequence.replace('-','').replace('.','')
        return self.sequence

[docs]    def toUpper(self):

        ''' Modify the sequence so it is uppercase and return it. '''

        self.sequence = self.sequence.upper()
        return self.sequence    

[docs]    def toLower(self):

        ''' Return a lowercase version of the sequence, also changing it
        in the structure. '''

        self.sequence = self.sequence.lower()
        return self.sequence            

    def __len__(self):

        return len(self.sequence)

    def __str__(self):

        return '>%s\n%s\n' % (self.name,self.__fastaseq__)

[docs]class FASTAstructure:

    def __init__(self,filein='',uniqueOnly=True,curate=False):

        ''' A file to be read is optional. If uniqueOnly is set to
        false, multiple duplicate sequences are allowed; otherwise,
        duplicates are ignored and their aliases are recorded in sequence
        Names. If curate is triggered, will remove special characters
        from names. '''

        self.sequences        = {}
        self.orderedSequences = []
        self.sequenceNames    = {}
        self.uniqueOnly       = uniqueOnly
        self.curate           = curate

        if filein:
            # First, try to read in a path. Otherwise,
            # read in the string comprising the content
            # of a file.
            try: self.readFile(filein)
            except: self.read(filein.split('\n'))

[docs]    def getSequenceNames(self):    return self.sequenceNames.values()    
[docs]    def getSequences(self):        return self.orderedSequences
[docs]    def getSequenceByName(self,n): return self.sequenceNames[n]

[docs]    def getStrictlyUngappedPositions(self,seqInds=None):

        ''' Acquire the positions of all strictly ungapped sites. If parameter
        is set, expects a list of what sequences (by index) you are checking. 
        Defaults to all sequences. '''

        if seqInds == None:
            seqInds = range(len(self.sequences))
        seqs = self.orderedSequences
        mask = {}
        posv = []
        for n in seqInds:
            seq = seqs[n].sequence
            mask[n] = []
            for i in xrange(len(seq)):
                char = seq[i]
                if char.isalpha(): mask[n].append(True)
                else: mask[n].append(False)
        alnlen = len(mask.values()[0])
        for pos in xrange(alnlen):
            homologous = True
            for key in mask:
                if not mask[key][pos]:
                    homologous = False
                    break
            if homologous: posv.append(pos)       
        return posv

[docs]    def readFile(self,fin):

        ''' Read a file in. Return this FASTA object. '''

        fi = open(fin)
        fast = fi.read()
        fi.close()
        self.read(fast.split('\n'))
        return self

[docs]    def read(self,fast):

        ''' Read the contents of a FASTA file. '''

        name, seq = '', ''
        for line in fast:
            lineC = line.strip()
            if lineC.startswith('>'):
                # add last collected entry
                if name and seq: self.addSequence(name,seq)
                name, seq = (lineC.strip('>')), ''
                if self.curate:
                    tmp = name
                    for c in name:
                        if not c.isalnum():
                            tmp = tmp.replace(c,"",1)
                    name = tmp         
            else: seq += lineC
        if name and seq: self.addSequence(name,seq)    

[docs]    def writeFile(self,fout):

        ''' Write the information currently contained in the
        FASTAstructure to a file as a FASTA-formatted file. '''

        f = open(fout,'w')
        f.write(str(self))
        f.close()

[docs]    def addSequence(self,name,seq):

        ''' Add a sequence to the FASTA object. '''

        # Ensure not already in list with
        # a different or same name.
        f = FASTAsequence(name,seq)
        seqs = self.sequences.values()
        if name not in self.sequences:
            if (f not in seqs or not self.uniqueOnly):
                self.sequences[name] = f
                self.orderedSequences.append(f)
                self.sequenceNames[f] = [name]
            else:
                # Get that instance of FASTAsequence.
                f = [x for x in seqs if f == x][0]
                self.sequenceNames[f].append(name)

[docs]    def renameSequence(self,oldname,newname):

        ''' Renames a given sequence. '''

        if oldname in self.sequences:
            f = self.sequences[oldname]
            f.name = newname
            del self.sequences[oldname]
            self.sequences[newname] = f
            self.sequenceNames[f] = newname
        else: raise IndexError('Could not find that name among sequences.')

[docs]    def removeSequence(self,name):

        ''' Remove a sequence from the FASTA object and
        return it; or return None if it was not found. '''

        if name in self.sequences:
            f = self.sequences[name]
            self.orderedSequences.remove(f)
            del self.sequences[name]
            del self.sequenceNames[f]
            return f
        else: return None

[docs]    def reorderSequences(self,iterable):

        ''' Reorder all sequences by an iterable sequence
        of their names. '''

        if len(iterable) != len(self.sequences):
            raise ValueError('Mismatch of length with sequence list.')
        neworder = []
        for it in iterable:
            if it in self.sequences: neworder.append(self.sequences[it])
            else:
                raise KeyError('Could not find %s among sequence names.' % (it))
        self.orderedSequences = neworder

[docs]    def removeGaps(self):

        ''' Remove the gaps for all sequences. '''

        s = self.sequences
        for seq in s: s[seq].removeGaps()

[docs]    def allUpper(self):

        ''' Change all sequences to uppercase. '''

        s = self.sequences
        for seq in s: s[seq].toUpper()

[docs]    def allLower(self):

        ''' Change all sequences to lowercase. '''

        s = self.sequences
        for seq in s: s[seq].toLower()

    def __iter__(self):

        ''' Iterate through the FASTA by going through
        its sequences. '''

        for seq in self.sequences: yield self.sequences[seq]

    def __len__(self):

        ''' Return the number of sequences in the FASTA object. '''

        return len(self.sequences)

    def __str__(self):

        ''' Return the FASTA object as FASTA file text content. '''

        return ''.join([str(x) for x in self.orderedSequences])
Navigation

Source code for labblouin.FASTAnet

Quick search

Navigation