Source code for labblouin.pfam

# pfam.py
# -------------------------
# May 14, 2012; Alex Safatli
# -------------------------
# Operations for PFAM.

import labblouin.PDBnet as PDBnet
import xml.dom.minidom
import urllib2, urllib, os, time, gzip, math
try: import texttable
except: pass

# Construct an object representing a PFAM flat file (Stockholm file format).

[docs]class pfamFile:
    
    def __init__(self, filepath):
        self.filePath = filepath
        self.clusterResults = None
        self.sequences = {}
        self.clusterSequences = {}
        self.structures = {}
        self.accessionNums = {}
        self.numLines = 0
    
[docs]    def parse(self):
        # Does the actual parsing of the file.
        fin = open(self.filePath, 'r')
        line = 'null'
        lineno = 1
        while (line != ''):
            line = self.__parseline(fin.readline())
            if lineno == 1 and line != "STOCKHOLM 1.0":
                print 'ERROR: File given is not a Stockholm file.'
                return
            lineno += 1
        self.numLines = lineno
        
    def __parseline(self, line):
        # Parses an individual line.
        extr = '' # Extracted data.
        
        if line.startswith("#"):
            # Is an ANNOTATION.
            extr = line[1:].strip()
            if extr == "STOCKHOLM 1.0": pass
            elif extr.startswith("=GS"):
                ann = extr[3:].strip().split()
                if ann[1] == 'AC':
                    # Is an accession number for a given sequence.
                    self.accessionNums[ann[0]] = ann[2]
                elif ann[1].startswith('DR'):
                    # Is structure information (extracts: PDB ID, chain).
                    self.structures[ann[0]] = (ann[3],ann[4].split(';')[0])
        else:
            # Is not an annotation -- is an ALIGNMENT or empty string.
            extr = line
            if not (len(line) == 0): # If not an empty string.
                seq = line.split()
                if len(seq) == 2: self.sequences[seq[0]] = seq[1]
                
        return extr

[docs]    def cluster(self):
        # Goes through all sequences and 
        # clusters them (i.e. sees which are
        # identical). Useful for full
        # alignments. Assumes ordered by tree.
        seq_list = {}
        curr_seq = ''
        curr_num = len(self.sequences)
        for key in self.sequences:
            if not curr_seq == self.sequences[key]:
                seq_list[key] = self.sequences[key]
                curr_seq = self.sequences[key]                
        self.sequences = seq_list
        new_num = len(seq_list)
        self.clusterResults = (curr_num, new_num)
        
# Other Functions

[docs]def extractPDBSequences(pdb_file):
    '''
    Extracts the Amino Acid sequences for a protein structure
    defined by a PDB file and returns as a dictionary.
    '''
    out_seq_list = {}
    
    pdb = PDBnet.PDBstructure(pdb_file)
    for ch in pdb.chains:
        # For every chain in the PDB file
        # get the FASTA sequence.
        seq = pdb.ChainAsFASTA(ch)
        out_seq_list[ch] = seq
        
    return out_seq_list

[docs]def extractPDBChain(pdb_file, ch, dest_file):
    '''
    Extracts a PDB file from another that has only
    information for a single chain.
    '''
    fin = open(pdb_file,'r')
    out = []
    for line in fin:
        if not line.startswith('ATOM'): continue
        sl = line.split()
        if int(sl[1]) >= 100000: k = 1
        else: k = 0
        chain = line[21+k].strip()
        if chain == ch: out.append(line)
    fin.close()
    fout = open(dest_file, 'w')
    for x in out: fout.write(x)
    fout.close()    
    return dest_file

[docs]def writeSequencesToFile(seq_list, targetfolder, fasta=True):
    '''
    Takes in a dictionary of sequences and writes them
    to seperate .seq or .fasta files.
    '''    
    out_file_list = []

    for ch, seq in seq_list.iteritems():
        if (fasta): ext = 'fasta'
        else: ext = 'sequence'
        path = os.path.join(targetfolder, ch + '.' + ext)
        fout = open(path, "w")
        if (fasta): fout.write('>%s\n' % ch)
        for y in range(0, len(seq), 80): fout.write(seq[y:y+80] + '\n')
        fout.close()
        out_file_list.append(path)
        
    return out_file_list

[docs]def writeSequenceToFasta(seq, seq_id, targetpath):
    '''
    Creates FASTA file from given sequence string.
    '''
    fout = open(targetpath, 'w')
    fout.write('>%s\n' % (seq_id))
    for y in range(0, len(seq), 80):
        fout.write(seq[y:y+80] + '\n')
    fout.close()
    
[docs]def writeSequencesToFasta(seq_list, dest_file):
    '''
    Creates single FASTA file from given sequence tuple list
    where first item is sequence ID, second item is the sequence.
    '''
    fout = open(dest_file, "w")
    for s in seq_list:
        seq = s[1]
        ch = s[0]
        fout.write('>%s\n' % ch)
        for y in range(0, len(seq), 80):
            fout.write(seq[y:y+80] + '\n')
    fout.close()

[docs]def doSequenceSearch(seq_file):
    '''
    Performs the PFAM search for a sequence file.
    '''       
    # Set up curl request.
    url = 'http://pfam.sanger.ac.uk/search/sequence'
    values = {'seq': open(seq_file).read(), 'evalue': '1.0', 'ga':'0', 'output':'xml'}
    data = urllib.urlencode(values)
    req = urllib2.Request(url,data)
    # Retrieve search results.
    response = urllib2.urlopen(req).read()
    dom = xml.dom.minidom.parseString(response)
    result_url = dom.getElementsByTagName('result_url')[0].toxml()
    # Find URL for results.
    result_url = result_url.replace('<result_url>','').replace('</result_url>','')
    # Get results.
    results = ''
    while results == '': # Ensures job is done server-side.
        result_out = urllib2.Request(result_url)
        results = urllib2.urlopen(result_out).read()
    return results

[docs]def processSequenceSearch(search_results):
    '''
    Takes in a XML result returned from a sequence
    search and parses it in order to determine what
    the most significant results are for related families.
    Returns a list of the family PFAM IDs.
    '''        
    sigresults = []
    dom = xml.dom.minidom.parseString(search_results)
    matches = dom.getElementsByTagName('location')
    for match in matches:
        if match.getAttribute('significant') == '1':
            sigresult = match.parentNode.getAttribute('id')
            sigresults.append(sigresult)
    return sigresults

[docs]def downloadFamilySequences(pfam_family_id, dest_folder, atype='seed'):
    '''
    Takes in PFAM family ID and acquires the gzipped
    flat file filled with seed or full sequence allignments.
    '''     
    dest = os.path.join(dest_folder, '%s-pfam-%s.gz' % (atype,pfam_family_id))
    if os.path.isfile(dest): return dest # If already exists.
    url = 'http://pfam.sanger.ac.uk/family/%s/alignment/%s/gzipped' \
        % (pfam_family_id, atype)
    try:
        dest, hdrs = urllib.urlretrieve(url, dest)
    except IOError, e:
        print "Can't retrieve %r to %r: %s" % (url, dest_folder, e)
        return None
    return dest

[docs]def decompressGzipFile(gzip_file_name, destfolder, file_ext='ann'):
    '''
    Takes in GZIP file from PFAM or PDB and decompresses it to an
    .ann or specified file extension.
    '''
    try:
        with gzip.open(gzip_file_name, 'rb') as f:
            file_content = f.read()
    except:
        return None
    dest_file = os.path.join(destfolder, os.path.splitext \
                             (os.path.basename(gzip_file_name))[0] + '.' + \
                             file_ext)
    fout = open(dest_file, 'w')
    fout.write(file_content)
    fout.close()
    return dest_file

[docs]def readPfamFile(file_name):
    '''
    Takes in PFAM Stockholm file and reads it, returning
    a pfamFile object.
    '''         
    obj_out = pfamFile(file_name)
    obj_out.parse()
    return obj_out

[docs]def grabNCBIAccessionMetadata(accession_id):
    '''
    Takes in an accession ID and returns NCBI
    metadata from GenBank.
    '''        
    url = 'http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?' + \
        'tool=portal&sendto=on&log$=seqview&db=protein&dopt=genpept' + \
        '&val=%s&extrafeat=984&maxplex=1' % (accession_id)
    return urllib.urlopen(url).read()

[docs]def grabPDBFile(pdb_code, dest_folder):
    '''
    Takes in a PDB code and acquires the PDB file
    from the PDB database, placing it in the specified
    destination folder.
    '''           
    dest = os.path.join(dest_folder, pdb_code + '.pdb')    
    if os.path.isfile(dest): return dest    
    url = 'http://www.rcsb.org/pdb/download/downloadFile.do?' + \
        'fileFormat=pdb&compression=NO&structureId=%s' \
        % (pdb_code)
    try:
        dest, hdrs = urllib.urlretrieve(url, dest)
    except IOError, e:
        print "Can't retrieve PDB for %s. Check network connection." % (pdb_code)
        return None        
    return dest

[docs]def printListToFile(list_in, dest):
    '''
    Prints a list to a file.
    '''           
    fout = open(dest, 'w')
    for item in list_in:
        fout.write(str(item) + '\n')
    fout.close()

[docs]def list2txttable(list_in, title):
    '''
    Converts a list to a string table.
    '''       
    txttable = texttable.Texttable()
    header = [title]
    txttable.header(header)
    txttable.set_deco(txttable.BORDER | txttable.HEADER | txttable.VLINES)
    txttable.set_chars(['-','|','+','-'])
    for item in list_in:
        txttable.add_row([item])
    return txttable.draw()

[docs]def removeListDuplicates(list_in): 
    '''
    Removes all duplicates in a list.
    '''     
    seen = {}
    list_out = []
    for item in list_in:
        if item in seen: continue
        seen[item] = 1
        list_out.append(item)
    return list_out
Navigation

Source code for labblouin.pfam

Quick search

Navigation