#!/usr/bin/python
# Begin my_bio_utils.py
"""Module for performing various basic biology operations.

Original version by Jesse Bloom, 2004.
Expanded by D. Allan Drummond, 2004-2007."""
#
import re, os, sys, string, math, random
#-----------------------------------------------------------------------------------
class BioUtilsError(Exception):
    """Exception raised by the bio utility functions in this module.

    Used, for example, when a FASTA file cannot be found or when a codon
    is not present in the genetic code table."""
    pass
#-----------------------------------------------------------------------------------
def Read_FASTA(infile_name):
	"""Reads the sequences and headers from a FASTA file.

	'infile_name' is the path to a FASTA file; a leading '~' is expanded.
	Lines beginning with '#' are treated as comments and skipped, and blank
	lines are ignored.  Each line beginning with '>' starts a new record;
	the remaining lines of the record are stripped of trailing whitespace,
	upper-cased, and concatenated into one sequence.

	Returns the tuple (headers, sequences) where headers[i] is the header
	(with the leading '>' removed) corresponding to sequences[i].  A header
	that is followed by no sequence lines yields an empty string, and a file
	containing no records yields ([], []).

	Raises BioUtilsError if the file cannot be found, or if sequence data
	appears before the first header."""
	infile_name = os.path.expanduser(infile_name)
	if not os.path.isfile(infile_name):
		raise BioUtilsError("Cannot find the FASTA file %s." % infile_name)
	headers = []
	sequences = []
	seq = []  # fragments of the record currently being read
	# 'with' guarantees the file is closed even if parsing raises.
	with open(infile_name, 'r') as infile:
		for line in infile:
			if line.startswith('#'):
				# Skip comment
				continue
			if line.startswith('>'):  # a new header
				if headers:
					# Close the previous record even when it had no
					# sequence lines, so headers and sequences stay in step.
					sequences.append(''.join(seq))
					seq = []
				headers.append(line[1:].rstrip())
			else:
				frag = line.rstrip().upper()
				if not frag:
					continue  # blank line carries no sequence data
				if not headers:
					raise BioUtilsError("Sequence data found before the first header in %s." % infile_name)
				seq.append(frag)
	if headers:
		# Close the final record.
		sequences.append(''.join(seq))
	return (headers, sequences)
#--------------------------------------------------------------------------------
def firstField(x):
	"""Returns the first whitespace-delimited field of the string 'x'.

	Raises IndexError if 'x' is empty or contains only whitespace."""
	fields = x.split()
	return fields[0]

def get_ensembl_peptide_id(header):
	"""Returns the first whitespace-delimited token after the first 'pep:' in 'header'.

	Raises IndexError if 'header' does not contain 'pep:' or if no token
	follows it."""
	after_tag = header.split("pep:")[1]
	tokens = after_tag.split()
	return tokens[0]

def get_ensembl_gene_id(header):
	"""Returns the first whitespace-delimited token after the first 'gene:' in 'header'.

	Raises IndexError if 'header' does not contain 'gene:' or if no token
	follows it."""
	after_tag = header.split("gene:")[1]
	tokens = after_tag.split()
	return tokens[0]

def Read_FASTA_Dict(infile_name, fxn = firstField):
	"""Reads a FASTA file into a dictionary keyed by a function of each header.

	'infile_name' is the path to a FASTA file; a leading '~' is expanded.
	'fxn' is called on each header (with the leading '>' and trailing
	whitespace removed) and its return value is used as the dictionary key.
	Lines beginning with '#' are treated as comments and skipped, and blank
	lines are ignored.  Sequence lines are stripped of trailing whitespace,
	upper-cased, and concatenated.

	Returns the tuple (fdict, key_list).  'fdict' maps each key to its
	sequence; 'key_list' holds the keys in file order, one entry per record.
	If two records produce the same key, the later sequence replaces the
	earlier one in 'fdict' while 'key_list' keeps both entries.  A header
	with no sequence lines maps to an empty string.  Sequence data that
	precedes the first header is stored under the key ''.  A file with no
	records yields ({}, []).

	Raises BioUtilsError if the file cannot be found."""
	infile_name = os.path.expanduser(infile_name)
	if not os.path.isfile(infile_name):
		raise BioUtilsError("Cannot find the FASTA file %s." % infile_name)
	fdict = {}
	key_list = []
	seq = []  # fragments of the record currently being read
	currHeader = ''
	# True once a header or any sequence text has been seen, i.e. once
	# there is a record that must be stored.
	have_record = False
	# 'with' guarantees the file is closed even if parsing or fxn raises.
	with open(infile_name, 'r') as f:
		for line in f:
			if line.startswith('#'):
				# Skip comment
				continue
			if line.startswith('>'):  # a new header
				if have_record:
					# Store the previous record even when it had no
					# sequence lines, so no header is silently dropped.
					fdict[currHeader] = ''.join(seq)
					key_list.append(currHeader)
					seq = []
				currHeader = fxn(line[1:].rstrip())
				have_record = True
			else:
				frag = line.rstrip().upper()
				if frag:
					seq.append(frag)
					have_record = True
	if have_record:
		# Store the final record, joining once instead of on every line.
		fdict[currHeader] = ''.join(seq)
		key_list.append(currHeader)
	return (fdict, key_list)
#--------------------------------------------------------------------------------
def Translate(seq):
	"""Translates a gene sequence to a protein sequence.

	'seq' is the gene sequence to be translated. It can begin with any codon
	(does not have to be ATG), but it must be of the proper length
	(a multiple of 3).  It can contain a trailing stop codon, which is
	dropped from the result, but if it contains stop codons before the end
	of the sequence it is not translated.  Codons are looked up with
	Codon_to_AA, so they should be upper case.

	If the translation is successful, returns a string corresponding to the
	protein sequence; an empty 'seq' gives an empty string.  If the
	translation fails (length not a multiple of three, unrecognized codon
	anywhere in the sequence, or premature stop codon), returns None."""
	if len(seq) % 3 != 0:
		return None # gene length not a multiple of three
	# Floor division keeps this an integer on both Python 2 and Python 3.
	prot_length = len(seq) // 3
	prot = []
	for i in range(prot_length):
		codon = seq[3 * i : 3 * (i + 1)]
		try:
			aa = Codon_to_AA(codon)
		except BioUtilsError: # unrecognized codon, including the last one
			return None
		if aa == '*':
			if i < prot_length - 1: # premature stop codon
				return None
			break # trailing stop codon is not part of the protein
		prot.append(aa)
	protseq = ''.join(prot)
	assert len(protseq) in [prot_length, prot_length - 1]
	return protseq

def TranslateRaw(seq, bad_aa = 'X'):
	"""Translates a nucleotide sequence codon by codon, never giving up.

	'seq' is the nucleotide sequence, read in frame from its first
	character; it can begin with any codon (does not have to be ATG).
	Trailing nucleotides that do not complete a codon are ignored, so a
	sequence shorter than 3 nucleotides gives an empty string.
	'bad_aa' is the character emitted for any codon that Codon_to_AA does
	not recognize.

	Returns the protein string.  Stop codons are not treated specially:
	they appear in the output as whatever Codon_to_AA returns for them."""
	num_codons = len(seq) // 3
	residues = []
	for start in range(0, 3 * num_codons, 3):
		triplet = seq[start : start + 3]
		try:
			residue = Codon_to_AA(triplet)
		except BioUtilsError: # unrecognized codon
			residue = bad_aa
		residues.append(residue)
	return ''.join(residues)

def ReverseTranslate(prot, bad_codon ='xxx'):
	"""Converts a protein sequence into one DNA sequence that encodes it.

	'prot' is a sequence of one-letter amino-acid codes as used in the
	genetic code table.
	'bad_codon' is emitted for any character of 'prot' that has no codon
	in the table; it should be three characters long, otherwise the final
	length check fails whenever it is used.

	Returns the gene as a string of length 3 * len(prot).  Only codons
	spelled without 'U' are used.  When several codons encode the same
	amino acid, the one visited last while iterating the table is chosen."""
	rev_code = dict((aa, codon) for (codon, aa) in _genetic_code.items() if 'U' not in codon)
	# Join once rather than growing a string one codon at a time.
	gene = ''.join([rev_code.get(aa, bad_codon) for aa in prot])
	assert(len(gene)==3*len(prot))
	return gene

# Test the reverse translator
def __test_reverseTranslate():
	"""Round-trip check for ReverseTranslate.

	Builds 1000 random proteins of 100 residues each from the twenty
	standard amino-acid letters and asserts that translating the
	reverse-translated gene gives back the original protein."""
	alphabet = 'ACDEFGHIKLMNPQRSTVWY'
	num_trials = 1000
	prot_len = 100
	for trial in range(num_trials):
		residues = [random.choice(alphabet) for pos in range(prot_len)]
		prot = ''.join(residues)
		round_trip = Translate(ReverseTranslate(prot))
		assert(prot == round_trip)

#---------------------------------------------------------------------------
# The universal genetic code
#
# Maps each three-letter, upper-case codon to a one-letter amino-acid code.
# '*' marks a stop codon.  The first half of the table spells codons with
# 'T' (DNA) and the second half repeats them with 'U' (RNA).  Codons that
# contain neither 'T' nor 'U' therefore appear twice in the literal; the
# repeated keys carry the same value, so the duplication is harmless.
# The final entry maps '---' to '-' (presumably an alignment gap -- confirm
# against callers).
_genetic_code = {
		# Codons spelled with T
		'TTT':'F', 'TTC':'F', 'TTA':'L', 'TTG':'L', 'CTT':'L', 'CTC':'L',
		'CTA':'L', 'CTG':'L', 'ATT':'I', 'ATC':'I', 'ATA':'I', 'ATG':'M', 'GTT':'V',
		'GTC':'V', 'GTA':'V', 'GTG':'V', 'TCT':'S', 'TCC':'S', 'TCA':'S',
		'TCG':'S', 'CCT':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P', 'ACT':'T',
		'ACC':'T', 'ACA':'T', 'ACG':'T', 'GCT':'A', 'GCC':'A', 'GCA':'A',
		'GCG':'A', 'TAT':'Y', 'TAC':'Y', 'TAA':'*', 'TAG':'*',
		'CAT':'H', 'CAC':'H', 'CAA':'Q', 'CAG':'Q', 'AAT':'N', 'AAC':'N',
		'AAA':'K', 'AAG':'K', 'GAT':'D', 'GAC':'D', 'GAA':'E', 'GAG':'E',
		'TGT':'C', 'TGC':'C', 'TGA':'*', 'TGG':'W', 'CGT':'R',
		'CGC':'R', 'CGA':'R', 'CGG':'R', 'AGT':'S', 'AGC':'S', 'AGA':'R',
		'AGG':'R', 'GGT':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G',
		# The same codons spelled with U
		'UUU':'F', 'UUC':'F', 'UUA':'L', 'UUG':'L', 'CUU':'L', 'CUC':'L',
		'CUA':'L', 'CUG':'L', 'AUU':'I', 'AUC':'I', 'AUA':'I', 'AUG':'M', 'GUU':'V',
		'GUC':'V', 'GUA':'V', 'GUG':'V', 'UCU':'S', 'UCC':'S', 'UCA':'S',
		'UCG':'S', 'CCU':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P', 'ACU':'T',
		'ACC':'T', 'ACA':'T', 'ACG':'T', 'GCU':'A', 'GCC':'A', 'GCA':'A',
		'GCG':'A', 'UAU':'Y', 'UAC':'Y', 'UAA':'*', 'UAG':'*',
		'CAU':'H', 'CAC':'H', 'CAA':'Q', 'CAG':'Q', 'AAU':'N', 'AAC':'N',
		'AAA':'K', 'AAG':'K', 'GAU':'D', 'GAC':'D', 'GAA':'E', 'GAG':'E',
		'UGU':'C', 'UGC':'C', 'UGA':'*', 'UGG':'W', 'CGU':'R',
		'CGC':'R', 'CGA':'R', 'CGG':'R', 'AGU':'S', 'AGC':'S', 'AGA':'R',
		'AGG':'R', 'GGU':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G',
		# Three-dash triplet
		'---':'-'}

#---------------------------------------------------------------------------
def Codon_to_AA(codon):
	"""Returns one-letter amino acid for codon.

	Argument is three-letter string 'codon', spelled with either 'T' or 'U'.
	'codon' should be upper case; lower-case codons are not in the table.
	Returns '*' if 'codon' is a stop codon and '-' if it is '---'.
	Raises BioUtilsError if 'codon' is not in the genetic code table."""
	try:
		return _genetic_code[codon]
	except KeyError:
		# Call syntax for raise is valid on both Python 2 and Python 3.
		raise BioUtilsError("Invalid codon of %s." % codon)
#---------------------------------------------------------------------------------
def sequenceIdentity(alignedSeq1, alignedSeq2):
	"""Computes the identity between two aligned sequences.

	The sequences are compared position by position over the length of
	'alignedSeq1'.  Positions where either sequence has a gap ('-') are
	not counted.

	Returns the tuple (seqID, numIdentical, numAligned): the number of
	positions where neither sequence has a gap, how many of those hold
	the same character, and their ratio as a float (0.0 when no positions
	are aligned).

	Raises IndexError if 'alignedSeq2' is shorter than 'alignedSeq1'."""
	gap = '-'
	matches = 0
	compared = 0
	for pos, res1 in enumerate(alignedSeq1):
		res2 = alignedSeq2[pos]
		if res1 == gap or res2 == gap:
			continue # gapped column does not count as aligned
		compared += 1
		if res1 == res2:
			matches += 1
	if compared == 0:
		return 0.0, matches, compared
	return float(matches)/compared, matches, compared
#---------------------------------------------------------------------------------
def complement(a):
	"""Returns the complementary base for the nucleotide 'a'.

	'a' is upper-cased first, so the result is always upper case.
	A pairs with T, T and U pair with A, and C and G pair with each other.
	Any other input is returned upper-cased but otherwise unchanged."""
	base = a.upper()
	pairs = {"A": "T", "T": "A", "U": "A", "C": "G", "G": "C"}
	return pairs.get(base, base)
#---------------------------------------------------------------------------------
def reverse_complement(seq):
	"""Returns the reverse complement of 'seq'.

	Alternative spelling of reverseComplement(); it delegates to that
	function and returns its result unchanged."""
	return reverseComplement(seq)

def reverseComplement(seq):
	"""Returns the reverse complement of the nucleotide sequence 'seq'.

	The elements of 'seq' are taken in reverse order and each is passed
	through complement(); the results are joined into a single string."""
	bases = list(seq)
	return ''.join(complement(base) for base in reversed(bases))

def get_codons_for_aa(aa, rna=True):
	"""Returns a list of the codons in the genetic code table that encode 'aa'.

	'aa' is a one-letter amino-acid code as used in the table.
	If 'rna' is true, codons containing 'T' are left out; otherwise codons
	containing 'U' are left out.  Codons containing neither letter are
	returned in both cases.  The list is empty if no codon matches."""
	excluded_base = 'U'
	if rna:
		excluded_base = 'T'
	matches = []
	for codon, residue in _genetic_code.items():
		if residue == aa and excluded_base not in codon:
			matches.append(codon)
	return matches


# End my_bio_utils.py
