Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-27
From OpenWetWare
The code...(a bit brute force in the end, but...)
#!/usr/bin/env python
import os, re
from Bio import Clustalw, Seq, SeqRecord, SubsMat
from Bio.Align import AlignInfo
from Bio.Align.FormatConvert import FormatConverter
print '-' * 40
# (1) Do nucleotide alignment
# print "Preparing initial alignment data"
# print '-' * 40
cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)
summary = AlignInfo.SummaryInfo(alignment)
# consensus = summary.dumb_consensus()
# pssmatrix = summary.pos_specific_score_matrix(consensus)
# (2) Make ourselves a copy of the reference sequence; very useful really
seq_copy = alignment.get_all_seqs()
ref_seq = seq_copy[0]
# TODO: ClustalW does something very nasty and reverses the order of inputted
# sequences on occasion: what is needed is an algorithm to compare the description
# fields of each of the sequences, finding their location in the original FASTA
# file and taking the topmost as the reference
# (3) Try orf determination on main sequence
# all others will be presumed mutants so we don't have to go through this
# analysis more than once; we of course need to presume that there are no
# frameshift mutations before the beginning of the orf that will throw these
# calculations off
print "ORF DETERMINATION"
print '-' * 40
orf_start = -1 # for now
orf_end = -1 # for now
orf_frame = 0 # for now
s = ref_seq.seq.tostring()
# Translate!
p = Seq.translate(s).split('*') # Split a stop codon
q = Seq.translate('C' + s).split('*') # Try a different frame
r = Seq.translate('CC' + s).split('*') # Try another
pc = Seq.translate(Seq.reverse_complement(s)).split('*') # And yet another
qc = Seq.translate('C' + Seq.reverse_complement(s)).split('*') # And another
rc = Seq.translate('CC' + Seq.reverse_complement(s)).split('*') # And another
# Group them all
pqr = Seq.translate(s).split('*')
pqr.extend(q)
pqr.extend(r)
pqr.extend(pc)
pqr.extend(qc)
pqr.extend(rc)
# Find start codon for each
def trim(x):
if x.find('M') >= 0: return x[x.find('M'):]
else: return ''
p_trimmed = map(trim, pqr)
# Find longest of them
def max(x, y):
if len(x) < len(y): return y
else: return x
p_max = reduce(max, p_trimmed)
# CREATE PRETTY OUTPUT (consumes extra cycles)
# and actually pinpoints the frame -- otherwise a pretty useless algo
px = reduce(max, map(trim, p))
px2 = Seq.translate(s).find(px) * 3 + 1 # Start codon position (add one b/c zero-based)
px3 = len(px) * 3 # Length
px4 = px2 + px3 # Stop codon position
if px3 == 0: # The pathetic minimal case that otherwise kills the algo
px2 = 0
px4 = 0
print "5'->3' frame 1:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (px2, px4, px3),
if px == p_max:
print "(** using this one **)",
orf_frame = 1
orf_start = px2
orf_end = px4
print "\n",
qx = reduce(max, map(trim, q))
qx2 = Seq.translate('C' + s).find(qx) * 3 # Start codon position
qx3 = len(qx) * 3 # Length
qx4 = qx2 + qx3 # Stop codon position
if qx3 == 0: # The pathetic minimal case that otherwise kills the algo
qx2 = 0
qx4 = 0
print "5'->3' frame 2:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (qx2, qx4, qx3),
if qx == p_max:
print "(** using this one **)",
orf_frame = 2
orf_start = qx2
orf_end = qx4
print "\n",
rx = reduce(max, map(trim, r))
rx2 = Seq.translate('CC' + s).find(rx) * 3 - 1 # Start codon position
rx3 = len(rx) * 3 # Length
rx4 = rx2 + rx3 # Stop codon position
if rx3 == 0: # The pathetic minimal case that otherwise kills the algo
rx2 = 0
rx4 = 0
print "5'->3' frame 3:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (rx2, rx4, rx3),
if rx == p_max:
print "(** using this one **)",
orf_frame = 3
orf_start = rx2
orf_end = rx4
print "\n",
pcx = reduce(max, map(trim, pc))
pcx2 = len(s) + 1 - (Seq.translate(Seq.reverse_complement(s)).find(pcx) * 3 + 1) # Start codon position
pcx3 = len(pcx) * 3 # Length
pcx4 = pcx2 - pcx3 # Stop codon position
if pcx3 == 0: # The pathetic minimal case that otherwise kills the algo
pcx2 = 0
pcx4 = 0
print "3'->5' frame 1:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (pcx2, pcx4, pcx3),
if pcx == p_max:
print "(** using this one **)",
orf_frame = 4
orf_start = pcx2
orf_end = pcx4
print "\n",
qcx = reduce(max, map(trim, qc))
qcx2 = len(s) + 1 - (Seq.translate('C' + Seq.reverse_complement(s)).find(qcx) * 3) # Start codon position
qcx3 = len(qcx) * 3 # Length
qcx4 = qcx2 - qcx3 # Stop codon position
if qcx3 == 0: # The pathetic minimal case that otherwise kills the algo
qcx2 = 0
qcx4 = 0
print "3'->5' frame 2:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (qcx2, qcx4, qcx3),
if qcx == p_max:
print "(** using this one **)",
orf_frame = 5
orf_start = qcx2
orf_end = qcx4
print "\n",
rcx = reduce(max, map(trim, rc))
rcx2 = len(s) + 1 - (Seq.translate('CC' + Seq.reverse_complement(s)).find(rcx) * 3 - 1) # Start codon position
rcx3 = len(rcx) * 3 # Length
rcx4 = rcx2 - rcx3 # Stop codon position
if rcx3 == 0: # The pathetic minimal case that otherwise kills the algo
rcx2 = 0
rcx4 = 0
print "3'->5' frame 3:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (rcx2, rcx4, rcx3),
if rcx == p_max:
print "(** using this one **)",
orf_frame = 6
orf_start = rcx2
orf_end = rcx4
print "\n",
print '-' * 40
# END OF USELESS OUTPUT-GENERATING SEGMENT
# We need to find a way to align all the sequences, including mutations
# our current orf_start and orf_end do not take into account insertions, etc.
# which we must examine before we can do any additional processing
s_copy = re.sub('[^\-]', '*', s) # well, the insertions we need to account for are all '-'
orf_start_normalized = s_copy.replace('*', '^', orf_start).rfind('^') # zero-based orf start
orf_end_normalized = s_copy.replace('*', '^', orf_end).rfind('^') # zero-based orf end
# Now, we need to trim everything down to this size
trimmed_seq_descriptions = []
trimmed_seqs = []
for seq in alignment.get_all_seqs():
s = seq.seq.tostring()
trimmed_seq_descriptions.append(seq.description)
if orf_start_normalized < orf_end_normalized:
start = orf_start_normalized
length = orf_end_normalized - orf_start_normalized + 3 # include stop codon
trimmed_seqs.append(s[start:start + length])
else:
temp = Seq.reverse_complement(s)
start = len(s) - orf_start_normalized - 1
length = orf_start_normalized - orf_end_normalized + 3 # include stop codon
trimmed_seqs.append(temp[start:start + length])
# (4) Work on the protein side of things (get an alignment in a hidden file)
prot = open(os.path.join(os.curdir, '.apoe_prot.fasta'), 'w')
counter = 0 # The dumb counter method of synching two lists
raw_translated_seqs = []
for seq in trimmed_seqs:
translated_seq = Seq.translate(seq.replace('-',''))
raw_translated_seqs.append(translated_seq)
# Create input file
prot.write('>')
prot.write(trimmed_seq_descriptions[counter])
prot.write('\n')
prot.write(translated_seq)
prot.write('\n')
# Increment
counter = counter + 1
prot.close()
# Figure out protein alignment
prot_cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, '.apoe_prot.fasta'))
prot_cline.set_output('.test_prot.aln')
prot_alignment = Clustalw.do_alignment(prot_cline)
os.remove(os.path.join(os.curdir, '.apoe_prot.fasta'))
os.remove(os.path.join(os.curdir, '.test_prot.aln'))
# (5) Use clumsy algo to find occurrences of deletions, insertions, rearrangements
seq_copy.pop(0) # pop out reference sequence (the first one)
ref_trimmed = trimmed_seqs.pop(0)
ref_trimmed_desc = trimmed_seq_descriptions.pop(0)
print "COMPARING AGAINST", ref_seq.description
print "[all nucleotide positions given relative to start codon]"
print '-' * 40
counter = 0 # clumsy counter again...to be used later
for seq in seq_copy:
# Write a temporary file for one-on-one alignment
working_file = os.path.join(os.curdir, '.apoe_working.fasta')
temp = open(working_file, 'w')
temp.write('>')
temp.write(ref_seq.description)
temp.write('\n')
temp.write(ref_trimmed)
temp.write('\n')
temp.write('>')
temp.write(seq.description)
temp.write('\n')
temp.write(trimmed_seqs[counter])
temp.close()
# Do the aligning
cline_temp = Clustalw.MultipleAlignCL(working_file)
cline_temp.set_output('.test_working.aln')
alignment_temp = Clustalw.do_alignment(cline_temp)
# Analyse it
realigned_ref = alignment_temp.get_seq_by_num(0).tostring()
realigned_test = alignment_temp.get_seq_by_num(1).tostring()
summary_temp = AlignInfo.SummaryInfo(alignment_temp)
consensus_temp = summary_temp.dumb_consensus().tostring()
# HEAVY DUTY OUTPUT TIME!
print seq.description
print '-' * 40
print '\n',
# Find point mutations in consensus sequence (X's)
point_mutations_num = consensus_temp.count('X')
point_mutations_find = consensus_temp.find('X',0)
point_mutations_count = dict(A=0, C=0, G=0, T=0)
while not point_mutations_find == -1:
old = realigned_ref[point_mutations_find]
new = realigned_test[point_mutations_find]
# (Take into account any insertions or deletions for frame)
index_adjusted = point_mutations_find - realigned_ref.count('-', 0, point_mutations_find)
offset = index_adjusted % 3
# (Work backwards to reconstitute the codon)
original_codon = realigned_ref[point_mutations_find]
j = 1
for i in range(0, offset):
char_temp = realigned_ref[point_mutations_find - j]
while not char_temp.isalpha():
j = j + 1
char_temp = realigned_ref[point_mutations_find - j]
if char_temp.isalpha():
original_codon = char_temp + original_codon
j = j + 1
# (Work forwards to fill out the codon)
j = 1
for i in range(0, 2 - offset):
char_temp = realigned_ref[point_mutations_find + j]
while not char_temp.isalpha():
j = j + 1
char_temp = realigned_ref[point_mutations_find + j]
if char_temp.isalpha():
original_codon = original_codon + char_temp
j = j + 1
# (Do the same for the mutated codon)
# (Work backwards to reconstitute the codon)
mutated_codon = realigned_test[point_mutations_find]
j = 1
for i in range(0, offset):
char_temp = realigned_test[point_mutations_find - j]
while not char_temp.isalpha():
j = j + 1
char_temp = realigned_test[point_mutations_find - j]
if char_temp.isalpha():
mutated_codon = char_temp + mutated_codon
j = j + 1
# (Work forwards to fill out the codon)
j = 1
for i in range(0, 2 - offset):
char_temp = realigned_test[point_mutations_find + j]
while not char_temp.isalpha():
j = j + 1
char_temp = realigned_test[point_mutations_find + j]
if char_temp.isalpha():
mutated_codon = mutated_codon + char_temp
j = j + 1
a = Seq.translate(original_codon)
b = Seq.translate(mutated_codon)
if a == b:
print "Silent point mutation found: %s%d%s." % (old, index_adjusted + 1, new),
print "Amino acid result: %s%d%s." % (a, (index_adjusted - index_adjusted % 3) / 3 + 1, b)
else:
print "Non-silent point mutation found: %s%d%s." % (old, index_adjusted + 1, new),
print "Amino acid result: %s%d%s." % (a, (index_adjusted - index_adjusted % 3) / 3 + 1, b)
# Increment
point_mutations_find = consensus_temp.find('X', point_mutations_find + 1)
print "*** Number of point mutations:", point_mutations_num, "***"
print '\n',
# Find dashes in the reference sequence: those are the insertions
insertions_num = 0
insertions_find = realigned_ref.find('-', 0)
while not insertions_find == -1:
contig_count = 1
while realigned_ref[insertions_find + contig_count] == '-':
contig_count = contig_count + 1
# (Take into account any earlier insertions)
index_adjusted = insertions_find - realigned_ref.count('-', 0, insertions_find)
offset = index_adjusted % 3
countback = insertions_find - offset - 6
translation_snippet_a = realigned_ref[countback:insertions_find + contig_count + 6]
a = Seq.translate(translation_snippet_a.replace('-',''))
translation_snippet_b = realigned_test[countback:insertions_find + contig_count + 6]
b = Seq.translate(translation_snippet_b.replace('-',''))
if contig_count % 3 == 0:
print "Non-frameshift insertion found: %d base insertion at position %d" % (contig_count, index_adjusted + 1)
print "DNA Reference: %s" % (translation_snippet_a)
print "Prot Reference: %s" % (a),
if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
else: print "(3'->5' frame %d)" % (orf_frame - 3)
print "DNA Result: %s" % (translation_snippet_b)
print "Prot Result: %s" % (b),
if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
else: print "(3'->5' frame %d)" % (orf_frame - 3)
else:
print "Frameshift insertion found: %d base insertion at position %d" % (contig_count, index_adjusted + 1)
print "DNA Reference: %s" % (translation_snippet_a)
print "Prot Reference: %s" % (a),
if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
else: print "(3'->5' frame %d)" % (orf_frame - 3)
print "DNA Result: %s" % (translation_snippet_b)
print "Prot Result: %s" % (b),
if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
else: print "(3'->5' frame %d)" % (orf_frame - 3)
insertions_find = realigned_ref.find('-', insertions_find + contig_count)
print '\n',
# Find dashes in the test sequence: those are the deletions
deletions_num = 0
deletions_find = realigned_test.find('-', 0)
while not deletions_find == -1:
contig_count = 1
while realigned_test[deletions_find + contig_count] == '-':
contig_count = contig_count + 1
# (Take into account any earlier insertions)
index_adjusted = deletions_find - realigned_ref.count('-', 0, deletions_find)
offset = index_adjusted % 3
countback = deletions_find - offset - 6
translation_snippet_a = realigned_ref[countback:deletions_find + contig_count + 6]
a = Seq.translate(translation_snippet_a.replace('-',''))
translation_snippet_b = realigned_test[countback:deletions_find + contig_count + 6]
b = Seq.translate(translation_snippet_b.replace('-',''))
if contig_count % 3 == 0:
print "Non-frameshift deletion found: %d base deletion, position %d to %d" % (contig_count, index_adjusted + 1, index_adjusted + contig_count + 1)
print "DNA Reference: %s" % (translation_snippet_a)
print "Prot Reference: %s" % (a),
if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
else: print "(3'->5' frame %d)" % (orf_frame - 3)
print "DNA Result: %s" % (translation_snippet_b)
print "Prot Result: %s" % (b),
if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
else: print "(3'->5' frame %d)" % (orf_frame - 3)
else:
print "Frameshift deletion found: %d base deletion, position %d to %d" % (contig_count, index_adjusted + 1, index_adjusted + contig_count + 1)
print "DNA Reference: %s" % (translation_snippet_a)
print "Prot Reference: %s" % (a),
if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
else: print "(3'->5' frame %d)" % (orf_frame - 3)
print "DNA Result: %s" % (translation_snippet_b)
print "Prot Result: %s" % (b),
if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
else: print "(3'->5' frame %d)" % (orf_frame - 3)
deletions_find = realigned_test.find('-', deletions_find + contig_count)
print '\n',
print '-' * 40
# Increment and clean up
counter = counter + 1
os.remove(working_file)
os.remove(os.path.join(os.curdir, '.test_working.aln'))
print "Xiaodi Wu, Biophysics 101, 2007-02-27\n"
...and the output...
---------------------------------------- ORF DETERMINATION ---------------------------------------- 5'->3' frame 1: start codon position: 61. stop codon position: 1012. transcript length: 951. (** using this one **) 5'->3' frame 2: start codon position: 105. stop codon position: 246. transcript length: 141. 5'->3' frame 3: start codon position: 566. stop codon position: 809. transcript length: 243. 3'->5' frame 1: start codon position: 94. stop codon position: 91. transcript length: 3. 3'->5' frame 2: start codon position: 929. stop codon position: 335. transcript length: 594. 3'->5' frame 3: start codon position: 0. stop codon position: 0. transcript length: 0. ---------------------------------------- COMPARING AGAINST gi|178850|gb|K00396.1|HUMAPOE3 [all nucleotide positions given relative to start codon] ---------------------------------------- lcl|HUMAPOE3_with_deletion ---------------------------------------- *** Number of point mutations: 0 *** Frameshift deletion found: 5 base deletion, position 743 to 748 DNA Reference: CGCCTGGACGAGGTGAAG Prot Reference: RLDEVK (5'->3' frame 1) DNA Result: CGCCTGG-----GTGAAG Prot Result: RLGE (5'->3' frame 1) ---------------------------------------- lcl|HUMAPOE3_with_mutation ---------------------------------------- Non-silent point mutation found: G599A. Amino acid result: G200E. Non-silent point mutation found: A743C. Amino acid result: D248A. *** Number of point mutations: 2 *** ---------------------------------------- Xiaodi Wu, Biophysics 101, 2007-02-27


