Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-20: Difference between revisions
From OpenWetWare
Jump to navigation | Jump to search
uploading homework assignment |
(No difference)
|
Revision as of 06:11, 20 February 2007
#!/usr/bin/env python
import os
from Bio import Clustalw, Seq, SeqRecord, SubsMat
from Bio.Align import AlignInfo
from Bio.Align.FormatConvert import FormatConverter
# Do nucleotide alignment
print "Preparing initial alignment data"
print '-' * 40
cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)
summary = AlignInfo.SummaryInfo(alignment)
# consensus = summary.dumb_consensus()
# pssmatrix = summary.pos_specific_score_matrix(consensus)
# Try protein alignment
print "Preparing protein data"
print '-' * 40
prot = open(os.path.join(os.curdir, '.apoe_prot.fasta'), 'w')
for seq in alignment.get_all_seqs():
s = seq.seq.tostring()
# Translate!
p = Seq.translate(s).split('*') # Split a stop codon
q = Seq.translate('C' + s).split('*') # Try a different frame
r = Seq.translate('CC' + s).split('*') # Try another
# Group them all
pqr = p
pqr.extend(q)
pqr.extend(r)
# Find start codon for each
def trim(x): return x[x.find('M'):]
p_trimmed = map(trim, pqr)
# Find longest of them
def max(x, y):
if len(x) < len(y): return y
else: return x
p_max = reduce(max, p_trimmed)
# Create file
prot.write('>')
prot.write(seq.description)
prot.write('\n')
prot.write(p_max)
prot.write('\n')
prot.close()
prot_cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, '.apoe_prot.fasta'))
prot_cline.set_output('.test_prot.aln')
prot_alignment = Clustalw.do_alignment(prot_cline)
os.remove(os.path.join(os.curdir, '.apoe_prot.fasta'))
os.remove(os.path.join(os.curdir, '.test_prot.aln'))
# Use clumsy algo to find occurrences of deletions, insertions, rearrangements
seq_copy = alignment.get_all_seqs()
ref_seq = seq_copy.pop(0) # pop out reference sequence (the first one)
print "Comparing against", ref_seq.description
print '-' * 40
counter = 0 # clumsy counter...to be used later
for seq in seq_copy:
counter = counter + 1
# Write a temporary file for one-on-one alignment
s = seq.seq.tostring()
working_file = os.path.join(os.curdir, '.apoe_working.fasta')
temp = open(working_file, 'w')
temp.write('>')
temp.write(ref_seq.description)
temp.write('\n')
temp.write(ref_seq.seq.tostring())
temp.write('\n')
temp.write('>')
temp.write(seq.description)
temp.write('\n')
temp.write(s)
temp.close()
# Do the aligning
cline_temp = Clustalw.MultipleAlignCL(working_file)
cline_temp.set_output('.test_working.aln')
alignment_temp = Clustalw.do_alignment(cline_temp)
# Analyse it
realigned_ref = alignment_temp.get_seq_by_num(0).tostring()
realigned_test = alignment_temp.get_seq_by_num(1).tostring()
summary_temp = AlignInfo.SummaryInfo(alignment_temp)
consensus_temp = summary_temp.dumb_consensus().tostring()
# Find dashes in the reference sequence: those are the insertions
insertions_num = 0
insertions_find = realigned_ref.find('-', 0)
insertions_count = dict(A=0, C=0, G=0, T=0) # start a count of which nucleotides are inserted most
while not insertions_find == -1:
insertions_num = insertions_num + 1
insertions_find = realigned_ref.find('-', insertions_find + 1)
# Increment the appropriate nucleotide count by 1 by looking at what's in the test sequence
insertions_count[realigned_test[insertions_find]] = insertions_count[realigned_test[insertions_find]] + 1
# Find dashes in the test sequence: those are the deletions
deletions_num = 0
deletions_find = realigned_test.find('-', 0)
deletions_count = dict(A=0, C=0, G=0, T=0)
while not deletions_find == -1:
deletions_num = deletions_num + 1
deletions_find = realigned_test.find('-', deletions_find + 1)
deletions_count[realigned_ref[deletions_find]] = deletions_count[realigned_ref[deletions_find]] + 1
# Find point mutations in consensus sequence (X's)
point_mutations_num = consensus_temp.count('X')
print seq.description
print "Number of inserted bases:", insertions_num,
print str(insertions_count)
print "Number of deleted bases:", deletions_num,
print str(deletions_count)
print "Number of point mutations:", point_mutations_num
# Find any frameshifts
# Replace all occurrences of '---' from both reference and test sequences
processed_ref = realigned_ref.replace('---', '***').rstrip('-')
processed_test = realigned_test.replace('---', '***').rstrip('-')
# Find all occurrences of '--' still left
ref_frameshift_count = processed_ref.count('--', 0)
test_frameshift_count = processed_test.count('--', 0)
# Replace these
reprocessed_ref = processed_ref.replace('--', '**') # it's like reprocessed meat!
reprocessed_test = processed_test.replace('--', '**')
# Find all remaining occurrences of '-' still left
ref_frameshift_count = ref_frameshift_count + reprocessed_ref.count('-', 0)
test_frameshift_count = test_frameshift_count + reprocessed_test.count('-', 0)
total_frameshifts = ref_frameshift_count + test_frameshift_count
print "Possible frameshifts:", total_frameshifts
# Do more aligning, but this time with proteins!
ref_prot_seq = prot_alignment.get_seq_by_num(0).tostring()
test_prot_seq = prot_alignment.get_seq_by_num(counter).tostring()
# Write a temporary file for one-on-one alignment
working_prot_file = os.path.join(os.curdir, '.apoe_working_prot.fasta')
temp2 = open(working_prot_file, 'w')
temp2.write('>')
temp2.write(ref_seq.description)
temp2.write('\n')
temp2.write(ref_prot_seq)
temp2.write('\n')
temp2.write('>')
temp2.write(seq.description)
temp2.write('\n')
temp2.write(test_prot_seq)
temp2.close()
# Do actual aligning
cline_temp2 = Clustalw.MultipleAlignCL(working_prot_file)
cline_temp2.set_output('.test_working_prot.aln')
alignment_prot_temp = Clustalw.do_alignment(cline_temp2)
print '-' * 40
os.remove(working_file)
os.remove(os.path.join(os.curdir, '.test_working.aln'))
os.remove(working_prot_file)
os.remove(os.path.join(os.curdir, '.test_working_prot.aln'))
# converter = FormatConverter(alignment)
print "Xiaodi Wu, Biophysics 101, 2007-02-20\n"