Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-20

From OpenWetWare
Revision as of 23:20, 19 February 2007 by Wuxiaodi (talk | contribs) (to do)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigationJump to search

The script:

#!/usr/bin/env python

import os
from Bio import Clustalw, Seq, SeqRecord, SubsMat
from Bio.Align import AlignInfo
from Bio.Align.FormatConvert import FormatConverter

# Do nucleotide alignment

print "Preparing initial alignment data"
print '-' * 40
cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)

summary = AlignInfo.SummaryInfo(alignment)
# consensus = summary.dumb_consensus()
# pssmatrix = summary.pos_specific_score_matrix(consensus)

# Try protein alignment

print "Preparing protein data"
print '-' * 40
prot = open(os.path.join(os.curdir, '.apoe_prot.fasta'), 'w')
for seq in alignment.get_all_seqs():
	s = seq.seq.tostring()
	
	# Translate!
	p = Seq.translate(s).split('*') # Split a stop codon
	q = Seq.translate('C' + s).split('*') # Try a different frame
	r = Seq.translate('CC' + s).split('*') # Try another
	
	# Group them all
	pqr = p
	pqr.extend(q)
	pqr.extend(r)
	
	# Find start codon for each
	def trim(x): return x[x.find('M'):]
	p_trimmed = map(trim, pqr)
	
	# Find longest of them
	def max(x, y):
		if len(x) < len(y): return y
		else: return x
	p_max = reduce(max, p_trimmed)
	
	# Create file
	prot.write('>')
	prot.write(seq.description)
	prot.write('\n')
	prot.write(p_max)
	prot.write('\n')
prot.close()
prot_cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, '.apoe_prot.fasta'))
prot_cline.set_output('.test_prot.aln')
prot_alignment = Clustalw.do_alignment(prot_cline)
os.remove(os.path.join(os.curdir, '.apoe_prot.fasta'))
os.remove(os.path.join(os.curdir, '.test_prot.aln'))

# Use clumsy algo to find occurrences of deletions, insertions, rearrangements

seq_copy = alignment.get_all_seqs()
ref_seq = seq_copy.pop(0) # pop out reference sequence (the first one)
print "Comparing against", ref_seq.description
print '-' * 40
counter = 0 # clumsy counter...to be used later
for seq in seq_copy:
	counter = counter + 1
	# Write a temporary file for one-on-one alignment
	s = seq.seq.tostring()
	working_file = os.path.join(os.curdir, '.apoe_working.fasta')
	temp = open(working_file, 'w')
	temp.write('>')
	temp.write(ref_seq.description)
	temp.write('\n')
	temp.write(ref_seq.seq.tostring())
	temp.write('\n')
	temp.write('>')
	temp.write(seq.description)
	temp.write('\n')
	temp.write(s)
	temp.close()
	
	# Do the aligning
	cline_temp = Clustalw.MultipleAlignCL(working_file)
	cline_temp.set_output('.test_working.aln')
	alignment_temp = Clustalw.do_alignment(cline_temp)	
	
	# Analyse it
	realigned_ref = alignment_temp.get_seq_by_num(0).tostring()
	realigned_test = alignment_temp.get_seq_by_num(1).tostring()
	summary_temp = AlignInfo.SummaryInfo(alignment_temp)
	consensus_temp = summary_temp.dumb_consensus().tostring()
	
	# Find dashes in the reference sequence: those are the insertions
	insertions_num = 0
	insertions_find = realigned_ref.find('-', 0)
	insertions_count = dict(A=0, C=0, G=0, T=0) # start a count of which nucleotides are inserted most
	while not insertions_find == -1:
		insertions_num = insertions_num + 1
		insertions_find = realigned_ref.find('-', insertions_find + 1)
		# Increment the appropriate nucleotide count by 1 by looking at what's in the test sequence
		insertions_count[realigned_test[insertions_find]] = insertions_count[realigned_test[insertions_find]] + 1
	
	# Find dashes in the test sequence: those are the deletions
	deletions_num = 0
	deletions_find = realigned_test.find('-', 0)
	deletions_count = dict(A=0, C=0, G=0, T=0)
	while not deletions_find == -1:
		deletions_num = deletions_num + 1
		deletions_find = realigned_test.find('-', deletions_find + 1)
		deletions_count[realigned_ref[deletions_find]] = deletions_count[realigned_ref[deletions_find]] + 1
	
	# Find point mutations in consensus sequence (X's)
	point_mutations_num = consensus_temp.count('X')
	print seq.description
	print "Number of inserted bases:", insertions_num,
	print str(insertions_count)
	print "Number of deleted bases:", deletions_num,
	print str(deletions_count)
	print "Number of point mutations:", point_mutations_num
	# Find any frameshifts
	# Replace all occurrences of '---' from both reference and test sequences
	processed_ref = realigned_ref.replace('---', '***').rstrip('-')
	processed_test = realigned_test.replace('---', '***').rstrip('-')
	# Find all occurrences of '--' still left
	ref_frameshift_count = processed_ref.count('--', 0)
	test_frameshift_count = processed_test.count('--', 0)
	# Replace these
	reprocessed_ref = processed_ref.replace('--', '**') # it's like reprocessed meat!
	reprocessed_test = processed_test.replace('--', '**')
	# Find all remaining occurrences of '-' still left
	ref_frameshift_count = ref_frameshift_count + reprocessed_ref.count('-', 0)
	test_frameshift_count = test_frameshift_count + reprocessed_test.count('-', 0)
	total_frameshifts = ref_frameshift_count + test_frameshift_count
	print "Possible frameshifts:", total_frameshifts
	
	# Do more aligning, but this time with proteins!
	ref_prot_seq = prot_alignment.get_seq_by_num(0).tostring()
	test_prot_seq = prot_alignment.get_seq_by_num(counter).tostring()
	
	# Write a temporary file for one-on-one alignment
	working_prot_file = os.path.join(os.curdir, '.apoe_working_prot.fasta')
	temp2 = open(working_prot_file, 'w')
	temp2.write('>')
	temp2.write(ref_seq.description)
	temp2.write('\n')
	temp2.write(ref_prot_seq)
	temp2.write('\n')
	temp2.write('>')
	temp2.write(seq.description)
	temp2.write('\n')
	temp2.write(test_prot_seq)
	temp2.close()
	
	# Do actual aligning
	cline_temp2 = Clustalw.MultipleAlignCL(working_prot_file)
	cline_temp2.set_output('.test_working_prot.aln')
	alignment_prot_temp = Clustalw.do_alignment(cline_temp2)	
	
	print '-' * 40
	
	os.remove(working_file)
	os.remove(os.path.join(os.curdir, '.test_working.aln'))
	os.remove(working_prot_file)
	os.remove(os.path.join(os.curdir, '.test_working_prot.aln'))

# converter = FormatConverter(alignment)
print "Xiaodi Wu, Biophysics 101, 2007-02-20\n"

File output (raw alignment, test.aln):

CLUSTAL W (1.83) multiple sequence alignment


gi|178850|gb|K00396.1|HUMAPOE3      CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC
lcl|HUMAPOE3_with_deletion          CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC
lcl|HUMAPOE3_with_mutation          CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG
lcl|HUMAPOE3_with_deletion          AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG
lcl|HUMAPOE3_with_mutation          AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC
lcl|HUMAPOE3_with_deletion          CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC
lcl|HUMAPOE3_with_mutation          CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC
lcl|HUMAPOE3_with_deletion          GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC
lcl|HUMAPOE3_with_mutation          GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC
lcl|HUMAPOE3_with_deletion          ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC
lcl|HUMAPOE3_with_mutation          ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG
lcl|HUMAPOE3_with_deletion          AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG
lcl|HUMAPOE3_with_mutation          AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA
lcl|HUMAPOE3_with_deletion          CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA
lcl|HUMAPOE3_with_mutation          CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG
lcl|HUMAPOE3_with_deletion          GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG
lcl|HUMAPOE3_with_mutation          GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC
lcl|HUMAPOE3_with_deletion          AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC
lcl|HUMAPOE3_with_mutation          AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG
lcl|HUMAPOE3_with_deletion          GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG
lcl|HUMAPOE3_with_mutation          GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA
lcl|HUMAPOE3_with_deletion          CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA
lcl|HUMAPOE3_with_mutation          CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC
lcl|HUMAPOE3_with_deletion          AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC
lcl|HUMAPOE3_with_mutation          AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA
lcl|HUMAPOE3_with_deletion          CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA
lcl|HUMAPOE3_with_mutation          CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      GCGCCTGGGGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG
lcl|HUMAPOE3_with_deletion          GCGCCTGGGGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG
lcl|HUMAPOE3_with_mutation          GCGCCTGGAGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG
                                    ******** *****************************************

gi|178850|gb|K00396.1|HUMAPOE3      GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG
lcl|HUMAPOE3_with_deletion          GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG
lcl|HUMAPOE3_with_mutation          GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT
lcl|HUMAPOE3_with_deletion          CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT
lcl|HUMAPOE3_with_mutation          CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      GGACGAGGTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC
lcl|HUMAPOE3_with_deletion          GG-----GTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC
lcl|HUMAPOE3_with_mutation          GGCCGAGGTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC
                                    **     *******************************************

gi|178850|gb|K00396.1|HUMAPOE3      AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG
lcl|HUMAPOE3_with_deletion          AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG
lcl|HUMAPOE3_with_mutation          AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT
lcl|HUMAPOE3_with_deletion          AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT
lcl|HUMAPOE3_with_mutation          AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA
lcl|HUMAPOE3_with_deletion          GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA
lcl|HUMAPOE3_with_mutation          GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC
lcl|HUMAPOE3_with_deletion          GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC
lcl|HUMAPOE3_with_mutation          GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC
lcl|HUMAPOE3_with_deletion          CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC
lcl|HUMAPOE3_with_mutation          CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT
lcl|HUMAPOE3_with_deletion          CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT
lcl|HUMAPOE3_with_mutation          CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT
                                    **************************************************

gi|178850|gb|K00396.1|HUMAPOE3      TCACGC
lcl|HUMAPOE3_with_deletion          TCACGC
lcl|HUMAPOE3_with_mutation          TCACGC
                                    ******

Command line output:

Preparing initial alignment data
----------------------------------------
Preparing protein data
----------------------------------------
Comparing against gi|178850|gb|K00396.1|HUMAPOE3
----------------------------------------
lcl|HUMAPOE3_with_deletion
Number of inserted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0}
Number of deleted bases: 5 {'A': 1, 'C': 2, 'T': 0, 'G': 2}
Number of point mutations: 0
Possible frameshifts: 1
----------------------------------------
lcl|HUMAPOE3_with_mutation
Number of inserted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0}
Number of deleted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0}
Number of point mutations: 2
Possible frameshifts: 0
----------------------------------------
Xiaodi Wu, Biophysics 101, 2007-02-20

Still to do: code a (fairly repetitive and mundane) part to examine the protein sequences that arise, and compare with the changes observed in nucleotide sequence.