Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-20
From OpenWetWare
The script:
#!/usr/bin/env python import os from Bio import Clustalw, Seq, SeqRecord, SubsMat from Bio.Align import AlignInfo from Bio.Align.FormatConvert import FormatConverter # Do nucleotide alignment print "Preparing initial alignment data" print '-' * 40 cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta')) cline.set_output('test.aln') alignment = Clustalw.do_alignment(cline) summary = AlignInfo.SummaryInfo(alignment) # consensus = summary.dumb_consensus() # pssmatrix = summary.pos_specific_score_matrix(consensus) # Try protein alignment print "Preparing protein data" print '-' * 40 prot = open(os.path.join(os.curdir, '.apoe_prot.fasta'), 'w') for seq in alignment.get_all_seqs(): s = seq.seq.tostring() # Translate! p = Seq.translate(s).split('*') # Split a stop codon q = Seq.translate('C' + s).split('*') # Try a different frame r = Seq.translate('CC' + s).split('*') # Try another # Group them all pqr = p pqr.extend(q) pqr.extend(r) # Find start codon for each def trim(x): return x[x.find('M'):] p_trimmed = map(trim, pqr) # Find longest of them def max(x, y): if len(x) < len(y): return y else: return x p_max = reduce(max, p_trimmed) # Create file prot.write('>') prot.write(seq.description) prot.write('\n') prot.write(p_max) prot.write('\n') prot.close() prot_cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, '.apoe_prot.fasta')) prot_cline.set_output('.test_prot.aln') prot_alignment = Clustalw.do_alignment(prot_cline) os.remove(os.path.join(os.curdir, '.apoe_prot.fasta')) os.remove(os.path.join(os.curdir, '.test_prot.aln')) # Use clumsy algo to find occurrences of deletions, insertions, rearrangements seq_copy = alignment.get_all_seqs() ref_seq = seq_copy.pop(0) # pop out reference sequence (the first one) print "Comparing against", ref_seq.description print '-' * 40 counter = 0 # clumsy counter...to be used later for seq in seq_copy: counter = counter + 1 # Write a temporary file for one-on-one alignment s = seq.seq.tostring() working_file = os.path.join(os.curdir, '.apoe_working.fasta') temp = open(working_file, 'w') temp.write('>') temp.write(ref_seq.description) temp.write('\n') temp.write(ref_seq.seq.tostring()) temp.write('\n') temp.write('>') temp.write(seq.description) temp.write('\n') temp.write(s) temp.close() # Do the aligning cline_temp = Clustalw.MultipleAlignCL(working_file) cline_temp.set_output('.test_working.aln') alignment_temp = Clustalw.do_alignment(cline_temp) # Analyse it realigned_ref = alignment_temp.get_seq_by_num(0).tostring() realigned_test = alignment_temp.get_seq_by_num(1).tostring() summary_temp = AlignInfo.SummaryInfo(alignment_temp) consensus_temp = summary_temp.dumb_consensus().tostring() # Find dashes in the reference sequence: those are the insertions insertions_num = 0 insertions_find = realigned_ref.find('-', 0) insertions_count = dict(A=0, C=0, G=0, T=0) # start a count of which nucleotides are inserted most while not insertions_find == -1: insertions_num = insertions_num + 1 insertions_find = realigned_ref.find('-', insertions_find + 1) # Increment the appropriate nucleotide count by 1 by looking at what's in the test sequence insertions_count[realigned_test[insertions_find]] = insertions_count[realigned_test[insertions_find]] + 1 # Find dashes in the test sequence: those are the deletions deletions_num = 0 deletions_find = realigned_test.find('-', 0) deletions_count = dict(A=0, C=0, G=0, T=0) while not deletions_find == -1: deletions_num = deletions_num + 1 deletions_find = realigned_test.find('-', deletions_find + 1) deletions_count[realigned_ref[deletions_find]] = deletions_count[realigned_ref[deletions_find]] + 1 # Find point mutations in consensus sequence (X's) point_mutations_num = consensus_temp.count('X') print seq.description print "Number of inserted bases:", insertions_num, print str(insertions_count) print "Number of deleted bases:", deletions_num, print str(deletions_count) print "Number of point mutations:", point_mutations_num # Find any frameshifts # Replace all occurrences of '---' from both reference and test sequences processed_ref = realigned_ref.replace('---', '***').rstrip('-') processed_test = realigned_test.replace('---', '***').rstrip('-') # Find all occurrences of '--' still left ref_frameshift_count = processed_ref.count('--', 0) test_frameshift_count = processed_test.count('--', 0) # Replace these reprocessed_ref = processed_ref.replace('--', '**') # it's like reprocessed meat! reprocessed_test = processed_test.replace('--', '**') # Find all remaining occurrences of '-' still left ref_frameshift_count = ref_frameshift_count + reprocessed_ref.count('-', 0) test_frameshift_count = test_frameshift_count + reprocessed_test.count('-', 0) total_frameshifts = ref_frameshift_count + test_frameshift_count print "Possible frameshifts:", total_frameshifts # Do more aligning, but this time with proteins! ref_prot_seq = prot_alignment.get_seq_by_num(0).tostring() test_prot_seq = prot_alignment.get_seq_by_num(counter).tostring() # Write a temporary file for one-on-one alignment working_prot_file = os.path.join(os.curdir, '.apoe_working_prot.fasta') temp2 = open(working_prot_file, 'w') temp2.write('>') temp2.write(ref_seq.description) temp2.write('\n') temp2.write(ref_prot_seq) temp2.write('\n') temp2.write('>') temp2.write(seq.description) temp2.write('\n') temp2.write(test_prot_seq) temp2.close() # Do actual aligning cline_temp2 = Clustalw.MultipleAlignCL(working_prot_file) cline_temp2.set_output('.test_working_prot.aln') alignment_prot_temp = Clustalw.do_alignment(cline_temp2) print '-' * 40 os.remove(working_file) os.remove(os.path.join(os.curdir, '.test_working.aln')) os.remove(working_prot_file) os.remove(os.path.join(os.curdir, '.test_working_prot.aln')) # converter = FormatConverter(alignment) print "Xiaodi Wu, Biophysics 101, 2007-02-20\n"
File output (raw alignment, test.aln):
CLUSTAL W (1.83) multiple sequence alignment gi|178850|gb|K00396.1|HUMAPOE3 CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC lcl|HUMAPOE3_with_deletion CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC lcl|HUMAPOE3_with_mutation CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG lcl|HUMAPOE3_with_deletion AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG lcl|HUMAPOE3_with_mutation AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC lcl|HUMAPOE3_with_deletion CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC lcl|HUMAPOE3_with_mutation CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC lcl|HUMAPOE3_with_deletion GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC lcl|HUMAPOE3_with_mutation GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC lcl|HUMAPOE3_with_deletion ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC lcl|HUMAPOE3_with_mutation ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG lcl|HUMAPOE3_with_deletion AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG lcl|HUMAPOE3_with_mutation AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA lcl|HUMAPOE3_with_deletion CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA lcl|HUMAPOE3_with_mutation CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG lcl|HUMAPOE3_with_deletion GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG lcl|HUMAPOE3_with_mutation GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC lcl|HUMAPOE3_with_deletion AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC lcl|HUMAPOE3_with_mutation AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG lcl|HUMAPOE3_with_deletion GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG lcl|HUMAPOE3_with_mutation GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA lcl|HUMAPOE3_with_deletion CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA lcl|HUMAPOE3_with_mutation CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC lcl|HUMAPOE3_with_deletion AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC lcl|HUMAPOE3_with_mutation AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA lcl|HUMAPOE3_with_deletion CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA lcl|HUMAPOE3_with_mutation CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 GCGCCTGGGGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG lcl|HUMAPOE3_with_deletion GCGCCTGGGGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG lcl|HUMAPOE3_with_mutation GCGCCTGGAGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG ******** ***************************************** gi|178850|gb|K00396.1|HUMAPOE3 GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG lcl|HUMAPOE3_with_deletion GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG lcl|HUMAPOE3_with_mutation GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT lcl|HUMAPOE3_with_deletion CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT lcl|HUMAPOE3_with_mutation CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 GGACGAGGTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC lcl|HUMAPOE3_with_deletion GG-----GTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC lcl|HUMAPOE3_with_mutation GGCCGAGGTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC ** ******************************************* gi|178850|gb|K00396.1|HUMAPOE3 AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG lcl|HUMAPOE3_with_deletion AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG lcl|HUMAPOE3_with_mutation AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT lcl|HUMAPOE3_with_deletion AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT lcl|HUMAPOE3_with_mutation AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA lcl|HUMAPOE3_with_deletion GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA lcl|HUMAPOE3_with_mutation GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC lcl|HUMAPOE3_with_deletion GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC lcl|HUMAPOE3_with_mutation GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC lcl|HUMAPOE3_with_deletion CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC lcl|HUMAPOE3_with_mutation CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT lcl|HUMAPOE3_with_deletion CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT lcl|HUMAPOE3_with_mutation CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT ************************************************** gi|178850|gb|K00396.1|HUMAPOE3 TCACGC lcl|HUMAPOE3_with_deletion TCACGC lcl|HUMAPOE3_with_mutation TCACGC ******
Command line output:
Preparing initial alignment data ---------------------------------------- Preparing protein data ---------------------------------------- Comparing against gi|178850|gb|K00396.1|HUMAPOE3 ---------------------------------------- lcl|HUMAPOE3_with_deletion Number of inserted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0} Number of deleted bases: 5 {'A': 1, 'C': 2, 'T': 0, 'G': 2} Number of point mutations: 0 Possible frameshifts: 1 ---------------------------------------- lcl|HUMAPOE3_with_mutation Number of inserted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0} Number of deleted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0} Number of point mutations: 2 Possible frameshifts: 0 ---------------------------------------- Xiaodi Wu, Biophysics 101, 2007-02-20
Still to do: code a (fairly repetitive and mundane) part to examine the protein sequences that arise, and compare with the changes observed in nucleotide sequence.