Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-20: Difference between revisions
From OpenWetWare
Jump to navigation | Jump to search
(uploading homework assignment) |
(No difference)
|
Revision as of 23:11, 19 February 2007
#!/usr/bin/env python import os from Bio import Clustalw, Seq, SeqRecord, SubsMat from Bio.Align import AlignInfo from Bio.Align.FormatConvert import FormatConverter # Do nucleotide alignment print "Preparing initial alignment data" print '-' * 40 cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta')) cline.set_output('test.aln') alignment = Clustalw.do_alignment(cline) summary = AlignInfo.SummaryInfo(alignment) # consensus = summary.dumb_consensus() # pssmatrix = summary.pos_specific_score_matrix(consensus) # Try protein alignment print "Preparing protein data" print '-' * 40 prot = open(os.path.join(os.curdir, '.apoe_prot.fasta'), 'w') for seq in alignment.get_all_seqs(): s = seq.seq.tostring() # Translate! p = Seq.translate(s).split('*') # Split a stop codon q = Seq.translate('C' + s).split('*') # Try a different frame r = Seq.translate('CC' + s).split('*') # Try another # Group them all pqr = p pqr.extend(q) pqr.extend(r) # Find start codon for each def trim(x): return x[x.find('M'):] p_trimmed = map(trim, pqr) # Find longest of them def max(x, y): if len(x) < len(y): return y else: return x p_max = reduce(max, p_trimmed) # Create file prot.write('>') prot.write(seq.description) prot.write('\n') prot.write(p_max) prot.write('\n') prot.close() prot_cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, '.apoe_prot.fasta')) prot_cline.set_output('.test_prot.aln') prot_alignment = Clustalw.do_alignment(prot_cline) os.remove(os.path.join(os.curdir, '.apoe_prot.fasta')) os.remove(os.path.join(os.curdir, '.test_prot.aln')) # Use clumsy algo to find occurrences of deletions, insertions, rearrangements seq_copy = alignment.get_all_seqs() ref_seq = seq_copy.pop(0) # pop out reference sequence (the first one) print "Comparing against", ref_seq.description print '-' * 40 counter = 0 # clumsy counter...to be used later for seq in seq_copy: counter = counter + 1 # Write a temporary file for one-on-one alignment s = seq.seq.tostring() working_file = 
os.path.join(os.curdir, '.apoe_working.fasta') temp = open(working_file, 'w') temp.write('>') temp.write(ref_seq.description) temp.write('\n') temp.write(ref_seq.seq.tostring()) temp.write('\n') temp.write('>') temp.write(seq.description) temp.write('\n') temp.write(s) temp.close() # Do the aligning cline_temp = Clustalw.MultipleAlignCL(working_file) cline_temp.set_output('.test_working.aln') alignment_temp = Clustalw.do_alignment(cline_temp) # Analyse it realigned_ref = alignment_temp.get_seq_by_num(0).tostring() realigned_test = alignment_temp.get_seq_by_num(1).tostring() summary_temp = AlignInfo.SummaryInfo(alignment_temp) consensus_temp = summary_temp.dumb_consensus().tostring() # Find dashes in the reference sequence: those are the insertions insertions_num = 0 insertions_find = realigned_ref.find('-', 0) insertions_count = dict(A=0, C=0, G=0, T=0) # start a count of which nucleotides are inserted most while not insertions_find == -1: insertions_num = insertions_num + 1 insertions_find = realigned_ref.find('-', insertions_find + 1) # Increment the appropriate nucleotide count by 1 by looking at what's in the test sequence insertions_count[realigned_test[insertions_find]] = insertions_count[realigned_test[insertions_find]] + 1 # Find dashes in the test sequence: those are the deletions deletions_num = 0 deletions_find = realigned_test.find('-', 0) deletions_count = dict(A=0, C=0, G=0, T=0) while not deletions_find == -1: deletions_num = deletions_num + 1 deletions_find = realigned_test.find('-', deletions_find + 1) deletions_count[realigned_ref[deletions_find]] = deletions_count[realigned_ref[deletions_find]] + 1 # Find point mutations in consensus sequence (X's) point_mutations_num = consensus_temp.count('X') print seq.description print "Number of inserted bases:", insertions_num, print str(insertions_count) print "Number of deleted bases:", deletions_num, print str(deletions_count) print "Number of point mutations:", point_mutations_num # Find any frameshifts 
# Replace all occurrences of '---' from both reference and test sequences processed_ref = realigned_ref.replace('---', '***').rstrip('-') processed_test = realigned_test.replace('---', '***').rstrip('-') # Find all occurrences of '--' still left ref_frameshift_count = processed_ref.count('--', 0) test_frameshift_count = processed_test.count('--', 0) # Replace these reprocessed_ref = processed_ref.replace('--', '**') # it's like reprocessed meat! reprocessed_test = processed_test.replace('--', '**') # Find all remaining occurrences of '-' still left ref_frameshift_count = ref_frameshift_count + reprocessed_ref.count('-', 0) test_frameshift_count = test_frameshift_count + reprocessed_test.count('-', 0) total_frameshifts = ref_frameshift_count + test_frameshift_count print "Possible frameshifts:", total_frameshifts # Do more aligning, but this time with proteins! ref_prot_seq = prot_alignment.get_seq_by_num(0).tostring() test_prot_seq = prot_alignment.get_seq_by_num(counter).tostring() # Write a temporary file for one-on-one alignment working_prot_file = os.path.join(os.curdir, '.apoe_working_prot.fasta') temp2 = open(working_prot_file, 'w') temp2.write('>') temp2.write(ref_seq.description) temp2.write('\n') temp2.write(ref_prot_seq) temp2.write('\n') temp2.write('>') temp2.write(seq.description) temp2.write('\n') temp2.write(test_prot_seq) temp2.close() # Do actual aligning cline_temp2 = Clustalw.MultipleAlignCL(working_prot_file) cline_temp2.set_output('.test_working_prot.aln') alignment_prot_temp = Clustalw.do_alignment(cline_temp2) print '-' * 40 os.remove(working_file) os.remove(os.path.join(os.curdir, '.test_working.aln')) os.remove(working_prot_file) os.remove(os.path.join(os.curdir, '.test_working_prot.aln')) # converter = FormatConverter(alignment) print "Xiaodi Wu, Biophysics 101, 2007-02-20\n"