Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-20: Difference between revisions
From OpenWetWare
Jump to navigation | Jump to search
(uploading homework assignment) |
(No difference)
|
Revision as of 23:11, 19 February 2007
#!/usr/bin/env python import os from Bio import Clustalw, Seq, SeqRecord, SubsMat from Bio.Align import AlignInfo from Bio.Align.FormatConvert import FormatConverter # Do nucleotide alignment print "Preparing initial alignment data" print '-' * 40 cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta')) cline.set_output('test.aln') alignment = Clustalw.do_alignment(cline) summary = AlignInfo.SummaryInfo(alignment) # consensus = summary.dumb_consensus() # pssmatrix = summary.pos_specific_score_matrix(consensus) # Try protein alignment print "Preparing protein data" print '-' * 40 prot = open(os.path.join(os.curdir, '.apoe_prot.fasta'), 'w') for seq in alignment.get_all_seqs(): s = seq.seq.tostring() # Translate! p = Seq.translate(s).split('*') # Split a stop codon q = Seq.translate('C' + s).split('*') # Try a different frame r = Seq.translate('CC' + s).split('*') # Try another # Group them all pqr = p pqr.extend(q) pqr.extend(r) # Find start codon for each def trim(x): return x[x.find('M'):] p_trimmed = map(trim, pqr) # Find longest of them def max(x, y): if len(x) < len(y): return y else: return x p_max = reduce(max, p_trimmed) # Create file prot.write('>') prot.write(seq.description) prot.write('\n') prot.write(p_max) prot.write('\n') prot.close() prot_cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, '.apoe_prot.fasta')) prot_cline.set_output('.test_prot.aln') prot_alignment = Clustalw.do_alignment(prot_cline) os.remove(os.path.join(os.curdir, '.apoe_prot.fasta')) os.remove(os.path.join(os.curdir, '.test_prot.aln')) # Use clumsy algo to find occurrences of deletions, insertions, rearrangements seq_copy = alignment.get_all_seqs() ref_seq = seq_copy.pop(0) # pop out reference sequence (the first one) print "Comparing against", ref_seq.description print '-' * 40 counter = 0 # clumsy counter...to be used later for seq in seq_copy: counter = counter + 1 # Write a temporary file for one-on-one alignment s = seq.seq.tostring() working_file = 
os.path.join(os.curdir, '.apoe_working.fasta') temp = open(working_file, 'w') temp.write('>') temp.write(ref_seq.description) temp.write('\n') temp.write(ref_seq.seq.tostring()) temp.write('\n') temp.write('>') temp.write(seq.description) temp.write('\n') temp.write(s) temp.close() # Do the aligning cline_temp = Clustalw.MultipleAlignCL(working_file) cline_temp.set_output('.test_working.aln') alignment_temp = Clustalw.do_alignment(cline_temp) # Analyse it realigned_ref = alignment_temp.get_seq_by_num(0).tostring() realigned_test = alignment_temp.get_seq_by_num(1).tostring() summary_temp = AlignInfo.SummaryInfo(alignment_temp) consensus_temp = summary_temp.dumb_consensus().tostring() # Find dashes in the reference sequence: those are the insertions insertions_num = 0 insertions_find = realigned_ref.find('-', 0) insertions_count = dict(A=0, C=0, G=0, T=0) # start a count of which nucleotides are inserted most while not insertions_find == -1: insertions_num = insertions_num + 1 insertions_find = realigned_ref.find('-', insertions_find + 1) # Increment the appropriate nucleotide count by 1 by looking at what's in the test sequence insertions_count[realigned_test[insertions_find]] = insertions_count[realigned_test[insertions_find]] + 1 # Find dashes in the test sequence: those are the deletions deletions_num = 0 deletions_find = realigned_test.find('-', 0) deletions_count = dict(A=0, C=0, G=0, T=0) while not deletions_find == -1: deletions_num = deletions_num + 1 deletions_find = realigned_test.find('-', deletions_find + 1) deletions_count[realigned_ref[deletions_find]] = deletions_count[realigned_ref[deletions_find]] + 1 # Find point mutations in consensus sequence (X's) point_mutations_num = consensus_temp.count('X') print seq.description print "Number of inserted bases:", insertions_num, print str(insertions_count) print "Number of deleted bases:", deletions_num, print str(deletions_count) print "Number of point mutations:", point_mutations_num # Find any frameshifts 
# Replace all occurrences of '---' from both reference and test sequences processed_ref = realigned_ref.replace('---', '***').rstrip('-') processed_test = realigned_test.replace('---', '***').rstrip('-') # Find all occurrences of '--' still left ref_frameshift_count = processed_ref.count('--', 0) test_frameshift_count = processed_test.count('--', 0) # Replace these reprocessed_ref = processed_ref.replace('--', '**') # it's like reprocessed meat! reprocessed_test = processed_test.replace('--', '**') # Find all remaining occurrences of '-' still left ref_frameshift_count = ref_frameshift_count + reprocessed_ref.count('-', 0) test_frameshift_count = test_frameshift_count + reprocessed_test.count('-', 0) total_frameshifts = ref_frameshift_count + test_frameshift_count print "Possible frameshifts:", total_frameshifts # Do more aligning, but this time with proteins! ref_prot_seq = prot_alignment.get_seq_by_num(0).tostring() test_prot_seq = prot_alignment.get_seq_by_num(counter).tostring() # Write a temporary file for one-on-one alignment working_prot_file = os.path.join(os.curdir, '.apoe_working_prot.fasta') temp2 = open(working_prot_file, 'w') temp2.write('>') temp2.write(ref_seq.description) temp2.write('\n') temp2.write(ref_prot_seq) temp2.write('\n') temp2.write('>') temp2.write(seq.description) temp2.write('\n') temp2.write(test_prot_seq) temp2.close() # Do actual aligning cline_temp2 = Clustalw.MultipleAlignCL(working_prot_file) cline_temp2.set_output('.test_working_prot.aln') alignment_prot_temp = Clustalw.do_alignment(cline_temp2) print '-' * 40 os.remove(working_file) os.remove(os.path.join(os.curdir, '.test_working.aln')) os.remove(working_prot_file) os.remove(os.path.join(os.curdir, '.test_working_prot.aln')) # converter = FormatConverter(alignment) print "Xiaodi Wu, Biophysics 101, 2007-02-20\n"