Harvard:Biophysics 101/Notebook:ZS/2007-2-27

From OpenWetWare
Revision as of 15:09, 26 February 2007 by Zsun (talk | contribs) (→‎Assignment 4)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigationJump to search

Assignment 4

This program was extremely extremely painful to do... I really like Java/C++ better :D Anyways, it has the ability to:

  • Find all open reading frames in a sequence of DNA
  • Classify single and multiple mutations, deletions, and insertions
  • Determine if the mutation is in exon or intron
  • IF intron, determine that the mutation is harmful or not

THIS part is still being worked on, I can't get the protein translation table to work

There are some Achilles' heels in error-checking, however - hopefully will be implemented in further editions, but works in 95% of cases.

#Zachary Sun, Assignment 4
#2.26.07
#!/usr/bin/env python
#This program is able to:
##Find all open reading frames in a sequence of DNA
##Classify single and multiple mutations, deletions, and insertions
##Determine if the mutation is in exon or intron
##IF intron, determine that the mutation is harmful or not
###THIS part is still being worked on, I can't get the protein translation table to work

import os
import math
from Bio import Clustalw
from Bio import GenBank, Seq
from Bio.Seq import Seq,translate
from sets import Set
from string import *

#finds DNA complement (borrowed code!)
def complement(dna):            
    tab = maketrans("AGCTagct", "TCGAtcga")
    return translate(dna, tab)

cmdline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'Apoe.fasta'))
cmdline.set_output('test.aln')
align = Clustalw.do_alignment(cmdline)

refSeqObj = align.get_all_seqs()
refSeq_forward = refSeqObj[0].seq.tostring()
refSeq_backward = complement(refSeq_forward)
readingFrame = [["",""],["",""],["",""],["",""],["",""],["",""]]
##NOTE: readingFrame stores in format [["frame0"]["frame1"]["frame2"]["rev_frame0"]...
## and in each frame [0] = startsite and [1] = endsite

#finds the site of a codon in the reference sequence passed
def codonFind(index, codon, refSeq):
    for testCodon in codon:
        if index+2 >= len(refSeq): #if out of bounds
            return -1
        elif refSeq[index] + refSeq[index+1] + refSeq[index+2] == testCodon: #equals codon
            return index
    return codonFind(index+3, codon, refSeq) #recursive call

#function to return a Set object of characters in a column
def colSetFunction(i):
    col = align.get_column(i)
    s = Set() # create a new set
    for c in range(len(col)):
        s.add(col[c]) # add each column element to the set
    return s

#given that there is an abnormality at a single mutation, finds abnormality"
def findSingleMutation(i):
    colSet = colSetFunction(i)
    if '-' in colSet: #if there is a insertion or deletion
        if refSeqObj[0].seq.tostring()[i] == "-": #test reference sequence 
             for z in range(len(refSeqObj)-1):
                 if refSeqObj[z+1].seq.tostring()[i] != "-":
                     print "***Intron abnormality (major)"
                     print "\tInsertion in sequence " , z+1, "of [",refSeqObj[z+1].seq.tostring()[i],"] at position " , i
                     print "\tFRAMESHIFT MUTATION!"
        else: #there is a deletion
            for z in range(len(refSeqObj)-1):
                 if refSeqObj[z+1].seq.tostring()[i] == "-":
                     print "***Intron abnormality (major)"
                     print "\tDeletion at sequence " , z+1, "of [",refSeqObj[0].seq.tostring()[i],"] at position " , i
                     print "\tFRAMESHIFT MUTATION!"
    else: #there must be a mutation
        for z in range(len(refSeqObj)-1):
            if refSeqObj[0].seq.tostring()[i] != refSeqObj[z+1].seq.tostring()[i]:
                print "*Intron abnormality (minor)"
                print "\tMutation at sequence " , z+1, "of [",refSeqObj[0].seq.tostring()[i],"->", refSeqObj[z+1].seq.tostring()[i],"] at position " , i
                proteinTranslate(i, 1, z+1)

#This function doesn't work yet, but it should translate a set of codons to protein
def proteinTranslate(i,toTrans, seq):#second value is number of codons to translate, #3rd value is sequence with mutation
    origCode = ''
    newCode = ''
    for j in range(toTrans):
        origCode = origCode + refSeqObj[0].seq.tostring()[i] +refSeqObj[0].seq.tostring()[i+1] + refSeqObj[0].seq.tostring()[i+2]
        newCode = newCode + refSeqObj[seq].seq.tostring()[i] +refSeqObj[seq].seq.tostring()[i+1] + refSeqObj[seq].seq.tostring()[i+2]
        i = i + 3

   ### print "\t Amino Acid change(or lack thereof) [",translate(origCode,'').tostring(),"]->[",translate(newCode, '').tostring(),"]"
   ### cannot get translation table to work
        
#given that there is an abnormality at a multiple mutation, finds abnormality"
#Note: cannot handle back to back muations/ins/del in different strands
#Note: cannot handle two back to back muations in same strand
def findMultipleMutation(i):
    curIndex = i;
    colSet = colSetFunction(i)
    mutLength = 1
    while(1):
        curIndex = curIndex + 1
        if len(colSetFunction(curIndex)) == 1: #no more mutation
            break
        else:
            mutLength = mutLength+1

    if refSeqObj[0].seq.tostring()[i] == "-": #test reference sequence shows insertion
        for z in range(len(refSeqObj)-1):
            if refSeqObj[z+1].seq.tostring()[i] != "-":
                tempString = ''
                for x in range(mutLength): #make the insertion strand
                    tempString = tempString + refSeqObj[z+1].seq.tostring()[i+x]
                print "***Intron abnormality (major)"
                print "\tMultiple Insertion in sequence " , z+1, "of [" ,tempString ,"] starting at position " , i
                if mutLength%3 != 0: #if mutation not a multiple of 3
                    print "\tFRAMESHIFT MUTATION!"
                else:
                    proteinTranslate(i,mutLength%3, z+1)
    else: #there is a deletion
         for z in range(len(refSeqObj)-1):
            if refSeqObj[z+1].seq.tostring()[i] == "-":
                tempString = ''
                for x in range(mutLength): #make the insertion strand
                    tempString = tempString + refSeqObj[0].seq.tostring()[i+x]
                print "***Intron abnormality (major)"
                print "\tMultiple Deletion in sequence " , z+1, "of [" ,tempString ,"] starting at position " , i
                if mutLength%3 != 0: #if mutation not a multiple of 3
                    print "\tFRAMESHIFT MUTATION!"
                else:
                    proteinTranslate(i,mutLength%3, z+1)
    return curIndex - 1 

##Finding Start codons
readingFrame[0][0] = codonFind(0,["ATG"], refSeq_forward)
readingFrame[1][0] = codonFind(1,["ATG"], refSeq_forward)
readingFrame[2][0] = codonFind(2,["ATG"], refSeq_forward)
readingFrame[3][0] = codonFind(0,["ATG"], refSeq_backward)
readingFrame[4][0] = codonFind(1,["ATG"], refSeq_backward)
readingFrame[5][0] = codonFind(2,["ATG"], refSeq_backward)

##Finding Stop codons
for z in range(3):
    w = z
    while(1):
        if codonFind(w, ["TAA", "TAG", "TGA"], refSeq_forward) == -1: #did not find start
            readingFrame[z][1] = '-1'
            break
        elif codonFind(w, ["TAA", "TAG", "TGA"], refSeq_forward) < codonFind(z, ["ATG"], refSeq_forward): #stop before start
            w = 3 + codonFind(w, ["TAA","TAG","TGA"], refSeq_forward)
        else:
            readingFrame[z][1] = codonFind(w, ["TAA", "TAG", "TGA"], refSeq_forward)
            break
        
for z in range(3):
    w = z
    while(1):
        if codonFind(w, ["TAA", "TAG", "TGA"], refSeq_backward) == -1: #did not find start
            readingFrame[z+3][1] = '-1'
            break
        elif codonFind(w, ["TAA", "TAG", "TGA"], refSeq_backward) < codonFind(z, ["ATG"], refSeq_backward): #stop before start
            w = 3 + codonFind(w, ["TAA","TAG","TGA"], refSeq_backward)
        else:
            readingFrame[z+3][1] = codonFind(w, ["TAA", "TAG", "TGA"], refSeq_backward)
            break

##Output information for reading frames
print "***Reading Frame Information (Using Longest Length)***"
print "5->3 Frame 0:"
if readingFrame[0][0] == -1 or readingFrame[0][1] == -1: #if DNE
    print "\tNonexistant!"
else:
    print "\tStart: " , readingFrame[0][0], "\n\tEnd: ", readingFrame[0][1], "\n\tLength: " ,readingFrame[0][1]-readingFrame[0][0]
print "5->3 Frame 1:"
if readingFrame[1][0] == -1 or readingFrame[1][1] == -1: #if DNE
    print "\tNonexistant!"
else:
     print "\tStart: " , readingFrame[1][0], "\n\tEnd: ", readingFrame[1][1], "\n\tLength: ", readingFrame[1][1]-readingFrame[1][0]
print "5->3 Frame 2:"
if readingFrame[2][0] == -1 or readingFrame[2][1] == -1: #if DNE
    print "\tNonexistant!"
else:
    print "\tStart: " , readingFrame[2][0], "\n\tEnd: ", readingFrame[2][1], "\n\tLength: ", readingFrame[2][1]-readingFrame[2][0]
print "3->5 Frame 0 (reversed):"
if readingFrame[3][0] == -1 or readingFrame[3][1] == -1: #if DNE
    print "\tNonexistant!"
else:
    print "\tStart: " , readingFrame[3][0], "\n\tEnd: ", readingFrame[3][1], "\n\tLength: ", readingFrame[3][1]-readingFrame[3][0]
print "3->5 Frame 1 (reversed):"
if readingFrame[4][0] == -1 or readingFrame[4][1] == -1: #if DNE
    print "\tNonexistant!"
else:
    print "\tStart: " , readingFrame[4][0], "\n\tEnd: ", readingFrame[4][1], "\n\tLength: ", readingFrame[4][1]-readingFrame[4][0] 
print "3->5 Frame 2 (reversed):"
if readingFrame[5][0] == -1 or readingFrame[5][1] == -1: #if DNE
    print "\tNonexistant!"
else:
    print "\tStart: " , readingFrame[5][0], "\n\tEnd: ", readingFrame[5][1], "\n\tLength: ", readingFrame[5][1]-readingFrame[5][0]

#Determining longest frame and assigning to finalRefSeq - assuming 5-3 directionality for now
longest=-1;
for r in range(6):
    if readingFrame[r][0] != -1 and readingFrame[r][1] != -1:
        if readingFrame[r][1]-readingFrame[r][0] > longest:
            longest = readingFrame[r][1]-readingFrame[r][0]
            longIndex = r
startSite = readingFrame[longIndex][0]
stopSite = readingFrame[longIndex][1]
    
#Main block to determine deviations from std.
i=0
print "\n***Information about allignment***"
while(i <align.get_alignment_length()):
    colSet = colSetFunction(i)
    if len(colSet) > 1: # multiple elements in s indicate a mismatch
        if i<startSite or i>stopSite: #If it is considered an Exon
            print "*Exon abnormality (minor)\n\tPosition ", i
        elif len(colSetFunction(i+1)) == 1: #if the next position is normal
            findSingleMutation(i)
        else:
            i = findMultipleMutation(i) #returns a new value for i
    i = i + 1

The modified Apoe.fasta dataset can be found File:Apoe ZS.fasta.

The output for this modified dataset is:

***Reading Frame Information (Using Longest Length)***
5->3 Frame 0:
	Start:  60 
	End:  246 
	Length:  186
5->3 Frame 1:
	Start:  304 
	End:  1015 
	Length:  711
5->3 Frame 2:
	Start:  104 
	End:  293 
	Length:  189
3->5 Frame 0 (reversed):
	Nonexistant!
3->5 Frame 1 (reversed):
	Start:  220 
	End:  646 
	Length:  426
3->5 Frame 2 (reversed):
	Start:  725 
	End:  1010 
	Length:  285

***Information about allignment***
*Exon abnormality (minor)
	Position  152
***Intron abnormality (major)
	Multiple Deletion in sequence  1 of [ TG ] starting at position  458
	FRAMESHIFT MUTATION!
***Intron abnormality (major)
	Multiple Insertion in sequence  2 of [ TTG ] starting at position  588
*Intron abnormality (minor)
	Mutation at sequence  2 of [ G -> A ] at position  662
***Intron abnormality (major)
	Multiple Deletion in sequence  1 of [ ACGAG ] starting at position  806
	FRAMESHIFT MUTATION!