Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-2-27: Difference between revisions

From OpenWetWare
Jump to navigationJump to search
No edit summary
Line 127: Line 127:
Silent point mutation A 742 C in variable sequence two resulting in E 248 E
Silent point mutation A 742 C in variable sequence two resulting in E 248 E
</pre>
</pre>
==Notes==
* I considered the first sequence in the apoe.fasta file to be the reference sequence, but this could also be downloaded directly from GenBank.

Revision as of 21:34, 26 February 2007

Program Code

#!/usr/bin/env python

import os
from Bio import GenBank, Seq
from Bio.Seq import Seq,translate
from Bio import Clustalw
from Bio.Clustalw import MultipleAlignCL
from Bio.Align import AlignInfo

cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)

numseq=[]
orfs=[]
proteins=[]
seqs=[]
numseq=alignment.get_all_seqs()
t=0

for i in range(len(numseq)):
    se=numseq[i]
    seqs.append(se.seq.tostring())
    sq=seqs[i]
    start = sq.find('ATG')
    orf = ''
    c=start
    for x in range(len(sq)-start-4):
        orf = orf + sq[c]
        c= c +1
        length = c-start
        remainder=length%3
        if remainder == 0:
            codon=sq[c]+sq[c+1]+sq[c+2]
            if codon== 'TAA' or codon=='TAG' or codon=='TGA':
                orf=orf+sq[c+1]+sq[c+2]
                break
            
    orfs.append(orf)
    proteins.append(translate(orfs[t]))
    t=t+1

star = seqs[0].find('ATG')
print "5'->3' ORF:  Start Codon Position:", start, " Stop Codon Position:", c 
    
for i in range(len(orfs[0])):
    if orfs[0][i]==orfs[1][i]==orfs[2][i]:
        continue
    x=i//3 +1
    if not orfs[0][i]==orfs[1][i]:
        if orfs[1][i-1]=='-':
            break
        elif orfs[1][i]=='-':
            q=1
            for q in range(len(orfs[0])):
                if orfs[1][i+q]=='-':
                    continue
                else:
                    break
            if q%3==0:
                print q, 'base pair deletion in variable sequence one beginning at position', i, '\n    resulting in a non-frameshift mutation.'
            else:
                print q, 'base pair deletion in variable sequence one beginning at position', i, '\n    resulting in a frameshift mutation.'
        elif len(orfs[1])>len(orfs[0]):
            dif=len(orfs[1])-len(orfs[0])
            d=0
            for y in range(dif):
                if not orfs[1][i+y-dif+1]==orfs[0][i]:
                    continue
                elif orfs[1][i+y-dif+1]==orfs[0][i]:
                    one=''.join([orfs[1][j] for j in range(i+y-dif+1, i+y-dif+11)])
                    zero=''.join([orfs[0][j] for j in range(i+y-dif+1, i+y-dif+11)])
                    if one==zero:
                        if y%3==0:                            
                            print y+dif, 'base pair insertion in variable sequence one beginning at position', i, '\n    resulting in a non-frameshift mutation.'
                        else:
                            print y+dif, 'base pair insertion in variable sequence one beginning at position', i, '\n    resulting in a frameshift mutation.'            
                        break
        elif proteins[0][x]==proteins[1][x]:
            print 'Silent point mutation', orfs[0][i], i, orfs[2][i], 'in variable sequence one resulting in', proteins[0][x], x, proteins[2][x]
        else:
            print 'Non-silent point mutation', orfs[0][i], i, orfs[2][i], 'in varable sequence one resulting in', proteins[0][x], x, proteins[2][x]
    if not orfs[0][i]==orfs[2][i]:
        if orfs[2][i-1]=='-':
            break
        elif orfs[2][i]=='-':
            q=1
            for q in range(len(orfs[0])):
                if orfs[2][i+q]=='-':
                    continue
                else:
                    break
            if q%3==0:
                print q, 'base pair deletion in variable sequence two beginning at position', i, '\n    resulting in a non-frameshift mutation.'
            else:
                print q, 'base pair deletion in variable sequence two beginning at position', i, '\n    resulting in a frameshift mutation.'
        elif len(orfs[2])>len(orfs[0]):
            dif=len(orfs[2])-len(orfs[0])
            d=0
            for y in range(dif):
                if not orfs[2][i+y]==orfs[0][i]:
                    continue
                elif orfs[2][i+y]==orfs[0][i]:
                    two=''.join([orfs[2][j] for j in range(i+y, i+y+10)])
                    zero=''.join([orfs[0][j] for j in range(i+y, i+y+10)])
                    if two==zero:
                        if y%3==0:                            
                            print y, 'base pair insertion in variable sequence two beginning at position', i, '\n    resulting in a non-frameshift mutation.'
                        else:
                            print y, 'base pair insertion in variable sequence two beginning at position', i, '\n    resulting in a frameshift mutation.'            
        elif proteins[0][x]==proteins[2][x]:
            print 'Silent point mutation', orfs[0][i], i, orfs[2][i], 'in variable sequence two resulting in', proteins[0][x], x, proteins[2][x]
        else:
            print 'Non-silent point mutation', orfs[0][i], i, orfs[2][i], 'in variable sequence two resulting in', proteins[0][x], x, proteins[2][x]

Output of Program

5'->3' ORF:  Start Codon Position: 60  Stop Codon Position: 1011
Silent point mutation G 598 A in variable sequence two resulting in P 200 P
5 base pair deletion in variable sequence one beginning at position 742 
    resulting in a frameshift mutation.
Silent point mutation A 742 C in variable sequence two resulting in E 248 E

Notes

  • I considered the first sequence in the apoe.fasta file to be the reference sequence, but this could also be downloaded directly from GenBank.