Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-27

From OpenWetWare

(Difference between revisions)
Jump to: navigation, search
(New page: Notes: I've decided to make an orf class that indexes to a list of codon classes. The mutation detection will be built into comparators for the class. It's taking me a bit to figure out e...)
Line 1: Line 1:
-
Notes:
 
-
I've decided to make an orf class that indexes to a list of codon classes.
 
-
The mutation detection will be built into comparators for the class.  It's taking me a bit to figure out exactly how these things all work...
 
-
 
-
<pre>
 
#find all orfs within the sequences
#find all orfs within the sequences
import re
import re
 +
from Bio import Transcribe
 +
transcriber = Transcribe.unambiguous_transcriber
 +
from Bio import Translate
 +
from Bio.Alphabet import IUPAC
 +
from Bio.Seq import Seq
 +
standard_translator = Translate.unambiguous_dna_by_id[1]
 +
class codon:
 +
    def  __init__(self, sequence=""):
 +
        self.sequence = sequence
 +
        self.mRNA = transcriber.transcribe(Seq(sequence, IUPAC.unambiguous_dna))
 +
        self.protein = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0]
 +
 +
class mutation:
 +
    def __init__(self,type="",start=0,stop=0):
 +
        self.type = type
 +
        self.span=(start,stop)
 +
   
class orf:
class orf:
-
     span = (0,0)
+
     #On initiation, the orf is stored as a list of codons.  If the orf has no stop, any excess bases will
-
     sequence = ""
+
     #be ignored on the conversion to codons
 +
    def __init__(self, sequence=""):
 +
        self.codons = []
 +
        for i in range(len(sequence)/3):
 +
            #print i
 +
            temp_codon = codon(sequence[i*3:3+(i*3)])  #this algorithm of seperating into codons ignores any excess bases
 +
            self.codons.append(temp_codon)
 +
            #print self.codons[i].sequence
 +
        self.sequence = sequence
 +
       
 +
    #orfs are indexed by codons
 +
    def __getitem__(self,index):
 +
        return self.codons[index]
 +
 
 +
    #comparing two orfs returns a list of mutations       
 +
    def __eq__(self,other)
 +
 
def findORFS(sequence, startpos=0):
def findORFS(sequence, startpos=0):
     all_orfs = []
     all_orfs = []
-
     start = re.compile('AUG')
+
     start = re.compile('ATG')
-
     stop = re.compile('(UAA|UGA|UAG)')
+
     stop = re.compile('(TAA|TGA|TAG)')
     all_starts = start.finditer(sequence)
     all_starts = start.finditer(sequence)
     all_stops = stop.finditer(sequence)
     all_stops = stop.finditer(sequence)
-
      
+
     print "Infunction: ",sequence
     print "starts:"
     print "starts:"
     all_starts_list = []
     all_starts_list = []
Line 40: Line 68:
             all_orfs.append((start[0],-1))
             all_orfs.append((start[0],-1))
          
          
-
     print all_orfs
+
     return all_orfs
-
         
+
 
-
teststring = "AUG GGG GGG AAU GAU UAA CGT CGT UAA AGT AUG TTT TTU GUA G"
+
#Main Program starts here         
 +
teststring = "ATG GGG GGG AAT GAT TAA CGT CGT TAA AGT ATG TTT TTT GTA G"
print teststring
print teststring
teststring = re.sub("\s+", "", teststring)
teststring = re.sub("\s+", "", teststring)
print 'chomped:',teststring
print 'chomped:',teststring
-
findORFS(teststring)
+
allspans = findORFS(teststring)
-
#AUG GGG GGG AAU GAU UAA CGT CGT UAA AGT AUG TTT TTU GUA G
+
allorfs = []
-
#Find the position of all the start codons
+
print "spans"
-
 
+
print allspans
-
 
+
for i in allspans:
-
#OK.  I quickly learned that iterators don't reset once you get to the end of them   
+
    x = i[0]  #I don't know why it won't let me use i[0] and i[1] directly as an int
-
#all_starts = first_all_starts
+
     y = i[1]
-
#all_stops = first_all_stops
+
    temp_orf = orf(teststring[x:y])
-
#Well...apparently you can't copy iterators... Great
+
    allorfs.append(temp_orf)
-
#This is wasteful, but I'm going to shove all the objects into a tuple
+
print allorfs
-
#Ideally I could instead shove all codons into an orf object, but I don't know how to do that in python...
+
for i in allorfs:
-
      
+
     print "orfseq:",i.sequence
-
 
+
     for j in i:
-
#Sequentially go through each start codon and look for stop codons in fram, otherwise store then entire sequen
+
        print j.sequence
-
 
+
        print j.protein
-
#now that I have the span for all starts, and all stops if I want, I can match up normalized distances from the first start
+
-
#and pair them up without even looking at the strings again.  But i need to make sure its in a triplet search
+
-
orf = re.compile( 'AUG[ATUCG]+?(UAA|UGA|UAG)')
+
-
te = orf.finditer(teststring)
+
-
print "dammit"
+
-
for i in te:
+
-
     print "found one"
+
-
     print i.group()
+
-
 
+
-
</pre>
+

Revision as of 01:04, 28 February 2007

#find all orfs within the sequences

import re

from Bio import Transcribe
from Bio import Translate
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq

# Shared Biopython helpers, built once and reused by every codon instance.
transcriber = Transcribe.unambiguous_transcriber
standard_translator = Translate.unambiguous_dna_by_id[1]


class codon:
    """A DNA sequence fragment together with its transcription and translation.

    Attributes:
        sequence: the raw DNA string exactly as passed in.
        mRNA: result of ``transcriber.transcribe`` on the sequence wrapped
            as an unambiguous-DNA ``Seq``.
        protein: first element of ``standard_translator.translate`` applied
            to the same ``Seq``.
    """

    def __init__(self, sequence=""):
        self.sequence = sequence
        # Wrap the string once; both conversions read from the same Seq.
        dna = Seq(sequence, IUPAC.unambiguous_dna)
        self.mRNA = transcriber.transcribe(dna)
        # NOTE(review): with the default empty sequence the [0] index
        # presumably raises IndexError -- confirm against the Biopython
        # version in use before relying on codon() with no argument.
        self.protein = standard_translator.translate(dna)[0]

class mutation:
    """Describes one mutation: a type label plus the span it covers."""

    def __init__(self, type="", start=0, stop=0):
        # The two endpoints are stored together as a single (start, stop)
        # tuple rather than as separate attributes.
        endpoints = (start, stop)
        self.span = endpoints
        self.type = type
   

class orf:
    """An open reading frame held as a list of codon objects.

    On construction the sequence is cut into consecutive three-base pieces
    and each piece is wrapped in a ``codon``.  Trailing bases that do not
    fill a whole codon (e.g. when the orf has no stop) are ignored for the
    codon list but kept in ``sequence``.

    Attributes:
        sequence: the full sequence string exactly as passed in.
        codons: list of ``codon`` objects, one per complete triplet.
    """

    def __init__(self, sequence=""):
        self.sequence = sequence
        # Floor division keeps the codon count an integer on every Python
        # version; a plain "/" yields a float on Python 3 and breaks range().
        usable = 3 * (len(sequence) // 3)
        self.codons = [codon(sequence[pos:pos + 3])
                       for pos in range(0, usable, 3)]

    # orfs are indexed by codons
    def __getitem__(self, index):
        """Return the codon (or slice of codons) at ``index``.

        The IndexError raised by the underlying list for an out-of-range
        index is also what ends a ``for`` loop over an orf, since no
        ``__iter__`` is defined.
        """
        return self.codons[index]

    # comparing two orfs is intended to return a list of mutations
    def __eq__(self, other):
        """Placeholder for the planned mutation-detecting comparison.

        TODO: the mutation detection has not been written yet.  Until it
        is, NotImplemented is returned so that ``==`` falls back to
        Python's default comparison instead of failing.
        """
        return NotImplemented


def findORFS(sequence, startpos=0):
    """Locate open reading frames in a DNA string.

    Every 'ATG' in ``sequence`` is treated as a start.  For each start,
    the stop codons (TAA, TGA, TAG) are scanned in sequence order and the
    first one that lies downstream and in the same reading frame closes
    the frame.  Starts nested inside another frame are reported too.

    Progress is printed to standard output as the search runs.

    Args:
        sequence: DNA string with no whitespace.
        startpos: accepted for compatibility; currently not used by the
            search.

    Returns:
        A list with one ``(start, end)`` tuple per start codon, in the
        order the starts occur.  ``start`` is the index of the A of ATG
        and ``end`` is the index just past the stop codon, so
        ``sequence[start:end]`` includes the stop.  ``end`` is -1 when no
        in-frame stop follows the start.
    """
    start_codon = re.compile('ATG')
    stop_codon = re.compile('(TAA|TGA|TAG)')

    # Each print takes one pre-formatted string so the emitted text is the
    # same whether print is a statement (Python 2) or a function (Python 3).
    print("Infunction:  %s" % (sequence,))

    print("starts:")
    start_spans = []
    for match in start_codon.finditer(sequence):
        start_spans.append(match.span())
        print(match.span())

    print("stops:")
    stop_spans = []
    for match in stop_codon.finditer(sequence):
        stop_spans.append(match.span())
        print(match.span())

    all_orfs = []
    for start_begin, _start_end in start_spans:
        for stop_begin, stop_end in stop_spans:
            print("checking %s and %s" % (start_begin, stop_begin))
            offset = stop_begin - start_begin
            # In frame means a positive distance that is a multiple of 3.
            if offset > 0 and offset % 3 == 0:
                print("orf at: %s   %s" % (start_begin, stop_end))
                all_orfs.append((start_begin, stop_end))
                break
        else:
            # No stop closed this frame: flag it with an end of -1.
            all_orfs.append((start_begin, -1))

    return all_orfs
#Main Program starts here

# Demo run: find the orfs in a small test sequence and print every codon
# of each orf together with its translation.
teststring = "ATG GGG GGG AAT GAT TAA CGT CGT TAA AGT ATG TTT TTT GTA G"
print(teststring)
# Strip the spacing that is only there to make the codons readable.
teststring = re.sub(r"\s+", "", teststring)
print('chomped: %s' % teststring)

allspans = findORFS(teststring)
allorfs = []
print("spans")
print(allspans)
for span_start, span_end in allspans:
    # findORFS marks "no in-frame stop" with an end of -1.  Used directly
    # as a slice end that would drop the final base, so run to the end of
    # the sequence instead; orf ignores any leftover partial codon.
    if span_end == -1:
        span_end = len(teststring)
    allorfs.append(orf(teststring[span_start:span_end]))

print(allorfs)
for current_orf in allorfs:
    print("orfseq: %s" % current_orf.sequence)
    for current_codon in current_orf:
        print(current_codon.sequence)
        print(current_codon.protein)
Personal tools