Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-27: Difference between revisions

From OpenWetWare
Jump to navigationJump to search
(New page: Notes: I've decided to make an orf class that indexes to a list of codon classes. The mutation detection will be built into comparators for the class. It's taking me a bit to figure out e...)
 
No edit summary
Line 1: Line 1:
Notes:
I've decided to make an orf class that indexes to a list of codon classes.
The mutation detection will be built into comparators for the class.  It's taking me a bit to figure out exactly how these things all work...
<pre>
#find all orfs within the sequences
#find all orfs within the sequences
import re
import re
from Bio import Transcribe
transcriber = Transcribe.unambiguous_transcriber
from Bio import Translate
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
standard_translator = Translate.unambiguous_dna_by_id[1]
class codon:
    def  __init__(self, sequence=""):
        self.sequence = sequence
        self.mRNA = transcriber.transcribe(Seq(sequence, IUPAC.unambiguous_dna))
        self.protein = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0]
class mutation:
    def __init__(self,type="",start=0,stop=0):
        self.type = type
        self.span=(start,stop)
   
class orf:
class orf:
     span = (0,0)
     #On initiation, the orf is stored as a list of codons.  If the orf has no stop, any excess bases will
     sequence = ""
    #be ignored on the conversion to codons
     def __init__(self, sequence=""):
        self.codons = []
        for i in range(len(sequence)/3):
            #print i
            temp_codon = codon(sequence[i*3:3+(i*3)])  #this algorithm of seperating into codons ignores any excess bases
            self.codons.append(temp_codon)
            #print self.codons[i].sequence
        self.sequence = sequence
       
    #orfs are indexed by codons
    def __getitem__(self,index):
        return self.codons[index]
 
    #comparing two orfs returns a list of mutations       
    def __eq__(self,other)
 


def findORFS(sequence, startpos=0):
def findORFS(sequence, startpos=0):
     all_orfs = []
     all_orfs = []
     start = re.compile('AUG')
     start = re.compile('ATG')
     stop = re.compile('(UAA|UGA|UAG)')
     stop = re.compile('(TAA|TGA|TAG)')
     all_starts = start.finditer(sequence)
     all_starts = start.finditer(sequence)
     all_stops = stop.finditer(sequence)
     all_stops = stop.finditer(sequence)
      
     print "Infunction: ",sequence
     print "starts:"
     print "starts:"
     all_starts_list = []
     all_starts_list = []
Line 40: Line 68:
             all_orfs.append((start[0],-1))
             all_orfs.append((start[0],-1))
          
          
     print all_orfs
     return all_orfs
         
 
teststring = "AUG GGG GGG AAU GAU UAA CGT CGT UAA AGT AUG TTT TTU GUA G"
#Main Program starts here         
teststring = "ATG GGG GGG AAT GAT TAA CGT CGT TAA AGT ATG TTT TTT GTA G"
print teststring
print teststring
teststring = re.sub("\s+", "", teststring)
teststring = re.sub("\s+", "", teststring)
print 'chomped:',teststring
print 'chomped:',teststring
findORFS(teststring)
allspans = findORFS(teststring)
#AUG GGG GGG AAU GAU UAA CGT CGT UAA AGT AUG TTT TTU GUA G
allorfs = []
#Find the position of all the start codons
print "spans"
 
print allspans
 
for i in allspans:
#OK.  I quickly learned that iterators don't reset once you get to the end of them   
    x = i[0]  #I don't know why it won't let me use i[0] and i[1] directly as an int
#all_starts = first_all_starts
     y = i[1]
#all_stops = first_all_stops
    temp_orf = orf(teststring[x:y])
#Well...apparently you can't copy iterators... Great
    allorfs.append(temp_orf)
#This is wasteful, but I'm going to shove all the objects into a tuple
print allorfs
#Ideally I could instead shove all codons into an orf object, but I don't know how to do that in python...
for i in allorfs:
      
     print "orfseq:",i.sequence
 
     for j in i:
#Sequentially go through each start codon and look for stop codons in frame, otherwise store the entire sequence
        print j.sequence
 
        print j.protein
#now that I have the span for all starts, and all stops if I want, I can match up normalized distances from the first start
#and pair them up without even looking at the strings again.  But I need to make sure it's in a triplet search
orf = re.compile( 'AUG[ATUCG]+?(UAA|UGA|UAG)')
te = orf.finditer(teststring)
print "dammit"
for i in te:
     print "found one"
     print i.group()
 
</pre>

Revision as of 22:04, 27 February 2007

# find all orfs within the sequences

import re

from Bio import Transcribe
from Bio import Translate
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq

# Module-level Biopython helpers shared by every codon instance.
transcriber = Transcribe.unambiguous_transcriber
# Translator registered under id 1 (the author names it "standard").
standard_translator = Translate.unambiguous_dna_by_id[1]


class codon:
    """A DNA codon together with its transcribed and translated forms.

    Attributes:
        sequence: the DNA string exactly as passed in.
        mRNA: result of transcribing the DNA with ``transcriber``.
        protein: first element of the translation produced by
            ``standard_translator``, or "" when the translation is empty.
    """

    def __init__(self, sequence=""):
        """Build the codon from a DNA string (expected to be one triplet)."""
        dna = Seq(sequence, IUPAC.unambiguous_dna)
        self.sequence = sequence
        self.mRNA = transcriber.transcribe(dna)
        translated = standard_translator.translate(dna)
        # Indexing an empty translation (e.g. the default sequence="")
        # would raise IndexError, so fall back to an empty string.
        self.protein = translated[0] if len(translated) > 0 else ""

class mutation:
    """Record describing one mutation: a type label plus the span it covers.

    Attributes:
        type: caller-supplied label for the mutation.
        span: ``(start, stop)`` tuple built from the constructor arguments.
    """

    def __init__(self, type="", start=0, stop=0):
        """Store the label and pack the two positions into a tuple."""
        self.span = (start, stop)
        self.type = type
   

class orf:
    """An open reading frame stored as an ordered list of codon objects.

    On initiation the sequence is cut into consecutive triplets, each wrapped
    in a ``codon``.  If the orf has no stop, any excess bases (a trailing
    one or two) are ignored on the conversion to codons.

    Attributes:
        codons: list of ``codon`` objects, one per complete triplet.
        sequence: the full DNA string as passed in (excess bases included).
    """

    def __init__(self, sequence=""):
        """Split ``sequence`` into codons, dropping an incomplete tail."""
        # Floor division keeps the triplet count an integer on both
        # Python 2 and Python 3 (plain "/" yields a float on Python 3).
        usable_length = 3 * (len(sequence) // 3)
        self.codons = [
            codon(sequence[offset:offset + 3])
            for offset in range(0, usable_length, 3)
        ]
        self.sequence = sequence

    def __getitem__(self, index):
        """Return the codon at ``index``; orfs are indexed by codons."""
        return self.codons[index]

    def __eq__(self, other):
        """Placeholder for the planned mutation-detecting comparison.

        The original stub had no body (and no colon), which made the whole
        file unparsable.  The intended behavior -- comparing two orfs to
        produce a list of mutations -- has not been written yet, so this
        returns NotImplemented and Python falls back to its default
        comparison.
        """
        # TODO: implement mutation detection between self and other.
        return NotImplemented


def findORFS(sequence, startpos=0):
    """Locate open reading frames in a DNA string.

    Every ``ATG`` found at or after ``startpos`` is paired with the first
    downstream stop codon (``TAA``, ``TGA`` or ``TAG``) whose distance from
    the start is a positive multiple of three.  Progress is printed as the
    search runs.

    Args:
        sequence: DNA string to scan (no whitespace expected).
        startpos: index at which scanning begins; positions reported are
            still relative to the beginning of ``sequence``.

    Returns:
        A list of ``(start, end)`` tuples, one per start codon, in order of
        appearance.  ``end`` is the index just past the stop codon, or -1
        when no in-frame stop codon was found.
    """
    start_pattern = re.compile('ATG')
    stop_pattern = re.compile('(TAA|TGA|TAG)')

    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("Infunction:  %s" % (sequence,))

    print("starts:")
    all_starts_list = []
    for match in start_pattern.finditer(sequence, startpos):
        all_starts_list.append(match.span())
        print(match.span())

    print("stops:")
    all_stops_list = []
    # Stops before startpos can never follow a start found after it, so the
    # same lower bound is safe here.
    for match in stop_pattern.finditer(sequence, startpos):
        all_stops_list.append(match.span())
        print(match.span())

    all_orfs = []
    for start_span in all_starts_list:
        for stop_span in all_stops_list:
            print("checking %s and %s" % (start_span[0], stop_span[0]))
            diff = stop_span[0] - start_span[0]
            # In frame means downstream and a whole number of codons away.
            if diff > 0 and diff % 3 == 0:
                print("orf at: %s   %s" % (start_span[0], stop_span[1]))
                all_orfs.append((start_span[0], stop_span[1]))
                break
        else:
            # No in-frame stop: mark the end as -1.
            all_orfs.append((start_span[0], -1))

    return all_orfs
# Main Program starts here

# Demo driver: find the ORFs in a small test sequence and dump their codons.
teststring = "ATG GGG GGG AAT GAT TAA CGT CGT TAA AGT ATG TTT TTT GTA G"
print(teststring)
# Remove the spacing used above for readability before searching.
teststring = re.sub(r"\s+", "", teststring)
print('chomped: %s' % teststring)

allspans = findORFS(teststring)
allorfs = []
print("spans")
print(allspans)

for i in allspans:
    x = i[0]
    y = i[1]
    # findORFS reports -1 as the end when no in-frame stop exists; used
    # directly as a slice bound that would silently drop the last base, so
    # run to the end of the sequence instead.
    if y == -1:
        y = len(teststring)
    temp_orf = orf(teststring[x:y])
    allorfs.append(temp_orf)

print(allorfs)

for i in allorfs:
    print("orfseq: %s" % i.sequence)
    # Iterating an orf walks its codons through orf.__getitem__.
    for j in i:
        print(j.sequence)
        print(j.protein)