Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-27: Difference between revisions

Revision as of 22:04, 27 February 2007

Find all ORFs (open reading frames) within the sequences.

import re

from Bio import Transcribe
from Bio import Translate
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq

# NOTE(review): Bio.Transcribe, Bio.Translate and Bio.Alphabet are used here
# exactly as in the original notebook; confirm they exist in the installed
# Biopython release before running.
transcriber = Transcribe.unambiguous_transcriber
standard_translator = Translate.unambiguous_dna_by_id[1]


class codon:
    """A stretch of DNA together with its transcription and translation.

    Attributes:
        sequence: the DNA string exactly as passed in.
        mRNA: result of ``transcriber.transcribe`` applied to the sequence
            wrapped as an unambiguous-DNA ``Seq``.
        protein: element ``[0]`` of the translation produced by
            ``standard_translator`` for the same sequence.
    """

    def __init__(self, sequence=""):
        self.sequence = sequence
        self.mRNA = transcriber.transcribe(Seq(sequence, IUPAC.unambiguous_dna))
        # Only the first element of the translation is kept.
        # NOTE(review): with the default empty sequence the translation is
        # presumably empty and the [0] lookup would fail -- confirm.
        self.protein = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0]

class mutation:
    """Plain container pairing a mutation type label with a (start, stop) span.

    Attributes:
        type: the label passed as ``type`` (empty string by default).
        span: two-element tuple ``(start, stop)``; both default to 0.
    """

    def __init__(self, type="", start=0, stop=0):
        # The two bounds are kept together as a single tuple.
        self.span = start, stop
        self.type = type

class orf:
    """An open reading frame stored as a list of ``codon`` objects.

    On construction the sequence is cut into consecutive three-base pieces
    starting at its first base.  If the ORF has no stop, any excess bases
    that do not fill a complete codon are ignored for the codon list; the
    untouched input string is still kept in ``self.sequence``.

    Attributes:
        codons: list of ``codon`` instances, one per complete triplet.
        sequence: the original sequence string.
    """

    def __init__(self, sequence=""):
        # Floor division keeps the triplet count an integer on both
        # Python 2 and Python 3 (plain "/" would give a float on Python 3
        # and range() would reject it).
        whole_codons = len(sequence) // 3
        self.codons = [
            codon(sequence[offset:offset + 3])
            for offset in range(0, whole_codons * 3, 3)
        ]
        self.sequence = sequence

    # orfs are indexed by codons
    def __getitem__(self, index):
        return self.codons[index]

    # Comparing two orfs is meant to return a list of mutations.
    def __eq__(self, other):
        # TODO: the comparison was never written in this revision (the
        # original line was a bare, unterminated "def").  Returning
        # NotImplemented lets "==" fall back to Python's default comparison
        # instead of failing at parse time.
        return NotImplemented

def findORFS(sequence, startpos=0):
    """Locate candidate open reading frames in a DNA string.

    Every ``ATG`` in *sequence* is treated as a start.  For each start, the
    first stop codon (``TAA``, ``TGA`` or ``TAG``) that begins downstream of
    it at a distance that is a multiple of three closes the ORF.  Matching
    is case-sensitive and done on the string as given.

    Progress information is printed to standard output, as in the original
    notebook.  Each print call takes a single pre-formatted argument so the
    text is the same under Python 2 and Python 3.

    Args:
        sequence: DNA string to scan.
        startpos: accepted for backward compatibility; currently unused.

    Returns:
        A list with one ``(start, end)`` tuple per ``ATG``, in order of
        appearance.  ``start`` is the index of the ``A``; ``end`` is the
        index just past the stop codon, or ``-1`` when no in-frame stop
        was found.
    """
    start_pattern = re.compile('ATG')
    stop_pattern = re.compile('(TAA|TGA|TAG)')

    start_spans = [m.span() for m in start_pattern.finditer(sequence)]
    stop_spans = [m.span() for m in stop_pattern.finditer(sequence)]

    print("Infunction:  %s" % (sequence,))
    print("starts:")
    for span in start_spans:
        print(span)
    print("stops:")
    for span in stop_spans:
        print(span)

    all_orfs = []
    for start_span in start_spans:
        begin = start_span[0]
        # stop_spans is in left-to-right order, so the first in-frame hit
        # is the nearest downstream stop.
        for stop_span in stop_spans:
            print("checking %s and %s" % (begin, stop_span[0]))
            diff = stop_span[0] - begin
            if diff > 0 and diff % 3 == 0:
                print("orf at: %s   %s" % (begin, stop_span[1]))
                all_orfs.append((begin, stop_span[1]))
                break
        else:
            # No in-frame stop: report the start with -1 as the end marker.
            all_orfs.append((begin, -1))

    return all_orfs

Main program starts here

# Demo run: find the ORFs in a small test sequence and print each one
# codon by codon.
teststring = "ATG GGG GGG AAT GAT TAA CGT CGT TAA AGT ATG TTT TTT GTA G"
print(teststring)
# Remove all whitespace so that string indices refer to bases only.
teststring = re.sub(r"\s+", "", teststring)
print("chomped: %s" % (teststring,))
allspans = findORFS(teststring)
allorfs = []
print("spans")
print(allspans)
for x, y in allspans:
    # findORFS reports an end of -1 when no in-frame stop was found.  Used
    # directly as a slice bound, -1 would silently drop the last base, so
    # an open ORF is sliced through to the end of the string instead.
    if y == -1:
        temp_orf = orf(teststring[x:])
    else:
        temp_orf = orf(teststring[x:y])
    allorfs.append(temp_orf)

print(allorfs)
for i in allorfs:
    print("orfseq: %s" % (i.sequence,))
    # Iterating an orf walks its codons via orf.__getitem__.
    for j in i:
        print(j.sequence)
        print(j.protein)

Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-27: Difference between revisions

Revision as of 22:04, 27 February 2007

Navigation menu

Page actions

Page actions

Personal tools

Navigation

Search

research

Tools