Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-27: Difference between revisions

Revision as of 22:04, 27 February 2007

Notes: I've decided to make an orf class that is indexed by codon, backed by a list of codon objects. Mutation detection will be built into the comparison operators for the class. It's taking me a bit to figure out exactly how these things all work...

#find all orfs within the sequences
import re
from Bio import Transcribe
transcriber = Transcribe.unambiguous_transcriber
from Bio import Translate
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
standard_translator = Translate.unambiguous_dna_by_id[1] 
class codon:
    """One slice of DNA bundled with its transcript and translated residue.

    Attributes set by the constructor:
        sequence -- the DNA text exactly as passed in
        mRNA     -- result of running the module-level transcriber on it
        protein  -- first element of the module-level translator's output
    """

    def __init__(self, sequence=""):
        # Wrap the raw text once and feed the same Seq to both converters.
        dna = Seq(sequence, IUPAC.unambiguous_dna)
        self.sequence = sequence
        self.mRNA = transcriber.transcribe(dna)
        # Only the first element of the translation is kept.
        translated = standard_translator.translate(dna)
        self.protein = translated[0]

class mutation:
    """Plain record describing one mutation: a kind label and a span.

    Attributes set by the constructor:
        type -- the label passed as ``type``
        span -- the tuple ``(start, stop)``
    """

    def __init__(self, type="", start=0, stop=0):
        # Pack the two coordinates into a single tuple attribute.
        coordinates = (start, stop)
        self.span = coordinates
        self.type = type
    
class orf:
    """An open reading frame stored as an ordered list of codon objects.

    On construction the sequence is cut into consecutive three-base pieces
    starting at offset 0, and each piece is wrapped in a ``codon``.  If the
    length is not a multiple of three (for example an orf with no stop), the
    one or two trailing bases are left out of ``codons``; ``sequence`` still
    holds the full string that was passed in.
    """

    def __init__(self, sequence=""):
        # Floor division keeps the codon count an int on Python 3 as well as
        # Python 2; any excess bases past the last full codon are ignored.
        whole_codons = len(sequence) // 3
        self.codons = [
            codon(sequence[i * 3:(i * 3) + 3])
            for i in range(whole_codons)
        ]
        self.sequence = sequence

    # orfs are indexed by codons
    def __getitem__(self, index):
        return self.codons[index]

    # TODO: comparing two orfs is planned to return a list of mutations.
    # Until that exists, decline the comparison so Python falls back to its
    # default behaviour instead of the file failing to parse on an empty stub.
    def __eq__(self, other):
        return NotImplemented


def findORFS(sequence, startpos=0):
    """Locate candidate open reading frames in a DNA string.

    Every literal ``ATG`` in *sequence* is treated as a start.  Each start is
    paired with the first downstream ``TAA``/``TGA``/``TAG`` whose distance
    from the start is a positive multiple of three, i.e. the first stop in
    the same reading frame.  Progress is echoed to stdout as it goes.

    Args:
        sequence: DNA text to scan.  Matching is literal and case-sensitive.
        startpos: Accepted but currently unused.

    Returns:
        A list with one ``(begin, end)`` tuple per start codon, in order of
        appearance.  ``begin`` is the index of the start codon and ``end`` is
        the index just past the stop codon, so ``sequence[begin:end]``
        includes the stop.  When no in-frame stop exists, ``end`` is ``-1``.
    """
    start_pattern = re.compile('ATG')
    stop_pattern = re.compile('(TAA|TGA|TAG)')
    start_matches = start_pattern.finditer(sequence)
    stop_matches = stop_pattern.finditer(sequence)

    # Single-argument print(...) calls produce the same text on Python 2 and
    # Python 3; the format strings reproduce the original spacing.
    print("Infunction:  %s" % (sequence,))
    print("starts:")
    start_spans = []
    for match in start_matches:
        start_spans.append(match.span())
        print(match.span())
    print("stops:")
    stop_spans = []
    for match in stop_matches:
        stop_spans.append(match.span())
        print(match.span())

    all_orfs = []
    for start_span in start_spans:
        begin = start_span[0]
        for stop_span in stop_spans:
            print("checking %s and %s" % (begin, stop_span[0]))
            diff = stop_span[0] - begin
            # Downstream and in frame: stops are in ascending order, so the
            # first hit is the nearest in-frame stop.
            if diff > 0 and diff % 3 == 0:
                print("orf at: %s   %s" % (begin, stop_span[1]))
                all_orfs.append((begin, stop_span[1]))
                break
        else:
            # No in-frame stop for this start: report the -1 sentinel.
            all_orfs.append((begin, -1))

    return all_orfs

#Main Program starts here
# Demo driver: strip whitespace from a test sequence, find its orfs, build an
# orf object for each span, then print every codon and its translation.
# Output uses single-argument print(...) so it runs on Python 2 and Python 3.
teststring = "ATG GGG GGG AAT GAT TAA CGT CGT TAA AGT ATG TTT TTT GTA G"
print(teststring)
teststring = re.sub(r"\s+", "", teststring)
print("chomped: %s" % teststring)
allspans = findORFS(teststring)
allorfs = []
print("spans")
print(allspans)
for i in allspans:
    x = i[0]
    y = i[1]
    # findORFS reports an end of -1 when a start has no in-frame stop.  Used
    # directly as a slice bound, -1 would drop the final base, so slice to
    # the end of the string instead.
    if y == -1:
        y = None
    temp_orf = orf(teststring[x:y])
    allorfs.append(temp_orf)
print(allorfs)
for i in allorfs:
    print("orfseq: %s" % i.sequence)
    for j in i:
        print(j.sequence)
        print(j.protein)

Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-27: Difference between revisions

Revision as of 22:04, 27 February 2007

Navigation menu

Page actions

Page actions

Personal tools

Navigation

Search

research

Tools