Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-27

(Difference between revisions)

Mdwang (Talk | contribs)
(New page: Notes: I've decided to make an orf class that indexes to a list of codon classes. The mutation detection will be built into comparators for the class. It's taking me a bit to figure out e...)
Next diff →

Revision as of 13:19, 27 February 2007

Notes: I've decided to make an orf class that indexes to a list of codon classes. The mutation detection will be built into comparators for the class. It's taking me a bit to figure out exactly how these things all work...

```
#find all orfs within the sequences
import re
class orf(object):
    """One open reading frame: a span plus the sequence text it covers.

    The class-level ``span`` and ``sequence`` values are kept as defaults,
    so ``orf()``, ``orf.span`` and ``orf.sequence`` behave exactly as they
    did before a constructor existed.

    Args:
        span: Pair of indices for the ORF; presumably ``(start, end)`` in
            the style of ``re`` match spans -- confirm against callers.
        sequence: Text of the ORF.
    """

    # Defaults visible on the class itself and on bare instances.
    span = (0, 0)
    sequence = ""

    def __init__(self, span=(0, 0), sequence=""):
        # Store per-instance values so separate ORFs do not share state.
        self.span = span
        self.sequence = sequence

    def __repr__(self):
        return "orf(span=%r, sequence=%r)" % (self.span, self.sequence)

def findORFS(sequence, startpos=0):
    """Locate open reading frames (ORFs) in an RNA string.

    Every ``AUG`` in *sequence* is treated as a start codon and paired with
    the first downstream stop codon (``UAA``, ``UGA`` or ``UAG``) whose
    distance from the start is a positive multiple of three, i.e. one that
    lies in the same reading frame.

    Args:
        sequence: RNA text to search. Whitespace is not stripped here, so
            the caller should remove it first.
        startpos: Accepted for compatibility; currently not used.

    Returns:
        A list with one ``(start, end)`` tuple per start codon, in order of
        occurrence. ``start`` is the index of the ``A`` of ``AUG``; ``end``
        is the index just past the matching stop codon, or ``-1`` when no
        in-frame stop codon follows.

    The progress output (codon spans, each comparison, the final list) is
    still printed, so existing callers see the same text as before.
    """
    start_codon = re.compile('AUG')
    stop_codon = re.compile('(UAA|UGA|UAG)')

    # finditer() returns one-shot iterators, so the spans are copied into
    # lists that can be walked once per start codon.
    print("starts:")
    start_spans = []
    for match in start_codon.finditer(sequence):
        start_spans.append(match.span())
        print(match.span())

    print("stops:")
    stop_spans = []
    for match in stop_codon.finditer(sequence):
        stop_spans.append(match.span())
        print(match.span())

    all_orfs = []
    for orf_start, _ in start_spans:
        for stop_start, stop_end in stop_spans:
            print("checking %s and %s" % (orf_start, stop_start))
            offset = stop_start - orf_start
            # In frame means the stop lies a whole number of codons
            # downstream of the start.
            if offset > 0 and offset % 3 == 0:
                print("orf at: %s   %s" % (orf_start, stop_end))
                all_orfs.append((orf_start, stop_end))
                break
        else:
            # No in-frame stop codon: record the ORF as open-ended.
            all_orfs.append((orf_start, -1))

    print(all_orfs)
    return all_orfs

# Sample sequence for exercising findORFS. It is written in groups of three
# for readability; all whitespace is removed before searching.
teststring = "AUG GGG GGG AAU GAU UAA CGT CGT UAA AGT AUG TTT TTU GUA G"
print(teststring)
# Raw string so that \s reaches the regex engine instead of being read as a
# (deprecated) string escape.
teststring = re.sub(r"\s+", "", teststring)
print("chomped: %s" % teststring)
findORFS(teststring)
#AUG GGG GGG AAU GAU UAA CGT CGT UAA AGT AUG TTT TTU GUA G
#Find the position of all the start codons

#OK.  I quickly learned that iterators don't reset once you get to the end of them
#all_starts = first_all_starts
#all_stops = first_all_stops
#Well...apparently you can't copy iterators... Great
#This is wasteful, but I'm going to copy all the match spans into a list
#Ideally I could instead shove all codons into an orf object, but I don't know how to do that in python...

#Sequentially go through each start codon and look for stop codons in frame, otherwise store the entire sequence

#Now that I have the spans for all starts, and all stops if I want, I can match up normalized distances from the first start
#and pair them up without even looking at the strings again.  But I need to make sure the search steps in triplets (stays in frame)
# Regex-only alternative: match from a start codon to the first stop codon.
# The pattern is bound to its own name so it does not overwrite the `orf`
# class, and it consumes whole codons ({3}) so only in-frame stop codons
# can end a match.
orf_pattern = re.compile('AUG(?:[ATUCG]{3})*?(UAA|UGA|UAG)')
te = orf_pattern.finditer(teststring)
print("dammit")
for i in te:
    print("found one")
    print(i.group())

```