Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-27: Difference between revisions
From OpenWetWare
Jump to navigationJump to search
(New page: Notes: I've decided to make an orf class that indexes to a list of codon classes. The mutation detection will be built into comparators for the class. It's taking me a bit to figure out e...) |
No edit summary |
||
Line 1: | Line 1: | ||
#find all orfs within the sequences | #find all orfs within the sequences | ||
import re | import re | ||
from Bio import Transcribe | |||
transcriber = Transcribe.unambiguous_transcriber | |||
from Bio import Translate | |||
from Bio.Alphabet import IUPAC | |||
from Bio.Seq import Seq | |||
standard_translator = Translate.unambiguous_dna_by_id[1] | |||
class codon: | |||
def __init__(self, sequence=""): | |||
self.sequence = sequence | |||
self.mRNA = transcriber.transcribe(Seq(sequence, IUPAC.unambiguous_dna)) | |||
self.protein = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0] | |||
class mutation: | |||
def __init__(self,type="",start=0,stop=0): | |||
self.type = type | |||
self.span=(start,stop) | |||
class orf: | class orf: | ||
#On initiation, the orf is stored as a list of codons. If the orf has no stop, any excess bases will | |||
sequence = "" | #be ignored on the conversion to codons | ||
def __init__(self, sequence=""): | |||
self.codons = [] | |||
for i in range(len(sequence)/3): | |||
#print i | |||
temp_codon = codon(sequence[i*3:3+(i*3)]) #this algorithm of seperating into codons ignores any excess bases | |||
self.codons.append(temp_codon) | |||
#print self.codons[i].sequence | |||
self.sequence = sequence | |||
#orfs are indexed by codons | |||
def __getitem__(self,index): | |||
return self.codons[index] | |||
#comparing two orfs returns a list of mutations | |||
def __eq__(self,other) | |||
def findORFS(sequence, startpos=0): | def findORFS(sequence, startpos=0): | ||
all_orfs = [] | all_orfs = [] | ||
start = re.compile(' | start = re.compile('ATG') | ||
stop = re.compile('( | stop = re.compile('(TAA|TGA|TAG)') | ||
all_starts = start.finditer(sequence) | all_starts = start.finditer(sequence) | ||
all_stops = stop.finditer(sequence) | all_stops = stop.finditer(sequence) | ||
print "Infunction: ",sequence | |||
print "starts:" | print "starts:" | ||
all_starts_list = [] | all_starts_list = [] | ||
Line 40: | Line 68: | ||
all_orfs.append((start[0],-1)) | all_orfs.append((start[0],-1)) | ||
return all_orfs | |||
teststring = " | #Main Program starts here | ||
teststring = "ATG GGG GGG AAT GAT TAA CGT CGT TAA AGT ATG TTT TTT GTA G" | |||
print teststring | print teststring | ||
teststring = re.sub("\s+", "", teststring) | teststring = re.sub("\s+", "", teststring) | ||
print 'chomped:',teststring | print 'chomped:',teststring | ||
findORFS(teststring) | allspans = findORFS(teststring) | ||
allorfs = [] | |||
print "spans" | |||
print allspans | |||
for i in allspans: | |||
# | x = i[0] #I don't know why it won't let me use i[0] and i[1] directly as an int | ||
y = i[1] | |||
temp_orf = orf(teststring[x:y]) | |||
allorfs.append(temp_orf) | |||
print allorfs | |||
for i in allorfs: | |||
print "orfseq:",i.sequence | |||
for j in i: | |||
print j.sequence | |||
print j.protein | |||
orf | |||
print | |||
for i in | |||
print " | |||
print | |||
Revision as of 22:04, 27 February 2007
#find all orfs within the sequences
import re

from Bio import Transcribe
from Bio import Translate
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq

# Module-wide Biopython helpers: the unambiguous DNA transcriber and the
# unambiguous DNA translator registered under table id 1.
transcriber = Transcribe.unambiguous_transcriber
standard_translator = Translate.unambiguous_dna_by_id[1]


class codon:
    """One codon together with its transcript and its translation.

    Attributes:
        sequence: the DNA text exactly as passed to the constructor.
        mRNA: result of ``transcriber.transcribe`` on that DNA.
        protein: first symbol of the translated sequence, or "" when the
            translation is empty (for example with the default argument).
    """

    def __init__(self, sequence=""):
        # Build the Seq once and reuse it for transcription and translation.
        dna = Seq(sequence, IUPAC.unambiguous_dna)
        self.sequence = sequence
        self.mRNA = transcriber.transcribe(dna)
        translated = standard_translator.translate(dna)
        # Indexing [0] on an empty translation raises IndexError, which made
        # the constructor's own default argument unusable.
        if len(translated) > 0:
            self.protein = translated[0]
        else:
            self.protein = ""
class mutation:
    """Plain record for one mutation: a kind label plus the span it covers.

    Attributes:
        type: the label passed in as ``type``.
        span: ``(start, stop)`` tuple built from the two position arguments.
    """

    def __init__(self, type="", start=0, stop=0):
        # Both endpoints travel together as a single tuple.
        endpoints = (start, stop)
        self.type, self.span = type, endpoints
class orf:
    """An open reading frame stored as a list of ``codon`` objects.

    On construction the sequence is cut into consecutive three-base codons.
    If the orf has no stop, any excess bases (a trailing one or two) are
    ignored on the conversion to codons; ``sequence`` itself keeps the full
    text that was passed in.

    Attributes:
        sequence: the original sequence text.
        codons: list of ``codon`` instances, one per complete triplet.
    """

    def __init__(self, sequence=""):
        self.sequence = sequence
        # Stepping by 3 and stopping two short of the end yields exactly
        # len(sequence) // 3 complete triplets, without relying on "/"
        # performing integer division.
        self.codons = [
            codon(sequence[offset:offset + 3])
            for offset in range(0, len(sequence) - 2, 3)
        ]

    def __getitem__(self, index):
        """Return the codon at ``index``; orfs are indexed by codons."""
        return self.codons[index]

    def __eq__(self, other):
        # The plan noted in this revision is for comparing two orfs to yield
        # a list of mutations, but that logic has not been written yet (the
        # original stub had no body and did not parse).
        # TODO: implement mutation detection. Until then, defer to Python's
        # default comparison.
        return NotImplemented
# Compiled once at import time instead of on every call.
_START_CODON = re.compile('ATG')
_STOP_CODON = re.compile('(TAA|TGA|TAG)')


def findORFS(sequence, startpos=0):
    """Locate open reading frames in ``sequence``.

    Every ``ATG`` is paired with the first stop codon (``TAA``, ``TGA`` or
    ``TAG``) that lies after it at a distance that is a multiple of three.

    Args:
        sequence: DNA text to scan; the patterns are upper-case, so
            lower-case bases are not matched.
        startpos: accepted for interface compatibility; not used here.

    Returns:
        A list with one ``(start, end)`` tuple per ``ATG``, in order of
        appearance. ``start`` is the index of the ``A`` and ``end`` is the
        index just past the stop codon, so ``sequence[start:end]`` includes
        the stop. When no in-frame stop exists, ``end`` is ``-1``.

    Progress is printed to standard output while scanning.
    """
    print("Infunction:  %s" % (sequence,))

    print("starts:")
    start_spans = [found.span() for found in _START_CODON.finditer(sequence)]
    for span in start_spans:
        print(span)

    print("stops:")
    stop_spans = [found.span() for found in _STOP_CODON.finditer(sequence)]
    for span in stop_spans:
        print(span)

    all_orfs = []
    for orf_start, _ in start_spans:
        orf_end = -1  # sentinel: no in-frame stop found for this start
        for stop_start, stop_end in stop_spans:
            print("checking %s and %s" % (orf_start, stop_start))
            distance = stop_start - orf_start
            # The stop must be downstream and in the same reading frame.
            if distance > 0 and distance % 3 == 0:
                print("orf at: %s   %s" % (orf_start, stop_end))
                orf_end = stop_end
                break
        all_orfs.append((orf_start, orf_end))
    return all_orfs
#Main Program starts here
# Demo run: find every orf in a small test sequence and print its codons.
teststring = "ATG GGG GGG AAT GAT TAA CGT CGT TAA AGT ATG TTT TTT GTA G"
print(teststring)
# Strip all whitespace so positions refer to bases only.
teststring = re.sub(r"\s+", "", teststring)
print("chomped: %s" % (teststring,))

allspans = findORFS(teststring)
print("spans")
print(allspans)

allorfs = []
for x, y in allspans:
    # findORFS marks a missing stop codon with an end of -1. Used directly
    # as a slice bound, -1 would silently drop the last base, so an
    # unterminated orf runs to the end of the sequence instead.
    if y == -1:
        temp_orf = orf(teststring[x:])
    else:
        temp_orf = orf(teststring[x:y])
    allorfs.append(temp_orf)
print(allorfs)

for i in allorfs:
    print("orfseq: %s" % (i.sequence,))
    # Iterating an orf walks its codons through orf.__getitem__.
    for j in i:
        print(j.sequence)
        print(j.protein)