Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-3-6: Difference between revisions
From OpenWetWare
Jump to navigationJump to search
No edit summary |
No edit summary |
||
Line 1: | Line 1: | ||
Notes: | Notes: | ||
This isn't as clean as the sample output, but it compares two hardcoded strings and does mutation compatison. The functions package mutations and orfs into objects that could be exported for further manipulation. Though this version doesn't implement it, it would be fairly easy to load in multiple sequences to do pairwise alignments and call the main script (This also goes for the reverse strand comparisons). Also missing in this version is the final output proteins for the insertions and deletions. I couldn't quite figure out exactly we're supposed to handle out of fram mutations in getting the codons to line up. | This isn't as clean as the sample output, but it compares two hardcoded strings and does mutation compatison. The functions package mutations and orfs into objects that could be exported for further manipulation. Though this version doesn't implement it, it would be fairly easy to load in multiple sequences to do pairwise alignments and call the main script (This also goes for the reverse strand comparisons). Also missing in this version is the final output proteins for the insertions and deletions. I couldn't quite figure out exactly we're supposed to handle out of fram mutations in getting the codons to line up. | ||
<pre> | <pre> |
Revision as of 23:31, 5 March 2007
Notes: This isn't as clean as the sample output, but it compares two hardcoded strings and does mutation comparison. The functions package mutations and ORFs into objects that could be exported for further manipulation. Though this version doesn't implement it, it would be fairly easy to load in multiple sequences, do pairwise alignments, and call the main script (this also goes for the reverse-strand comparisons). Also missing in this version are the final output proteins for the insertions and deletions. I couldn't quite figure out exactly how we're supposed to handle out-of-frame mutations in getting the codons to line up.
#find all orfs within the sequences import re, string from Bio import Transcribe transcriber = Transcribe.unambiguous_transcriber from Bio import Translate from Bio.Alphabet import IUPAC from Bio.Seq import Seq import os from Bio import Clustalw standard_translator = Translate.unambiguous_dna_by_id[1] class codon: def __init__(self, sequence=""): self.sequence = sequence self.mRNA = transcriber.transcribe(Seq(sequence, IUPAC.unambiguous_dna)) self.protein = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0] class raw_mutation: def __init__(self,type="",ref_location=0,other_location=0, original="", mutant=""): self.type = type self.ref_location=ref_location self.other_location = other_location self.original = original self.mutant = mutant class mutation: def __init__(self,type="",ref_span=[0,0],other_span=[0,0],original="", mutant=""): self.type = type self.ref_span = ref_span self.other_span = other_span self.original = original self.mutant = mutant class orf: #On initiation, the orf is stored as a list of codons. 
If the orf has no stop, any excess bases will #be ignored on the conversion to codons def __init__(self, source="", source_span=[]): self.source = source self.codons = [] self.sequence = source[source_span[0]:source_span[1]] self.source_span = source_span for i in range(len(self.sequence)/3): temp_codon = codon(self.sequence[i*3:3+(i*3)]) #this algorithm of seperating into codons ignores any excess bases self.codons.append(temp_codon) #orfs are indexed by codons def __getitem__(self,index): return self.codons[index] def find_mutations(self,other): refPos = 0 seqPos = 0 #print "Ref:",self #print "Other:",other all_mutations = [] for i in range(len(self)): #print self[i], " ", other[i], ' ', refPos, ' ', seqPos if self[i] != other[i]: if self[i] == '-': new_mutation = raw_mutation("insertion",refPos,seqPos) elif other[i] == '-': new_mutation = raw_mutation("deletion",refPos,seqPos) else: new_mutation = raw_mutation("point",refPos,seqPos, self[i],other[i]) all_mutations.append(new_mutation) #print "Mutation!" 
#print new_mutation.type," ",new_mutation.ref_location," ",new_mutation.other_location if self[i] != '-': refPos += 1 if other[i] != '-': seqPos += 1 return all_mutations def consolidate_mutations(raw_mutations): #I think this will have a bug for insertions before the first base of the refseq because #in every other case, ref_loc will refer to the base before the insertion consolidated_mutations =[] i = 0 while i < len(raw_mutations): #print "outer loop" current_type = raw_mutations[i].type if(current_type == "point"): ref_loc = raw_mutations[i].ref_location other_loc = raw_mutations[i].other_location new_mutation = mutation("point",[ref_loc,ref_loc],[other_loc,other_loc],raw_mutations[i].original, raw_mutations[i].mutant) consolidated_mutations.append(new_mutation) i += 1 elif(current_type == "deletion") or (current_type == "insertion"): ref_start = raw_mutations[i].ref_location other_start = raw_mutations[i].other_location ref_end = raw_mutations[i].ref_location other_end = raw_mutations[i].other_location i += 1 #It would have been nice to have a do while here... while (i<=len(raw_mutations)): #and (raw_mutations[i].type == current_type): #hopefully it exits after the first condition so it doesn't go past array length if (i==len(raw_mutations)) or ((raw_mutations[i].ref_location!=ref_start) and (raw_mutations[i].other_location!=other_start)): new_mutation = mutation(current_type,[ref_start,ref_end],[other_start,other_end]) consolidated_mutations.append(new_mutation) #print current_type, "found at ", "ref", ref_start, ",",ref_end," seq ", other_start, ",", other_end break elif (current_type == "insertion"): #and (raw_mutations[i].ref_location==ref_start): other_end += 1 i += 1 #print "extending insertion" elif (current_type == "deletion"): #and (raw_mutations[i].ref_location==other_start): ref_end += 1 i += 1 #print "extending deletion" else: print "I shouldn't be here!!!" 
break return consolidated_mutations def findORFS(sequence, startpos=0): print "For",sequence all_orfs = [] start = re.compile('ATG') stop = re.compile('(TAA|TGA|TAG)') all_starts = start.finditer(sequence) all_stops = stop.finditer(sequence) all_starts_list = [] for match in all_starts: all_starts_list.append(match.span()) all_stops_list = [] for stops in all_stops: all_stops_list.append(stops.span()) #print stops.span() for start in all_starts_list: found = 0 for stop in all_stops_list: diff = (stop[0]-start[0]) if ((diff>0) and ((diff%3) == 0)): print "orf at:", start[0]," ",stop[1] temp_orf = orf(sequence,(start[0],stop[1])) all_orfs.append(temp_orf) #all_orfs.append((start[0],stop[1])) found = 1 break if found ==0: print "open orf", temp_orf = orf(sequence,(start[0],None)) all_orfs.append(temp_orf) print "starting at:", start[0] #all_orfs.append((start[0],None)) return all_orfs def check_point(mutation, ref_orfs): mut_pos = mutation.ref_span[0] #ref_span[0] and [1] are the same print "For ", mutation.type, "at position", mutation.ref_span[0], ':' for i in ref_orfs: if (mut_pos>=i.source_span[0] and mut_pos<=i.source_span[1]): #does the mutation affect the orf print "Point affects ", i.sequence ref_codon_position = (mut_pos - i.source_span[0])/3 base_position = (mut_pos-i.source_span[0])%3 original_aa = i[ref_codon_position].protein mut_codon = i[ref_codon_position].sequence # print "Original Codon:", mut_codon mut_codon = list(mut_codon) mut_codon[base_position] = mutation.mutant mut_codon = "".join(mut_codon) # print "Mutant Codon",mut_codon mutant_aa = standard_translator.translate(Seq(mut_codon, IUPAC.unambiguous_dna))[0] if (mutant_aa == original_aa): print "Silent Point Mutation detected: ", else: print "Non-silent Point Mutation detected", print (mutation.original+str(mut_pos)+mutation.mutant),"Protein result:",original_aa+str(mut_pos)+mutant_aa def formatted_seq_print (sequence): for i in range(len(sequence)/3): start_base = i*3 print_codon = 
sequence.data[start_base:start_base+3] print print_codon, print "" def formatted_codon_print (sequence): for i in range(len(sequence)/3): start_base = i*3 prot = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0] print_codon = sequence.data[start_base:start_base+3] #if currently in the deletion region, don't print print print_codon, print "" def check_frame(mutation, all_orfs, all_records): if ((mutation.ref_span[1]-mutation.ref_span[0])%3 !=0) or ((mutation.other_span[1]-mutation.other_span[0])%3 !=0): print mutation.type, "from", mutation.ref_span[0]," to", mutation.ref_span[1], "is a frameshift" else: print mutation.type, "from", mutation.ref_span[0]," to", mutation.ref_span[1], "is in frame" print "odna:", formatted_seq_print (all_records[0].seq) #for i in range(len(something)/3): # print " ",some_protein," " print "mdna:", formatted_seq_print (all_records[1].seq) #for i in range(len(something)/3): # print " ",some_protein," " #Main Program starts here teststring = " A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT" teststring2= "CGA ATG GGG GCG ATG ATT AAC C GTT AAA GTA TGT TTT TTG TAG" print teststring teststring = re.sub("\s+", "", teststring) teststring2 = re.sub("\s+", "", teststring2) print "Forward orfs" allorfs = findORFS(teststring) allorfs2 = findORFS(teststring2) temp_file = open(os.path.join(os.curdir, 'temp.txt'),"w") temp_file.write(">temp1|\n ") temp_file.write(teststring) temp_file.write("\n\n") temp_file.write(">gtemp2|\n ") temp_file.write(teststring2) temp_file.close() cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'temp.txt')) cline.set_output('test.aln') alignment = Clustalw.do_alignment(cline) all_records = alignment.get_all_seqs() #print alignment mutations = find_mutations(all_records[0].seq,all_records[1].seq) #print mutations all_real = consolidate_mutations (mutations) for i in all_real: print i.type, "at ref:", i.ref_span, " other ", i.other_span for i in all_real: if (i.type == "point"): 
check_point(i, allorfs) else: check_frame(i,allorfs,all_records) reverse_list1 = list(teststring)[:] reverse_list2 = list(teststring2)[:] teststring = "".join(teststring) teststring2 = "".join(teststring2) reverse_list1.reverse() reverse_list2.reverse() reverse_string1 = "".join(reverse_list1) reverse_string2 = "".join(reverse_list2) table = string.maketrans("ATGC","TACG") reverse_string1 = reverse_string1.translate(table) reverse_string2 = reverse_string2.translate(table) print "reverse orfs" revallorfs = findORFS(reverse_string1) revallorfs2 = findORFS(reverse_string2) #print "orfs2" #for i in allorfs2: # print i.sequence