Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-4-19: Difference between revisions
No edit summary |
No edit summary |
||
Line 2: | Line 2: | ||
<syntax type='python'> | <syntax type='python'> | ||
from Bio import SeqIO | from Bio import SeqIO | ||
from Bio.Blast import NCBIWWW | from Bio.Blast import NCBIWWW | ||
Line 116: | Line 115: | ||
print e[0].hit_def | print e[0].hit_def | ||
</syntax> | </syntax> | ||
Latest revision as of 23:00, 18 April 2007
This script does a regular BLAST lookup and parses out the top human hit and relevant info (hit to, hit from, hit id, hit def, hit seq, etc). The next step will be to get the relevant part of the contig file. Then we'll use that to get the gene and figure out the mutation and do relevant searching thereafter.
<syntax type='python'> from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.EUtils import DBIdsClient import xml.dom.minidom from xml.dom.minidom import parse, parseString from threading import Thread import time, sys
- C-style struct to pass parameters
class AllelicVariant: pass
class Hit: # the fields are added on the fly but include hit_from, # hit_to, etc. (see extract_data()) pass
- basic function to open fasta file and get sequence
def get_sequence_from_fasta(file): file_handle = open(file) records = SeqIO.parse(file_handle, format="fasta") record = records.next() return record.seq.data
- lookup blast snp data
def blast_lookup(seq): result_handle = NCBIWWW.qblast("blastn", 'refseq_genomic', seq) return result_handle.read()
- basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
def get_text(node_list):
rc = "" for node in node_list: if node.nodeType == node.TEXT_NODE: rc = rc + node.data return rc
- extracts snp data
def extract_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Hit") if len(variants) == 0: return parsed = [] for v in variants: # now populate the struct access = get_text(v.getElementsByTagName("Hit_accession")[0].childNodes) score = get_text(v.getElementsByTagName("Hsp_score")[0].childNodes) # only consider it a genuine snp if the hit score is above 100 and the accession number starts with NC (meaning that it's human). if int(score) > 100 and access.startswith("NC"): hit_from = get_text(v.getElementsByTagName("Hsp_hit-from")[0].childNodes) hit_to = get_text(v.getElementsByTagName("Hsp_hit-to")[0].childNodes) hit_def = get_text(v.getElementsByTagName("Hit_def")[0].childNodes) hit_seq = get_text(v.getElementsByTagName("Hsp_hseq")[0].childNodes) hit_id = get_text(v.getElementsByTagName("Hit_id")[0].childNodes) print access print score print "YAY!" new_hit = Hit() new_hit.hit_access = access new_hit.hit_score = score new_hit.hit_from = hit_from new_hit.hit_to = hit_to new_hit.hit_def = hit_def new_hit.hit_seq = hit_seq new_hit.hit_id = hit_id parsed.append(new_hit)
return parsed
- queries the database and returns all info in an XML format
def omim_snp_search(dnsnp_id): client = DBIdsClient.DBIdsClient() query = client.search(dnsnp_id, "omim") records = [i.efetch(rettype="xml") for i in query] return records
- extracts allelic variant data, as the name implies, using the struct above
def extract_allelic_variant_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Mim-allelic-variant") if len(variants) == 0: return parsed = [] for v in variants: a = AllelicVariant() # create empty instance of struct # now populate the struct a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes) a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes) a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes) parsed.append(a) return parsed
- BEGIN ACTUAL PROGRAM
- read sequence from file
sequence = get_sequence_from_fasta("example.fasta") print "Sequence received; please wait."
- look up data - HERE WE ARE USING BLAST NOT BLAST_SNP
b = blast_lookup(sequence)
print b
- extract data
e = extract_data(b) print e[0].hit_access print e[0].hit_to print e[0].hit_from print e[0].hit_id print e[0].hit_seq print e[0].hit_def </syntax>