Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-4-19

From OpenWetWare
Revision as of 22:59, 18 April 2007 by Kfifer (talk | contribs)
Jump to navigationJump to search

This script does a regular BLAST lookup and parses out the top human hit and relevant info (hit to, hit from, hit id, hit def, hit seq, etc). The next step will be to get the relevant part of the contig file. Then we'll use that to get the gene and figure out the mutation and do relevant searching thereafter.

<syntax type='python'>

from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.EUtils import DBIdsClient
import xml.dom.minidom
from xml.dom.minidom import parse, parseString
from threading import Thread
import time, sys

# C-style struct to pass parameters
class AllelicVariant:
	pass

class Hit:
	# the fields are added on the fly but include hit_from,
	# hit_to, etc. (see extract_data())
	pass
 
# basic function to open fasta file and get sequence
def get_sequence_from_fasta(file):
	file_handle = open(file)
	records = SeqIO.parse(file_handle, format="fasta")
	record = records.next()
	return record.seq.data
 
# lookup blast snp data
def blast_lookup(seq):
	result_handle = NCBIWWW.qblast("blastn", 'refseq_genomic', seq)
	return result_handle.read()
 
# basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
def get_text(node_list):
    rc = ""
    for node in node_list:
        if node.nodeType == node.TEXT_NODE:
            rc = rc + node.data
    return rc
 
# extracts snp data
def extract_data(str):
	dom = parseString(str)
	variants = dom.getElementsByTagName("Hit")
	if len(variants) == 0:
		return
	parsed = []
	for v in variants:
		# now populate the struct
		access = get_text(v.getElementsByTagName("Hit_accession")[0].childNodes)
		score = get_text(v.getElementsByTagName("Hsp_score")[0].childNodes)
		# only consider it a genuine snp if the hit score is above 100 and the accession number starts with NC (meaning that it's human). 
		if int(score) > 100 and access.startswith("NC"):
			hit_from = get_text(v.getElementsByTagName("Hsp_hit-from")[0].childNodes)
			hit_to = get_text(v.getElementsByTagName("Hsp_hit-to")[0].childNodes)
			hit_def = get_text(v.getElementsByTagName("Hit_def")[0].childNodes)
			hit_seq = get_text(v.getElementsByTagName("Hsp_hseq")[0].childNodes)
			hit_id = get_text(v.getElementsByTagName("Hit_id")[0].childNodes)
			print access
			print score
			print "YAY!"
			new_hit = Hit()
			new_hit.hit_access = access
			new_hit.hit_score = score
			new_hit.hit_from = hit_from
			new_hit.hit_to = hit_to
			new_hit.hit_def = hit_def
			new_hit.hit_seq = hit_seq
			new_hit.hit_id = hit_id
			parsed.append(new_hit)
		
	return parsed
 
# queries the database and returns all info in an XML format
def omim_snp_search(dnsnp_id):
	client = DBIdsClient.DBIdsClient()
	query = client.search(dnsnp_id, "omim")
	records = [i.efetch(rettype="xml") for i in query]
	return records
 
# extracts allelic variant data, as the name implies, using the struct above
def extract_allelic_variant_data(str):
	dom = parseString(str)
	variants = dom.getElementsByTagName("Mim-allelic-variant")
	if len(variants) == 0:
		return
	parsed = []
	for v in variants:
		a = AllelicVariant() # create empty instance of struct
		# now populate the struct
		a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes)
		a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes)
		a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes)
		parsed.append(a)
	return parsed
 
# BEGIN ACTUAL PROGRAM
# read sequence from file
sequence = get_sequence_from_fasta("example.fasta")
print "Sequence received; please wait."
 
# look up data - HERE WE ARE USING BLAST NOT BLAST_SNP
b = blast_lookup(sequence)

print b


# extract data
e = extract_data(b)
print e[0].hit_access
print e[0].hit_to
print e[0].hit_from
print e[0].hit_id
print e[0].hit_seq
print e[0].hit_def
</syntax>