Harvard:Biophysics 101/2007/Notebook:Kaull/2007-5-2

From OpenWetWare
Jump to navigationJump to search

OMIM workaround, now in workable form. Enjoy.

Specs: Takes any rs #. Returns an AllelicVariant struct, which is how Xiaodi's code worked before, so life should not be complicated for those downstream (tell me if this is untrue!).

from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.EUtils import DBIdsClient

import xml.dom.minidom
from xml.dom.minidom import parse, parseString
from threading import Thread

import pickle, sys, time, urllib

#########################################
#### Code taken from elsewhere, unedited

# Report lines accumulated by the script at the bottom of this file,
# which appends to this list and finally prints every entry.
outputlist = []

# C-style struct to pass parameters
class AllelicVariant:
    """Empty attribute container used like a C struct.

    Instances are created with no attributes of their own;
    extract_allelic_variant_data() attaches ``name``, ``mutation`` and
    ``description`` to each one it builds.
    """

# basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
def get_text(node_list):
    """Return the concatenated character data of the text nodes in node_list.

    Only members of ``node_list`` whose ``nodeType`` is ``TEXT_NODE``
    contribute; every other node is skipped and is not descended into.
    The result is "" when no text node is present.
    """
    return "".join(
        item.data for item in node_list if item.nodeType == item.TEXT_NODE
    )

# extracts allelic variant data, as the name implies, using the struct above
def extract_allelic_variant_data(str):
    """Parse an OMIM XML record and collect its allelic variants.

    Args:
        str: XML text of one record.  The parameter name shadows the
            builtin; it is kept so existing keyword callers keep working.

    Returns:
        A list with one AllelicVariant per "Mim-allelic-variant" element,
        each carrying ``name``, ``mutation`` and ``description`` strings,
        or None when the record contains no such element.  A field whose
        element is missing from a variant is set to "" instead of raising
        IndexError.
    """
    def first_text(parent, tag, nested_tag=None):
        # Text of the first <tag> below parent (or of the first
        # <nested_tag> inside it when given); "" if either is absent.
        found = parent.getElementsByTagName(tag)
        if found and nested_tag is not None:
            found = found[0].getElementsByTagName(nested_tag)
        if not found:
            return ""
        return get_text(found[0].childNodes)

    dom = parseString(str)
    variants = dom.getElementsByTagName("Mim-allelic-variant")
    if not variants:
        # Callers test for None to detect "no variants in this record".
        return None

    parsed = []
    for v in variants:
        a = AllelicVariant()  # create empty instance of struct
        # now populate the struct
        a.name = first_text(v, "Mim-allelic-variant_name")
        a.mutation = first_text(
            v, "Mim-allelic-variant_mutation", "Mim-text_text")
        a.description = first_text(
            v, "Mim-allelic-variant_description", "Mim-text_text")
        parsed.append(a)
    return parsed

# queries the database and returns all info in an XML format
def omim_snp_search(dnsnp_id):
    """Search the "omim" database for dnsnp_id and fetch each hit as XML.

    Returns a list with the result of ``efetch(rettype="xml")`` for every
    search hit, in the order the search yields them; the list is empty
    when the search yields nothing.
    NOTE(review): the fetched objects are presumably file-like (the script
    below calls ``.read()`` on them) -- confirm against DBIdsClient.
    """
    hits = DBIdsClient.DBIdsClient().search(dnsnp_id, "omim")
    fetched = []
    for hit in hits:
        fetched.append(hit.efetch(rettype="xml"))
    return fetched

#########################################
#### Code taken from elsewhere, edited

# queries the database and returns all info in an XML format
def omim_tag_search(tag_id):
    """Search the "omim" database for tag_id and fetch each hit as XML.

    Returns a list with the result of ``efetch(rettype="xml")`` for every
    search hit, in the order the search yields them; the list is empty
    when the search yields nothing.
    """
    omim_client = DBIdsClient.DBIdsClient()
    xml_records = []
    for match in omim_client.search(tag_id, "omim"):
        xml_records.append(match.efetch(rettype="xml"))
    return xml_records

#########################################
#### New code - from Kay

## **** TEST DATA **** (delete me!)
## Hard-coded dbSNP rs number; the script at the bottom of the file passes
## it to snp_to_omim() and uses it in the report text.
snp_id = 'rs11200638'

# queries the SNP database and returns geneID tag as a string
## Currently, DBIdsClient does not support snp parsing - so it's not used.
## A future update should correct this when possible, for ease of reading.
def parse_geneID_tag(snp_id):
    """Fetch the dbSNP record for snp_id and return its gene symbol text.

    Args:
        snp_id: dbSNP identifier as a string, e.g. 'rs11200638'.

    Returns:
        The text content of the first "FxnSet_symbol" element found in the
        efetch XML response.

    Raises:
        IndexError: if the response contains no "FxnSet_symbol" element.
    """
    SNP_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp'
    url = SNP_URL + '&id=' + snp_id + '&mode=xml'

    response = urllib.urlopen(url)
    try:
        # Use the `parse` name bound by `from xml.dom.minidom import parse`;
        # a bare `minidom` is never bound by this file's imports, so
        # `minidom.parse` raised NameError on every call.
        dom = parse(response)
    finally:
        # Release the HTTP handle even when the XML fails to parse.
        response.close()

    symbol = dom.getElementsByTagName("FxnSet_symbol")
    if not symbol:
        # Same exception type the bare symbol[0] lookup raised, but with a
        # message that says what was missing.
        raise IndexError(
            "no FxnSet_symbol element in dbSNP record for " + snp_id)
    # Read the element's text nodes directly instead of slicing its
    # serialised markup.
    return get_text(symbol[0].childNodes)

## Note: This code is a temporary solution to a dbSNP formatting issue.
## Older entries are best searched directly by ID #.
##  - > this case is the first covered
## Newer entries are not indexed in this fashion, although SNP ID data is
##  available on the individual entry.  These contain Allelic Variant data
##  which is located and extracted by the script
##  - > this case is the second covered

# takes SNP ID and gets search results from OMIM in XML format
def snp_to_omim(snp_id):
    """Return OMIM search results (XML records) for a SNP ID.

    First searches OMIM directly by snp_id.  If that search returns an
    empty list, looks up the gene symbol via parse_geneID_tag() and
    searches OMIM for that symbol instead.
    """
    direct_hits = omim_snp_search(snp_id)
    if direct_hits:
        return direct_hits

    # Direct lookup found nothing: fall back to searching by gene symbol.
    return omim_tag_search(parse_geneID_tag(snp_id))

## I'm not yet happy with this bottom bit.  I'll keep the output steady
## for downstream, though.

# Driver: look up snp_id in OMIM, collect the allelic-variant details as
# report lines in outputlist, then print the report.
o = snp_to_omim(snp_id)

if not o:
    # nothing more to be done if no records can be found
    outputlist.append("No information found for " + snp_id + "\n")
else:
    # otherwise, find the allelic variant data
    outputlist.append(snp_id + " details:" + "\n")
    for i in o:
        v = extract_allelic_variant_data(i.read())
        # None means this record carried no allelic variants.
        if v is not None:
            for a in v:
                outputlist.append(a.name + "\n")
                outputlist.append(a.mutation + "\n")
                outputlist.append(a.description + "\n")
#print '-' * 40
#print "\n"
#print "yay! we're done!"
# print(item) with a single argument produces the same output under the
# Python 2 print statement and the Python 3 print function.
for item in outputlist:
    print(item)


'''
o = omim_snp_search(snp_id)
for i in o:
    dom = parseString(i.read())
    print dom.toxml()
'''