Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-5-1

From OpenWetWare
Jump to navigation | Jump to search

Update

  • I have been out of town since last Thursday's class for a revisit weekend and did not have email/computer access there. So, unfortunately, I did not get to work on this program as much as I had hoped. But I definitely plan to continue working on the program and contributing to the project over the next couple of weeks, and I hope to be able to stay involved this summer.
  • So, I have written all of the code to appropriately:
    • 1) Parse through OMIM records and obtain all associated PMIDs
    • 2) Search all obtained PMIDs in PubMed to return XML output
    • 3) Parse the XML output of PubMed for each entry to return the MeshTerms that are labelled as major
    • 4) Take the entry title of OMIM and search this in PubMed
    • 5) Return the references of the top five PubMed review articles associated with this condition/name as further reading on the general allelic variant
  • However, there is still one error in the program, which I find difficult to understand and am still working through. Here it is:
 Traceback (most recent call last):
 File "/Users/resmicharalel/Documents/biophysics101/pubmed.py", line 93, in <module>
   m = extract_meshterms(cur_record)
 File "/Users/resmicharalel/Documents/biophysics101/pubmed.py", line 60, in extract_meshterms
   dom = parseString(str)
 File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/xml/dom/minidom.py", line 1923, in parseString
   return expatbuilder.parseString(string)
 File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/xml/dom/expatbuilder.py", line 940, in parseString
   return builder.parseString(string)
 File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/xml/dom/expatbuilder.py", line 223, in parseString
   parser.Parse(string, True)
 TypeError: Parse() argument 1 must be string or read-only buffer, not instance
  • The following is the code thus far:
from Bio.EUtils import DBIdsClient
import xml.dom.minidom
from xml.dom.minidom import parse, parseString

# C-style struct to pass parameters
class AllelicVariant:
    """Bare attribute container for one OMIM allelic variant.

    Instances are created empty; extract_allelic_variant_data assigns the
    ``name``, ``mutation`` and ``description`` attributes afterwards.
    """

class PubmedID:
    """Bare attribute container meant to hold a PubMed identifier.

    The class defines nothing itself; attributes such as ``pmid`` are
    assigned on instances by the code that creates them.
    """

class MeshTerms:
    """Bare placeholder class for MeSH term data.

    It defines no attributes or methods of its own.
    """

# queries the database and returns all info in an XML format
def omim_snp_search(dnsnp_id):
    """Search the "omim" database for ``dnsnp_id`` and fetch each hit as XML.

    Returns a list with one entry per search hit, each being the result of
    ``efetch(rettype="xml")``; the caller calls ``read()`` on an entry to
    obtain its content.
    """
    omim_client = DBIdsClient.DBIdsClient()
    fetched = []
    for hit in omim_client.search(dnsnp_id, "omim"):
        fetched.append(hit.efetch(rettype="xml"))
    return fetched

# basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
def get_text(node_list):
    """Concatenate the ``data`` of every text node in ``node_list``.

    Nodes of any other type are skipped and child elements are not
    descended into.  Returns the empty string when no text node is present.
    """
    pieces = [n.data for n in node_list if n.nodeType == n.TEXT_NODE]
    return "".join(pieces)

# extracts allelic variant data, as the name implies, using the struct above
def extract_allelic_variant_data(str):
    """Parse OMIM XML text and return its allelic variants.

    ``str`` is the XML text of one OMIM record.  (The parameter name shadows
    the builtin; it is kept unchanged so existing callers are unaffected.)

    Returns a list of AllelicVariant objects, each carrying ``name``,
    ``mutation`` and ``description`` attributes, or None when the document
    contains no ``Mim-allelic-variant`` element.  A field whose element is
    absent from a variant is set to the empty string rather than raising
    IndexError.
    """
    def first_text(parent, tag, nested_tag=None):
        # Text of the first <tag> below parent (or of the first <nested_tag>
        # inside that element); "" when either element is missing.
        found = parent.getElementsByTagName(tag)
        if found and nested_tag is not None:
            found = found[0].getElementsByTagName(nested_tag)
        if not found:
            return ""
        return get_text(found[0].childNodes)

    dom = parseString(str)
    variants = dom.getElementsByTagName("Mim-allelic-variant")
    if not variants:
        return None
    parsed = []
    for v in variants:
        a = AllelicVariant()  # empty struct, populated below
        a.name = first_text(v, "Mim-allelic-variant_name")
        a.mutation = first_text(
            v, "Mim-allelic-variant_mutation", "Mim-text_text")
        a.description = first_text(
            v, "Mim-allelic-variant_description", "Mim-text_text")
        parsed.append(a)
    return parsed

def extract_allelic_variant_pmid(str):
    """Collect the PubMed IDs referenced by an OMIM record.

    ``str`` is the XML text of one OMIM record.  (The parameter name shadows
    the builtin; it is kept unchanged so existing callers are unaffected.)

    Returns None when the document has no ``Mim-reference`` element;
    otherwise a list with the text of each reference's first
    ``Mim-reference_pubmedUID`` element.  References without such an
    element, or with an empty one, are skipped.
    """
    dom = parseString(str)
    references = dom.getElementsByTagName("Mim-reference")
    if not references:
        return None
    ids = []
    for ref in references:
        uid_nodes = ref.getElementsByTagName("Mim-reference_pubmedUID")
        if not uid_nodes:
            # Indexing [0] here used to raise IndexError for a reference
            # that has no PubMed UID element; skip such references instead.
            continue
        pmid = get_text(uid_nodes[0].childNodes)
        if pmid:
            ids.append(pmid)
    return ids
	
def extract_meshterms(str):
    """Return the MeSH descriptor names flagged as major in PubMed XML.

    ``str`` must be XML text, because it is passed straight to
    ``parseString``; a parsed record object is rejected there with a
    TypeError.  (The parameter name shadows the builtin; it is kept
    unchanged so existing callers are unaffected.)

    Returns None when the document has no ``MeshHeading`` element;
    otherwise a list (possibly empty) with the text of each heading's first
    ``DescriptorName`` whose ``MajorTopicYN`` attribute equals "Y".
    """
    dom = parseString(str)
    meshheadings = dom.getElementsByTagName("MeshHeading")
    if not meshheadings:
        return None
    meshterms = []
    for h in meshheadings:
        descriptors = h.getElementsByTagName("DescriptorName")
        if not descriptors:
            continue
        descriptor = descriptors[0]
        # MajorTopicYN is an attribute of <DescriptorName>, not part of the
        # tag name, so it has to be read with getAttribute.
        if descriptor.getAttribute("MajorTopicYN") == "Y":
            meshterms.append(get_text(descriptor.childNodes))
    return meshterms

from Bio import PubMed
from Bio import Medline
import string

rec_parser = Medline.RecordParser()
medline_dict = PubMed.Dictionary(parser = rec_parser)

for i in omim_snp_search("rs11200638"):
    result = i.read()
    if result:
        v = extract_allelic_variant_data(result)
        p = extract_allelic_variant_pmid(result)
    if v != None:
        for a in v:
            print a.name
            print a.mutation
            print a.description
    if p != None:
        for s in p:
            cur_record = medline_dict[s]
            m = extract_meshterms(cur_record)
            if m != None:
                for mh in m:
                    print mh
    #if p != None:
       # for i in p:
            #print i.pmid


disease = a.name

search_term = "Review[ptyp] "+disease
#print search_term

review_ids = PubMed.search_for(search_term)

##rec_parser = Medline.RecordParser()
##medline_dict = PubMed.Dictionary(parser = rec_parser)
##
count = 1

for did in review_ids[0:5]:
    cur_record = medline_dict[did]
    print '\n', count, ')  ', string.rstrip(cur_record.title), cur_record.authors, string.strip(cur_record.source)
    count=count+1

##for i in omim_snp_search("rs11200638"):
##    result = i.read()
##    if result:
##        p = extract_allelic_variant_pmid(result)
##    if p != None:
##        key_source = PubMed.search_for(p[0])
##        key_rec = medline_dict[0]
##        print key_rec
##
##        keywords = get_text(key_rec.getElementsByTagName("MeshHeading")[0].childNodes)
##        print keywords