Harvard:Biophysics 101/Notebook:ZS/2007-5-2

From OpenWetWare
Revision as of 23:47, 2 May 2007 by Zsun (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigationJump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
#Zachary Sun, Biophysics 101
#5.03.07 code
#Tackling the problem of finding surrounding CDS's and information about a reference sequence from blast,
#this program parses the HTML output of blastn as of 4.26.07 for information which can then be fed
#into other NCBI programs to be programmed,
#Saves a local file; to stop queuing the web, just comment the code with pound symbols at end.

#update from v1: fixed some bug issues w/output, added the code in CDS option and additional parsing for information
#to be fed into polyphen as currently there is no working local database option.
#Note: this code has 2 cases:
#1) The seqence is in a CDS
#   *It outputs the CDS it is in, the gene asc. #, and the CDS sequence itself
#2) The sequence is not in a CDS, but surrounded by CDSes
#   *It outputs the CDS location of the surrounding sequence and the gene name of surrounding sequence, and mutation site
#
#Note: Any other data can be found for the gene asc. #, eg. protein sequences which go into polyphen

from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import xml.sax.handler
from xml.dom import minidom

#class seq (macular degen.) on first line, other CDS seq on second
#seq = "CACCCTCGCCAGTTACGAGCTGCCGAGCCGCTTCCTAGGCTCTCTGCGAATACGGACACGCATGCCACCCACAACAACTTTTTAAAAGAATCAGACGTGTGAAGGATTCTATTCGAATTACTTCTGCTCTCTGCTTTTATCACTTCACTGTGGGTCTGGGCGCGGGCTTTCTGCCAGCTCCGCGGACGCTGCCTTCGTCCRGCCGCAGAGGCCCCGCGGTCAGGGTCCCGCGTGCGGGGTACCGGGGGCAGAACCAGCGCGTGACCGGGGTCCGCGGTGCCGCAACGCCCCGGGTCTGCGCAGAGGCCCCTGCAGTCCCTGCCCGGCCCAGTCCGAGCTTCCCGGGCGGGCCCCCAGTCCGGCGATTTGCAGGAACTTTCCCCGGCGCTCCCACGCGAAGC"
seq = "ATGCTGCGCCTATACCCAGGACCGATGGTAACTGAGGCGGAGGGGAAAGGAGGGCCTGAGATGGCAAGTCTGTCCTCCTCGGTGGTTCCTGTGTCCTTCATTTCCACTCTGCGAGAGTCTGTGCTGGACCCTGGAGTTGGTGGAGAAGGAGCCAGTGACAAGCAGAGGAGCAAACTGTCTTTATCACACTCCATGATCCCAGCTGCTAAAATCCACACTGAGCTCTGCTTACCAGCCTTCTTCTCTCCTGCTGGAACCCAGAGGAGGTTCCAGCAGCCTC"
# the above is a user-defined variable; put in your sequence to be Blast against!

class RecInfo():
    def __init__(self, a, b, c, d, e, f, g):
        self.surroundStart = a
        self.surroundEnd = b
        self.refStart = c
        self.refEnd = d
        self.hits = e
        self.name = f
        self.title = g
        self.inCDS = "?"

    def printInfo(self):
        print "INFORMATION:"
        print "-----------"
        print "Gene name: ", self.name
        print "Location: " , self.refStart, "->", self.refEnd
        print "Hit qualification: ", self.hits
        print "Surrounding gene information:"
        print "----------------------------"
        for (sStart, sEnd, n) in zip(self.surroundStart, self.surroundEnd, self.title):
            print n, ": " , sStart, "->", sEnd
            

result_handle = NCBIWWW.qblast("blastn","Test/gpipe/9606/allcontig_and_rna", seq, format_type="HTML") #the db is from the blast homepage (other db's dont give surrounding info) #
blast_results = result_handle.read()#

save_file = open("my_blast2.html","w")#
save_file.write(blast_results)#
save_file.close()#

load_file = open("my_blast2.html","r")
line = load_file.readline()

surroundStart = []
surroundEnd = []
title = []

CDS = "false"
CDS_ascID = "NA"
temp = ""
temp1 = ""

while(line):
    findLine = line.find("spanning the HSP") #to find the sequence I
    if findLine != -1:
        name = line[findLine+23:-1]
    findLine = line.find("PREDICTED")
    if findLine != -1: #if part of a CDS
        end = line.find("Length")
        temp = line[findLine-30:findLine]
        temp = temp.split("value=")
        #print temp
        temp1 = temp[len(temp)-1]
        temp1 = temp1[1:-1]
        #print temp1 
        CDS_ascID = temp1 # this is the gene ascension number
        CDS = "true"
        break
    findLine = line.find("Features flanking this part of subject sequence:")
    if findLine != -1: #found features flanking
        CDS = "false"
        line = load_file.readline()
        while line.find("&from=") != -1: #while there is still a feature
            w = line.find("&from=") #index of start site
            x = line.find("&to=") #index of mid
            y = line.find("&view=gbwithparts") #index of end
            surroundStart.append(int(line[w+6:x]))
            surroundEnd.append(int(line[x+4:y]))
            w = line.find("gbwithparts") #index of gene title
            x = line.find("</a>") #index of end of title
            title.append(line[w+13:x])
            line = load_file.readline()

        line = load_file.readline() #manual parse
        line = load_file.readline() #manual parse
        hits = line

        seqFirstFound = "false" #if first part found

        while line.find("input type") == -1:#while not done with sequence
            if line.find("Sbjct") != -1: #if begin
                if(seqFirstFound == "false"): #if first sequence
                    temp = line.split(" ") #split by space
                    refStart = temp[2]
                    seqFirstFound = "true"
                else: #find end sequence
                    temp = line[:-1]#delete /n
                    temp = temp.split(" ")
                    refEnd = temp[6]
            line = load_file.readline()

        break
    line = load_file.readline()


if CDS == "false":
    record = RecInfo(surroundStart, surroundEnd, refStart, refEnd, hits, name, title) 
    record.printInfo()
else:
    print "The snippit is a CDS:"
    print "Name: ", name
    print "Asc. Number: ", CDS_ascID
        
    # We can create a GenBank object that will parse a raw record
    # This facilitates extracting specific information from the sequences
    record_parser = GenBank.FeatureParser()

    # NCBIDictionary is an interface to Genbank
    ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank', parser = record_parser)

    # If you pass NCBIDictionary a GenBank id, it will download that record
    parsed_record = ncbi_dict[CDS_ascID]

    # Extract the sequence from the parsed_record
    s = parsed_record.seq.tostring()
    print "Sequence: ", s
  

OUTPUT:

CASE 1: The sequence is surrounded by a CDS:

INFORMATION:
-----------
Gene name:  Homo sapiens chromosome 10 genomic contig, reference assembly
Location:  42968870 -> 42969270
Hit qualification:   Identities = 400/401 (99%), Gaps = 0/401 (0%)

Surrounding gene information:
----------------------------
3895 bp at 5' side: hypothetical protein :  42962770 -> 42964975
425 bp at 3' side: HtrA serine peptidase 1 :  42969695 -> 43022401

CASE 2: The sequence is in a CDS:

The snippit is a CDS:
Name:  Homo sapiens chromosome 10 genomic contig, reference assembly
Asc. Number:  XM_001131263
Sequence:  GAGATGGCAGCTGGCTTGGCAAGGGGACAGCACCTTTGTCACCACATTATGTCCCTGTACCCTACATGCTGCGCCTATACCCAGGACCGATGGTAACTGAGGCGGAGGGGAAAGGAGGGCCTGAGATGGCAAGTCTGTCCTCCTCGGTGGTTCCTGTGTCCTTCATTTCCACTCTGCGAGAGTCTGTGCTGGACCCTGGAGTTGGTGGAGAAGGAGCCAGTGACAAGCAGAGGAGCAAACTGTCTTTATCACACTCCATGATCCCAGCTGCTAAAATCCACACTGAGCTCTGCTTACCAGCCTTCTTCTCTCCTGCTGGAACCCAGAGGAGGTTCCAGCAGCCTCAGCACCACCTGACACTGTCTATCATCCACACTGCAGCAAGGTGATTCTGCCAAAACATATCTCCTTAAAAGCCAACTGGAGCTTCTCATCAGCATCAATGTGAAGCCAAAAATCCTTAGGAGGACAGAGGGAGTCCCTCACAACCTAGACTGGTCCCCTTCCCTCCAGCTGCCTCAACTGTCCACAGGACTCTCTTCCCACCTGCGGCCACACTGTGCAACCTGGAATTTCCCCACCTGGGCGGACTCATCACGTCATCACCAATTGGATGCATCTTCTGCTCTGTGCAGCTGGTGAAATCTTTCTCAACCCTTGAGATGCAGCCCAATCTTCTCCTAACATCTGGATTCCTCTCTGTCACTGCATTCCCTCCTGTCATCCTGCCTTTGTTTTCTTGCCCTCCTTTCTCTCCCGGGTGATAGGCATTAACTAAAATTAAATAAAAATTCAGATCATCCTTGCA