TChan/Notebook/2007-5-3: Difference between revisions
From OpenWetWare
Jump to navigationJump to search
No edit summary |
No edit summary |
||
| Line 1: | Line 1: | ||
=Presentation= | |||
** Lessons learned | |||
** Stuff done | |||
=Allelic Frequency= | =Allelic Frequency= | ||
* '''Input''': rs# | * '''Input''': rs# (string) | ||
* '''Output''': allelic frequency | * '''Output''': allelic frequency data (list of lists (of lists, in some cases)) | ||
==Sample Input== | |||
<pre>"rs11200538"</pre> | |||
==Code== | |||
<pre> | |||
import urllib | |||
# Definitions of functions | |||
# Returns the dbSNP URL for the search term | |||
def parse_for_dbSNP_search(search_term): | |||
#search_term will be initial input, the RS# in a string (ie. "rs11200538" or "11200538") | |||
parsed_term = search_term.replace("rs", "") | |||
return "http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?rs=%s" % parsed_term | |||
# Grabs the dbSNP HTML search_file | |||
def get_dbSNP_search_file(URL, genl_search_file): | |||
URL_stream_genl = urllib.urlopen(URL) | |||
page = URL_stream_genl.read() | |||
URL_stream_genl.close() | |||
genl_search_file.write(page) | |||
# Extracts out the relevant allelic frequency line from the dbSNP HTML search file | |||
def extract_allelic_freq_line(dbSNP_file): | |||
for line in dbSNP_file: | |||
if line.find('''Alleles</TH></TR><TR ><TH bgcolor="silver">ss''') != -1: | |||
return line | |||
elif line.find('''There is no frequency data.''') != -1: | |||
better_luck_next_time = "There is no frequency data." | |||
return better_luck_next_time | |||
# Divides the relevant allelic frequency line into separate HTML-table 'rows', which delineate the populations | |||
def divide_freq_line_into_TRs(freq_line): | |||
TR_list = [] | |||
while freq_line.rfind("<TR") != -1: | |||
TR_instance = freq_line.rfind("<TR") | |||
TR_list.insert(0, freq_line[TR_instance:(len(freq_line))]) | |||
freq_line = freq_line[0:TR_instance] | |||
TR_list.insert(0, freq_line) | |||
return TR_list | |||
# Parses out (1) categories, and (2) population rows | |||
def extract_categories_and_population_TRs(categories, population_list, TR_list): | |||
for element in TR_list: | |||
if element.find('''ss#''') != -1: | |||
categories = element | |||
elif element.find('''<td ><a href="snp_viewTable.cgi?pop=''') != -1: | |||
population_list.append(element) | |||
return categories, population_list | |||
def parse_IMG_tags_out_of_category(category): | |||
if "<IMG" in category: | |||
category = category[0:category.find("<IMG")] | |||
return category | |||
def parse_BR_tags_out_of_category(category): | |||
br = "<BR>", "<br>" | |||
if category.endswith(br): | |||
category = category[0:len(category)-4] | |||
category = category.replace("<BR>", ' ') | |||
category = category.replace("<br>", ' ') | |||
return category | |||
# Returns cleaned-up categories (ie. ss#, Population, etc.) | |||
def parse_categories(categories): | |||
categories_list = [] | |||
while categories.rfind('''<TH bgcolor="silver">''') != -1: | |||
category_instance = categories.rfind('''<TH bgcolor="silver">''') | |||
end_tag_instance = categories.rfind('''</TH>''') | |||
categories_list.insert(0, categories[(category_instance+22):end_tag_instance]) | |||
categories = categories[0:category_instance] | |||
for index in range(len(categories_list)): | |||
categories_list[index] = parse_IMG_tags_out_of_category(categories_list[index]) | |||
categories_list[index] = parse_BR_tags_out_of_category(categories_list[index]) | |||
return categories_list | |||
# Extraction functions to parse allelic frequency data from populations | |||
# Returns whether or not the particular population in population_list has an ss_numb | |||
def ss_numb_in_population(population): | |||
if '''<a href="snp_ss.cgi?ss=''' in population: | |||
return True | |||
else: | |||
return False | |||
def extract_ss_numb(population): | |||
#SS_numb START: after '''<a href="snp_ss.cgi?ss=''' | |||
#SS_numb END: before the '''">''' immediately after '''<a href="snp_ss.cgi?ss=''' | |||
if ss_numb_in_population(population): | |||
ss_numb = population[population.find('''<a href="snp_ss.cgi?ss=''')+23:population.find('''">''', | |||
population.find('''<a href="snp_ss.cgi?ss='''))] | |||
last_index = population.find('''">''', population.find('''<a href="snp_ss.cgi?ss=''')) + 2 | |||
else: | |||
ss_numb = '' | |||
last_index = 0 | |||
return ss_numb, last_index | |||
def extract_population_name(population, last_index): | |||
#population_name START: after the '''">''' immediately after '''<a href="snp_viewTable.cgi?pop=''' | |||
#population_name END: before the '''</a>''' that occurs after '''<a href="snp_viewTable.cgi?pop=''' | |||
population_name = population[population.find('''">''', last_index)+2: | |||
population.find('''</a>''', population.find('''<a href="snp_viewTable.cgi?pop='''))] | |||
last_index = population.find('''</a>''', population.find('''<a href="snp_viewTable.cgi?pop=''')) + 5 | |||
return population_name, last_index | |||
def extract_group(population, last_index): | |||
start_point = population.find('''<td >''', last_index) + 5 | |||
group = population[start_point:population.find('''</td>''', start_point)] | |||
last_index = population.find('''</td>''', start_point) + 5 | |||
return group, last_index | |||
def extract_chrom_cnt(population, last_index): | |||
start_point = population.find('''<td >''', last_index) + 5 | |||
chrom_cnt = population[start_point:population.find('''</td>''', start_point)] | |||
chrom_cnt = chrom_cnt.strip() | |||
last_index = population.find('''</td>''', start_point) | |||
return chrom_cnt, last_index | |||
def extract_source(population, last_index): | |||
start_point = population.find('''<td >''', last_index) + 5 | |||
source = population[start_point:population.find('''</td>''', start_point)] | |||
source = source.strip() | |||
last_index = population.find('''</td>''', start_point) | |||
return source, last_index | |||
def extract_allele_combos(num_of_allele_combos, population, last_index): | |||
# This function works even if there are identical allele combos | |||
allele_combos = [] | |||
start_point = population.find('''<FONT size="-1">''', last_index) + 17 | |||
for i in range(num_of_allele_combos): | |||
allele_combo = population[start_point:population.find('''</FONT>''', start_point)] | |||
allele_combos.append(allele_combo) | |||
last_index = start_point + 5 | |||
start_point = population.find('''<FONT size="-1">''', population.find('''</FONT>''', start_point)) + 17 | |||
for j in range(num_of_allele_combos): | |||
allele_combos[j] = allele_combos[j].strip() | |||
return allele_combos, last_index | |||
def extract_HWP(population, last_index): | |||
# This function works even if the last allele_combo was '' | |||
start_point = population.find('''<FONT size="-1">''', last_index) + 17 | |||
HWP = population[start_point:population.find('''</FONT>''', start_point)] | |||
HWP = HWP.strip() | |||
last_index = population.find('''</FONT>''', start_point) | |||
return HWP, last_index | |||
def extract_alleles(num_of_alleles, population, last_index): | |||
alleles = [] | |||
start_point = population.find('''<FONT size="-1">''', last_index) + 17 | |||
for i in range(num_of_alleles): | |||
allele = population[start_point:population.find('''</FONT>''', start_point)] | |||
alleles.append(allele) | |||
last_index = start_point + 1 | |||
start_point = population.find('''<FONT size="-1">''', population.find('''</FONT>''', start_point)) + 17 | |||
for j in range(num_of_alleles): | |||
alleles[j] = alleles[j].strip() | |||
return alleles, last_index | |||
# Master function to compile the list of lists (of lists) that holds all the interesting allelic frequency data | |||
def parse_population_list(num_of_allele_combos, num_of_alleles, population_list, master_data_list): | |||
for index in range(len(population_list)): | |||
last_index = 0 | |||
ss_numb = '' | |||
ss_numb, last_index = extract_ss_numb(population_list[index]) | |||
population_name, last_index = extract_population_name(population_list[index], last_index) | |||
group, last_index = extract_group(population_list[index], last_index) | |||
chrom_cnt, last_index = extract_chrom_cnt(population_list[index], last_index) | |||
source, last_index = extract_source(population_list[index], last_index) | |||
allele_combos, last_index = extract_allele_combos(num_of_allele_combos, population_list[index], last_index) | |||
HWP, last_index = extract_HWP(population_list[index], last_index) | |||
alleles, last_index = extract_alleles(num_of_alleles, population_list[index], last_index) | |||
master_data_list.append([ss_numb, population_name, group, chrom_cnt, source, allele_combos, HWP, alleles]) | |||
return master_data_list | |||
#BEGIN ACTUAL PROGRAM | |||
search_term = "rs11200538" # example search_term for now; will be returned by rest of program when finished | |||
search_file_name = "%s_dbSNP.html" % search_term | |||
dbSNP_file = open(search_file_name, 'w') | |||
URL = parse_for_dbSNP_search(search_term) | |||
get_dbSNP_search_file(URL, dbSNP_file) | |||
dbSNP_file.close() | |||
dbSNP_file = open(search_file_name, 'r') | |||
freq_line = extract_allelic_freq_line(dbSNP_file) | |||
dbSNP_file.close() | |||
TR_list = divide_freq_line_into_TRs(freq_line) | |||
categories = '' | |||
population_list = [] | |||
categories, population_list = extract_categories_and_population_TRs(categories, population_list, TR_list) | |||
'' | categories_list = [] | ||
categories_list = parse_categories(categories) | |||
num_of_categories = len(categories_list) | |||
num_of_allele_combos = categories_list.count('A/A') + categories_list.count('A/T') + categories_list.count('A/C') + categories_list.count('A/G') + categories_list.count('T/A') + categories_list.count('T/T') +categories_list.count('T/C') + categories_list.count('T/G') + categories_list.count('C/A') + categories_list.count('C/T') + categories_list.count('C/C') + categories_list.count('C/G') + categories_list.count('G/A') + categories_list.count('G/T') + categories_list.count('G/C') + categories_list.count('G/G') | |||
num_of_alleles = categories_list.count('A') + categories_list.count('T') + categories_list.count('C') + categories_list.count('G') | |||
master_data_list = [] | |||
master_data_list.append(categories_list) | |||
master_data_list = parse_population_list(num_of_allele_combos, num_of_alleles, population_list, master_data_list) | |||
print master_data_list | |||
</pre> | |||
==Sample Output== | |||
<pre> | |||
['ss#', 'Population', 'Individual Group', 'Chrom. Sample Cnt.', 'Source', 'A/A', 'A/G', 'G/G', 'HWP', 'A', 'G'] | |||
['ss16081968', 'HapMap-CEU', 'European', '118', 'IG', ['0.983', '0.017', ''], '1.000', ['0.992', '0.008']] | |||
['', '<td ></td><td ><a href="snp_viewTable.cgi?pop=1410">HapMap-HCB', 'Asian', '90', 'IG', ['0.556', '0.356', '0.089'], '0.584', ['0.733', '0.267']] | |||
['', '<td ></td><td ><a href="snp_viewTable.cgi?pop=1411">HapMap-JPT', 'Asian', '90', 'IG', ['0.533', '0.356', '0.111'], '0.371', ['0.711', '0.289']] | |||
['', '<td ></td><td ><a href="snp_viewTable.cgi?pop=1412">HapMap-YRI', 'Sub-Saharan African', '120', 'IG', ['1.000', '', ''], '1.000', ['', 'tom"><td ></td><td ><a href="snp_viewTable.cgi?pop=1412">HapMap-YRI</a></td><td >Sub-Saharan African</td><td > 120</td><td >IG</td><td ><FONT size="-1"> 1.000']] | |||
['', '<td ></td><td ><a href="snp_viewTable.cgi?pop=1771">CHMJ', 'Asian', '74', 'IG', ['', '', ''], '0.757', ['0.243', 'tom"><td ></td><td ><a href="snp_viewTable.cgi?pop=1771">CHMJ</a></td><td >Asian</td><td > 74</td><td >IG</td><td ><FONT size="-1">']] | |||
['ss24106683', 'AFD_EUR_PANEL', 'European', '48', 'IG', ['0.917', '0.083', ''], '1.000', ['0.958', '0.042']] | |||
['', '<td ></td><td ><a href="snp_viewTable.cgi?pop=1372">AFD_AFR_PANEL', 'African American', '44', 'IG', ['1.000', '', ''], '1.000', ['', 'tom"><td ></td><td ><a href="snp_viewTable.cgi?pop=1372">AFD_AFR_PANEL</a></td><td >African American</td><td > 44</td><td >IG</td><td ><FONT size="-1"> 1.000']] | |||
['', '<td ></td><td ><a href="snp_viewTable.cgi?pop=1373">AFD_CHN_PANEL', 'Asian', '48', 'IG', ['0.583', '0.333', '0.083'], '0.655', ['0.750', '0.250']] | |||
<pre> | |||
The categories in master_data_list[0] correspond to the data items in each of the following rows. For convenience, the allele_combos frequencies and the allele frequencies were collected in to their own lists. | |||
Revision as of 08:32, 3 May 2007
Presentation
- Lessons learned
- Stuff done
Allelic Frequency
- Input: rs# (string)
- Output: allelic frequency data (list of lists (of lists, in some cases))
Sample Input
"rs11200538"
Code
import urllib
# Definitions of functions
# Returns the dbSNP URL for the search term
def parse_for_dbSNP_search(search_term):
#search_term will be initial input, the RS# in a string (ie. "rs11200538" or "11200538")
parsed_term = search_term.replace("rs", "")
return "http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?rs=%s" % parsed_term
# Grabs the dbSNP HTML search_file
def get_dbSNP_search_file(URL, genl_search_file):
URL_stream_genl = urllib.urlopen(URL)
page = URL_stream_genl.read()
URL_stream_genl.close()
genl_search_file.write(page)
# Extracts out the relevant allelic frequency line from the dbSNP HTML search file
def extract_allelic_freq_line(dbSNP_file):
for line in dbSNP_file:
if line.find('''Alleles</TH></TR><TR ><TH bgcolor="silver">ss''') != -1:
return line
elif line.find('''There is no frequency data.''') != -1:
better_luck_next_time = "There is no frequency data."
return better_luck_next_time
# Divides the relevant allelic frequency line into separate HTML-table 'rows', which delineate the populations
def divide_freq_line_into_TRs(freq_line):
TR_list = []
while freq_line.rfind("<TR") != -1:
TR_instance = freq_line.rfind("<TR")
TR_list.insert(0, freq_line[TR_instance:(len(freq_line))])
freq_line = freq_line[0:TR_instance]
TR_list.insert(0, freq_line)
return TR_list
# Parses out (1) categories, and (2) population rows
def extract_categories_and_population_TRs(categories, population_list, TR_list):
for element in TR_list:
if element.find('''ss#''') != -1:
categories = element
elif element.find('''<td ><a href="snp_viewTable.cgi?pop=''') != -1:
population_list.append(element)
return categories, population_list
def parse_IMG_tags_out_of_category(category):
if "<IMG" in category:
category = category[0:category.find("<IMG")]
return category
def parse_BR_tags_out_of_category(category):
br = "<BR>", "<br>"
if category.endswith(br):
category = category[0:len(category)-4]
category = category.replace("<BR>", ' ')
category = category.replace("<br>", ' ')
return category
# Returns cleaned-up categories (ie. ss#, Population, etc.)
def parse_categories(categories):
categories_list = []
while categories.rfind('''<TH bgcolor="silver">''') != -1:
category_instance = categories.rfind('''<TH bgcolor="silver">''')
end_tag_instance = categories.rfind('''</TH>''')
categories_list.insert(0, categories[(category_instance+22):end_tag_instance])
categories = categories[0:category_instance]
for index in range(len(categories_list)):
categories_list[index] = parse_IMG_tags_out_of_category(categories_list[index])
categories_list[index] = parse_BR_tags_out_of_category(categories_list[index])
return categories_list
# Extraction functions to parse allelic frequency data from populations
# Returns whether or not the particular population in population_list has an ss_numb
def ss_numb_in_population(population):
if '''<a href="snp_ss.cgi?ss=''' in population:
return True
else:
return False
def extract_ss_numb(population):
#SS_numb START: after '''<a href="snp_ss.cgi?ss='''
#SS_numb END: before the '''">''' immediately after '''<a href="snp_ss.cgi?ss='''
if ss_numb_in_population(population):
ss_numb = population[population.find('''<a href="snp_ss.cgi?ss=''')+23:population.find('''">''',
population.find('''<a href="snp_ss.cgi?ss='''))]
last_index = population.find('''">''', population.find('''<a href="snp_ss.cgi?ss=''')) + 2
else:
ss_numb = ''
last_index = 0
return ss_numb, last_index
def extract_population_name(population, last_index):
#population_name START: after the '''">''' immediately after '''<a href="snp_viewTable.cgi?pop='''
#population_name END: before the '''</a>''' that occurs after '''<a href="snp_viewTable.cgi?pop='''
population_name = population[population.find('''">''', last_index)+2:
population.find('''</a>''', population.find('''<a href="snp_viewTable.cgi?pop='''))]
last_index = population.find('''</a>''', population.find('''<a href="snp_viewTable.cgi?pop=''')) + 5
return population_name, last_index
def extract_group(population, last_index):
start_point = population.find('''<td >''', last_index) + 5
group = population[start_point:population.find('''</td>''', start_point)]
last_index = population.find('''</td>''', start_point) + 5
return group, last_index
def extract_chrom_cnt(population, last_index):
start_point = population.find('''<td >''', last_index) + 5
chrom_cnt = population[start_point:population.find('''</td>''', start_point)]
chrom_cnt = chrom_cnt.strip()
last_index = population.find('''</td>''', start_point)
return chrom_cnt, last_index
def extract_source(population, last_index):
start_point = population.find('''<td >''', last_index) + 5
source = population[start_point:population.find('''</td>''', start_point)]
source = source.strip()
last_index = population.find('''</td>''', start_point)
return source, last_index
def extract_allele_combos(num_of_allele_combos, population, last_index):
# This function works even if there are identical allele combos
allele_combos = []
start_point = population.find('''<FONT size="-1">''', last_index) + 17
for i in range(num_of_allele_combos):
allele_combo = population[start_point:population.find('''</FONT>''', start_point)]
allele_combos.append(allele_combo)
last_index = start_point + 5
start_point = population.find('''<FONT size="-1">''', population.find('''</FONT>''', start_point)) + 17
for j in range(num_of_allele_combos):
allele_combos[j] = allele_combos[j].strip()
return allele_combos, last_index
def extract_HWP(population, last_index):
# This function works even if the last allele_combo was ''
start_point = population.find('''<FONT size="-1">''', last_index) + 17
HWP = population[start_point:population.find('''</FONT>''', start_point)]
HWP = HWP.strip()
last_index = population.find('''</FONT>''', start_point)
return HWP, last_index
def extract_alleles(num_of_alleles, population, last_index):
alleles = []
start_point = population.find('''<FONT size="-1">''', last_index) + 17
for i in range(num_of_alleles):
allele = population[start_point:population.find('''</FONT>''', start_point)]
alleles.append(allele)
last_index = start_point + 1
start_point = population.find('''<FONT size="-1">''', population.find('''</FONT>''', start_point)) + 17
for j in range(num_of_alleles):
alleles[j] = alleles[j].strip()
return alleles, last_index
# Master function to compile the list of lists (of lists) that holds all the interesting allelic frequency data
def parse_population_list(num_of_allele_combos, num_of_alleles, population_list, master_data_list):
for index in range(len(population_list)):
last_index = 0
ss_numb = ''
ss_numb, last_index = extract_ss_numb(population_list[index])
population_name, last_index = extract_population_name(population_list[index], last_index)
group, last_index = extract_group(population_list[index], last_index)
chrom_cnt, last_index = extract_chrom_cnt(population_list[index], last_index)
source, last_index = extract_source(population_list[index], last_index)
allele_combos, last_index = extract_allele_combos(num_of_allele_combos, population_list[index], last_index)
HWP, last_index = extract_HWP(population_list[index], last_index)
alleles, last_index = extract_alleles(num_of_alleles, population_list[index], last_index)
master_data_list.append([ss_numb, population_name, group, chrom_cnt, source, allele_combos, HWP, alleles])
return master_data_list
#BEGIN ACTUAL PROGRAM
search_term = "rs11200538" # example search_term for now; will be returned by rest of program when finished
search_file_name = "%s_dbSNP.html" % search_term
dbSNP_file = open(search_file_name, 'w')
URL = parse_for_dbSNP_search(search_term)
get_dbSNP_search_file(URL, dbSNP_file)
dbSNP_file.close()
dbSNP_file = open(search_file_name, 'r')
freq_line = extract_allelic_freq_line(dbSNP_file)
dbSNP_file.close()
TR_list = divide_freq_line_into_TRs(freq_line)
categories = ''
population_list = []
categories, population_list = extract_categories_and_population_TRs(categories, population_list, TR_list)
categories_list = []
categories_list = parse_categories(categories)
num_of_categories = len(categories_list)
num_of_allele_combos = categories_list.count('A/A') + categories_list.count('A/T') + categories_list.count('A/C') + categories_list.count('A/G') + categories_list.count('T/A') + categories_list.count('T/T') +categories_list.count('T/C') + categories_list.count('T/G') + categories_list.count('C/A') + categories_list.count('C/T') + categories_list.count('C/C') + categories_list.count('C/G') + categories_list.count('G/A') + categories_list.count('G/T') + categories_list.count('G/C') + categories_list.count('G/G')
num_of_alleles = categories_list.count('A') + categories_list.count('T') + categories_list.count('C') + categories_list.count('G')
master_data_list = []
master_data_list.append(categories_list)
master_data_list = parse_population_list(num_of_allele_combos, num_of_alleles, population_list, master_data_list)
print master_data_list
Sample Output
['ss#', 'Population', 'Individual Group', 'Chrom. Sample Cnt.', 'Source', 'A/A', 'A/G', 'G/G', 'HWP', 'A', 'G'] ['ss16081968', 'HapMap-CEU', 'European', '118', 'IG', ['0.983', '0.017', ], '1.000', ['0.992', '0.008']] [, '<a href="snp_viewTable.cgi?pop=1410">HapMap-HCB', 'Asian', '90', 'IG', ['0.556', '0.356', '0.089'], '0.584', ['0.733', '0.267']] [, '<a href="snp_viewTable.cgi?pop=1411">HapMap-JPT', 'Asian', '90', 'IG', ['0.533', '0.356', '0.111'], '0.371', ['0.711', '0.289']] [, '<a href="snp_viewTable.cgi?pop=1412">HapMap-YRI', 'Sub-Saharan African', '120', 'IG', ['1.000', , ], '1.000', [, 'tom"><a href="snp_viewTable.cgi?pop=1412">HapMap-YRI</a>Sub-Saharan African 120IG 1.000']] [, '<a href="snp_viewTable.cgi?pop=1771">CHMJ', 'Asian', '74', 'IG', [, , ], '0.757', ['0.243', 'tom"><a href="snp_viewTable.cgi?pop=1771">CHMJ</a>Asian 74IG']] ['ss24106683', 'AFD_EUR_PANEL', 'European', '48', 'IG', ['0.917', '0.083', ], '1.000', ['0.958', '0.042']] [, '<a href="snp_viewTable.cgi?pop=1372">AFD_AFR_PANEL', 'African American', '44', 'IG', ['1.000', , ], '1.000', [, 'tom"><a href="snp_viewTable.cgi?pop=1372">AFD_AFR_PANEL</a>African American 44IG 1.000']] [, '<a href="snp_viewTable.cgi?pop=1373">AFD_CHN_PANEL', 'Asian', '48', 'IG', ['0.583', '0.333', '0.083'], '0.655', ['0.750', '0.250']]The categories in master_data_list[0] correspond to the data items in each of the following rows. For convenience, the allele_combos frequencies and the allele frequencies were collected in to their own lists.