Last active
May 20, 2017 10:47
-
-
Save naturale0/12ea818c24ba0e1238db8f43f8668e61 to your computer and use it in GitHub Desktop.
Python class which reads and interprets 23andMe raw genotype data. Support searching your genotype and related traits at SNPedia.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import requests | |
import json | |
import sys | |
import os | |
from bs4 import BeautifulSoup | |
from collections import OrderedDict | |
class Geno(object): | |
"""class to reads and interprets 23andMe raw genotype data.""" | |
def __init__(self, raw_23andMe): | |
"""input path to raw 23andMe genotype data. data will be read as a form of pandas DataFrame. | |
>>> g = Geno("./23andMe_raw_data.txt") | |
>>> g.my_geno""" | |
# GET report as json from 23andMe API | |
res = requests.get("https://api.23andme.com/3/report/").content | |
self._report = json.loads(res) | |
self.my_geno = pd.read_csv(raw_23andMe, sep="\t", header=14, | |
names=["rsid", "chr", "pos", "genotype"], | |
low_memory=False) | |
def type_id(self, rsid, ref_population="JPT"): | |
"""search input rsid at SNPedia, report your possible trait related to this SNP. | |
>>> g.type_id("rs28897696")""" | |
mine = self.my_geno[self.my_geno.rsid == rsid] | |
if mine.empty: | |
return None | |
# your genotype. if you are a male(have "Y" chromosome), | |
# duplicate your X chromosome allele to make a two-lettered string | |
if mine.chr.values[0] in ["X", "Y"]: | |
if "Y" in self.my_geno.chr.unique(): | |
mm = mine.genotype.values[0] | |
genotype = _sort_seq(mm + ";" + mm) | |
else: | |
mm, ff = mine.genotype.values[0] | |
genotype = _sort_seq(mm + ";" + ff) | |
# search this at snpedia | |
snpedia_report, g_percent = _search_rsid(rsid, genotype, ref_population) | |
snpedia_report = snpedia_report.strip() | |
print rsid, "|", genotype, "|", snpedia_report, "|", "{}% of JPT".format(g_percent) | |
return mine | |
def type_trait(self, query, ref_population="JPT"): | |
"""search input query at SNPedia, report all related SNPs and your genotype. | |
>>> g.type_trait("lung cancer")""" | |
rsid_list = _search_anything(query) | |
if not rsid_list[0].startswith("rs"): | |
print "try {} instead".format(rsid_list) | |
return | |
for rsid in rsid_list: | |
self.type_id(rsid, ref_population) | |
detail_url = ("https://www.snpedia.com/index.php/" + query).replace(" ", "%20") | |
print " * more details at", detail_url | |
def report_wellness(self, ref_population="JPT"): | |
"""return wellness report provided by 23andMe API sevice. | |
>>> g.report_wellness()""" | |
try: | |
height, width = map(int, os.popen('stty size', 'r').read().split()) | |
except: | |
width = 100 | |
print "="*(width//2-9), "WELLNESS REPORT", "="*(width//2-9), "\n" | |
for i in range(len(self._report["data"])): | |
with_detail = pd.DataFrame(self._report["data"][i]).dropna(subset=["details"]) | |
if not "markers" in with_detail.index: | |
continue | |
for j in range(len(with_detail.ix["markers"].details)): | |
#print with_detail.report_id#with_detail.ix["markers"].details[j][u'biological_explanation'] | |
rsid_report = with_detail.ix["markers"].details[j]["id"] | |
variants_report = with_detail.ix["markers"].details[j]["variants"] | |
for variant_dict in variants_report: | |
pos_report = variant_dict["end"] | |
pick = self.my_geno[self.my_geno.rsid == rsid_report] | |
if pick.empty: | |
continue | |
#print pick.pos.values[0], pos_report | |
#if pick.pos.values[0] == pos_report: | |
from_mom, from_dad = pick.genotype.values[0] | |
genotype = from_mom+";"+from_dad | |
snpedia_report, geno_percent = _search_rsid(rsid_report, genotype, ref_population) | |
if variant_dict["has_effect"] == True: | |
print "< {} >".format(with_detail.ix["markers"]["title"]) | |
print "-" * width | |
print with_detail.ix["markers"].details[j]["biological_explanation"].strip() | |
print " rsID:", rsid_report | |
print " Yours:", genotype, "- homozygous" | |
if ("There is currently no text" not in snpedia_report) and (snpedia_report.strip() != ""): | |
print " Detail:", snpedia_report.strip(), "|", "{}% of JPT".format(geno_percent) | |
print "\n\n" | |
def _search_rsid(rsid, genotype, ref_population="JPT"): | |
"""search input rsid at SNPedia, return and compare SNPedia result and yours""" | |
genotype = _sort_seq(genotype) | |
snpedia_query = "https://www.snpedia.com/index.php/" + rsid | |
soup = BeautifulSoup(requests.get(snpedia_query).content, "lxml") | |
table = soup.find_all("table", attrs={"class": "sortable"}) | |
stab_orien = soup.find_all("td")[3].text | |
# complement my genotype if stabilized orientation is 'minus' | |
if stab_orien == "minus": | |
genotype = _comp_seq(genotype) | |
if table != []: | |
snpedia_var_dict = OrderedDict() | |
for n, i in enumerate(table[0].find_all("td")): | |
if n % 3 == 0: k = i.text.strip().strip("(").strip(")") | |
if n % 3 == 1: m = i.text.strip() | |
if n % 3 == 2: | |
if len(k) == 2: continue | |
snpedia_var_dict[k] = (m, i.text.strip()) | |
# save snpedia_report (result of input genotype) | |
try: | |
snpedia_report = " | ".join(snpedia_var_dict[genotype]) | |
except KeyError: | |
snpedia_report = "No info about {}".format(genotype) | |
# save geno_percent (which percent of people in JPT population have input genotype) | |
try: | |
geno_order = snpedia_var_dict.keys().index(genotype) | |
#print snpedia_var_dict.keys(), genotype, rsid | |
for td in soup.find_all("td"): | |
if td.img is not None: | |
_chart = td.img["src"] | |
break | |
_, vals, __, ___, pops, ____, _____, ______ = _chart.split("&") | |
val1, val2, val3 = vals[6:].split("|") | |
pops = pops[11:-2].split("|") | |
try: | |
jpt_idx = len(pops)-pops.index(ref_population)-1 | |
geno_ratio = [map(float, val1.split(","))[jpt_idx], | |
map(float, val2.split(","))[jpt_idx], | |
map(float, val3.split(","))[jpt_idx]] | |
#print geno_ratio, geno_order | |
geno_percent = geno_ratio[geno_order] | |
except ValueError: | |
geno_percent = "--" | |
except ValueError: | |
geno_percent = "--" | |
except UnboundLocalError: | |
geno_percent = "--" | |
else: | |
snpedia_report = "" | |
geno_percent = "--" | |
return snpedia_report, geno_percent | |
def _search_anything(query): | |
"""search input words at SNPedia, return and compare related genotypes and yours""" | |
snpedia_query = "https://www.snpedia.com/index.php/" + query | |
soup = BeautifulSoup(requests.get(snpedia_query).content, "lxml") | |
links = soup.find_all("div", attrs={"class": "mw-content-ltr"})[0].find_all("a") | |
rsid_list = [l.text for l in links if l.text.startswith("rs")] | |
rsid_list = [txt for txt in rsid_list if not txt.endswith(")")] | |
if rsid_list == []: | |
rsid_list = [l.text for l in links if l.text] | |
if 'search for this page title' in rsid_list: | |
rsid_list = ["another phenotype"] | |
return rsid_list | |
def _comp_seq(genotype): | |
comp_dict = {"A":"T", "T":"A", "G":"C", "C":"G", ";":";"} | |
return _sort_seq("".join([comp_dict[i] for i in genotype])) | |
def _sort_seq(genotype): | |
order_dict = {"A":0, "C":1, "G":2, "T":3, ";":";"} | |
m, _, f = [order_dict[i] for i in genotype] | |
if m > f: return ";".join[f, m] | |
else: return genotype |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment