-
-
Save erikyao/c5b80eb15143dc12eb7e41551996ca97 to your computer and use it in GitHub Desktop.
Alternative implementation of chebi_parser.OntologyReader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pronto import Ontology | |
from pronto.entity.attributes import Relationships | |
class ProntoOntologyReader:
    """
    Load an OBO ontology file with pronto and build an "ontology document" (a plain dict)
    for any non-obsolete term in it.
    """

    # Some super-hub nodes in the ontology graph (e.g. CHEBI:50860, "organic molecular entity") have
    # up to 141,138 descendants and 7,336 successors.
    # The 99.9% quantiles of the numbers of successors/predecessors/descendants/ancestors are
    # 224.92, 8, 2187.12, and 86.
    # Hence the number of successor/predecessor/descendant/ancestor nodes listed in any single
    # ontology document is capped at 2000 (the "num_*" counters still report the full sizes).
    NODE_FAMILY_CAPACITY = 2000

    def __init__(self, obo_file_path):
        """
        Parse the file at `obo_file_path` into a pronto `Ontology` kept on `self.ontology_object`.
        """
        self.obo_file_path = obo_file_path
        self.ontology_object = Ontology(obo_file_path)

    def get_all_term_ids(self):
        """
        Return the set of ids of all non-obsolete terms in the loaded ontology.
        """
        return {term.id for term in self.ontology_object.terms() if not term.obsolete}

    @classmethod
    def convert_subsets(cls, subsets: set):
        """
        The 'subsets' attribute of a pronto term has 1 of the 3 unique values, {'1_STAR'}, {'2_STAR'}, {'3_STAR'}
        This method converts such a 1-element set to the corresponding integer, i.e. 1, 2, 3.

        Returns None when `subsets` is empty (or None).
        """
        if not subsets:
            return None

        # The star level is the leading character of the subset name, e.g. '3_STAR' -> 3
        subset_name = next(iter(subsets))
        return int(subset_name[0])

    @classmethod
    def convert_relationships(cls, relationships: "Relationships"):
        """
        The 'relationships' attribute of a pronto term is a 'pronto.entity.attributes.Relationships' object.
        Its 'items()' method returns an 'ItemsView' which iterates over pairs of
        <class 'pronto.relationship.Relationship'>, <class 'pronto.term.TermSet'>

        This method converts the 'relationships' attribute to a dict keyed by relationship id, with the
        list of related term ids as value. E.g.

            {'has_role': ['CHEBI:68495', 'CHEBI:38637', 'CHEBI:35610'], 'has_functional_parent': ['CHEBI:28179']}

        Currently there are 9 types of relationship in the ChEBI obo file:
            {'has_functional_parent', 'has_parent_hydride', 'has_part', 'has_role',
            'is_conjugate_acid_of', 'is_conjugate_base_of', 'is_enantiomer_of', 'is_substituent_group_from',
            'is_tautomer_of'}
        """
        return {
            relationship.id: [term.id for term in term_set]
            for relationship, term_set in relationships.items()
        }

    def read_ontology(self, term_id):
        """
        Read the term in the ontology object given `term_id` and convert it to an ontology document.

        Returns None when `term_id` is not found in the ontology or the term is obsolete.

        Key mapping:
            term.alternate_ids -> ontology_dict['secondary_chebi_id']
            term.definition -> ontology_dict['definition']
            term.name -> ontology_dict['name']
            term.relationships -> ontology_dict['relationship']
            term.subsets -> ontology_dict['star']
        """
        term = self.ontology_object.get(term_id)
        if (term is None) or term.obsolete:
            return None

        parents = [_term.id for _term in term.superclasses(distance=1, with_self=False)]
        children = [_term.id for _term in term.subclasses(distance=1, with_self=False)]
        ancestors = [_term.id for _term in term.superclasses(distance=None, with_self=False)]
        descendants = [_term.id for _term in term.subclasses(distance=None, with_self=False)]

        # pronto.definition.Definition is a subclass of 'str', but a term without a definition yields None;
        # keep None as-is, otherwise str(None) would store the literal text 'None' in the document.
        definition = term.definition

        capacity = self.NODE_FAMILY_CAPACITY

        # Start construction of the ontology document
        ontology_dict = dict()
        ontology_dict["id"] = term_id
        ontology_dict["secondary_chebi_id"] = list(term.alternate_ids)
        ontology_dict['definition'] = None if definition is None else str(definition)
        ontology_dict['name'] = term.name
        ontology_dict['relationship'] = self.convert_relationships(term.relationships)
        ontology_dict['star'] = self.convert_subsets(term.subsets)

        # Use the same naming convention as in the Mondo parser
        # See https://github.com/biothings/mydisease.info/blob/master/src/plugins/mondo/parser.py
        ontology_dict["num_children"] = len(children)
        ontology_dict["num_parents"] = len(parents)
        ontology_dict["num_descendants"] = len(descendants)
        ontology_dict["num_ancestors"] = len(ancestors)

        # The id lists are truncated to the capacity; the counters above keep the untruncated sizes
        ontology_dict["children"] = children[:capacity]
        ontology_dict["parents"] = parents[:capacity]
        ontology_dict["descendants"] = descendants[:capacity]
        ontology_dict["ancestors"] = ancestors[:capacity]

        return ontology_dict
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment