Skip to content

Instantly share code, notes, and snippets.

@erikyao
Last active July 20, 2021 21:15
Show Gist options
  • Save erikyao/c5b80eb15143dc12eb7e41551996ca97 to your computer and use it in GitHub Desktop.
Save erikyao/c5b80eb15143dc12eb7e41551996ca97 to your computer and use it in GitHub Desktop.
Alternative implementation of chebi_parser.OntologyReader
from pronto import Ontology
from pronto.entity.attributes import Relationships
class ProntoOntologyReader:
    """
    Read an OBO ontology file with pronto and build an ontology document (a plain dict) for each term.

    The file is loaded once, at construction time, into a `pronto.Ontology` object. `read_ontology()`
    then converts a single, non-obsolete term of that ontology into a document.
    """

    # We have some super-hub nodes in the ontology graph (e.g. CHEBI:50860, "organic molecular entity") that have
    # up to 141,138 descendants and 7,336 successors.
    # The 99.9% quantiles of numbers of successors/predecessors/descendants/ancestors are 224.92, 8, 2187.12, and 86.
    # Here we limit the max number of successors/predecessors/descendants/ancestors nodes to be included in any
    # ontology document to 2000
    NODE_FAMILY_CAPACITY = 2000

    def __init__(self, obo_file_path):
        """
        Load the ontology found at `obo_file_path` into `self.ontology_object`.
        """
        self.obo_file_path = obo_file_path
        self.ontology_object = Ontology(obo_file_path)

    def get_all_term_ids(self):
        """
        Return the set of ids of all non-obsolete terms in the ontology.
        """
        return {term.id for term in self.ontology_object.terms() if not term.obsolete}

    @classmethod
    def convert_subsets(cls, subsets: set):
        """
        The 'subsets' attribute of a pronto term has 1 of the 3 unique values, {'1_STAR'}, {'2_STAR'}, {'3_STAR'}

        This method converts a 'subsets' attribute from a 1-element set to the corresponding integer, i.e. 1, 2, 3.
        Only the first character of one (arbitrary) element is parsed, so `int()` raises a ValueError if that
        character is not a digit.

        Return None if `subsets` is empty or None.
        """
        if not subsets:
            return None

        subset_name = next(iter(subsets))
        return int(subset_name[0])

    @classmethod
    def convert_relationships(cls, relationships: "Relationships"):
        """
        The 'relationships' attribute of a pronto term is a 'pronto.entity.attributes.Relationships' object.
        Its 'items()' method returns an 'ItemsView' which is an iterable of pairs of
            <class 'pronto.relationship.Relationship'>, <class 'pronto.term.TermSet'>

        This method converts the 'relationships' attribute to a dict that maps each relationship id to the list
        of ids of the related terms. E.g.
            {'has_role': ['CHEBI:68495', 'CHEBI:38637', 'CHEBI:35610'], 'has_functional_parent': ['CHEBI:28179']}

        Currently there are 9 types of relationship in the ChEBI obo file:
            {'has_functional_parent', 'has_parent_hydride', 'has_part', 'has_role',
             'is_conjugate_acid_of', 'is_conjugate_base_of', 'is_enantiomer_of', 'is_substituent_group_from',
             'is_tautomer_of'}
        """
        return {
            relationship.id: [term.id for term in term_set]
            for relationship, term_set in relationships.items()
        }

    def read_ontology(self, term_id):
        """
        Read the term in the ontology object given `term_id` and convert it to an ontology document.

        Return None if `term_id` is not found in the ontology or if the term is obsolete.
        """
        # Key mapping:
        #     term.alternate_ids -> ontology_dict['secondary_chebi_id']
        #     term.definition -> ontology_dict['definition']
        #     term.name -> ontology_dict['name']
        #     term.relationships -> ontology_dict['relationship']
        #     term.subsets -> ontology_dict['star']
        term = self.ontology_object.get(term_id)
        if term is None or term.obsolete:
            return None

        # distance=1 restricts the traversal to direct neighbours; distance=None walks the whole hierarchy
        parents = [_term.id for _term in term.superclasses(distance=1, with_self=False)]
        children = [_term.id for _term in term.subclasses(distance=1, with_self=False)]
        ancestors = [_term.id for _term in term.superclasses(distance=None, with_self=False)]
        descendants = [_term.id for _term in term.subclasses(distance=None, with_self=False)]

        # A term without a definition has `definition` set to None; keep it as None instead of letting str()
        # turn it into the literal string 'None'.
        # pronto.definition.Definition is a subclass of 'str', so str() only drops the extra metadata.
        definition = term.definition
        if definition is not None:
            definition = str(definition)

        capacity = self.NODE_FAMILY_CAPACITY

        # The "num_*" fields hold the real counts; the id lists themselves are truncated to `capacity` entries.
        # Use the same naming convention as in the Mondo parser
        # See https://github.com/biothings/mydisease.info/blob/master/src/plugins/mondo/parser.py
        return {
            "id": term_id,
            "secondary_chebi_id": list(term.alternate_ids),
            "definition": definition,
            "name": term.name,
            "relationship": self.convert_relationships(term.relationships),
            "star": self.convert_subsets(term.subsets),
            "num_children": len(children),
            "num_parents": len(parents),
            "num_descendants": len(descendants),
            "num_ancestors": len(ancestors),
            "children": children[:capacity],
            "parents": parents[:capacity],
            "descendants": descendants[:capacity],
            "ancestors": ancestors[:capacity],
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment