Source code for src.graphdb_builder.ontologies.ontologies_controller

from graphdb_builder import mapping as mp, builder_utils
import config.ckg_config as ckg_config
from graphdb_builder.ontologies.parsers import *
import os.path
import pandas as pd
import csv
from datetime import date
import sys

log_config = ckg_config.graphdb_builder_log
logger = builder_utils.setup_logging(log_config, key="ontologies_controller")

try:
    config = builder_utils.setup_config('ontologies')
except Exception as err:
    logger.error("Reading configuration > {}.".format(err))


[docs]def entries_to_remove(entries, the_dict):
    """
    This function removes pairs from a given dictionary, based on a list of provided keys.

    :param list entries: list of keys to be deleted from dictionary.
    :param dict the_dict: dictionary.
    :return: The original dictionary minus the key,value pairs from the provided entries list.
    """
    for key in entries:
        if key in the_dict:
            del the_dict[key]


[docs]def get_extra_entities_rels(ontology_directory):
    extra_entities_file = 'extra_entities.tsv'
    extra_entities = builder_utils.get_extra_pairs(ontology_directory, extra_entities_file)
    extra_rels_file = 'extra_rels.tsv'
    extra_rels = builder_utils.get_extra_pairs(ontology_directory, extra_rels_file)

    return extra_entities, extra_rels


[docs]def parse_ontology(ontology, download=True):
    """
    Parses and extracts data from a given ontology file(s), and returns a tuple with multiple dictionaries.

    :param str ontology: acronym of the ontology to be parsed (e.g. Disease Ontology:'DO').
    :param bool download: wether database is to be downloaded.
    :return: Tuple with three nested dictionaries: terms, relationships between terms, and definitions of the terms.\
            For more information on the returned dictionaries, see the documentation for any ontology parser.
    """
    directory = config["ontologies_directory"]
    ontology_directory = os.path.join(directory, ontology)
    builder_utils.checkDirectory(ontology_directory)
    ontology_files = []
    ontologyData = None
    mappings = None
    extra_entities = set()
    extra_rels = set()
    if ontology in config["ontology_types"]:
        otype = config["ontology_types"][ontology]
        if 'urls' in config:
            if otype in config['urls']:
                urls = config['urls'][otype]
                for url in urls:
                    f = url.split('/')[-1].replace('?', '_').replace('=', '_')
                    ontology_files.append(os.path.join(ontology_directory, f))
                    if download:
                        builder_utils.downloadDB(url, directory=ontology_directory, file_name=f)
            elif otype in config["files"]:
                ofiles = config["files"][otype]
                for f in ofiles:
                    if '*' not in f:
                        if os.path.isfile(os.path.join(directory, f)):
                            ontology_files.append(os.path.join(directory, f))
                        else:
                            logger.error("Error: file {} is not in the directory {}".format(f, directory))
                    else:
                        ontology_files.append(os.path.join(directory, f))

        filters = None
        if otype in config["parser_filters"]:
            filters = config["parser_filters"][otype]
        extra_entities, extra_rels = get_extra_entities_rels(ontology_directory)
    if len(ontology_files) > 0:
        if ontology == "SNOMED-CT":
            ontologyData = snomedParser.parser(ontology_files, filters)
        elif ontology == "ICD":
            ontologyData = icdParser.parser(ontology_files)
        elif ontology == 'EFO':
            ontologyData, mappings = efoParser.parser(ontology_files)
        else:
            ontologyData = oboParser.parser(ontology, ontology_files)
            mp.buildMappingFromOBO(ontology_files[0], ontology)
    else:
        if ontology == "SNOMED-CT":
            logger.info("WARNING: SNOMED-CT terminology needs to be downloaded manually since it requires UMLS License. More information available here: https://www.nlm.nih.gov/databases/umls.html")
        else:
            logger.info("WARNING: Ontology {} could not be downloaded. Check that the link in configuration works.".format(ontology))

    return ontologyData, mappings, extra_entities, extra_rels


[docs]def generate_graphFiles(import_directory, ontologies=None, download=True):
    """
    This function parses and extracts data from a given list of ontologies. If no ontologies are provided, \
    all availables ontologies are used. Terms, relationships and definitions are saved as .tsv files to be loaded into \
    the graph database.

    :param str import_directory: relative path from current python module to 'imports' directory.
    :param ontologies: list of ontologies to be imported. If None, all available ontologies are imported.
    :type ontologies: list or None
    :param bool download: wether database is to be downloaded.
    :return: Dictionary of tuples. Each tuple corresponds to a unique label/relationship type, date, time, \
            database, and number of nodes and relationships.
    """
    entities = config["ontologies"]
    if ontologies is not None:
        entities = {}
        for ontology in ontologies:
            ontology = ontology.capitalize()
            if ontology.capitalize() in config["ontologies"]:
                entities.update({ontology: config["ontologies"][ontology]})

    updated_on = "None"
    if download:
        updated_on = str(date.today())

    stats = set()
    for entity in entities:
        ontology = config["ontologies"][entity]
        if ontology in config["ontology_types"]:
            ontologyType = config["ontology_types"][ontology]
        try:
            result, mappings, extra_entities, extra_rels = parse_ontology(ontology, download)
            if result is not None:
                terms, relationships, definitions = result
                for namespace in terms:
                    if namespace in config["entities"]:
                        name = config["entities"][namespace]
                    entity_outputfile = os.path.join(import_directory, name + ".tsv")
                    with open(entity_outputfile, 'w', encoding='utf-8') as csvfile:
                        writer = csv.writer(csvfile, delimiter='\t', escapechar='\\', quotechar='"', quoting=csv.QUOTE_ALL)
                        writer.writerow(['ID', ':LABEL', 'name', 'description', 'type', 'synonyms'])
                        num_terms = 0
                        for term in terms[namespace]:
                            writer.writerow([term, entity, list(terms[namespace][term])[0], definitions[term], ontologyType, ",".join(terms[namespace][term])])
                            num_terms += 1
                        for extra_entity in extra_entities:
                            writer.writerow(list(extra_entity))
                            num_terms += 1
                    logger.info("Ontology {} - Number of {} entities: {}".format(ontology, name, num_terms))
                    stats.add(builder_utils.buildStats(num_terms, "entity", name, ontology, entity_outputfile, updated_on))
                    if namespace in relationships:
                        relationships_outputfile = os.path.join(import_directory, name+"_has_parent.tsv")
                        relationships[namespace].update(extra_rels)
                        relationshipsDf = pd.DataFrame(list(relationships[namespace]))
                        relationshipsDf.columns = ['START_ID', 'END_ID', 'TYPE']
                        relationshipsDf.to_csv(path_or_buf=relationships_outputfile,
                                                    sep='\t',
                                                    header=True, index=False, quotechar='"',
                                                    quoting=csv.QUOTE_ALL,
                                                    line_terminator='\n', escapechar='\\')
                        logger.info("Ontology {} - Number of {} relationships: {}".format(ontology, name+"_has_parent", len(relationships[namespace])))
                        stats.add(builder_utils.buildStats(len(relationships[namespace]), "relationships", name+"_has_parent", ontology, relationships_outputfile, updated_on))
            else:
                logger.warning("Ontology {} - The parsing did not work".format(ontology))
            if mappings is not None:
                for name in mappings:
                    mappings_outputfile = os.path.join(import_directory, name + ".tsv")
                    mappingsDf = pd.DataFrame(list(mappings[name]))
                    mappingsDf.columns = ['START_ID', 'END_ID', 'TYPE']
                    mappingsDf.to_csv(path_or_buf=mappings_outputfile,
                                                sep='\t',
                                                header=True, index=False, quotechar='"',
                                                quoting=csv.QUOTE_ALL,
                                                line_terminator='\n', escapechar='\\')
                    logger.info("Ontology {} - Number of {} relationships: {}".format(ontology, name, len(mappings[name])))
                    stats.add(builder_utils.buildStats(len(mappings[name]), "relationships", name, ontology, mappings_outputfile, updated_on))
        except Exception as err:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error("Error: {}. Ontology {}: {}, file: {},line: {}".format(err, ontology, sys.exc_info(), fname, exc_tb.tb_lineno))
    return stats


if __name__ == "__main__":
    generate_graphFiles(import_directory='../../../data/imports', download=True)