# Source code for src.graphdb_builder.databases.databases_controller

import os.path
import sys
import config.ckg_config as ckg_config
from graphdb_builder import builder_utils
from graphdb_builder.databases.parsers import *
from joblib import Parallel, delayed
from datetime import date

# Logging configuration for the graph database builder, taken from ckg_config.
log_config = ckg_config.graphdb_builder_log
logger = builder_utils.setup_logging(log_config, key="database_controller")

# Load the 'databases' configuration used by the functions in this module.
# NOTE(review): when setup_config raises, the error is only logged and
# `dbconfig` is never bound, so any later use of it fails with NameError.
try:
    dbconfig = builder_utils.setup_config('databases')
except Exception as err:
    logger.error("Reading configuration > {}.".format(err))


def parseDatabase(importDirectory, database, download=True):
    """Parse one source database and write its entity/relationship files.

    The database name is matched case-insensitively against the supported
    sources. The matching parser is called, its output is written under
    ``importDirectory`` with ``builder_utils.write_entities`` /
    ``builder_utils.write_relationships``, and one statistics record per
    written file is collected with ``builder_utils.buildStats``.

    Any exception raised while parsing is logged and swallowed, so the
    function always returns; the returned set then holds whatever statistics
    were gathered before the failure. A name that matches no supported
    database yields an empty set.

    :param importDirectory: directory in which the output files are created.
    :param database: name of the database to parse (case-insensitive).
    :param download: forwarded to the parsers that accept it; when truthy,
        today's date is recorded as the ``updated_on`` value of the stats.
    :return: set of statistics records. The uniprot and pfam parsers return
        their own stats, which are merged in; the STRING and STITCH branches
        add no statistics here.
    """
    stats = set()
    updated_on = None
    if download:
        updated_on = str(date.today())
    try:
        logger.info("Parsing database {}".format(database))
        db_name = database.lower()
        if db_name == "jensenlab":
            result = jensenlabParser.parser(dbconfig["databasesDir"], download)
            for qtype in result:
                relationships, header, outputfileName = result[qtype]
                outputfile = os.path.join(importDirectory, outputfileName)
                builder_utils.write_relationships(relationships, header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, qtype, len(relationships)))
                stats.add(builder_utils.buildStats(len(relationships), "relationships", qtype, database, outputfile, updated_on))
        elif db_name == "mentions":
            num_entities, outputfile = textminingParser.parser(dbconfig["databasesDir"], importDirectory, download)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Publication", num_entities))
            stats.add(builder_utils.buildStats(num_entities, "entity", "Publication", database, outputfile, updated_on))
        elif db_name == "hgnc":
            # HGNC
            entities, header = hgncParser.parser(dbconfig["databasesDir"], download)
            outputfile = os.path.join(importDirectory, "Gene.tsv")
            builder_utils.write_entities(entities, header, outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Gene", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Gene", database, outputfile, updated_on))
        elif db_name == "refseq":
            entities, relationships, headers = refseqParser.parser(dbconfig["databasesDir"], download)
            for entity in entities:
                header = headers[entity]
                outputfile = os.path.join(importDirectory, entity+".tsv")
                builder_utils.write_entities(entities[entity], header, outputfile)
                logger.info("Database {} - Number of {} entities: {}".format(database, entity, len(entities[entity])))
                stats.add(builder_utils.buildStats(len(entities[entity]), "entity", entity, database, outputfile, updated_on))
            for rel in relationships:
                header = headers[rel]
                outputfile = os.path.join(importDirectory, "refseq_"+rel.lower()+".tsv")
                builder_utils.write_relationships(relationships[rel], header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, rel, len(relationships[rel])))
                stats.add(builder_utils.buildStats(len(relationships[rel]), "relationships", rel, database, outputfile, updated_on))
        elif db_name == "uniprot":
            # UniProt: the parser writes its own files and returns the stats
            stats.update(uniprotParser.parser(dbconfig["databasesDir"], importDirectory, download, updated_on))
        elif db_name == "pfam":
            # Pfam: the parser writes its own files and returns the stats
            stats.update(pfamParser.parser(dbconfig["databasesDir"], importDirectory, download, updated_on))
        elif db_name == "intact":
            # IntAct
            relationships, header, outputfileName = intactParser.parser(dbconfig["databasesDir"], download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header, outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "curated_interacts_with", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "curated_interacts_with", database, outputfile, updated_on))
        elif db_name == "mutationds":
            # MutationDs
            relationships, header, outputfileName = mutationDsParser.parser(dbconfig["databasesDir"], download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header, outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "curated_affects_interaction_with", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "curated_affects_interaction_with", database, outputfile, updated_on))
        elif db_name == "string":
            # STRING: no statistics are collected in this branch
            proteinMapping, drugMapping = stringParser.parser(dbconfig["databasesDir"], importDirectory, download=download)
            stringParser.parseActions(dbconfig["databasesDir"], importDirectory, proteinMapping, drugMapping, download=download, db="STRING")
        elif db_name == "stitch":
            # STITCH: no statistics are collected in this branch
            proteinMapping, drugMapping = stringParser.parser(dbconfig["databasesDir"], importDirectory, drug_source=dbconfig["sources"]["Drug"], download=download, db="STITCH")
            stringParser.parseActions(dbconfig["databasesDir"], importDirectory, proteinMapping, drugMapping, download=download, db="STITCH")
        elif db_name == "disgenet":
            # DisGeNet
            relationships, header, outputfileName = disgenetParser.parser(dbconfig["databasesDir"], download)
            for idType in relationships:
                outputfile = os.path.join(importDirectory, idType+"_"+outputfileName)
                builder_utils.write_relationships(relationships[idType], header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, idType, len(relationships[idType])))
                stats.add(builder_utils.buildStats(len(relationships[idType]), "relationships", idType, database, outputfile, updated_on))
        elif db_name == "pathwaycommons":
            # PathwayCommons pathways
            entities, relationships, entities_header, relationships_header = pathwayCommonsParser.parser(dbconfig["databasesDir"], download)
            entity_outputfile = os.path.join(importDirectory, "Pathway.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            stats.add(builder_utils.buildStats(len(entities), "entity", "Pathway", database, entity_outputfile, updated_on))
            pathway_outputfile = os.path.join(importDirectory, "pathwaycommons_protein_associated_with_pathway.tsv")
            builder_utils.write_relationships(relationships, relationships_header, pathway_outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "protein_associated_with_pathway", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "protein_associated_with_pathway", database, pathway_outputfile, updated_on))
        elif db_name == "reactome":
            # Reactome
            entities, relationships, entities_header, relationships_header = reactomeParser.parser(dbconfig["databasesDir"], download)
            entity_outputfile = os.path.join(importDirectory, db_name+"_Pathway.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            stats.add(builder_utils.buildStats(len(entities), "entity", "Pathway", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                reactome_outputfile = os.path.join(importDirectory, db_name+"_"+entity.lower()+"_"+relationship.lower()+".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], relationships_header[entity], reactome_outputfile)
                logger.info("Database {} - Number of {} {} relationships: {}".format(database, entity, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, reactome_outputfile, updated_on))
        elif db_name == "smpdb":
            # SMPDB
            entities, relationships, entities_header, relationships_header = smpdbParser.parser(dbconfig["databasesDir"], download)
            entity_outputfile = os.path.join(importDirectory, db_name+"_Pathway.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            stats.add(builder_utils.buildStats(len(entities), "entity", "Pathway", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                smpdb_outputfile = os.path.join(importDirectory, db_name+"_"+entity.lower()+"_"+relationship.lower()+".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], relationships_header[entity], smpdb_outputfile)
                logger.info("Database {} - Number of {} {} relationships: {}".format(database, entity, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, smpdb_outputfile, updated_on))
        elif db_name == "dgidb":
            relationships, header, outputfileName = drugGeneInteractionDBParser.parser(dbconfig["databasesDir"], download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header, outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "targets", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "targets", database, outputfile, updated_on))
        elif db_name == "sider":
            relationships, header, outputfileName, drugMapping, phenotypeMapping = siderParser.parser(dbconfig["databasesDir"], dbconfig["sources"]["Drug"], download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header, outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "has_side_effect", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "has_side_effect", database, outputfile, updated_on))
            # Indications reuse the mappings built while parsing side effects
            relationships, header, outputfileName = siderParser.parserIndications(dbconfig["databasesDir"], drugMapping, phenotypeMapping, download=download)
            outputfile = os.path.join(importDirectory, outputfileName)
            builder_utils.write_relationships(relationships, header, outputfile)
            logger.info("Database {} - Number of {} relationships: {}".format(database, "indicated_for", len(relationships)))
            stats.add(builder_utils.buildStats(len(relationships), "relationships", "indicated_for", database, outputfile, updated_on))
        elif db_name == "oncokb":
            entities, relationships, entities_header, relationships_headers = oncokbParser.parser(dbconfig["databasesDir"], download)
            outputfile = os.path.join(importDirectory, "oncokb_Clinically_relevant_variant.tsv")
            builder_utils.write_entities(entities, entities_header, outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Clinically_relevant_variant", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Clinically_relevant_variant", database, outputfile, updated_on))
            for relationship in relationships:
                oncokb_outputfile = os.path.join(importDirectory, "oncokb_"+relationship+".tsv")
                if relationship in relationships_headers:
                    header = relationships_headers[relationship]
                else:
                    header = ['START_ID', 'END_ID', 'TYPE']
                builder_utils.write_relationships(relationships[relationship], header, oncokb_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[relationship])))
                # Record the relationship file just written, not the entity file
                stats.add(builder_utils.buildStats(len(relationships[relationship]), "relationships", relationship, database, oncokb_outputfile, updated_on))
        elif db_name == "cancergenomeinterpreter":
            entities, relationships, entities_header, relationships_headers = cancerGenomeInterpreterParser.parser(dbconfig["databasesDir"], download)
            entity_outputfile = os.path.join(importDirectory, "cgi_Clinically_relevant_variant.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Clinically_relevant_variant", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Clinically_relevant_variant", database, entity_outputfile, updated_on))
            for relationship in relationships:
                cgi_outputfile = os.path.join(importDirectory, "cgi_"+relationship+".tsv")
                header = ['START_ID', 'END_ID', 'TYPE']
                if relationship in relationships_headers:
                    header = relationships_headers[relationship]
                builder_utils.write_relationships(relationships[relationship], header, cgi_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[relationship])))
                stats.add(builder_utils.buildStats(len(relationships[relationship]), "relationships", relationship, database, cgi_outputfile, updated_on))
        elif db_name == "hmdb":
            entities, relationships, entities_header, relationships_header = hmdbParser.parser(dbconfig["databasesDir"], download)
            entity_outputfile = os.path.join(importDirectory, "Metabolite.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Metabolite", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Metabolite", database, entity_outputfile, updated_on))
            for relationship in relationships:
                hmdb_outputfile = os.path.join(importDirectory, relationship+".tsv")
                builder_utils.write_relationships(relationships[relationship], relationships_header, hmdb_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[relationship])))
                stats.add(builder_utils.buildStats(len(relationships[relationship]), "relationships", relationship, database, hmdb_outputfile, updated_on))
        elif db_name == "drugbank":
            entities, relationships, entities_header, relationships_headers = drugBankParser.parser(dbconfig["databasesDir"])
            entity_outputfile = os.path.join(importDirectory, "Drug.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Drug", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Drug", database, entity_outputfile, updated_on))
            for relationship in relationships:
                relationship_outputfile = os.path.join(importDirectory, relationship+".tsv")
                header = ['START_ID', 'END_ID', 'TYPE', 'source']
                if relationship in relationships_headers:
                    header = relationships_headers[relationship]
                builder_utils.write_relationships(relationships[relationship], header, relationship_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[relationship])))
                stats.add(builder_utils.buildStats(len(relationships[relationship]), "relationships", relationship, database, relationship_outputfile, updated_on))
        elif db_name == "gwascatalog":
            entities, relationships, entities_header, relationships_header = gwasCatalogParser.parser(dbconfig["databasesDir"], download)
            entity_outputfile = os.path.join(importDirectory, "GWAS_study.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "GWAS_study", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "GWAS_study", database, entity_outputfile, updated_on))
            for relationship in relationships:
                header = ['START_ID', 'END_ID', 'TYPE', 'source']
                if relationship in relationships_header:
                    header = relationships_header[relationship]
                outputfile = os.path.join(importDirectory, "GWAS_study_"+relationship+".tsv")
                builder_utils.write_relationships(relationships[relationship], header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[relationship])))
                stats.add(builder_utils.buildStats(len(relationships[relationship]), "relationships", relationship, database, outputfile, updated_on))
        elif db_name == "phosphositeplus":
            entities, relationships, entities_header, relationships_headers = pspParser.parser(dbconfig["databasesDir"])
            entity_outputfile = os.path.join(importDirectory, "psp_Modified_protein.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Modified_protein", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Modified_protein", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                rel_header = ["START_ID", "END_ID", "TYPE", "source"]
                # Headers are looked up by entity in this branch
                if entity in relationships_headers:
                    rel_header = relationships_headers[entity]
                outputfile = os.path.join(importDirectory, "psp_"+entity.lower()+"_"+relationship.lower()+".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], rel_header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, outputfile, updated_on))
        elif db_name == "signor":
            entities, relationships, entities_header, relationships_headers = signorParser.parser(dbconfig["databasesDir"])
            entity_outputfile = os.path.join(importDirectory, "signor_Modified_protein.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Modified_protein", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Modified_protein", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                rel_header = ["START_ID", "END_ID", "TYPE", "source"]
                prefix = 'signor_'+entity.lower()
                # Headers are looked up by relationship in this branch
                if relationship in relationships_headers:
                    rel_header = relationships_headers[relationship]
                # Publication mentions use the bare entity name as file prefix
                if relationship == 'mentioned_in_publication':
                    prefix = entity
                outputfile = os.path.join(importDirectory, prefix+"_"+relationship.lower()+".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], rel_header, outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, outputfile, updated_on))
        elif db_name == "corum":
            entities, relationships, entities_header, relationships_headers = corumParser.parser(dbconfig["databasesDir"], download)
            entity_outputfile = os.path.join(importDirectory, "Complex.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Complex", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Complex", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                corum_outputfile = os.path.join(importDirectory, db_name+"_"+entity.lower()+"_"+relationship.lower()+".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], relationships_headers[entity], corum_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, corum_outputfile, updated_on))
        elif db_name == "foodb":
            entities, relationships, entities_header, relationships_headers = foodbParser.parser(dbconfig["databasesDir"], download)
            entity_outputfile = os.path.join(importDirectory, "Food.tsv")
            builder_utils.write_entities(entities, entities_header, entity_outputfile)
            logger.info("Database {} - Number of {} entities: {}".format(database, "Food", len(entities)))
            stats.add(builder_utils.buildStats(len(entities), "entity", "Food", database, entity_outputfile, updated_on))
            for entity, relationship in relationships:
                foodb_outputfile = os.path.join(importDirectory, db_name+"_"+entity.lower()+"_"+relationship.lower()+".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], relationships_headers[entity], foodb_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, foodb_outputfile, updated_on))
        elif db_name == "exposome explorer":
            relationships, header = exposomeParser.parser(dbconfig["databasesDir"], download)
            for entity, relationship in relationships:
                ee_outputfile = os.path.join(importDirectory, db_name+"_"+entity.lower()+"_"+relationship.lower()+".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], header[entity], ee_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, ee_outputfile, updated_on))
        elif db_name == "hpa":
            relationships, headers = hpaParser.parser(dbconfig["databasesDir"], download)
            for entity, relationship in relationships:
                hpa_outputfile = os.path.join(importDirectory, db_name+"_"+entity.lower()+"_"+relationship.lower()+".tsv")
                builder_utils.write_relationships(relationships[(entity, relationship)], headers[relationship], hpa_outputfile)
                logger.info("Database {} - Number of {} relationships: {}".format(database, relationship, len(relationships[(entity, relationship)])))
                stats.add(builder_utils.buildStats(len(relationships[(entity, relationship)]), "relationships", relationship, database, hpa_outputfile, updated_on))
    except Exception:
        # Deliberate best-effort boundary: a failure in one database is
        # logged with its file and line, and the stats collected so far are
        # still returned to the caller.
        exc_tb = sys.exc_info()[2]
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Database {}: {}, file: {},line: {}".format(database, sys.exc_info(), fname, exc_tb.tb_lineno))
    return stats
#########################
#      Graph files      #
#########################
def generateGraphFiles(importDirectory, databases=None, download=True, n_jobs = 4):
    """Parse several databases in parallel and merge their statistics.

    Each database is handed to ``parseDatabase`` through joblib's
    ``Parallel``/``delayed``, and the per-database results are flattened
    into a single set.

    :param importDirectory: directory passed to ``parseDatabase`` for the
        generated files.
    :param databases: iterable of database names; when None, the list is
        read from ``dbconfig["databases"]``.
    :param download: forwarded unchanged to ``parseDatabase``.
    :param n_jobs: number of parallel jobs given to ``Parallel``.
    :return: set with the statistics records of all parsed databases.
    """
    if databases is None:
        databases = dbconfig["databases"]
    stats = Parallel(n_jobs=n_jobs)(
        delayed(parseDatabase)(importDirectory, database, download)
        for database in databases)
    allstats = set()
    for sublist in stats:
        if isinstance(sublist, set):
            # Normal case: parseDatabase returns a set of stats records
            allstats.update(sublist)
        elif sublist:
            # A non-empty, non-set result is kept as one record
            allstats.add(sublist)
    return allstats
# Script entry point: running this module directly does nothing.
if __name__ == "__main__": pass