Source code for src.graphdb_builder.databases.parsers.reactomeParser

import os.path
import re
from collections import defaultdict
from graphdb_builder import mapping as mp, builder_utils

#########################
#   Reactome database   #
#########################
def parser(databases_directory, download=True):
    """Parse the configured Reactome files into pathway entities and relationships.

    Loads ``reactomeConfig.yml``, walks every entry of ``reactome_urls``,
    optionally downloads the file into ``<databases_directory>/Reactome`` and
    hands the opened file to the parser matching the dataset key. After all
    datasets are processed, ``builder_utils.remove_directory`` is called on
    the working directory.

    :param str databases_directory: directory that contains (or will contain)
        the ``Reactome`` working folder.
    :param bool download: when true, each URL is fetched with
        ``builder_utils.downloadDB`` before its file is opened.
    :return: tuple ``(entities, relationships, entities_header,
        relationships_headers)``. ``relationships`` is a ``defaultdict(set)``
        keyed by ``(dataset, relationship_name)``; the two headers are taken
        verbatim from the configuration.
    """
    cfg = builder_utils.get_config(config_name="reactomeConfig.yml", data_type='databases')
    sources = cfg['reactome_urls']
    entities = set()
    relationships = defaultdict(set)
    entities_header = cfg['pathway_header']
    relationships_headers = cfg['relationships_header']
    workdir = os.path.join(databases_directory, "Reactome")
    builder_utils.checkDirectory(workdir)
    metabolite_mapping = mp.getMappingForEntity("Metabolite")
    # NOTE: a "drug" dataset is not handled here; unknown dataset keys are
    # downloaded/opened but otherwise ignored.

    for dataset, url in sources.items():
        file_name = url.rsplit('/', 1)[-1]
        if download:
            builder_utils.downloadDB(url, workdir)
        local_path = os.path.join(workdir, file_name)
        with open(local_path, 'r') as handle:
            if dataset == "pathway":
                entities = parsePathways(cfg, databases_directory, handle)
            elif dataset == "hierarchy":
                relationships[("pathway", "has_parent")] = parsePathwayHierarchy(handle)
            elif dataset in ("protein", "metabolite"):
                # Only metabolite identifiers are translated through a mapping.
                id_mapping = metabolite_mapping if dataset == "metabolite" else None
                relationships[(dataset, "annotated_to_pathway")] = parsePathwayRelationships(cfg, handle, id_mapping)

    builder_utils.remove_directory(workdir)

    return entities, relationships, entities_header, relationships_headers
def parsePathways(config, databases_directory, fhandler):
    """Extract pathway entities from a tab-separated Reactome pathway listing.

    Each input line is expected to hold at least three columns: pathway
    identifier, pathway name and organism. For every kept pathway a line
    ``identifier<TAB>name`` is also written to
    ``<databases_directory>/Reactome/mapping.tsv``. ``mp.reset_mapping`` is
    called before the mapping file is written and ``mp.mark_complete_mapping``
    once it has been closed.

    :param dict config: parser configuration; reads ``organisms`` (organism
        name -> value stored in the entity) and ``linkout_url`` (template in
        which the literal ``PATHWAY`` is replaced by the identifier).
    :param str databases_directory: directory containing the ``Reactome``
        working folder, which must already exist.
    :param fhandler: iterable of text lines (e.g. an open file).
    :return: set of tuples ``(identifier, "Pathway", name, name, organism,
        linkout, "Reactome")``.
    """
    entities = set()
    organisms = config['organisms']
    url = config['linkout_url']
    directory = os.path.join(databases_directory, "Reactome")
    mapping_file = os.path.join(directory, "mapping.tsv")

    mp.reset_mapping(entity="Pathway")
    with open(mapping_file, 'w') as mf:
        for line in fhandler:
            data = line.rstrip("\r\n").split("\t")
            # Blank or truncated lines used to abort the whole import with an
            # IndexError; skip them instead.
            if len(data) < 3:
                continue
            identifier, name, organism = data[0], data[1], data[2]
            # NOTE(review): pathways of organisms missing from the config are
            # dropped -- confirm this matches the original indentation.
            if organism not in organisms:
                continue
            linkout = url.replace("PATHWAY", identifier)
            entities.add((identifier, "Pathway", name, name, organisms[organism], linkout, "Reactome"))
            mf.write(identifier + "\t" + name + "\n")

    mp.mark_complete_mapping(entity="Pathway")

    return entities
def parsePathwayHierarchy(fhandler):
    """Extract parent/child pathway relationships from a tab-separated listing.

    Each line is expected to hold the parent pathway identifier in the first
    column and the child pathway identifier in the second one.

    :param fhandler: iterable of text lines (e.g. an open file).
    :return: set of tuples ``(child, parent, "HAS_PARENT", "Reactome")``.
    """
    relationships = set()
    for line in fhandler:
        data = line.rstrip("\r\n").split("\t")
        # A blank or single-column line (e.g. a trailing empty line) would
        # otherwise raise IndexError and abort the whole import.
        if len(data) < 2:
            continue
        parent, child = data[0], data[1]
        relationships.add((child, parent, "HAS_PARENT", "Reactome"))

    return relationships
def parsePathwayRelationships(config, fhandler, mapping=None):
    """Extract entity-to-pathway annotations from a tab-separated Reactome file.

    Columns read from each line (0-based): 0 entity identifier, 2 entity name
    followed by its location in square brackets (``"name [location]"``),
    3 pathway identifier, 6 evidence code, 7 organism.

    :param dict config: parser configuration; reads ``organisms`` (organism
        name -> value stored in the relationship).
    :param fhandler: iterable of text lines (e.g. an open file).
    :param mapping: optional container translating source identifiers or
        entity names into the identifiers to store. When given, rows whose
        identifier and name are both absent from it are skipped.
    :return: set of tuples ``(identifier, pathway, "ANNOTATED_TO_PATHWAY",
        evidence, organism, location, "Reactome")``.
    """
    relationships = set()
    # Compiled once instead of being looked up again for every line.
    name_location = re.compile(r"(.+)\s\[(.+)\]")
    organisms = config['organisms']
    for line in fhandler:
        data = line.rstrip("\r\n").split("\t")
        # Blank or truncated lines used to abort the whole import with an
        # IndexError; skip them instead.
        if len(data) < 8:
            continue
        identifier = data[0]
        id_loc = data[2]
        pathway = data[3]
        evidence = data[6]
        organism = data[7]

        match = name_location.search(id_loc)
        # NOTE(review): rows without a "name [location]" field and rows of
        # organisms missing from the config are dropped -- confirm this
        # matches the original indentation (the former "unspecified" default
        # location was never used under this reading).
        if match is None or organism not in organisms:
            continue
        name = match.group(1)
        loc = match.group(2)
        organism = organisms[organism]

        if mapping is not None:
            # Prefer a mapping by identifier, fall back to the entity name.
            if identifier in mapping:
                identifier = mapping[identifier]
            elif name in mapping:
                identifier = mapping[name]
            else:
                continue
        relationships.add((identifier, pathway, "ANNOTATED_TO_PATHWAY", evidence, organism, loc, "Reactome"))

    return relationships