# Source code for src.graphdb_builder.databases.parsers.hmdbParser
import os.path
from collections import defaultdict
from lxml import etree
import zipfile
from graphdb_builder import mapping as mp, builder_utils
#################################
# Human Metabolome Database #
#################################
def parser(databases_directory, download=True):
    """
    Run the full HMDB import: extract metabolites, then build entities and relationships.

    :param str databases_directory: parent directory; the working folder used is \
        ``<databases_directory>/HMDB``.
    :param bool download: forwarded to ``extract_metabolites``.
    :return: tuple ``(entities, relationships, entities_header, relationships_header)`` where \
        ``entities_header`` is ``['ID']`` followed by the attribute names returned by \
        ``build_metabolite_entity`` and ``relationships_header`` is read from the configuration.
    """
    config = builder_utils.get_config(config_name="hmdbConfig.yml", data_type='databases')
    hmdb_directory = os.path.join(databases_directory, "HMDB")
    builder_utils.checkDirectory(hmdb_directory)

    metabolites = extract_metabolites(config, hmdb_directory, download)

    # One lookup table for both ontologies: Disease terms are loaded first and
    # Tissue terms are merged on top (Tissue wins on a key clash).
    mapping = mp.getMappingFromOntology(ontology="Disease", source=config['HMDB_DO_source'])
    tissue_mapping = mp.getMappingFromOntology(ontology="Tissue", source=None)
    mapping.update(tissue_mapping)

    entities, attributes = build_metabolite_entity(config, hmdb_directory, metabolites)
    relationships = build_relationships_from_HMDB(config, metabolites, mapping)

    entities_header = ['ID'] + attributes
    relationships_header = config['relationships_header']
    # Cleanup is currently disabled: builder_utils.remove_directory(hmdb_directory)

    return (entities, relationships, entities_header, relationships_header)
def _collect_metabolite_values(elem, prefix, fields, parent_fields, structured_fields):
    """
    Gather the configured values of a single ``<metabolite>`` element.

    :param elem: lxml element for one metabolite.
    :param str prefix: XML namespace prefix stripped from every tag name.
    :param fields: tag names whose text is kept as a plain value.
    :param parent_fields: tag names of container children whose descendants are collected.
    :param structured_fields: container tag name -> descendant tag names of interest \
        (assumed to be a dict of lists coming from the YAML configuration).
    :return: dict of tag name -> text (plain fields) or set of texts (container fields).
    """
    # First pass: direct children that are plain fields and actually carry text.
    values = {}
    for child in elem.iterchildren():
        tag = child.tag.replace(prefix, '')
        if tag in fields and child.text is not None:
            values[tag] = child.text

    # Second pass: container children. It runs after the first pass on purpose so
    # that values written here take precedence over same-named plain fields.
    for child in elem.iterchildren():
        label = child.tag.replace(prefix, '')
        if label not in parent_fields:
            continue
        values[label] = set()
        for intchild in child.iter():
            text = intchild.text
            if text is None or text.strip() == "":
                continue
            inner_tag = intchild.tag.replace(prefix, '')
            if label in structured_fields:
                wanted = structured_fields[label]
                if inner_tag in wanted:
                    if len(wanted) > 1:
                        # Several sub-fields of interest: store each under its own tag name.
                        values[inner_tag] = text
                    else:
                        values[label].add(text)
            elif inner_tag in fields:
                values[label].add(text)
    return values


def extract_metabolites(config, directory, download=True):
    """
    Read the HMDB zip archive and return the metabolites it contains, keyed by accession.

    Every member of the archive is extracted into ``directory`` and parsed
    incrementally; metabolites without an ``accession`` value are skipped.

    :param dict config: configuration providing ``HMDB_url``, ``HMDB_fields``, \
        ``HMDB_parentFields`` and ``HMDB_structures``.
    :param str directory: folder holding the archive (named after the last path \
        segment of ``HMDB_url``) and receiving the extracted files.
    :param bool download: when True, ``builder_utils.downloadDB`` is called for the URL first.
    :return: mapping of accession -> dict of collected values.
    """
    metabolites = defaultdict()
    prefix = "{http://www.hmdb.ca}"
    url = config['HMDB_url']
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    fields = config['HMDB_fields']
    parentFields = config['HMDB_parentFields']
    structuredFields = config['HMDB_structures']

    with zipfile.ZipFile(fileName, 'r') as zipped:
        for zfile in zipped.namelist():
            zipped.extract(member=zfile, path=directory)
            xfile = os.path.join(directory, zfile)
            with open(xfile, 'rb') as f:
                context = etree.iterparse(f, events=("end",), tag=prefix + "metabolite")
                for _, elem in context:
                    values = _collect_metabolite_values(elem, prefix, fields, parentFields, structuredFields)
                    if "accession" in values:
                        metabolites[values["accession"]] = values
                    # iterparse keeps building the full tree behind the scenes; drop
                    # the element we just handled and its already-processed siblings
                    # so memory stays bounded on the multi-gigabyte HMDB dump.
                    elem.clear()
                    while elem.getprevious() is not None:
                        del elem.getparent()[0]
    return metabolites
def build_metabolite_entity(config, directory, metabolites):
    """
    Turn the extracted metabolites into entity rows and write the HMDB mapping file.

    Each row is a tuple ``(accession, attr_1, ..., attr_n)`` following the order of
    ``config['HMDB_attributes']``. Missing attributes become ``''`` and set-valued
    attributes are joined with ``';'`` in sorted order, so the rows are identical
    from one run to the next (plain set iteration order is not stable across runs).

    :param dict config: configuration providing ``HMDB_attributes``.
    :param str directory: folder passed on to ``build_HMDB_dictionary``.
    :param dict metabolites: accession -> dict of values, as returned by ``extract_metabolites``.
    :return: tuple ``(entities, attributes)``: the set of row tuples and the attribute names.
    """
    entities = set()
    attributes = config['HMDB_attributes']
    for metid, values in metabolites.items():
        entity = [metid]
        for attr in attributes:
            value = values.get(attr, '')
            if isinstance(value, set):
                value = ";".join(sorted(value))
            entity.append(value)
        entities.add(tuple(entity))

    build_HMDB_dictionary(directory, metabolites)

    return entities, attributes
def build_relationships_from_HMDB(config, metabolites, mapping):
    """
    Build metabolite relationships from the configured HMDB associations.

    ``config['HMDB_associations']`` maps a metabolite field name to a list whose
    first item is the relationship type and whose optional second item is the key
    under which the relationships are grouped (the field name itself is used when
    absent). A partner whose lower-cased name is found in ``mapping`` is replaced by
    the mapped identifier; otherwise the raw text is kept.

    Set-valued fields are processed in sorted order so that the generated lists are
    reproducible between runs.

    :param dict config: configuration providing ``HMDB_associations``.
    :param dict metabolites: accession -> dict of values (strings or sets of strings).
    :param dict mapping: lower-cased term -> identifier.
    :return: ``defaultdict(list)`` of group key -> list of \
        ``(accession, partner, relationship_type, "HMDB")`` tuples.
    """
    relationships = defaultdict(list)
    associations = config['HMDB_associations']
    for metid, values in metabolites.items():
        for ass, spec in associations.items():
            if ass not in values:
                continue
            ident = spec[1] if len(spec) > 1 else ass
            found = values[ass]
            # A field holds either one string or a set of strings; treat both alike.
            partners = sorted(found) if isinstance(found, set) else [found]
            for partner in partners:
                partner = mapping.get(partner.lower(), partner)
                relationships[ident].append((metid, partner, spec[0], "HMDB"))

    return relationships
def build_HMDB_dictionary(directory, metabolites):
    """
    Write ``mapping.tsv`` in ``directory`` with one ``<accession>\\t<alias>`` line per alias.

    For every metabolite the lower-cased name, each lower-cased synonym and the
    ChEBI identifier (unchanged) are written when present. The "Metabolite" mapping
    is reset through ``mp.reset_mapping`` before writing and flagged through
    ``mp.mark_complete_mapping`` once the file has been closed.

    :param str directory: folder in which ``mapping.tsv`` is created.
    :param dict metabolites: accession -> dict of values, as returned by ``extract_metabolites``.
    """
    mapping_path = os.path.join(directory, "mapping.tsv")
    mp.reset_mapping(entity="Metabolite")

    with open(mapping_path, 'w', encoding='utf-8') as out:
        for metid, values in metabolites.items():
            if "name" in values:
                out.write("\t".join((metid, values["name"].lower())) + "\n")
            if "synonyms" in values:
                for alias in values["synonyms"]:
                    out.write("\t".join((metid, alias.lower())) + "\n")
            if "chebi_id" in values:
                out.write("\t".join((metid, values["chebi_id"])) + "\n")

    mp.mark_complete_mapping(entity="Metabolite")