Source code for src.graphdb_builder.databases.parsers.drugBankParser
import os.path
from collections import defaultdict
from lxml import etree
import zipfile
from graphdb_builder import mapping as mp, builder_utils
#########################
# Drug Bank #
#########################
[docs]def parser(databases_directory):
config = builder_utils.get_config(config_name="drugBankConfig.yml", data_type='databases')
directory = os.path.join(databases_directory, "DrugBank")
builder_utils.checkDirectory(directory)
drugs = extract_drugs(config, directory)
build_DrugBank_dictionary(config, directory, drugs)
relationships = build_relationships_from_DrugBank(config, drugs)
entities, attributes = build_drug_entity(config, drugs)
entities_header = ['ID'] + attributes
relationships_headers = config['relationships_headers']
return (entities, relationships, entities_header, relationships_headers)
[docs]def extract_drugs(config, directory):
drugs = {}
prefix = '{http://www.drugbank.ca}'
url = config['DrugBank_url']
fileName = os.path.join(directory, url.split('/')[-1])
fields = config['DrugBank_fields']
parentFields = config['DrugBank_parentFields']
structuredFields = config['DrugBank_structures']
vocabulary = parseDrugBankVocabulary(config, directory)
with zipfile.ZipFile(fileName, 'r') as zipped:
for zfile in zipped.namelist():
zipped.extract(member=zfile, path=directory)
xfile = os.path.join(directory, zfile)
with open(xfile, 'rb') as f:
context = etree.iterparse(f, events=("end",), tag=prefix+"drug")
for a, elem in context:
synonyms = set()
values = {child.tag.replace(prefix, ''): child.text for child in elem.iterchildren() if child.tag.replace(prefix, '') in fields and child.text is not None}
if "drugbank-id" in values:
synonyms.add(values["drugbank-id"])
for child in elem.iterchildren():
if child.tag.replace(prefix, '') in parentFields:
label = child.tag.replace(prefix, '')
values[label] = []
for intchild in child.iter():
if intchild.text is not None and intchild.text.strip() != "":
if label in structuredFields:
if intchild.tag.replace(prefix, '') in structuredFields[label]:
if label == "external-identifiers":
synonyms.add(intchild.text)
else:
values[label].append(intchild.text)
elif intchild.tag.replace(prefix, '') in fields and intchild.text:
values[label].append(intchild.text)
if "drugbank-id" in values and len(values) > 2:
if values["drugbank-id"] in vocabulary:
values["id"] = vocabulary[values["drugbank-id"]]
synonyms.add(values["drugbank-id"])
#values["alt_drugbank-id"] = vocabulary[values['id']]
values["synonyms"] = list(synonyms)
drugs[values["id"]] = values
return drugs
[docs]def parseDrugBankVocabulary(config, directory):
vocabulary = {}
url = config['DrugBank_vocabulary_url']
fileName = os.path.join(directory, url.split('/')[-1])
with zipfile.ZipFile(fileName, 'r') as zipped:
for f in zipped.namelist():
with zipped.open(f) as vf:
# with open(os.path.join(directory,f), 'r') as vf:
for line in vf:
data = line.decode('utf-8').rstrip('\r\n').split(',')
primary = data[0]
secondaries = data[1].split(' | ')
for sec in secondaries:
vocabulary[sec] = primary
vocabulary[primary] = primary
return vocabulary
[docs]def build_relationships_from_DrugBank(config, drugs):
relationships = defaultdict(list)
associations = config['DrugBank_associations']
for did in drugs:
for ass in associations:
ident = ass
if len(associations[ass]) > 1:
ident = associations[ass][1]
if ass in drugs[did]:
if type(drugs[did][ass]) == list:
partners = drugs[did][ass]
if ass == "drug-interactions":
partners = zip(partners[0::2], partners[1::2])
elif ass in ["snp-effects", 'snp-adverse-drug-reactions']:
partners = zip(partners[0::3], partners[1::3], partners[2::3])
elif ass == "targets":
partners = zip(partners[0::2], partners[1::2])
partners = [p for r, p in partners if r == "UniProtKB"]
for partner in partners:
rel = (did, partner, associations[ass][0], "DrugBank")
relationships[ident].append(tuple(builder_utils.flatten(rel)))
else:
partner = drugs[did][ass]
relationships[ident].append((did, partner, associations[ass][0], "DrugBank"))
return relationships
[docs]def build_drug_entity(config, drugs):
entities = set()
attributes = config['DrugBank_attributes']
properties = config['DrugBank_exp_prop']
allAttr = attributes[:-1] + [p.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '') for p in properties]
for did in drugs:
entity = []
entity.append(did)
for attr in attributes:
if attr in drugs[did]:
if type(drugs[did][attr]) == list:
if attr == "experimental-properties":
newAttr = dict(zip(drugs[did][attr][0::2], drugs[did][attr][1::2]))
for prop in properties:
if prop in newAttr:
entity.append(newAttr[prop])
else:
entity.append('')
else:
lattr = "|".join(drugs[did][attr])
entity.append(lattr)
else:
entity.append(drugs[did][attr])
else:
entity.append('')
entities.add(tuple(entity))
return entities, allAttr
[docs]def build_DrugBank_dictionary(config, directory, drugs):
filename = config['DrugBank_dictionary_file']
outputfile = os.path.join(directory, filename)
mp.reset_mapping(entity="Drug")
with open(outputfile, 'w', encoding='utf-8') as out:
for did in drugs:
if "name" in drugs[did]:
name = drugs[did]["name"]
out.write(did+"\t"+name.lower()+"\n")
if "synonyms" in drugs[did]:
for synonym in drugs[did]["synonyms"]:
out.write(did+"\t"+synonym.lower()+"\n")
mp.mark_complete_mapping(entity="Drug")
if __name__ == "__main__":
parser("../../../../data/databases/")