# Source code for src.graphdb_builder.databases.parsers.cancerGenomeInterpreterParser

import os.path
from collections import defaultdict
import zipfile
from graphdb_builder import builder_utils, mapping as mp
import re

#######################################
#   The Cancer Genome Interpreter     # 
#######################################
def parser(databases_directory, download=True):
    """
    Parse the Cancer Genome Interpreter (CGI) cancer biomarkers table into
    graph entities and relationships.

    The zip archive named by the configured URL is optionally downloaded into
    ``<databases_directory>/CancerGenomeInterpreter``, the configured variant
    file is read from it row by row, and the working directory is removed
    once parsing has finished.

    :param databases_directory: path under which the
        ``CancerGenomeInterpreter`` working directory is created.
    :param download: when True, the archive is fetched with
        ``builder_utils.downloadDB`` before parsing; when False, the archive
        must already be present in the working directory.
    :return: tuple ``(entities, relationships, entities_header,
        relationships_headers)`` where ``entities`` is a set of
        clinically-relevant-variant tuples, ``relationships`` maps a
        relationship name to a set of tuples, and both headers are taken
        unchanged from the parser configuration.
    """
    # Protein change at the end of the token: <non-digit><digits><non-digit>,
    # e.g. "V600E".
    variant_pattern = re.compile(r"(\D\d+\D)$")
    # Genomic identifier such as "chr7:g.140453136A>T".
    # NOTE(review): only numeric chromosomes match (chrX/chrY do not); rows
    # with such identifiers get empty genomic coordinates.
    genomic_pattern = re.compile(r"(chr\d+)\:g\.(\d+)(\w)>(\w)")

    config = builder_utils.get_config(config_name="cancerGenomeInterpreterConfig.yml", data_type='databases')
    url = config['cancerBiomarkers_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    amino_acids = config['amino_acids']
    mapping = mp.getMappingFromOntology(ontology="Disease", source=None)
    drugmapping = mp.getMappingForEntity("Drug")
    protein_mapping = mp.getMultipleMappingForEntity("Protein")

    file_name = config['cancerBiomarkers_variant_file']
    relationships = defaultdict(set)
    entities = set()
    directory = os.path.join(databases_directory, "CancerGenomeInterpreter")
    builder_utils.checkDirectory(directory)
    zip_path = os.path.join(directory, url.split('/')[-1])

    if download:
        builder_utils.downloadDB(url, directory)

    with zipfile.ZipFile(zip_path) as archive:
        if file_name in archive.namelist():
            with archive.open(file_name, 'r') as responses:
                # The first line is the column header.
                next(responses, None)
                for line in responses:
                    data = line.decode('utf-8').rstrip("\r\n").split("\t")
                    gene_variant = data[0].split(':')
                    if len(gene_variant) < 2:
                        # No "<gene>:<variants>" pair in the first column.
                        continue
                    gene = gene_variant[0]
                    variants = gene_variant[1].split(',')
                    # Columns 1 (alteration type) and 11 (status) are not used.
                    response = data[3]
                    drugs = data[10].split(';')
                    evidence = data[12]
                    tumors = data[16].split(';')
                    publications = data[17].split(';')
                    identifier = data[21]
                    prot_variant = data[22]
                    tumor_names = ",".join(tumors)

                    # Reset the genomic coordinates for every row so that a
                    # row whose identifier does not match never reuses the
                    # coordinates of a previous row (or hits an unbound name).
                    chromosome = position = reference = alternative = ""
                    alternative_names = [identifier]
                    matches = genomic_pattern.match(identifier)
                    if matches is not None:
                        chromosome, position, reference, alternative = matches.groups()
                        # Keep the part after the first ':' as an alternative
                        # name; values without a ':' (including the empty
                        # string) are skipped instead of raising IndexError.
                        prot_parts = prot_variant.split(':')
                        if len(prot_parts) > 1:
                            alternative_names.append(prot_parts[1])

                    valid_variants = []
                    if gene in protein_mapping:
                        # Translate each well-formed variant once per row;
                        # the result does not depend on the protein.
                        encoded_changes = [
                            amino_acids[change[0]] + change[1:-1] + amino_acids[change[-1]]
                            for change in variants
                            if variant_pattern.search(change)
                            and change[0] in amino_acids
                            and change[-1] in amino_acids
                        ]
                        alternative_label = ",".join(alternative_names)
                        for protein in protein_mapping[gene]:
                            for encoded in encoded_changes:
                                valid_variant = protein + '_p.' + encoded
                                valid_variants.append(valid_variant)
                                entities.add((valid_variant, "Clinically_relevant_variant", alternative_label, chromosome, position, reference, alternative, "", "", "CGI"))
                                relationships["known_variant_is_clinically_relevant"].add((valid_variant, valid_variant, "KNOWN_VARIANT_IS_CLINICALLY_RELEVANT", "CGI"))

                    for drug in drugs:
                        # Try the full name, then its first word, then
                        # everything after the first word; keep the raw name
                        # when none of them is a known drug.
                        words = drug.split(" ")
                        candidates = (
                            drug.lower(),
                            words[0].lower(),
                            " ".join(words[1:]).lower(),
                        )
                        for candidate in candidates:
                            if candidate in drugmapping:
                                drug = drugmapping[candidate]
                                break
                        relationships["targets"].add((drug, gene, "CURATED_TARGETS", evidence, response, tumor_names, "curated", "CGI"))

                        for valid_variant in valid_variants:
                            # Tumor names are comma-separated here as well,
                            # matching the "targets" relationship above.
                            relationships["targets_clinically_relevant_variant"].add((drug, valid_variant, "TARGETS_CLINICALLY_RELEVANT_VARIANT", evidence, response, tumor_names, "curated", "CGI"))

                    for tumor in tumors:
                        # NOTE(review): only tumors resolved through the
                        # Disease mapping are linked to variants - confirm
                        # this matches the intended behaviour.
                        if tumor.lower() in mapping:
                            tumor = mapping[tumor.lower()]
                            for valid_variant in valid_variants:
                                relationships["associated_with"].add((valid_variant, tumor, "ASSOCIATED_WITH", "curated", "curated", "CGI", len(publications)))

    builder_utils.remove_directory(directory)

    return (entities, relationships, entities_header, relationships_headers)