Source code for src.graphdb_builder.databases.parsers.textminingParser

import os.path
import pandas as pd
from graphdb_builder import mapping as mp, builder_utils

#####################################
#   Textmining (JensenLab.org)      # 
#####################################


[docs]def parser(databases_directory, importDirectory, download=True): config = builder_utils.get_config(config_name="jensenlabConfig.yml", data_type='databases') outputfileName = "Publications.tsv" url = config['db_url'] ifile = config['organisms_file'] organisms = str(config['organisms']) directory = os.path.join(databases_directory, "Jensenlab") builder_utils.checkDirectory(os.path.join(directory, "textmining")) if download: builder_utils.downloadDB(url.replace("FILE", ifile), os.path.join(directory, "textmining")) ifile = os.path.join(directory, os.path.join("textmining", ifile)) valid_pubs = read_valid_pubs(organisms, ifile) entities, header = parse_PMC_list(config, os.path.join(directory, "textmining"), download=download, valid_pubs=valid_pubs) num_entities = len(entities) outputfile = os.path.join(importDirectory, outputfileName) builder_utils.write_entities(entities, header, outputfile) entities = None for qtype in config['db_mentions_types']: parse_mentions(config, directory, qtype, importDirectory, download) builder_utils.remove_directory(os.path.join(directory, "textmining")) return (num_entities, outputfile)
[docs]def read_valid_pubs(organisms, organisms_file): pubs = set() with open(organisms_file, 'r') as idbf: for line in idbf: data = line.rstrip('\r\n').split('\t') if str(data[0]) in organisms: pubs.update(set(data[1].split(" "))) return list(pubs)
[docs]def parse_PMC_list(config, directory, download=True, valid_pubs=None): url = config['PMC_db_url'] plinkout = config['pubmed_linkout'] entities = set() fileName = os.path.join(directory, url.split('/')[-1]) if download: builder_utils.downloadDB(url, directory) entities = pd.read_csv(fileName, sep=',', dtype=str, compression='gzip', low_memory=False) entities = entities[config['PMC_fields']] entities = entities[entities.iloc[:, 0].notnull()] entities = entities.set_index(list(entities.columns)[0]) if valid_pubs is not None: entities = entities.loc[valid_pubs] entities['linkout'] = [plinkout.replace("PUBMEDID", str(int(pubmedid))) for pubmedid in list(entities.index)] entities.index.names = ['ID'] entities['TYPE'] = 'Publication' entities = entities.reset_index() header = [c.replace(' ', '_').lower() if c not in ['ID', 'TYPE'] else c for c in list(entities.columns)] entities = entities.replace('\\\\', '', regex=True) entities = list(entities.itertuples(index=False)) return entities, header
[docs]def parse_mentions(config, directory, qtype, importDirectory, download=True): url = config['db_url'] ifile = config['db_mentions_files'][qtype] if qtype == "9606": mapping = mp.getSTRINGMapping(download=download) elif qtype == "-1": mapping = mp.getSTRINGMapping(source=config['db_sources']["Drug"], download=download, db="STITCH") filters = [] if qtype in config['db_mentions_filters']: filters = config['db_mentions_filters'][qtype] entity1, entity2 = config['db_mentions_types'][qtype] outputfile = os.path.join(importDirectory, entity1 + "_" + entity2 + "_mentioned_in_publication.tsv") if download: builder_utils.downloadDB(url.replace("FILE", ifile), os.path.join(directory, "textmining")) ifile = os.path.join(directory, os.path.join("textmining", ifile)) with open(outputfile, 'w') as f: f.write("START_ID\tEND_ID\tTYPE\n") with open(ifile, 'r') as idbf: for line in idbf: data = line.rstrip("\r\n").split('\t') id1 = data[0] pubmedids = data[1].split(" ") ident = [] if qtype == "9606": id1 = "9606."+id1 if id1 in mapping: ident = mapping[id1] elif qtype == "-1": if id1 in mapping: ident = mapping[id1] elif qtype == "-26": if id1.startswith("DOID"): ident = [id1] else: ident = [id1] for i in ident: if i not in filters: aux = pd.DataFrame(data={"Pubmedids": list(pubmedids)}) aux["START_ID"] = i aux["TYPE"] = "MENTIONED_IN_PUBLICATION" aux.to_csv(path_or_buf=f, sep='\t', header=False, index=False, quotechar='"', line_terminator='\n', escapechar='\\') aux = None