from analytics_core.viz import viz
import dash_core_components as dcc
import plotly.graph_objs as go
import plotly.subplots as tools
import pandas as pd
import numpy as np
from itertools import chain
from collections import defaultdict
from natsort import natsorted
def get_stats_data(filename, n=3):
    """
    Read the graph database stats file and keep only the last 'n' full and
    last 'n' partial independent imports.

    :param str filename: path to stats file (including filename and '.hdf' extension).
    :param int n: number of independent imports of each kind to keep.
    :return: Pandas DataFrame with the rows of the selected imports, plus the added
        columns 'Import_flag' ('full' or 'partial') and 'datetime'.
    :raises ValueError: if the store does not contain exactly two keys.
    """
    # The context manager guarantees the HDF5 handle is released even when
    # unpacking the keys or reading one of the tables fails.
    with pd.HDFStore(filename, 'r') as store:
        # NOTE(review): relies on the store listing the full-import table first
        # and the partial-import table second -- confirm against the writer.
        full, partial = list(store.keys())
        df_full = store[full]
        df_partial = store[partial]

    df_full['Import_flag'] = 'full'
    df_partial['Import_flag'] = 'partial'
    df = pd.concat([df_full, df_partial])
    # 'date' and 'time' are concatenated as strings before parsing.
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])

    imp = select_last_n_imports(df, n=n)
    return df[df['import_id'].isin(imp)].reset_index(drop=True)
def select_last_n_imports(stats_file, n=3):
    """
    Pick the ids of the most recent independent imports, 'n' per import kind.

    :param stats_file: pandas DataFrame with stats data (needs the columns
        'datetime', 'import_id' and 'Import_flag').
    :param int n: number of independent imports to select for each kind.
    :return: List of import ids: the newest partial imports first, followed by
        the newest full imports.
    """
    newest_first = stats_file[['datetime', 'import_id', 'Import_flag']].sort_values(
        'datetime', ascending=False)
    # One row per import: the row carrying its latest timestamp.
    latest = newest_first.drop_duplicates(subset=['import_id'], keep='first')

    def newest_ids(flag):
        matching = latest.loc[latest['Import_flag'] == flag, 'import_id']
        return matching.iloc[:n].tolist()

    return newest_ids('partial') + newest_ids('full')
def remove_legend_duplicates(figure):
    """
    Show each trace name only once in the legend.

    The first trace carrying a given name keeps its legend entry; every later
    trace with the same name gets ``showlegend=False``. The figure is modified
    in place and nothing is returned.

    :param figure: plotly graph object figure.
    """
    already_listed = set()
    for trace in figure.data:
        label = trace['name']
        trace.update(showlegend=label not in already_listed)
        already_listed.add(label)
def _files_per_dataset(stats, import_type):
    """Map each dataset to the tuple of filenames imported with the given 'Import_type'."""
    rows = stats[stats['Import_type'] == import_type]
    # dict.fromkeys removes duplicate pairs while keeping first-seen order, so
    # the resulting tuples are reproducible between runs.
    pairs = dict.fromkeys(zip(rows['filename'], rows['dataset']))
    grouped = defaultdict(list)
    for filename, dataset in pairs:
        grouped[dataset].append(filename)
    return dict(natsorted((dataset, tuple(files)) for dataset, files in grouped.items()))


def get_databases_entities_relationships(stats_file, key='full', options='databases'):
    """
    Build a dictionary from the stats file; its keys and values depend on 'options'.

    - 'dates': keys are import dates (string of the earliest 'datetime' of each
      import), values are tuples with the datasets imported at that date;
    - 'entities': keys are datasets, values are tuples of filenames whose
      'Import_type' is 'entity';
    - 'relationships': keys are datasets, values are tuples of filenames whose
      'Import_type' is 'relationships';
    - 'databases': keys are datasets, values are tuples pairing the 'entities'
      value with the 'relationships' value of that dataset.

    Datasets present in 'stats_file' but without entity (or relationship) files
    in the selected imports get the empty string '' as placeholder value.

    :param stats_file: pandas DataFrame with stats data.
    :param str key: use only full, partial or both kinds of imports ('full', 'partial', 'all').
    :param str options: name of the variables to be used as keys in the output
        dictionary ('dates', 'databases', 'entities' or 'relationships').
    :return: Dictionary, or None when 'options' is not one of the supported names.
    :raises ValueError: if 'key' is not 'full', 'partial' or 'all'.
    """
    if key in ('full', 'partial'):
        stats = stats_file[stats_file['Import_flag'] == key]
    elif key == 'all':
        stats = stats_file
    else:
        # Previously an unknown key left 'stats' unbound and failed later with
        # an UnboundLocalError.
        raise ValueError("key must be 'full', 'partial' or 'all', got {!r}".format(key))

    datasets_per_date = defaultdict(list)
    for _, group in stats.groupby('import_id'):
        # An import is dated by its earliest recorded timestamp.
        date = str(group['datetime'].sort_values().iloc[0])
        datasets_per_date[date].extend(group['dataset'].unique())
    d_dat = dict(natsorted((date, tuple(names)) for date, names in datasets_per_date.items()))

    d_ent = _files_per_dataset(stats, 'entity')
    d_rel = _files_per_dataset(stats, 'relationships')

    # NOTE(review): the placeholder loop walks the unfiltered 'stats_file', so
    # datasets seen only in the other import kind are listed too -- kept as in
    # the original; confirm this is intended.
    for dataset in stats_file['dataset'].unique():
        d_ent.setdefault(dataset, '')
        d_rel.setdefault(dataset, '')

    files_per_database = defaultdict(list)
    for dataset, files in chain(d_ent.items(), d_rel.items()):
        files_per_database[dataset].append(files)
    d_dbs_filename = dict(natsorted(
        (dataset, tuple(files)) for dataset, files in files_per_database.items()))

    results = {
        'entities': d_ent,
        'relationships': d_rel,
        'databases': d_dbs_filename,
        'dates': d_dat,
    }
    return results.get(options)
def set_colors(dictionary):
    """
    Attribute a random RGB color to every item found in the values of a dictionary.

    Each value of 'dictionary' is iterated, so values are expected to be
    iterables of items (an empty string contributes nothing). When an item
    occurs more than once, the color drawn last is the one kept.

    :param dict dictionary: dictionary with variables to be attributed a color, as values.
    :return: Dictionary where the items of the 'dictionary' values are keys and
        random colors formatted as 'rgb(r, g, b)' are the values.
    """
    colors = {}
    for item in chain.from_iterable(dictionary.values()):
        # .tolist() converts the NumPy integers to plain ints; formatting the
        # NumPy scalars directly yields 'rgb(np.int64(12), ...)' on NumPy >= 2.
        red, green, blue = np.random.choice(range(256), size=3).tolist()
        colors[item] = 'rgb({}, {}, {})'.format(red, green, blue)
    return colors
def get_totals_per_date(stats_file, key='full', import_types=False):
    """
    Summarize the stats file into import dates and imported numbers.

    Every 'import_id' contributes one row, dated with the string of its
    earliest 'datetime'.

    :param stats_file: pandas DataFrame with stats data.
    :param str key: use only full or partial imports ('full', 'partial').
    :param bool import_types: if True, break the numbers down into the columns
        'entity' and 'relationships'; otherwise return a single 'total' column.
    :return: Pandas DataFrame indexed by 'date' with the imported numbers as columns.
    :raises ValueError: if 'key' is neither 'full' nor 'partial'.
    """
    if key not in ('full', 'partial'):
        # Previously an unknown key left 'stats' unbound and failed later with
        # an UnboundLocalError.
        raise ValueError("key must be 'full' or 'partial', got {!r}".format(key))
    stats = stats_file[stats_file['Import_flag'] == key]

    # Only the requested table is built (the original always computed the
    # totals first and discarded them when import_types was set).
    cols = ['date', 'entity', 'relationships'] if import_types else ['date', 'total']
    counts = []
    for _, group in stats.groupby('import_id'):
        date = str(group['datetime'].sort_values().iloc[0])
        numbers = group['Imported_number']
        if import_types:
            ent = numbers[group['Import_type'] == 'entity'].sum()
            rel = numbers[group['Import_type'] == 'relationships'].sum()
            counts.append((date, ent, rel))
        else:
            counts.append((date, numbers.sum()))

    return pd.DataFrame(counts, columns=cols).set_index('date')
def get_imports_per_database_date(stats_file):
    """
    Condense the stats file into imported entities and relationships per
    import date and dataset.

    Each 'import_id' is dated with the string of its earliest 'datetime'. Rows
    are ordered by date and then by the total imported number of the dataset.

    :param stats_file: pandas DataFrame with stats data.
    :return: Pandas DataFrame indexed by ('date', 'dataset') with the columns
        'entities' and 'relationships'.
    """
    records = []
    for _, batch in stats_file.groupby('import_id'):
        first_seen = str(batch['datetime'].sort_values().iloc[0])
        for dataset, rows in batch.groupby('dataset'):
            numbers = rows['Imported_number']
            kinds = rows['Import_type']
            records.append((
                first_seen,
                dataset,
                numbers[kinds == 'entity'].sum(),
                numbers[kinds == 'relationships'].sum(),
                numbers.sum(),
            ))

    summary = pd.DataFrame(
        records, columns=['date', 'dataset', 'entities', 'relationships', 'total'])
    # 'total' only serves as secondary sort key and is dropped afterwards.
    summary = summary.sort_values(['date', 'total']).set_index(['date', 'dataset'])
    return summary.drop(columns='total')
def plot_total_number_imported(stats_file, plot_title):
    """
    Build the overview plot of total imported numbers per date, with the full
    and the partial imports drawn as line traces.

    :param stats_file: pandas DataFrame with stats data.
    :param str plot_title: title of the plot.
    :return: dcc.Graph component (id 'total imports') wrapping the line plot figure.
    """
    totals_full = get_totals_per_date(stats_file, key='full', import_types=False).sort_index()
    totals_partial = get_totals_per_date(stats_file, key='partial', import_types=False).sort_index()

    traces = (viz.getPlotTraces(totals_full, key='full', type='lines')
              + viz.getPlotTraces(totals_partial, key='partial', type='lines'))
    # The helper may hand back nested lists of traces; flatten one level if so.
    if type(traces[0]) is list:
        traces = [trace for group in traces for trace in group]

    # The title is drawn as a paper-anchored annotation instead of the layout title.
    title_annotation = {
        'text': '<b>{}<b>'.format(plot_title),
        'font': {'family': 'Arial', 'size': 18},
        'showarrow': False,
        'xref': 'paper', 'x': -0.06, 'xanchor': 'left',
        'yref': 'paper', 'y': 1.15, 'yanchor': 'top',
    }
    layout = go.Layout(
        title='',
        xaxis={'title': ''},
        yaxis={'title': 'Number of imports'},
        legend={'font': {'size': 11}},
        margin=go.layout.Margin(l=80, r=40, t=100, b=50),
        annotations=[title_annotation],
    )

    fig = go.Figure(data=traces, layout=layout)
    fig.layout.template = 'plotly_white'
    return dcc.Graph(id='total imports', figure=fig)
def plot_total_numbers_per_date(stats_file, plot_title):
    """
    Plot the number of entities and relationships imported per date, using
    scaled markers that reflect the ratios between the numbers.

    :param stats_file: pandas DataFrame with stats data.
    :param str plot_title: title of the plot.
    :return: dcc.Graph component (id 'entities-relationships per date') wrapping
        the scatter plot figure with scaled markers.
    """
    # NOTE(review): '^' is bitwise XOR in Python, so this evaluates to 994.0,
    # not to a power of ten. Value kept as-is; confirm the intended factor.
    div_factor = float(10 ^ 1000)

    numbers_full = get_totals_per_date(stats_file, key='full', import_types=True)
    numbers_partial = get_totals_per_date(stats_file, key='partial', import_types=True)

    traces = (viz.getPlotTraces(numbers_full, key='full', type='scaled markers',
                                div_factor=div_factor)
              + viz.getPlotTraces(numbers_partial, key='partial', type='scaled markers',
                                  div_factor=div_factor))
    # The helper may hand back nested lists of traces; flatten one level if so.
    if type(traces[0]) is list:
        traces = [trace for group in traces for trace in group]

    # The title is drawn as a paper-anchored annotation instead of the layout title.
    title_annotation = {
        'text': '<b>{}<b>'.format(plot_title),
        'font': {'family': 'Arial', 'size': 18},
        'showarrow': False,
        'xref': 'paper', 'x': -0.06, 'xanchor': 'left',
        'yref': 'paper', 'y': 1.15, 'yanchor': 'top',
    }
    layout = go.Layout(
        title='',
        xaxis={'showgrid': True},
        yaxis={'title': 'Imported entities/relationships'},
        legend={'font': {'size': 11}},
        height=550,
        margin=go.layout.Margin(l=80, r=40, t=100, b=100),
        annotations=[title_annotation],
    )

    fig = go.Figure(data=traces, layout=layout)
    fig.layout.template = 'plotly_white'
    return dcc.Graph(id='entities-relationships per date', figure=fig)
def plot_databases_numbers_per_date(stats_file, plot_title, key='full', dropdown=False, dropdown_options='dates'):
    """
    Grouped horizontal barplot showing the number of entities and relationships
    imported from each biomedical database.

    :param stats_file: pandas DataFrame with stats data.
    :param str plot_title: title of the plot.
    :param str key: use only full or partial imports ('full', 'partial').
    :param bool dropdown: add dropdown menu to figure or not.
    :param str dropdown_options: name of the variables to be used as options in the
        dropdown menu ('dates', 'databases', 'entities' or 'relationships').
    :return: dcc.Graph component (id 'databases imports <key>') wrapping the
        horizontal barplot figure.
    :raises ValueError: if 'key' is neither 'full' nor 'partial'.
    """
    if key not in ('full', 'partial'):
        # The original printed 'Syntax error' and then crashed on an unbound
        # variable; fail immediately with an explicit message instead.
        raise ValueError("key must be 'full' or 'partial', got {!r}".format(key))
    stats = stats_file[stats_file['Import_flag'] == key]

    dropdown_options = get_databases_entities_relationships(stats_file, key=key, options=dropdown_options)
    data = get_imports_per_database_date(stats)
    dates = data.index.get_level_values(0)

    # One group of bar traces per dropdown option; rows are selected by matching
    # the option against the first index level (the import date).
    traces = []
    for option in dropdown_options.keys():
        df = data.iloc[dates.str.contains(option)].droplevel(0)
        traces.append(viz.getPlotTraces(df, key=key, type='bars', horizontal=True))
    # Guarded so that an empty selection yields an empty figure, not an IndexError.
    if traces and isinstance(traces[0], list):
        traces = list(chain.from_iterable(traces))

    layout = go.Layout(title='', xaxis={'showgrid': True, 'type': 'log', 'title': 'Imported entities/relationships'},
                       legend={'font': {'size': 11}}, height=600, margin=go.layout.Margin(l=40, r=40, t=80, b=100),
                       annotations=[dict(text='<b>{}<b>'.format(plot_title), font=dict(family='Arial', size=18),
                                         showarrow=False, xref='paper', x=-0.17, xanchor='left', yref='paper', y=1.2, yanchor='top')])

    fig = go.Figure(data=traces, layout=layout)
    fig['layout']['template'] = 'plotly_white'

    if dropdown:
        updatemenus = get_dropdown_menu(fig, dropdown_options, add_button=True, equal_traces=True, number_traces=2)
        fig.layout.update(go.Layout(updatemenus=updatemenus))

    # Colors are assigned by first appearance of each trace name, which is
    # reproducible between runs, and the palette wraps around so more than five
    # distinct names no longer raise a KeyError.
    palette = ['red', 'blue', 'green', 'yellow', 'orange']
    names = list(dict.fromkeys(trace['name'] for trace in fig['data']))
    colors = {name: palette[idx % len(palette)] for idx, name in enumerate(names)}
    for trace in fig['data']:
        trace.update(marker=dict(color=colors[trace['name']]))

    # NOTE: remove_legend_duplicates(fig) was disabled in the original and stays disabled.
    return dcc.Graph(id='databases imports {}'.format(key), figure=fig)
def _append_import_traces(fig, frame, color, label, col):
    """Add the imported-number (row 1) and file-size (row 2) series of 'frame' to column 'col' of 'fig'."""
    if frame['Imported_number'].empty:
        return
    # Without a color the marker is left unset so plotly picks its default.
    marker = {} if color is None else {'color': color}
    for row, column, extra in ((1, 'Imported_number', {}), (2, 'file_size', {'showlegend': False})):
        # Only the row-1 trace keeps a legend entry, so each file is listed once.
        fig.append_trace(go.Scattergl(visible=True,
                                      x=frame['datetime'],
                                      y=frame[column],
                                      mode='markers+lines',
                                      marker=marker,
                                      name=label,
                                      **extra), row, col)


def plot_import_numbers_per_database(stats_file, plot_title, key='full', subplot_titles = ('',''), colors=True, plots_1='entities', plots_2='relationships', dropdown=True, dropdown_options='databases'):
    """
    Create a 2x2 plotly figure with the breakdown of imported numbers (top row)
    and size of the respective files (bottom row), per database file; entities
    are drawn in the left column and relationships in the right column.

    :param stats_file: pandas DataFrame with stats data.
    :param str plot_title: title of the plot.
    :param str key: use only full or partial imports ('full', 'partial').
    :param tuple subplot_titles: titles of the subplots (up to four strings, in the
        order top-left, top-right, bottom-left, bottom-right); missing titles
        are treated as empty strings.
    :param bool colors: attribute a random color to each file; if False, plotly's
        default colors are used.
    :param str plots_1: option passed to get_databases_entities_relationships to
        collect the files colored in the left column.
    :param str plots_2: option passed to get_databases_entities_relationships to
        collect the files colored in the right column.
    :param bool dropdown: add dropdown menu to figure or not.
    :param str dropdown_options: name of the variables to be used as options in the
        dropdown menu ('dates', 'databases', 'entities' or 'relationships').
    :return: dcc.Graph component (id 'imports-breakdown per database <key>')
        wrapping the multi-scatterplot figure.
    :raises ValueError: if 'key' is neither 'full' nor 'partial'.
    """
    if key not in ('full', 'partial'):
        # The original printed 'Syntax error' and then crashed on an unbound
        # variable; fail immediately with an explicit message instead.
        raise ValueError("key must be 'full' or 'partial', got {!r}".format(key))
    stats = stats_file[stats_file['Import_flag'] == key]

    ent = get_databases_entities_relationships(stats_file, key=key, options=plots_1)
    rel = get_databases_entities_relationships(stats_file, key=key, options=plots_2)
    dropdown_options = get_databases_entities_relationships(stats_file, key=key, options=dropdown_options)

    # With colors=False the lookups below simply find nothing (the original
    # raised a NameError in that case).
    ent_colors = set_colors(ent) if colors else {}
    rel_colors = set_colors(rel) if colors else {}

    # Four titles are needed below; the default only provides two, so pad with
    # empty strings instead of failing with an IndexError.
    titles = tuple(subplot_titles)[:4]
    titles = titles + ('',) * (4 - len(titles))

    fig = tools.make_subplots(2, 2, subplot_titles=titles, vertical_spacing=0.18, horizontal_spacing=0.2)

    for (_, filename), group in stats.groupby(['dataset', 'filename']):
        # Keep the earliest record of each file within every import.
        group = group.sort_values(['import_id', 'datetime']).drop_duplicates(
            ['dataset', 'import_id', 'filename'], keep='first', inplace=False)
        label = filename.split('.')[0]
        _append_import_traces(fig, group[group['Import_type'] == 'entity'],
                              ent_colors.get(filename), label, 1)
        _append_import_traces(fig, group[group['Import_type'] == 'relationships'],
                              rel_colors.get(filename), label, 2)

    fig.layout.update(go.Layout(legend={'orientation': 'v', 'font': {'size': 11}},
                                height=700, margin=go.layout.Margin(l=20, r=20, t=150, b=60)))

    # Main title plus one hand-positioned title per subplot; this assignment
    # replaces whatever annotations make_subplots created.
    annotations = [dict(text='<b>{}<b>'.format(plot_title), font=dict(family='Arial', size=18),
                        showarrow=False, xref='paper', x=-0.07, xanchor='left', yref='paper', y=1.3, yanchor='top')]
    positions = ((0.23, 1.0), (0.78, 1.0), (0.23, 0.44), (0.78, 0.44))
    for text, (x, y) in zip(titles, positions):
        annotations.append({'font': {'size': 14}, 'showarrow': False, 'text': text,
                            'x': x, 'xanchor': 'center', 'xref': 'paper',
                            'y': y, 'yanchor': 'bottom', 'yref': 'paper'})
    fig.layout['annotations'] = annotations
    fig['layout']['template'] = 'plotly_white'

    if dropdown:
        updatemenus = get_dropdown_menu(fig, dropdown_options, add_button=True, equal_traces=False)
        fig.layout.update(go.Layout(updatemenus=updatemenus))

    return dcc.Graph(id='imports-breakdown per database {}'.format(key), figure=fig)