# Source code for src.report_manager.apps.imports

from analytics_core.viz import viz
import dash_core_components as dcc
import plotly.graph_objs as go
import plotly.subplots as tools
import pandas as pd
import numpy as np
from itertools import chain
from collections import defaultdict
from natsort import natsorted


def get_stats_data(filename, n=3):
    """
    Reads graph database stats file and filters for the last 'n' full and partial independent \
    imports, returning a Pandas DataFrame.

    The store is expected to hold exactly two tables: the first key reported by the store is \
    treated as the full-import table and the second one as the partial-import table.

    :param str filename: path to stats file (including filename and '.hdf' extension).
    :param int n: number of independent imports to plot.
    :return: Pandas DataFrame restricted to the selected imports, with the added columns \
             'Import_flag' ('full' or 'partial') and 'datetime' (parsed from 'date' and 'time').
    :raises ValueError: if the store does not contain exactly two keys.
    """
    # The context manager guarantees the file handle is released even when
    # unpacking the keys or reading one of the tables raises.
    with pd.HDFStore(filename, 'r') as store:
        full, partial = list(store.keys())
        df_full = store[full]
        df_partial = store[partial]

    df_full['Import_flag'] = 'full'
    df_partial['Import_flag'] = 'partial'
    df = pd.concat([df_full, df_partial])
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])

    # Keep only the rows that belong to the most recent imports of each kind.
    selected_ids = select_last_n_imports(df, n=n)
    return df[df['import_id'].isin(selected_ids)].reset_index(drop=True)


def select_last_n_imports(stats_file, n=3):
    """
    Picks the import ids of the 'n' most recent full imports and the 'n' most recent partial imports.

    Each import id is represented by its most recent 'datetime' row.

    :param stats_file: pandas DataFrame with stats data (needs the columns 'datetime', \
                       'import_id' and 'Import_flag').
    :param int n: number of independent imports to select for each kind of import.
    :return: List of import ids, partial imports first, then full imports, newest first within each kind.
    """
    newest_first = stats_file[['datetime', 'import_id', 'Import_flag']].sort_values('datetime', ascending=False)
    # One row per import: the newest one, because of the descending sort above.
    one_per_import = newest_first.drop_duplicates(['import_id'], keep='first')

    def newest_ids(flag):
        flagged = one_per_import[one_per_import['Import_flag'] == flag]
        return flagged['import_id'].iloc[:n].tolist()

    return newest_ids('partial') + newest_ids('full')


def remove_legend_duplicates(figure):
    """
    Shows each trace name only once in the legend.

    The first trace carrying a given name keeps its legend entry; every later trace with the \
    same name gets 'showlegend' switched off. The figure is modified in place.

    :param figure: plotly graph object figure.
    """
    already_listed = set()
    for trace in figure.data:
        label = trace['name']
        trace.update(showlegend=label not in already_listed)
        already_listed.add(label)


def get_databases_entities_relationships(stats_file, key='full', options='databases'):
    """
    Builds dictionary from stats file. Depending on 'options', keys and values can differ. \
    If *options* is set to 'dates', keys are dates of the imports and values are databases imported at each date; \
    if 'databases', keys are databases and values are entities and relationships created from each database; \
    if 'entities', keys are databases and values are entities created from each database; \
    if 'relationships', keys are databases and values are relationships created from each database.

    Databases present in 'stats_file' that have no entity (or relationship) file in the selected \
    imports are still listed, with an empty string as value.

    :param stats_file: pandas DataFrame with stats data.
    :param str key: use only full, partial or both kinds of imports ('full', 'partial', 'all').
    :param str options: name of the variables to be used as keys in the output dictionary ('dates', \
                        'databases', 'entities' or 'relationships').
    :return: Dictionary, or None when 'options' is not one of the supported names.
    :raises ValueError: if 'key' is not 'full', 'partial' or 'all'.
    """
    if key == 'all':
        stats = stats_file
    elif key in ('full', 'partial'):
        stats = stats_file[stats_file['Import_flag'] == key]
    else:
        # Previously an unknown key fell through and failed later with an UnboundLocalError.
        raise ValueError("key must be 'full', 'partial' or 'all', got {!r}".format(key))

    def _group(pairs):
        """Collect (key, value) pairs into a naturally sorted dict of tuples."""
        grouped = defaultdict(list)
        for group_key, value in pairs:
            grouped[group_key].append(value)
        return dict(natsorted((group_key, tuple(values)) for group_key, values in grouped.items()))

    def _files_per_dataset(import_type):
        """Map every dataset to the files of the given import type created from it."""
        rows = stats[stats['Import_type'] == import_type]
        # dict.fromkeys removes duplicates while keeping first-seen order, so the
        # order of the files in each tuple no longer depends on set iteration order.
        unique_pairs = dict.fromkeys(zip(rows['dataset'], rows['filename']))
        files = _group(unique_pairs)
        # Datasets without files of this type are appended with an empty placeholder.
        for dataset in stats_file['dataset'].unique():
            if dataset not in files:
                files[dataset] = ''
        return files

    date_pairs = []
    for _, import_rows in stats.groupby('import_id'):
        # An import is labelled with its earliest timestamp.
        date = str(import_rows['datetime'].sort_values().iloc[0])
        date_pairs.extend((date, dataset) for dataset in import_rows['dataset'].unique())
    d_dat = _group(date_pairs)

    d_ent = _files_per_dataset('entity')
    d_rel = _files_per_dataset('relationships')

    # Per dataset: the entity files first, then the relationship files.
    d_dbs_filename = _group(chain(d_ent.items(), d_rel.items()))

    results = {'entities': d_ent,
               'relationships': d_rel,
               'databases': d_dbs_filename,
               'dates': d_dat}
    return results.get(options)


def set_colors(dictionary):
    """
    This function takes the values in a dictionary and attributes them an RGB color.

    :param dict dictionary: dictionary whose values are iterables of the variables to be \
                            attributed a color (empty values contribute nothing).
    :return: Dictionary where the elements of the 'dictionary' values are keys and random RGB \
             colors, formatted as 'rgb(r, g, b)', are the values.
    """
    colors = {}
    for item in chain(*dictionary.values()):
        # Convert to plain Python ints before formatting: with NumPy >= 2 the repr of a
        # NumPy integer is 'np.int64(12)', which would leak into the color string.
        red, green, blue = np.random.choice(range(256), size=3).tolist()
        colors[item] = 'rgb({}, {}, {})'.format(red, green, blue)

    return colors


def get_dropdown_menu(fig, options_dict, add_button=True, equal_traces=True, number_traces=2):
    """
    Builds a list for the dropdown menu, based on a plotly figure traces and a dictionary with \
    the options to be used in the dropdown.

    Traces are assigned to the options consecutively, in the iteration order of 'options_dict'.

    :param fig: plotly graph object figure.
    :param options_dict: dictionary where keys are used as dropdown options and values data points.
    :param bool add_button: add option to display all dropdown options simultaneously.
    :param bool equal_traces: defines if all dropdown options have the same number of traces each. \
                                If True, define 'number_traces' as well. If False, each option gets \
                                two traces per element found in its 'options_dict' value (an iterable \
                                of iterables).
    :param int number_traces: number of traces created for each 'options_dict' key.
    :return: List with a single 'updatemenus' dictionary. Each dictionary within its 'buttons' corresponds \
            to one dropdown menu option and contains information on which traces are visible, label and method.
    """
    total_traces = len(fig['data'])
    buttons = []

    start = 0
    for option, values in options_dict.items():
        if equal_traces:
            count = number_traces
        else:
            # Two traces per element of the nested value.
            count = 2 * sum(1 for group in values for _ in group)

        visible = [False] * total_traces
        # NOTE(review): a slice running past the end of the list lengthens it, so the
        # options are expected to account for no more traces than the figure holds.
        visible[start:start + count] = [True] * count
        start += count

        # str() on the title as well, so non-string keys no longer raise a TypeError.
        buttons.append(dict(label=str(option),
                            method='update',
                            args=[{'visible': visible},
                                  {'title': 'Date: ' + str(option)}]))

    if add_button:
        buttons.append(dict(label='All',
                            method='update',
                            args=[{'visible': [True] * total_traces}, {'title': 'All'}]))

    # The last button ('All' when present) is the one selected initially.
    updatemenus = [dict(active=len(buttons) - 1,
                        buttons=buttons,
                        direction='down',
                        showactive=True, x=-0.17, xanchor='left', y=1.1, yanchor='top')]

    return updatemenus


def get_totals_per_date(stats_file, key='full', import_types=False):
    """
    Summarizes stats file to a Pandas DataFrame with import dates and total number of \
    imported entities and relationships.

    Each import is labelled with the earliest 'datetime' among its rows.

    :param stats_file: pandas DataFrame with stats data.
    :param str key: use only full or partial imports ('full', 'partial').
    :param bool import_types: breakdown importing stats into entities or relationships related.
    :return: Pandas DataFrame with independent import dates as rows (index 'date') and imported \
             numbers as columns: 'total', or 'entity' and 'relationships' when 'import_types' is True.
    :raises ValueError: if 'key' is not 'full' or 'partial'.
    """
    if key not in ('full', 'partial'):
        # Previously an unknown key failed later with an UnboundLocalError.
        raise ValueError("key must be 'full' or 'partial', got {!r}".format(key))
    stats = stats_file[stats_file['Import_flag'] == key]

    cols = ['date', 'entity', 'relationships'] if import_types else ['date', 'total']

    counts = []
    for _, import_rows in stats.groupby('import_id'):
        date = str(import_rows['datetime'].sort_values().iloc[0])
        numbers = import_rows['Imported_number']
        if import_types:
            ent = numbers[import_rows['Import_type'] == 'entity'].sum()
            rel = numbers[import_rows['Import_type'] == 'relationships'].sum()
            counts.append((date, ent, rel))
        else:
            counts.append((date, numbers.sum()))

    return pd.DataFrame(counts, columns=cols).set_index('date')


def get_imports_per_database_date(stats_file):
    """
    Condenses the stats data into one row per import date and database, holding the number of \
    entities and the number of relationships imported from that database.

    Each import is labelled with the earliest 'datetime' among its rows. Rows are ordered by \
    date and, within a date, by the total imported number (ascending).

    :param stats_file: pandas DataFrame with stats data.
    :return: Pandas DataFrame indexed by ('date', 'dataset') with the columns 'entities' and 'relationships'.
    """
    def imported(rows, import_type):
        return rows.loc[(rows['Import_type'] == import_type), 'Imported_number'].sum()

    records = []
    for _, import_rows in stats_file.groupby(['import_id']):
        first_seen = str(import_rows['datetime'].sort_values().iloc[0])
        records.extend(
            (first_seen,
             dataset,
             imported(dataset_rows, 'entity'),
             imported(dataset_rows, 'relationships'),
             dataset_rows['Imported_number'].sum())
            for dataset, dataset_rows in import_rows.groupby('dataset')
        )

    summary = pd.DataFrame(records, columns=['date', 'dataset', 'entities', 'relationships', 'total'])
    # 'total' only serves as a sort key and is dropped from the result.
    return summary.sort_values(['date', 'total']).set_index(['date', 'dataset']).drop('total', axis=1)


def plot_total_number_imported(stats_file, plot_title):
    """
    Creates plot with overview of imports numbers per date.

    :param stats_file: pandas DataFrame with stats data.
    :param str plot_title: title of the plot.
    :return: Line plot figure within the <div id="_dash-app-content">.
    """
    df_full = get_totals_per_date(stats_file, key='full', import_types=False).sort_index()
    df_partial = get_totals_per_date(stats_file, key='partial', import_types=False).sort_index()

    traces_f = viz.getPlotTraces(df_full, key='full', type='lines')
    traces_p = viz.getPlotTraces(df_partial, key='partial', type='lines')
    traces = traces_f + traces_p

    # Flatten when the traces come grouped in lists; the emptiness guard avoids an
    # IndexError when there is nothing to plot.
    if traces and isinstance(traces[0], list):
        traces = list(chain.from_iterable(traces))

    # The title is drawn as an annotation (with a properly closed bold tag) so it can be
    # anchored to the left of the plotting area.
    title = dict(text='<b>{}</b>'.format(plot_title), font=dict(family='Arial', size=18),
                 showarrow=False, xref='paper', x=-0.06, xanchor='left', yref='paper', y=1.15, yanchor='top')
    layout = go.Layout(title='', xaxis=dict(title=''), yaxis={'title': 'Number of imports'},
                       legend={'font': {'size': 11}}, margin=go.layout.Margin(l=80, r=40, t=100, b=50),
                       annotations=[title])

    fig = go.Figure(data=traces, layout=layout)
    fig['layout']['template'] = 'plotly_white'

    return dcc.Graph(id='total imports', figure=fig)


def plot_total_numbers_per_date(stats_file, plot_title):
    """
    Plots number of entities and relationships imported per date, with scaled markers reflecting numbers ratios.

    :param stats_file: pandas DataFrame with stats data.
    :param str plot_title: title of the plot.
    :return: Scatter plot figure within the <div id="_dash-app-content">, with scaled markers.
    """
    # The original expression was float(10^1000). '^' is bitwise XOR in Python, so it
    # evaluates to 994.0; the value is kept as is to leave the marker sizes unchanged.
    # NOTE(review): a power of ten was presumably intended - confirm the desired factor.
    div_factor = 994.0

    df_full = get_totals_per_date(stats_file, key='full', import_types=True)
    df_partial = get_totals_per_date(stats_file, key='partial', import_types=True)

    traces_f = viz.getPlotTraces(df_full, key='full', type='scaled markers', div_factor=div_factor)
    traces_p = viz.getPlotTraces(df_partial, key='partial', type='scaled markers', div_factor=div_factor)
    traces = traces_f + traces_p

    # Flatten when the traces come grouped in lists; the emptiness guard avoids an
    # IndexError when there is nothing to plot.
    if traces and isinstance(traces[0], list):
        traces = list(chain.from_iterable(traces))

    # The title is drawn as an annotation (with a properly closed bold tag) so it can be
    # anchored to the left of the plotting area.
    title = dict(text='<b>{}</b>'.format(plot_title), font=dict(family='Arial', size=18),
                 showarrow=False, xref='paper', x=-0.06, xanchor='left', yref='paper', y=1.15, yanchor='top')
    layout = go.Layout(title='',
                       xaxis={'showgrid': True},
                       yaxis={'title': 'Imported entities/relationships'},
                       legend={'font': {'size': 11}},
                       height=550,
                       margin=go.layout.Margin(l=80, r=40, t=100, b=100),
                       annotations=[title])

    fig = go.Figure(data=traces, layout=layout)
    fig['layout']['template'] = 'plotly_white'

    return dcc.Graph(id='entities-relationships per date', figure=fig)


def plot_databases_numbers_per_date(stats_file, plot_title, key='full', dropdown=False, dropdown_options='dates'):
    """
    Grouped horizontal barplot showing the number of entities and relationships imported from each biomedical database.

    :param stats_file: pandas DataFrame with stats data.
    :param str plot_title: title of the plot.
    :param str key: use only full or partial imports ('full', 'partial').
    :param bool dropdown: add dropdown menu to figure or not.
    :param str dropdown_options: name of the variables to be used as options in the dropdown menu ('dates', \
                        'databases', 'entities' or 'relationships').
    :return: Horizontal barplot figure within the <div id="_dash-app-content">.
    :raises ValueError: if 'key' is not 'full' or 'partial'.
    """
    if key not in ('full', 'partial'):
        # Previously this only printed a message and then failed on the unbound 'stats'.
        raise ValueError("key must be 'full' or 'partial', got {!r}".format(key))
    stats = stats_file[stats_file['Import_flag'] == key]

    menu_options = get_databases_entities_relationships(stats_file, key=key, options=dropdown_options)
    data = get_imports_per_database_date(stats)

    # First index level of 'data' holds the import date of each row.
    dates = data.index.get_level_values(0)
    traces = []
    for option in menu_options.keys():
        # Plain substring match: the option is a literal label, not a regular expression.
        df = data[dates.str.contains(option, regex=False)].droplevel(0)
        traces.append(viz.getPlotTraces(df, key=key, type='bars', horizontal=True))

    # Flatten when the traces come grouped in lists; the emptiness guard avoids an
    # IndexError when there is nothing to plot.
    if traces and isinstance(traces[0], list):
        traces = list(chain.from_iterable(traces))

    # The title is drawn as an annotation (with a properly closed bold tag) so it can be
    # anchored to the left of the plotting area.
    title = dict(text='<b>{}</b>'.format(plot_title), font=dict(family='Arial', size=18),
                 showarrow=False, xref='paper', x=-0.17, xanchor='left', yref='paper', y=1.2, yanchor='top')
    layout = go.Layout(title='', xaxis={'showgrid': True, 'type': 'log', 'title': 'Imported entities/relationships'},
                       legend={'font': {'size': 11}}, height=600, margin=go.layout.Margin(l=40, r=40, t=80, b=100),
                       annotations=[title])

    fig = go.Figure(data=traces, layout=layout)
    fig['layout']['template'] = 'plotly_white'

    if dropdown:
        updatemenus = get_dropdown_menu(fig, menu_options, add_button=True, equal_traces=True, number_traces=2)
        fig.layout.update(go.Layout(updatemenus=updatemenus))

    # One color per trace name. Names are sorted so the assignment is reproducible, and the
    # palette wraps around so more than five names no longer raise a KeyError.
    palette = ['red', 'blue', 'green', 'yellow', 'orange']
    names = sorted({trace['name'] for trace in fig['data']}, key=str)
    colors = {name: palette[position % len(palette)] for position, name in enumerate(names)}
    for trace in fig.data:
        trace.update(marker=dict(color=colors[trace['name']]))

    return dcc.Graph(id='databases imports {}'.format(key), figure=fig)


def plot_import_numbers_per_database(stats_file, plot_title, key='full', subplot_titles = ('',''), colors=True, plots_1='entities', plots_2='relationships', dropdown=True, dropdown_options='databases'):
    """
    Creates plotly multiplot figure with breakdown of imported numbers and size of the respective files, per database and \
    import type (entities or relationships).

    The figure is a 2x2 grid: imported numbers on the first row and file sizes on the second row, \
    with entities on the left column and relationships on the right column.

    :param stats_file: pandas DataFrame with stats data.
    :param str plot_title: title of the plot.
    :param str key: use only full or partial imports ('full', 'partial').
    :param tuple subplot_titles: title of the subplots (tuple of strings, one for each subplot; \
                                 missing titles are left empty).
    :param bool colors: define standard colors for entities and for relationships. If False, \
                        plotly default colors are used.
    :param str plots_1: name of the variable plotted.
    :param str plots_2: name of the variable plotted.
    :param bool dropdown: add dropdown menu to figure or not.
    :param str dropdown_options: name of the variables to be used as options in the dropdown menu ('dates', \
                        'databases', 'entities' or 'relationships').
    :return: Multi-scatterplot figure within the <div id="_dash-app-content">.
    :raises ValueError: if 'key' is not 'full' or 'partial'.
    """
    if key not in ('full', 'partial'):
        # Previously this only printed a message and then failed on the unbound 'stats'.
        raise ValueError("key must be 'full' or 'partial', got {!r}".format(key))
    stats = stats_file[stats_file['Import_flag'] == key]

    ent = get_databases_entities_relationships(stats_file, key=key, options=plots_1)
    rel = get_databases_entities_relationships(stats_file, key=key, options=plots_2)
    menu_options = get_databases_entities_relationships(stats_file, key=key, options=dropdown_options)

    # Without custom colors the markers are left to plotly's defaults (this used to raise a NameError).
    ent_colors = set_colors(ent) if colors else {}
    rel_colors = set_colors(rel) if colors else {}

    # Four subplots need four titles: pad with empty strings so that the default
    # two-element tuple no longer raises an IndexError.
    titles = tuple(subplot_titles) + ('',) * max(0, 4 - len(subplot_titles))

    fig = tools.make_subplots(2, 2, subplot_titles=titles, vertical_spacing=0.18, horizontal_spacing=0.2)

    def _add_file_traces(frame, palette, filename, col):
        """Add the imported-number (row 1) and file-size (row 2) traces of one file to column 'col'."""
        if frame['Imported_number'].empty:
            return
        marker = dict(color=palette[filename]) if colors else dict()
        label = filename.split('.')[0]
        fig.add_trace(go.Scattergl(visible=True,
                                   x=frame['datetime'],
                                   y=frame['Imported_number'],
                                   mode='markers+lines',
                                   marker=marker,
                                   name=label), row=1, col=col)
        # Same file, same color: only the first trace of the pair gets a legend entry.
        fig.add_trace(go.Scattergl(visible=True,
                                   x=frame['datetime'],
                                   y=frame['file_size'],
                                   mode='markers+lines',
                                   marker=marker,
                                   name=label,
                                   showlegend=False), row=2, col=col)

    # NOTE(review): the dropdown assigns traces to databases consecutively, in the order of
    # 'menu_options'; this assumes that order agrees with the groupby order used here - confirm.
    for (_, filename), rows in stats.groupby(['dataset', 'filename']):
        # Keep the first record of every import for this file.
        rows = rows.sort_values(['import_id', 'datetime']).drop_duplicates(['dataset', 'import_id', 'filename'], keep='first')
        _add_file_traces(rows[rows['Import_type'] == 'entity'], ent_colors, filename, 1)
        _add_file_traces(rows[rows['Import_type'] == 'relationships'], rel_colors, filename, 2)

    fig.layout.update(go.Layout(legend={'orientation': 'v', 'font': {'size': 11}},
                                height=700, margin=go.layout.Margin(l=20, r=20, t=150, b=60)))

    # Figure title plus one centered title above each of the four subplots.
    annotations = [dict(text='<b>{}</b>'.format(plot_title), font=dict(family='Arial', size=18),
                        showarrow=False, xref='paper', x=-0.07, xanchor='left', yref='paper', y=1.3, yanchor='top')]
    title_positions = [(0.23, 1.0), (0.78, 1.0), (0.23, 0.44), (0.78, 0.44)]
    for text, (x, y) in zip(titles, title_positions):
        annotations.append({'font': {'size': 14}, 'showarrow': False, 'text': text,
                            'x': x, 'xanchor': 'center', 'xref': 'paper',
                            'y': y, 'yanchor': 'bottom', 'yref': 'paper'})

    fig.layout['annotations'] = annotations
    fig['layout']['template'] = 'plotly_white'

    if dropdown:
        updatemenus = get_dropdown_menu(fig, menu_options, add_button=True, equal_traces=False)
        fig.layout.update(go.Layout(updatemenus=updatemenus))

    return dcc.Graph(id='imports-breakdown per database {}'.format(key), figure=fig)