# evaluation.py 10.07 KiB
"""Functions to process benchmark results in pandas."""
from pystencils.runhelper.db import Database, remove_constant_columns
# Module-wide database handle: stays None until get() or get_categorical() is called
# for the first time, which then create the Database object and reuse it afterwards.
db = None
def get_categorical(query):
    """Runs a query on the benchmark database and returns the result with categorical columns.

    The module-level database handle is created on first use. The query result is
    passed through basic_clean_up and then through make_categorical.

    Args:
        query: query that is handed to Database.to_pandas

    Returns:
        the data frame returned by make_categorical
    """
    global db
    if db is None:
        db = Database('mongo://lbmpy_bench')
    raw_frame = db.to_pandas(query)
    return make_categorical(basic_clean_up(raw_frame))
def get(query, **kwargs):
    """Runs a query on the benchmark database and returns the cleaned-up result.

    The module-level database handle is created on first use.

    Args:
        query: query that is handed to Database.to_pandas
        **kwargs: forwarded unchanged to Database.to_pandas

    Returns:
        the query result after it went through basic_clean_up
    """
    global db
    if db is None:
        db = Database('mongo://lbmpy_bench')
    raw_frame = db.to_pandas(query, **kwargs)
    return basic_clean_up(raw_frame)
def remove_all_column_prefixes(df, inplace=False):
    """Strips everything left of the LAST dot in pandas data frame column names.

    'abc.def.value' is renamed to 'value'. Similar to remove_prefix_in_column_name,
    that removes everything before the FIRST dot.

    Column labels that are not strings (e.g. integers) and names without a dot are left
    untouched. A name that ends with a dot is left untouched as well, because nothing
    would remain of it after stripping.

    Args:
        df: pandas data frame whose columns are renamed
        inplace: if False (default) a copy is renamed and the passed data frame stays as
                 it is, otherwise the passed data frame itself is modified

    Returns:
        the data frame with the renamed columns (the passed object if inplace is True)
    """
    if not inplace:
        df = df.copy()

    def strip_prefix(column_name):
        # non-string labels cannot carry a dotted prefix, a substring test would raise
        if not isinstance(column_name, str):
            return column_name
        suffix = column_name.rpartition('.')[2]
        # empty suffix <=> name ends with a dot: keep the full name
        return suffix if suffix else column_name

    df.columns = [strip_prefix(column_name) for column_name in df.columns]
    return df
def basic_clean_up(df):
    """Cleans up a data frame that was loaded from the benchmark database.

    - replaces cells that hold lists with tuples
    - fills default values for vectorization options and method flags
    - removes constant columns
    - strips the dotted prefixes from all column names

    Args:
        df: pandas data frame, may be None or empty

    Returns:
        the cleaned-up data frame; None or an empty data frame is returned unchanged
    """
    if df is None or len(df) == 0:
        return df

    def list_to_tuple(e):
        return tuple(e) if isinstance(e, list) else e

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and is deprecated since.
    # The lookup is done on the class, so that a data column called 'map' is not mistaken
    # for the method on older pandas versions.
    if hasattr(type(df), 'map'):
        df = df.map(list_to_tuple)
    else:
        df = df.applymap(list_to_tuple)

    fill_default = {
        'optimization.vectorization.nontemporal': False,
        'optimization.vectorization.instruction_set': 'auto',
        'smagorinsky': False,
        'entropic': False,
        'cumulant': False,
        'stable': True,
    }
    # currently empty - columns listed here are converted to pandas category dtype
    categorical_columns = []  # ['optimization.vectorization.instruction_set']

    for col, default in fill_default.items():
        if col in df:
            df[col] = df[col].fillna(default)
    for col in categorical_columns:
        if col in df:
            df[col] = df[col].astype('category')

    # second return value (the removed constants) is not needed here
    df, _ = remove_constant_columns(df)
    remove_all_column_prefixes(df, inplace=True)
    return df
def make_categorical(df):
    """Summarizes boolean columns into categorical columns, such that plotting is simpler afterwards.

    - 'instruction_set', 'assume_aligned' and 'nontemporal' are merged into a single 'vec' column
    - 'smagorinsky', 'entropic', 'cumulant' and 'relaxation_rates' are folded into the 'method' name
    - fixed_loop_sizes and fixed_relaxation_rates are summarized into single column with four values
    - same for 'cse_global' and 'cse_pdfs' columns
    - 'all_measurements' is dropped and 'split' is turned into a 'split' / 'no-split' category

    Every step is only applied if the columns it needs are present.

    Args:
        df: pandas data frame, it is modified in place; may be None or empty

    Returns:
        the passed data frame object; None or an empty data frame is returned unchanged
    """
    from pandas.api.types import CategoricalDtype

    # Same guard as in basic_clean_up, which hands None / empty frames through unchanged:
    # the column membership tests below fail on None and there is nothing to summarize
    # in a frame without rows.
    if df is None or len(df) == 0:
        return df

    def bool_to_category(col):
        df[col] = df.apply(lambda e: 'True' if e[col] else 'False', axis=1).astype('category')

    if all(c in df for c in ['instruction_set', 'assume_aligned', 'nontemporal']):
        def vec_column(row):
            # 'auto' is reported as is, without alignment / non-temporal suffixes
            if row['instruction_set'] == 'auto':
                return 'auto'
            else:
                result = str(row['instruction_set'])
                if row['assume_aligned']:
                    result += "-align"
                if row['nontemporal']:
                    result += "-nt"
                return result

        df['vec'] = df.apply(vec_column, axis=1)
        del df['instruction_set']
        del df['assume_aligned']
        del df['nontemporal']
        df['vec'] = df['vec'].astype('category')

    if 'method' in df:
        def method_category(row):
            method = row['method']
            if 'smagorinsky' in row and row['smagorinsky']:
                method += '-smag'
            if 'entropic' in row and row['entropic']:
                method += '-entr'
            if method.startswith('mrt3') and 'relaxation_rates' in row:
                # number of entries that are the string 'rr_free'
                num_free_relaxation_rates = sum(1 for e in row['relaxation_rates'] if e == 'rr_free')
                method += '-free{}'.format(num_free_relaxation_rates)
            if 'cumulant' in row and row['cumulant']:
                method += '-cumulant'
            return method

        df['method'] = df.apply(method_category, axis=1)
        # these flags are now encoded in the method name
        for col in ['smagorinsky', 'entropic', 'cumulant', 'relaxation_rates']:
            if col in df:
                del df[col]

    if all(c in df for c in ['fixed_loop_sizes', 'fixed_relaxation_rates']):
        def fixed_column(row):
            mapping = {
                (False, False): 'generic',
                (True, False): 'loops only',
                (False, True): "ω's only",
                (True, True): "all fixed",
            }
            return mapping[(row['fixed_loop_sizes'], row['fixed_relaxation_rates'])]

        cat_type = CategoricalDtype(categories=['generic', "ω's only", "loops only", "all fixed"], ordered=True)
        df['fixed'] = df.apply(fixed_column, axis=1).astype(cat_type)
        del df['fixed_loop_sizes']
        del df['fixed_relaxation_rates']
    # only reached with a column still present if just one of the two columns exists
    if 'fixed_loop_sizes' in df:
        bool_to_category('fixed_loop_sizes')
    if 'fixed_relaxation_rates' in df:
        bool_to_category('fixed_relaxation_rates')

    if all(c in df for c in ['cse_global', 'cse_pdfs']):
        def cse_column(row):
            mapping = {
                (False, False): 'none',
                (True, False): 'global only',
                (False, True): "pdfs only",
                (True, True): "full cse",
            }
            return mapping[(row['cse_global'], row['cse_pdfs'])]

        cat_type = CategoricalDtype(categories=["full cse", 'global only', 'none', "pdfs only", ], ordered=True)
        df['cse'] = df.apply(cse_column, axis=1).astype(cat_type)
        del df['cse_global']
        del df['cse_pdfs']
    # only reached with a column still present if just one of the two columns exists
    if 'cse_global' in df:
        bool_to_category('cse_global')
    if 'cse_pdfs' in df:
        bool_to_category('cse_pdfs')

    if 'all_measurements' in df:
        del df['all_measurements']
    if 'split' in df:
        df['split'] = df.apply(lambda e: 'split' if e['split'] else 'no-split', axis=1).astype('category')
    if 'method' in df:
        df['method'] = df['method'].astype('category')
    return df
def speedup_table(df, column_name):
    """Computes the speed up that a boolean optimization (column) causes.

    All columns except the result columns ('mlups_max', 'mlups_median', 'all_measurements')
    are treated as parameters and become index levels. The 'mlups_median' values of the
    rows where column_name is True are divided by those of the rows where it is False.

    Args:
        df: pandas data frame with a 'mlups_median' column
        column_name: name of the boolean column that switches the optimization on / off

    Returns:
        data frame with a single column 'speedup', indexed by the remaining parameter
        columns in the order in which they appear in df
    """
    import pandas as pd

    result_columns = ['mlups_max', 'mlups_median', 'all_measurements']
    excluded = set(result_columns)
    excluded.add(column_name)
    # Walk the columns in their original order instead of using a set difference,
    # so that the order of the index levels is the same in every run.
    param_columns = [column_name] + [c for c in df.columns if c not in excluded]

    df = df.set_index(param_columns)
    result = pd.DataFrame(df.loc[True][['mlups_median']] / df.loc[False][['mlups_median']])
    return result.rename(columns={'mlups_median': 'speedup'})
def category_columns_to_string_columns(df):
    """Converts all columns of category dtype to string columns, in place.

    Args:
        df: pandas data frame, its categorical columns are overwritten

    Returns:
        list with the names of the columns that were converted
    """
    converted = [name for name in df.columns if df[name].dtype.name == 'category']
    for name in converted:
        df[name] = df[name].astype(str)
    return converted
def speedup_table_categorical(df, column_name, slow_values):
    """Computes a speed up table for a column that is not boolean.

    Rows whose value in column_name is contained in slow_values are flagged False, all
    other rows True; this flag column is then passed to speedup_table. The passed
    data frame is not modified, the computation is done on a copy.

    Args:
        df: pandas data frame with the benchmark results
        column_name: column that is replaced by the boolean flag
        slow_values: container with the values of column_name that form the baseline

    Returns:
        result of speedup_table with the index moved back into columns; columns that
        were categorical in df are categorical again
    """
    work = df.copy()
    work['_tmp_'] = work.apply(lambda row: row[column_name] not in slow_values, axis=1)
    categorical_names = category_columns_to_string_columns(work)
    del work[column_name]

    table = speedup_table(work, '_tmp_').reset_index()
    for name in categorical_names:
        # the flag column replaced column_name, so it is not part of the table
        if name != column_name:
            table[name] = table[name].astype('category')
    return table
def flatten_index(df, keep=(), remove=()):
    """See reset_index - pass index names to keep or to remove.

    Args:
        df: pandas data frame
        keep: index level names that stay in the index, all other levels become columns
        remove: index level names that become columns, all other levels stay in the index;
                must not be combined with keep

    Returns:
        result of df.reset_index for the selected levels; if neither keep nor remove
        is given, all index levels become columns
    """
    index_names = df.index.names
    if remove:
        assert not keep, "pass either 'keep' or 'remove', not both"
        flatten_indices = [i for i, cn in enumerate(index_names) if cn in remove]
    else:
        flatten_indices = [i for i, cn in enumerate(index_names) if cn not in keep]
    return df.reset_index(level=flatten_indices)
def bokeh_scatter_plot(df, category_column, dof_column, color_column=None, enable_hover=True, plot_size=(400, 300),
                       source=None, log=False):
    """Interactive bokeh scatter plot.

    Args:
        df: pandas data frame with data
        category_column: column name for data that is plotted on y axis (has to be categorical column)
        dof_column: column name plotted on the x axis (numeric)
        color_column: categorical column used to color the data points
        enable_hover: switch for tooltips on hover
        plot_size: (width, height) of plot
        source: use this parameter to link multiple bokeh plots together, pass the source here that was returned
                by the first plot, make sure to use the same data frame for all plots
        log: if True the x axis type is set to 'log'

    Returns:
        (plot, source to pass to next plot to link them together)
    """
    from bokeh.plotting import figure, ColumnDataSource
    from bokeh.models import HoverTool, WheelZoomTool
    from bokeh.transform import jitter, factor_cmap
    from bokeh.palettes import d3

    if source is None:
        source = ColumnDataSource(df)

    # optional figure arguments: categorical y range and logarithmic x axis
    axis_options = {}
    if df[category_column].dtype.name == 'category':
        axis_options['y_range'] = [str(e) for e in df[category_column].unique()]
    if log:
        axis_options['x_axis_type'] = 'log'

    p = figure(plot_width=plot_size[0], plot_height=plot_size[1],
               tools="reset,pan,box_select,wheel_zoom", toolbar_location="right", **axis_options)
    p.toolbar.active_scroll = p.select_one(WheelZoomTool)

    # optional glyph arguments: color mapping and legend for the color column
    glyph_options = {}
    if color_column:
        factors = [str(e) for e in df[color_column].unique()]
        # palette size is clamped to the range 3..10
        palette_size = min(max(len(factors), 3), 10)
        glyph_options['color'] = factor_cmap(color_column, palette=d3['Category10'][palette_size], factors=factors)
        glyph_options['legend'] = color_column

    # points are always jittered along the y axis
    jittered_y = jitter(category_column, width=0.05, range=p.y_range, distribution='normal')
    p.circle(source=source, x=dof_column, y=jittered_y, alpha=0.5, **glyph_options)

    p.legend.location = 'bottom_center'
    p.legend.orientation = "horizontal"
    p.legend.label_text_font_size = "6pt"
    p.legend.padding = 0
    p.legend.margin = 0

    if enable_hover:
        # columns that are already visible in the plot get no tooltip entry
        hidden = ['all_measurements', color_column, category_column, dof_column]
        hover = HoverTool()
        hover.tooltips = [(c, '@' + c) for c in df.columns if str(c) not in hidden]
        p.add_tools(hover)
    return p, source