#!/usr/bin/env python3
"""Sanity-check, summarise and plot the benchmark runs in ``results.csv``.

The script reads ``results.csv`` from the current directory and uses the
columns ``caches``, ``LC_ALL``, ``tree``, ``mode``, ``files``, ``elapsed``,
``ru_utime``, ``ru_stime`` and ``ru_maxrss``.  It then:

1. prints warnings when repeated runs of one configuration disagree, or
   when the ``system`` and ``unsorted`` modes disagree with each other;
2. prints the mean file count per tree, the mean results per configuration
   (without the ``system`` mode) and the same results normalised to the
   ``unsorted`` mode;
3. writes a bar-plot grid of elapsed times to ``results.png``.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
import seaborn as sns

maincols = ['elapsed', 'ru_utime', 'ru_stime', 'ru_maxrss_mb']
timecols = ['elapsed', 'ru_utime', 'ru_stime']


def _spread_exceeds(values, abs_tol, rel_tol):
    """Return True if ``values`` spread by more than both tolerances.

    ``values`` is a Series or DataFrame.  The spread counts only when the
    absolute difference ``max - min`` is above ``abs_tol`` AND the ratio
    ``max / min`` is above ``rel_tol``; for a DataFrame it is enough that
    one column satisfies both conditions.
    """
    highest = values.max()
    lowest = values.min()
    return bool(np.any(np.logical_and(highest - lowest > abs_tol,
                                      highest / lowest > rel_tol)))


allruns = pd.read_csv('results.csv')
allruns['mode'] = allruns['mode'].astype(CategoricalDtype(
    categories=['system', 'unsorted', 'sortcmd', 'sortflag'],
    ordered=True,
))
# NOTE(review): presumably converts KiB to MiB -- confirm the unit of
# ru_maxrss on the platform that produced the CSV.
allruns['ru_maxrss_mb'] = allruns['ru_maxrss'] / 1024
# sort_values() returns a new frame; the result has to be assigned back,
# otherwise the sort is silently lost.
allruns = allruns.sort_values(by=['caches', 'LC_ALL', 'tree', 'mode'])

# Check for outliers across runs.
for key, group in allruns.groupby(['caches', 'LC_ALL', 'tree', 'mode'])[maincols]:
    if _spread_exceeds(group[timecols], .2, 1.1):
        print('Warning: times vary by over 10% between runs', key)
    if _spread_exceeds(group.ru_maxrss_mb, 2, 1.15):
        print('Warning: memory usage varies by over 15% between runs', key)

for key, group in allruns.groupby('tree'):
    if group.files.max() - group.files.min() > 20:
        print('Warning: file count varies by over 20 between runs', key)

# Check that unsorted and system runs are similar.
# Runs are first averaged per (caches, LC_ALL, tree, mode); each remaining
# group then holds the per-mode means of one configuration.
system_vs_unsorted = (
    allruns[allruns['mode'].isin(['system', 'unsorted'])]
    .groupby(['caches', 'LC_ALL', 'tree', 'mode'])[maincols]
    .mean()
    .groupby(['caches', 'LC_ALL', 'tree'])
)
for key, group in system_vs_unsorted:
    if _spread_exceeds(group[timecols], .2, 1.1):
        print('Warning: times vary by over 10% in system vs unsorted', key)
    if _spread_exceeds(group.ru_maxrss_mb, 2, 1.15):
        print('Warning: memory usage varies by over 15% in system vs unsorted', key)

print('File counts:')
print(allruns.groupby('tree')[['files']].mean().astype(int))
print()

# Work on a copy so that dropping the 'system' category does not also
# rewrite ``allruns``.  Rows that had mode 'system' end up with a missing
# mode and therefore fall out of the groupby below.
nonsystem = allruns.copy()
nonsystem['mode'] = nonsystem['mode'].cat.remove_categories('system')
results = (
    nonsystem
    .groupby(['caches', 'LC_ALL', 'tree', 'mode'])[maincols]
    .mean()
)

print('Results:')
print(results.round(2))
print()


def normalize(group):
    """Divide every row of ``group`` by the group's 'unsorted' row.

    ``group`` is a slice of ``results`` for one (caches, LC_ALL, tree)
    combination, indexed (among others) by a ``mode`` level.  The
    'unsorted' row is taken as a plain array so the division broadcasts
    over all rows instead of aligning on the index.
    """
    return group / group.xs('unsorted', level='mode').values


# group_keys=False keeps the original four index levels; without it newer
# pandas versions prepend the three group keys a second time.
normalized = (
    results
    .groupby(['caches', 'LC_ALL', 'tree'], group_keys=False)
    .apply(normalize)
)

print('Normalized:')
print(normalized.round(2))
print()

sns.set_style('whitegrid')
g = sns.FacetGrid(nonsystem, row='caches', col='LC_ALL', sharex=False, sharey=False)
g.map(sns.barplot, 'tree', 'elapsed', 'mode', palette='Set1')
g.add_legend()
g.set_titles('{row_name} cache, {col_name} locale')
g.set_ylabels('elapsed time (s)')
# Hard-coded y ranges: the first facet gets a tighter range than the rest.
for i, ax in enumerate(g.axes.flat):
    if i == 0:
        ax.set(ylim=(0, 5.5))
    else:
        ax.set(ylim=(0, 33))
plt.savefig('results.png', dpi=150)