#!/usr/bin/env python3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
import seaborn as sns

# Metric columns that are checked, averaged and reported further down.
maincols = ['elapsed', 'ru_utime', 'ru_stime', 'ru_maxrss_mb']

# Load the raw per-run measurements.  'mode' becomes an ordered categorical
# so that sorting and grouping follow the order listed here rather than
# alphabetical order.
allruns = pd.read_csv('results.csv')
allruns['mode'] = allruns['mode'].astype(CategoricalDtype(
    categories=['system', 'unsorted', 'sortcmd', 'sortflag'],
    ordered=True,
))
# Derived column: ru_maxrss scaled down by 1024
# (presumably KiB -> MiB; confirm against whatever produced results.csv).
allruns['ru_maxrss_mb'] = allruns['ru_maxrss'] / 1024
# sort_values() returns a new frame instead of sorting in place, so the
# result has to be assigned back for the sort to take effect.
# NOTE(review): seaborn orders string-valued facets and bars by first
# appearance, so row order influences the plot layout; confirm that the
# per-facet ylim further down still targets the intended panel.
allruns = allruns.sort_values(by=['caches', 'LC_ALL', 'tree', 'mode'])

def _warn_on_spread(group, key, context):
    """Print warnings when the metrics in *group* are too far apart.

    group: DataFrame with the 'elapsed', 'ru_utime', 'ru_stime' and
        'ru_maxrss_mb' columns of the rows being compared.
    key: group label, printed after each warning to identify the rows.
    context: trailing words of the warning that say what was compared.

    A warning is printed only when both the absolute and the relative
    spread of a metric exceed their thresholds.
    """
    seconds = group[['elapsed', 'ru_utime', 'ru_stime']]
    fastest, slowest = seconds.min(), seconds.max()
    # Per time column: more than 0.2 apart and more than 10% apart.
    if ((slowest - fastest > .2) & (slowest / fastest > 1.1)).any():
        print('Warning: times vary by over 10% ' + context, key)
    rss_low = group.ru_maxrss_mb.min()
    rss_high = group.ru_maxrss_mb.max()
    # Peak memory: more than 2 apart and more than 15% apart.
    if rss_high - rss_low > 2 and rss_high / rss_low > 1.15:
        print('Warning: memory usage varies by over 15% ' + context, key)


# Check for outliers across runs.
for key, group in allruns.groupby(['caches', 'LC_ALL', 'tree', 'mode'])[maincols]:
    _warn_on_spread(group, key, 'between runs')

# The number of files in a tree should be nearly constant across runs.
for key, group in allruns.groupby('tree'):
    if group.files.max() - group.files.min() > 20:
        print('Warning: file count varies by over 20 between runs', key)

# Check that unsorted and system runs are similar: average each mode first,
# then compare the per-mode means within every (caches, LC_ALL, tree).
baseline_means = (allruns
                  [allruns['mode'].isin(['system', 'unsorted'])]
                  .groupby(['caches', 'LC_ALL', 'tree', 'mode'])
                  [maincols]
                  .mean())
for key, group in baseline_means.groupby(['caches', 'LC_ALL', 'tree']):
    _warn_on_spread(group, key, 'in system vs unsorted')

# Report the mean number of files per tree, converted to integers.
print('File counts:')
mean_files_per_tree = allruns.groupby('tree')[['files']].mean()
print(mean_files_per_tree.astype(int))
print()

# Build a separate frame without the 'system' category instead of aliasing
# `allruns`, so the original measurements are not modified as a side effect.
# Rows that were 'system' get a missing mode, which groupby() leaves out of
# the result by default.
nonsystem = allruns.assign(
    mode=allruns['mode'].cat.remove_categories('system'))

# Mean of every metric per (caches, LC_ALL, tree, mode) combination.
results = (nonsystem
           .groupby(['caches', 'LC_ALL', 'tree', 'mode'])
           [maincols]
           .mean())

print('Results:')
print(results.round(2))
print()


def normalize(group):
    """Scale every row of *group* by its 'unsorted' row.

    group: DataFrame whose index has a level named 'mode' containing an
        'unsorted' entry.

    Returns a DataFrame with the same index and columns as *group*, holding
    each value divided by the corresponding value of the 'unsorted' row.
    """
    # .values strips the index so the division broadcasts by position
    # rather than aligning on labels.
    baseline = group.xs('unsorted', level='mode').values
    return group / baseline


# Express every metric relative to the 'unsorted' mode of the same
# (caches, LC_ALL, tree) combination.  group_keys=False stops apply() from
# prepending the group keys, which are already levels of the index and
# would otherwise show up twice in the printed table on recent pandas.
normalized = (results
              .groupby(['caches', 'LC_ALL', 'tree'], group_keys=False)
              .apply(normalize))
print('Normalized:')
print(normalized.round(2))
print()

# One bar chart per (caches, LC_ALL) pair: elapsed time per tree, one bar
# per mode.
sns.set_style('whitegrid')
g = sns.FacetGrid(nonsystem,
                  row='caches', col='LC_ALL',
                  sharex=False, sharey=False)
# FacetGrid.map() calls barplot separately on each facet's rows, so the bar
# and hue order are fixed explicitly; otherwise every facet infers its own
# order from whatever rows it happens to contain.
tree_order = list(nonsystem['tree'].unique())
mode_order = list(nonsystem['mode'].cat.categories)
g.map(sns.barplot, 'tree', 'elapsed', 'mode',
      order=tree_order, hue_order=mode_order, palette='Set1')
g.add_legend()
g.set_titles('{row_name} cache, {col_name} locale')
g.set_ylabels('elapsed time (s)')
# The first facet gets a tighter y range than the others.
# NOTE(review): this picks the facet by position, so it depends on the
# facet order - confirm it still matches the intended panel.
for i, ax in enumerate(g.axes.flat):
    if i == 0:
        ax.set(ylim=(0, 5.5))
    else:
        ax.set(ylim=(0, 33))
plt.savefig('results.png', dpi=150)