[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Adding an aggregate for variance
From: |
John Darrington |
Subject: |
Re: Adding an aggregate for variance |
Date: |
Tue, 13 Oct 2020 20:38:31 +0200 |
User-agent: |
Mutt/1.10.1 (2018-07-13) |
Much as I think this change is of a generally good quality, I wonder
if it really belongs in recutils.
Recutils is a database, and not a statistical analysis tool. It could
be the start of a slippery slope ... what would come next ... ?
Sample Variance? Sample Std. Dev? Population Std. Dev? Skewness?
Kurtosis? Covariance? Correlation Coefficients? Chi-square ....
There are other GNU tools which do these calculations. Rather than
pretending that recutils is a statistical analysis tool, I think it
would be a better idea to implement easier ways to interface recutils
to the tools which are specifically designed for this job.
Just my $0.02
J'
On Tue, Oct 13, 2020 at 10:01:54AM -0700, Frank Pursel wrote:
Addressing some of the issues from the first try. I hope this
is better.
Regards,
Frank
diff --git a/recutils-1.8/ChangeLog b/recutils-1.8/ChangeLog
index d234bae..f3b8414 100644
--- a/recutils-1.8/ChangeLog
+++ b/recutils-1.8/ChangeLog
@@ -1,3 +1,10 @@
+2020-10-13 Frank Pursel <purself@yahoo.com>
+
+ * src/rec-aggregate.c: Added Var aggregate functions,
+ rec_aggregate_std_var and rec_aggregate_std_var_record.
+ * torture/utils/recsel.sh: Added test cases for the Var aggregate.
+ * doc/recutils.texi: Basic documentation for same.
+
2019-01-03 Jose E. Marchesi <jose.marchesi@oracle.com>
* configure.ac: Bump version to 1.8.
diff --git a/recutils-1.8/doc/recutils.texi
b/recutils-1.8/doc/recutils.texi
index 38877a3..3762fb8 100644
--- a/recutils-1.8/doc/recutils.texi
+++ b/recutils-1.8/doc/recutils.texi
@@ -3527,6 +3527,8 @@ The supported aggregate functions are the following:
Counts the number of occurrences of a field.
@item Avg(FIELD)
Calculates the average (mean) of the numerical values of a field.
+@item Var(FIELD)
+Calculates the population variance of the numerical values of a field.
@item Sum(FIELD)
Calculates the sum of the numerical values of a field.
@item Min(FIELD)
diff --git a/recutils-1.8/src/rec-aggregate.c
b/recutils-1.8/src/rec-aggregate.c
index e28f9d8..5194cc3 100644
--- a/recutils-1.8/src/rec-aggregate.c
+++ b/recutils-1.8/src/rec-aggregate.c
@@ -52,6 +52,12 @@ struct rec_aggregate_reg_s
size_t num_functions;
};
+struct rec_aggregate_reg_var_s
+{
+ int n;
+ double values[10000];
+};
+
/* Static functions defined in this file. */
static char *rec_aggregate_std_count (rec_rset_t rset,
@@ -64,6 +70,12 @@ static char *rec_aggregate_std_avg (rec_rset_t rset,
static double rec_aggregate_std_avg_record (rec_record_t record,
const char *field_name);
+static char *rec_aggregate_std_var (rec_rset_t rset,
+ rec_record_t record,
+ const char *field_name);
+static struct rec_aggregate_reg_var_s rec_aggregate_std_var_record
(rec_record_t record,
+ const char
*field_name);
+
static char *rec_aggregate_std_sum (rec_rset_t rset,
rec_record_t record,
const char *field_name);
@@ -96,11 +108,12 @@ struct rec_aggregate_descriptor_s
rec_aggregate_t func;
};
-#define NUM_STD_AGGREGATES 5
+#define NUM_STD_AGGREGATES 6
static struct rec_aggregate_descriptor_s std_aggregates[] =
{{"count", &rec_aggregate_std_count},
{"avg", &rec_aggregate_std_avg},
+ {"var", &rec_aggregate_std_var},
{"sum", &rec_aggregate_std_sum},
{"min", &rec_aggregate_std_min},
{"max", &rec_aggregate_std_max}};
@@ -305,6 +318,102 @@ rec_aggregate_std_avg_record (rec_record_t record,
return avg;
}
+static char *
+rec_aggregate_std_var (rec_rset_t rset,
+ rec_record_t record,
+ const char *field_name)
+{
+ char *result = NULL;
+ double var = 0;
+ struct rec_aggregate_reg_var_s vals;
+ struct rec_aggregate_reg_var_s mval;
+ mval.n = 0;
+
+ if (record)
+ {
+ vals = rec_aggregate_std_var_record (record, field_name);
+ if (vals.n < 2)
+ {
+ var = 0;
+ }
+ else
+ {
+ double avg = 0;
+ for (int i=0; i < vals.n; i++) { avg += vals.values[i]; }
+ avg = avg / vals.n;
+ for (int i=0; i < vals.n; i++)
+ {
+ var += ((vals.values[i] - avg) * (vals.values[i] - avg));
+ }
+ var = var / vals.n;
+ }
+ }
+ else if (rset)
+ {
+ int num_records = 0;
+ rec_record_t rec = NULL;
+ rec_mset_iterator_t iter = rec_mset_iterator (rec_rset_mset (rset));
+
+ while (rec_mset_iterator_next (&iter, MSET_RECORD, (void *) &rec,
NULL))
+ {
+
+ vals = rec_aggregate_std_var_record (rec, field_name);
+ for (int i=vals.n; i >= 0; i--)
+ {
+ mval.values[mval.n + i] = vals.values[i];
+ }
+ mval.n += vals.n;
+
+ }
+ rec_mset_iterator_free (&iter);
+
+ if (mval.n > 1) {
+ double avg = 0;
+ for (int i=0; i<mval.n; i++) { avg += mval.values[i]; }
+ avg = avg / mval.n;
+ for (int i=0; i<mval.n; i++) {
+ var += (mval.values[i] - avg) * (mval.values[i] - avg);
+ }
+ var = var / mval.n;
+ }
+ else
+ var = 0;
+ }
+ /* Return the average as a string. Note that if NULL is returned it
+ will be returned by this function below to signal the
+ end-of-memory condition. */
+ asprintf (&result, "%g", var);
+
+ return result;
+
+}
+
+static struct rec_aggregate_reg_var_s
+rec_aggregate_std_var_record (rec_record_t record,
+ const char *field_name)
+{
+ struct rec_aggregate_reg_var_s part_var;
+
+ rec_field_t field;
+ int num_fields = 0;
+ rec_mset_iterator_t iter = rec_mset_iterator (rec_record_mset (record));
+
+ while (rec_mset_iterator_next (&iter, MSET_FIELD, (void *) &field,
NULL))
+ {
+ double field_value_double = 0;
+ const char *field_value = rec_field_value (field);
+
+ if (rec_field_name_equal_p (rec_field_name (field), field_name)
+ && rec_atod (field_value, &field_value_double))
+ {
+ part_var.values[part_var.n++] = field_value_double;
+ }
+ }
+ rec_mset_iterator_free (&iter);
+
+ return part_var;
+}
+
#define REC_AGGREGATE_ACCUM_FUNC(NAME, OP, INIT_VAL) \
static char * \
rec_aggregate_std_##NAME (rec_rset_t rset, \
diff --git a/recutils-1.8/torture/utils/recsel.sh
b/recutils-1.8/torture/utils/recsel.sh
index 2bedc18..2a9c720 100755
--- a/recutils-1.8/torture/utils/recsel.sh
+++ b/recutils-1.8/torture/utils/recsel.sh
@@ -1579,6 +1579,13 @@ test_tool recsel-aggregate-avg-overall ok \
'39
'
+test_tool recsel-aggregate-var-overall ok \
+ recsel \
+ '-P "Var(Cost)"' \
+ sales \
+'1133.6
+'
+
test_tool recsel-aggregate-avg-grouped ok \
recsel \
'-p "Item,Avg(Cost)" -G Item' \
@@ -1596,6 +1603,23 @@ Item: D
Avg_Cost: 100
'
+test_tool recsel-aggregate-avg-grouped ok \
+ recsel \
+ '-p "Item,Var(Cost)" -G Item' \
+ sales \
+'Item: A
+Var_Cost: 42.25
+
+Item: B
+Var_Cost: 0
+
+Item: C
+Var_Cost: 0
+
+Item: D
+Var_Cost: 0
+'
+
test_tool recsel-aggregate-sum-overall ok \
recsel \
'-P "Sum(Cost)"' \