[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH] Add geomean stats operation for computing geometric mean
From: |
Shawn |
Subject: |
[PATCH] Add geomean stats operation for computing geometric mean |
Date: |
Wed, 04 Mar 2020 09:59:03 -0000 |
See subject. Pretty straightforward and hopefully useful addition to
the statistics functions.
---
Makefile.am | 1 +
bootstrap.conf | 1 +
doc/datamash.texi | 2 +
m4/.gitignore | 1 +
m4/logl.m4 | 200 ++++++++++++++++++++++++++++++++++++++++
src/datamash.c | 2 +-
src/field-ops.c | 8 ++
src/field-ops.h | 3 +-
src/op-defs.c | 1 +
src/op-defs.h | 1 +
src/utils.c | 12 +++
src/utils.h | 4 +
tests/datamash-stats.pl | 16 ++++
13 files changed, 250 insertions(+), 2 deletions(-)
create mode 100644 m4/logl.m4
diff --git a/Makefile.am b/Makefile.am
index e80d9cc..fa5f5e3 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -58,6 +58,7 @@ datamash_LDADD = \
$(LIBICONV) \
$(LIBINTL) \
$(LIB_CRYPTO) \
+ $(LOGL_LIBM) \
$(ROUND_LIBM) \
$(ROUNDL_LIBM) \
$(SQRT_LIBM) \
diff --git a/bootstrap.conf b/bootstrap.conf
index e0d892c..e416662 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -62,6 +62,7 @@ gnulib_modules="
linebuffer
locale
localeconv
+ logl
maintainer-makefile
mbsrtowcs
minmax
diff --git a/doc/datamash.texi b/doc/datamash.texi
index 4d87e18..9574b2d 100644
--- a/doc/datamash.texi
+++ b/doc/datamash.texi
@@ -448,6 +448,8 @@ number of unique/distinct values
@table @option
@item mean
mean of the values
+@item geomean
+geometric mean of the values (should be greater than 0)
@item trimmean
trimmed mean of the values
@item median
diff --git a/m4/.gitignore b/m4/.gitignore
index 02e4cef..eef4243 100644
--- a/m4/.gitignore
+++ b/m4/.gitignore
@@ -186,3 +186,4 @@
/iswxdigit.m4
/setlocale_null.m4
/zzgnulib.m4
+/log.m4
diff --git a/m4/logl.m4 b/m4/logl.m4
new file mode 100644
index 0000000..7401da7
--- /dev/null
+++ b/m4/logl.m4
@@ -0,0 +1,200 @@
+# logl.m4 serial 13
+dnl Copyright (C) 2010-2020 Free Software Foundation, Inc.
+dnl This file is free software; the Free Software Foundation
+dnl gives unlimited permission to copy and/or distribute it,
+dnl with or without modifications, as long as this notice is preserved.
+
+AC_DEFUN([gl_FUNC_LOGL],
+[
+ AC_REQUIRE([gl_MATH_H_DEFAULTS])
+ AC_REQUIRE([gl_LONG_DOUBLE_VS_DOUBLE])
+
+ dnl Persuade glibc <math.h> to declare logl().
+ AC_REQUIRE([gl_USE_SYSTEM_EXTENSIONS])
+
+ LOGL_LIBM=
+ AC_CACHE_CHECK([whether logl() can be used without linking with libm],
+ [gl_cv_func_logl_no_libm],
+ [
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM(
+ [[#ifndef __NO_MATH_INLINES
+ # define __NO_MATH_INLINES 1 /* for glibc */
+ #endif
+ #include <math.h>
+ long double (*funcptr) (long double) = logl;
+ long double x;]],
+ [[return funcptr (x) > 1
+ || logl (x) > 1;]])],
+ [gl_cv_func_logl_no_libm=yes],
+ [gl_cv_func_logl_no_libm=no])
+ ])
+ if test $gl_cv_func_logl_no_libm = no; then
+ AC_CACHE_CHECK([whether logl() can be used with libm],
+ [gl_cv_func_logl_in_libm],
+ [
+ save_LIBS="$LIBS"
+ LIBS="$LIBS -lm"
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM(
+ [[#ifndef __NO_MATH_INLINES
+ # define __NO_MATH_INLINES 1 /* for glibc */
+ #endif
+ #include <math.h>
+ long double (*funcptr) (long double) = logl;
+ long double x;]],
+ [[return funcptr (x) > 1
+ || logl (x) > 1;]])],
+ [gl_cv_func_logl_in_libm=yes],
+ [gl_cv_func_logl_in_libm=no])
+ LIBS="$save_LIBS"
+ ])
+ if test $gl_cv_func_logl_in_libm = yes; then
+ LOGL_LIBM=-lm
+ fi
+ fi
+ if test $gl_cv_func_logl_no_libm = yes \
+ || test $gl_cv_func_logl_in_libm = yes; then
+ dnl Also check whether it's declared.
+ dnl Mac OS X 10.3 has logl() in libc but doesn't declare it in <math.h>.
+ AC_CHECK_DECL([logl], , [HAVE_DECL_LOGL=0], [[#include <math.h>]])
+ save_LIBS="$LIBS"
+ LIBS="$LIBS $LOGL_LIBM"
+ gl_FUNC_LOGL_WORKS
+ LIBS="$save_LIBS"
+ case "$gl_cv_func_logl_works" in
+ *yes) ;;
+ *) REPLACE_LOGL=1 ;;
+ esac
+ else
+ HAVE_LOGL=0
+ HAVE_DECL_LOGL=0
+ fi
+ if test $HAVE_LOGL = 0 || test $REPLACE_LOGL = 1; then
+ dnl Find libraries needed to link lib/logl.c.
+ if test $HAVE_SAME_LONG_DOUBLE_AS_DOUBLE = 1; then
+ AC_REQUIRE([gl_FUNC_LOG])
+ LOGL_LIBM="$LOG_LIBM"
+ else
+ if test $HAVE_LOGL = 0; then
+ AC_REQUIRE([gl_FUNC_FREXPL])
+ AC_REQUIRE([gl_FUNC_ISNANL])
+ AC_REQUIRE([gl_FUNC_FLOORL])
+ dnl Append $FREXPL_LIBM to LOGL_LIBM, avoiding gratuitous duplicates.
+ case " $LOGL_LIBM " in
+ *" $FREXPL_LIBM "*) ;;
+ *) LOGL_LIBM="$LOGL_LIBM $FREXPL_LIBM" ;;
+ esac
+ dnl Append $ISNANL_LIBM to LOGL_LIBM, avoiding gratuitous duplicates.
+ case " $LOGL_LIBM " in
+ *" $ISNANL_LIBM "*) ;;
+ *) LOGL_LIBM="$LOGL_LIBM $ISNANL_LIBM" ;;
+ esac
+ dnl Append $FLOORL_LIBM to LOGL_LIBM, avoiding gratuitous duplicates.
+ case " $LOGL_LIBM " in
+ *" $FLOORL_LIBM "*) ;;
+ *) LOGL_LIBM="$LOGL_LIBM $FLOORL_LIBM" ;;
+ esac
+ fi
+ fi
+ fi
+ AC_SUBST([LOGL_LIBM])
+])
+
+dnl Test whether logl() works.
+dnl On OSF/1 5.1, logl(-0.0L) is NaN.
+dnl On NetBSD 8.0, the result is accurate to only 16 digits.
+AC_DEFUN([gl_FUNC_LOGL_WORKS],
+[
+ AC_REQUIRE([AC_PROG_CC])
+ AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles
+ AC_CACHE_CHECK([whether logl works], [gl_cv_func_logl_works],
+ [
+ AC_RUN_IFELSE(
+ [AC_LANG_SOURCE([[
+#ifndef __NO_MATH_INLINES
+# define __NO_MATH_INLINES 1 /* for glibc */
+#endif
+#include <float.h>
+#include <math.h>
+/* Override the values of <float.h>, like done in float.in.h. */
+#if defined __i386__ && (defined __BEOS__ || defined __OpenBSD__)
+# undef LDBL_MANT_DIG
+# define LDBL_MANT_DIG 64
+# undef LDBL_MIN_EXP
+# define LDBL_MIN_EXP (-16381)
+# undef LDBL_MAX_EXP
+# define LDBL_MAX_EXP 16384
+#endif
+#if defined __i386__ && (defined __FreeBSD__ || defined __DragonFly__)
+# undef LDBL_MANT_DIG
+# define LDBL_MANT_DIG 64
+# undef LDBL_MIN_EXP
+# define LDBL_MIN_EXP (-16381)
+# undef LDBL_MAX_EXP
+# define LDBL_MAX_EXP 16384
+#endif
+#if (defined _ARCH_PPC || defined _POWER) && defined _AIX && (LDBL_MANT_DIG ==
106) && defined __GNUC__
+# undef LDBL_MIN_EXP
+# define LDBL_MIN_EXP DBL_MIN_EXP
+#endif
+#if defined __sgi && (LDBL_MANT_DIG >= 106)
+# undef LDBL_MANT_DIG
+# define LDBL_MANT_DIG 106
+# if defined __GNUC__
+# undef LDBL_MIN_EXP
+# define LDBL_MIN_EXP DBL_MIN_EXP
+# endif
+#endif
+#undef logl
+extern
+#ifdef __cplusplus
+"C"
+#endif
+long double logl (long double);
+static long double dummy (long double x) { return 0; }
+volatile long double gx;
+long double gy;
+int main (int argc, char *argv[])
+{
+ long double (* volatile my_logl) (long double) = argc ? logl : dummy;
+ int result = 0;
+ /* This test fails on OSF/1 5.1. */
+ {
+ gx = -0.0L;
+ gy = logl (gx);
+ if (!(gy + gy == gy))
+ result |= 1;
+ }
+ /* This test fails on NetBSD 8.0. */
+ {
+ const long double TWO_LDBL_MANT_DIG = /* 2^LDBL_MANT_DIG */
+ (long double) (1U << ((LDBL_MANT_DIG - 1) / 5))
+ * (long double) (1U << ((LDBL_MANT_DIG - 1 + 1) / 5))
+ * (long double) (1U << ((LDBL_MANT_DIG - 1 + 2) / 5))
+ * (long double) (1U << ((LDBL_MANT_DIG - 1 + 3) / 5))
+ * (long double) (1U << ((LDBL_MANT_DIG - 1 + 4) / 5));
+ long double x = 16.981137113807045L;
+ long double err = (my_logl (x) + my_logl (1.0L / x)) * TWO_LDBL_MANT_DIG;
+ if (!(err >= -100.0L && err <= 100.0L))
+ result |= 2;
+ }
+
+ return result;
+}
+]])],
+ [gl_cv_func_logl_works=yes],
+ [gl_cv_func_logl_works=no],
+ [case "$host_os" in
+ # Guess yes on glibc systems.
+ *-gnu* | gnu*) gl_cv_func_logl_works="guessing yes" ;;
+ # Guess yes on musl systems.
+ *-musl*) gl_cv_func_logl_works="guessing yes" ;;
+ # Guess yes on native Windows.
+ mingw*) gl_cv_func_logl_works="guessing yes" ;;
+ # If we don't know, obey --enable-cross-guesses.
+ *) gl_cv_func_logl_works="$gl_cross_guess_normal" ;;
+ esac
+ ])
+ ])
+])
diff --git a/src/datamash.c b/src/datamash.c
index 57844a3..9a32802 100644
--- a/src/datamash.c
+++ b/src/datamash.c
@@ -208,7 +208,7 @@ which require a pair of fields (e.g. 'pcov 2:6').\n"),
stdout);
fputs (_("Statistical Grouping operations:\n"),stdout);
fputs ("\
- mean, trimmean, median, q1, q3, iqr, perc, mode, antimode, \n\
+ mean, geomean, trimmean, median, q1, q3, iqr, perc, mode, antimode, \n\
pstdev, sstdev, pvar, svar, mad, madraw,\n\
pskew, sskew, pkurt, skurt, dpo, jarque,\n\
scov, pcov, spearson, ppearson\n\
diff --git a/src/field-ops.c b/src/field-ops.c
index 582b9a6..dc9764d 100644
--- a/src/field-ops.c
+++ b/src/field-ops.c
@@ -76,6 +76,8 @@ struct operation_data operations[] =
{STRING_SCALAR, IGNORE_FIRST, STRING_RESULT},
/* OP_MEAN */
{NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
+ /* OP_GEOMEAN */
+ {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
/* OP_MEDIAN */
{NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT},
/* OP_QUARTILE_1 */
@@ -525,6 +527,7 @@ field_op_collect (struct fieldop *op,
case OP_S_COVARIANCE:
case OP_P_PEARSON_COR:
case OP_S_PEARSON_COR:
+ case OP_GEOMEAN:
case OP_TRIMMED_MEAN:
field_op_add_value (op, num_value);
break;
@@ -689,6 +692,7 @@ field_op_summarize_empty (struct fieldop *op)
switch (op->op) /* LCOV_EXCL_BR_LINE */
{
case OP_MEAN:
+ case OP_GEOMEAN:
case OP_S_SKEWNESS:
case OP_P_SKEWNESS:
case OP_S_EXCESS_KURTOSIS:
@@ -859,6 +863,10 @@ field_op_summarize (struct fieldop *op)
op->params.percentile / 100.0 );
break;
+ case OP_GEOMEAN:
+ numeric_result = geometric_mean_value(op->values, op->num_values);
+ break;
+
case OP_TRIMMED_MEAN:
field_op_sort_values (op);
numeric_result = trimmed_mean_value ( op->values, op->num_values,
diff --git a/src/field-ops.h b/src/field-ops.h
index 134aab4..6e3bb3f 100644
--- a/src/field-ops.h
+++ b/src/field-ops.h
@@ -98,7 +98,8 @@ struct fieldop
/* NUMERIC_SCALAR operations */
size_t count; /* number of items collected so far in a group */
long double value; /* for single-value operations (sum, min, max, absmin,
- absmax, mean) - this is the accumulated value */
+ absmax, mean, geomean) - this is the accumulated
+ value */
/* NUMERIC_VECTOR operations */
long double *values; /* array for multi-valued ops (median,mode,stdev) */
diff --git a/src/op-defs.c b/src/op-defs.c
index bddbcc5..13e0b7b 100644
--- a/src/op-defs.c
+++ b/src/op-defs.c
@@ -46,6 +46,7 @@ struct field_operation_definition field_operations[] =
{"last", OP_LAST, MODE_GROUPBY},
{"rand", OP_RAND, MODE_GROUPBY},
{"mean", OP_MEAN, MODE_GROUPBY},
+ {"geomean", OP_GEOMEAN, MODE_GROUPBY},
{"median", OP_MEDIAN, MODE_GROUPBY},
{"q1", OP_QUARTILE_1, MODE_GROUPBY},
{"q3", OP_QUARTILE_3, MODE_GROUPBY},
diff --git a/src/op-defs.h b/src/op-defs.h
index 898c441..21c86de 100644
--- a/src/op-defs.h
+++ b/src/op-defs.h
@@ -36,6 +36,7 @@ enum field_operation
OP_LAST,
OP_RAND,
OP_MEAN,
+ OP_GEOMEAN,
OP_MEDIAN,
OP_QUARTILE_1,
OP_QUARTILE_3,
diff --git a/src/utils.c b/src/utils.c
index 4b87bda..b643f9d 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -115,6 +115,18 @@ arithmetic_mean_value (const long double * const values,
const size_t n)
return mean;
}
+/* Compute the geometric mean of an array of numbers > 0.0 */
+long double _GL_ATTRIBUTE_PURE
+geometric_mean_value (const long double * const values, const size_t n)
+{
+ long double sum=0;
+ long double mean;
+ for (size_t i = 0; i < n; i++)
+ sum += logl(values[i]);
+ mean = sum / n ;
+ return expl(mean);
+}
+
long double _GL_ATTRIBUTE_PURE
variance_value (const long double * const values, size_t n, int df)
{
diff --git a/src/utils.h b/src/utils.h
index 1eee21e..c88269e 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -36,6 +36,10 @@ is_na (const char* value, const size_t len);
long double
arithmetic_mean_value (const long double * const values, const size_t n);
+/* Given an array of doubles > 0.0, return the geometric mean value */
+long double
+geometric_mean_value (const long double * const values, const size_t n);
+
/*
Given a sorted array of doubles, return the value of 'percentile'.
Example of valid 'percentile':
diff --git a/tests/datamash-stats.pl b/tests/datamash-stats.pl
index d888c76..88644ac 100755
--- a/tests/datamash-stats.pl
+++ b/tests/datamash-stats.pl
@@ -297,6 +297,22 @@ my @Tests =
['mean12','mean 1' , {IN_PIPE=>$seq22}, {OUT => "67.45\n"},],
['mean13','mean 1' , {IN_PIPE=>$seq23}, {OUT => "6.125\n"},],
+ # Test geometric mean
+ ['geomean1', 'geomean 1' , {IN_PIPE=>$seq1}, {OUT => "2.213\n"}],
+ ['geomean2', 'geomean 1' , {IN_PIPE=>$seq2}, {OUT => "1.817\n"}],
+ ['geomean3', 'geomean 1' , {IN_PIPE=>$seq3}, {OUT => "2\n"}],
+ # sorted/unsorted should not change the result.
+ ['geomean4', 'geomean 1' , {IN_PIPE=>$seq9}, {OUT => "8.464\n"},],
+ ['geomean5', 'geomean 1' , {IN_PIPE=>$seq10}, {OUT => "9.573\n"}],
+ ['geomean6', 'geomean 1' , {IN_PIPE=>$seq12}, {OUT => "11.817\n"},],
+ ['geomean7', 'geomean 1' , {IN_PIPE=>$seq11}, {OUT => "10.653\n"},],
+ ['geomean8', 'geomean 1' , {IN_PIPE=>$seq12_unsorted}, {OUT =>
"11.817\n"},],
+ ['geomean9', '--sort geomean 1' ,
+ {IN_PIPE=>$seq12_unsorted}, {OUT => "11.817\n"},],
+ ['geomean10','geomean 1' , {IN_PIPE=>$seq20}, {OUT => "99.596\n"},],
+ ['geomean11','geomean 1' , {IN_PIPE=>$seq21}, {OUT => "34.644\n"},],
+ ['geomean12','geomean 1' , {IN_PIPE=>$seq22}, {OUT => "67.386\n"},],
+
# Test median
['med1', 'median 1' , {IN_PIPE=>$seq1}, {OUT => "2.5\n"}],
['med2', 'median 1' , {IN_PIPE=>$seq2}, {OUT => "2\n"}],
--
2.17.1
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [PATCH] Add geomean stats operation for computing geometric mean,
Shawn <=