gnuastro-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[gnuastro-commits] master 1e98f7a: Table: two new options to limit rows:


From: Mohammad Akhlaghi
Subject: [gnuastro-commits] master 1e98f7a: Table: two new options to limit rows: --rowlimit and --rowrandom
Date: Fri, 5 Feb 2021 20:08:32 -0500 (EST)

branch: master
commit 1e98f7a61eb61cf4b0fe38016b4f6712dc13af6b
Author: Mohammad Akhlaghi <mohammad@akhlaghi.org>
Commit: Mohammad Akhlaghi <mohammad@akhlaghi.org>

    Table: two new options to limit rows: --rowlimit and --rowrandom
    
    Until now, the only position-based row-selection options available to a
    user were the '--head' and '--tail' options. But in some scenarios, we
    want a certain range of rows based on their position in the full table (for
    example rows 100 to 200). In other cases, to get a feeling of the dataset,
    or for statistical tests, we want a random set of the rows. Until now, it
    was not easy to extract these.
    
    With this commit, the new '--rowlimit' option enables selecting a
    contiguous range of rows anywhere within the table and the '--rowrandom'
    enables selecting a random sub-set of the rows (after all value-based
    selections have been applied).
---
 NEWS              |   8 +++
 bin/table/args.h  |  40 ++++++++++++
 bin/table/main.h  |   7 +++
 bin/table/table.c | 180 ++++++++++++++++++++++++++++++++++++++++++++----------
 bin/table/ui.c    |  79 ++++++++++++++++++++++--
 bin/table/ui.h    |   3 +
 doc/gnuastro.texi |  34 +++++++++++
 lib/checkset.c    |   2 +-
 8 files changed, 314 insertions(+), 39 deletions(-)

diff --git a/NEWS b/NEWS
index e194a72..980d9eb 100644
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,14 @@ See the end of the file for license conditions.
      columns of the final output table. This is handy when you want a
      "clean" (no NaN values in any column) table, but the table has many
      columns.
+   --rowlimit: new option to specify the positional interval of rows to
+     show. Until now the '--head' or '--tail' options would just allow
+     seeing the first or last few rows. You can use this to view a
+     contiguous set of rows in the middle of the table.
+   --rowrandom: Make a random selection of the rows. This option is useful
+     when you have a large dataset and just want to see a random sub-set of
+     the rows. It takes an integer, selects that many rows from the input
+     randomly.
 
 ** Removed features
 
diff --git a/bin/table/args.h b/bin/table/args.h
index b66842e..bf5b13a 100644
--- a/bin/table/args.h
+++ b/bin/table/args.h
@@ -316,6 +316,46 @@ struct argp_option program_options[] =
       GAL_OPTIONS_NOT_SET
     },
     {
+      "rowlimit",
+      UI_KEY_ROWLIMIT,
+      "INT,INT",
+      0,
+      "Only rows in this row-counter range.",
+      UI_GROUP_OUTROWS,
+      &p->rowlimit,
+      GAL_TYPE_STRING,
+      GAL_OPTIONS_RANGE_GE_0,
+      GAL_OPTIONS_NOT_MANDATORY,
+      GAL_OPTIONS_NOT_SET,
+      gal_options_parse_csv_float64
+    },
+    {
+      "randomrows",
+      UI_KEY_ROWRANDOM,
+      "INT",
+      0,
+      "Number of rows to select randomly.",
+      UI_GROUP_OUTROWS,
+      &p->rowrandom,
+      GAL_TYPE_SIZE_T,
+      GAL_OPTIONS_RANGE_GE_0,
+      GAL_OPTIONS_NOT_MANDATORY,
+      GAL_OPTIONS_NOT_SET,
+    },
+    {
+      "envseed",
+      UI_KEY_ENVSEED,
+      0,
+      0,
+      "Use GSL_RNG_SEED env. for '--randomrows'.",
+      UI_GROUP_OUTROWS,
+      &p->envseed,
+      GAL_OPTIONS_NO_ARG_TYPE,
+      GAL_OPTIONS_RANGE_0_OR_1,
+      GAL_OPTIONS_NOT_MANDATORY,
+      GAL_OPTIONS_NOT_SET
+    },
+    {
       "noblank",
       UI_KEY_NOBLANK,
       "STR[,STR]",
diff --git a/bin/table/main.h b/bin/table/main.h
index c924d7b..dee83e2 100644
--- a/bin/table/main.h
+++ b/bin/table/main.h
@@ -24,6 +24,7 @@ along with Gnuastro. If not, see 
<http://www.gnu.org/licenses/>.
 #define MAIN_H
 
 /* Include necessary headers */
+#include <gsl/gsl_rng.h>
 #include <gnuastro/data.h>
 
 #include <gnuastro-internal/options.h>
@@ -100,6 +101,9 @@ struct tableparams
   uint8_t          descending;  /* Sort columns in descending order.    */
   size_t                 head;  /* Output only the no. of top rows.     */
   size_t                 tail;  /* Output only the no. of bottom rows.  */
+  gal_data_t        *rowlimit;  /* Output rows in row-counter range.    */
+  size_t            rowrandom;  /* Number of rows to show randomly.     */
+  uint8_t             envseed;  /* Use the environment for random seed. */
   gal_data_t         *noblank;  /* Remove rows that have blank.         */
   gal_list_str_t *catcolumnfile; /* Filename to concat column wise.     */
   gal_list_str_t *catcolumnhdu;  /* HDU/extension for the catcolumn.    */
@@ -123,6 +127,9 @@ struct tableparams
   time_t              rawtime;  /* Starting time of the program.        */
   gal_data_t       **colarray;  /* Array of columns, with arithmetic.   */
   size_t          numcolarray;  /* Number of elements in 'colarray'.    */
+  gsl_rng                *rng;  /* Main random number generator.        */
+  const char        *rng_name;  /* Name of random number generator.     */
+  unsigned long int  rng_seed;  /* Random number generator seed.        */
 
   /* For arithmetic operators. */
   gal_list_str_t  *wcstoimg_p;  /* Pointer to the node.                 */
diff --git a/bin/table/table.c b/bin/table/table.c
index 60bbc2c..8cf4668 100644
--- a/bin/table/table.c
+++ b/bin/table/table.c
@@ -29,6 +29,7 @@ along with Gnuastro. If not, see 
<http://www.gnu.org/licenses/>.
 #include <stdlib.h>
 #include <unistd.h>
 
+#include <gsl/gsl_rng.h>
 #include <gsl/gsl_heapsort.h>
 
 #include <gnuastro/txt.h>
@@ -338,7 +339,7 @@ table_selection_equal_or_notequal(struct tableparams *p, 
gal_data_t *col,
 
 
 static void
-table_selection(struct tableparams *p)
+table_select_by_value(struct tableparams *p)
 {
   uint8_t *u;
   struct list_select *tmp;
@@ -559,47 +560,159 @@ table_sort(struct tableparams *p)
 
 
 
+/* Keep a random sub-set of 'numrandom' rows of 'table'.
+
+   A one-byte-per-row mask is built, 'numrandom' distinct row indexes are
+   drawn from 'rng' and flagged in the mask, and the mask is then
+   converted into a permutation array that is handed to
+   'table_apply_permutation' together with 'numrandom'.
+
+   Arguments:
+     table:      the table to operate on; its 'size' and 'dsize' are used
+                 as the number of rows.
+     rng:        GSL random number generator used to draw the indexes.
+     numrandom:  number of rows to select.
+     minmapsize, quietmmap: passed unchanged to 'gal_data_alloc' for the
+                 two temporary arrays.
+
+   Return value: 'EXIT_FAILURE' when 'numrandom' is larger than the number
+   of rows; in that case the function returns before allocating anything,
+   so the input isn't touched. Otherwise the selection is applied and
+   'EXIT_SUCCESS' is returned. */
+static int
+table_random_rows(gal_data_t *table, gsl_rng *rng, size_t numrandom,
+                  size_t minmapsize, int quietmmap)
+{
+  int bad;
+  uint8_t *marr, *u, *uf;
+  gal_data_t *mask, *perm;
+  size_t i, b, g, *s, *sf, ind;
+
+  /* Sanity check: more rows were requested than are available, so report
+     failure to the caller without modifying anything. */
+  if(numrandom>table->size)
+    return EXIT_FAILURE;
+
+  /* Allocate space for the permutation array and the mask array (both
+     with the table's 'dsize'). Initialize the mask array to 1 (so we
+     later set the rows we want to 0).
+
+     NOTE(review): the 'do ... while' below writes one element before
+     testing the end pointer, so it assumes the table has at least one
+     row; confirm that an empty table can never reach this function. */
+  mask=gal_data_alloc(NULL, GAL_TYPE_UINT8, 1, table->dsize, NULL, 0,
+                      minmapsize, quietmmap, NULL, NULL, NULL);
+  perm=gal_data_alloc(NULL, GAL_TYPE_SIZE_T, 1, table->dsize, NULL, 0,
+                      minmapsize, quietmmap, NULL, NULL, NULL);
+  uf=(u=mask->array)+mask->size; do *u++ = 1; while(u<uf);
+
+  /* Select the row numbers: one new (not yet selected) row per
+     iteration, flagged by setting its mask element to 0. */
+  marr=mask->array;
+  for(i=0;i<numrandom;++i)
+    {
+      /* Select a random index and make sure it's new: the draw is
+         repeated until it lands on a row whose mask is still 1. The
+         product is truncated to an integer when assigned to 'ind'.
+
+         NOTE(review): the number of repeated draws grows as 'numrandom'
+         approaches the number of rows; presumably acceptable for the
+         expected use (a small sample of a large table), but confirm. */
+      bad=1;
+      while(bad)
+        {
+          ind = gsl_rng_uniform(rng) * table->size;
+          if(marr[ind]) bad=0;
+        }
+      marr[ind]=0;
+    }
+
+  /* Fill the permutation array in row order: every selected row (mask
+     is 0) gets the next 'good' counter and every other row gets the next
+     'bad' counter, so the selected rows keep their relative order. */
+  g=0;          /* Good indexes (starting from 0). */
+  b=numrandom;  /* Bad indexes (starting from total number of good). */
+  u=mask->array;
+  sf=(s=perm->array)+perm->size;
+  do *s = *u++ ? b++ : g++; while(++s<sf);
+
+  /* Apply the final permutation to the whole table. 'numrandom' is
+     passed as the third argument; presumably it is the number of rows to
+     keep after the permutation (confirm against
+     'table_apply_permutation'). */
+  table_apply_permutation(table, perm->array, numrandom, 1);
+
+  /* Clean up (free the two temporary arrays) and return. */
+  gal_data_free(mask);
+  gal_data_free(perm);
+  return EXIT_SUCCESS;
+}
+
+
+
+
+
 static void
-table_head_tail(struct tableparams *p)
+table_select_by_position(struct tableparams *p)
 {
   char **strarr;
   gal_data_t *col;
   size_t i, start, end;
+  double *darr = p->rowlimit ? p->rowlimit->array : NULL;
+
+  /* Random row selection (by position, not value). This step is
+     independent of the other operations of this function, so as soon as
+     its finished return. */
+  if(p->rowrandom)
+    {
+      if( table_random_rows(p->table, p->rng, p->rowrandom,
+                            p->cp.minmapsize, p->cp.quietmmap)
+          == EXIT_FAILURE && p->cp.quiet==0 )
+        error(EXIT_SUCCESS, 0, "'--rowrandom' not activated because "
+              "the number of rows in the table at this stage (%zu) "
+              "is smaller than the number of requested random rows "
+              "(%zu). You can supress this message with '--quiet'",
+              p->table->size, p->rowrandom);
+      return;
+    }
+
+  /* Sanity check  */
+  if(p->rowlimit)
+    {
+      if(darr[0]>=p->table->size)
+        error(EXIT_FAILURE, 0, "the first value to '--rowlimit' (%g) "
+              "is larger than the number of rows (%zu)",
+              darr[0]+1, p->table->size);
+      else if( darr[1]>=p->table->size )
+        error(EXIT_FAILURE, 0, "the second value to '--rowlimit' (%g) "
+              "is larger than the number of rows (%zu)",
+              darr[1]+1, p->table->size);
+    }
 
   /* Go over all the columns and make the necessary corrections. */
   for(col=p->table;col!=NULL;col=col->next)
     {
-      /* If we are dealing with strings, we'll need to free the strings
-         that the columns that will not be used point to (outside the
-         allocated array directly 'gal_data_t'). We don't have to worry
-         about the space for the actual pointers (they will be free'd by
-         'free' in any case, since they are in the initially allocated
-         array).*/
+      /* FOR STRING: we'll need to free the individual strings that will
+         not be used (outside the allocated array directly
+         'gal_data_t'). We don't have to worry about the space for the
+         actual pointers (they will be free'd by 'free' in any case, since
+         they are in the initially allocated array).*/
       if(col->type==GAL_TYPE_STRING)
         {
-          /* Set the start and ending indexs. */
-          start = p->head!=GAL_BLANK_SIZE_T ? p->head        : 0;
-          end   = p->head!=GAL_BLANK_SIZE_T ? p->table->size : p->tail;
-
-          /* Free their allocated spaces. */
+          /* Parse the rows and free extra pointers. */
           strarr=col->array;
-          for(i=start; i<end; ++i) { free(strarr[i]); strarr[i]=NULL; }
+          if(p->rowlimit)
+            {
+              /* Note that the given values to '--rowlimit' started from 1,
+                 but in 'ui.c' we subtracted one from it (so at this stage,
+                 it starts from 0). */
+              start = darr[0];
+              end   = darr[1];
+              for(i=0;i<p->table->size;++i)
+                if(i<start || i>end) { free(strarr[i]); strarr[i]=NULL; }
+            }
+          else
+            {
+              /* Free their allocated spaces. */
+              start = p->head!=GAL_BLANK_SIZE_T ? p->head        : 0;
+              end   = p->head!=GAL_BLANK_SIZE_T ? p->table->size : p->tail;
+              for(i=start; i<end; ++i) { free(strarr[i]); strarr[i]=NULL; }
+            }
         }
 
-      /* For '--tail', we'll need to bring the last columns to the
-         start. Note that we are using 'memmove' because we want to be safe
-         with overlap. */
-      if(p->tail!=GAL_BLANK_SIZE_T)
-        memmove(col->array,
-                gal_pointer_increment(col->array, col->size - p->tail,
-                                      col->type),
-                p->tail*gal_type_sizeof(col->type));
-
-      /* In any case (head or tail), the new number of column elements is
-         the given value. */
-      col->size = col->dsize[0] = ( p->head!=GAL_BLANK_SIZE_T
-                                    ? p->head
-                                    : p->tail );
+      /* Make the final adjustment. */
+      if(p->rowlimit)
+        {
+          /* Move the values up to the top and correct the size. */
+          col->size=darr[1]-darr[0]+1;
+          memmove(col->array,
+                  gal_pointer_increment(col->array, darr[0], col->type),
+                  (darr[1]-darr[0]+1)*gal_type_sizeof(col->type));
+        }
+      else
+        {
+          /* For '--tail', we'll need to bring the last columns to the
+             start. Note that we are using 'memmove' because we want to be
+             safe with overlap. */
+          if(p->tail!=GAL_BLANK_SIZE_T)
+            memmove(col->array,
+                    gal_pointer_increment(col->array, col->size - p->tail,
+                                          col->type),
+                    p->tail*gal_type_sizeof(col->type));
+
+          /* In any case (head or tail), the new number of column elements
+             is the given value. */
+          col->size = col->dsize[0] = ( p->head!=GAL_BLANK_SIZE_T
+                                        ? p->head
+                                        : p->tail );
+        }
     }
 }
 
@@ -853,15 +966,18 @@ table_noblank(struct tableparams *p)
 void
 table(struct tableparams *p)
 {
-  /* Apply a certain range (if required) to the output sample. */
-  if(p->selection) table_selection(p);
+  /* Apply ranges based on row values (if required). */
+  if(p->selection) table_select_by_value(p);
 
   /* Sort it (if required). */
   if(p->sort) table_sort(p);
 
   /* If the output number of rows is limited, apply them. */
-  if(p->head!=GAL_BLANK_SIZE_T || p->tail!=GAL_BLANK_SIZE_T)
-    table_head_tail(p);
+  if( p->rowlimit
+      || p->rowrandom
+      || p->head!=GAL_BLANK_SIZE_T
+      || p->tail!=GAL_BLANK_SIZE_T )
+    table_select_by_position(p);
 
   /* If any operations are needed, do them. */
   if(p->outcols)
diff --git a/bin/table/ui.c b/bin/table/ui.c
index 378f0db..bf173c3 100644
--- a/bin/table/ui.c
+++ b/bin/table/ui.c
@@ -271,11 +271,15 @@ ui_read_check_only_options(struct tableparams *p)
               "v1x,v1y:v2x,v2y:v3x,v3y:...");
     }
 
-
-  /* Make sure '--head' and '--tail' aren't given together. */
-  if(p->head!=GAL_BLANK_SIZE_T && p->tail!=GAL_BLANK_SIZE_T)
-    error(EXIT_FAILURE, 0, "'--head' and '--tail' options cannot be "
-          "called together");
+  /* Make sure only one of the positional row selection operations is
+     called in this run.*/
+  if(p->rowlimit
+     && p->rowrandom
+     && p->head!=GAL_BLANK_SIZE_T
+     && p->tail!=GAL_BLANK_SIZE_T)
+    error(EXIT_FAILURE, 0, "only one of the following options can be "
+          "called in one run: '--head', '--tail', '--rowlimit' and "
+          "'--rowrandom'");
 
   /* If '--colmetadata' is given, make sure none of the given options have
      more than three values. */
@@ -1042,7 +1046,8 @@ ui_check_select_sort_after(struct tableparams *p, size_t 
nselect,
 static void
 ui_preparations(struct tableparams *p)
 {
-  size_t *colmatch;
+  double *darr;
+  size_t i, *colmatch;
   gal_list_str_t *lines;
   size_t nselect=0, origoutncols=0;
   size_t sortindout=GAL_BLANK_SIZE_T;
@@ -1132,6 +1137,51 @@ ui_preparations(struct tableparams *p)
               ? p->table->size
               : p->tail );
 
+  /* If rows are given, do some sanity checks and make sure that they are
+     within the table's limits. */
+  if(p->rowlimit)
+    {
+      /* There should only be two values. */
+      if(p->rowlimit->size!=2)
+        error(EXIT_FAILURE, 0, "only two should be given to "
+              "'--rowlimit' (the top and bottom row numbers specifying "
+              "your desired range)");
+
+      /* Do individual checks. */
+      darr=p->rowlimit->array;
+      for(i=0;i<p->rowlimit->size;++i)
+        {
+          /* Make sure it isn't 0 or negative. */
+          if( darr[i]<=0 )
+            error(EXIT_FAILURE, 0, "%g (value given to '--rowlimit') "
+                  "is smaller than, or equal to, zero! This option's "
+                  "values are row-counters (starting from 1), so they "
+                  "must be positive integers", darr[i]);
+
+          /* Make sure its an integer. */
+          if( darr[i] != (size_t)(darr[i]) )
+            error(EXIT_FAILURE, 0, "%g (value given to '--rowlimit') is "
+                  "not an integer! This option's values are row-counters "
+                  "so they must be integers.", darr[i]);
+
+          /* Subtract 1 from the value, so it counts from 0. */
+          --darr[i];
+        }
+
+      /* Make sure that the first value is smaller than the second. */
+      if( darr[0] > darr[1] )
+        error(EXIT_FAILURE, 0, "the first value to '--rowlimit' (%g) is "
+              "larger than the second (%g). This option's values defines "
+              "a row-counter interval, assuming the first value is the top "
+              "of the desired interval (smaller row counter) and the second "
+              "value is the bottom of the desired interval (larger row "
+              "counter)", darr[0], darr[1]);
+    }
+
+  /* If random rows are desired, we need to define a GSL random number
+     generator structure. */
+  if(p->rowrandom)
+    p->rng=gal_checkset_gsl_rng(p->envseed, &p->rng_name, &p->rng_seed);
 
   /* Clean up. */
   free(colmatch);
@@ -1211,6 +1261,19 @@ ui_read_check_inputs_setup(int argc, char *argv[], 
struct tableparams *p)
 
   /* Read/allocate all the necessary starting arrays. */
   ui_preparations(p);
+
+  /* Let the user know basic information if necessary (for example when a
+     random number generator has been used). */
+  if(p->rng && !p->cp.quiet)
+    {
+      /* Write the information. */
+      printf(PROGRAM_NAME" "PACKAGE_VERSION" started on %s",
+             ctime(&p->rawtime));
+      printf("Parameters used for '--randomrows':\n");
+      printf("  - Random number generator name: %s\n", p->rng_name);
+      printf("  - Random number generator seed: %lu\n", p->rng_seed);
+      printf("(use '--quiet' to supress this starting message)\n");
+    }
 }
 
 
@@ -1245,4 +1308,8 @@ ui_free_report(struct tableparams *p)
   gal_list_data_free(p->table);
   gal_list_data_free(p->colmetadata);
   if(p->colarray) free(p->colarray);
+
+  /* If a random number generator was allocated, free it. */
+  if(p->rng) gsl_rng_free(p->rng);
+
 }
diff --git a/bin/table/ui.h b/bin/table/ui.h
index 5b29de3..a898b19 100644
--- a/bin/table/ui.h
+++ b/bin/table/ui.h
@@ -68,6 +68,9 @@ enum option_keys_enum
   /* Only with long version (start with a value 1000, the rest will be set
      automatically). */
   UI_KEY_POLYGON         = 1000,
+  UI_KEY_ENVSEED,
+  UI_KEY_ROWLIMIT,
+  UI_KEY_ROWRANDOM,
   UI_KEY_INPOLYGON,
   UI_KEY_OUTPOLYGON,
   UI_KEY_CATCOLUMNRAWNAME,
diff --git a/doc/gnuastro.texi b/doc/gnuastro.texi
index 1680de4..757e742 100644
--- a/doc/gnuastro.texi
+++ b/doc/gnuastro.texi
@@ -10482,6 +10482,7 @@ When called with @option{--sort}, rows will be sorted 
in descending order.
 Only print the given number of rows from the @emph{top} of the final table.
 Note that this option only affects the @emph{output} table.
 For example if you use @option{--sort}, or @option{--range}, the printed rows 
are the first @emph{after} applying the sort sorting, or selecting a range of 
the full input.
+This option cannot be called with @option{--tail}, @option{--rowlimit} or 
@option{--rowrandom}.
 
 @cindex GNU Coreutils
 If the given value to @option{--head} is 0, the output columns won't have any 
rows and if its larger than the number of rows in the input table, all the rows 
are printed (this option is effectively ignored).
@@ -10491,6 +10492,39 @@ This behavior is taken from the @command{head} program 
in GNU Coreutils.
 @itemx --tail=INT
 Only print the given number of rows from the @emph{bottom} of the final table.
 See @option{--head} for more.
+This option cannot be called with @option{--head}, @option{--rowlimit} or 
@option{--rowrandom}.
+
+@item --rowlimit=INT,INT
+Only return the rows within the requested positional range (inclusive on both 
sides).
+Therefore, @code{--rowlimit=5,7} will return 3 of the input rows, rows 5, 6 and 
7.
+This option will abort if any of the given values is larger than the total 
number of rows in the table.
+
+With the @option{--head} or @option{--tail} options you can only see the top 
or bottom few rows.
+However, with this option, you can limit the returned rows to a contiguous set 
of rows in the middle of the table.
+Therefore this option cannot be called with @option{--head}, @option{--tail}, 
or @option{--rowrandom}.
+
+@item --rowrandom=INT
+@cindex Random row selection
+@cindex Row selection, by random
+Select @code{INT} rows from the input table at random (assuming a uniform 
distribution).
+This option is applied @emph{after} the value-based selection options (like 
@option{--sort}, @option{--range}, @option{--polygon}, etc.).
+On the other hand, only the row counters are randomly selected, this option 
doesn't change the order.
+Therefore, if @option{--rowrandom} is called together with @option{--sort}, 
the returned rows are still sorted.
+This option cannot be called with @option{--head}, @option{--tail}, or 
@option{--rowlimit}.
+
+This option will only have an effect if @code{INT} is smaller than, or equal to, the number 
of rows when it is activated (after the value-based selection options have been 
applied).
+When there are fewer rows, a warning is printed, saying that this option has 
no effect.
+The warning can be disabled with the @option{--quiet} option.
+
+@cindex Reproducibility
+Due to its nature (to be random), the output of this option differs in each 
run.
+Therefore 5 calls to Table with @option{--rowrandom} on the same input table 
will generate 5 different outputs.
+If you want a reproducible random selection, set the @code{GSL_RNG_SEED} 
environment variable and also use the @option{--envseed} option, for more see 
@ref{Generating random numbers}.
+
+@item --envseed
+Read the random number generator seed from the @code{GSL_RNG_SEED} environment 
variable for @option{--rowrandom} (instead of generating a different seed 
internally on every run).
+This is useful if you want a reproducible random selection of the input rows.
+For more, see @ref{Generating random numbers}.
 
 @item -b STR[,STR[,STR]]
 @itemx --noblank=STR[,STR[,STR]]
diff --git a/lib/checkset.c b/lib/checkset.c
index 4247938..e4ddd04 100644
--- a/lib/checkset.c
+++ b/lib/checkset.c
@@ -49,7 +49,7 @@ along with Gnuastro. If not, see 
<http://www.gnu.org/licenses/>.
 /**************************************************************/
 /* The GSL random number generator (RNG) reads values from the
    environment. This function is designed to make the job easier for any
-   program using GSL's RNG. If the user doesn't want to set the */
+   Gnuastro program using GSL's RNG functions. */
 gsl_rng *
 gal_checkset_gsl_rng(uint8_t envseed_bool, const char **name,
                      unsigned long int *seed)



reply via email to

[Prev in Thread] Current Thread [Next in Thread]