[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[gnuastro-commits] master 4dda73a8 1/2: Match: now has multiple arrangem
From: |
Mohammad Akhlaghi |
Subject: |
[gnuastro-commits] master 4dda73a8 1/2: Match: now has multiple arrangements of output rows |
Date: |
Sat, 14 Dec 2024 22:10:23 -0500 (EST) |
branch: master
commit 4dda73a83c943083e05ddc962fa28a02b7c3ec52
Author: Mohammad Akhlaghi <mohammad@akhlaghi.org>
Commit: Mohammad Akhlaghi <mohammad@akhlaghi.org>
Match: now has multiple arrangements of output rows
Until now, Match only done one type of output arrangement: what is known in
SQL as the INNER JOIN. But other arrangements are also sometimes necessary.
With this commit, the other well-known arragements of an output of Match
have been implemented: full, outer and outer-within-aperture. The exapmles
in the newly added section of the book fully describe them.
---
NEWS | 27 +++
bin/match/args.h | 15 ++
bin/match/astmatch.conf | 3 +
bin/match/main.h | 2 +-
bin/match/match.c | 453 ++++++++++++++++++++++++++++++------------------
bin/match/ui.c | 366 ++++++++++++++++++++++----------------
bin/match/ui.h | 3 +-
doc/gnuastro.texi | 338 +++++++++++++++++++++++++++++-------
lib/gnuastro/match.h | 36 +++-
lib/match.c | 233 +++++++++++++++++++------
10 files changed, 1034 insertions(+), 442 deletions(-)
diff --git a/NEWS b/NEWS
index d26abb40..7fb40cb0 100644
--- a/NEWS
+++ b/NEWS
@@ -50,6 +50,27 @@ See the end of the file for license conditions.
--mean-error: Measure the error of the mean for each label (object or
clump).
+*** Match
+
+ --arrange: the arrangement of the match output. Until now, the Match
+ program only had the 'inner' arrangement (known in SQL as "INNER
+ JOIN"). From this version, other arrangements are also available and
+ fully described (with example tables and short tutorials) in the newly
+ added "Arranging match output" section of the book. A short summary of
+ the new arrangements is given below:
+
+ - full: known as "FULL OUTER JOIN" in SQL. Its output has both the
+ matched rows and the non-matched rows in one catalog.
+
+ - outer: known as "RIGHT OUTER JOIN" in SQL. Its output has the same
+ number of rows as the second (query) input catalog to Match, giving
+ the ability to find the (possibly repeated) matching rows from the
+ first (reference) catalog. See the demo for a real-world example.
+
+ - outer-within-aperture: similar to 'outer', but if the nearest
+ reference catalog entry is more distant than the given aperture, it
+ will be NaN/blank in the output.
+
*** astscript-fits-view
--topcat4k (or '-k') option is added for easy display on 4K monitors
@@ -75,6 +96,12 @@ See the end of the file for license conditions.
** Removed features
** Changed features
+
+*** Library
+
+ - gal_match_kdtree: a new 'arrange' argument has been added to define the
+ arrangement of the output.
+
** Bugs fixed
- bug #66216: NaN values in MakeProfile outputs for very small profiles
diff --git a/bin/match/args.h b/bin/match/args.h
index ce3b4a30..ae0dca09 100644
--- a/bin/match/args.h
+++ b/bin/match/args.h
@@ -49,6 +49,7 @@ struct argp_option program_options[] =
+
/* Outputs. */
{
"logasoutput",
@@ -90,6 +91,20 @@ struct argp_option program_options[] =
GAL_OPTIONS_NOT_SET,
gal_options_parse_csv_strings
},
+ {
+ "arrange",
+ UI_KEY_ARRANGE,
+ "STR",
+ 0,
+ "inner, outer, outer-within-aperture or full.",
+ GAL_OPTIONS_GROUP_OUTPUT,
+ &p->type,
+ GAL_TYPE_STRING,
+ GAL_OPTIONS_RANGE_ANY,
+ GAL_OPTIONS_MANDATORY,
+ GAL_OPTIONS_NOT_SET,
+ ui_parse_match_type,
+ },
diff --git a/bin/match/astmatch.conf b/bin/match/astmatch.conf
index 5d7edc38..e33059bc 100644
--- a/bin/match/astmatch.conf
+++ b/bin/match/astmatch.conf
@@ -23,5 +23,8 @@
hdu2 1
kdtreehdu 1
+# Output
+ arrange inner
+
# Catalog matching
kdtree internal
diff --git a/bin/match/main.h b/bin/match/main.h
index 2ea13324..c69013e3 100644
--- a/bin/match/main.h
+++ b/bin/match/main.h
@@ -63,6 +63,7 @@ struct matchparams
char *input1name; /* First input filename. */
char *input2name; /* Second input filename. */
char *hdu2; /* Second input's HDU. */
+ uint8_t type; /* Type of matching: inner, outer, full */
gal_data_t *ccol1; /* Array of first input column names. */
gal_data_t *ccol2; /* Array of second input column names. */
gal_data_t *coord; /* Array of manual coordinate values. */
@@ -74,7 +75,6 @@ struct matchparams
uint8_t notmatched; /* Output is rows that don't match. */
/* Internal */
- int mode; /* Mode of operation: image or catalog. */
gal_data_t *cols1; /* Column values of first input. */
gal_data_t *cols2; /* Column values of second input. */
gal_list_str_t *acols; /* Output columns from first input. */
diff --git a/bin/match/match.c b/bin/match/match.c
index 9bf4c6ff..f74606e0 100644
--- a/bin/match/match.c
+++ b/bin/match/match.c
@@ -225,27 +225,85 @@ match_catalog_permute_inplace(struct matchparams *p,
gal_data_t *in,
static void
match_arrange_in_new_col(struct matchparams *p, gal_data_t *in,
- size_t *permutation, size_t nummatched)
+ size_t *permut, size_t nummatched, uint8_t f1s2)
{
- size_t c=0, i, n;
- size_t istart=p->notmatched ? nummatched : 0;
- size_t iend=p->notmatched ? in->dsize[0] : nummatched;
- size_t outrows=p->notmatched ? in->dsize[0] - nummatched : nummatched;
+ void *out;
+ size_t istart, iend, outrows;
+ size_t i, j, n, c=0, offset=0;
+
+ /* Set the match types. */
+ switch(p->type)
+ {
+ case GAL_MATCH_ARRANGE_INNER:
+ istart=p->notmatched ? nummatched : 0;
+ iend=p->notmatched ? in->dsize[0] : nummatched;
+ outrows = p->notmatched ? in->dsize[0] - nummatched : nummatched;
+ break;
+
+ case GAL_MATCH_ARRANGE_FULL:
+ /* In a full match, the matched rows (first set of rows) will come
+ from both catalogs, all the non-matched rows will come after
+ that. So the final number of rows will be the addition of the two
+ sizes, minus the number of matched rows. */
+ istart=0;
+ iend=nummatched;
+ offset = f1s2==1 ? nummatched : p->cols1->size;
+ outrows = p->cols1->size + p->cols2->size - nummatched;
+ break;
+
+ case GAL_MATCH_ARRANGE_OUTER:
+ case GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE:
+ istart=0;
+ iend = outrows = p->cols2->size;
+ break;
+
+ default:
+ error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at '%s' to "
+ "fix the problem. The code '%u' is not recognized for "
+ "'p->type'", __func__, PACKAGE_BUGREPORT, p->type);
+ }
/* Set the number of values in this column (for vectors). */
n = in->ndim==1 ? 1 : in->dsize[1];
- /* Allocate the array. */
- void *out=gal_pointer_allocate_ram_or_mmap(in->type, outrows*n, 0,
- p->cp.minmapsize,
- &in->mmapname, p->cp.quietmmap,
- __func__, "out");
+ /* Allocate the output array. */
+ out=gal_pointer_allocate_ram_or_mmap(in->type, outrows*n, 0,
+ p->cp.minmapsize,
+ &in->mmapname, p->cp.quietmmap,
+ __func__, "out");
+
+ /* Except for inner and outer, the other types of match can lead to empty
+ rows. So we'll initialize all the values to blank in the given
+ type. */
+ if( p->type==GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE
+ || p->type==GAL_MATCH_ARRANGE_FULL )
+ for(i=0;i<outrows*n;++i)
+ gal_blank_write(gal_pointer_increment(out, i, in->type),
+ in->type);
/* Copy the matched rows into the output array. */
for(i=istart;i<iend;++i)
- memcpy(gal_pointer_increment(out, n*c++, in->type),
- gal_pointer_increment(in->array, n*permutation[i], in->type),
- gal_type_sizeof(in->type) * n);
+ {
+ /* Copy the matched columns. */
+ if(permut[i]!=GAL_BLANK_SIZE_T) /* For 'outer-within-aperture'. */
+ memcpy(gal_pointer_increment(out, n*c, in->type),
+ gal_pointer_increment(in->array, n*permut[i], in->type),
+ gal_type_sizeof(in->type) * n);
+
+ /* Increment the output row counter. */
+ ++c;
+ }
+
+ /* If we are doing a full match, we need to write the non-matching rows
+ in the last rows. */
+ if(p->type==GAL_MATCH_ARRANGE_FULL)
+ {
+ i=offset;
+ for(j=iend;j<in->size;++j)
+ memcpy(gal_pointer_increment(out, n*i++, in->type),
+ gal_pointer_increment(in->array, n*permut[j], in->type),
+ gal_type_sizeof(in->type) * n);
+ }
/**********************************/
/* Add a check so if the column is a string, we free the strings that
@@ -270,10 +328,11 @@ struct ma_params
gal_data_t *cat; /* Dataset (all rows) to arrange. */
size_t nummatched; /* Number of matched. */
size_t *permutation; /* The permutation. */
+ uint8_t f1s2; /* First catalog: 1, second: 2. */
};
static void *
-match_arrange(void *in_prm)
+match_arrange_worker(void *in_prm)
{
/* Low-level definitions to be done first. */
struct gal_threads_params *tprm=(struct gal_threads_params *)in_prm;
@@ -295,7 +354,7 @@ match_arrange(void *in_prm)
/* Rearrange this columns' elements. */
match_arrange_in_new_col(map->p, tmp, map->permutation,
- map->nummatched);
+ map->nummatched, map->f1s2);
}
/* Wait for all the other threads to finish, then return. */
@@ -307,6 +366,73 @@ match_arrange(void *in_prm)
+static void
+match_arrange_inner_full(struct matchparams *p, size_t *permutation,
+ int f1s2, gal_data_t *cat, size_t nummatched)
+{
+ gal_data_t *tmp;
+ struct ma_params map;
+
+ /* Arrange the output rows. */
+ if(permutation)
+ {
+ /* When we are in no-match AND outcols mode, we don't need to touch
+ the rows of the first input catalog (we want all of them) */
+ if( (p->notmatched && p->outcols && f1s2==1) == 0 )
+ {
+ map.p=p;
+ map.cat=cat;
+ map.f1s2=f1s2;
+ map.nummatched=nummatched;
+ map.permutation=permutation;
+ gal_threads_spin_off(match_arrange_worker, &map,
+ gal_list_data_number(cat),
+ p->cp.numthreads, p->cp.minmapsize,
+ p->cp.quietmmap);
+
+ }
+ }
+
+ /* If no match was found ('permutation==NULL'), and the matched columns
+ are requested, empty all the columns that are to be written (only
+ keeping the meta-data). */
+ else
+ if(p->notmatched==0)
+ {
+ for(tmp=cat; tmp!=NULL; tmp=tmp->next)
+ {
+ tmp->size=0;
+ free(tmp->dsize); tmp->dsize=NULL;
+ free(tmp->array); tmp->array=NULL;
+ }
+ }
+
+}
+
+
+
+
+
+static void
+match_arrange_outer(struct matchparams *p, size_t *permutation,
+ gal_data_t *cat)
+{
+ struct ma_params map;
+
+ /* Fill the multi-threaded arrangement values. */
+ map.p=p;
+ map.cat=cat;
+ map.permutation=permutation;
+ gal_threads_spin_off(match_arrange_worker, &map,
+ gal_list_data_number(cat),
+ p->cp.numthreads, p->cp.minmapsize,
+ p->cp.quietmmap);
+}
+
+
+
+
+
/* Read the catalog in the given file and use the given permutation to keep
the proper columns. */
static gal_data_t *
@@ -315,8 +441,7 @@ match_catalog_read_write_all(struct matchparams *p, size_t
*permutation,
size_t **numcolmatch)
{
int hasall=0;
- struct ma_params map;
- gal_data_t *tmp, *cat;
+ gal_data_t *cat;
gal_list_str_t *cols, *tcol;
char *hopt = (f1s2==1) ? "--hdu" : "--hdu2";
@@ -368,43 +493,28 @@ match_catalog_read_write_all(struct matchparams *p,
size_t *permutation,
else
cat=match_cat_from_coord(p, cols, *numcolmatch);
- /* Arrange the output rows. */
- if(permutation)
+ /* Arrange the rows (depends on the type of match). */
+ switch(p->type)
{
- /* When we are in no-match AND outcols mode, we don't need to touch
- the rows of the first input catalog (we want all of them) */
- if( (p->notmatched && p->outcols && f1s2==1) == 0 )
- {
- map.p=p;
- map.cat=cat;
- map.nummatched=nummatched;
- map.permutation=permutation;
- gal_threads_spin_off(match_arrange, &map,
- gal_list_data_number(cat),
- p->cp.numthreads, p->cp.minmapsize,
- p->cp.quietmmap);
-
- }
+ case GAL_MATCH_ARRANGE_FULL:
+ case GAL_MATCH_ARRANGE_INNER:
+ match_arrange_inner_full(p, permutation, f1s2, cat, nummatched);
+ break;
+ case GAL_MATCH_ARRANGE_OUTER:
+ case GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE:
+ /* In an outer matching, the second input's columns do not need
+ arrangement, only the first input's columns need to be arranged
+ to match the second input. */
+ if(f1s2==1) match_arrange_outer(p, permutation, cat);
+ break;
+ default:
+ error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at '%s' to "
+ "fix the problem. The value '%u' of the 'type' variable is "
+ "not recognized", __func__, PACKAGE_BUGREPORT, p->type);
}
- /* If no match was found ('permutation==NULL'), and the matched columns
- are requested, empty all the columns that are to be written (only
- keeping the meta-data). */
- else
- if(p->notmatched==0)
- {
- for(tmp=cat; tmp!=NULL; tmp=tmp->next)
- {
- tmp->size=0;
- free(tmp->dsize); tmp->dsize=NULL;
- free(tmp->array); tmp->array=NULL;
- }
- }
-
-
/* Write the catalog to the output. */
- if(p->outcols)
- return cat;
+ if(p->outcols) return cat;
else if(cat)
{
/* Write the catalog to a file. */
@@ -415,6 +525,7 @@ match_catalog_read_write_all(struct matchparams *p, size_t
*permutation,
gal_list_data_free(cat);
}
+ /* If control reaches here, the output is already written. */
return NULL;
}
@@ -451,11 +562,12 @@ match_catalog_write_one_row(struct matchparams *p,
gal_data_t *a,
/* Make sure both have the same type. */
if(ta->type!=tb->type)
error(EXIT_FAILURE, 0, "when '--notmatched' and '--outcols' "
- "are used together, the each column given to '--outcols' "
- "must have the same datatype in both tables. However, "
- "the first input has a type of '%s' for one of the "
- "columns, while the second has a type of '%s'",
- gal_type_name(ta->type, 1), gal_type_name(tb->type, 1));
+ "are used together, the each column given to "
+ "'--outcols' must have the same datatype in both "
+ "tables. However, the first input has a type of '%s' "
+ "for one of the columns, while the second has a type "
+ "of '%s'", gal_type_name(ta->type, 1),
+ gal_type_name(tb->type, 1));
/* Allocate the necessary space. */
gal_list_data_add_alloc(&cat, NULL, ta->type, ta->ndim,
@@ -644,7 +756,8 @@ match_catalog_kdtree(struct matchparams *p, size_t
*nummatched)
printf(" - Match using the k-d tree ...\n");
}
out = gal_match_kdtree(p->cols1, p->cols2, p->kdtreedata,
- p->kdtreeroot, p->aperture->array,
+ p->kdtreeroot, p->type,
+ p->aperture ? p->aperture->array : NULL,
p->cp.numthreads, p->cp.minmapsize,
p->cp.quietmmap, nummatched);
if(!p->cp.quiet)
@@ -710,12 +823,123 @@ match_catalog_sort_based(struct matchparams *p, size_t
*nummatched)
static void
-match_catalog(struct matchparams *p)
+match_catalog_output(struct matchparams *p, gal_data_t *mcols,
+ size_t nummatched)
{
- uint32_t *u, *uf;
struct timeval t1;
- gal_data_t *tmp, *a=NULL, *b=NULL, *mcols=NULL;
- size_t nummatched, *acolmatch=NULL, *bcolmatch=NULL;
+ gal_data_t *a=NULL, *b=NULL;
+ size_t *acolmatch=NULL, *bcolmatch=NULL;
+
+ /* Let the user know what is happening. */
+ if(!p->cp.quiet)
+ {
+ gettimeofday(&t1, NULL);
+ printf(" - Arranging matched rows (skip this with "
+ "'--logasoutput')...\n");
+ }
+
+ /* Read (and possibly write) the outputs. Note that we only need to
+ read the table when it is necessary for the output (the user might
+ have asked for '--outcols', only with columns of one of the two
+ inputs). */
+ if(p->outcols==NULL || p->acols)
+ a=match_catalog_read_write_all(p, mcols?mcols->array:NULL,
+ nummatched, 1, &acolmatch);
+ if(p->outcols==NULL || p->bcols)
+ b=match_catalog_read_write_all(p, mcols?mcols->next->array:NULL,
+ nummatched, 2, &bcolmatch);
+
+ /* If one catalog (with specific columns from either of the two
+ inputs) was requested, then write it out. */
+ if(p->outcols)
+ {
+ /* Arrange the columns and write the output. */
+ if(p->notmatched)
+ match_catalog_write_one_row(p, a, b);
+ else
+ {
+ match_catalog_write_one_col(p, a, b, acolmatch, bcolmatch);
+ a=b=NULL; /*They are freed in function above. */
+ }
+
+ /* Clean up. */
+ if(acolmatch) free(acolmatch);
+ if(bcolmatch) free(bcolmatch);
+ }
+
+ /* Clean up. */
+ if(a) gal_list_data_free(a);
+ if(b) gal_list_data_free(b);
+
+ /* Let the user know. */
+ if( !p->cp.quiet )
+ gal_timing_report(&t1, "... done!", 1);
+}
+
+
+
+
+
+static void
+match_catalog_log(struct matchparams *p, gal_data_t *mcols,
+ size_t nummatched)
+{
+ gal_data_t *tmp;
+ uint32_t *u, *uf;
+
+ /* Note that unsigned 64-bit integers are not recognized in FITS
+ tables. So if the log file is a FITS table, covert the two
+ index columns to uint32. */
+ tmp=gal_data_copy_to_new_type(mcols, GAL_TYPE_UINT32);
+ tmp->size=tmp->dsize[0]=nummatched;
+ tmp->next=mcols->next;
+ gal_data_free(mcols);
+ mcols=tmp;
+
+ /* We also want everything to be incremented by one. In a C
+ program, counting starts with zero, so 'gal_match_sort_based'
+ will return indexs starting from zero. But outside a C
+ program, on the command-line people expect counting to start
+ from 1 (for example with AWK). */
+ uf = (u=mcols->array) + tmp->size; do (*u)++; while(++u<uf);
+
+ /* Same for the second set of indexs. */
+ tmp=gal_data_copy_to_new_type(mcols->next, GAL_TYPE_UINT32);
+ uf = (u=tmp->array) + tmp->size; do (*u)++; while(++u<uf);
+ tmp->size=tmp->dsize[0]=nummatched;
+ tmp->next=mcols->next->next;
+ gal_data_free(mcols->next);
+ mcols->next=tmp;
+
+ /* Correct the comments. */
+ free(mcols->comment);
+ mcols->comment="Row index in first catalog (counting from 1).";
+ free(mcols->next->comment);
+ mcols->next->comment="Row index in second catalog (counting "
+ "from 1).";
+
+ /* Write them into the table. */
+ gal_table_write(mcols, NULL, NULL, p->cp.tableformat, p->logname,
+ "LOG_INFO", 0, 0);
+
+ /* Set the comment pointer to NULL: they weren't allocated. */
+ mcols->comment=NULL;
+ mcols->next->comment=NULL;
+
+ /* Inform the user that a log-file has been created. */
+ if(!p->cp.quiet)
+ fprintf(stdout, " - Output (log): %s\n", p->logname);
+}
+
+
+
+
+
+static void
+match_catalog(struct matchparams *p)
+{
+ size_t nummatched;
+ gal_data_t *mcols=NULL;
/* If we want to use kd-tree for matching. */
if(p->kdtreemode!=MATCH_KDTREE_DISABLE)
@@ -732,105 +956,13 @@ match_catalog(struct matchparams *p)
/* If the output is to be taken from the input columns (it isn't just the
log), then do the job. */
- if(p->logasoutput==0)
- {
- /* Let the user know what is happening. */
- if(!p->cp.quiet)
- {
- gettimeofday(&t1, NULL);
- printf(" - Arranging matched rows (skip this with "
- "'--logasoutput')...\n");
- }
-
- /* Read (and possibly write) the outputs. Note that we only need to
- read the table when it is necessary for the output (the user might
- have asked for '--outcols', only with columns of one of the two
- inputs). */
- if(p->outcols==NULL || p->acols)
- a=match_catalog_read_write_all(p, mcols?mcols->array:NULL,
- nummatched, 1, &acolmatch);
- if(p->outcols==NULL || p->bcols)
- b=match_catalog_read_write_all(p, mcols?mcols->next->array:NULL,
- nummatched, 2, &bcolmatch);
-
- /* If one catalog (with specific columns from either of the two
- inputs) was requested, then write it out. */
- if(p->outcols)
- {
- /* Arrange the columns and write the output. */
- if(p->notmatched)
- match_catalog_write_one_row(p, a, b);
- else
- {
- match_catalog_write_one_col(p, a, b, acolmatch, bcolmatch);
- a=b=NULL; /*They are freed in function above. */
- }
-
- /* Clean up. */
- if(acolmatch) free(acolmatch);
- if(bcolmatch) free(bcolmatch);
- }
-
- /* Clean up. */
- if(a) gal_list_data_free(a);
- if(b) gal_list_data_free(b);
-
- /* Let the user know. */
- if( !p->cp.quiet )
- gal_timing_report(&t1, "... done!", 1);
- }
+ if(p->logasoutput==0) match_catalog_output(p, mcols, nummatched);
/* Write the raw information in a log file if necessary. */
- if(p->logname && mcols)
- {
- /* Note that unsigned 64-bit integers are not recognized in FITS
- tables. So if the log file is a FITS table, covert the two
- index columns to uint32. */
- tmp=gal_data_copy_to_new_type(mcols, GAL_TYPE_UINT32);
- tmp->size=tmp->dsize[0]=nummatched;
- tmp->next=mcols->next;
- gal_data_free(mcols);
- mcols=tmp;
-
- /* We also want everything to be incremented by one. In a C
- program, counting starts with zero, so 'gal_match_sort_based'
- will return indexs starting from zero. But outside a C
- program, on the command-line people expect counting to start
- from 1 (for example with AWK). */
- uf = (u=mcols->array) + tmp->size; do (*u)++; while(++u<uf);
-
- /* Same for the second set of indexs. */
- tmp=gal_data_copy_to_new_type(mcols->next, GAL_TYPE_UINT32);
- uf = (u=tmp->array) + tmp->size; do (*u)++; while(++u<uf);
- tmp->size=tmp->dsize[0]=nummatched;
- tmp->next=mcols->next->next;
- gal_data_free(mcols->next);
- mcols->next=tmp;
-
- /* Correct the comments. */
- free(mcols->comment);
- mcols->comment="Row index in first catalog (counting from 1).";
- free(mcols->next->comment);
- mcols->next->comment="Row index in second catalog (counting "
- "from 1).";
-
- /* Write them into the table. */
- gal_table_write(mcols, NULL, NULL, p->cp.tableformat, p->logname,
- "LOG_INFO", 0, 0);
-
- /* Set the comment pointer to NULL: they weren't allocated. */
- mcols->comment=NULL;
- mcols->next->comment=NULL;
-
- /* Inform the user that a log-file has been created. */
- if(!p->cp.quiet)
- fprintf(stdout, " - Output (log): %s\n", p->logname);
- }
+ if(p->logname && mcols) match_catalog_log(p, mcols, nummatched);
- /* Clean up. */
+ /* Clean up and print the number of matches if not in quiet mode. */
gal_list_data_free(mcols);
-
- /* Print the number of matches if not in quiet mode. */
if(!p->cp.quiet)
{
if(p->out2name && strcmp(p->out1name, p->out2name))
@@ -867,16 +999,7 @@ void
match(struct matchparams *p)
{
/* Do the correct type of matching. */
- switch(p->mode)
- {
- case MATCH_MODE_CATALOG: match_catalog(p); break;
- case MATCH_MODE_WCS:
- error(EXIT_FAILURE, 0, "matching by WCS is not yet supported");
- default:
- error(EXIT_FAILURE, 0, "%s: a bug! please contact us at %s to fix "
- "the problem: %d is not a recognized mode",
- __func__, PACKAGE_BUGREPORT, p->mode);
- }
+ match_catalog(p);
/* Write Match's configuration as keywords into the first extension of
the output. */
diff --git a/bin/match/ui.c b/bin/match/ui.c
index 954ee448..23fcb988 100644
--- a/bin/match/ui.c
+++ b/bin/match/ui.c
@@ -29,6 +29,7 @@ along with Gnuastro. If not, see
<http://www.gnu.org/licenses/>.
#include <string.h>
#include <gnuastro/fits.h>
+#include <gnuastro/match.h>
#include <gnuastro/threads.h>
#include <gnuastro-internal/timing.h>
@@ -195,6 +196,70 @@ parse_opt(int key, char *arg, struct argp_state *state)
+void *
+ui_parse_match_type(struct argp_option *option, char *arg,
+ char *filename, size_t lineno, void *junk)
+{
+ char *str=NULL;
+ uint8_t value=GAL_MATCH_ARRANGE_INVALID;
+
+ /* If we are printing values. */
+ if(lineno==-1)
+ {
+ /* The output must be an allocated string (will be 'free'd later). */
+ value=*(uint8_t *)(option->value);
+ switch(value)
+ {
+ case GAL_MATCH_ARRANGE_FULL:
+ gal_checkset_allocate_copy("full", &str);
+ break;
+ case GAL_MATCH_ARRANGE_INNER:
+ gal_checkset_allocate_copy("inner", &str);
+ break;
+ case GAL_MATCH_ARRANGE_OUTER:
+ gal_checkset_allocate_copy("outer", &str);
+ break;
+ case GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE:
+ gal_checkset_allocate_copy("outer-within-aperture", &str);
+ break;
+ default:
+ error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at '%s' "
+ "to fix the problem. %u is not a recognized match type",
+ __func__, PACKAGE_BUGREPORT, value);
+ }
+ return str;
+ }
+
+ /* If we are reading/parsing values. */
+ else
+ {
+ /* If the option is already set, just return. */
+ if(option->set) return NULL;
+
+ /* Read the value. */
+ if( !strcmp(arg, "full") ) value = GAL_MATCH_ARRANGE_FULL;
+ else if( !strcmp(arg, "inner") ) value = GAL_MATCH_ARRANGE_INNER;
+ else if( !strcmp(arg, "outer") ) value = GAL_MATCH_ARRANGE_OUTER;
+ else if( !strcmp(arg, "outer-within-aperture") )
+ value = GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE;
+ else
+ error_at_line(EXIT_FAILURE, 0, filename, lineno, "'%s' (value "
+ "to '%s' option) couldn't be recognized as a known "
+ "mathing type. Acceptable values are 'full', "
+ "'inner', 'outer' or 'outer-within-aperture'", arg,
+ option->name);
+ *(uint8_t *)(option->value)=value;
+
+ /* For no un-used variable warning. This function doesn't need the
+ pointer. */
+ return junk=NULL;
+ }
+}
+
+
+
+
+
@@ -218,42 +283,63 @@ parse_opt(int key, char *arg, struct argp_state *state)
static void
ui_check_only_options(struct matchparams *p)
{
- /* k-d tree specific sanity checks. */
- if(p->kdtree)
- {
- /* Set the k-d tree mode. */
- if( !strcmp(p->kdtree,"build") ) p->kdtreemode=MATCH_KDTREE_BUILD;
- else if( !strcmp(p->kdtree,"internal") )
p->kdtreemode=MATCH_KDTREE_INTERNAL;
- else if( !strcmp(p->kdtree,"disable") )
p->kdtreemode=MATCH_KDTREE_DISABLE;
- else if( gal_fits_name_is_fits(p->kdtree) )
p->kdtreemode=MATCH_KDTREE_FILE;
- else
- error(EXIT_FAILURE, 0, "'%s' is not valid for '--kdtree'. The "
- "following values are accepted: 'build' (to build the k-d tree in "
- "the file given to '--output'), 'internal' (to force internal "
- "usage of a k-d tree for the matching), 'disable' (to not use a "
- "k-d tree at all), a FITS file name (the file to read a created "
- "k-d tree from)", p->kdtree);
-
- /* Make sure that the k-d tree build mode is not called with
- '--outcols'. */
- if( p->kdtreemode==MATCH_KDTREE_BUILD && (p->outcols || p->coord) )
- error(EXIT_FAILURE, 0, "the '--kdtree=build' option is incompatible "
- "with the '--outcols' or '--coord' options (because in the k-d "
- "tree building mode doesn't involve actual matching. It will "
- "only build k-d tree and write it to a file so it can be used "
- "in future matches)");
-
- /* Make sure that a HDU is also specified for the k-d tree when its an
- external file. */
- if( p->kdtreemode==MATCH_KDTREE_FILE && p->kdtreehdu==NULL )
- error(EXIT_FAILURE, 0, "no '--kdtreehdu' specified! When a FITS "
- "file is given to '--kdtree', you should specify its "
- "extension/HDU within the file using '--kdtreehdu'. You "
- "can either use the HDU name, or its number (counting "
- "from 0). If you aren't sure what HDUs exist in the file, "
- "you can use the 'astfits %s' command to see the full list",
- p->kdtree);
- }
+ /* Set the k-d tree mode. */
+ if( !strcmp(p->kdtree,"build") )
+ p->kdtreemode=MATCH_KDTREE_BUILD;
+ else if( !strcmp(p->kdtree,"internal") )
+ p->kdtreemode=MATCH_KDTREE_INTERNAL;
+ else if( !strcmp(p->kdtree,"disable") )
+ p->kdtreemode=MATCH_KDTREE_DISABLE;
+ else if( gal_fits_name_is_fits(p->kdtree) )
+ p->kdtreemode=MATCH_KDTREE_FILE;
+ else
+ error(EXIT_FAILURE, 0, "'%s' is not valid for '--kdtree'. The "
+ "following values are accepted: 'build' (to build the k-d "
+ "tree in the file given to '--output'), 'internal' (to "
+ "force internal usage of a k-d tree for the matching), "
+ "'disable' (to not use a k-d tree at all), a FITS file "
+ "name (the file to read a created k-d tree from)", p->kdtree);
+
+ /* Make sure that the k-d tree build mode is not called with
+ '--outcols'. */
+ if( p->kdtreemode==MATCH_KDTREE_BUILD && (p->outcols || p->coord) )
+ error(EXIT_FAILURE, 0, "the '--kdtree=build' option is "
+ "incompatible with the '--outcols' or '--coord' options "
+ "(because in the k-d tree building mode doesn't involve "
+ "actual matching. It will only build k-d tree and write "
+ "it to a file so it can be used in future matches)");
+
+ /* Make sure that a HDU is also specified for the k-d tree when its an
+ external file. */
+ if( p->kdtreemode==MATCH_KDTREE_FILE && p->kdtreehdu==NULL )
+ error(EXIT_FAILURE, 0, "no '--kdtreehdu' specified! When a FITS "
+ "file is given to '--kdtree', you should specify its "
+ "extension/HDU within the file using '--kdtreehdu'. You "
+ "can either use the HDU name, or its number (counting "
+ "from 0). If you aren't sure what HDUs exist in the file, "
+ "you can use the 'astfits %s' command to see the full list",
+ p->kdtree);
+
+ /* For an outer match, we only operate in k-d tree mode. */
+ if( ( p->type==GAL_MATCH_ARRANGE_OUTER
+ || p->type==GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE )
+ && p->kdtreemode==MATCH_KDTREE_DISABLE )
+ error(EXIT_FAILURE, 0, "the 'outer' arrangement only works with the "
+ "k-d tree algorithm (in other words, it conflicts with "
+ "'--kdtree=disable' option). Please set '--kdtree=internal' "
+ "(if you want to build the k-d tree very time), or to "
+ "'--kdtree=file.fits' (where 'file.fits' is an existing k-d "
+ "tree that was built once and can be used many times for "
+ "a faster result on a repeated reference catalog). See the "
+ "description of the '--kdtree' option of the Match program "
+ "in Gnuastro manual for more (by running 'info %s')",
+ PROGRAM_EXEC);
+
+ /* The '--notmatched' option is only valid for the inner type of
+ matching. */
+ if(p->notmatched && p->type!=GAL_MATCH_ARRANGE_INNER)
+ error(EXIT_FAILURE, 0, "the '--notmatched' option is only defined "
+ "with the '--arrange=inner' match type");
}
@@ -347,79 +433,6 @@ ui_check_options_and_arguments(struct matchparams *p)
/**************************************************************/
/*************** Preparations *******************/
/**************************************************************/
-static void
-ui_set_mode(struct matchparams *p)
-{
- int tin1, tin2;
- char *t1=NULL, *t2=NULL;
-
- /* We will base the mode on the first input, then check with the
- second. Note that when the first is from standard input (it is
- 'NULL'), then we go into catalog mode because currently we assume
- standard input is only for plain text and WCS matching is not defined
- on plain text. */
- if( p->input1name && gal_fits_file_recognized(p->input1name) )
- {
- tin1=gal_fits_hdu_format(p->input1name, p->cp.hdu, "--hdu");
- p->mode = (tin1 == IMAGE_HDU) ? MATCH_MODE_WCS : MATCH_MODE_CATALOG;
- }
- else
- {
- tin1=ASCII_TBL; /* For "uninitialized" warning, irrelevant here. */
- p->mode=MATCH_MODE_CATALOG;
- }
-
-
- /* Necessary sanity checks. */
- if(p->mode==MATCH_MODE_CATALOG && p->cp.searchin==0)
- error(EXIT_FAILURE, 0, "no '--searchin' option specified. Please run "
- "the following command for more information:\n\n"
- " $ info gnuastro \"selecting table columns\"\n");
-
-
- /* Now that the mode is set, do some sanity checks on the second
- catalog. Recall that when '--coord' is given, there is no second input
- file.*/
- if(p->input2name)
- {
- if(gal_fits_file_recognized(p->input2name))
- {
- tin2=gal_fits_hdu_format(p->input2name, p->hdu2, "--hdu2");
- if(tin1==IMAGE_HDU && tin2!=IMAGE_HDU)
- {
- t1 = (tin1==IMAGE_HDU) ? "image" : "catalog";
- t2 = (tin2==IMAGE_HDU) ? "image" : "catalog";
- }
- if(t1)
- error(EXIT_FAILURE, 0, "%s is a %s, while %s is an "
- "%s. Both inputs have to be images or catalogs",
- gal_checkset_dataset_name(p->input1name, p->cp.hdu), t1,
- gal_checkset_dataset_name(p->input2name, p->hdu2), t2 );
- }
- else
- {
- if(p->mode==MATCH_MODE_WCS)
- error(EXIT_FAILURE, 0, "%s is an image, while %s is a catalog! "
- "Both inputs have to be images or catalogs",
- gal_checkset_dataset_name(p->input1name, p->cp.hdu),
- gal_checkset_dataset_name(p->input2name, p->hdu2) );
- }
- }
- else
- {
- /* When there is no second-input file name ('--coord' is given), we
- cannot be in WCS mode (requiring a FITS file). */
- if(p->mode==MATCH_MODE_WCS)
- error(EXIT_FAILURE, 0, "%s is an image, while '--coord' is only "
- "meaningful for catalogs",
- gal_checkset_dataset_name(p->input1name, p->cp.hdu));
- }
-}
-
-
-
-
-
/* The final aperture must have the following values:
p->aperture[0]: Major axis length.
@@ -467,13 +480,15 @@ ui_read_columns_aperture_2d(struct matchparams *p)
case 3:
if(oaper[1]>1)
error(EXIT_FAILURE, 0, "second value to '--aperture' is larger "
- "than one. When three numbers are given to this option, the "
- "second is the axis ratio (which must always be less than 1).");
+ "than one. When three numbers are given to this option, "
+ "the second is the axis ratio (which must always be less "
+ "than 1).");
break;
default:
- error(EXIT_FAILURE, 0, "%zu values given to '--aperture'. In 2D, this "
- "option can only take 1, 2, or 3 values", p->aperture->size);
+ error(EXIT_FAILURE, 0, "%zu values given to '--aperture'. In 2D, "
+ "this option can only take 1, 2, or 3 values",
+ p->aperture->size);
}
/* If a new aperture was defined, then replace it with the exitsting
@@ -600,10 +615,11 @@ ui_read_columns_aperture_3d(struct matchparams *p)
break;
default:
- error(EXIT_FAILURE, 0, "%zu values given to '--aperture'. In 3D, this "
- "option can only take 1, 3, or 6 values. See the description of "
- "this option in the book for more with this command:\n\n"
- " $ info astmatch\n", p->aperture->size);
+ error(EXIT_FAILURE, 0, "%zu values given to '--aperture'. In 3D, "
+ "this option can only take 1, 3, or 6 values. See the "
+ "description of this option in the book for more with this "
+ "command:\n\n"
+ " $ info %s\n", p->aperture->size, PROGRAM_EXEC);
}
/* If a new aperture was defined, then replace it with the exitsting
@@ -670,17 +686,19 @@ ui_set_columns_sanity_check_read_aperture(struct
matchparams *p)
case 2: ui_read_columns_aperture_2d(p); break;
case 3: ui_read_columns_aperture_3d(p); break;
default:
- error(EXIT_FAILURE, 0, "%zu dimensional matches are not currently "
- "supported (maximum is 2 dimensions). The number of "
- "dimensions is deduced from the number of values given to "
- "'--ccol1' (or '--coord') and '--ccol2'", ccol1n);
+ error(EXIT_FAILURE, 0, "%zu dimensional matches are not "
+ "currently supported (maximum is 2 dimensions). The number "
+ "of dimensions is deduced from the number of values given "
+ "to '--ccol1' (or '--coord') and '--ccol2'", ccol1n);
}
else if ( p->kdtreemode != MATCH_KDTREE_BUILD )
- error(EXIT_FAILURE, 0, "no matching aperture specified. Please use "
- "the '--aperture' option to define the acceptable aperture for "
- "matching the coordinates (in the same units as each "
- "dimension). Please run the following command for more "
- "information.\n\n $ info %s\n", PROGRAM_EXEC);
+ error(EXIT_FAILURE, 0, "no matching aperture specified. Please "
+ "use the '--aperture' option to define the acceptable "
+ "aperture for matching the coordinates (in the same units "
+ "as each dimension). An aperture is not necessary when you "
+ "are building a k-d tree or when you want an outer match; "
+ "but this run is not any of those. Please run 'info %s' for "
+ "more on the format of '--aperture'", PROGRAM_EXEC);
/* Return the number of dimensions. */
return ccol1n;
@@ -788,8 +806,9 @@ ui_read_columns_to_double(struct matchparams *p, char
*filename,
#define UI_READ_KDTREE_FIXED_MSG "You can construct a k-d tree " \
"for your first input table using the command below (just set your " \
"desired output name, and give that file to '--kdtree', in a later " \
- "call with the same first input catalog):\n\n" \
- " astmatch %s -h%s --ccol1=%s,%s --kdtree=build --output=KDTREE.fits\n\n"
\
+ "call with the same first input catalog):\n\n" \
+ " astmatch %s -h%s --ccol1=%s,%s --kdtree=build " \
+ "--output=KDTREE.fits\n\n" \
"To learn more about the expected format of k-d trees in Gnuastro, " \
"please run this command: 'info gnuastro k-d'"
@@ -891,20 +910,23 @@ ui_read_columns(struct matchparams *p)
ui_read_kdtree(p);
/* If we are in k-d tree based matching and the second dataset is smaller
- than the first, print a warning to let the user know that the speed
+ than the first, and we are not in an outer-match (where the order of
+ the inputs matter) print a warning to let the user know that the speed
can be greatly improved if they swap the two. */
if( !p->cp.quiet
&& p->kdtreemode!=MATCH_KDTREE_BUILD
&& p->kdtreemode!=MATCH_KDTREE_DISABLE
- && p->cols1->size > (2*p->cols2->size) )
- error(EXIT_SUCCESS, 0, "TIP: the matching speed will GREATLY IMPROVE "
- "if you swap the two inputs. Currently the second input has "
- "fewer rows than the first. In the k-d tree based matching, "
- "multi-threading will occur on the second input's rows and "
- "the k-d will be constructed for the first input. Fewer "
- "first-input rows therefore means a smaller tree, and more "
- "second-input rows will be better distributed in your "
- "system's CPU threads");
+ && p->cols1->size > (2*p->cols2->size)
+ && p->type!=GAL_MATCH_ARRANGE_OUTER)
+ error(EXIT_SUCCESS, 0, "TIP: the matching speed will improve "
+ "if you swap the two inputs. Recall that for an inner or "
+ "full match, the order of inputs does not matter. "
+ "In the k-d tree based matching, a k-d tree will be "
+ "constructed for the first input and parallelized matching "
+ "will occur on the second input's rows. Fewer first-input "
+ "rows therefore mean a smaller k-d tree (which is easier "
+ "to build), and more second-input rows will be better "
+ "distributed in your system's processing threads");
/* Free the extra spaces. */
gal_list_str_free(cols1, 1);
@@ -915,6 +937,23 @@ ui_read_columns(struct matchparams *p)
+static void
+ui_preparations_out_cols_read(gal_list_str_t **list, char *string)
+{
+ /* Make sure the string is not empty. */
+ if(string[0]=='\0')
+ error(EXIT_FAILURE, 0, "the value(s) given to '--outcols' should "
+ "a column identifier (counter, name or '_all') after the 'a' "
+ "or 'b'");
+
+ /* Add the string to the list of columns for this column. */
+ gal_list_str_add(list, string, 0);
+}
+
+
+
+
+
static void
ui_preparations_out_cols(struct matchparams *p)
{
@@ -932,8 +971,8 @@ ui_preparations_out_cols(struct matchparams *p)
/* For easy reading. */
col=strarr[i];
- /* In no-match mode, then the same column will be used from both
- catalogs so things are easier. */
+ /* In no-match mode, the same column will be used from both catalogs
+ so things are easier. */
if(p->notmatched)
{
gal_list_str_add(&p->acols, col, 0);
@@ -945,7 +984,9 @@ ui_preparations_out_cols(struct matchparams *p)
else
switch(col[0])
{
- case 'a': gal_list_str_add(&p->acols, col+1, 0); break;
+ case 'a':
+ ui_preparations_out_cols_read(&p->acols, col+1);
+ break;
case 'b':
/* With '--coord', only numbers that are smaller than the
number of the dimensions are acceptable. */
@@ -976,7 +1017,7 @@ ui_preparations_out_cols(struct matchparams *p)
"the number of dimensions (%zu in this case) "
"are acceptable", col+1, ndim);
}
- gal_list_str_add(&p->bcols, col+1, 0);
+ ui_preparations_out_cols_read(&p->bcols, col+1);
break;
default:
error(EXIT_FAILURE, 0, "'%s' is not a valid value for "
@@ -1127,20 +1168,41 @@ ui_preparations_out_name(struct matchparams *p)
static void
ui_preparations(struct matchparams *p)
{
- /* Set the mode of the program. */
- ui_set_mode(p);
+ int intype;
- /* Currently Match only works on catalogs. */
- if(p->mode==MATCH_MODE_WCS)
- error(EXIT_FAILURE, 0, "currently Match only works on catalogs, "
- "we will implement the WCS matching routines later");
- else
+ /* Since Match only works on tables, 'searchin' is necessary. */
+ if(p->cp.searchin==0)
+ error(EXIT_FAILURE, 0, "no '--searchin' option specified. Please "
+ "run the following command for more information:\n\n"
+ " $ info gnuastro \"selecting table columns\"\n");
+
+ /* We will base the mode on the first input, then check with the
+ second. Note that when the first is from standard input (it is
+ 'NULL'), then we go into catalog mode because currently we assume
+ standard input is only for plain text and WCS matching is not defined
+ on plain text. */
+ if( p->input1name && gal_fits_file_recognized(p->input1name) )
{
- ui_read_columns(p);
- if(p->outcols)
- ui_preparations_out_cols(p);
+ intype=gal_fits_hdu_format(p->input1name, p->cp.hdu, "--hdu");
+ if(intype==IMAGE_HDU)
+ error(EXIT_FAILURE, 0, "%s: not a catalog",
+ gal_checkset_dataset_name(p->input1name, p->cp.hdu));
}
+ /* Same check for second input. */
+ if(p->input2name && gal_fits_file_recognized(p->input2name))
+ {
+ intype=gal_fits_hdu_format(p->input2name, p->hdu2, "--hdu2");
+ if(intype==IMAGE_HDU)
+ error(EXIT_FAILURE, 0, "%s: not a catalog",
+ gal_checkset_dataset_name(p->input2name, p->hdu2));
+ }
+
+ /* Read the input columns and prepare output columns. */
+ ui_read_columns(p);
+ if(p->outcols)
+ ui_preparations_out_cols(p);
+
/* Set the output filename. */
ui_preparations_out_name(p);
}
diff --git a/bin/match/ui.h b/bin/match/ui.h
index 8d45ee7c..2660a877 100644
--- a/bin/match/ui.h
+++ b/bin/match/ui.h
@@ -41,12 +41,13 @@ enum program_args_groups
/* Available letters for short options:
b e f g i j m n p r s t u v w x y z
- A B E G J L O Q R W X Y
+ B E G J L O Q R W X Y
*/
enum option_keys_enum
{
/* With short-option version. */
UI_KEY_HDU2 = 'H',
+ UI_KEY_ARRANGE = 'A',
UI_KEY_APERTURE = 'a',
UI_KEY_LOGASOUTPUT = 'l',
UI_KEY_CCOL1 = 'c',
diff --git a/doc/gnuastro.texi b/doc/gnuastro.texi
index d1484e09..052ef541 100644
--- a/doc/gnuastro.texi
+++ b/doc/gnuastro.texi
@@ -334,7 +334,7 @@ Detecting lines and extracting spectra in 3D data
* 3D measurements and spectra:: Measuring 3d properties including spectra.
* Extracting a single spectrum and plotting it:: Extracting a single vector
row.
* Cubes with logarithmic third dimension:: When the wavelength/frequency is
logarithmic.
-* Synthetic narrow-band images:: Collapsing the third dimension into a 2D
image.
+* Synthetic narrow-band images:: Collapsing the third dimension into a 2D
image.
Color images with full dynamic range
@@ -725,6 +725,7 @@ Invoking MakeCatalog
Match
+* Arranging match output:: Various meanings of matching
* Matching algorithms:: Different ways to find the match
* Invoking astmatch:: Inputs, outputs and options of Match
@@ -7948,7 +7949,7 @@ We will finish with creating synthetic narrow-band images
optimized for some of
* 3D measurements and spectra:: Measuring 3d properties including spectra.
* Extracting a single spectrum and plotting it:: Extracting a single vector
row.
* Cubes with logarithmic third dimension:: When the wavelength/frequency is
logarithmic.
-* Synthetic narrow-band images:: Collapsing the third dimension into a 2D
image.
+* Synthetic narrow-band images:: Collapsing the third dimension into a 2D
image.
@end menu
@node Viewing spectra and redshifted lines, Sky lines in optical IFUs,
Detecting lines and extracting spectra in 3D data, Detecting lines and
extracting spectra in 3D data
@@ -31491,35 +31492,237 @@ For random measurements on any area, please use the
upper-limit columns of MakeC
@node Match, , MakeCatalog, Data analysis
@section Match
-Data can come come from different telescopes, filters, software and even
different configurations for a single software.
-As a result, one of the primary things to do after generating catalogs from
each of these sources (for example, with @ref{MakeCatalog}), is to find which
sources in one catalog correspond to which in the other(s).
+High-level data (catalogs) of a single astronomical source can come come from
different telescopes, filters, software (detection, segmentation and cataloging
tools) and even different configurations for a single software.
+As a result, one of the first things we usually do after generating or
querying data (for example, with @ref{MakeCatalog} or @ref{Query}), is to find
which sources in one catalog correspond to which in the other(s).
In other words, to `match' the two catalogs with each other.
+Within Gnuastro, the Match program is in charge of such operations.
+The matching rows are defined within an aperture, which can be a circle or an
ellipse with any orientation.
-Gnuastro's Match program is in charge of such operations.
-The nearest objects in the two catalogs, within the given aperture, will be
found and given as output.
-The aperture can be a circle or an ellipse with any orientation.
+Before digging into the usage and command-line execution details of
@ref{Invoking astmatch}, we will first review the @ref{Matching algorithms}
that are currently available.
+We will then review @ref{Arranging match output} to fit the output arrangement
that will be necessary in different contexts.
+If this is the first time you are using Match, please take the time to read
these three sections.
+They will help a lot in optimizing your outputs for your needs; especially
when the catalogs are large, it does make a difference!
@menu
+* Arranging match output:: Various meanings of matching
* Matching algorithms:: Different ways to find the match
* Invoking astmatch:: Inputs, outputs and options of Match
@end menu
-@node Matching algorithms, Invoking astmatch, Match, Match
+@node Arranging match output, Matching algorithms, Match, Match
+@subsection Arranging match output
+
+When we say ``match'' we mean different things in different contexts.
+For example, sometimes you exclusively want the matched rows of the two input
catalogs.
+In other times you want to find the nearest row in a reference catalog for
every row of a query catalog.
+Yet other times, you want to merge two catalogs (having all the matched and
non-matched rows in one table).
+Within Match, these are defined as different ``arrangements'' of the output.
+
+The core matching algorithm (to find the nearest points between two catalogs)
is the same (described in @ref{Matching algorithms}).
+The thing that is different is how you want to use that information in
constructing/arranging the output.
+The various arrangements (that should be given to the @option{--arrange}
option) are formally defined below.
+
+@table @code
+@item inner
+The output of an inner match will contain exclusively-matched rows in both
catalogs within a given aperture.
+In other words, the number of rows in the output will be fewer (or equal) to
the input with fewest rows, @emph{and} no two rows of any input can be repeated.
+Another way to say this is that the order of the inputs does not matter for
the final number of matched rows in an inner match.
+An important aspect of an inner match is the necessity of an aperture to
define the maximum/largest acceptable distance to have a match.
+Without an aperture to define a reasonable match, there is always bound to be
a ``nearest'' item in both catalogs!
+
+A real-world example (in astrophysics) of inner matching is when you have two
catalogs of galaxies from different filters/telescopes and want to find the
same galaxy in both catalogs.
+The simplified example below shows inner matching in action.
+The values in each row are intentionally abstract/ideal to help understanding
the concept (for example, the first two columns can be the RA and Dec in a
real-world catalog).
+First, let's look at the contents of the two tables we want to match:
+
+@example
+$ cat a.txt
+# Column 1: X [pix, f32] X axis position
+# Column 2: Y [pix, f32] Y axis position
+# Column 3: MAGa [unit, f32] Electrong counts
+1.2 1.2 18.2
+3.2 3.2 20.8
+8.2 8.2 15.2
+
+$ cat b.txt
+# Column 1: X [pix, f32] X axis position
+# Column 2: Y [pix, f32] Y axis position
+# Column 3: MAGb [pix, f32] Radius of object in pixels
+1 1 18.5
+2 2 23.1
+3 3 16.2
+4 4 22.7
+5 5 23.4
+@end example
+
+In this hypothetical example, the first catalog has the magnitude of three
objects in filter a, and the second catalog has the magnitude of 5 objects in
filter b.
+But the coordinates are not identical (which is natural when the catalogs come
from different source) and not all of the galaxies in one catalog match the
other!
+
+With the first command below, we will find the matching rows with an inner
arrangement, and put the radius and magnitude of the matched rows in the same
row (using the @option{--outcols}, see @ref{Invoking astmatch}).
+The second command simply prints the contents of the table on the command-line
in easy-to-read format (simplified here by removing trailing zeros to help
visual comparison with above).
+
+@example
+$ astmatch a.txt --ccol1=X,Y b.txt --ccol2=X,Y \
+ --aperture=0.5 --outcols=a1,b1,a2,b2,a3,b3 \
+ --arrange=inner --output=inner.fits
+
+$ asttable inner.fits -Y
+1.2 1 1.2 1 18.2 18.5
+3.2 3 3.2 3 20.8 16.2
+@end example
+
+@cindex PSF
+As you see, while both inputs had more than two rows, the output of an inner
arrangement only contains the rows that matched in both catalogs (two in this
case).
+For inner matching, we rarely need both coordinate columns; instead, we can
simply only select the coordinates of the catalog that had a better precision
(in other words, a better point spread function, or PSF).
+For example, assuming that @file{a.fits} had better precision, we could
simplify the output above with @option{--outcols=a1,a2,a3,b3} (which has
removed @code{b1} and @code{b2} compared to the one in the example.
+Try this out for your self as an exercise to visually understand this
important feature of inner arrangements.
+
+@item full
+The output will contain matching and non-matching rows in both catalogs.
+In other words, when there is a match between the two catalogs, it is
exclusive (as in the ``Inner'' mode above).
+However, if a row from any of the two inputs does not match, it is still
present in the output catalog, but rows that belong to the other input are
given blank values for it.
+In @url{https://www.w3schools.com/sql/sql_join.asp, SQL jargon} this is known
as ``full-outer joining''.
+
+A ``full'' output arrangement is effectively an inner match with the appendage
of non-matching rows from both catalogs after it.
+Therefore, similar to an inner match, the order of inputs is irrelevant in a
full arrangement and an aperture is mandatory.
+
+Let's use the same two example demo inputs of the inner arrangement above, but
call the Match program with a @code{--arrange=full} this time:
+
+@example
+$ astmatch a.txt --ccol1=X,Y b.txt --ccol2=X,Y \
+ --aperture=0.5 --outcols=a1,b1,a2,b2,a3,b3 \
+ --arrange=full --output=full-raw.fits
+
+$ asttable full-raw.fits -Y
+1.2 1 1.2 1 18.2 18.5
+3.2 3 3.2 3 20.8 16.2
+8.2 nan 8.2 nan 15.2 nan
+nan 2 nan 2 nan 23.1
+nan 4 nan 4 nan 22.7
+nan 5 nan 5 nan 23.4
+@end example
+
+You see that first two rows are the same as the inner example above.
+But now we have all the non-matching rows of both catalogs also.
+
+In a real world example, the problem with these non-matching rows is that the
coordinates are NaN/blank!
+To fix that, you can use the @code{where} operator and @ref{Column arithmetic}
like the command below (where the coordinates of @file{b.fits} are put in the
first and third columns and the coordinate columns of @file{b.fits} are
removed).
+With the last two @option{--colmetadata} options, we are giving names, units
and comments to the newly synthesized columns.
+This is because after column arithmetic, the metadata is lost by default and
it is up to you to set it: always do so (data without metadata is not too
useful!):
+
+@example
+$ asttable full-raw.fits --output=full.fits \
+ -c'arith $1 set-i i i isblank $2 where' \
+ -c'arith $3 set-i i i isblank $4 where' \
+ -cMAGa,MAGb \
+ --colmetadata=1,X,pix,"X axis position (merged)" \
+ --colmetadata=2,Y,pix,"Y axis position (merged)"
+
+$ asttable full.fits -Y
+1.2 1.2 18.2 18.5
+3.2 3.2 20.8 16.2
+8.2 8.2 15.2 nan
+2.0 2.0 nan 23.1
+4.0 4.0 nan 22.7
+5.0 5.0 nan 23.4
+@end example
+
+The @file{full.fits} table above now has the matching rows; but it also has
entries for the non-matching objects of both catalogs and only one X and Y
position.
+To find objects that were only detected in one or the other filter, the user
can simply use the @option{--noblank} option like the first example below.
+To get the inner match output, the user of the catalog can simply use the same
option with @code{_all}, like the second example below.
+
+@example
+$ asttable full.fits --noblank=MAGa
+$ asttable full.fits --noblank=_all
+@end example
+
+@item outer
+An outer row arrangement will produce a table with the same number of rows as
the second input to the Match program: every row of the second input is matched
with the nearest row of the first.
+An outer match is useful when you want to find the nearest row of a
@emph{reference} catalog (first input) for each of a @emph{query} catalog's
entries (second input).
+In this scenario, if there are multiple entries of the reference catalog in
the output, there is no problem (in other words, the match is not exclusive).
+Also, since you simply want the nearest entry, no aperture is necessary for
the matching algorithm itself; but only necessary to optimize the k-d tree
construction.
+
+@cindex SQL
+@cindex Join (SQL)
+In @url{https://www.w3schools.com/sql/sql_join.asp, SQL jargon}, there is two
types of outer arrangements (or ``joining'' as SQL calls it): left-outer and
right-outer.
+But in Gnuastro's Match program, we current only have the right-outer join.
+This is because unlike SQL (that does many more things than matching/joining)
Gnuastro's Match program is not a programming language!
+So the user can always change the order of their inputs to achieve left-outer
matching!
+In other words, within the scope of Gnuastro's Match, there is no need for the
extra complexity.
+
+@cindex Galactic extinction
+@cindex Extinction (Galactic)
+For example, let's assume you want to find the Galactic extinction for the
galaxies in your catalog.
+In this scenario, the reference catalog is a table of Galactic extinction
values for a grid of positions in the sky and the check catalog contains one
row for each of your target galaxies.
+If two or more galaxies are near the same entry in the reference catalog, you
expect them to have will have the same extinction column value.
+
+To see it in practice, let's use the @file{b.fits} of the inner arrangement
example above as our query catalog.
+For the reference catalog, let's make a new @file{c.fits}; assuming that it
has galactic extinction for some points you have already measured:
+
+@example
+$ cat c.txt
+# Column 1: X [pix, f32] X axis position
+# Column 2: Y [pix, f32] Y axis position
+# Column 3: EXT [mag, f32] Galactic extinction at (X,Y)
+1 1 0.04
+4 4 0.05
+
+$ astmatch c.txt --ccol1=X,Y b.txt --ccol2=X,Y \
+ --aperture=0.5 --outcols=bX,bY,bMAGb,aEXT \
+ --arrange=outer --output=outer.fits
+
+$ asttable outer.fits -Y
+1 1 18.5 0.04
+2 2 23.1 0.04
+3 3 16.2 0.05
+4 4 22.7 0.05
+5 5 23.4 0.05
+@end example
+
+The output has the same number of rows as the query catalog, and rows from the
reference catalog are repeated when they are the nearest to each query item.
+
+@item outer-within-aperture
+Similar to the outer match described above, with the difference that if the
nearest point is farther than @option{--aperture}, all reference table (first
input) columns will be NaN in the output.
+For more on @option{--aperture}, see @ref{Invoking astmatch}.
+This is useful in scenarios where the distance matters and you do not want
reference points that are too distant from the query catalog.
+
+As an example, let's repeat the command from the example for the outer
arrangement, but use this arrangement instead:
+
+@example
+$ astmatch c.txt --ccol1=X,Y b.txt --ccol2=X,Y \
+ --aperture=0.5 --outcols=bX,bY,bMAGb,aEXT \
+ --arrange=outer-within-aperture \
+ --output=outer-wa.fits
+
+$ asttable outer-wa.fits -Y
+1 1 18.5 0.040
+2 2 23.1 nan
+3 3 16.2 nan
+4 4 22.7 0.050
+5 5 23.4 nan
+@end example
+
+You can confirm that only the reference rows that were within the given
aperture, are not NaN/blank.
+
+@end table
+
+
+@node Matching algorithms, Invoking astmatch, Arranging match output, Match
@subsection Matching algorithms
Matching involves two catalogs, let's call them catalog A (with N rows) and
catalog B (with M rows).
The most basic matching algorithm that immediately comes to mind is this:
-for each row in A (let's call it @mymath{A_i}), go over all the rows in B
(@mymath{B_j}, where @mymath{0<j<M}) and calculate the distance
@mymath{|B_j-A_i|}.
-If this distance is less than a certain acceptable distance threshold (or
radius, or aperture), consider @mymath{A_i} and @mymath{B_j} as a match.
+for each row in A (let's call it @mymath{A_i}), go over all the rows in B
(@mymath{B_j}, where @mymath{0<j<M}) and calculate the distance
@mymath{d(A_i,B_j)}.
+Find the @mymath{B_j} that has the smallest distance to @mymath{A_i}, and if
that distance is smaller than the acceptable distance threshold (or radius, or
aperture), consider @mymath{A_i} and @mymath{B_j} as a match.
-This basic parsing algorithm is very computationally expensive:
-@mymath{N\times M} distances have to measured, and calculating the distance
requires a square root and power of 2: in 2 dimensions it would be
@mymath{\sqrt{(B_{ix}-A_{ix})^2+(B_{iy}-A_{iy})^2}}.
-If an elliptical aperture is necessary, it can even get more complicated, see
@ref{Defining an ellipse and ellipsoid}.
+However, this basic parsing algorithm is very computationally expensive:
+@mymath{N\times M} distances have to measured, and calculating the distance
requires a square root and power of 2: in 2 dimensions it would be
@mymath{d(A_i,B_j)=\sqrt{(B_{ix}-A_{ix})^2+(B_{iy}-A_{iy})^2}}.
+If an elliptical aperture is necessary, it can even get more complicated (see
@ref{Defining an ellipse and ellipsoid}).
Such operations are not simple, and will consume many cycles of your CPU!
As a result, this basic algorithm will become terribly slow as your datasets
grow in size.
For example, when N or M exceed hundreds of thousands (which is common in the
current days with datasets like the European Space Agency's Gaia mission).
Therefore that basic parsing algorithm will take too much time and more
efficient ways to @emph{find the nearest neighbor} need to be found.
-Gnuastro's Match currently has algorithms for finding the nearest neighbor:
+Gnuastro's Match currently has the following algorithms for finding the
nearest neighbor:
@table @asis
@item Sort-based
@@ -31534,31 +31737,41 @@ Use the radial distance threshold to define the width
of a moving interval over
Therefore, with a single parsing of both simultaneously, for each A-point, we
can find all the elements in B that are sufficiently near to it (within the
requested aperture).
@end enumerate
-This method has some caveats:
-1) It requires sorting, which can again be slow on large numbers.
-2) It can only be done on a single CPU thread! So it cannot benefit from the
modern CPUs with many threads.
-3) There is no way to preserve intermediate information for future matches,
for example, this can greatly help when one of the matched datasets is always
the same.
-To use this sorting method in Match, use @option{--kdtree=disable}.
+You can use this method by disabling the default algorithm that is described
next with @option{--kdtree=disable}.
+The reason the sort-based algorithm is not the default is that it has some
caveats:
+@itemize
+@item
+It requires sorting, which can again be slow on large numbers.
+@item
+It can only be done on a single thread! So it cannot benefit from the modern
CPUs with many threads, or GPUs that have hundreds/thousands of computing units.
+@item
+There is no way to preserve intermediate information for future matches (and
not have to repeat them).
+@end itemize
-@item k-d tree based
+@item k-d tree
The k-d tree concept is much more abstract, but powerful (addressing all the
caveats of the sort-based method described above.).
In short a k-d tree is a partitioning of a k-dimensional space (``k'' is just
a place-holder, so together with ``d'' for dimension, ``k-d'' means ``any
number of dimensions''!).
+
The k-d tree of table A is another table with the same number of rows, but
only two integer columns: the integers contain the row indexs (counting from
zero) of the left and right ``branch'' (in the ``tree'') of that row.
-With a k-d tree we can find the nearest point with much fewer (statistically)
checks, compared to always parsing from the top-down.
-For more on the k-d tree concept and Gnuastro's implementation, please see
@ref{K-d tree}.
+With a k-d tree we can find the nearest point with much fewer checks
(statistically: compared to always parsing everything from the top-down).
+We won't go deeper into the concept of k-d trees here and will focus on the
high-level usage in of k-d trees in Match.
+In case you are interested to learn more on the k-d tree concept and
Gnuastro's implementation, please see its
@url{https://en.wikipedia.org/wiki/K-d_tree, Wikipedia page} and @ref{K-d tree}.
-When given two catalogs (like the command below), Gnuastro's Match will
internally construct a k-d tree for catalog A (the first catalog given to it)
and use the k-d tree of A, for finding the nearest B-point(s) to each A-point
(this is done in parallel on all available CPU threads, unless you specify a
certain number of threads to use with @option{--numthreads}, see
@ref{Multi-threaded operations})
+When given two catalogs (like the command below), Gnuastro's Match will
internally construct a k-d tree for catalog A (the first catalog given to it)
and use the k-d tree of A, for finding the nearest row in B to each row in A.
+This is done in parallel on all available threads (unless you specify a
certain number of threads to use with @option{--numthreads}, see
@ref{Multi-threaded operations})
@example
$ astmatch A.fits --ccol1=ra,dec B.fits --ccol2=RA,DEC \
--aperture=1/3600
@end example
-However, optionally, you can also build the k-d tree of A and save it into a
file, with a separate call to Match, like below
+In scenarios where your reference (A) catalog is the same (and it is large!),
you can save time by building the k-d tree of A and saving it into a file once,
and simply use that k-d tree in all future matches.
+The command below shows how to do the first step (to build the k-d tree and
keep it in a file).
@example
$ astmatch A.fits --ccol1=ra,dec --kdtree=build \
--output=A-kdtree.fits
@end example
-This external k-d tree (@file{A-kdtree.fits}) can be fed to Match later (to
avoid having to reconstruct it every time you want to match a new catalog with
A) like below for matching both @file{B.fits} and @file{C.fits} with
@file{A.fits}.
-Note that the same @option{--kdtree} option above, is now given the file name
of the k-d tree, instead of @code{build}.
+This external k-d tree (@file{A-kdtree.fits}) can be fed to Match later (to
avoid having to reconstruct it every time you want to match a new catalog with
A).
+The commands below show how to do this by matching both @file{B.fits} and
@file{C.fits} with @file{A.fits} using its pre-built k-d tree.
+Note that the same @option{--kdtree} option above (which has a value of
@code{build}), is now given the file name of the already-built k-d tree.
@example
$ astmatch A.fits --ccol1=ra,dec --kdtree=A-kdtree.fits \
B.fits --ccol2=RA,DEC --aperture=1/3600 \
@@ -31568,54 +31781,35 @@ $ astmatch A.fits --ccol1=ra,dec
--kdtree=A-kdtree.fits \
--output=A-C.fits
@end example
-Irrespective of how the k-d tree is made ready (by importing or by
constructing internally), it will be used to find the nearest A-point to each
B-point.
-The k-d tree is parsed independently (on different CPU threads) for each row
of B.
-
There is just one technical issue however: when there is no neighbor within
the acceptable distance of the k-d tree, it is forced to parse all elements to
confirm that there is no match!
Therefore if one catalog only covers a small portion (in the coordinate space)
of the other catalog, the k-d tree algorithm will be forced to parse the full
k-d tree for the majority of points!
This will dramatically decrease the running speed of Match.
-Therefore, Match first divides the range of the first input in all its
dimensions into bins that have a width of the requested aperture (similar to a
histogram), and will only do the k-d tree based search when the point in
catalog B actually falls within a bin that has at least one element in A.
-@end table
-Above, we described different ways of finding the @mymath{A_i} that is nearest
to each @mymath{B_j}.
-But this is not the whole matching process!
-Let's go ahead with a ``basic'' description of what happens next...
-You may be tempted to remove @mymath{A_i} from the search of matches for
@mymath{B_k} (where @mymath{k>j}).
-Therefore, as you go down B (and more matches are found), you have to
calculate less distances (there are fewer elements in A that remain to be
checked).
-However, this will introduce an important bias: @mymath{A_i} may actually be
closer to @mymath{B_k} than to @mymath{B_j}!
-But because @mymath{B_j} happened to be before @mymath{B_k} in your table,
@mymath{A_i} was removed from the potential search domain of @mymath{B_k}.
-The good match (@mymath{B_k} with @mymath{A_i} will therefore be lost, and
replaced by a false match between @mymath{B_j} and @mymath{A_i}!
-
-In a single-dimensional match, this bias depends on the sorting of your two
datasets (leading to different matches if you shuffle your datasets).
-But it will get more complex as you add dimensionality.
-For example, catalogs derived from 2D images or 3D cubes, where you have 2 and
3 different coordinates for each point.
-
-To address this problem, in Gnuastro (the Match program, or the matching
functions of the library) similar to above, we first parse over the elements of
B.
-But we will not associate the first nearest-neighbor with a match!
-Instead, we will use an array (with the number of rows in A, let's call it
``B-in-A'') to keep the list of all nearest element(s) in B that match each
A-point.
-Once all the points in B are parsed, each A-point in B-in-A will (possibly)
have a sorted list of B-points (there may be multiple B-points that fall within
the acceptable aperture of each A-point).
-In the previous example, the @mymath{i} element (corresponding to
@mymath{A_i}) of B-in-A will contain the following list of B-points:
@mymath{B_j} and @mymath{B_k}.
-
-A new array (with the number of points in B, let's call it A-in-B) is then
used to find the final match.
-We parse over B-in-A (that was completed above), and from it extract the
nearest B-point to each A-point (@mymath{B_k} for @mymath{A_i} in the example
above).
-If this is the first A-point that is found for this B-point, then we put this
A-point into A-in-B (in the example above, element @mymath{k} is filled with
@mymath{A_k}).
-If another A-point was previously found for this B-point, then the distance of
the two A-points to that B-point are compared, and the A-point with the smaller
distance is kept in A-in-B.
-This will give us the best match between the two catalogs, independent of any
sorting issues.
-Both the B-in-A and A-in-B will also keep the distances, so distances are only
measured once.
+To mitigate this, Match first divides the range of the first input in all its
dimensions into bins that have a width of the requested aperture (similar to a
histogram), and will only do the k-d tree based search when the point in
catalog B actually falls within a bin that has at least one element in A.
-@noindent
In summary, here are the points to consider when selecting an algorithm, or
the order of your inputs (for optimal speed, the match will be the same):
@itemize
@item
For larger datasets, the k-d tree based method (when running on all threads
possible) is much more faster than the classical sort-based method.
@item
-The k-d tree is constructed for the first input table and the multi-threading
is done on the rows of the second input table.
-The construction of a larger dataset's k-d tree will take longer, but
multi-threading will work better when you have more rows.
-As a result, the optimal way to place your inputs is to give the smaller input
table (with fewer rows) as the first argument (so its k-d tree is constructed),
and the larger table as the second argument (so its rows are checked in
parallel).
-@item
If you always need to match against one catalog (that is large!), the k-d tree
construction itself can take a significant fraction of the running time.
-Therefore you can save its k-d tree into a file and simply give it to later
calls, like the example given in the description of the k-d algorithm mentioned
above.
+In such cases, save the k-d tree into a file and simply give it to later calls
(as shown above)
+@item
+For the @emph{inner} or @emph{full} arrangement of the output (described in
@ref{Arranging match output}), the order of inputs does not matter.
+But if you put the table with @emph{fewer rows} as the first input, you will
gain a lot in processing time (depending on the size of the other table and the
number of threads).
+This is because of the following facts:
+@itemize
+@item
+The k-d tree is constructed for the first input table and the construction of
a larger dataset's k-d tree will take longer (as a non-linear function of the
number of rows).
+@item
+Multi-threading is done on the rows of the second input table and it scales in
a linear way with more rows.
+@end itemize
@end itemize
+@end table
+
+
+
+
@node Invoking astmatch, , Matching algorithms, Match
@subsection Invoking Match
@@ -43141,6 +43335,15 @@ Usually, you also have other columns (such as the
magnitude and morphology) and
Once you have the permutations, they can be applied to those other columns
(see @ref{Permutations}) and the higher-level processing can continue.
So if you do not need the coordinate columns for the rest of your analysis, it
is better to set @code{inplace=1}.
+@deffn Macro GAL_MATCH_ARRANGE_FULL
+@deffnx Macro GAL_MATCH_ARRANGE_INNER
+@deffnx Macro GAL_MATCH_ARRANGE_OUTER
+@deffnx Macro GAL_MATCH_ARRANGE_INVALID
+@deffnx Macro GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE
+The arrangement of the match output; for a description of the various match
arrangements, see @ref{Arranging match output}.
+The invalid keyword is useful when you need to initialize values (and later
make sure that your users have actually given a value.
+@end deffn
+
@deftypefun {gal_data_t *} gal_match_sort_based (gal_data_t @code{*coord1},
gal_data_t @code{*coord2}, double @code{*aperture}, int @code{sorted_by_first},
int @code{inplace}, size_t @code{minmapsize}, int @code{quietmmap}, size_t
@code{*nummatched})
Use a basic sort-based match to find the matching points of two input
coordinates.
@@ -43154,18 +43357,23 @@ Therefore, when @code{inplace==0}, inputs will remain
untouched, but this functi
If internal allocation is necessary and the space is larger than
@code{minmapsize}, the space will be not allocated in the RAM, but in a file,
see description of @option{--minmapsize} and @code{--quietmmap} in
@ref{Processing options}.
@end deftypefun
-@deftypefun {gal_data_t *} gal_match_kdtree (gal_data_t @code{*coord1},
gal_data_t @code{*coord2}, gal_data_t @code{*coord1_kdtree}, size_t
@code{kdtree_root}, double @code{*aperture}, size_t @code{numthreads}, size_t
@code{minmapsize}, int @code{quietmmap}, size_t @code{*nummatched})
+@deftypefun {gal_data_t *} gal_match_kdtree (gal_data_t @code{*coord1},
gal_data_t @code{*coord2}, gal_data_t @code{*coord1_kdtree}, size_t
@code{kdtree_root}, uint8_t @code{arrange}, double @code{*aperture}, size_t
@code{numthreads}, size_t @code{minmapsize}, int @code{quietmmap}, size_t
@code{*nummatched})
@cindex Matching by k-d tree
@cindex k-d tree matching
-Use the k-d tree concept for finding matches between two catalogs, optionally
in parallel (on @code{numthreads} threads).
+Use the k-d tree algorithm for finding matches between two catalogs.
+The @code{arrange}-ment of the match output should be set with one of the
@code{GAL_MATCH_ARRANGE_*} macros described above.
The k-d tree of the first input (@code{coord1_kdtree}), and its root index
(@code{kdtree_root}), should be constructed and found before calling this
function, to do this, you can use the @code{gal_kdtree_create} of @ref{K-d
tree}.
The desired @code{aperture} array is the same as @code{gal_match_sort_based}
and described at the top of this section.
-If @code{coord1_kdtree==NULL}, this function will return a @code{NULL}
pointer and write a value of @code{0} in the space that @code{nummatched}
points to.
+If @code{coord1_kdtree==NULL}, this function will return a @code{NULL} pointer
and write a value of @code{0} in the space that @code{nummatched} points to.
+If @code{numthreads} threads is larger than one, the matching will be done in
parallel.
The final number of matches is returned in @code{nummatched} and the format of
the returned dataset (three columns) is described above.
If internal allocation is necessary and the space is larger than
@code{minmapsize}, the space will be not allocated in the RAM, but in a file,
see description of @option{--minmapsize} and @code{--quietmmap} in
@ref{Processing options}.
+When @code{arrange} is outer or outer-within-aperture, the output is only a
two column table (list of 1D @code{gal_data_t}s).
+The first one is the permutation that that should be applied to the first
input and the second one is the distance between the match.
+This is because in these types of arrangements, the second output is not
changed or permuted).
@end deftypefun
@node Statistical operations, Fitting functions, Matching, Gnuastro library
diff --git a/lib/gnuastro/match.h b/lib/gnuastro/match.h
index 2d0e5431..6daccf01 100644
--- a/lib/gnuastro/match.h
+++ b/lib/gnuastro/match.h
@@ -45,6 +45,38 @@ along with Gnuastro. If not, see
<http://www.gnu.org/licenses/>.
+/* The names are inspired by SQL's joining features:
+ https://www.w3schools.com/sql/sql_join.asp . We do not need two OUTER
+ modes because the order of inputs is important here (always using the
+ first one as reference):
+
+ - Inner: The output will contain matched rows in both catalogs. For
+ example when you have two catalogs of galaxies from different
+ telescopes and want to find the same galaxies in both.
+
+ - Outer: The output will have the same number of rows as the second
+ input, and the nearest row of the first input (within the given
+ aperture) will be allocated for it. This is generally useful in
+ classification scenarios: the first catalog will contain the
+ "coordinate" columns and a certain value for that "coordinate". A
+ second catalog's rows then need to be given the value of the nearest
+ "coordinate" in the first.
+
+ - Full (outer): Matching and non-matching rows of both. For example, you
+ want to build a combined catalog from two separate input catalogs
+ where some items match and some don't. */
+enum gal_match_arrange{
+ GAL_MATCH_ARRANGE_INVALID, /* ==0 by default. */
+ GAL_MATCH_ARRANGE_FULL,
+ GAL_MATCH_ARRANGE_INNER,
+ GAL_MATCH_ARRANGE_OUTER,
+ GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE,
+};
+
+
+
+
+
gal_data_t *
gal_match_sort_based(gal_data_t *coord1, gal_data_t *coord2,
double *aperture, int sorted_by_first,
@@ -54,8 +86,8 @@ gal_match_sort_based(gal_data_t *coord1, gal_data_t *coord2,
gal_data_t *
gal_match_kdtree(gal_data_t *coord1, gal_data_t *coord2,
gal_data_t *coord1_kdtree, size_t kdtree_root,
- double *aperture, size_t numthreads, size_t minmapsize,
- int quietmmap, size_t *nummatched);
+ uint8_t arrange, double *aperture, size_t numthreads,
+ size_t minmapsize, int quietmmap, size_t *nummatched);
diff --git a/lib/match.c b/lib/match.c
index 04b908ee..fd139aa0 100644
--- a/lib/match.c
+++ b/lib/match.c
@@ -1,5 +1,5 @@
/*********************************************************************
-match -- Functions to match catalogs and WCS.
+Match -- Functions to match catalogs.
This is part of GNU Astronomy Utilities (Gnuastro) package.
Original author:
@@ -34,6 +34,7 @@ along with Gnuastro. If not, see
<http://www.gnu.org/licenses/>.
#include <gnuastro/box.h>
#include <gnuastro/list.h>
#include <gnuastro/blank.h>
+#include <gnuastro/match.h>
#include <gnuastro/binary.h>
#include <gnuastro/kdtree.h>
#include <gnuastro/pointer.h>
@@ -150,7 +151,8 @@ match_aperture_prepare(gal_data_t *A, gal_data_t *B,
{
/* Using the box that encloses the aperture, calculate the
distance along each axis. */
- gal_box_bound_ellipse_extent(aperture[0], aperture[0]*aperture[1],
+ gal_box_bound_ellipse_extent(aperture[0],
+ aperture[0]*aperture[1],
aperture[2], dist);
/* Calculate the sin and cos of the given ellipse if necessary
@@ -277,9 +279,9 @@ match_distance(double *delta, int iscircle, size_t ndim,
-/* In the 'match_XXXX_second_in_first' functions, we made an array of
- lists, here we want to reverse that list to fix the second two issues
- that were discussed there. */
+/* In the 'match_*_second_in_first' functions, we made an array of lists,
+ here we want to reverse that list to fix the second two issues that were
+ discussed there. */
void
match_rearrange(gal_data_t *A, gal_data_t *B, struct match_sfll **bina)
{
@@ -387,8 +389,9 @@ match_rearrange(gal_data_t *A, gal_data_t *B, struct
match_sfll **bina)
/* The matching has been done, write the output. */
static gal_data_t *
-match_output(gal_data_t *A, gal_data_t *B, size_t *A_perm, size_t *B_perm,
- struct match_sfll **bina, size_t minmapsize, int quietmmap)
+match_output_inner(gal_data_t *A, gal_data_t *B, size_t *A_perm,
+ size_t *B_perm, struct match_sfll **bina,
+ size_t minmapsize, int quietmmap)
{
float r;
double *rval;
@@ -487,6 +490,57 @@ match_output(gal_data_t *A, gal_data_t *B, size_t *A_perm,
size_t *B_perm,
+/* The matching has been done, write the output. */
+static gal_data_t *
+match_output_outer(uint8_t arrange, gal_data_t *A, gal_data_t *B,
+ size_t *aoinb, double *aoinbd, size_t minmapsize,
+ int quietmmap, size_t *nummatched)
+{
+ gal_data_t *out;
+ size_t *s, *ss, nm=0;
+
+ /* If there aren't any matches to write, return NULL. */
+ if(B->size==0) return NULL;
+
+ /* For a check.
+ size_t b;
+ for(b=0;b<B->size;++b)
+ printf("%zu: %zu, %f\n", b, aoinb[b], aoinbd[b]);
+ */
+
+ /* Allocate the output list. */
+ out=gal_data_alloc(aoinb, GAL_TYPE_SIZE_T, 1, &B->size, NULL, 0,
+ minmapsize, quietmmap, "CAT1_ROW", "counter",
+ "Row index in first catalog (counting from 0).");
+ out->next=gal_data_alloc(aoinbd, GAL_TYPE_FLOAT64, 1, &B->size,
+ NULL, 0, minmapsize, quietmmap,
+ "MATCH_DIST", NULL,
+ "Distance between the match.");
+
+ /* Report the number of matches: */
+ switch(arrange)
+ {
+ case GAL_MATCH_ARRANGE_OUTER:
+ *nummatched=out->size;
+ break;
+ case GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE:
+ ss=(s=out->array)+out->size;
+ do if(*s!=GAL_BLANK_SIZE_T) ++nm; while(++s<ss);
+ *nummatched=nm;
+ break;
+ default:
+ error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at '%s' "
+ "to fix the problem. The code '%u' is not recognized for "
+ "'arrange'", __func__, PACKAGE_BUGREPORT, arrange);
+ }
+
+ /* Return the output. */
+ return out;
+}
+
+
+
+
@@ -587,7 +641,8 @@ match_sort_based_sanity_check(gal_data_t *coord1,
gal_data_t *coord2,
break;
case 3:
- if(aperture[1]<=0 || aperture[1]>1 || aperture[2]<=0 || aperture[2]>1)
+ if( aperture[1]<=0 || aperture[1]>1
+ || aperture[2]<=0 || aperture[2]>1 )
error(EXIT_FAILURE, 0, "%s: at least one of the second or "
"third values in the aperture (%g and %g respectively) "
"is smaller than zero or larger than one. In a 3D match, "
@@ -734,8 +789,8 @@ match_sort_based_second_in_first(gal_data_t *A, gal_data_t
*B,
struct match_sfll **bina)
{
/* To keep things easy to read, all variables related to catalog 1 start
- with an 'a' and things related to catalog 2 are marked with a 'b'. The
- redundant variables (those that equal a previous value) are only
+ with an 'a' and things related to catalog 2 are start with a 'b'. The
+ redundant variables (those that equal a previous variable) are only
defined to make it easy to read the code.*/
int iscircle=0;
size_t i, ar=A->size, br=B->size;
@@ -892,8 +947,8 @@ gal_match_sort_based(gal_data_t *coord1, gal_data_t *coord2,
{
int allf64=1;
gal_data_t *A, *B, *out;
- size_t *A_perm=NULL, *B_perm=NULL;
struct match_sfll **bina;
+ size_t *A_perm=NULL, *B_perm=NULL;
/* Do a small sanity check and make the preparations. After this point,
we'll call the two arrays 'a' and 'b'.*/
@@ -903,7 +958,6 @@ gal_match_sort_based(gal_data_t *coord1, gal_data_t *coord2,
allf64, &A, &B, &A_perm, &B_perm,
minmapsize);
-
/* Allocate the 'bina' array (an array of lists). Let's call the first
catalog 'a' and the second 'b'. This array has 'a->size' elements
(pointers) and for each, it keeps a list of 'b' elements that are
@@ -914,18 +968,15 @@ gal_match_sort_based(gal_data_t *coord1, gal_data_t
*coord2,
error(EXIT_FAILURE, errno, "%s: %zu bytes for 'bina'", __func__,
A->size*sizeof *bina);
-
/* All records in 'b' that match each 'a' (possibly duplicate). */
match_sort_based_second_in_first(A, B, aperture, bina);
-
/* Two re-arrangings will fix the issue. */
match_rearrange(A, B, bina);
-
/* The match is done, write the output. */
- out=match_output(A, B, A_perm, B_perm, bina, minmapsize, quietmmap);
-
+ out=match_output_inner(A, B, A_perm, B_perm, bina, minmapsize,
+ quietmmap);
/* Clean up. */
free(bina);
@@ -937,7 +988,6 @@ gal_match_sort_based(gal_data_t *coord1, gal_data_t *coord2,
if(A_perm) free(A_perm);
if(B_perm) free(B_perm);
-
/* Set 'nummatched' and return output. */
*nummatched = out ? out->next->next->size : 0;
return out;
@@ -968,6 +1018,7 @@ gal_match_sort_based(gal_data_t *coord1, gal_data_t
*coord2,
struct match_kdtree_params
{
/* Input arguments. */
+ uint8_t arrange; /* Arrangement: outer, inner, full... */
gal_data_t *A; /* 1st coordinate list of 'gal_data_t's */
gal_data_t *B; /* 2nd coordinate list of 'gal_data_t's */
size_t ndim; /* The number of dimensions. */
@@ -987,6 +1038,8 @@ struct match_kdtree_params
double *a[3]; /* Direct pointers to column arrays. */
double *b[3]; /* Direct pointers to column arrays. */
struct match_sfll **bina; /* Second cat. items in first. */
+ size_t *aoinb; /* For outer: 1st cat element in 2nd. */
+ double *aoinbd; /* Distance of the aoinb match. */
gal_data_t *Aexist; /* If any element of A exists in bins. */
double *Abinwidth; /* Width of bins along each dimension. */
double *Amin; /* Minimum value of A along each dim. */
@@ -1040,10 +1093,6 @@ match_kdtree_A_coverage(struct match_kdtree_params *p)
numbins=MATCH_KDTREE_COVERAGE_MAXBINS;
if(numbins==0) numbins=1;
- /*************************/
- //numbins=1;
- /*************************/
-
/* Generate the 'Aexist' list for this dimension. Note that if we
have a single bin in this dimension, we can just set everything
automatically. */
@@ -1103,7 +1152,8 @@ match_kdtree_A_coverage(struct match_kdtree_params *p)
{
d=bins->array;
for(i=0;i<bins->size;++i)
- printf("%zu: %-15.8f%-15.8f%u\n", i, d[i]-p->Abinwidth[dim]/2,
+ printf("%zu: %-15.8f%-15.8f%u\n", i,
+ d[i]-p->Abinwidth[dim]/2,
d[i]+p->Abinwidth[dim]/2, u[i]);
}
else
@@ -1135,6 +1185,8 @@ match_kdtree_A_coverage(struct match_kdtree_params *p)
static void
match_kdtree_sanity_check(struct match_kdtree_params *p)
{
+ double *d, *dd;
+ size_t *s, *ss;
gal_data_t *tmp;
/* Make sure all coordinates and the k-d tree have the same number of
@@ -1177,11 +1229,40 @@ match_kdtree_sanity_check(struct match_kdtree_params *p)
call the first catalog 'a' and the second 'b'. This array has
'a->size' elements (pointers) and for each, it keeps a list of 'b'
elements that are nearest to it. */
- errno=0;
- p->bina=calloc(p->A->size, sizeof *p->bina);
- if(p->bina==NULL)
- error(EXIT_FAILURE, errno, "%s: %zu bytes for 'bina'",
- __func__, p->A->size*sizeof *p->bina);
+ switch(p->arrange)
+ {
+
+ /* For inner and full, we need the bina array. */
+ case GAL_MATCH_ARRANGE_FULL:
+ case GAL_MATCH_ARRANGE_INNER:
+ errno=0;
+ p->bina=calloc(p->A->size, sizeof *p->bina);
+ if(p->bina==NULL)
+ error(EXIT_FAILURE, errno, "%s: %zu bytes for 'bina'",
+ __func__, p->A->size*sizeof *p->bina);
+ break;
+
+ /* For the outer matches, we need the aoinb array, which is initialized
+ to blank for the outer-within-aperture type. */
+ case GAL_MATCH_ARRANGE_OUTER:
+ case GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE:
+ p->aoinb=gal_pointer_allocate(GAL_TYPE_SIZE_T, p->B->size, 0,
+ __func__, "p->aoinb");
+ p->aoinbd=gal_pointer_allocate(GAL_TYPE_FLOAT64, p->B->size, 0,
+ __func__, "p->aoinb");
+ if(p->arrange==GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE)
+ {
+ ss=(s=p->aoinb)+p->B->size;
+ do *s=GAL_BLANK_SIZE_T; while(++s<ss);
+ dd=(d=p->aoinbd)+p->B->size; do *d=NAN; while(++d<dd);
+ }
+ break;
+
+ /* Un-recognized match arrangement. */
+ default:
+ error(EXIT_FAILURE, 0, "%s: arrange code %u is not recognized",
+ __func__, p->arrange);
+ }
/* Pointers to the input column arrays for easy parsing later. */
p->a[0]=p->A->array;
@@ -1252,15 +1333,22 @@ match_kdtree_worker(void *in_prm)
po = point[j] = ((double *)(ccol->array))[ bi ];
/* Make sure it covers the range of A (following the same set
- of tests as in 'gal_statistics_histogram'). */
- if( po >= p->Amin[j] && po <= p->Amax[j] )
+ of tests as in 'gal_statistics_histogram'). Note that this
+ applies to all types of matching, except for the "outer"
+ (where we simply want the nearest and no aperture is
+ defined). */
+ if( p->arrange!=GAL_MATCH_ARRANGE_OUTER )
{
- h_i=(po-p->Amin[j])/p->Abinwidth[j];
- if( existA[ h_i - (h_i==p->Aexist->size ? 1 : 0) ] == 0 )
+ if ( po >= p->Amin[j] && po <= p->Amax[j] )
+ {
+ h_i=(po-p->Amin[j])/p->Abinwidth[j];
+ if( existA[ h_i - (h_i==p->Aexist->size ? 1 : 0) ]
+ == 0 )
+ iscovered=0;
+ }
+ else
iscovered=0;
}
- else
- iscovered=0;
}
/* Increment the dimensionality counter. */
@@ -1277,8 +1365,8 @@ match_kdtree_worker(void *in_prm)
p->kdtree_root, point,
&least_dist);
- /* If nothing was found within the least distance, then the 'ai'
- will be 'GAL_BLANK_SIZE_T'. */
+ /* If nothing was found, then the 'ai' will be
+ 'GAL_BLANK_SIZE_T'. */
if(ai!=GAL_BLANK_SIZE_T)
{
/* Make sure the matched point is within the given aperture
@@ -1290,17 +1378,34 @@ match_kdtree_worker(void *in_prm)
/* If the radial distance is smaller than the radial measure,
then add this item to a match with 'ai'. */
- if(r<p->aperture[0])
- match_add_to_sfll(&p->bina[ai], bi, r);
+ if(p->arrange==GAL_MATCH_ARRANGE_OUTER || r<p->aperture[0])
+ {
+ switch(p->arrange)
+ {
+ case GAL_MATCH_ARRANGE_FULL:
+ case GAL_MATCH_ARRANGE_INNER:
+ match_add_to_sfll(&p->bina[ai], bi, r);
+ break;
+
+ case GAL_MATCH_ARRANGE_OUTER:
+ case GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE:
+ p->aoinb[bi]=ai;
+ p->aoinbd[bi]=r;
+ break;
+ }
+ }
}
-
- /* For a check:
- if(ai==GAL_BLANK_SIZE_T)
- printf("second[%zu] matched with first[%zu].\n", bi);
- else
- printf("second[%zu] DIDN'T match with first.\n", bi, bi);
- */
}
+ else ai=GAL_BLANK_SIZE_T;
+
+ /* For a check:
+ if(ai==GAL_BLANK_SIZE_T)
+ printf("%s: second[%zu] DIDN'T match with first.\n",
+ __func__, bi);
+ else
+ printf("%s: second[%zu] matched with first[%zu].\n",
+ __func__, bi, ai);
+ */
}
/* Clean up. */
@@ -1339,11 +1444,11 @@ match_kdtree_second_in_first(struct match_kdtree_params
*p,
gal_data_t *
gal_match_kdtree(gal_data_t *coord1, gal_data_t *coord2,
gal_data_t *coord1_kdtree, size_t kdtree_root,
- double *aperture, size_t numthreads, size_t minmapsize,
- int quietmmap, size_t *nummatched)
+ uint8_t arrange, double *aperture, size_t numthreads,
+ size_t minmapsize, int quietmmap, size_t *nummatched)
{
gal_data_t *out=NULL;
- struct match_kdtree_params p;
+ struct match_kdtree_params p={0};
/* In case the 'k-d' tree is empty, just return a NULL pointer and the
number of matches to zero. */
@@ -1352,6 +1457,7 @@ gal_match_kdtree(gal_data_t *coord1, gal_data_t *coord2,
/* Write the parameters into the structure. */
p.A=coord1;
p.B=coord2;
+ p.arrange=arrange;
p.aperture=aperture;
p.A_kdtree=coord1_kdtree;
p.kdtree_root=kdtree_root;
@@ -1363,20 +1469,35 @@ gal_match_kdtree(gal_data_t *coord1, gal_data_t *coord2,
radius of the first. */
match_kdtree_second_in_first(&p, numthreads, minmapsize, quietmmap);
- /* Find the best match for each item (from possibly multiple matches). */
- match_rearrange(p.A, p.B, p.bina);
-
/* The match is done, write the output. */
- out=match_output(p.A, p.B, NULL, NULL, p.bina, minmapsize, quietmmap);
+ switch(arrange)
+ {
+ case GAL_MATCH_ARRANGE_FULL:
+ case GAL_MATCH_ARRANGE_INNER:
+ match_rearrange(p.A, p.B, p.bina);
+ out=match_output_inner(p.A, p.B, NULL, NULL, p.bina, minmapsize,
+ quietmmap);
+ *nummatched = out ? out->next->next->size : 0;
+ break;
- /* Set 'nummatched' and return output. */
- *nummatched = out ? out->next->next->size : 0;
+ case GAL_MATCH_ARRANGE_OUTER:
+ case GAL_MATCH_ARRANGE_OUTERWITHINAPERTURE:
+ out=match_output_outer(arrange, p.A, p.B, p.aoinb, p.aoinbd,
+ minmapsize, quietmmap, nummatched);
+ break;
- /* Clean up and return. */
- free(p.bina);
+ default:
+ error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at '%s' to "
+ "fix the problem. The 'arrange' code '%u' is not recognized",
+ __func__, PACKAGE_BUGREPORT, arrange);
+ }
+
+ /* Clean up and return (the 'aoinb' and 'aoinbd' are not freed because
+ when used, they are directly written to the output). */
free(p.Amin);
free(p.Amax);
free(p.Abinwidth);
+ if(p.bina) free(p.bina);
gal_list_data_free(p.Aexist);
return out;
}