[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[bug-recutils] [PATCH 04/13] src,torture: imple ment index builder.
From: |
Michał Masłowski |
Subject: |
[bug-recutils] [PATCH 04/13] src,torture: imple ment index builder. |
Date: |
Mon, 20 Aug 2012 18:21:25 +0200 |
---
ChangeLog | 34 ++
src/rec-idx-tree.c | 833 +++++++++++++++++++++++++++++
src/rec.h | 49 ++
torture/Makefile.am | 1 +
torture/rec-idx-tree/rec-idx-tree-build.c | 168 ++++++
torture/rec-idx-tree/tsuite-rec-idx-tree.c | 2 +
6 files changed, 1087 insertions(+)
create mode 100644 torture/rec-idx-tree/rec-idx-tree-build.c
diff --git a/ChangeLog b/ChangeLog
index 93012af..a7a45b5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,37 @@
+2012-08-15 Michał Masłowski <address@hidden>
+
+ src,torture: implement index builder.
+
+ * src/rec-idx-tree.c (REC_IDX_TREE_MAX_FIELDS): New constant.
+ (REC_IDX_TREE_MAX_NODE_SIZE): Likewise.
+ (REC_IDX_TREE_MAX_DEPTH): Likewise.
+ (struct rec_idx_tree_node_s): New structure.
+ (struct rec_idx_tree_entry_s): Likewise.
+ (rec_idx_tree_build): New function.
+ (rec_idx_tree_entry_dispose): Likewise.
+ (rec_idx_tree_entry_compare): Likewise.
+ (rec_idx_tree_add_entries): Likewise.
+ (rec_idx_tree_get_entry_size): Likewise.
+ (rec_idx_tree_split_entries_into_nodes): Likewise.
+ (rec_idx_tree_split_nodes_into_nodes): Likewise.
+ (rec_idx_tree_node_too_big): Likewise.
+ (rec_idx_tree_node_new): Likewise.
+ (rec_idx_tree_node_dispose): Likewise.
+ (rec_idx_tree_add_node_entry): Likewise.
+ (rec_idx_tree_add_node_node): Likewise.
+ (rec_idx_tree_header_and_type_size): Likewise.
+ (rec_idx_tree_write_header): Likewise.
+ (rec_idx_tree_write_layer): Likewise.
+ (rec_idx_tree_write_key): Likewise.
+ * src/rec.h: Add prototype for rec_idx_tree_build.
+
+ * torture/Makefile.am (REC_IDX_TREE_TSUITE): Add
+ rec-idx-tree-build.c.
+ * torture/rec-idx-tree/rec-idx-tree-build.c: New file.
+ * torture/rec-idx-tree/tsuite-rec-idx-tree.c: Add
+ test_rec_idx_tree_build.
+
+
2012-08-12 Michał Masłowski <address@hidden>
src,torture: implement index tree scans.
diff --git a/src/rec-idx-tree.c b/src/rec-idx-tree.c
index f106082..da45b95 100644
--- a/src/rec-idx-tree.c
+++ b/src/rec-idx-tree.c
@@ -25,9 +25,12 @@
#include <config.h>
+#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
+#include <gl_array_list.h>
+#include <gl_list.h>
#include <rec.h>
@@ -115,6 +118,58 @@ struct rec_idx_tree_header_s
#define REC_IDX_TREE_NONFIRST_MASK INT64_C(0x8000000000000000)
+/* Maximum number of fields in a key for building. For reading it's
+ always UINT8_MAX due to the index format, no higher value is
+ supported for building. */
+
+#define REC_IDX_TREE_MAX_FIELDS 4
+
+/* Maximum size of an index node having more than two entries, in
+ octets. */
+
+#define REC_IDX_TREE_MAX_NODE_SIZE 4096
+
+/* Maximum depth of the index tree for build, zero means a single leaf
+ node. A tree of depth n must have at least 2**n entries, so the
+ limit of UINT8_MAX from the index format will never be
+ exceeded. */
+
+#define REC_IDX_TREE_MAX_DEPTH 64
+
+/* A node for building. The type of the elements in entries depend on
+ the level. Unlike in the index data format, these objects contain
+ inner node entry data in the node it would point to. */
+
+struct rec_idx_tree_node_s
+{
+ size_t n_first; /* number of first nodes in the subtree */
+ void **key; /* smallest key in the node */
+ uint64_t *offset; /* location of the offset field in the parent node */
+ gl_list_t entries; /* list of struct rec_idx_tree_node_s for inner
+ nodes or struct rec_idx_tree_entry_s for leaf
+ nodes */
+ size_t size; /* number of octets taken by the node in index data */
+ size_t upper_size; /* number of ctets taken by the node key in an
+ upper layer node */
+};
+
+/* Object representing a leaf entry for tree building. */
+
+struct rec_idx_tree_entry_s
+{
+ /* general data for the whole index */
+ size_t n_fields; /* number of fields in the key */
+ const char **names; /* names of the fields, this is not a copy */
+ const rec_idx_tree_key_t *types; /* types of the fields */
+ /* entry data */
+ void **key; /* key from which the entry is mapped in the format of
+ a rec_idx_tree_scan argument */
+ uint64_t *offset; /* location of the offset field in the parent node */
+ uint64_t line; /* record line as in a node */
+ uint64_t character; /* record character as in a node */
+ size_t size; /* number of octets taken by the node in index data */
+};
+
/* Static functions defined in this file. */
static rec_iter_t rec_idx_tree_scan_common (rec_idx_tree_t idx_tree, const
void **left, const void **right, size_t left_ind, size_t right_ind, bool
first_only);
@@ -123,6 +178,27 @@ static int rec_idx_tree_skip_and_compare_key
(rec_idx_tree_t tree, const uint8_t
static bool rec_idx_tree_iter_next (void *data, rec_record_t *record);
static void rec_idx_tree_iter_dispose (void *data);
+static void rec_idx_tree_entry_dispose (const void *data);
+static int rec_idx_tree_entry_compare (const void *elt1, const void *elt2);
+
+static bool rec_idx_tree_add_entries (gl_list_t entries, struct
rec_idx_tree_entry_s *template, rec_record_t record, void **prefix, size_t
prefix_length, bool first);
+static size_t rec_idx_tree_get_entry_size (struct rec_idx_tree_entry_s *entry);
+
+static gl_list_t rec_idx_tree_split_entries_into_nodes (gl_list_t entries,
size_t *index_size);
+static gl_list_t rec_idx_tree_split_nodes_into_nodes (gl_list_t entries,
size_t *index_size);
+
+static bool rec_idx_tree_node_too_big (struct rec_idx_tree_node_s *node,
size_t to_add);
+static struct rec_idx_tree_node_s *rec_idx_tree_node_new (void);
+static void rec_idx_tree_node_dispose (const void *data);
+static bool rec_idx_tree_add_node_entry (struct rec_idx_tree_node_s *node,
struct rec_idx_tree_entry_s *entry);
+static bool rec_idx_tree_add_node_node (struct rec_idx_tree_node_s *node,
struct rec_idx_tree_node_s *entry);
+
+static size_t rec_idx_tree_header_and_type_size (size_t n_fields, const char
**field_names);
+
+static uint8_t *rec_idx_tree_write_header (uint8_t *bytes, size_t rset_number,
size_t depth, size_t n_fields, const char **field_names, const
rec_idx_tree_key_t *field_types);
+static uint8_t *rec_idx_tree_write_layer (uint8_t *buffer, uint8_t*
current_byte, gl_list_t nodes, bool leaf, size_t n_fields, const
rec_idx_tree_key_t *types);
+static uint8_t *rec_idx_tree_write_key (uint8_t *buffer, uint8_t
*current_byte, size_t n_fields, const rec_idx_tree_key_t *types, const void
**key);
+
/*
* Public functions.
*/
@@ -276,6 +352,138 @@ rec_idx_tree_scan_interval (rec_idx_tree_t idx_tree,
size_t left, size_t right)
return rec_idx_tree_scan_common (idx_tree, NULL, NULL, left, right, true);
}
+bool
+rec_idx_tree_build (rec_rset_t rset,
+ size_t rset_number,
+ size_t n_fields,
+ const char **field_names,
+ const rec_idx_tree_key_t *field_types,
+ uint8_t **byte_array,
+ size_t *size,
+ int64_t *type)
+{
+ gl_list_t entries; /* A list of struct rec_idx_tree_entry_s objects. */
+ rec_mset_iterator_t iter;
+ rec_record_t record;
+ /* Common data for entries. */
+ struct rec_idx_tree_entry_s template;
+ /* A prefix of keys for backtracking. */
+ void *key_prefix[REC_IDX_TREE_MAX_FIELDS];
+ /* A list of nodes on a tree layer. */
+ gl_list_t layers[REC_IDX_TREE_MAX_DEPTH + 1];
+ size_t index_size, header_size;
+ uint8_t *bytes, *current_byte;
+ int current_layer;
+
+ /* Check arguments for index format-specific limits. */
+ if ((rset_number > UINT8_MAX)
+ || (n_fields == 0) || (n_fields > REC_IDX_TREE_MAX_FIELDS)
+ || (rec_rset_num_records (rset) == 0))
+ {
+ return false;
+ }
+
+ /* Allocate the list of entries. */
+ entries = gl_list_nx_create_empty (GL_ARRAY_LIST, NULL,
+ NULL, rec_idx_tree_entry_dispose, true);
+ if (!entries)
+ {
+ return false;
+ }
+
+ /* Initialize the entry template. */
+ template.n_fields = n_fields;
+ template.names = field_names;
+ template.types = field_types;
+
+ /* Measure index file header and key type size. */
+ header_size = rec_idx_tree_header_and_type_size (n_fields, field_names);
+ index_size = header_size;
+
+ /* Make entries for records and add them to the list. */
+ iter = rec_mset_iterator (rec_rset_mset (rset));
+ while (rec_mset_iterator_next (&iter, MSET_RECORD, (const void **) &record,
+ NULL))
+ {
+ if (!rec_idx_tree_add_entries (entries, &template,
+ record, key_prefix, 0, true))
+ {
+ /* Not enough memory. */
+ gl_list_free (entries);
+ return false;
+ }
+ }
+
+ /* Split the list into leaf nodes. */
+ layers[0] = rec_idx_tree_split_entries_into_nodes (entries, &index_size);
+ if (!layers[0])
+ {
+ gl_list_free (entries);
+ return false;
+ }
+
+ /* Make layers of inner nodes if needed. */
+ current_layer = 0;
+ while (gl_list_size (layers[current_layer]) > 1)
+ {
+ layers[current_layer + 1] =
+ rec_idx_tree_split_nodes_into_nodes (layers[current_layer],
+ &index_size);
+ current_layer++;
+ if (layers[current_layer] == NULL)
+ {
+ /* Not enough memory. */
+ gl_list_free (entries);
+ while (current_layer > 0)
+ {
+ gl_list_free (layers[--current_layer]);
+ }
+ return false;
+ }
+ }
+
+ /* Allocate memory for the tree. */
+ bytes = malloc (index_size);
+ if (!bytes)
+ {
+ gl_list_free (entries);
+ do
+ {
+ gl_list_free (layers[current_layer--]);
+ }
+ while (current_layer > 0);
+ return NULL;
+ }
+
+ /* Write the tree header and key type. */
+ current_byte = rec_idx_tree_write_header (bytes,
+ rset_number,
+ current_layer,
+ n_fields,
+ field_names,
+ field_types);
+ assert ((size_t) (current_byte - bytes) == header_size);
+
+ /* Write the nodes. Start at the root, process each layer and
+ descend in the BFS order. */
+ while (current_layer >= 0)
+ {
+ current_byte = rec_idx_tree_write_layer (bytes,
+ current_byte,
+ layers[current_layer],
+ current_layer == 0,
+ n_fields,
+ field_types);
+ current_layer--;
+ }
+
+ /* Set output arguments. */
+ *byte_array = bytes;
+ *size = index_size;
+ *type = REC_IDX_TREE_TYPE;
+ return true;
+}
+
/*
* Private functions.
*/
@@ -665,4 +873,629 @@ rec_idx_tree_iter_dispose (void *data)
free (iter);
}
+/* Dispose a struct rec_idx_tree_entry_s object. */
+
+static void
+rec_idx_tree_entry_dispose (const void *data)
+{
+ size_t n;
+ struct rec_idx_tree_entry_s *entry = (struct rec_idx_tree_entry_s *) data;
+
+ for (n = 0; n < entry->n_fields; n++)
+ {
+ free (entry->key[n]);
+ }
+
+ free (entry);
+}
+
+/* Compare struct rec_idx_tree_entry_s objects. */
+
+static int
+rec_idx_tree_entry_compare (const void *elt1, const void *elt2)
+{
+ size_t n;
+ int comparison;
+ struct rec_idx_tree_entry_s *entry1 = (struct rec_idx_tree_entry_s *) elt1,
+ *entry2 = (struct rec_idx_tree_entry_s *) elt2;
+
+ /* Compare by key. */
+ for (n = 0; n < entry1->n_fields; n++)
+ {
+ /* NULLs are before other values. */
+ if (entry1->key[n] == NULL)
+ {
+ if (entry2->key[n] == NULL)
+ {
+ continue;
+ }
+ else
+ {
+ return -1;
+ }
+ }
+ else if (entry2->key[n] == NULL)
+ {
+ return 1;
+ }
+
+ /* Compare if both entries have keys. */
+ switch (entry1->types[n])
+ {
+ case REC_IDX_KEY_NONE:
+ break;
+ case REC_IDX_KEY_STRING:
+ comparison = strcmp ((const char *) entry1->key[n],
+ (const char *) entry2->key[n]);
+ if (comparison != 0)
+ {
+ return comparison;
+ }
+ break;
+ }
+ }
+
+ /* If equal, compare by record position. */
+ if (entry1->character != entry2->character)
+ {
+ if (entry1->character < entry2->character)
+ {
+ return -1;
+ }
+ else
+ {
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* Add all entries for given record to the list. The template
+ argument has non-entry-specific fields that will be used for all
+ entries. The prefix array is used for key generation and stores
+ its prefix of length prefix_length on a recursive call. The
+ argument first is 'true' if this prefix is used by the first entry
+ that would be used for sorting. Returns 'false' on error, maybe
+ having added some entries to the list. */
+
+static bool
+rec_idx_tree_add_entries (gl_list_t entries,
+ struct rec_idx_tree_entry_s *template,
+ rec_record_t record,
+ void **prefix,
+ size_t prefix_length,
+ bool first)
+{
+ struct rec_idx_tree_entry_s *entry;
+ rec_field_t field;
+ size_t n, num_fields_here, char_location;
+
+ if (prefix_length == template->n_fields)
+ {
+ /* We have a complete key in prefix, let's make an entry. */
+ entry = malloc (sizeof (*entry));
+ if (!entry)
+ {
+ return false;
+ }
+
+ memcpy (entry, template, sizeof (*entry));
+
+ entry->key = malloc (sizeof (void *) * template->n_fields);
+ if (!(entry->key))
+ {
+ free (entry);
+ return false;
+ }
+
+ memcpy (entry->key, prefix, template->n_fields * sizeof (entry->key[0]));
+ entry->line = (uint64_t) rec_record_location (record);
+ char_location = (uint64_t) rec_record_char_location (record);
+ entry->character = (char_location > 0) ? (char_location - 1) : 0;
+ entry->size = rec_idx_tree_get_entry_size (entry);
+ if (!first)
+ {
+ entry->line |= REC_IDX_TREE_NONFIRST_MASK;
+ }
+
+ /* Add the entry. */
+ if (!gl_sortedlist_nx_add (entries, rec_idx_tree_entry_compare, entry))
+ {
+ return false;
+ }
+ }
+ else
+ {
+ /* Recurse with all possible values for the next field. */
+ num_fields_here
+ = rec_record_get_num_fields_by_name (record,
+ template->names[prefix_length]);
+ if (num_fields_here == 0)
+ {
+ /* No values, leave NULL. When NULL is smaller than any
+ other key value, this emulates the behaviour of rset
+ sorting. */
+ prefix[prefix_length] = NULL;
+ return rec_idx_tree_add_entries (entries, template, record,
+ prefix, prefix_length + 1, first);
+ }
+ else
+ {
+ for (n = 0; n < num_fields_here; n++)
+ {
+ field =
+ rec_record_get_field_by_name (record,
+ template->names[prefix_length],
+ n);
+ prefix[prefix_length] = rec_field_value (field);
+ if (!rec_idx_tree_add_entries (entries, template, record,
+ prefix, prefix_length + 1,
+ first && (n == 0)))
+ {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+/* Return the size of an entry in a leaf node binary format. */
+
+static size_t
+rec_idx_tree_get_entry_size (struct rec_idx_tree_entry_s *entry)
+{
+ size_t res = sizeof (uint64_t) * 2;
+ size_t n;
+
+ for (n = 0; n < entry->n_fields; n++)
+ {
+ switch (entry->types[n])
+ {
+ case REC_IDX_KEY_NONE:
+ break;
+ case REC_IDX_KEY_STRING:
+ if (entry->key[n] == NULL)
+ {
+ res += 1; /* NULL strings are represented as empty strings */
+ }
+ else
+ {
+ res += strlen (entry->key[n]) + 1;
+ }
+ }
+ }
+
+ if (res % 8 != 0)
+ {
+ res += 8 - (res % 8);
+ }
+
+ return res;
+}
+
+/* Make a list of nodes from a list of leaf entries. The *index_size
+ variable will be increased by the number of octets taken by the
+ nodes of the returned list. On error NULL is returned and
+ *index_size is undefined. */
+
+static gl_list_t
+rec_idx_tree_split_entries_into_nodes (gl_list_t entries,
+ size_t *index_size)
+{
+ gl_list_t res;
+ struct rec_idx_tree_entry_s *entry;
+ struct rec_idx_tree_node_s *last_node = NULL;
+ gl_list_iterator_t iter;
+
+ /* We use a simple greedy algorithm: add entries to a node until
+ there is no more entries or the node already has two elements and
+ would get larger than REC_IDX_TREE_MAX_NODE_SIZE. This should
+ make reading a node require at most one disk access and a query
+ for a single element require reading not more than O(log n) nodes
+ if there are n leaf entries in the tree. */
+
+ res = gl_list_nx_create_empty (GL_ARRAY_LIST, NULL, NULL,
+ rec_idx_tree_node_dispose, true);
+ if (!res)
+ {
+ return NULL;
+ }
+
+ for (iter = gl_list_iterator (entries);
+ gl_list_iterator_next (&iter, (const void **) &entry, NULL);)
+ {
+ /* Go to the next node if there is no space left in the current
+ one. */
+ if ((last_node == NULL)
+ || rec_idx_tree_node_too_big (last_node, entry->size))
+ {
+ if (last_node)
+ {
+ *index_size += last_node->size;
+ }
+
+ last_node = rec_idx_tree_node_new ();
+ if (!last_node)
+ {
+ gl_list_free (res);
+ return NULL;
+ }
+
+ last_node->size += sizeof (uint64_t); /* next pointer */
+
+ if (!gl_list_nx_add_last (res, last_node))
+ {
+ rec_idx_tree_node_dispose (last_node);
+ gl_list_free (res);
+ return NULL;
+ }
+ }
+
+ /* Add the entry. */
+ if (!rec_idx_tree_add_node_entry (last_node, entry))
+ {
+ gl_list_free (res);
+ return NULL;
+ }
+ }
+
+ *index_size += last_node->size;
+ return res;
+}
+
+/* Just like rec_idx_tree_split_entries_into_nodes for making inner
+ nodes. Nodes are used as entries. */
+static gl_list_t
+rec_idx_tree_split_nodes_into_nodes (gl_list_t entries,
+ size_t *index_size)
+{
+ gl_list_t res;
+ struct rec_idx_tree_node_s *entry, *last_node = NULL;
+ gl_list_iterator_t iter;
+
+ res = gl_list_nx_create_empty (GL_ARRAY_LIST, NULL, NULL,
+ rec_idx_tree_node_dispose, true);
+ if (!res)
+ {
+ return NULL;
+ }
+
+ for (iter = gl_list_iterator (entries);
+ gl_list_iterator_next (&iter, (const void **) &entry, NULL);)
+ {
+ /* Go to the next node if there is no space left in the current
+ one. */
+ if ((last_node == NULL)
+ || rec_idx_tree_node_too_big (last_node, entry->upper_size))
+ {
+ if (last_node)
+ {
+ *index_size += last_node->size;
+ }
+
+ last_node = rec_idx_tree_node_new ();
+ if (!last_node)
+ {
+ gl_list_free (res);
+ return NULL;
+ }
+
+ if (!gl_list_nx_add_last (res, last_node))
+ {
+ rec_idx_tree_node_dispose (last_node);
+ gl_list_free (res);
+ return NULL;
+ }
+ }
+
+ /* Add the entry. */
+ if (!rec_idx_tree_add_node_node (last_node, entry))
+ {
+ gl_list_free (res);
+ return NULL;
+ }
+ }
+
+ *index_size += last_node->size;
+ return res;
+}
+
+/* Return 'true' if the node should not have an entry of size to_add
+ added. */
+
+static bool
+rec_idx_tree_node_too_big (struct rec_idx_tree_node_s *node, size_t to_add)
+{
+ if (gl_list_size (node->entries) < 2)
+ {
+ return false;
+ }
+
+ return node->size + to_add > REC_IDX_TREE_MAX_NODE_SIZE;
+}
+
+static struct rec_idx_tree_node_s *
+rec_idx_tree_node_new (void)
+{
+ struct rec_idx_tree_node_s *res;
+
+ res = malloc (sizeof (*res));
+ if (!res)
+ {
+ return NULL;
+ }
+
+ memset (res, 0, sizeof (*res)); /* zero most fields */
+ res->entries = gl_list_nx_create_empty (GL_ARRAY_LIST, NULL, NULL,
+ rec_idx_tree_node_dispose, true);
+ if (!res->entries)
+ {
+ return NULL;
+ }
+
+ res->size = sizeof (uint64_t); /* for the number of entries */
+
+ return res;
+}
+
+static void
+rec_idx_tree_node_dispose (const void *data)
+{
+ struct rec_idx_tree_node_s *node = (struct rec_idx_tree_node_s *) data;
+
+ gl_list_free (node->entries);
+ free (node);
+}
+
+/* Add an entry to a node. */
+static bool
+rec_idx_tree_add_node_entry (struct rec_idx_tree_node_s *node,
+ struct rec_idx_tree_entry_s *entry)
+{
+ if (!gl_list_nx_add_last (node->entries, entry))
+ {
+ return false;
+ }
+
+ if (!(entry->line & REC_IDX_TREE_NONFIRST_MASK))
+ {
+ node->n_first++;
+ }
+
+ node->size += entry->size;
+
+ if (gl_list_size (node->entries) == 1)
+ {
+ node->key = entry->key;
+ node->upper_size = entry->size;
+ }
+
+ return node;
+}
+
+/* Add a subtree node to an inner node. */
+static bool
+rec_idx_tree_add_node_node (struct rec_idx_tree_node_s *node,
+ struct rec_idx_tree_node_s *entry)
+{
+ if (!gl_list_nx_add_last (node->entries, entry))
+ {
+ return false;
+ }
+
+ node->n_first += entry->n_first;
+
+ if (gl_list_size (node->entries) == 1)
+ {
+ node->key = entry->key;
+ node->upper_size = entry->upper_size;
+ node->size += 2 * sizeof (uint64_t); /* no key stored */
+ }
+ else
+ {
+ node->size += entry->upper_size;
+ }
+ return node;
+}
+
+/* Return the size of index data without nodes. */
+
+static size_t
+rec_idx_tree_header_and_type_size (size_t n_fields,
+ const char **field_names)
+{
+ size_t size, n;
+
+ /* The header. */
+ size = sizeof (struct rec_idx_tree_header_s);
+
+ /* Key type elements. */
+ for (n = 0; n < n_fields; n++)
+ {
+ /* type, name and '\0' */
+ size += 2 + strlen (field_names[n]);
+ }
+
+ /* Padding. */
+ if (size % 8 != 0)
+ {
+ size += 8 - (size % 8);
+ }
+
+ return size;
+}
+
+/* Write the header and key type at bytes, return the pointer to the
+ next byte after padding. */
+
+static uint8_t *
+rec_idx_tree_write_header (uint8_t *bytes,
+ size_t rset_number,
+ size_t depth,
+ size_t n_fields,
+ const char **field_names,
+ const rec_idx_tree_key_t *field_types)
+{
+ size_t size, n, padding;
+ struct rec_idx_tree_header_s *header;
+ uint8_t *next_byte;
+
+ /* The header. */
+ header = (struct rec_idx_tree_header_s *) bytes;
+ header->n_rsets = rset_number;
+ header->depth = depth;
+ header->n_fields = n_fields;
+ next_byte = bytes + sizeof (*header);
+
+ /* Key type elements. */
+ for (n = 0; n < n_fields; n++)
+ {
+ *(next_byte++) = (uint8_t) field_types[n];
+ size = strlen (field_names[n]) + 1;
+ next_byte = memcpy (next_byte, field_names[n], size) + size;
+ }
+
+ /* Padding. */
+ if ((next_byte - bytes) % 8 != 0)
+ {
+ padding = 8 - ((next_byte - bytes) % 8);
+ next_byte = memset (next_byte, 0, padding) + padding;
+ }
+
+ return next_byte;
+}
+
+/* Write a layer of nodes (leaf or non-leaf depending on the fourth
+ parameter). The first parameter is the start of index data, the
+ second one is the start of the current layer there. Returns a
+ pointer to the next byte after the nodes. */
+static uint8_t *
+rec_idx_tree_write_layer (uint8_t* buffer,
+ uint8_t* current_byte,
+ gl_list_t nodes,
+ bool leaf,
+ size_t n_fields,
+ const rec_idx_tree_key_t *types)
+{
+ gl_list_iterator_t layer_iter, node_iter;
+ struct rec_idx_tree_node_s *node, *node_entry;
+ struct rec_idx_tree_entry_s *entry;
+ void *element;
+ uint64_t *next_word;
+ bool first, first_in_node;
+ uint8_t *start_of_node;
+
+ next_word = (uint64_t *) current_byte;
+ first = true;
+ for (layer_iter = gl_list_iterator (nodes);
+ gl_list_iterator_next (&layer_iter, (const void **) &node, NULL);)
+ {
+ first_in_node = true;
+ start_of_node = (uint8_t *) next_word;
+ if (node->offset != NULL)
+ {
+ /* Set the offset of this node in the parent. */
+ *(node->offset) = (uint64_t) (current_byte - buffer);
+ }
+
+ if (leaf && !first)
+ {
+ /* Offset of this leaf. */
+ *(next_word) = (uint8_t *) (next_word + 1) - buffer;
+ next_word++;
+ }
+
+ /* Write the node length. */
+ *(next_word++) = (uint64_t) gl_list_size (node->entries);
+
+ /* Write node entries. */
+ for (node_iter = gl_list_iterator (node->entries);
+ gl_list_iterator_next (&node_iter, (const void **) &element, NULL);)
+ {
+ if (leaf)
+ {
+ entry = (struct rec_idx_tree_entry_s *) element;
+ *(next_word++) = entry->line;
+ *(next_word++) = entry->character;
+ next_word = (uint64_t *)
+ rec_idx_tree_write_key (buffer, (uint8_t *) next_word,
+ n_fields, types,
+ (const void **) entry->key);
+ }
+ else
+ {
+ node_entry = (struct rec_idx_tree_node_s *) element;
+ if (!first_in_node)
+ {
+ next_word = (uint64_t *)
+ rec_idx_tree_write_key (buffer, (uint8_t *) next_word,
+ n_fields, types,
+ (const void **) node_entry->key);
+ }
+ else
+ {
+ first_in_node = false;
+ }
+
+ node_entry->offset = next_word++; /* will be set in
+ next layer */
+ *(next_word++) = node_entry->n_first;
+ }
+ }
+
+ assert ((size_t) (((uint8_t *) next_word) - start_of_node)
+ == node->size - ((first && leaf) ? 8 : 0));
+ first = false;
+ }
+
+ if (leaf)
+ {
+ /* No next node. */
+ *(next_word++) = 0;
+ }
+
+ return (uint8_t *) next_word;
+}
+
+/* Write a key and return the next byte position after padding. */
+static uint8_t *
+rec_idx_tree_write_key (uint8_t *buffer,
+ uint8_t *current_byte,
+ size_t n_fields,
+ const rec_idx_tree_key_t *types,
+ const void **key)
+{
+ size_t n, length, padding;
+
+ for (n = 0; n < n_fields; n++)
+ {
+ switch (types[n])
+ {
+ case REC_IDX_KEY_NONE:
+ break;
+ case REC_IDX_KEY_STRING:
+ if (key[n] != NULL)
+ {
+ length = strlen ((const char *) key[n]) + 1;
+ current_byte = memcpy (current_byte, key[n], length) + length;
+ }
+ else
+ {
+ *(current_byte++) = '\0';
+ }
+ break;
+ }
+ }
+
+ if ((current_byte - buffer) % 8 != 0)
+ {
+ padding = 8 - (current_byte - buffer) % 8;
+ current_byte = memset (current_byte, 0, padding) + padding;
+ }
+
+ return current_byte;
+}
+
/* End of rec-idx-tree.c */
diff --git a/src/rec.h b/src/rec.h
index 548e439..0aaad13 100644
--- a/src/rec.h
+++ b/src/rec.h
@@ -2532,6 +2532,55 @@ rec_iter_t rec_idx_tree_scan (rec_idx_tree_t idx_tree,
const void **left, const
rec_iter_t rec_idx_tree_scan_interval (rec_idx_tree_t idx_tree, size_t left,
size_t right);
+/**************** Building index trees *******************************/
+
+/* Build an index tree for a record set.
+
+ The following arguments are used for input:
+
+ RSET
+
+ The record set from which records will be taken and indexed.
+ Must have at least one record.
+
+ RSET_NUMBER
+
+ The number of preceding record sets in the recfile, as would be
+ returned by rec_idx_tree_get_rset_number for the output index
+ data.
+
+ N_FIELDS
+
+ The number of fields in the index key type.
+
+ FIELD_NAMES
+
+ The names of fields in the index key type.
+
+ FIELD_TYPES
+
+ The types of fields in the index key type.
+
+ The return value is 'true' on success, 'false' on error. If no
+ error occurs these arguments will be set to the following values:
+
+ BYTE_ARRAY
+
+ Start of the allocated index data, the caller should call free
+ on it after use.
+
+ SIZE
+
+ The number of octets in BYTE_ARRAY.
+
+ TYPE
+
+ The type in an index file for the index built.
+
+*/
+
+bool rec_idx_tree_build (rec_rset_t rset, size_t rset_number, size_t n_fields,
const char **field_names, const rec_idx_tree_key_t *field_types, uint8_t
**byte_array, size_t *size, int64_t *type);
+
#endif /* !GNU_REC_H */
/* End of rec.h */
diff --git a/torture/Makefile.am b/torture/Makefile.am
index 38d9e6d..dc01501 100644
--- a/torture/Makefile.am
+++ b/torture/Makefile.am
@@ -149,6 +149,7 @@ REC_ITER_TSUITE = rec-iter/rec-iter-mset.c \
REC_IDX_TREE_TSUITE = rec-idx-tree/rec-idx-tree-new.c \
rec-idx-tree/rec-idx-tree-scan.c \
+ rec-idx-tree/rec-idx-tree-build.c \
rec-idx-tree/tsuite-rec-idx-tree.c
runtests_SOURCES = runtests.c \
diff --git a/torture/rec-idx-tree/rec-idx-tree-build.c
b/torture/rec-idx-tree/rec-idx-tree-build.c
new file mode 100644
index 0000000..fb34e01
--- /dev/null
+++ b/torture/rec-idx-tree/rec-idx-tree-build.c
@@ -0,0 +1,168 @@
+/* -*- mode: C -*-
+ *
+ * File: rec-idx-tree-build.c
+ * Date: Wed Aug 15 15:16:05 2012
+ *
+ * GNU recutils - rec_idx_tree_build unit tests.
+ *
+ */
+
+/* Copyright (C) 2012 Michał Masłowski */
+
+/* This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <check.h>
+
+#include <rec.h>
+
+/* Data for indexes. */
+static const char recfile[] = "a: 4\n\na: 3\n\na: 1\n\na: 2\nb: 1\n\na:
5\nb:2";
+static const char *names[] = {"b", "a"};
+static const rec_idx_tree_key_t types[] = {REC_IDX_KEY_STRING,
+ REC_IDX_KEY_STRING};
+
+/*-
+ * Test: rec_idx_tree_build_verify_header
+ * Unit: rec_idx_tree_build
+ * Description:
+ * + Build an index and check the header.
+ */
+START_TEST(rec_idx_tree_build_verify_header)
+{
+ rec_idx_tree_t tree;
+ rec_rset_t rset;
+ rec_parser_t parser;
+ rec_iter_t iter;
+ rec_record_t record;
+ uint8_t *buffer = NULL;
+ size_t size = 0;
+ uint64_t type = 0;
+
+ /* Prepare the rset. */
+ parser = rec_parser_new_str (recfile, "dummy");
+ fail_if (!parser);
+ fail_if (!rec_parse_rset (parser, &rset));
+
+ /* Build the index and check obvious result problems. */
+ fail_if (!rec_idx_tree_build (rset, 1, 2, names, types,
+ &buffer, &size, &type));
+ fail_if (buffer == NULL);
+ fail_if (size < 16);
+ fail_if (type == 0);
+
+ /* Close the rset, reset parser. */
+ rec_rset_destroy (rset);
+ rec_parser_destroy (parser);
+
+ /* Open the index. */
+ tree = rec_idx_tree_new (buffer, type, size);
+
+ /* Verify the header. */
+ fail_if (rec_idx_tree_get_rset_number (tree) != 1);
+ fail_if (rec_idx_tree_get_num_fields (tree) != 2);
+ fail_if (strcmp (rec_idx_tree_get_field_name (tree, 0), "b"));
+ fail_if (strcmp (rec_idx_tree_get_field_name (tree, 1), "a"));
+ fail_if (rec_idx_tree_get_field_type (tree, 0) != REC_IDX_KEY_STRING);
+ fail_if (rec_idx_tree_get_field_type (tree, 1) != REC_IDX_KEY_STRING);
+
+ /* Close. */
+ rec_idx_tree_destroy (tree);
+}
+END_TEST
+
+/*-
+ * Test: rec_idx_tree_build_verify_elements
+ * Unit: rec_idx_tree_build
+ * Description:
+ * + Build an index and verify elements.
+ */
+START_TEST(rec_idx_tree_build_verify_elements)
+{
+ rec_idx_tree_t tree;
+ rec_rset_t rset;
+ rec_parser_t parser;
+ rec_iter_t iter;
+ rec_record_t record;
+ uint8_t *buffer = NULL;
+ size_t size = 0;
+ uint64_t type = 0;
+ static const char *left[] = {NULL, "1"};
+ static const char *right[] = {"2", "5"};
+
+ /* Prepare the rset. */
+ parser = rec_parser_new_str (recfile, "dummy");
+ fail_if (!parser);
+ fail_if (!rec_parse_rset (parser, &rset));
+
+ /* Build the index and check obvious result problems. */
+ fail_if (!rec_idx_tree_build (rset, 1, 2, names, types,
+ &buffer, &size, &type));
+ fail_if (buffer == NULL);
+ fail_if (size < 16);
+ fail_if (type == 0);
+
+ /* Close the rset, reset parser. */
+ rec_rset_destroy (rset);
+ rec_parser_reset (parser);
+
+ /* Open the rset and index. */
+ fail_if (!rec_parse_rset_lazy (parser, &rset));
+ tree = rec_idx_tree_new (buffer, type, size);
+ rec_idx_tree_set_rset (tree, parser, rset);
+
+ /* Do a scan. */
+ iter = rec_idx_tree_scan (tree, (const void **) left, (const void **) right);
+ fail_if (!iter);
+
+ /* Read the records. */
+ fail_if (!rec_iter_next (iter, &record));
+ fail_if (!rec_record_contains_field (record, "a", "1"));
+ fail_if (!rec_iter_next (iter, &record));
+ fail_if (!rec_record_contains_field (record, "a", "3"));
+ fail_if (!rec_iter_next (iter, &record));
+ fail_if (!rec_record_contains_field (record, "a", "4"));
+ fail_if (!rec_iter_next (iter, &record));
+ fail_if (!rec_record_contains_field (record, "a", "2"));
+ fail_if (!rec_iter_next (iter, &record));
+ fail_if (!rec_record_contains_field (record, "a", "5"));
+ fail_if (rec_iter_next (iter, &record));
+
+ /* Close. */
+ rec_iter_destroy (iter);
+ rec_idx_tree_destroy (tree);
+ rec_rset_destroy (rset);
+ rec_parser_destroy (parser);
+}
+END_TEST
+
+/*
+ * Test creation function
+ */
+TCase *
+test_rec_idx_tree_build (void)
+{
+ TCase *tc = tcase_create ("rec_idx_tree_build");
+ tcase_add_test (tc, rec_idx_tree_build_verify_header);
+ tcase_add_test (tc, rec_idx_tree_build_verify_elements);
+
+ return tc;
+}
+
+/* End of rec-idx-tree-build.c */
diff --git a/torture/rec-idx-tree/tsuite-rec-idx-tree.c
b/torture/rec-idx-tree/tsuite-rec-idx-tree.c
index 0ea6d45..6328836 100644
--- a/torture/rec-idx-tree/tsuite-rec-idx-tree.c
+++ b/torture/rec-idx-tree/tsuite-rec-idx-tree.c
@@ -28,6 +28,7 @@
extern TCase *test_rec_idx_tree_new (void);
extern TCase *test_rec_idx_tree_scan (void);
+extern TCase *test_rec_idx_tree_build (void);
Suite *
tsuite_rec_idx_tree ()
@@ -37,6 +38,7 @@ tsuite_rec_idx_tree ()
s = suite_create ("rec-idx-tree");
suite_add_tcase (s, test_rec_idx_tree_new ());
suite_add_tcase (s, test_rec_idx_tree_scan ());
+ suite_add_tcase (s, test_rec_idx_tree_build ());
return s;
}
--
1.7.11.4
- [bug-recutils] Index trees, Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 01/13] src,torture: imple ment abstract record iterators., Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 02/13] src,torture: imple ment initial index tree support without nodes., Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 06/13] utils,doc: make re cfix --build-index add index trees., Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 05/13] src: fix index file builder., Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 04/13] src,torture: imple ment index builder.,
Michał Masłowski <=
- [bug-recutils] [PATCH 12/13] src: implement a tri vial query planner., Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 09/13] src,torture: suppo rt duplicating index tree objects., Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 10/13] src: keep indexes in rsets., Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 11/13] src: support index t rees that point to a leaf too left of the key searched f or., Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 08/13] utils: add recfix -- add-index command., Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 13/13] torture: use index t rees for performance tests., Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 07/13] src: fix index range checks., Michał Masłowski, 2012/08/20
- [bug-recutils] [PATCH 03/13] src,torture: imple ment index tree scans., Michał Masłowski, 2012/08/20