bug-recutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[bug-recutils] [PATCH 04/13] src,torture: imple ment index builder.


From: Michał Masłowski
Subject: [bug-recutils] [PATCH 04/13] src,torture: imple ment index builder.
Date: Mon, 20 Aug 2012 18:21:25 +0200

---
 ChangeLog                                  |  34 ++
 src/rec-idx-tree.c                         | 833 +++++++++++++++++++++++++++++
 src/rec.h                                  |  49 ++
 torture/Makefile.am                        |   1 +
 torture/rec-idx-tree/rec-idx-tree-build.c  | 168 ++++++
 torture/rec-idx-tree/tsuite-rec-idx-tree.c |   2 +
 6 files changed, 1087 insertions(+)
 create mode 100644 torture/rec-idx-tree/rec-idx-tree-build.c

diff --git a/ChangeLog b/ChangeLog
index 93012af..a7a45b5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,37 @@
+2012-08-15  Michał Masłowski  <address@hidden>
+
+       src,torture: implement index builder.
+
+       * src/rec-idx-tree.c (REC_IDX_TREE_MAX_FIELDS): New constant.
+       (REC_IDX_TREE_MAX_NODE_SIZE): Likewise.
+       (REC_IDX_TREE_MAX_DEPTH): Likewise.
+       (struct rec_idx_tree_node_s): New structure.
+       (struct rec_idx_tree_entry_s): Likewise.
+       (rec_idx_tree_build): New function.
+       (rec_idx_tree_entry_dispose): Likewise.
+       (rec_idx_tree_entry_compare): Likewise.
+       (rec_idx_tree_add_entries): Likewise.
+       (rec_idx_tree_get_entry_size): Likewise.
+       (rec_idx_tree_split_entries_into_nodes): Likewise.
+       (rec_idx_tree_split_nodes_into_nodes): Likewise.
+       (rec_idx_tree_node_too_big): Likewise.
+       (rec_idx_tree_node_new): Likewise.
+       (rec_idx_tree_node_dispose): Likewise.
+       (rec_idx_tree_add_node_entry): Likewise.
+       (rec_idx_tree_add_node_node): Likewise.
+       (rec_idx_tree_header_and_type_size): Likewise.
+       (rec_idx_tree_write_header): Likewise.
+       (rec_idx_tree_write_layer): Likewise.
+       (rec_idx_tree_write_key): Likewise.
+       * src/rec.h: Add prototype for rec_idx_tree_build.
+
+       * torture/Makefile.am (REC_IDX_TREE_TSUITE): Add
+       rec-idx-tree-build.c.
+       * torture/rec-idx-tree/rec-idx-tree-build.c: New file.
+       * torture/rec-idx-tree/tsuite-rec-idx-tree.c: Add
+       test_rec_idx_tree_build.
+
+
 2012-08-12  Michał Masłowski  <address@hidden>
 
        src,torture: implement index tree scans.
diff --git a/src/rec-idx-tree.c b/src/rec-idx-tree.c
index f106082..da45b95 100644
--- a/src/rec-idx-tree.c
+++ b/src/rec-idx-tree.c
@@ -25,9 +25,12 @@
 
 #include <config.h>
 
+#include <assert.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
+#include <gl_array_list.h>
+#include <gl_list.h>
 
 #include <rec.h>
 
@@ -115,6 +118,58 @@ struct rec_idx_tree_header_s
 
 #define REC_IDX_TREE_NONFIRST_MASK INT64_C(0x8000000000000000)
 
+/* Maximum number of fields in a key for building.  For reading it's
+   always UINT8_MAX due to the index format, no higher value is
+   supported for building.  */
+
+#define REC_IDX_TREE_MAX_FIELDS 4
+
+/* Maximum size of an index node having more than two entries, in
+   octets.  */
+
+#define REC_IDX_TREE_MAX_NODE_SIZE 4096
+
+/* Maximum depth of the index tree for build, zero means a single leaf
+   node.  A tree of depth n must have at least 2**n entries, so the
+   limit of UINT8_MAX from the index format will never be
+   exceeded.  */
+
+#define REC_IDX_TREE_MAX_DEPTH 64
+
+/* A node for building.  The type of the elements in entries depend on
+   the level.  Unlike in the index data format, these objects contain
+   inner node entry data in the node it would point to.  */
+
+struct rec_idx_tree_node_s
+{
+  size_t n_first;  /* number of first nodes in the subtree */
+  void **key;  /* smallest key in the node */
+  uint64_t *offset;  /* location of the offset field in the parent node */
+  gl_list_t entries;  /* list of struct rec_idx_tree_node_s for inner
+                         nodes or struct rec_idx_tree_entry_s for leaf
+                         nodes */
+  size_t size;  /* number of octets taken by the node in index data */
+  size_t upper_size;  /* number of ctets taken by the node key in an
+                         upper layer node */
+};
+
+/* Object representing a leaf entry for tree building.  */
+
+struct rec_idx_tree_entry_s
+{
+  /* general data for the whole index */
+  size_t n_fields;  /* number of fields in the key */
+  const char **names;  /* names of the fields, this is not a copy */
+  const rec_idx_tree_key_t *types;  /* types of the fields */
+  /* entry data */
+  void **key;  /* key from which the entry is mapped in the format of
+                  a rec_idx_tree_scan argument */
+  uint64_t *offset;  /* location of the offset field in the parent node */
+  uint64_t line;  /* record line as in a node */
+  uint64_t character;  /* record character as in a node */
+  size_t size;  /* number of octets taken by the node in index data */
+};
+
 /* Static functions defined in this file.  */
 
 static rec_iter_t rec_idx_tree_scan_common (rec_idx_tree_t idx_tree, const 
void **left, const void **right, size_t left_ind, size_t right_ind, bool 
first_only);
@@ -123,6 +178,27 @@ static int rec_idx_tree_skip_and_compare_key 
(rec_idx_tree_t tree, const uint8_t
 static bool rec_idx_tree_iter_next (void *data, rec_record_t *record);
 static void rec_idx_tree_iter_dispose (void *data);
 
+static void rec_idx_tree_entry_dispose (const void *data);
+static int rec_idx_tree_entry_compare (const void *elt1, const void *elt2);
+
+static bool rec_idx_tree_add_entries (gl_list_t entries, struct 
rec_idx_tree_entry_s *template, rec_record_t record, void **prefix, size_t 
prefix_length, bool first);
+static size_t rec_idx_tree_get_entry_size (struct rec_idx_tree_entry_s *entry);
+
+static gl_list_t rec_idx_tree_split_entries_into_nodes (gl_list_t entries, 
size_t *index_size);
+static gl_list_t rec_idx_tree_split_nodes_into_nodes (gl_list_t entries, 
size_t *index_size);
+
+static bool rec_idx_tree_node_too_big (struct rec_idx_tree_node_s *node, 
size_t to_add);
+static struct rec_idx_tree_node_s *rec_idx_tree_node_new (void);
+static void rec_idx_tree_node_dispose (const void *data);
+static bool rec_idx_tree_add_node_entry (struct rec_idx_tree_node_s *node, 
struct rec_idx_tree_entry_s *entry);
+static bool rec_idx_tree_add_node_node (struct rec_idx_tree_node_s *node, 
struct rec_idx_tree_node_s *entry);
+
+static size_t rec_idx_tree_header_and_type_size (size_t n_fields, const char 
**field_names);
+
+static uint8_t *rec_idx_tree_write_header (uint8_t *bytes, size_t rset_number, 
size_t depth, size_t n_fields, const char **field_names, const 
rec_idx_tree_key_t *field_types);
+static uint8_t *rec_idx_tree_write_layer (uint8_t *buffer, uint8_t* 
current_byte, gl_list_t nodes, bool leaf, size_t n_fields, const 
rec_idx_tree_key_t *types);
+static uint8_t *rec_idx_tree_write_key (uint8_t *buffer, uint8_t 
*current_byte, size_t n_fields, const rec_idx_tree_key_t *types, const void 
**key);
+
 /*
  * Public functions.
  */
@@ -276,6 +352,138 @@ rec_idx_tree_scan_interval (rec_idx_tree_t idx_tree, 
size_t left, size_t right)
   return rec_idx_tree_scan_common (idx_tree, NULL, NULL, left, right, true);
 }
 
+bool
+rec_idx_tree_build (rec_rset_t rset,
+                    size_t rset_number,
+                    size_t n_fields,
+                    const char **field_names,
+                    const rec_idx_tree_key_t *field_types,
+                    uint8_t **byte_array,
+                    size_t *size,
+                    int64_t *type)
+{
+  gl_list_t entries;  /* A list of struct rec_idx_tree_entry_s objects. */
+  rec_mset_iterator_t iter;
+  rec_record_t record;
+  /* Common data for entries.  */
+  struct rec_idx_tree_entry_s template;
+  /* A prefix of keys for backtracking.  */
+  void *key_prefix[REC_IDX_TREE_MAX_FIELDS];
+  /* A list of nodes on a tree layer. */
+  gl_list_t layers[REC_IDX_TREE_MAX_DEPTH + 1];
+  size_t index_size, header_size;
+  uint8_t *bytes, *current_byte;
+  int current_layer;
+
+  /* Check arguments for index format-specific limits. */
+  if ((rset_number > UINT8_MAX)
+      || (n_fields == 0) || (n_fields > REC_IDX_TREE_MAX_FIELDS)
+      || (rec_rset_num_records (rset) == 0))
+    {
+      return false;
+    }
+
+  /* Allocate the list of entries. */
+  entries = gl_list_nx_create_empty (GL_ARRAY_LIST, NULL,
+                                     NULL, rec_idx_tree_entry_dispose, true);
+  if (!entries)
+    {
+      return false;
+    }
+
+  /* Initialize the entry template. */
+  template.n_fields = n_fields;
+  template.names = field_names;
+  template.types = field_types;
+
+  /* Measure index file header and key type size.  */
+  header_size = rec_idx_tree_header_and_type_size (n_fields, field_names);
+  index_size = header_size;
+
+  /* Make entries for records and add them to the list. */
+  iter = rec_mset_iterator (rec_rset_mset (rset));
+  while (rec_mset_iterator_next (&iter, MSET_RECORD, (const void **) &record,
+                                 NULL))
+    {
+      if (!rec_idx_tree_add_entries (entries, &template,
+                                     record, key_prefix, 0, true))
+        {
+          /* Not enough memory. */
+          gl_list_free (entries);
+          return false;
+        }
+    }
+
+  /* Split the list into leaf nodes.  */
+  layers[0] = rec_idx_tree_split_entries_into_nodes (entries, &index_size);
+  if (!layers[0])
+    {
+      gl_list_free (entries);
+      return false;
+    }
+
+  /* Make layers of inner nodes if needed.  */
+  current_layer = 0;
+  while (gl_list_size (layers[current_layer]) > 1)
+    {
+      layers[current_layer + 1] =
+        rec_idx_tree_split_nodes_into_nodes (layers[current_layer],
+                                             &index_size);
+      current_layer++;
+      if (layers[current_layer] == NULL)
+        {
+          /* Not enough memory. */
+          gl_list_free (entries);
+          while (current_layer > 0)
+            {
+              gl_list_free (layers[--current_layer]);
+            }
+          return false;
+        }
+    }
+
+  /* Allocate memory for the tree. */
+  bytes = malloc (index_size);
+  if (!bytes)
+    {
+      gl_list_free (entries);
+      do
+        {
+          gl_list_free (layers[current_layer--]);
+        }
+      while (current_layer > 0);
+      return NULL;
+    }
+
+  /* Write the tree header and key type. */
+  current_byte = rec_idx_tree_write_header (bytes,
+                                            rset_number,
+                                            current_layer,
+                                            n_fields,
+                                            field_names,
+                                            field_types);
+  assert ((size_t) (current_byte - bytes) == header_size);
+
+  /* Write the nodes.  Start at the root, process each layer and
+     descend in the BFS order.  */
+  while (current_layer >= 0)
+    {
+      current_byte = rec_idx_tree_write_layer (bytes,
+                                               current_byte,
+                                               layers[current_layer],
+                                               current_layer == 0,
+                                               n_fields,
+                                               field_types);
+      current_layer--;
+    }
+
+  /* Set output arguments. */
+  *byte_array = bytes;
+  *size = index_size;
+  *type = REC_IDX_TREE_TYPE;
+  return true;
+}
+
 /*
  * Private functions.
  */
@@ -665,4 +873,629 @@ rec_idx_tree_iter_dispose (void *data)
   free (iter);
 }
 
+/* Dispose a struct rec_idx_tree_entry_s object. */
+
+static void
+rec_idx_tree_entry_dispose (const void *data)
+{
+  size_t n;
+  struct rec_idx_tree_entry_s *entry = (struct rec_idx_tree_entry_s *) data;
+
+  for (n = 0; n < entry->n_fields; n++)
+    {
+      free (entry->key[n]);
+    }
+
+  free (entry);
+}
+
+/* Compare struct rec_idx_tree_entry_s objects.  */
+
+static int
+rec_idx_tree_entry_compare (const void *elt1, const void *elt2)
+{
+  size_t n;
+  int comparison;
+  struct rec_idx_tree_entry_s *entry1 = (struct rec_idx_tree_entry_s *) elt1,
+    *entry2 = (struct rec_idx_tree_entry_s *) elt2;
+
+  /* Compare by key. */
+  for (n = 0; n < entry1->n_fields; n++)
+    {
+      /* NULLs are before other values. */
+      if (entry1->key[n] == NULL)
+        {
+          if (entry2->key[n] == NULL)
+            {
+              continue;
+            }
+          else
+            {
+              return -1;
+            }
+        }
+      else if (entry2->key[n] == NULL)
+        {
+          return 1;
+        }
+
+      /* Compare if both entries have keys. */
+      switch (entry1->types[n])
+        {
+        case REC_IDX_KEY_NONE:
+          break;
+        case REC_IDX_KEY_STRING:
+          comparison = strcmp ((const char *) entry1->key[n],
+                               (const char *) entry2->key[n]);
+          if (comparison != 0)
+            {
+              return comparison;
+            }
+          break;
+        }
+    }
+
+  /* If equal, compare by record position. */
+  if (entry1->character != entry2->character)
+    {
+      if (entry1->character < entry2->character)
+        {
+          return -1;
+        }
+      else
+        {
+          return 1;
+        }
+    }
+
+  return 0;
+}
+
+/* Add all entries for given record to the list.  The template
+   argument has non-entry-specific fields that will be used for all
+   entries.  The prefix array is used for key generation and stores
+   its prefix of length prefix_length on a recursive call.  The
+   argument first is 'true' if this prefix is used by the first entry
+   that would be used for sorting.  Returns 'false' on error, maybe
+   having added some entries to the list.  */
+
+static bool
+rec_idx_tree_add_entries (gl_list_t entries,
+                          struct rec_idx_tree_entry_s *template,
+                          rec_record_t record,
+                          void **prefix,
+                          size_t prefix_length,
+                          bool first)
+{
+  struct rec_idx_tree_entry_s *entry;
+  rec_field_t field;
+  size_t n, num_fields_here, char_location;
+
+  if (prefix_length == template->n_fields)
+    {
+      /* We have a complete key in prefix, let's make an entry. */
+      entry = malloc (sizeof (*entry));
+      if (!entry)
+        {
+          return false;
+        }
+
+      memcpy (entry, template, sizeof (*entry));
+
+      entry->key = malloc (sizeof (void *) * template->n_fields);
+      if (!(entry->key))
+        {
+          free (entry);
+          return false;
+        }
+
+      memcpy (entry->key, prefix, template->n_fields * sizeof (entry->key[0]));
+      entry->line = (uint64_t) rec_record_location (record);
+      char_location = (uint64_t) rec_record_char_location (record);
+      entry->character = (char_location > 0) ? (char_location - 1) : 0;
+      entry->size = rec_idx_tree_get_entry_size (entry);
+      if (!first)
+        {
+          entry->line |= REC_IDX_TREE_NONFIRST_MASK;
+        }
+
+      /* Add the entry. */
+      if (!gl_sortedlist_nx_add (entries, rec_idx_tree_entry_compare, entry))
+        {
+          return false;
+        }
+    }
+  else
+    {
+      /* Recurse with all possible values for the next field. */
+      num_fields_here
+        = rec_record_get_num_fields_by_name (record,
+                                             template->names[prefix_length]);
+      if (num_fields_here == 0)
+        {
+          /* No values, leave NULL.  When NULL is smaller than any
+             other key value, this emulates the behaviour of rset
+             sorting.  */
+          prefix[prefix_length] = NULL;
+          return rec_idx_tree_add_entries (entries, template, record,
+                                           prefix, prefix_length + 1, first);
+        }
+      else
+        {
+          for (n = 0; n < num_fields_here; n++)
+            {
+              field =
+                rec_record_get_field_by_name (record,
+                                              template->names[prefix_length],
+                                              n);
+              prefix[prefix_length] = rec_field_value (field);
+              if (!rec_idx_tree_add_entries (entries, template, record,
+                                             prefix, prefix_length + 1,
+                                             first && (n == 0)))
+                {
+                  return false;
+                }
+            }
+        }
+    }
+  return true;
+}
+
+/* Return the size of an entry in a leaf node binary format.  */
+
+static size_t
+rec_idx_tree_get_entry_size (struct rec_idx_tree_entry_s *entry)
+{
+  size_t res = sizeof (uint64_t) * 2;
+  size_t n;
+
+  for (n = 0; n < entry->n_fields; n++)
+    {
+      switch (entry->types[n])
+        {
+        case REC_IDX_KEY_NONE:
+          break;
+        case REC_IDX_KEY_STRING:
+          if (entry->key[n] == NULL)
+            {
+              res += 1;  /* NULL strings are represented as empty strings */
+            }
+          else
+            {
+              res += strlen (entry->key[n]) + 1;
+            }
+        }
+    }
+
+  if (res % 8 != 0)
+    {
+      res += 8 - (res % 8);
+    }
+
+  return res;
+}
+
+/* Make a list of nodes from a list of leaf entries.  The *index_size
+   variable will be increased by the number of octets taken by the
+   nodes of the returned list.  On error NULL is returned and
+   *index_size is undefined.  */
+
+static gl_list_t
+rec_idx_tree_split_entries_into_nodes (gl_list_t entries,
+                                       size_t *index_size)
+{
+  gl_list_t res;
+  struct rec_idx_tree_entry_s *entry;
+  struct rec_idx_tree_node_s *last_node = NULL;
+  gl_list_iterator_t iter;
+
+  /* We use a simple greedy algorithm: add entries to a node until
+     there is no more entries or the node already has two elements and
+     would get larger than REC_IDX_TREE_MAX_NODE_SIZE.  This should
+     make reading a node require at most one disk access and a query
+     for a single element require reading not more than O(log n) nodes
+     if there are n leaf entries in the tree.  */
+
+  res = gl_list_nx_create_empty (GL_ARRAY_LIST, NULL, NULL,
+                                 rec_idx_tree_node_dispose, true);
+  if (!res)
+    {
+      return NULL;
+    }
+
+  for (iter = gl_list_iterator (entries);
+       gl_list_iterator_next (&iter, (const void **) &entry, NULL);)
+    {
+      /* Go to the next node if there is no space left in the current
+         one. */
+      if ((last_node == NULL)
+          || rec_idx_tree_node_too_big (last_node, entry->size))
+        {
+          if (last_node)
+            {
+              *index_size += last_node->size;
+            }
+
+          last_node = rec_idx_tree_node_new ();
+          if (!last_node)
+            {
+              gl_list_free (res);
+              return NULL;
+            }
+
+          last_node->size += sizeof (uint64_t);  /* next pointer */
+
+          if (!gl_list_nx_add_last (res, last_node))
+            {
+              rec_idx_tree_node_dispose (last_node);
+              gl_list_free (res);
+              return NULL;
+            }
+        }
+
+      /* Add the entry. */
+      if (!rec_idx_tree_add_node_entry (last_node, entry))
+        {
+          gl_list_free (res);
+          return NULL;
+        }
+    }
+
+  *index_size += last_node->size;
+  return res;
+}
+
+/* Just like rec_idx_tree_split_entries_into_nodes for making inner
+   nodes.  Nodes are used as entries.  */
+static gl_list_t
+rec_idx_tree_split_nodes_into_nodes (gl_list_t entries,
+                                     size_t *index_size)
+{
+  gl_list_t res;
+  struct rec_idx_tree_node_s *entry, *last_node = NULL;
+  gl_list_iterator_t iter;
+
+  res = gl_list_nx_create_empty (GL_ARRAY_LIST, NULL, NULL,
+                                 rec_idx_tree_node_dispose, true);
+  if (!res)
+    {
+      return NULL;
+    }
+
+  for (iter = gl_list_iterator (entries);
+       gl_list_iterator_next (&iter, (const void **) &entry, NULL);)
+    {
+      /* Go to the next node if there is no space left in the current
+         one. */
+      if ((last_node == NULL)
+          || rec_idx_tree_node_too_big (last_node, entry->upper_size))
+        {
+          if (last_node)
+            {
+              *index_size += last_node->size;
+            }
+
+          last_node = rec_idx_tree_node_new ();
+          if (!last_node)
+            {
+              gl_list_free (res);
+              return NULL;
+            }
+
+          if (!gl_list_nx_add_last (res, last_node))
+            {
+              rec_idx_tree_node_dispose (last_node);
+              gl_list_free (res);
+              return NULL;
+            }
+        }
+
+      /* Add the entry. */
+      if (!rec_idx_tree_add_node_node (last_node, entry))
+        {
+          gl_list_free (res);
+          return NULL;
+        }
+    }
+
+  *index_size += last_node->size;
+  return res;
+}
+
+/* Return 'true' if the node should not have an entry of size to_add
+   added.  */
+
+static bool
+rec_idx_tree_node_too_big (struct rec_idx_tree_node_s *node, size_t to_add)
+{
+  if (gl_list_size (node->entries) < 2)
+    {
+      return false;
+    }
+
+  return node->size + to_add > REC_IDX_TREE_MAX_NODE_SIZE;
+}
+
+static struct rec_idx_tree_node_s *
+rec_idx_tree_node_new (void)
+{
+  struct rec_idx_tree_node_s *res;
+
+  res = malloc (sizeof (*res));
+  if (!res)
+    {
+      return NULL;
+    }
+
+  memset (res, 0, sizeof (*res));  /* zero most fields */
+  res->entries = gl_list_nx_create_empty (GL_ARRAY_LIST, NULL, NULL,
+                                          rec_idx_tree_node_dispose, true);
+  if (!res->entries)
+    {
+      return NULL;
+    }
+
+  res->size = sizeof (uint64_t);  /* for the number of entries */
+
+  return res;
+}
+
+static void
+rec_idx_tree_node_dispose (const void *data)
+{
+  struct rec_idx_tree_node_s *node = (struct rec_idx_tree_node_s *) data;
+
+  gl_list_free (node->entries);
+  free (node);
+}
+
+/* Add an entry to a node. */
+static bool
+rec_idx_tree_add_node_entry (struct rec_idx_tree_node_s *node,
+                             struct rec_idx_tree_entry_s *entry)
+{
+  if (!gl_list_nx_add_last (node->entries, entry))
+    {
+      return false;
+    }
+
+  if (!(entry->line & REC_IDX_TREE_NONFIRST_MASK))
+    {
+      node->n_first++;
+    }
+
+  node->size += entry->size;
+
+  if (gl_list_size (node->entries) == 1)
+    {
+      node->key = entry->key;
+      node->upper_size = entry->size;
+    }
+
+  return node;
+}
+
+/* Add a subtree node to an inner node. */
+static bool
+rec_idx_tree_add_node_node (struct rec_idx_tree_node_s *node,
+                            struct rec_idx_tree_node_s *entry)
+{
+  if (!gl_list_nx_add_last (node->entries, entry))
+    {
+      return false;
+    }
+
+  node->n_first += entry->n_first;
+
+  if (gl_list_size (node->entries) == 1)
+    {
+      node->key = entry->key;
+      node->upper_size = entry->upper_size;
+      node->size += 2 * sizeof (uint64_t);  /* no key stored */
+    }
+  else
+    {
+      node->size += entry->upper_size;
+    }
+  return node;
+}
+
+/* Return the size of index data without nodes. */
+
+static size_t
+rec_idx_tree_header_and_type_size (size_t n_fields,
+                                   const char **field_names)
+{
+  size_t size, n;
+
+  /* The header. */
+  size = sizeof (struct rec_idx_tree_header_s);
+
+  /* Key type elements. */
+  for (n = 0; n < n_fields; n++)
+    {
+      /* type, name and '\0' */
+      size += 2 + strlen (field_names[n]);
+    }
+
+  /* Padding. */
+  if (size % 8 != 0)
+    {
+      size += 8 - (size % 8);
+    }
+
+  return size;
+}
+
+/* Write the header and key type at bytes, return the pointer to the
+   next byte after padding. */
+
+static uint8_t *
+rec_idx_tree_write_header (uint8_t *bytes,
+                           size_t rset_number,
+                           size_t depth,
+                           size_t n_fields,
+                           const char **field_names,
+                           const rec_idx_tree_key_t *field_types)
+{
+  size_t size, n, padding;
+  struct rec_idx_tree_header_s *header;
+  uint8_t *next_byte;
+
+  /* The header. */
+  header = (struct rec_idx_tree_header_s *) bytes;
+  header->n_rsets = rset_number;
+  header->depth = depth;
+  header->n_fields = n_fields;
+  next_byte = bytes + sizeof (*header);
+
+  /* Key type elements. */
+  for (n = 0; n < n_fields; n++)
+    {
+      *(next_byte++) = (uint8_t) field_types[n];
+      size = strlen (field_names[n]) + 1;
+      next_byte = memcpy (next_byte, field_names[n], size) + size;
+    }
+
+  /* Padding. */
+  if ((next_byte - bytes) % 8 != 0)
+    {
+      padding = 8 - ((next_byte - bytes) % 8);
+      next_byte = memset (next_byte, 0, padding) + padding;
+    }
+
+  return next_byte;
+}
+
+/* Write a layer of nodes (leaf or non-leaf depending on the fourth
+   parameter).  The first parameter is the start of index data, the
+   second one is the start of the current layer there.  Returns a
+   pointer to the next byte after the nodes.  */
+static uint8_t *
+rec_idx_tree_write_layer (uint8_t* buffer,
+                          uint8_t* current_byte,
+                          gl_list_t nodes,
+                          bool leaf,
+                          size_t n_fields,
+                          const rec_idx_tree_key_t *types)
+{
+  gl_list_iterator_t layer_iter, node_iter;
+  struct rec_idx_tree_node_s *node, *node_entry;
+  struct rec_idx_tree_entry_s *entry;
+  void *element;
+  uint64_t *next_word;
+  bool first, first_in_node;
+  uint8_t *start_of_node;
+
+  next_word = (uint64_t *) current_byte;
+  first = true;
+  for (layer_iter = gl_list_iterator (nodes);
+       gl_list_iterator_next (&layer_iter, (const void **) &node, NULL);)
+    {
+      first_in_node = true;
+      start_of_node = (uint8_t *) next_word;
+      if (node->offset != NULL)
+        {
+          /* Set the offset of this node in the parent. */
+          *(node->offset) = (uint64_t) (current_byte - buffer);
+        }
+
+      if (leaf && !first)
+        {
+          /* Offset of this leaf.  */
+          *(next_word) = (uint8_t *) (next_word + 1) - buffer;
+          next_word++;
+        }
+
+      /* Write the node length. */
+      *(next_word++) = (uint64_t) gl_list_size (node->entries);
+
+      /* Write node entries. */
+      for (node_iter = gl_list_iterator (node->entries);
+           gl_list_iterator_next (&node_iter, (const void **) &element, NULL);)
+        {
+          if (leaf)
+            {
+              entry = (struct rec_idx_tree_entry_s *) element;
+              *(next_word++) = entry->line;
+              *(next_word++) = entry->character;
+              next_word = (uint64_t *)
+                rec_idx_tree_write_key (buffer, (uint8_t *) next_word,
+                                        n_fields, types,
+                                        (const void **) entry->key);
+            }
+          else
+            {
+              node_entry = (struct rec_idx_tree_node_s *) element;
+              if (!first_in_node)
+                {
+                  next_word = (uint64_t *)
+                    rec_idx_tree_write_key (buffer, (uint8_t *) next_word,
+                                            n_fields, types,
+                                            (const void **) node_entry->key);
+                }
+              else
+                {
+                  first_in_node = false;
+                }
+
+              node_entry->offset = next_word++;  /* will be set in
+                                                    next layer */
+              *(next_word++) = node_entry->n_first;
+            }
+        }
+
+      assert ((size_t) (((uint8_t *) next_word) - start_of_node)
+              == node->size - ((first && leaf) ? 8 : 0));
+      first = false;
+    }
+
+  if (leaf)
+    {
+      /* No next node. */
+      *(next_word++) = 0;
+    }
+
+  return (uint8_t *) next_word;
+}
+
+/* Write a key and return the next byte position after padding.  */
+static uint8_t *
+rec_idx_tree_write_key (uint8_t *buffer,
+                        uint8_t *current_byte,
+                        size_t n_fields,
+                        const rec_idx_tree_key_t *types,
+                        const void **key)
+{
+  size_t n, length, padding;
+
+  for (n = 0; n < n_fields; n++)
+    {
+      switch (types[n])
+        {
+        case REC_IDX_KEY_NONE:
+          break;
+        case REC_IDX_KEY_STRING:
+          if (key[n] != NULL)
+            {
+              length = strlen ((const char *) key[n]) + 1;
+              current_byte = memcpy (current_byte, key[n], length) + length;
+            }
+          else
+            {
+              *(current_byte++) = '\0';
+            }
+          break;
+        }
+    }
+
+  if ((current_byte - buffer) % 8 != 0)
+    {
+      padding = 8 - (current_byte - buffer) % 8;
+      current_byte = memset (current_byte, 0, padding) + padding;
+    }
+
+  return current_byte;
+}
+
 /* End of rec-idx-tree.c */
diff --git a/src/rec.h b/src/rec.h
index 548e439..0aaad13 100644
--- a/src/rec.h
+++ b/src/rec.h
@@ -2532,6 +2532,55 @@ rec_iter_t rec_idx_tree_scan (rec_idx_tree_t idx_tree, 
const void **left, const
 
 rec_iter_t rec_idx_tree_scan_interval (rec_idx_tree_t idx_tree, size_t left, 
size_t right);
 
+/**************** Building index trees *******************************/
+
+/* Build an index tree for a record set.
+
+   The following arguments are used for input:
+
+   RSET
+
+      The record set from which records will be taken and indexed.
+      Must have at least one record.
+
+   RSET_NUMBER
+
+      The number of preceding record sets in the recfile, as would be
+      returned by rec_idx_tree_get_rset_number for the output index
+      data.
+
+   N_FIELDS
+
+      The number of fields in the index key type.
+
+   FIELD_NAMES
+
+      The names of fields in the index key type.
+
+   FIELD_TYPES
+
+      The types of fields in the index key type.
+
+   The return value is 'true' on success, 'false' on error.  If no
+   error occurs these arguments will be set to the following values:
+
+   BYTE_ARRAY
+
+      Start of the allocated index data, the caller should call free
+      on it after use.
+
+   SIZE
+
+      The number of octets in BYTE_ARRAY.
+
+   TYPE
+
+      The type in an index file for the index built.
+
+*/
+
+bool rec_idx_tree_build (rec_rset_t rset, size_t rset_number, size_t n_fields, 
const char **field_names, const rec_idx_tree_key_t *field_types, uint8_t 
**byte_array, size_t *size, int64_t *type);
+
 #endif /* !GNU_REC_H */
 
 /* End of rec.h */
diff --git a/torture/Makefile.am b/torture/Makefile.am
index 38d9e6d..dc01501 100644
--- a/torture/Makefile.am
+++ b/torture/Makefile.am
@@ -149,6 +149,7 @@ REC_ITER_TSUITE = rec-iter/rec-iter-mset.c \
 
 REC_IDX_TREE_TSUITE = rec-idx-tree/rec-idx-tree-new.c \
                       rec-idx-tree/rec-idx-tree-scan.c \
+                      rec-idx-tree/rec-idx-tree-build.c \
                       rec-idx-tree/tsuite-rec-idx-tree.c
 
 runtests_SOURCES = runtests.c \
diff --git a/torture/rec-idx-tree/rec-idx-tree-build.c 
b/torture/rec-idx-tree/rec-idx-tree-build.c
new file mode 100644
index 0000000..fb34e01
--- /dev/null
+++ b/torture/rec-idx-tree/rec-idx-tree-build.c
@@ -0,0 +1,168 @@
+/* -*- mode: C -*-
+ *
+ *       File:         rec-idx-tree-build.c
+ *       Date:         Wed Aug 15 15:16:05 2012
+ *
+ *       GNU recutils - rec_idx_tree_build unit tests.
+ *
+ */
+
+/* Copyright (C) 2012 Michał Masłowski */
+
+/* This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <check.h>
+
+#include <rec.h>
+
+/* Data for indexes. */
+static const char recfile[] = "a: 4\n\na: 3\n\na: 1\n\na: 2\nb: 1\n\na: 
5\nb:2";
+static const char *names[] = {"b", "a"};
+static const rec_idx_tree_key_t types[] = {REC_IDX_KEY_STRING,
+                                           REC_IDX_KEY_STRING};
+
+/*-
+ * Test: rec_idx_tree_build_verify_header
+ * Unit: rec_idx_tree_build
+ * Description:
+ * + Build an index and check the header.
+ */
+START_TEST(rec_idx_tree_build_verify_header)
+{
+  rec_idx_tree_t tree;
+  rec_rset_t rset;
+  rec_parser_t parser;
+  rec_iter_t iter;
+  rec_record_t record;
+  uint8_t *buffer = NULL;
+  size_t size = 0;
+  uint64_t type = 0;
+
+  /* Prepare the rset. */
+  parser = rec_parser_new_str (recfile, "dummy");
+  fail_if (!parser);
+  fail_if (!rec_parse_rset (parser, &rset));
+
+  /* Build the index and check obvious result problems. */
+  fail_if (!rec_idx_tree_build (rset, 1, 2, names, types,
+                                &buffer, &size, &type));
+  fail_if (buffer == NULL);
+  fail_if (size < 16);
+  fail_if (type == 0);
+
+  /* Close the rset, reset parser. */
+  rec_rset_destroy (rset);
+  rec_parser_destroy (parser);
+
+  /* Open the index. */
+  tree = rec_idx_tree_new (buffer, type, size);
+
+  /* Verify the header. */
+  fail_if (rec_idx_tree_get_rset_number (tree) != 1);
+  fail_if (rec_idx_tree_get_num_fields (tree) != 2);
+  fail_if (strcmp (rec_idx_tree_get_field_name (tree, 0), "b"));
+  fail_if (strcmp (rec_idx_tree_get_field_name (tree, 1), "a"));
+  fail_if (rec_idx_tree_get_field_type (tree, 0) != REC_IDX_KEY_STRING);
+  fail_if (rec_idx_tree_get_field_type (tree, 1) != REC_IDX_KEY_STRING);
+
+  /* Close. */
+  rec_idx_tree_destroy (tree);
+}
+END_TEST
+
+/*-
+ * Test: rec_idx_tree_build_verify_elements
+ * Unit: rec_idx_tree_build
+ * Description:
+ * + Build an index and verify elements.
+ */
+START_TEST(rec_idx_tree_build_verify_elements)
+{
+  rec_idx_tree_t tree;
+  rec_rset_t rset;
+  rec_parser_t parser;
+  rec_iter_t iter;
+  rec_record_t record;
+  uint8_t *buffer = NULL;
+  size_t size = 0;
+  uint64_t type = 0;
+  static const char *left[] = {NULL, "1"};
+  static const char *right[] = {"2", "5"};
+
+  /* Prepare the rset. */
+  parser = rec_parser_new_str (recfile, "dummy");
+  fail_if (!parser);
+  fail_if (!rec_parse_rset (parser, &rset));
+
+  /* Build the index and check obvious result problems. */
+  fail_if (!rec_idx_tree_build (rset, 1, 2, names, types,
+                                &buffer, &size, &type));
+  fail_if (buffer == NULL);
+  fail_if (size < 16);
+  fail_if (type == 0);
+
+  /* Close the rset, reset parser. */
+  rec_rset_destroy (rset);
+  rec_parser_reset (parser);
+
+  /* Open the rset and index. */
+  fail_if (!rec_parse_rset_lazy (parser, &rset));
+  tree = rec_idx_tree_new (buffer, type, size);
+  rec_idx_tree_set_rset (tree, parser, rset);
+
+  /* Do a scan. */
+  iter = rec_idx_tree_scan (tree, (const void **) left, (const void **) right);
+  fail_if (!iter);
+
+  /* Read the records. */
+  fail_if (!rec_iter_next (iter, &record));
+  fail_if (!rec_record_contains_field (record, "a", "1"));
+  fail_if (!rec_iter_next (iter, &record));
+  fail_if (!rec_record_contains_field (record, "a", "3"));
+  fail_if (!rec_iter_next (iter, &record));
+  fail_if (!rec_record_contains_field (record, "a", "4"));
+  fail_if (!rec_iter_next (iter, &record));
+  fail_if (!rec_record_contains_field (record, "a", "2"));
+  fail_if (!rec_iter_next (iter, &record));
+  fail_if (!rec_record_contains_field (record, "a", "5"));
+  fail_if (rec_iter_next (iter, &record));
+
+  /* Close. */
+  rec_iter_destroy (iter);
+  rec_idx_tree_destroy (tree);
+  rec_rset_destroy (rset);
+  rec_parser_destroy (parser);
+}
+END_TEST
+
+/*
+ * Test creation function
+ */
+TCase *
+test_rec_idx_tree_build (void)
+{
+  TCase *tc = tcase_create ("rec_idx_tree_build");
+  tcase_add_test (tc, rec_idx_tree_build_verify_header);
+  tcase_add_test (tc, rec_idx_tree_build_verify_elements);
+
+  return tc;
+}
+
+/* End of rec-idx-tree-build.c */
diff --git a/torture/rec-idx-tree/tsuite-rec-idx-tree.c 
b/torture/rec-idx-tree/tsuite-rec-idx-tree.c
index 0ea6d45..6328836 100644
--- a/torture/rec-idx-tree/tsuite-rec-idx-tree.c
+++ b/torture/rec-idx-tree/tsuite-rec-idx-tree.c
@@ -28,6 +28,7 @@
 
 extern TCase *test_rec_idx_tree_new (void);
 extern TCase *test_rec_idx_tree_scan (void);
+extern TCase *test_rec_idx_tree_build (void);
 
 Suite *
 tsuite_rec_idx_tree ()
@@ -37,6 +38,7 @@ tsuite_rec_idx_tree ()
   s = suite_create ("rec-idx-tree");
   suite_add_tcase (s, test_rec_idx_tree_new ());
   suite_add_tcase (s, test_rec_idx_tree_scan ());
+  suite_add_tcase (s, test_rec_idx_tree_build ());
 
   return s;
 }
-- 
1.7.11.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]