You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@subversion.apache.org by st...@apache.org on 2013/06/29 23:39:03 UTC

svn commit: r1498029 - in /subversion/branches/fsfs-format7/subversion: include/private/svn_string_private.h libsvn_subr/prefix_string.c

Author: stefan2
Date: Sat Jun 29 21:39:02 2013
New Revision: 1498029

URL: http://svn.apache.org/r1498029
Log:
On the fsfs-format7 branch: Introduce prefix-tree-based strings as a new
private data type, including new API functions.

* include/private/svn_string_private.h
  (svn__prefix_string_t,
   svn__prefix_tree_t): declare new private API data types
  (svn__prefix_tree_create,
   svn__prefix_string_create,
   svn__prefix_string_expand,
   svn__prefix_string_compare): declare new private API functions

* libsvn_subr/prefix_string.c
  (): new file
  (svn__prefix_string_t,
   node_t,
   svn__prefix_tree_t): define new data types
  (is_leaf,
   auto_realloc_sub_nodes,
   search_lower_bound): new utility functions
  (svn__prefix_tree_create,
   svn__prefix_string_create,
   svn__prefix_string_expand,
   svn__prefix_string_compare): implement the new API functions

Added:
    subversion/branches/fsfs-format7/subversion/libsvn_subr/prefix_string.c
Modified:
    subversion/branches/fsfs-format7/subversion/include/private/svn_string_private.h

Modified: subversion/branches/fsfs-format7/subversion/include/private/svn_string_private.h
URL: http://svn.apache.org/viewvc/subversion/branches/fsfs-format7/subversion/include/private/svn_string_private.h?rev=1498029&r1=1498028&r2=1498029&view=diff
==============================================================================
--- subversion/branches/fsfs-format7/subversion/include/private/svn_string_private.h (original)
+++ subversion/branches/fsfs-format7/subversion/include/private/svn_string_private.h Sat Jun 29 21:39:02 2013
@@ -264,6 +264,58 @@ svn_cstring__reverse_match_length(const 
 
 /** @} */
 
+/** Prefix trees.
+ *
+ * Prefix trees allow for a space-efficient representation of a set of path-
+ * like strings, i.e. those that share common prefixes.  Any given string
+ * value will be stored only once, i.e. two strings stored in the same tree
+ * are equal if and only if the point to the same #svn__prefix_string_t.
+ *
+ * @defgroup svn_prefix_string Strings in prefix trees.
+* @{
+ */
+
+/**
+ * Opaque data type for prefix-tree-based strings.
+ */
+typedef struct svn__prefix_string_t svn__prefix_string_t;
+
+/**
+ * Opaque data type representing a prefix tree
+ */
+typedef struct svn__prefix_tree_t svn__prefix_tree_t;
+
+/**
+ * Return a new prefix tree allocated in @a pool.
+ */
+svn__prefix_tree_t *
+svn__prefix_tree_create(apr_pool_t *pool);
+
+/**
+ * Return a string with the value @a s stored in @a tree.  If no such string
+ * exists yet, add it automatically.
+ */
+svn__prefix_string_t *
+svn__prefix_string_create(svn__prefix_tree_t *tree,
+                          const char *s);
+
+/**
+ * Return the contents of @a s as a new string object allocated in @a pool.
+ */
+svn_string_t *
+svn__prefix_string_expand(const svn__prefix_string_t *s,
+                          apr_pool_t *pool);
+
+/**
+ * Compare the two strings @a lhs and @a rhs that must be part of the same
+ * tree.
+ */
+int
+svn__prefix_string_compare(const svn__prefix_string_t *lhs,
+                           const svn__prefix_string_t *rhs);
+
+/** @} */
+
 /** @} */
 
 

Added: subversion/branches/fsfs-format7/subversion/libsvn_subr/prefix_string.c
URL: http://svn.apache.org/viewvc/subversion/branches/fsfs-format7/subversion/libsvn_subr/prefix_string.c?rev=1498029&view=auto
==============================================================================
--- subversion/branches/fsfs-format7/subversion/libsvn_subr/prefix_string.c (added)
+++ subversion/branches/fsfs-format7/subversion/libsvn_subr/prefix_string.c Sat Jun 29 21:39:02 2013
@@ -0,0 +1,315 @@
+/* prefix_string.c --- implement strings based on a prefix tree
+ *
+ * ====================================================================
+ *    Licensed to the Apache Software Foundation (ASF) under one
+ *    or more contributor license agreements.  See the NOTICE file
+ *    distributed with this work for additional information
+ *    regarding copyright ownership.  The ASF licenses this file
+ *    to you under the Apache License, Version 2.0 (the
+ *    "License"); you may not use this file except in compliance
+ *    with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing,
+ *    software distributed under the License is distributed on an
+ *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *    KIND, either express or implied.  See the License for the
+ *    specific language governing permissions and limitations
+ *    under the License.
+ * ====================================================================
+ */
+
+#include <assert.h>
+#include "private/svn_string_private.h"
+
+/* A node in the tree represents a common prefix.  The root node is the
+ * empty prefix.  Nodes may have up to 256 sub-nodes, each starting with
+ * a different character (possibly '\0').
+ *
+ * The nodes in the tree store only up to 8 chars of the respective common
+ * prefix, i.e. longer common prefixes must be drawn out over multiple
+ * hierarchy levels.  This is a space <-> efficiency trade-off.
+ *
+ * Strings are the leaf nodes in the tree and use a specialized, smaller
+ * data structure.  They may add 0 to 7 extra chars to the prefix.  Both
+ * data types can be discerned by the last char in the data buffer.  This
+ * must be 0 for strings (leaves) and non-0 otherwise.  Please note that
+ * ordinary nodes have a length information so that no terminating 0 is
+ * required for them.
+ */
+
+/* forward declaration */
+typedef struct node_t node_t;
+
+/* String type and tree leaf.
+ */
+struct svn__prefix_string_t
+{
+  /* mandatory prefix */
+  node_t *prefix;
+
+  /* 0 ..7 chars to add the the prefix. NUL-terminated. */
+  char data[8];
+};
+
+/* A node inside the tree, i.e. not a string and not a leaf (unless this is
+ * the root node).
+ *
+ * Note: keep the ordering to minimize size / alignment overhead on 64 bit
+ * machines.
+ */
+struct node_t
+{
+  /* pointer to the parent prefix plus the 1 .. 8 extra chars.
+   * Only the root will provide 0 extra chars. */
+  svn__prefix_string_t key;
+
+  /* Length of the prefix from the root down to and including this one.
+   * 0 for the root node.  Only then will key.prefix be NULL. */
+  apr_uint32_t length;
+
+  /* Number of entries used in SUB_NODES. */ 
+  apr_uint32_t sub_node_count;
+
+  /* The sub-nodes, ordered by first char.  node_t and svn__prefix_string_t
+   * may be mixed here.  May be NULL.
+   * The number of allocated entries is always a power-of-two and only
+   * given implicitly by SUB_NODE_COUNT. */
+  struct node_t **sub_nodes;
+};
+
+/* The actual tree structure. */
+struct svn__prefix_tree_t
+{
+  /* the common tree root (represents the empty prefix). */
+  node_t *root;
+
+  /* all sub-nodes & strings will be allocated from this pool */
+  apr_pool_t *pool;
+};
+
+/* Return TRUE, iff NODE is a leaf node.
+ */
+static svn_boolean_t
+is_leaf(node_t *node)
+{
+  return node->key.data[7] == 0;
+}
+
+/* Ensure that the sub-nodes array of NODE within TREE has at least one
+ * unused entry.  Re-allocate as necessary.
+ */
+static void
+auto_realloc_sub_nodes(svn__prefix_tree_t *tree,
+                       node_t *node)
+{
+  if (node->sub_node_count & (node->sub_node_count - 1))
+    return;
+
+  if (node->sub_node_count == 0)
+    {
+      node->sub_nodes = apr_pcalloc(tree->pool, sizeof(*node->sub_nodes));
+    }
+  else
+    {
+      node_t **sub_nodes
+        = apr_pcalloc(tree->pool,
+                      2 * node->sub_node_count * sizeof(*sub_nodes));
+      memcpy(sub_nodes, node->sub_nodes,
+             node->sub_node_count * sizeof(*sub_nodes));
+      node->sub_nodes = sub_nodes;
+    }
+}
+
+/* Given the COUNT pointers in the SUB_NODES array, return the location at
+ * which KEY is either located or would be inserted.
+ */
+static int
+search_lower_bound(node_t **sub_nodes,
+                   unsigned char key,
+                   int count)
+{
+  int lower = 0;
+  int upper = count - 1;
+
+  /* Binary search for the lowest position at which to insert KEY. */
+  while (lower <= upper)
+    {
+      int current = lower + (upper - lower) / 2; 
+
+      if ((unsigned char)sub_nodes[current]->key.data[0] < key)
+        lower = current + 1;
+      else
+        upper = current - 1;
+    }
+
+  return lower;
+}
+
+svn__prefix_tree_t *
+svn__prefix_tree_create(apr_pool_t *pool)
+{
+  svn__prefix_tree_t *tree = apr_pcalloc(pool, sizeof(*tree));
+  tree->pool = pool;
+
+  tree->root = apr_pcalloc(pool, sizeof(*tree->root));
+  tree->root->key.data[7] = '\xff';
+
+  return tree;
+}
+
+svn__prefix_string_t *
+svn__prefix_string_create(svn__prefix_tree_t *tree,
+                          const char *s)
+{
+  svn__prefix_string_t *new_string;
+  apr_size_t len = strlen(s);
+  node_t *node = tree->root;
+  node_t *new_node;
+  int idx;
+
+  /* walk the existing tree until we either find S or the node at which S
+   * has to be inserted */
+  while (TRUE)
+    {
+      node_t *sub_node;
+      int match = 1;
+
+      /* index of the matching sub-node */
+      idx = node->sub_node_count
+          ? search_lower_bound(node->sub_nodes,
+                               (unsigned char)s[node->length],
+                               node->sub_node_count)
+          : 0;
+
+      /* any (partially) matching sub-nodes? */
+      if (idx == node->sub_node_count
+          || node->sub_nodes[idx]->key.data[0] != s[node->length])
+        break;
+
+      sub_node = node->sub_nodes[idx];
+
+      /* fully matching sub-node? */
+      if (is_leaf(sub_node))
+        {
+          if (strcmp(sub_node->key.data, s + node->length) == 0)
+            return &sub_node->key;
+        }
+      else
+        {
+          apr_size_t sub_node_len = sub_node->length - node->length;
+          if (strncmp(sub_node->key.data, s + node->length,
+                      sub_node_len) == 0)
+            {
+              node = sub_node;
+              continue;
+            }
+        }
+
+      /* partial match -> split */
+      while (sub_node->key.data[match] == s[node->length + match])
+        ++match;
+
+      new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
+      new_node->key = sub_node->key;
+      new_node->length = node->length + match;
+      new_node->key.data[7] = '\xff';
+      new_node->sub_node_count = 1;
+      new_node->sub_nodes = apr_palloc(tree->pool, sizeof(node_t *));
+      new_node->sub_nodes[0] = sub_node;
+
+      memcpy(sub_node->key.data, sub_node->key.data + match, 8 - match);
+
+      /* replace old sub-node with new one and continue lookup */
+      sub_node->key.prefix = new_node;
+      node->sub_nodes[idx] = new_node;
+      node = new_node;
+    }
+
+  /* add sub-node(s) and final string */
+  while (node->length + 7 < len)
+    {
+      new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
+      new_node->key.prefix = node;
+      new_node->length = node->length + 8;
+      memcpy(new_node->key.data, s + node->length, 8);
+
+      auto_realloc_sub_nodes(tree, node);
+      memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
+              (node->sub_node_count - idx) * sizeof(node_t *));
+
+      /* replace old sub-node with new one and continue lookup */
+      node->sub_nodes[idx] = new_node;
+      node->sub_node_count++;
+      node = new_node;
+      idx = 0;
+    }
+
+  new_string = apr_pcalloc(tree->pool, sizeof(*new_string));
+  new_string->prefix = node;
+  memcpy(new_string->data, s + node->length, len - node->length);
+
+  auto_realloc_sub_nodes(tree, node);
+  memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
+          (node->sub_node_count - idx) * sizeof(node_t *));
+
+  node->sub_nodes[idx] = (node_t *)new_string;
+  node->sub_node_count++;
+  return new_string;
+}
+
+svn_string_t *
+svn__prefix_string_expand(const svn__prefix_string_t *s,
+                          apr_pool_t *pool)
+{
+  apr_size_t s_len = strlen(s->data);
+  apr_size_t len = s->prefix->length + s_len;
+  char *buffer = apr_palloc(pool, len + 1);
+
+  svn_string_t *result = apr_pcalloc(pool, sizeof(result));
+  result->data = buffer;
+  result->len = len;
+  buffer[len] = '\0';
+
+  while (s->prefix)
+    {
+      memcpy(result + s->prefix->length, s->data, len - s->prefix->length);
+      len = s->prefix->length;
+      s = &s->prefix->key;
+    }
+
+  return result;
+}
+
+int
+svn__prefix_string_compare(const svn__prefix_string_t *lhs,
+                           const svn__prefix_string_t *rhs)
+{
+  const node_t *lhs_parent = lhs->prefix;
+  const node_t *rhs_parent = rhs->prefix;
+
+  if (lhs == rhs)
+    return 0;
+
+  /* find the common root */
+  while (lhs_parent != rhs_parent)
+    {
+      if (lhs_parent->length <= rhs_parent->length)
+        {
+          rhs = &rhs_parent->key;
+          rhs_parent = rhs_parent->key.prefix;
+        }
+      else if (rhs_parent->length <= lhs_parent->length)
+        {
+          lhs = &lhs_parent->key;
+          lhs_parent = lhs_parent->key.prefix;
+        }
+
+      /* same tree? */
+      assert(lhs_parent && rhs_parent);
+    }
+
+  /* at the common root, strings will differ in the first follow-up char */
+  return (int)(unsigned char)lhs->data[0] - (int)(unsigned char)rhs->data[0];
+}