You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@subversion.apache.org by st...@apache.org on 2012/02/12 21:03:28 UTC

svn commit: r1243312 - in /subversion/trunk/subversion: libsvn_fs_fs/fs_fs.c tests/cmdline/svnadmin_tests.py

Author: stefan2
Date: Sun Feb 12 20:03:28 2012
New Revision: 1243312

URL: http://svn.apache.org/viewvc?rev=1243312&view=rev
Log:
Extend rep sharing to property reps.  Since many projects set
props such as svn:eol-style on virtually every file, about half
the rep reads in a c/o pertain to props - with many of them being
identical.  We use the same infrastructure as for data rep sharing.

Moreover, most props get set once and for many nodes in batch mode.
That means we need to identify common prop reps within the protorev
currently being written.  To that end, we simply index those reps
in a local hash that we fill & use during the commit process.

The same could be done for duplicate data reps within the same rev
but that is not within the cope of this commit.

* subversion/libsvn_fs_fs/fs_fs.c
  (get_shared_rep): new utility function factored out from
   rep_write_contents_close()
  (rep_write_contents_close): call new utility

  (write_hash_baton): add sha1 alongside md5
  (write_hash_handler): update both checksums

  (write_hash_rep): bring signature closer to write_hash_delta_rep;
   add support for shared reps within & between revs
  (write_hash_delta_rep): add support for shared reps within & between revs
  (write_final_rev): add hash for shared reps within rev;
   write rep hashes and on disk rep index;
   ensure minimal on-disk representation - i.e. no SHA1
  (commit_baton): add hash for in-rev rep sharing
  (commit_body): adapt caller
  (svn_fs_fs__commit): adapt baton initialization

* subversion/tests/cmdline/svnadmin_tests.py
  (verify_non_utf8_paths): adapt the FSFS pattern; add a new one
   for deltified props

Modified:
    subversion/trunk/subversion/libsvn_fs_fs/fs_fs.c
    subversion/trunk/subversion/tests/cmdline/svnadmin_tests.py

Modified: subversion/trunk/subversion/libsvn_fs_fs/fs_fs.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_fs_fs/fs_fs.c?rev=1243312&r1=1243311&r2=1243312&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_fs_fs/fs_fs.c (original)
+++ subversion/trunk/subversion/libsvn_fs_fs/fs_fs.c Sun Feb 12 20:03:28 2012
@@ -5377,6 +5377,80 @@ rep_write_get_baton(struct rep_write_bat
   return SVN_NO_ERROR;
 }
 
+/* For the hash REP->SHA1, try to find an already existing representation
+   in FS and return it in *OUT_REP.  If no such representation exists or
+   if rep sharing has been disabled for FS, NULL will be returned.  Since
+   there may be new duplicate representations within the same uncommitted
+   revision, those can be passed in REPS_HASH (maps a sha1 digest onto
+   representation_t*), otherwise pass in NULL for REPS_HASH.
+   POOL will be used for allocations. The lifetime of the returned rep is
+   limited by both, POOL and REP lifetime.
+ */
+static svn_error_t *
+get_shared_rep(representation_t **old_rep, 
+               svn_fs_t *fs, 
+               representation_t *rep,
+               apr_hash_t *reps_hash,
+               apr_pool_t *pool)
+{
+  svn_error_t *err;
+  fs_fs_data_t *ffd = fs->fsap_data;
+
+  /* Return NULL, if rep sharing has been disabled. */
+  *old_rep = NULL;
+  if (!ffd->rep_sharing_allowed)
+    return SVN_NO_ERROR;
+
+  /* Check and see if we already have a representation somewhere that's
+     identical to the one we just wrote out.  Start with the hash lookup
+     because it is cheepest. */
+  if (reps_hash)
+    *old_rep = apr_hash_get(reps_hash,
+                            rep->sha1_checksum->digest,
+                            APR_SHA1_DIGESTSIZE);
+   
+  /* If we haven't found anything yet, try harder and consult our DB. */
+  if (*old_rep == NULL)
+    {
+      err = svn_fs_fs__get_rep_reference(old_rep, fs, rep->sha1_checksum,
+                                         pool);
+      /* ### Other error codes that we shouldn't mask out? */
+      if (err == SVN_NO_ERROR
+          || err->apr_err == SVN_ERR_FS_CORRUPT
+          || SVN_ERROR_IN_CATEGORY(err->apr_err,
+                                   SVN_ERR_MALFUNC_CATEGORY_START))
+        {
+          /* Fatal error; don't mask it.
+
+             In particular, this block is triggered when the rep-cache refers
+             to revisions in the future.  We signal that as a corruption situation
+             since, once those revisions are less than youngest (because of more
+             commits), the rep-cache would be invalid.
+           */
+          SVN_ERR(err);
+        }
+      else
+        {
+          /* Something's wrong with the rep-sharing index.  We can continue
+             without rep-sharing, but warn.
+           */
+          (fs->warning)(fs->warning_baton, err);
+          svn_error_clear(err);
+          *old_rep = NULL;
+        }
+    }
+
+  /* Add information that is missing in the cached data. */
+  if (*old_rep)
+    {
+      /* Use the old rep for this content. */
+      (*old_rep)->md5_checksum = rep->md5_checksum;
+      (*old_rep)->uniquifier = rep->uniquifier;
+    }
+    
+  return SVN_NO_ERROR;
+}
+
 /* Close handler for the representation write stream.  BATON is a
    rep_write_baton.  Writes out a new node-rev that correctly
    references the representation we just finished writing. */
@@ -5384,7 +5458,6 @@ static svn_error_t *
 rep_write_contents_close(void *baton)
 {
   struct rep_write_baton *b = baton;
-  fs_fs_data_t *ffd = b->fs->fsap_data;
   const char *unique_suffix;
   representation_t *rep;
   representation_t *old_rep;
@@ -5418,38 +5491,7 @@ rep_write_contents_close(void *baton)
 
   /* Check and see if we already have a representation somewhere that's
      identical to the one we just wrote out. */
-  if (ffd->rep_sharing_allowed)
-    {
-      svn_error_t *err;
-      err = svn_fs_fs__get_rep_reference(&old_rep, b->fs, rep->sha1_checksum,
-                                         b->parent_pool);
-      /* ### Other error codes that we shouldn't mask out? */
-      if (err == SVN_NO_ERROR
-          || err->apr_err == SVN_ERR_FS_CORRUPT
-          || SVN_ERROR_IN_CATEGORY(err->apr_err,
-                                   SVN_ERR_MALFUNC_CATEGORY_START))
-        {
-          /* Fatal error; don't mask it.
-
-             In particular, this block is triggered when the rep-cache refers
-             to revisions in the future.  We signal that as a corruption situation
-             since, once those revisions are less than youngest (because of more
-             commits), the rep-cache would be invalid.
-           */
-          SVN_ERR(err);
-        }
-      else
-        {
-          /* Something's wrong with the rep-sharing index.  We can continue
-             without rep-sharing, but warn.
-           */
-          (b->fs->warning)(b->fs->warning_baton, err);
-          svn_error_clear(err);
-          old_rep = NULL;
-        }
-    }
-  else
-    old_rep = NULL;
+  SVN_ERR(get_shared_rep(&old_rep, b->fs, rep, NULL, b->parent_pool));
 
   if (old_rep)
     {
@@ -5457,8 +5499,6 @@ rep_write_contents_close(void *baton)
       SVN_ERR(svn_io_file_trunc(b->file, b->rep_offset, b->pool));
 
       /* Use the old rep for this content. */
-      old_rep->md5_checksum = rep->md5_checksum;
-      old_rep->uniquifier = rep->uniquifier;
       b->noderev->data_rep = old_rep;
     }
   else
@@ -5625,7 +5665,8 @@ struct write_hash_baton
 
   apr_size_t size;
 
-  svn_checksum_ctx_t *checksum_ctx;
+  svn_checksum_ctx_t *md5_ctx;
+  svn_checksum_ctx_t *sha1_ctx;
 };
 
 /* The handler for the write_hash_rep stream.  BATON is a
@@ -5638,7 +5679,8 @@ write_hash_handler(void *baton,
 {
   struct write_hash_baton *whb = baton;
 
-  SVN_ERR(svn_checksum_update(whb->checksum_ctx, data, *len));
+  SVN_ERR(svn_checksum_update(whb->md5_ctx, data, *len));
+  SVN_ERR(svn_checksum_update(whb->sha1_ctx, data, *len));
 
   SVN_ERR(svn_stream_write(whb->stream, data, len));
   whb->size += *len;
@@ -5647,23 +5689,32 @@ write_hash_handler(void *baton,
 }
 
 /* Write out the hash HASH as a text representation to file FILE.  In
-   the process, record the total size of the dump in *SIZE, and the
-   md5 digest in CHECKSUM.  Perform temporary allocations in POOL. */
+   the process, record position, the total size of the dump and MD5 as
+   well as SHA1 in REP.   If rep sharing has been enabled and REPS_HASH
+   is not NULL, it will be used in addition to the on-disk cache to find
+   earlier reps with the same content.  When such existing reps can be
+   found,  we will truncate the one just written from the file and return
+   the existing rep.  Perform temporary allocations in POOL. */
 static svn_error_t *
-write_hash_rep(svn_filesize_t *size,
-               svn_checksum_t **checksum,
+write_hash_rep(representation_t *rep,
                apr_file_t *file,
                apr_hash_t *hash,
+               svn_fs_t *fs,
+               apr_hash_t *reps_hash,
                apr_pool_t *pool)
 {
   svn_stream_t *stream;
   struct write_hash_baton *whb;
+  representation_t *old_rep;
+
+  SVN_ERR(get_file_offset(&rep->offset, file, pool));
 
   whb = apr_pcalloc(pool, sizeof(*whb));
 
   whb->stream = svn_stream_from_aprfile2(file, TRUE, pool);
   whb->size = 0;
-  whb->checksum_ctx = svn_checksum_ctx_create(svn_checksum_md5, pool);
+  whb->md5_ctx = svn_checksum_ctx_create(svn_checksum_md5, pool);
+  whb->sha1_ctx = svn_checksum_ctx_create(svn_checksum_sha1, pool);
 
   stream = svn_stream_create(whb, pool);
   svn_stream_set_write(stream, write_hash_handler);
@@ -5673,15 +5724,41 @@ write_hash_rep(svn_filesize_t *size,
   SVN_ERR(svn_hash_write2(hash, stream, SVN_HASH_TERMINATOR, pool));
 
   /* Store the results. */
-  SVN_ERR(svn_checksum_final(checksum, whb->checksum_ctx, pool));
-  *size = whb->size;
+  SVN_ERR(svn_checksum_final(&rep->md5_checksum, whb->md5_ctx, pool));
+  SVN_ERR(svn_checksum_final(&rep->sha1_checksum, whb->sha1_ctx, pool));
+
+  /* Check and see if we already have a representation somewhere that's
+     identical to the one we just wrote out. */
+  SVN_ERR(get_shared_rep(&old_rep, fs, rep, reps_hash, pool));
+
+  if (old_rep)
+    {
+      /* We need to erase from the protorev the data we just wrote. */
+      SVN_ERR(svn_io_file_trunc(file, rep->offset, pool));
+
+      /* Use the old rep for this content. */
+      memcpy(rep, old_rep, sizeof (*rep));
+    }
+  else
+    {
+      /* Write out our cosmetic end marker. */
+      SVN_ERR(svn_stream_printf(whb->stream, pool, "ENDREP\n"));
+
+      /* update the representation */
+      rep->size = whb->size;
+      rep->expanded_size = 0;
+    }
 
-  return svn_stream_printf(whb->stream, pool, "ENDREP\n");
+  return SVN_NO_ERROR;
 }
 
 /* Write out the hash HASH pertaining to the NODEREV in FS as a deltified
    text representation to file FILE.  In the process, record the total size
-   and the md5 digest in REP.  Perform temporary allocations in POOL. */
+   and the md5 digest in REP.  If rep sharing has been enabled and REPS_HASH
+   is not NULL, it will be used in addition to the on-disk cache to find
+   earlier reps with the same content.  When such existing reps can be found,
+   we will truncate the one just written from the file and return the existing
+   rep.  Perform temporary allocations in POOL. */
 #ifdef SVN_FS_FS_DELTIFY_DIRECTORIES
 static svn_error_t *
 write_hash_delta_rep(representation_t *rep,
@@ -5689,6 +5766,7 @@ write_hash_delta_rep(representation_t *r
                      apr_hash_t *hash,
                      svn_fs_t *fs,
                      node_revision_t *noderev,
+                     apr_hash_t *reps_hash,
                      apr_pool_t *pool)
 {
   svn_txdelta_window_handler_t diff_wh;
@@ -5697,6 +5775,7 @@ write_hash_delta_rep(representation_t *r
   svn_stream_t *file_stream;
   svn_stream_t *stream;
   representation_t *base_rep;
+  representation_t *old_rep;
   svn_stream_t *source;
   const char *header;
 
@@ -5742,7 +5821,8 @@ write_hash_delta_rep(representation_t *r
   whb = apr_pcalloc(pool, sizeof(*whb));
   whb->stream = svn_txdelta_target_push(diff_wh, diff_whb, source, pool);
   whb->size = 0;
-  whb->checksum_ctx = svn_checksum_ctx_create(svn_checksum_md5, pool);
+  whb->md5_ctx = svn_checksum_ctx_create(svn_checksum_md5, pool);
+  whb->sha1_ctx = svn_checksum_ctx_create(svn_checksum_sha1, pool);
 
   /* serialize the hash */
   stream = svn_stream_create(whb, pool);
@@ -5752,15 +5832,31 @@ write_hash_delta_rep(representation_t *r
   SVN_ERR(svn_stream_close(whb->stream));
 
   /* Store the results. */
-  SVN_ERR(svn_checksum_final(&rep->md5_checksum, whb->checksum_ctx, pool));
+  SVN_ERR(svn_checksum_final(&rep->md5_checksum, whb->md5_ctx, pool));
+  SVN_ERR(svn_checksum_final(&rep->sha1_checksum, whb->sha1_ctx, pool));
+
+  /* Check and see if we already have a representation somewhere that's
+     identical to the one we just wrote out. */
+  SVN_ERR(get_shared_rep(&old_rep, fs, rep, reps_hash, pool));
 
-  /* Write out our cosmetic end marker. */
-  SVN_ERR(get_file_offset(&rep_end, file, pool));
-  SVN_ERR(svn_stream_printf(file_stream, pool, "ENDREP\n"));
-
-  /* update the representation */
-  rep->expanded_size = whb->size;
-  rep->size = rep_end - delta_start;
+  if (old_rep)
+    {
+      /* We need to erase from the protorev the data we just wrote. */
+      SVN_ERR(svn_io_file_trunc(file, rep->offset, pool));
+
+      /* Use the old rep for this content. */
+      memcpy(rep, old_rep, sizeof (*rep));
+    }
+  else
+    {
+      /* Write out our cosmetic end marker. */
+      SVN_ERR(get_file_offset(&rep_end, file, pool));
+      SVN_ERR(svn_stream_printf(file_stream, pool, "ENDREP\n"));
+
+      /* update the representation */
+      rep->expanded_size = whb->size;
+      rep->size = rep_end - delta_start;
+    }
 
   return SVN_NO_ERROR;
 }
@@ -5839,6 +5935,10 @@ validate_root_noderev(svn_fs_t *fs,
    If REPS_TO_CACHE is not NULL, append to it a copy (allocated in
    REPS_POOL) of each data rep that is new in this revision.
 
+   If REPS_HASH is not NULL, append copies (allocated in REPS_POOL)
+   of the representations of each property rep that is new in this
+   revision.
+
    AT_ROOT is true if the node revision being written is the root
    node-revision.  It is only controls additional sanity checking
    logic.
@@ -5854,6 +5954,7 @@ write_final_rev(const svn_fs_id_t **new_
                 const char *start_copy_id,
                 apr_off_t initial_offset,
                 apr_array_header_t *reps_to_cache,
+                apr_hash_t *reps_hash,
                 apr_pool_t *reps_pool,
                 svn_boolean_t at_root,
                 apr_pool_t *pool)
@@ -5892,7 +5993,7 @@ write_final_rev(const svn_fs_id_t **new_
           svn_pool_clear(subpool);
           SVN_ERR(write_final_rev(&new_id, file, rev, fs, dirent->id,
                                   start_node_id, start_copy_id, initial_offset,
-                                  reps_to_cache, reps_pool, FALSE,
+                                  reps_to_cache, reps_hash, reps_pool, FALSE,
                                   subpool));
           if (new_id && (svn_fs_fs__id_rev(new_id) == rev))
             dirent->id = svn_fs_fs__id_copy(new_id, pool);
@@ -5909,13 +6010,11 @@ write_final_rev(const svn_fs_id_t **new_
 
 #ifdef SVN_FS_FS_DELTIFY_DIRECTORIES
           SVN_ERR(write_hash_delta_rep(noderev->data_rep, file,
-                                       str_entries, fs, noderev, pool));
+                                       str_entries, fs, noderev, NULL,
+                                       pool));
 #else
-          SVN_ERR(get_file_offset(&noderev->data_rep->offset, file, pool));
-          SVN_ERR(write_hash_rep(&noderev->data_rep->size,
-                                 &noderev->data_rep->md5_checksum, file,
-                                 str_entries, pool));
-          noderev->data_rep->expanded_size = noderev->data_rep->size;
+          SVN_ERR(write_hash_rep(noderev->data_rep, file, str_entries,
+                                 fs, NULL, pool));
 #endif
         }
     }
@@ -5946,17 +6045,17 @@ write_final_rev(const svn_fs_id_t **new_
       apr_hash_t *proplist;
       SVN_ERR(svn_fs_fs__get_proplist(&proplist, fs, noderev, pool));
 
+      noderev->prop_rep->txn_id = NULL;
+      noderev->prop_rep->revision = rev;
+
 #ifdef SVN_FS_FS_DELTIFY_PROPS
       SVN_ERR(write_hash_delta_rep(noderev->prop_rep, file,
-                                   proplist, fs, noderev, pool));
+                                   proplist, fs, noderev, reps_hash,
+                                   pool));
 #else
-      SVN_ERR(get_file_offset(&noderev->prop_rep->offset, file, pool));
-      SVN_ERR(write_hash_rep(&noderev->prop_rep->size,
-                             &noderev->prop_rep->md5_checksum, file,
-                             proplist, pool));
+      SVN_ERR(write_hash_rep(noderev->prop_rep, file, proplist, 
+                             fs, reps_hash, pool));
 #endif
-      noderev->prop_rep->txn_id = NULL;
-      noderev->prop_rep->revision = rev;
     }
 
 
@@ -5999,6 +6098,41 @@ write_final_rev(const svn_fs_id_t **new_
 
   noderev->id = new_id;
 
+  if (ffd->rep_sharing_allowed)
+    {
+      /* Save the data representation's hash in the rep cache. */
+      if (   noderev->data_rep && noderev->kind == svn_node_file
+          && noderev->data_rep->revision == rev)
+        {
+          SVN_ERR_ASSERT(reps_to_cache && reps_pool);
+          APR_ARRAY_PUSH(reps_to_cache, representation_t *)
+            = svn_fs_fs__rep_copy(noderev->data_rep, reps_pool);
+        }
+
+      if (noderev->prop_rep && noderev->prop_rep->revision == rev)
+        {
+          /* Add new property reps to hash and on-disk cache. */
+          representation_t *copy 
+            = svn_fs_fs__rep_copy(noderev->prop_rep, reps_pool);
+
+          SVN_ERR_ASSERT(reps_to_cache && reps_pool);
+          APR_ARRAY_PUSH(reps_to_cache, representation_t *) = copy;
+
+          apr_hash_set(reps_hash, 
+                        copy->sha1_checksum->digest, 
+                        APR_SHA1_DIGESTSIZE, 
+                        copy);
+        }
+    }
+
+  /* don't serialize SHA1 for dirs to disk (waste of space) */
+  if (noderev->data_rep && noderev->kind == svn_node_dir)
+    noderev->data_rep->sha1_checksum = NULL;
+
+  /* don't serialize SHA1 for props to disk (waste of space) */
+  if (noderev->prop_rep)
+    noderev->prop_rep->sha1_checksum = NULL;
+
   /* Write out our new node-revision. */
   if (at_root)
     SVN_ERR(validate_root_noderev(fs, noderev, rev, pool));
@@ -6007,16 +6141,6 @@ write_final_rev(const svn_fs_id_t **new_
                                    svn_fs_fs__fs_supports_mergeinfo(fs),
                                    pool));
 
-  /* Save the data representation's hash in the rep cache. */
-  if (ffd->rep_sharing_allowed
-        && noderev->data_rep && noderev->kind == svn_node_file
-        && noderev->data_rep->revision == rev)
-    {
-      SVN_ERR_ASSERT(reps_to_cache && reps_pool);
-      APR_ARRAY_PUSH(reps_to_cache, representation_t *)
-        = svn_fs_fs__rep_copy(noderev->data_rep, reps_pool);
-    }
-
   /* Return our ID that references the revision file. */
   *new_id_p = noderev->id;
 
@@ -6224,6 +6348,7 @@ struct commit_baton {
   svn_fs_t *fs;
   svn_fs_txn_t *txn;
   apr_array_header_t *reps_to_cache;
+  apr_hash_t *reps_hash;
   apr_pool_t *reps_pool;
 };
 
@@ -6281,8 +6406,8 @@ commit_body(void *baton, apr_pool_t *poo
   root_id = svn_fs_fs__id_txn_create("0", "0", cb->txn->id, pool);
   SVN_ERR(write_final_rev(&new_root_id, proto_file, new_rev, cb->fs, root_id,
                           start_node_id, start_copy_id, initial_offset,
-                          cb->reps_to_cache, cb->reps_pool, TRUE,
-                          pool));
+                          cb->reps_to_cache, cb->reps_hash, cb->reps_pool,
+                          TRUE, pool));
 
   /* Write the changed-path information. */
   SVN_ERR(write_final_changed_path_info(&changed_path_offset, proto_file,
@@ -6456,11 +6581,13 @@ svn_fs_fs__commit(svn_revnum_t *new_rev_
   if (ffd->rep_sharing_allowed)
     {
       cb.reps_to_cache = apr_array_make(pool, 5, sizeof(representation_t *));
+      cb.reps_hash = apr_hash_make(pool);
       cb.reps_pool = pool;
     }
   else
     {
       cb.reps_to_cache = NULL;
+      cb.reps_hash = NULL;
       cb.reps_pool = NULL;
     }
 

Modified: subversion/trunk/subversion/tests/cmdline/svnadmin_tests.py
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/tests/cmdline/svnadmin_tests.py?rev=1243312&r1=1243311&r2=1243312&view=diff
==============================================================================
--- subversion/trunk/subversion/tests/cmdline/svnadmin_tests.py (original)
+++ subversion/trunk/subversion/tests/cmdline/svnadmin_tests.py Sun Feb 12 20:03:28 2012
@@ -1422,9 +1422,12 @@ def verify_non_utf8_paths(sbox):
     if line == "A\n":
       # replace 'A' with a latin1 character -- the new path is not valid UTF-8
       fp_new.write("\xE6\n")
-    elif line == "text: 1 279 32 32 d63ecce65d8c428b86f4f8b0920921fe\n":
+    elif line == "text: 1 279 32 0 d63ecce65d8c428b86f4f8b0920921fe\n":
       # fix up the representation checksum
-      fp_new.write("text: 1 279 32 32 b50b1d5ed64075b5f632f3b8c30cd6b2\n")
+      fp_new.write("text: 1 279 32 0 b50b1d5ed64075b5f632f3b8c30cd6b2\n")
+    elif line == "text: 1 292 44 32 a6be7b4cf075fd39e6a99eb69a31232b\n":
+      # fix up the representation checksum
+      fp_new.write("text: 1 292 44 32 f2e93e73272cac0f18fccf16f224eb93\n")
     elif line == "cpath: /A\n":
       # also fix up the 'created path' field
       fp_new.write("cpath: /\xE6\n")