You are viewing a plain text version of this content. The canonical link for it is here.

Posted to dev@subversion.apache.org by Josh Pieper <jj...@pobox.com> on 2004/05/24 01:03:48 UTC

More libsvn_wc efficiency improvements.

Philip, could you look over this patch and see if anything is
obviously wrong with my entries caching modifications?  Essentially
lock.c:prune_deleted was getting called for every single modification
made to the entries file.  To generate a pruned list, it needs to
iterate over all the currently existing entries.  It just wasn't very
noticable before because of the O(n^2) disk problems.  While the patch
does cause a little extra work to be done in svn_wc__entry_modify, the
reduction in work in prune_deleted seems to negate the loss for small
directories, for big directories it's an obvious performance win.

With this patch and the current (r9870) trunk FSFS, I see no
non-linearities in checkouts when going up to at least 25,000 files in
a single directory.  At that point my hard drive or filesystem started
to groan, so I stopped.

-Josh

=======================================

Improve the efficiency of libsvn_wc during updates and checkouts by
decreasing the number of times that an entries hash has its hidden
entries removed.  Cache the entries list with and without hidden
entries separately and make svn_wc__entry_modify operates on both
cached lists at the same time.  Prior to this, svn_wc__entry_modify
would cause a new pruned entries file to be regenerated after every
single modification.  Since generating the pruned list required
iterating over the entirety of the entries list, this resulted in n^2
performance as the number of files in a directory increased.

* subversion/libsvn_wc/lock.c
  (prune_deleted): Do not attempt to take a shortcut and assign the
    full entries list to the pruned one.  This was only a very small
    performance win before and now that we need to update the two
    caches separately it would be incorrect.

* subversion/libsvn_wc/entries.c
  (svn_wc__entry_modify): Get both the full and pruned entries lists,
    perform the requested modifications on both of them.

Index: subversion/libsvn_wc/entries.c
===================================================================
--- subversion/libsvn_wc/entries.c	(revision 9870)
+++ subversion/libsvn_wc/entries.c	(working copy)
@@ -1549,7 +1549,7 @@
                       svn_boolean_t do_sync,
                       apr_pool_t *pool)
 {
-  apr_hash_t *entries;
+  apr_hash_t *entries, *entries_nohidden;
   svn_boolean_t entry_was_deleted_p = FALSE;
 
   /* ENTRY is rather necessary, and ENTRY->kind is required to be valid! */
@@ -1557,7 +1557,11 @@
 
   /* Load PATH's whole entries file. */
   SVN_ERR (svn_wc_entries_read (&entries, adm_access, TRUE, pool));
+  SVN_ERR (svn_wc_entries_read (&entries_nohidden, adm_access, FALSE, pool));
 
+  /* These two hashes cannot be the exact same ones. */
+  assert (entries != entries_nohidden);
+
   /* Ensure that NAME is valid. */
   if (name == NULL)
     name = SVN_WC_ENTRY_THIS_DIR;
@@ -1565,6 +1569,8 @@
   if (modify_flags & SVN_WC__ENTRY_MODIFY_SCHEDULE)
     {
       svn_wc_entry_t *entry_before, *entry_after;
+      apr_uint32_t orig_modify_flags = modify_flags;
+      svn_wc_schedule_t orig_schedule = entry->schedule;
 
       /* Keep a copy of the unmodified entry on hand. */
       entry_before = apr_hash_get (entries, name, APR_HASH_KEY_STRING);
@@ -1574,6 +1580,9 @@
       SVN_ERR (fold_scheduling (entries, name, &modify_flags, 
                                 &entry->schedule, pool));
 
+      SVN_ERR (fold_scheduling (entries_nohidden, name, &orig_modify_flags,
+                                &orig_schedule, pool));
+
       /* Special case:  fold_state_changes() may have actually REMOVED
          the entry in question!  If so, don't try to fold_entry, as
          this will just recreate the entry again. */
@@ -1588,14 +1597,16 @@
   /* If the entry wasn't just removed from the entries hash, fold the
      changes into the entry. */
   if (! entry_was_deleted_p)
-    fold_entry (entries, name, modify_flags, entry,
-                svn_wc_adm_access_pool (adm_access));
+    {
+      fold_entry (entries, name, modify_flags, entry,
+                  svn_wc_adm_access_pool (adm_access));
+      fold_entry (entries_nohidden, name, modify_flags, entry,
+                  svn_wc_adm_access_pool (adm_access));
+    }
 
   /* Sync changes to disk. */
   if (do_sync)
     SVN_ERR (svn_wc__entries_write (entries, adm_access, pool));
-  else
-    svn_wc__adm_access_set_entries (adm_access, FALSE, NULL);
 
   return SVN_NO_ERROR;
 }
Index: subversion/libsvn_wc/lock.c
===================================================================
--- subversion/libsvn_wc/lock.c	(revision 9870)
+++ subversion/libsvn_wc/lock.c	(working copy)
@@ -915,30 +915,6 @@
     {
       apr_hash_index_t *hi;
 
-      /* I think it will be common for there to be no deleted entries, so
-         it is worth checking for that case as we can optimise it. */
-      for (hi = apr_hash_first (pool, adm_access->entries_hidden);
-           hi;
-           hi = apr_hash_next (hi))
-        {
-          void *val;
-          const svn_wc_entry_t *entry;
-          apr_hash_this (hi, NULL, NULL, &val);
-          entry = val;
-          if ((entry->deleted
-               && (entry->schedule != svn_wc_schedule_add)
-               && (entry->schedule != svn_wc_schedule_replace))
-              || entry->absent)
-            break;
-        }
-
-      if (! hi)
-        {
-          /* There are no deleted entries, so we can use the full hash */
-          adm_access->entries = adm_access->entries_hidden;
-          return;
-        }
-
       /* Construct pruned hash without deleted entries */
       adm_access->entries = apr_hash_make (adm_access->pool);
       for (hi = apr_hash_first (pool, adm_access->entries_hidden);

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org

Re: More libsvn_wc efficiency improvements.

Posted by Philip Martin <ph...@codematters.co.uk>.

Josh Pieper <jj...@pobox.com> writes:

> Yes, but it was not an obvious call.  Currently, svn_wc__entry_modify
> clears the cache for the pruned list after every modification.  This
> means that the next time the pruned list is requested it must be
> regenerated.  This happened for every entries list modification.  By
> changing svn_wc__entry_modify to modify both the full and pruned list
> simulataneously, prune_deleted only needs to be called the first time
> a new entries list is loaded.  The change to prune_deleted was just a
> minor optimization after the fact.

OK.  I suppose it would be possible to leave in the bit that shares
the hash by making svn_wc__entry_modify a bit more intelligent--it
could determine whether the hash was shared or not and modify one or
two hashes as required, I guess it might also need to be responsible
for unsharing if the modification created a deleted entry.  I don't
know whether the memory saving is sufficiently important, but entry
cache memory does have a long lifetime.

-- 
Philip Martin

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org

Re: More libsvn_wc efficiency improvements.

Posted by Josh Pieper <jj...@pobox.com>.

Philip Martin wrote:
> Josh Pieper <jj...@pobox.com> writes:
> 
> > Philip, could you look over this patch and see if anything is
> > obviously wrong with my entries caching modifications?  Essentially
> > lock.c:prune_deleted was getting called for every single modification
> > made to the entries file.  To generate a pruned list, it needs to
> > iterate over all the currently existing entries.  It just wasn't very
> > noticable before because of the O(n^2) disk problems.  While the patch
> > does cause a little extra work to be done in svn_wc__entry_modify, the
> > reduction in work in prune_deleted seems to negate the loss for small
> > directories, for big directories it's an obvious performance win.
> >
> > With this patch and the current (r9870) trunk FSFS, I see no
> > non-linearities in checkouts when going up to at least 25,000 files in
> > a single directory.  At that point my hard drive or filesystem started
> > to groan, so I stopped.
> 
> It's hard to argue with measurements, but I don't understand why it's
> an obvious performance win.  The current prune_deleted function has
> two loops over the entries, now your patch removes one loop but that
> still leaves the other so any O(N^2) behaviour is still there.  Have
> you removed a call to prune_deleted from the critical path?

Yes, but it was not an obvious call.  Currently, svn_wc__entry_modify
clears the cache for the pruned list after every modification.  This
means that the next time the pruned list is requested it must be
regenerated.  This happened for every entries list modification.  By
changing svn_wc__entry_modify to modify both the full and pruned list
simulataneously, prune_deleted only needs to be called the first time
a new entries list is loaded.  The change to prune_deleted was just a
minor optimization after the fact.

> > * subversion/libsvn_wc/lock.c
> >   (prune_deleted): Do not attempt to take a shortcut and assign the
> >     full entries list to the pruned one.  This was only a very small
> >     performance win before and now that we need to update the two
> >     caches separately it would be incorrect.
> 
> I never measured it but the optimisation was reduced memory use rather
> than any runtime CPU effect.  The memory used by the entries cache is
> allocated for the lifetime of the access batons, so I tried to
> minimise it where possible.  I suppose the extra memory used by having
> two hashes instead of one is small compared to the memory used by the
> entries themselves, but it does seem a waste since in most cases there
> are no deleted entries.

Well, with this patch prune_deleted is only called once for any
entries list, so the pruned list isn't allocated over and over again,
thus the memory penalty is very small.  Besides, as I mentioned in the
comment, the two lists must be separate for the bigger optimization to
happen.

> > * subversion/libsvn_wc/entries.c
> >   (svn_wc__entry_modify): Get both the full and pruned entries lists,
> >     perform the requested modifications on both of them.
> >
> > Index: subversion/libsvn_wc/entries.c
> > ===================================================================
> > --- subversion/libsvn_wc/entries.c	(revision 9870)
> > +++ subversion/libsvn_wc/entries.c	(working copy)
> > @@ -1549,7 +1549,7 @@
> >                        svn_boolean_t do_sync,
> >                        apr_pool_t *pool)
> >  {
> > -  apr_hash_t *entries;
> > +  apr_hash_t *entries, *entries_nohidden;
> >    svn_boolean_t entry_was_deleted_p = FALSE;
> >  
> >    /* ENTRY is rather necessary, and ENTRY->kind is required to be valid! */
> > @@ -1557,7 +1557,11 @@
> >  
> >    /* Load PATH's whole entries file. */
> >    SVN_ERR (svn_wc_entries_read (&entries, adm_access, TRUE, pool));
> > +  SVN_ERR (svn_wc_entries_read (&entries_nohidden, adm_access, FALSE, pool));
> >  
> > +  /* These two hashes cannot be the exact same ones. */
> > +  assert (entries != entries_nohidden);
> > +
> >    /* Ensure that NAME is valid. */
> >    if (name == NULL)
> >      name = SVN_WC_ENTRY_THIS_DIR;
> > @@ -1565,6 +1569,8 @@
> >    if (modify_flags & SVN_WC__ENTRY_MODIFY_SCHEDULE)
> >      {
> >        svn_wc_entry_t *entry_before, *entry_after;
> > +      apr_uint32_t orig_modify_flags = modify_flags;
> > +      svn_wc_schedule_t orig_schedule = entry->schedule;
> >  
> >        /* Keep a copy of the unmodified entry on hand. */
> >        entry_before = apr_hash_get (entries, name, APR_HASH_KEY_STRING);
> > @@ -1574,6 +1580,9 @@
> >        SVN_ERR (fold_scheduling (entries, name, &modify_flags, 
> >                                  &entry->schedule, pool));
> >  
> > +      SVN_ERR (fold_scheduling (entries_nohidden, name, &orig_modify_flags,
> > +                                &orig_schedule, pool));
> 
> How about a sanity check?
> 
>          assert(orig_modify_flags == modify_flags);
>          assert(orig_schedule == entry->schedule);

Will do.

Do you see the optimization now?

-Josh

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org

Re: More libsvn_wc efficiency improvements.

Posted by Philip Martin <ph...@codematters.co.uk>.

Josh Pieper <jj...@pobox.com> writes:

> Philip, could you look over this patch and see if anything is
> obviously wrong with my entries caching modifications?  Essentially
> lock.c:prune_deleted was getting called for every single modification
> made to the entries file.  To generate a pruned list, it needs to
> iterate over all the currently existing entries.  It just wasn't very
> noticable before because of the O(n^2) disk problems.  While the patch
> does cause a little extra work to be done in svn_wc__entry_modify, the
> reduction in work in prune_deleted seems to negate the loss for small
> directories, for big directories it's an obvious performance win.
>
> With this patch and the current (r9870) trunk FSFS, I see no
> non-linearities in checkouts when going up to at least 25,000 files in
> a single directory.  At that point my hard drive or filesystem started
> to groan, so I stopped.

It's hard to argue with measurements, but I don't understand why it's
an obvious performance win.  The current prune_deleted function has
two loops over the entries, now your patch removes one loop but that
still leaves the other so any O(N^2) behaviour is still there.  Have
you removed a call to prune_deleted from the critical path?

> * subversion/libsvn_wc/lock.c
>   (prune_deleted): Do not attempt to take a shortcut and assign the
>     full entries list to the pruned one.  This was only a very small
>     performance win before and now that we need to update the two
>     caches separately it would be incorrect.

I never measured it but the optimisation was reduced memory use rather
than any runtime CPU effect.  The memory used by the entries cache is
allocated for the lifetime of the access batons, so I tried to
minimise it where possible.  I suppose the extra memory used by having
two hashes instead of one is small compared to the memory used by the
entries themselves, but it does seem a waste since in most cases there
are no deleted entries.

>
> * subversion/libsvn_wc/entries.c
>   (svn_wc__entry_modify): Get both the full and pruned entries lists,
>     perform the requested modifications on both of them.
>
> Index: subversion/libsvn_wc/entries.c
> ===================================================================
> --- subversion/libsvn_wc/entries.c	(revision 9870)
> +++ subversion/libsvn_wc/entries.c	(working copy)
> @@ -1549,7 +1549,7 @@
>                        svn_boolean_t do_sync,
>                        apr_pool_t *pool)
>  {
> -  apr_hash_t *entries;
> +  apr_hash_t *entries, *entries_nohidden;
>    svn_boolean_t entry_was_deleted_p = FALSE;
>  
>    /* ENTRY is rather necessary, and ENTRY->kind is required to be valid! */
> @@ -1557,7 +1557,11 @@
>  
>    /* Load PATH's whole entries file. */
>    SVN_ERR (svn_wc_entries_read (&entries, adm_access, TRUE, pool));
> +  SVN_ERR (svn_wc_entries_read (&entries_nohidden, adm_access, FALSE, pool));
>  
> +  /* These two hashes cannot be the exact same ones. */
> +  assert (entries != entries_nohidden);
> +
>    /* Ensure that NAME is valid. */
>    if (name == NULL)
>      name = SVN_WC_ENTRY_THIS_DIR;
> @@ -1565,6 +1569,8 @@
>    if (modify_flags & SVN_WC__ENTRY_MODIFY_SCHEDULE)
>      {
>        svn_wc_entry_t *entry_before, *entry_after;
> +      apr_uint32_t orig_modify_flags = modify_flags;
> +      svn_wc_schedule_t orig_schedule = entry->schedule;
>  
>        /* Keep a copy of the unmodified entry on hand. */
>        entry_before = apr_hash_get (entries, name, APR_HASH_KEY_STRING);
> @@ -1574,6 +1580,9 @@
>        SVN_ERR (fold_scheduling (entries, name, &modify_flags, 
>                                  &entry->schedule, pool));
>  
> +      SVN_ERR (fold_scheduling (entries_nohidden, name, &orig_modify_flags,
> +                                &orig_schedule, pool));

How about a sanity check?

         assert(orig_modify_flags == modify_flags);
         assert(orig_schedule == entry->schedule);

-- 
Philip Martin

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org