You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@subversion.apache.org by ju...@apache.org on 2010/06/21 15:56:30 UTC

svn commit: r956593 - /subversion/trunk/subversion/libsvn_delta/text_delta.c

Author: julianfoad
Date: Mon Jun 21 13:56:30 2010
New Revision: 956593

URL: http://svn.apache.org/viewvc?rev=956593&view=rev
Log:
Optimize the copies performed by svn_txdelta_apply_instructions().

svn_txdelta_apply_instructions() is relatively slow for long instruction
sequences copying small pieces of data.  This is particularly visible in
"format 2" FSFS repositories.

Two kinds of copy are involved.  For simple copies, the system's memcpy()
is used, which is fast for long lengths; this patch bypasses it for very
short lengths.  For intentionally overlapping copies, a custom loop is used,
which was already fast for very short lengths; this patch adds a code path
that is fast for longer lengths.

* subversion/libsvn_delta/text_delta.c
  (fast_memcpy, patterning_copy): New functions, optimized for our specific
    workload.
  (svn_txdelta_apply_instructions): Use fast_memcpy() and patterning_copy().

Patch by: Stefan Fuhrmann <stefanfuhrmann{_AT_}alice-dsl.de>
(I tweaked the comments and split Stefan's patch into two parts, of which
this is the first.)

Modified:
    subversion/trunk/subversion/libsvn_delta/text_delta.c

Modified: subversion/trunk/subversion/libsvn_delta/text_delta.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_delta/text_delta.c?rev=956593&r1=956592&r2=956593&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_delta/text_delta.c (original)
+++ subversion/trunk/subversion/libsvn_delta/text_delta.c Mon Jun 21 13:56:30 2010
@@ -564,6 +564,73 @@ size_buffer(char **buf, apr_size_t *buf_
   return SVN_NO_ERROR;
 }
 
+/* Copy LEN bytes from SOURCE to TARGET, optimizing for the case where LEN
+ * is often very small.  Return a pointer to the first byte after the copied
+ * target range, unlike standard memcpy(), as a potential further
+ * optimization for the caller.
+ *
+ * memcpy() is hard to tune for a wide range of buffer lengths.  Therefore,
+ * it is often tuned for high throughput on large buffers and relatively
+ * low latency for mid-sized buffers (tens of bytes).  However, the overhead
+ * for very small buffers (<10 bytes) is still high.  Even passing the
+ * parameters, for instance, may take as long as copying 3 bytes.
+ *
+ * Because short copy sequences seem to be a common case, at least in
+ * "format 2" FSFS repositories, we copy them directly.  Larger buffer sizes
+ * aren't hurt measurably by the extra 'if' clause.  */
+static APR_INLINE char *
+fast_memcpy(char *target, const char *source, apr_size_t len)
+{
+  if (len > 7)
+    {
+      memcpy(target, source, len);
+      target += len;
+    }
+  else
+    {
+      /* memcpy is not exactly fast for small block sizes.
+       * Since they are common, let's run optimized code for them. */
+      const char *end = source + len;
+      for (; source != end; source++)
+        *(target++) = *source;
+    }
+
+  return target;
+}
+
+/* Copy LEN bytes from SOURCE to TARGET.  Unlike memmove() or memcpy(),
+ * create repeating patterns if the source and target ranges overlap.
+ * Return a pointer to the first byte after the copied target range.  */
+static APR_INLINE char *
+patterning_copy(char *target, const char *source, apr_size_t len)
+{
+  const char *end = source + len;
+
+  /* On the majority of machines (x86 / x64), unaligned access is much
+   * cheaper than repeated aligned access.  Therefore, use chunky copies on
+   * these machines when feasible.
+   * For those machines, GCC, ICC and MSC will define one of the following: */
+#if defined(_M_IX86) || defined(_M_X64) || defined(i386) || defined(__x86_64)
+
+  if (end + sizeof(apr_uint32_t) <= target)
+    {
+      /* Source and target are at least 4 bytes apart, so 4-byte chunks
+       * cannot overlap and the repeat pattern is still produced.  */
+      for (; source + sizeof(apr_uint32_t) <= end;
+           source += sizeof(apr_uint32_t),
+           target += sizeof(apr_uint32_t))
+        *(apr_uint32_t *)target = *(const apr_uint32_t *)source;
+    }
+
+#endif
+
+  /* fall through to byte-wise copy (either for the below-chunk-size tail
+   * or the whole copy) */
+  for (; source != end; source++)
+    *(target++) = *source;
+
+  return target;
+}
 
 void
 svn_txdelta_apply_instructions(svn_txdelta_window_t *window,
@@ -571,7 +638,7 @@ svn_txdelta_apply_instructions(svn_txdel
                                apr_size_t *tlen)
 {
   const svn_txdelta_op_t *op;
-  apr_size_t i, j, tpos = 0;
+  apr_size_t tpos = 0;
 
   for (op = window->ops; op < window->ops + window->num_ops; op++)
     {
@@ -586,25 +653,26 @@ svn_txdelta_apply_instructions(svn_txdel
         case svn_txdelta_source:
           /* Copy from source area.  */
           assert(op->offset + op->length <= window->sview_len);
-          memcpy(tbuf + tpos, sbuf + op->offset, buf_len);
+          fast_memcpy(tbuf + tpos, sbuf + op->offset, buf_len);
           break;
 
         case svn_txdelta_target:
-          /* Copy from target area.  Don't use memcpy() since its
-             semantics aren't guaranteed for overlapping memory areas,
-             and target copies are allowed to overlap to generate
-             repeated data.  */
+          /* Copy from target area.  We can't use memcpy() or the like
+           * since we need a specific semantics for overlapping copies:
+           * they must result in repeating patterns.
+           * Note that most copies won't have overlapping source and
+           * target ranges (they are just a result of self-compressed
+           * data) but a small percentage will.  */
           assert(op->offset < tpos);
-          for (i = op->offset, j = tpos; i < op->offset + buf_len; i++)
-            tbuf[j++] = tbuf[i];
+          patterning_copy(tbuf + tpos, tbuf + op->offset, buf_len);
           break;
 
         case svn_txdelta_new:
           /* Copy from window new area.  */
           assert(op->offset + op->length <= window->new_data->len);
-          memcpy(tbuf + tpos,
-                 window->new_data->data + op->offset,
-                 buf_len);
+          fast_memcpy(tbuf + tpos,
+                      window->new_data->data + op->offset,
+                      buf_len);
           break;
 
         default:



Re: svn commit: r956593 - /subversion/trunk/subversion/libsvn_delta/text_delta.c

Posted by Blair Zajac <bl...@orcaware.com>.
On Jun 21, 2010, at 9:56 AM, julianfoad@apache.org wrote:

> Author: julianfoad
> Date: Mon Jun 21 13:56:30 2010
> New Revision: 956593
>
> URL: http://svn.apache.org/viewvc?rev=956593&view=rev
> Log:
> Optimize the copies performed by svn_txdelta_apply_instructions().
>
> svn_txdelta_apply_instructions() is relatively slow for long  
> instruction
> sequences copying small pieces of data.  This is particularly  
> visible in
> "format 2" FSFS repositories.
>
> Two kinds of copy are involved.  For simple copies, the system's  
> memcpy()
> is used, which is fast for long lengths; this patch bypasses it for  
> very
> short lengths.  For intentionally overlapping copies, a custom loop  
> is used,
> which was already fast for very short lengths; this patch adds a  
> code path
> that is fast for longer lengths.

Hi Julian,

Does this turn out to have a measurable performance improvement? I  
would guess svn is limited by IO speeds with fsync(), not memory copies.

Not that I don't appreciate performance improvements, running a  
backend that gets over 2 commits per second :), I'm just curious.

Regards,
Blair