You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@subversion.apache.org by st...@apache.org on 2014/04/13 06:40:41 UTC

svn commit: r1586922 - in /subversion/trunk/subversion: include/private/svn_io_private.h include/svn_types.h libsvn_subr/io.c libsvn_subr/stream.c

Author: stefan2
Date: Sun Apr 13 04:40:40 2014
New Revision: 1586922

URL: http://svn.apache.org/r1586922
Log:
Speed up file / stream comparison, i.e. minimize the processing overhead
for finding the first mismatch.

The approach is two-sided.  Instead of fetching SVN__STREAM_CHUNK_SIZE
from all sources before comparing data, we start with a much lower 4kB
and increase the chunk size until we reach SVN__STREAM_CHUNK_SIZE while
making sure that all reads are naturally aligned.  So, we quickly find
mismatches near the beginning of the file.

On the other end side, we bump the SVN__STREAM_CHUNK_SIZE to 64kB which
gives better throughput for longer distances to the first mismatch -
without causing ill effects in APR's memory management.

* subversion/include/svn_types.h
  (SVN__STREAM_CHUNK_SIZE): Bump to 64k and add some documentation on
                            the general restrictions for future changes.

* subversion/include/private/svn_io_private.h
  (svn_io__next_chunk_size): New utility function generating the new
                             read block size sequence.

* subversion/libsvn_subr/io.c
  (svn_io__next_chunk_size): Implement.
  (contents_identical_p,
   contents_three_identical_p): Let the new utility determine the read
                                block size.

* subversion/libsvn_subr/stream.c
  (svn_stream_contents_same2): Ditto.

Modified:
    subversion/trunk/subversion/include/private/svn_io_private.h
    subversion/trunk/subversion/include/svn_types.h
    subversion/trunk/subversion/libsvn_subr/io.c
    subversion/trunk/subversion/libsvn_subr/stream.c

Modified: subversion/trunk/subversion/include/private/svn_io_private.h
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/include/private/svn_io_private.h?rev=1586922&r1=1586921&r2=1586922&view=diff
==============================================================================
--- subversion/trunk/subversion/include/private/svn_io_private.h (original)
+++ subversion/trunk/subversion/include/private/svn_io_private.h Sun Apr 13 04:40:40 2014
@@ -72,6 +72,14 @@ svn_io__is_finfo_read_only(svn_boolean_t
                            apr_finfo_t *file_info,
                            apr_pool_t *pool);
 
+/** Given that @a total_read bytes have already been read from a file or
+ * stream, return a suggestion for the size of the next block to process.
+ * This value will be <= #SVN__STREAM_CHUNK_SIZE.
+ *
+ * @since New in 1.9.
+ */
+apr_size_t
+svn_io__next_chunk_size(apr_off_t total_read);
 
 /** Buffer test handler function for a generic stream. @see svn_stream_t
  * and svn_stream__is_buffered().

Modified: subversion/trunk/subversion/include/svn_types.h
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/include/svn_types.h?rev=1586922&r1=1586921&r2=1586922&view=diff
==============================================================================
--- subversion/trunk/subversion/include/svn_types.h (original)
+++ subversion/trunk/subversion/include/svn_types.h Sun Apr 13 04:40:40 2014
@@ -1142,8 +1142,12 @@ typedef svn_error_t *(*svn_commit_callba
  *
  * NOTE: This is an internal macro, put here for convenience.
  * No public API may depend on the particular value of this macro.
+ *
+ * NOTE: The implementation assumes that this is a power of two >= 4k.
+ *       Moreover, it should be less than 80kB to prevent memory
+ *       fragmentation in the APR memory allocator.
  */
-#define SVN__STREAM_CHUNK_SIZE 16384
+#define SVN__STREAM_CHUNK_SIZE 0x10000
 #endif
 
 /** The maximum amount we can ever hold in memory. */

Modified: subversion/trunk/subversion/libsvn_subr/io.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/io.c?rev=1586922&r1=1586921&r2=1586922&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/io.c (original)
+++ subversion/trunk/subversion/libsvn_subr/io.c Sun Apr 13 04:40:40 2014
@@ -63,6 +63,7 @@
 #include "svn_config.h"
 #include "svn_private_config.h"
 #include "svn_ctype.h"
+#include "svn_sorts.h"
 
 #include "private/svn_atomic.h"
 #include "private/svn_io_private.h"
@@ -4501,6 +4502,17 @@ svn_io_read_version_file(int *version,
 }
 
 
+apr_size_t
+svn_io__next_chunk_size(apr_off_t total_read)
+{
+  /* Started with total_read===, this will generate a sequence ensuring
+     aligned access with increasing block size up to SVN__STREAM_CHUNK_SIZE:
+     4k@ offset 0, 4k@ offset 4k, 8k@ offset 8k, 16k@ offset 16k etc.
+     */
+  return total_read ? (apr_size_t)MIN(total_read, SVN__STREAM_CHUNK_SIZE)
+                    : (apr_size_t)4096;
+}
+
 
 /* Do a byte-for-byte comparison of FILE1 and FILE2. */
 static svn_error_t *
@@ -4517,6 +4529,7 @@ contents_identical_p(svn_boolean_t *iden
   apr_file_t *file2_h;
   svn_boolean_t eof1 = FALSE;
   svn_boolean_t eof2 = FALSE;
+  apr_off_t total_read = 0;
 
   SVN_ERR(svn_io_file_open(&file1_h, file1, APR_READ, APR_OS_DEFAULT,
                            pool));
@@ -4532,14 +4545,17 @@ contents_identical_p(svn_boolean_t *iden
   *identical_p = TRUE;  /* assume TRUE, until disproved below */
   while (!err && !eof1 && !eof2)
     {
+      apr_size_t to_read = svn_io__next_chunk_size(total_read);
+      total_read += to_read;
+
       err = svn_io_file_read_full2(file1_h, buf1,
-                                   SVN__STREAM_CHUNK_SIZE, &bytes_read1,
+                                   to_read, &bytes_read1,
                                    &eof1, pool);
       if (err)
           break;
 
       err = svn_io_file_read_full2(file2_h, buf2,
-                                   SVN__STREAM_CHUNK_SIZE, &bytes_read2,
+                                   to_read, &bytes_read2,
                                    &eof2, pool);
       if (err)
           break;
@@ -4585,6 +4601,7 @@ contents_three_identical_p(svn_boolean_t
   svn_boolean_t eof1 = FALSE;
   svn_boolean_t eof2 = FALSE;
   svn_boolean_t eof3 = FALSE;
+  apr_off_t total_read = 0;
 
   SVN_ERR(svn_io_file_open(&file1_h, file1, APR_READ, APR_OS_DEFAULT,
                            scratch_pool));
@@ -4621,6 +4638,9 @@ contents_three_identical_p(svn_boolean_t
       apr_size_t bytes_read1, bytes_read2, bytes_read3;
       svn_boolean_t read_1, read_2, read_3;
 
+      apr_size_t to_read = svn_io__next_chunk_size(total_read);
+      total_read += to_read;
+
       read_1 = read_2 = read_3 = FALSE;
 
       /* As long as a file is not at the end yet, and it is still
@@ -4628,8 +4648,8 @@ contents_three_identical_p(svn_boolean_t
       if (!eof1 && (*identical_p12 || *identical_p13))
         {
           err = svn_io_file_read_full2(file1_h, buf1,
-                                   SVN__STREAM_CHUNK_SIZE, &bytes_read1,
-                                   &eof1, scratch_pool);
+                                       to_read, &bytes_read1,
+                                       &eof1, scratch_pool);
           if (err)
               break;
           read_1 = TRUE;
@@ -4638,8 +4658,8 @@ contents_three_identical_p(svn_boolean_t
       if (!eof2 && (*identical_p12 || *identical_p23))
         {
           err = svn_io_file_read_full2(file2_h, buf2,
-                                   SVN__STREAM_CHUNK_SIZE, &bytes_read2,
-                                   &eof2, scratch_pool);
+                                       to_read, &bytes_read2,
+                                       &eof2, scratch_pool);
           if (err)
               break;
           read_2 = TRUE;
@@ -4648,8 +4668,8 @@ contents_three_identical_p(svn_boolean_t
       if (!eof3 && (*identical_p13 || *identical_p23))
         {
           err = svn_io_file_read_full2(file3_h, buf3,
-                                   SVN__STREAM_CHUNK_SIZE, &bytes_read3,
-                                   &eof3, scratch_pool);
+                                       to_read, &bytes_read3,
+                                       &eof3, scratch_pool);
           if (err)
               break;
           read_3 = TRUE;

Modified: subversion/trunk/subversion/libsvn_subr/stream.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/stream.c?rev=1586922&r1=1586921&r2=1586922&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/stream.c (original)
+++ subversion/trunk/subversion/libsvn_subr/stream.c Sun Apr 13 04:40:40 2014
@@ -585,14 +585,20 @@ svn_stream_contents_same2(svn_boolean_t 
 {
   char *buf1 = apr_palloc(pool, SVN__STREAM_CHUNK_SIZE);
   char *buf2 = apr_palloc(pool, SVN__STREAM_CHUNK_SIZE);
-  apr_size_t bytes_read1 = SVN__STREAM_CHUNK_SIZE;
-  apr_size_t bytes_read2 = SVN__STREAM_CHUNK_SIZE;
+  apr_size_t to_read = 0;
+  apr_size_t bytes_read1 = 0;
+  apr_size_t bytes_read2 = 0;
+  apr_off_t total_read = 0;
   svn_error_t *err = NULL;
 
   *same = TRUE;  /* assume TRUE, until disproved below */
-  while (bytes_read1 == SVN__STREAM_CHUNK_SIZE
-         && bytes_read2 == SVN__STREAM_CHUNK_SIZE)
+  while (bytes_read1 == to_read && bytes_read2 == to_read)
     {
+      to_read = svn_io__next_chunk_size(total_read);
+      bytes_read1 = to_read;
+      bytes_read2 = to_read;
+      total_read += to_read;
+
       err = svn_stream_read_full(stream1, buf1, &bytes_read1);
       if (err)
         break;

Re: svn commit: r1586922 - in /subversion/trunk/subversion: include/private/svn_io_private.h include/svn_types.h libsvn_subr/io.c libsvn_subr/stream.c

Posted by Branko Čibej <br...@wandisco.com>.

On 18.04.2014 18:21, Ivan Zhakov wrote:
> On 18 April 2014 15:46, Stefan Fuhrmann <st...@wandisco.com> wrote:
>> On Fri, Apr 18, 2014 at 10:06 AM, Ivan Zhakov <iv...@visualsvn.com> wrote:
>>
>>> Sorry, may be I was not clear enough, but I was vetoing this change.
>>> Given no reply, I'm going to revert it.
>>
>> [Been AFK for the last 3 days]
>>
>> Don't revert; your veto is based on faulty measurements & assumptions.
>> Here the shortlist:
>>
> My veto is still my veto, you cannot judge it.

Bullshit. If it's based on wrong data, it can't be valid.

(Please note that I'm not evaluating the reasons for the veto, just your
statement that you can arbitrarily veto anything for any reason at all.)

-- Brane


-- 
Branko Čibej | Director of Subversion
WANdisco // Non-Stop Data
e. brane@wandisco.com

Re: svn commit: r1586922 - in /subversion/trunk/subversion: include/private/svn_io_private.h include/svn_types.h libsvn_subr/io.c libsvn_subr/stream.c

Posted by Stefan Fuhrmann <st...@wandisco.com>.

On Fri, Apr 18, 2014 at 6:21 PM, Ivan Zhakov <iv...@visualsvn.com> wrote:

> On 18 April 2014 15:46, Stefan Fuhrmann <st...@wandisco.com>
> wrote:
> > On Fri, Apr 18, 2014 at 10:06 AM, Ivan Zhakov <iv...@visualsvn.com>
> wrote:
> >>
> >> On 14 April 2014 20:11, Ivan Zhakov <iv...@visualsvn.com> wrote:
> >> > On 13 April 2014 08:40,  <st...@apache.org> wrote:
> >> >> Author: stefan2
> >> >> Date: Sun Apr 13 04:40:40 2014
> >> >> New Revision: 1586922
> >> >>
> >> >> URL: http://svn.apache.org/r1586922
> >> >> Log:
> >> >> Speed up file / stream comparison, i.e. minimize the processing
> >> >> overhead
> >> >> for finding the first mismatch.
> >> >>
> >> >> The approach is two-sided.  Instead of fetching
> SVN__STREAM_CHUNK_SIZE
> >> >> from all sources before comparing data, we start with a much lower
> 4kB
> >> >> and increase the chunk size until we reach SVN__STREAM_CHUNK_SIZE
> while
> >> >> making sure that all reads are naturally aligned.  So, we quickly
> find
> >> >> mismatches near the beginning of the file.
> >> >>
> >> >> On the other end side, we bump the SVN__STREAM_CHUNK_SIZE to 64kB
> which
> >> >> gives better throughput for longer distances to the first mismatch -
> >> >> without causing ill effects in APR's memory management.
> >> >>
> >> >> * subversion/include/svn_types.h
> >> >>   (SVN__STREAM_CHUNK_SIZE): Bump to 64k and add some documentation on
> >> >>                             the general restrictions for future
> >> >> changes.
> >> >>
> >> > Hi Stefan,
> >> >
> >> > You effectively reverted my recent fix (r1581296) for high memory
> >> > usage with many repositories open at the same time (about 500k per
> >> > repository. Also please consider r857671 and discussion before that
> >> > commit:
> >> > http://svn.haxx.se/dev/archive-2004-11/0123.shtml
> >> >
> >> > So please revert.
> >> >
> >> Sorry, may be I was not clear enough, but I was vetoing this change.
> >> Given no reply, I'm going to revert it.
> >
> >
> > [Been AFK for the last 3 days]
> >
> > Don't revert; your veto is based on faulty measurements & assumptions.
> > Here the shortlist:
> >
> My veto is still my veto, you cannot judge it.
>

But of course, I can judge it and do many other things to it as
I please.

You are right that I could not veto your veto - and I'm not trying
to do that. I'll try, however, to demonstrate that it does not have
a technical justification. If I succeed, it has never been a valid
veto in the first place.

You don't get the point: you changed *global* constant used

Well, one point of global constants is their central changeability
without jeopardizing consistency, i.e. they may assume any value
without breaking the code (insert a gazillion exceptions here).
They should not be changed wantonly, though.

> in
> multiple situation for just FSFS optimization. The value for this
> constant was discussed long time ago and was reduced for serious
> reasons. You changed it just to fix specific FSFS cases. I believe
> it's wrong way for coding.
>

My change was in way related to FSFS and I doubt that it would
have any impact at all: Config files don't care as they are too small
and repo I/O is using an entirely independent block size (or just
the APR 4k default).

You, however, made the initial change to config_file.c all about
repositories - and since you are not using BDB - about FSFS:

On Fri, Apr 18, 2014 at 10:06 AM, Ivan Zhakov <iv...@visualsvn.com> wrote:
>
> You effectively reverted my recent fix (r1581296) for high memory
> usage with many repositories open at the same time (about 500k per
> repository. Also please consider r857671 and discussion before that
> commit:
> http://svn.haxx.se/dev/archive-2004-11/0123.shtml
>

The 500k claim is demonstrably false. It is about 150k in 1.7,
200k in 1.8 and 46k on /trunk before recent optimizations that
brought it down to 10k. The 1.7 and 1.8 usage was due to every
membuffer cache frontend using a private scratch pool.

So, your reasoning for the initial patch as been faulty and cannot
serve as a basis to veto further change. The patch itself is fine,
though, since it now uses our standard constant for buffer sizes.

The post you are citing is the same that I already mentioned in
my initial reply. I clearly says that 64k is just as well as 16k.
So, it does not counter my reasoning. However, 10 years have
passed and seen the raise of SSDs that are only now unveiling
their full potential.

For cold disc caches, I begin see improvements in 'svn st' on
large 'touched' files when the disc throughput is 1GB/s or better.
For hot disc caches (commit after status), the gain is ~15%.
That was the motivation for my commit.

On Fri, Apr 18, 2014 at 6:21 PM, Ivan Zhakov <iv...@visualsvn.com> wrote:

> You also made libsvn_wc a bit slower, because while your 'slow start'
> compare algorithm may be works fine for FSFS it doesn't work well for
> comparing on disk files in WC.
>

Do you have any numbers to back up your claim? Here are mine
for the test suite execution on my macbook (Linux, ramdisk):

r1587968 1m54.371s (ra_svn) 1m40.437s (ra_local)
r1586922 1m54.608s (ra_svn) 1m41.495s (ra_local)
r1586921 1m54.208s (ra_svn) 1m41.936s (ra_local)

So, there is a mild indication that the 'svn log' improvements
might have helped but on the whole the results are as expected:
file sizes are very small and histories are short; tuning for
throughput should not help much.

> > * SVN__STREAM_CHUNK_SIZE is only to be used with files and
> >   streams (duh!) - not with data structures that persist in RAM.
> >   If your use of that constant were to actually effect the size of a
> >   repo instance, that use would be faulty.
> >
> > * parse_context_t is only instantiated in one place,
> > svn_config__parse_stream
> >   and will be destroyed at the end of that function. So, there is no
> >   persistent memory use from this one.
> This depends on allocator and heap behavior.
>

Only in the sense that freed memory can be reused when
needed. I guess that is a safe assumption - unless you
want to claim that using pools is pointless after all.

> > * According to *my* measurements, a FSFS instance takes about
> >   46kB just after fs_open() and under 64 bit Unix. Of those 46k,
> >   64k were used - according to you - for the stream buffer alone ...
> >
> Yes, on trunk FSFS instances doesn't use so much memory, because
> specific subpools added and APR buffered files avoided. But this
> issues can reappear again in future, like it was in Subversion 1.8.x.
>

You probably had a systematic error in your measurements.
To measure marginal memory usage, you need to open
thousands of repo instances (may always be the same on disk).

My macbook now happily opens 1M repos without running OOM.
What is your test scenario?

So, I'm still veto this change. Please revert it. Then I suggest you
> make this stream comparision algorithm private for FSFS, becuase it's
> makes sense only for 'expensive' streams.
>

I'm not aware that FSFS would need any stream comparison
because it is all hashsum based. Again, FSFS is *not* the
reason for the change.

Is there any reason left unaddressed, why you veto the change?

> I'm ready to discuss increasing SVN__STREAM_CHUNK_SIZE to 64k or any
> other value, if there are serous reasons for doing this, but only
> after reverting this change from trunk.
>

-- Stefan^2.

Re: svn commit: r1586922 - in /subversion/trunk/subversion: include/private/svn_io_private.h include/svn_types.h libsvn_subr/io.c libsvn_subr/stream.c

Posted by Ivan Zhakov <iv...@visualsvn.com>.

On 18 April 2014 15:46, Stefan Fuhrmann <st...@wandisco.com> wrote:
> On Fri, Apr 18, 2014 at 10:06 AM, Ivan Zhakov <iv...@visualsvn.com> wrote:
>>
>> On 14 April 2014 20:11, Ivan Zhakov <iv...@visualsvn.com> wrote:
>> > On 13 April 2014 08:40,  <st...@apache.org> wrote:
>> >> Author: stefan2
>> >> Date: Sun Apr 13 04:40:40 2014
>> >> New Revision: 1586922
>> >>
>> >> URL: http://svn.apache.org/r1586922
>> >> Log:
>> >> Speed up file / stream comparison, i.e. minimize the processing
>> >> overhead
>> >> for finding the first mismatch.
>> >>
>> >> The approach is two-sided.  Instead of fetching SVN__STREAM_CHUNK_SIZE
>> >> from all sources before comparing data, we start with a much lower 4kB
>> >> and increase the chunk size until we reach SVN__STREAM_CHUNK_SIZE while
>> >> making sure that all reads are naturally aligned.  So, we quickly find
>> >> mismatches near the beginning of the file.
>> >>
>> >> On the other end side, we bump the SVN__STREAM_CHUNK_SIZE to 64kB which
>> >> gives better throughput for longer distances to the first mismatch -
>> >> without causing ill effects in APR's memory management.
>> >>
>> >> * subversion/include/svn_types.h
>> >>   (SVN__STREAM_CHUNK_SIZE): Bump to 64k and add some documentation on
>> >>                             the general restrictions for future
>> >> changes.
>> >>
>> > Hi Stefan,
>> >
>> > You effectively reverted my recent fix (r1581296) for high memory
>> > usage with many repositories open at the same time (about 500k per
>> > repository. Also please consider r857671 and discussion before that
>> > commit:
>> > http://svn.haxx.se/dev/archive-2004-11/0123.shtml
>> >
>> > So please revert.
>> >
>> Sorry, may be I was not clear enough, but I was vetoing this change.
>> Given no reply, I'm going to revert it.
>
>
> [Been AFK for the last 3 days]
>
> Don't revert; your veto is based on faulty measurements & assumptions.
> Here the shortlist:
>
My veto is still my veto, you cannot judge it.

You don't get the point: you changed *global* constant used in
multiple situation for just FSFS optimization. The value for this
constant was discussed long time ago and was reduced for serious
reasons. You changed it just to fix specific FSFS cases. I believe
it's wrong way for coding.

You also made libsvn_wc a bit slower, because while your 'slow start'
compare algorithm may be works fine for FSFS it doesn't work well for
comparing on disk files in WC.

> * SVN__STREAM_CHUNK_SIZE is only to be used with files and
>   streams (duh!) - not with data structures that persist in RAM.
>   If your use of that constant were to actually effect the size of a
>   repo instance, that use would be faulty.
>
> * parse_context_t is only instantiated in one place,
> svn_config__parse_stream
>   and will be destroyed at the end of that function. So, there is no
>   persistent memory use from this one.
This depends on allocator and heap behavior.

>
> * According to *my* measurements, a FSFS instance takes about
>   46kB just after fs_open() and under 64 bit Unix. Of those 46k,
>   64k were used - according to you - for the stream buffer alone ...
>
Yes, on trunk FSFS instances doesn't use so much memory, because
specific subpools added and APR buffered files avoided. But this
issues can reappear again in future, like it was in Subversion 1.8.x.

So, I'm still veto this change. Please revert it. Then I suggest you
make this stream comparision algorithm private for FSFS, becuase it's
makes sense only for 'expensive' streams.

I'm ready to discuss increasing SVN__STREAM_CHUNK_SIZE to 64k or any
other value, if there are serous reasons for doing this, but only
after reverting this change from trunk.

-- 
Ivan Zhakov

Re: svn commit: r1586922 - in /subversion/trunk/subversion: include/private/svn_io_private.h include/svn_types.h libsvn_subr/io.c libsvn_subr/stream.c

Posted by Stefan Fuhrmann <st...@wandisco.com>.

On Fri, Apr 18, 2014 at 10:06 AM, Ivan Zhakov <iv...@visualsvn.com> wrote:

> On 14 April 2014 20:11, Ivan Zhakov <iv...@visualsvn.com> wrote:
> > On 13 April 2014 08:40,  <st...@apache.org> wrote:
> >> Author: stefan2
> >> Date: Sun Apr 13 04:40:40 2014
> >> New Revision: 1586922
> >>
> >> URL: http://svn.apache.org/r1586922
> >> Log:
> >> Speed up file / stream comparison, i.e. minimize the processing overhead
> >> for finding the first mismatch.
> >>
> >> The approach is two-sided.  Instead of fetching SVN__STREAM_CHUNK_SIZE
> >> from all sources before comparing data, we start with a much lower 4kB
> >> and increase the chunk size until we reach SVN__STREAM_CHUNK_SIZE while
> >> making sure that all reads are naturally aligned.  So, we quickly find
> >> mismatches near the beginning of the file.
> >>
> >> On the other end side, we bump the SVN__STREAM_CHUNK_SIZE to 64kB which
> >> gives better throughput for longer distances to the first mismatch -
> >> without causing ill effects in APR's memory management.
> >>
> >> * subversion/include/svn_types.h
> >>   (SVN__STREAM_CHUNK_SIZE): Bump to 64k and add some documentation on
> >>                             the general restrictions for future changes.
> >>
> > Hi Stefan,
> >
> > You effectively reverted my recent fix (r1581296) for high memory
> > usage with many repositories open at the same time (about 500k per
> > repository. Also please consider r857671 and discussion before that
> > commit:
> > http://svn.haxx.se/dev/archive-2004-11/0123.shtml
> >
> > So please revert.
> >
> Sorry, may be I was not clear enough, but I was vetoing this change.
> Given no reply, I'm going to revert it.
>

[Been AFK for the last 3 days]

Don't revert; your veto is based on faulty measurements & assumptions.
Here the shortlist:

* SVN__STREAM_CHUNK_SIZE is only to be used with files and
  streams (duh!) - not with data structures that persist in RAM.
  If your use of that constant were to actually effect the size of a
  repo instance, that use would be faulty.

* parse_context_t is only instantiated in one place,
svn_config__parse_stream
  and will be destroyed at the end of that function. So, there is no
  persistent memory use from this one.

* According to *my* measurements, a FSFS instance takes about
  46kB just after fs_open() and under 64 bit Unix. Of those 46k,
  64k were used - according to you - for the stream buffer alone ...

So, I doubt you actually verified by how much the marginal (as in
costs per new instance in the long run) memory consumption went
down due to your initial patch.

In the meanwhile, I'm looking into ways to reduce FSFS instance
size to something like 10k. There are several places where we
could be tighter with our pool usage.

-- Stefan^2.

Re: svn commit: r1586922 - in /subversion/trunk/subversion: include/private/svn_io_private.h include/svn_types.h libsvn_subr/io.c libsvn_subr/stream.c

Posted by Ivan Zhakov <iv...@visualsvn.com>.

On 14 April 2014 20:11, Ivan Zhakov <iv...@visualsvn.com> wrote:
> On 13 April 2014 08:40,  <st...@apache.org> wrote:
>> Author: stefan2
>> Date: Sun Apr 13 04:40:40 2014
>> New Revision: 1586922
>>
>> URL: http://svn.apache.org/r1586922
>> Log:
>> Speed up file / stream comparison, i.e. minimize the processing overhead
>> for finding the first mismatch.
>>
>> The approach is two-sided.  Instead of fetching SVN__STREAM_CHUNK_SIZE
>> from all sources before comparing data, we start with a much lower 4kB
>> and increase the chunk size until we reach SVN__STREAM_CHUNK_SIZE while
>> making sure that all reads are naturally aligned.  So, we quickly find
>> mismatches near the beginning of the file.
>>
>> On the other end side, we bump the SVN__STREAM_CHUNK_SIZE to 64kB which
>> gives better throughput for longer distances to the first mismatch -
>> without causing ill effects in APR's memory management.
>>
>> * subversion/include/svn_types.h
>>   (SVN__STREAM_CHUNK_SIZE): Bump to 64k and add some documentation on
>>                             the general restrictions for future changes.
>>
> Hi Stefan,
>
> You effectively reverted my recent fix (r1581296) for high memory
> usage with many repositories open at the same time (about 500k per
> repository. Also please consider r857671 and discussion before that
> commit:
> http://svn.haxx.se/dev/archive-2004-11/0123.shtml
>
> So please revert.
>
Sorry, may be I was not clear enough, but I was vetoing this change.
Given no reply, I'm going to revert it.

-- 
Ivan Zhakov
CTO | VisualSVN | http://www.visualsvn.com

Re: svn commit: r1586922 - in /subversion/trunk/subversion: include/private/svn_io_private.h include/svn_types.h libsvn_subr/io.c libsvn_subr/stream.c

Posted by Ivan Zhakov <iv...@visualsvn.com>.

On 13 April 2014 08:40,  <st...@apache.org> wrote:
> Author: stefan2
> Date: Sun Apr 13 04:40:40 2014
> New Revision: 1586922
>
> URL: http://svn.apache.org/r1586922
> Log:
> Speed up file / stream comparison, i.e. minimize the processing overhead
> for finding the first mismatch.
>
> The approach is two-sided.  Instead of fetching SVN__STREAM_CHUNK_SIZE
> from all sources before comparing data, we start with a much lower 4kB
> and increase the chunk size until we reach SVN__STREAM_CHUNK_SIZE while
> making sure that all reads are naturally aligned.  So, we quickly find
> mismatches near the beginning of the file.
>
> On the other end side, we bump the SVN__STREAM_CHUNK_SIZE to 64kB which
> gives better throughput for longer distances to the first mismatch -
> without causing ill effects in APR's memory management.
>
> * subversion/include/svn_types.h
>   (SVN__STREAM_CHUNK_SIZE): Bump to 64k and add some documentation on
>                             the general restrictions for future changes.
>
Hi Stefan,

You effectively reverted my recent fix (r1581296) for high memory
usage with many repositories open at the same time (about 500k per
repository. Also please consider r857671 and discussion before that
commit:
http://svn.haxx.se/dev/archive-2004-11/0123.shtml

So please revert.

> * subversion/include/private/svn_io_private.h
>   (svn_io__next_chunk_size): New utility function generating the new
>                              read block size sequence.
>
> * subversion/libsvn_subr/io.c
>   (svn_io__next_chunk_size): Implement.
>   (contents_identical_p,
>    contents_three_identical_p): Let the new utility determine the read
>                                 block size.
>
> * subversion/libsvn_subr/stream.c
>   (svn_stream_contents_same2): Ditto.
>
This optimization makes sense only for FSFS case where fetching
text/property content is expensive, but it doesn't makes sense for
comparing on disk files. So could you please make this optimization
FSFS specific, without affecting generic API and behavior. Probably it
better to make just smaller  chunk size for FSFS case btw. Thanks in
advance.

-- 
Ivan Zhakov
CTO | VisualSVN | http://www.visualsvn.com

Re: svn commit: r1586922 - in /subversion/trunk/subversion: include/private/svn_io_private.h include/svn_types.h libsvn_subr/io.c libsvn_subr/stream.c

Posted by Stefan Fuhrmann <st...@wandisco.com>.

On Sun, Apr 13, 2014 at 5:28 PM, Bert Huijben <be...@qqmail.nl> wrote:

>
>
> > -----Original Message-----
> > From: stefan2@apache.org [mailto:stefan2@apache.org]
> > Sent: zondag 13 april 2014 06:41
> > To: commits@subversion.apache.org
> > Subject: svn commit: r1586922 - in /subversion/trunk/subversion:
> > include/private/svn_io_private.h include/svn_types.h libsvn_subr/io.c
> > libsvn_subr/stream.c
> >
> > Author: stefan2
> > Date: Sun Apr 13 04:40:40 2014
> > New Revision: 1586922
> >
> > URL: http://svn.apache.org/r1586922
> > Log:
> > Speed up file / stream comparison, i.e. minimize the processing overhead
> > for finding the first mismatch.
> >
> > The approach is two-sided.  Instead of fetching SVN__STREAM_CHUNK_SIZE
> > from all sources before comparing data, we start with a much lower 4kB
> > and increase the chunk size until we reach SVN__STREAM_CHUNK_SIZE
> > while
> > making sure that all reads are naturally aligned.  So, we quickly find
> > mismatches near the beginning of the file.
> >
> > On the other end side, we bump the SVN__STREAM_CHUNK_SIZE to 64kB
> > which
> > gives better throughput for longer distances to the first mismatch -
> > without causing ill effects in APR's memory management.
> >
> > * subversion/include/svn_types.h
> >   (SVN__STREAM_CHUNK_SIZE): Bump to 64k and add some documentation
> > on
> >                             the general restrictions for future changes.
>
> This reverts recent changes to reduce server side load... (see config file
> behavior changes, recently discussed on dev@s.a.o")

> Please revert this and use a different variable...
>

I disagree. The documentation clearly states the application
of SVN__STREAM_CHUNK_SIZE. See also
http://svn.apache.org/viewvc?view=revision&revision=r857671
http://svn.haxx.se/dev/archive-2004-11/0123.shtml

If you accidentally used for a different purpose, it would
be you who had to fix that.

However, I assume you are talking about r1581296. The critical
bit there was to go from 100k (non-paged = fragmented allocation
in APR pool allocator) to < 80k (paged = non-fragmented) for
the whole parse_context_t. This is not violated by bumping
the buffer size to 64k.

> And I really wonder how you determined that the rest of the patch is going
> to help *for all users* both client and server side.
>

I don't have an obligation to verify that *all* users are going
to benefit but I determined the following (elaborating on what
is already stated in the commit message and the code itself):

* Increasing the allocation size for buffers from 16k to 64k
  will not affect allocation performance. It will also no affect
  overall memory consumption if those buffers are temporary
  (this condition is met by file and stream buffers).

* Functions that allow for an early exit (in this patch: file /
  stream comparison) can now exit after reading 8/12k
  instead of 32/48k. So, they are now slightly faster than
  before and won't get hurt by the larger ultimate chunk size.

There may be more functions that have a regular (non-exceptional)
early exit and would benefit from svn_io__next_chunk_size.
A quick scan over all users of SVN__STREAM_CHUNK_SIZE
didn't bring up something obvious, though.

The driver of this patch has been handling working copies
with large binaries in it that got "touched" but not modified.
Using a larger chunk size gave a 15% higher throughput
in 'svn st' on my Macbook.

Working copies on NFS volumes with "normal" file changes
should benefit from the reduced initial chunk size. I haven't
tested that, though. Local drives don't show any over 16k
but something like 2..5% over using a fixed 64k block size.

Do you have a scenario in mind where this change is actually
counter-productive? According to conventional wisdom, larger
chunks should always tend to be faster than smaller ones.

-- Stefan^2.

RE: svn commit: r1586922 - in /subversion/trunk/subversion: include/private/svn_io_private.h include/svn_types.h libsvn_subr/io.c libsvn_subr/stream.c

Posted by Bert Huijben <be...@qqmail.nl>.


> -----Original Message-----
> From: stefan2@apache.org [mailto:stefan2@apache.org]
> Sent: zondag 13 april 2014 06:41
> To: commits@subversion.apache.org
> Subject: svn commit: r1586922 - in /subversion/trunk/subversion:
> include/private/svn_io_private.h include/svn_types.h libsvn_subr/io.c
> libsvn_subr/stream.c
> 
> Author: stefan2
> Date: Sun Apr 13 04:40:40 2014
> New Revision: 1586922
> 
> URL: http://svn.apache.org/r1586922
> Log:
> Speed up file / stream comparison, i.e. minimize the processing overhead
> for finding the first mismatch.
> 
> The approach is two-sided.  Instead of fetching SVN__STREAM_CHUNK_SIZE
> from all sources before comparing data, we start with a much lower 4kB
> and increase the chunk size until we reach SVN__STREAM_CHUNK_SIZE
> while
> making sure that all reads are naturally aligned.  So, we quickly find
> mismatches near the beginning of the file.
> 
> On the other end side, we bump the SVN__STREAM_CHUNK_SIZE to 64kB
> which
> gives better throughput for longer distances to the first mismatch -
> without causing ill effects in APR's memory management.
> 
> * subversion/include/svn_types.h
>   (SVN__STREAM_CHUNK_SIZE): Bump to 64k and add some documentation
> on
>                             the general restrictions for future changes.

This reverts recent changes to reduce server side load... (see config file behavior changes, recently discussed on dev@s.a.o")

Please revert this and use a different variable...



And I really wonder how you determined that the rest of the patch is going to help *for all users* both client and server side.

	Bert


> 
> * subversion/include/private/svn_io_private.h
>   (svn_io__next_chunk_size): New utility function generating the new
>                              read block size sequence.
> 
> * subversion/libsvn_subr/io.c
>   (svn_io__next_chunk_size): Implement.
>   (contents_identical_p,
>    contents_three_identical_p): Let the new utility determine the read
>                                 block size.
> 
> * subversion/libsvn_subr/stream.c
>   (svn_stream_contents_same2): Ditto.
> 
> Modified:
>     subversion/trunk/subversion/include/private/svn_io_private.h
>     subversion/trunk/subversion/include/svn_types.h
>     subversion/trunk/subversion/libsvn_subr/io.c
>     subversion/trunk/subversion/libsvn_subr/stream.c
> 
> Modified: subversion/trunk/subversion/include/private/svn_io_private.h
> URL:
> http://svn.apache.org/viewvc/subversion/trunk/subversion/include/private
> /svn_io_private.h?rev=1586922&r1=1586921&r2=1586922&view=diff
> ==========================================================
> ====================
> --- subversion/trunk/subversion/include/private/svn_io_private.h (original)
> +++ subversion/trunk/subversion/include/private/svn_io_private.h Sun Apr
> 13 04:40:40 2014
> @@ -72,6 +72,14 @@ svn_io__is_finfo_read_only(svn_boolean_t
>                             apr_finfo_t *file_info,
>                             apr_pool_t *pool);
> 
> +/** Given that @a total_read bytes have already been read from a file or
> + * stream, return a suggestion for the size of the next block to process.
> + * This value will be <= #SVN__STREAM_CHUNK_SIZE.
> + *
> + * @since New in 1.9.
> + */
> +apr_size_t
> +svn_io__next_chunk_size(apr_off_t total_read);
> 
>  /** Buffer test handler function for a generic stream. @see svn_stream_t
>   * and svn_stream__is_buffered().
> 
> Modified: subversion/trunk/subversion/include/svn_types.h
> URL:
> http://svn.apache.org/viewvc/subversion/trunk/subversion/include/svn_ty
> pes.h?rev=1586922&r1=1586921&r2=1586922&view=diff
> ==========================================================
> ====================
> --- subversion/trunk/subversion/include/svn_types.h (original)
> +++ subversion/trunk/subversion/include/svn_types.h Sun Apr 13 04:40:40
> 2014
> @@ -1142,8 +1142,12 @@ typedef svn_error_t *(*svn_commit_callba
>   *
>   * NOTE: This is an internal macro, put here for convenience.
>   * No public API may depend on the particular value of this macro.
> + *
> + * NOTE: The implementation assumes that this is a power of two >= 4k.
> + *       Moreover, it should be less than 80kB to prevent memory
> + *       fragmentation in the APR memory allocator.
>   */
> -#define SVN__STREAM_CHUNK_SIZE 16384
> +#define SVN__STREAM_CHUNK_SIZE 0x10000
>  #endif
> 
>  /** The maximum amount we can ever hold in memory. */
> 
> Modified: subversion/trunk/subversion/libsvn_subr/io.c
> URL:
> http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/io.
> c?rev=1586922&r1=1586921&r2=1586922&view=diff
> ==========================================================
> ====================
> --- subversion/trunk/subversion/libsvn_subr/io.c (original)
> +++ subversion/trunk/subversion/libsvn_subr/io.c Sun Apr 13 04:40:40 2014
> @@ -63,6 +63,7 @@
>  #include "svn_config.h"
>  #include "svn_private_config.h"
>  #include "svn_ctype.h"
> +#include "svn_sorts.h"
> 
>  #include "private/svn_atomic.h"
>  #include "private/svn_io_private.h"
> @@ -4501,6 +4502,17 @@ svn_io_read_version_file(int *version,
>  }
> 
> 
> +apr_size_t
> +svn_io__next_chunk_size(apr_off_t total_read)
> +{
> +  /* Started with total_read===, this will generate a sequence ensuring
> +     aligned access with increasing block size up to
> SVN__STREAM_CHUNK_SIZE:
> +     4k@ offset 0, 4k@ offset 4k, 8k@ offset 8k, 16k@ offset 16k etc.
> +     */
> +  return total_read ? (apr_size_t)MIN(total_read,
> SVN__STREAM_CHUNK_SIZE)
> +                    : (apr_size_t)4096;
> +}
> +
> 
>  /* Do a byte-for-byte comparison of FILE1 and FILE2. */
>  static svn_error_t *
> @@ -4517,6 +4529,7 @@ contents_identical_p(svn_boolean_t *iden
>    apr_file_t *file2_h;
>    svn_boolean_t eof1 = FALSE;
>    svn_boolean_t eof2 = FALSE;
> +  apr_off_t total_read = 0;
> 
>    SVN_ERR(svn_io_file_open(&file1_h, file1, APR_READ, APR_OS_DEFAULT,
>                             pool));
> @@ -4532,14 +4545,17 @@ contents_identical_p(svn_boolean_t *iden
>    *identical_p = TRUE;  /* assume TRUE, until disproved below */
>    while (!err && !eof1 && !eof2)
>      {
> +      apr_size_t to_read = svn_io__next_chunk_size(total_read);
> +      total_read += to_read;
> +
>        err = svn_io_file_read_full2(file1_h, buf1,
> -                                   SVN__STREAM_CHUNK_SIZE, &bytes_read1,
> +                                   to_read, &bytes_read1,
>                                     &eof1, pool);
>        if (err)
>            break;
> 
>        err = svn_io_file_read_full2(file2_h, buf2,
> -                                   SVN__STREAM_CHUNK_SIZE, &bytes_read2,
> +                                   to_read, &bytes_read2,
>                                     &eof2, pool);
>        if (err)
>            break;
> @@ -4585,6 +4601,7 @@ contents_three_identical_p(svn_boolean_t
>    svn_boolean_t eof1 = FALSE;
>    svn_boolean_t eof2 = FALSE;
>    svn_boolean_t eof3 = FALSE;
> +  apr_off_t total_read = 0;
> 
>    SVN_ERR(svn_io_file_open(&file1_h, file1, APR_READ, APR_OS_DEFAULT,
>                             scratch_pool));
> @@ -4621,6 +4638,9 @@ contents_three_identical_p(svn_boolean_t
>        apr_size_t bytes_read1, bytes_read2, bytes_read3;
>        svn_boolean_t read_1, read_2, read_3;
> 
> +      apr_size_t to_read = svn_io__next_chunk_size(total_read);
> +      total_read += to_read;
> +
>        read_1 = read_2 = read_3 = FALSE;
> 
>        /* As long as a file is not at the end yet, and it is still
> @@ -4628,8 +4648,8 @@ contents_three_identical_p(svn_boolean_t
>        if (!eof1 && (*identical_p12 || *identical_p13))
>          {
>            err = svn_io_file_read_full2(file1_h, buf1,
> -                                   SVN__STREAM_CHUNK_SIZE, &bytes_read1,
> -                                   &eof1, scratch_pool);
> +                                       to_read, &bytes_read1,
> +                                       &eof1, scratch_pool);
>            if (err)
>                break;
>            read_1 = TRUE;
> @@ -4638,8 +4658,8 @@ contents_three_identical_p(svn_boolean_t
>        if (!eof2 && (*identical_p12 || *identical_p23))
>          {
>            err = svn_io_file_read_full2(file2_h, buf2,
> -                                   SVN__STREAM_CHUNK_SIZE, &bytes_read2,
> -                                   &eof2, scratch_pool);
> +                                       to_read, &bytes_read2,
> +                                       &eof2, scratch_pool);
>            if (err)
>                break;
>            read_2 = TRUE;
> @@ -4648,8 +4668,8 @@ contents_three_identical_p(svn_boolean_t
>        if (!eof3 && (*identical_p13 || *identical_p23))
>          {
>            err = svn_io_file_read_full2(file3_h, buf3,
> -                                   SVN__STREAM_CHUNK_SIZE, &bytes_read3,
> -                                   &eof3, scratch_pool);
> +                                       to_read, &bytes_read3,
> +                                       &eof3, scratch_pool);
>            if (err)
>                break;
>            read_3 = TRUE;
> 
> Modified: subversion/trunk/subversion/libsvn_subr/stream.c
> URL:
> http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/str
> eam.c?rev=1586922&r1=1586921&r2=1586922&view=diff
> ==========================================================
> ====================
> --- subversion/trunk/subversion/libsvn_subr/stream.c (original)
> +++ subversion/trunk/subversion/libsvn_subr/stream.c Sun Apr 13 04:40:40
> 2014
> @@ -585,14 +585,20 @@ svn_stream_contents_same2(svn_boolean_t
>  {
>    char *buf1 = apr_palloc(pool, SVN__STREAM_CHUNK_SIZE);
>    char *buf2 = apr_palloc(pool, SVN__STREAM_CHUNK_SIZE);
> -  apr_size_t bytes_read1 = SVN__STREAM_CHUNK_SIZE;
> -  apr_size_t bytes_read2 = SVN__STREAM_CHUNK_SIZE;
> +  apr_size_t to_read = 0;
> +  apr_size_t bytes_read1 = 0;
> +  apr_size_t bytes_read2 = 0;
> +  apr_off_t total_read = 0;
>    svn_error_t *err = NULL;
> 
>    *same = TRUE;  /* assume TRUE, until disproved below */
> -  while (bytes_read1 == SVN__STREAM_CHUNK_SIZE
> -         && bytes_read2 == SVN__STREAM_CHUNK_SIZE)
> +  while (bytes_read1 == to_read && bytes_read2 == to_read)
>      {
> +      to_read = svn_io__next_chunk_size(total_read);
> +      bytes_read1 = to_read;
> +      bytes_read2 = to_read;
> +      total_read += to_read;
> +
>        err = svn_stream_read_full(stream1, buf1, &bytes_read1);
>        if (err)
>          break;
>