You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@subversion.apache.org by jc...@apache.org on 2011/01/13 23:02:38 UTC

svn commit: r1058759 - /subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c

Author: jcorvel
Date: Thu Jan 13 22:02:38 2011
New Revision: 1058759

URL: http://svn.apache.org/viewvc?rev=1058759&view=rev
Log:
On the diff-optimizations-bytes branch:

Adapt the diff4 algorithm, as far as I understand it, to make use of the
prefix/suffix optimization.

Note:
- diff4 isn't used in svn core, only in tools/diff/diff4.
- This change is untested, because there is currently no test in the test
  suite exercising this algorithm, and I don't understand it well enough to
  write tests for it. Review and/or adding tests for this code is thus very
  welcome.

* subversion/libsvn_diff/diff4.c
  (svn_diff_diff4): Open the 4 datasources together with datasources_open, to
  let it eliminate identical prefix and suffix.

Modified:
    subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c

Modified: subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c
URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c?rev=1058759&r1=1058758&r2=1058759&view=diff
==============================================================================
--- subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c (original)
+++ subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c Thu Jan 13 22:02:38 2011
@@ -173,6 +173,10 @@ svn_diff_diff4(svn_diff_t **diff,
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[4];
+  svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
+                                        svn_diff_datasource_modified,
+                                        svn_diff_datasource_latest,
+                                        svn_diff_datasource_ancestor};
   svn_diff__lcs_t *lcs_ol;
   svn_diff__lcs_t *lcs_adjust;
   svn_diff_t *diff_ol;
@@ -181,6 +185,7 @@ svn_diff_diff4(svn_diff_t **diff,
   apr_pool_t *subpool;
   apr_pool_t *subpool2;
   apr_pool_t *subpool3;
+  apr_off_t prefix_lines = 0;
 
   *diff = NULL;
 
@@ -190,36 +195,38 @@ svn_diff_diff4(svn_diff_t **diff,
 
   svn_diff__tree_create(&tree, subpool3);
 
+  SVN_ERR(vtable->datasources_open(diff_baton, &prefix_lines, datasource, 4));
+
   SVN_ERR(svn_diff__get_tokens(&position_list[0],
                                tree,
                                diff_baton, vtable,
                                svn_diff_datasource_original,
-                               FALSE,
-                               0,
+                               TRUE,
+                               prefix_lines,
                                subpool2));
 
   SVN_ERR(svn_diff__get_tokens(&position_list[1],
                                tree,
                                diff_baton, vtable,
                                svn_diff_datasource_modified,
-                               FALSE,
-                               0,
+                               TRUE,
+                               prefix_lines,
                                subpool));
 
   SVN_ERR(svn_diff__get_tokens(&position_list[2],
                                tree,
                                diff_baton, vtable,
                                svn_diff_datasource_latest,
-                               FALSE,
-                               0,
+                               TRUE,
+                               prefix_lines,
                                subpool));
 
   SVN_ERR(svn_diff__get_tokens(&position_list[3],
                                tree,
                                diff_baton, vtable,
                                svn_diff_datasource_ancestor,
-                               FALSE,
-                               0,
+                               TRUE,
+                               prefix_lines,
                                subpool2));
 
   /* Get rid of the tokens, we don't need them to calc the diff */
@@ -230,7 +237,8 @@ svn_diff_diff4(svn_diff_t **diff,
   svn_pool_clear(subpool3);
 
   /* Get the lcs for original - latest */
-  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], 0, subpool3);
+  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+                         subpool3);
   diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool);
 
   svn_pool_clear(subpool3);
@@ -251,7 +259,8 @@ svn_diff_diff4(svn_diff_t **diff,
   /* Get the lcs for common ancestor - original
    * Do reverse adjustements
    */
-  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], 0, subpool3);
+  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], prefix_lines,
+                             subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
 
@@ -260,7 +269,8 @@ svn_diff_diff4(svn_diff_t **diff,
   /* Get the lcs for modified - common ancestor
    * Do forward adjustments
    */
-  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], 0, subpool3);
+  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], prefix_lines,
+                             subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
 



Re: diff4-optimization-bytes

Posted by Johan Corveleyn <jc...@gmail.com>.
On Wed, Feb 9, 2011 at 10:20 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> In other news, I looked into the cause --- tried to make
> datasource_get_next_token() do one more loop in the place where
> currently it does 'return if at_start_of_suffix()' --- but that didn't
> fix the truncation...

Indeed, that won't fix it.

I think the best (cleanest, most robust, least special-cased) approach
is to leave the suffix stripping as it is (and let
datasource_get_next_token() stop when it encounters the suffix), but
to count the number of suffix lines while scanning them (like we do
for prefix). Then that suffix_lines result can be passed around, like
the prefix_lines, so it ends up in the call to lcs.c#svn_diff__lcs,
where a piece of "suffix_lcs" can be added to the lcs chain (just like
the prefix_lcs is prepended to it). If we get there, the rest will
work automagically :-).

The only drawback of this is a minor loss of performance of the
optimization, since we now have to count newlines while scanning the
suffix (i.e. need to compare every byte scanned). But I can live with
that :-). I'll see how much impact this has once I get it working.

I've done part of this locally (adapting the function parameters and
passing the value around), but still have to do the hard parts:
- actually counting the newlines in find_identical_suffix
- create and append the suffix_lcs in lcs.c#svn_diff__lcs.

It might take me a couple more days to finish that.

> In the meantime, I tweaked a test to make it XFail (r1068798).  From
> a quick glance it seemed to be relevant, but in second thought I'll
> admit I didn't study the test thoroughly before making the patch.

Great, thanks. The fact that it's random is a bit annoying, but at
least it's something. I was thinking of writing a specific merge test
(or another one in diff-diff3-test.c) with say 100 lines of identical
suffix. But for now I don't have the time to do that. Maybe later ...

-- 
Johan

Re: diff4-optimization-bytes

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
In other news, I looked into the cause --- tried to make
datasource_get_next_token() do one more loop in the place where
currently it does 'return if at_start_of_suffix()' --- but that didn't
fix the truncation...

In the meantime, I tweaked a test to make it XFail (r1068798).  From
a quick glance it seemed to be relevant, but in second thought I'll
admit I didn't study the test thoroughly before making the patch.

Daniel


Daniel Shahaf wrote on Wed, Feb 09, 2011 at 11:10:43 +0200:
> Johan Corveleyn wrote on Wed, Feb 09, 2011 at 08:42:20 +0100:
> > On Wed, Feb 9, 2011 at 4:54 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> > > The experimental code is svn_diff_diff4_2(); AFAIK svn_diff_diff4() is
> > > as stable as ever.
> > 
> > I have no objections on adding such an annotation. However, from where
> > I'm sitting, svn_diff_diff4() is/was just as under-exercised as
> > svn_diff_diff4_2(), i.e. no known callers, only one unit test (thanks
> > for adding that test, BTW). Of course it's into the codebase a lot
> > longer than *_2, but it has never had any core code calling it, and no
> > unit tests (so could have been broken by any number of commits after
> > its inception till now).
> > 
> 
> In other words, svn_diff_diff4() is as experimental as svn_diff_diff4_2().
> Agreed.
> 
> IMO the issue boils down to representation: if we haven't tested a given
> piece of code (it has no callers and a surfacial unit test), then we
> shouldn't represent to API consumers otherwise.
> 
> Daniel
> (and yes, I respect all the work you've been doing; my opinion of
> diff4_2() is orthogonal to that)
> 
> > -- 
> > Johan

Re: diff4-optimization-bytes

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
Johan Corveleyn wrote on Wed, Feb 09, 2011 at 08:42:20 +0100:
> On Wed, Feb 9, 2011 at 4:54 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> > The experimental code is svn_diff_diff4_2(); AFAIK svn_diff_diff4() is
> > as stable as ever.
> 
> I have no objections on adding such an annotation. However, from where
> I'm sitting, svn_diff_diff4() is/was just as under-exercised as
> svn_diff_diff4_2(), i.e. no known callers, only one unit test (thanks
> for adding that test, BTW). Of course it's into the codebase a lot
> longer than *_2, but it has never had any core code calling it, and no
> unit tests (so could have been broken by any number of commits after
> its inception till now).
> 

In other words, svn_diff_diff4() is as experimental as svn_diff_diff4_2().
Agreed.

IMO the issue boils down to representation: if we haven't tested a given
piece of code (it has no callers and a surfacial unit test), then we
shouldn't represent to API consumers otherwise.

Daniel
(and yes, I respect all the work you've been doing; my opinion of
diff4_2() is orthogonal to that)

> -- 
> Johan

Re: diff4-optimization-bytes

Posted by Johan Corveleyn <jc...@gmail.com>.
On Wed, Feb 9, 2011 at 4:54 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> Hyrum K Wright wrote on Tue, Feb 08, 2011 at 21:47:12 -0600:
>> On Tue, Feb 8, 2011 at 9:26 PM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
>> > Johan Corveleyn wrote on Fri, Feb 04, 2011 at 13:20:29 +0100:
>> >> On Fri, Feb 4, 2011 at 7:56 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
>> >> > Could you have a look? (attached)
>> >>
>> >> Nice. It looks good to me (haven't tested it, just looked at the code;
>> >> I assume it passes with trunk?)
>> >>
>> >
>> > Thanks, yes, r1068749.
>> >
>> > While I'm here, in light of the truncation bug in diff3 earlier today,
>> > how about adding a warning to svn_diff_diff4_2() family of API's to the
>> > effect of "@warning This code is experimental"?
>>
>> How much more experimental is it than other recent code we've added to
>> trunk?
>
> s/experimental/under-exercised/
> (no known callers, only one unit test)
>
>> (And I hope that such an appellation would be only temporary;
>> experimental code close to a release is a Bad Thing.  Johan is working
>> hard to fix whatever Badness there may be.)
>>
>
> The experimental code is svn_diff_diff4_2(); AFAIK svn_diff_diff4() is
> as stable as ever.

I have no objections on adding such an annotation. However, from where
I'm sitting, svn_diff_diff4() is/was just as under-exercised as
svn_diff_diff4_2(), i.e. no known callers, only one unit test (thanks
for adding that test, BTW). Of course it's into the codebase a lot
longer than *_2, but it has never had any core code calling it, and no
unit tests (so could have been broken by any number of commits after
its inception till now).

-- 
Johan

Re: diff4-optimization-bytes

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
Hyrum K Wright wrote on Tue, Feb 08, 2011 at 21:47:12 -0600:
> On Tue, Feb 8, 2011 at 9:26 PM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> > Johan Corveleyn wrote on Fri, Feb 04, 2011 at 13:20:29 +0100:
> >> On Fri, Feb 4, 2011 at 7:56 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> >> > Could you have a look? (attached)
> >>
> >> Nice. It looks good to me (haven't tested it, just looked at the code;
> >> I assume it passes with trunk?)
> >>
> >
> > Thanks, yes, r1068749.
> >
> > While I'm here, in light of the truncation bug in diff3 earlier today,
> > how about adding a warning to svn_diff_diff4_2() family of API's to the
> > effect of "@warning This code is experimental"?
> 
> How much more experimental is it than other recent code we've added to
> trunk?

s/experimental/under-exercised/
(no known callers, only one unit test)

> (And I hope that such an appellation would be only temporary;
> experimental code close to a release is a Bad Thing.  Johan is working
> hard to fix whatever Badness there may be.)
> 

The experimental code is svn_diff_diff4_2(); AFAIK svn_diff_diff4() is
as stable as ever.

> -Hyrum

Re: diff4-optimization-bytes

Posted by Hyrum K Wright <hy...@hyrumwright.org>.
On Tue, Feb 8, 2011 at 9:26 PM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> Johan Corveleyn wrote on Fri, Feb 04, 2011 at 13:20:29 +0100:
>> On Fri, Feb 4, 2011 at 7:56 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
>> > Could you have a look? (attached)
>>
>> Nice. It looks good to me (haven't tested it, just looked at the code;
>> I assume it passes with trunk?)
>>
>
> Thanks, yes, r1068749.
>
> While I'm here, in light of the truncation bug in diff3 earlier today,
> how about adding a warning to svn_diff_diff4_2() family of API's to the
> effect of "@warning This code is experimental"?

How much more experimental is it than other recent code we've added to
trunk?  (And I hope that such an appellation would be only temporary;
experimental code close to a release is a Bad Thing.  Johan is working
hard to fix whatever Badness there may be.)

-Hyrum

Re: diff4-optimization-bytes

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
Johan Corveleyn wrote on Fri, Feb 04, 2011 at 13:20:29 +0100:
> On Fri, Feb 4, 2011 at 7:56 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> > Could you have a look? (attached)
> 
> Nice. It looks good to me (haven't tested it, just looked at the code;
> I assume it passes with trunk?)
> 

Thanks, yes, r1068749.

While I'm here, in light of the truncation bug in diff3 earlier today,
how about adding a warning to svn_diff_diff4_2() family of API's to the
effect of "@warning This code is experimental"?

> The order of the parameters is correct: tools/diff/diff4.c does
> exactly the same thing (switch the first and second argument), before
> it passes them to svn_diff_file_diff4.

Re: diff4-optimization-bytes

Posted by Johan Corveleyn <jc...@gmail.com>.
On Fri, Feb 4, 2011 at 7:56 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> Johan Corveleyn wrote on Mon, Jan 31, 2011 at 02:47:50 +0100:
>> On Fri, Jan 28, 2011 at 7:58 PM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
>> > Johan Corveleyn wrote on Fri, Jan 28, 2011 at 14:04:07 +0100:
>> >> On Fri, Jan 28, 2011 at 9:29 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
>> >> > May I suggest that, if this code is to be released, then you validate
>> >> > its correctnss?  For example, a minimal regression test that is written
>> >> > to record trunk's pre-branch behaviour might be sufficient.
>> >>
>> >> ... I'll accept your suggestion. I'll try to take some time for that
>> >> this weekend. If anyone beats me to it, that would be fine as well of
>> >> course :).
>> >>
>>
>> Sorry, didn't get around to it yet. Maybe sometime during next week.
>>
>> > Thanks :-).  I've looked into it, but it seems the output functions in
>> > svn_diff.h are oriented to diff2/diff3 only (they don't have an
>> > 'ancestor' enum or parameters); and in a quick test, I couldn't get
>> > tools/diff/diff4 to output anything sensible:
>> >
>> > % for i in 1 2 3 4; do echo $i>$i; done
>> > % ./tools/diff/diff4 1 2 3 4
>> > 3
>> > %
>>
>> I think it's more or less sensible, although I'm not completely sure.
>> If you look at the "usage" of diff4, you see:
>>
>>     Usage: /path/to/diff4 <mine> <older> <yours> <ancestor>
>>
>> I think what you did is: take the difference between 2 and 3 (older
>> and yours), and apply that to 1 (mine), using 4 as the common
>> ancestor. Apparently that yields "3" :-).
>>
>> Compare that with the use of diff3, with the files 1, 2 and 3:
>>
>>     Usage: /path/to/diff3 <mine> <older> <yours>
>>
>> $ diff3 1 2 3
>> <<<<<<< 1
>> 1
>> =======
>> 3
>> >>>>>>> 3
>>
>> Hmmm, that looks like a merge conflict, which seems logical (trying to
>> apply the diff between 2 and 3, to the file 1 which doesn't contain a
>> line '2').
>>
>
> Agreed.
>
>> So now I also don't really understand why diff4 successfully merges
>> it, given the ancestor 4. Maybe the reasoning is as follows, given
>> that 4 is the common ancestor:
>>
>> - 1 has evolved out of 4 (so '1' is the mine's replacement for '4')
>>
>> - 2 has evolved out of 4 (so in the merge source, '2' is the
>> replacement for '4')
>>
>> - So with "variance-adjusted patching", the diff between 2 and 3
>> translates into: "replace whatever '4' was transformed into (in this
>> case '1') into '3'". That would yield '3'.
>>
>> Not sure though.
>>
>> > How can we test diff4 then?  Do we have to add public diff4 APIs in
>> > order to be able to test svn_diff_diff4_2()?
>>
>> Maybe we should come up with a better example. In the meantime, I'm
>> trying to understand diff4 (reading kfogel's article at [1], as
>> referenced in diff4.c).
>>
>
> That article was very helpful --- thanks for the pointer!
>
>> > Daniel
>> > (on the one hand I'd much prefer to test the API changes before
>> > releasing them; on the other hand, adding API's related to an API
>> > that no one (to our knowledge) uses seems odd)
>>
>> Yes. I think we should give it some effort to come up with some
>> minimal testing. But if it takes too long, we should probably not let
>> this be blocking....
>>
>
> I've went ahead and made a patch out of the second example in that
> article.  However, I admit that I got it to work by fiddling with the
> order of the four parameters until the test passed :-)
>
> Could you have a look? (attached)

Nice. It looks good to me (haven't tested it, just looked at the code;
I assume it passes with trunk?)

The order of the parameters is correct: tools/diff/diff4.c does
exactly the same thing (switch the first and second argument), before
it passes them to svn_diff_file_diff4.

tools/diff/diff4.c, lines 75-77:
[[[
    svn_err = do_diff4(ostream,
                       argv[2], argv[1], argv[3], argv[4],
                       pool);
]]]

And do_diff4 look thusly, mirroring exactly what you're doing in the test:
[[[
static svn_error_t *
do_diff4(svn_stream_t *ostream,
         const char *original,
         const char *modified,
         const char *latest,
         const char *ancestor,
         apr_pool_t *pool)
{
  svn_diff_t *diff;

  SVN_ERR(svn_diff_file_diff4(&diff,
                              original, modified, latest, ancestor,
                              pool));
  SVN_ERR(svn_diff_file_output_merge(ostream, diff,
                                     original, modified, latest,
                                     NULL, NULL, NULL, NULL,
                                     FALSE,
                                     FALSE,
                                     pool));

  return NULL;
}
]]]

So apparently, if we compare the arguments from diff4's usage message
with the names of the parameters, we have:
    mine == modified
    older == original
    yours == latest
    ancestor == ancestor

Cheers,
Johan

> Thanks,
>
> Daniel
>
>> Cheers,
>> --
>> Johan
>>
>> [1] http://svn.apache.org/repos/asf/subversion/trunk/notes/variance-adjusted-patching.html
>

Re: diff4-optimization-bytes

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
Johan Corveleyn wrote on Mon, Jan 31, 2011 at 02:47:50 +0100:
> On Fri, Jan 28, 2011 at 7:58 PM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> > Johan Corveleyn wrote on Fri, Jan 28, 2011 at 14:04:07 +0100:
> >> On Fri, Jan 28, 2011 at 9:29 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> >> > May I suggest that, if this code is to be released, then you validate
> >> > its correctnss? �For example, a minimal regression test that is written
> >> > to record trunk's pre-branch behaviour might be sufficient.
> >>
> >> ... I'll accept your suggestion. I'll try to take some time for that
> >> this weekend. If anyone beats me to it, that would be fine as well of
> >> course :).
> >>
> 
> Sorry, didn't get around to it yet. Maybe sometime during next week.
> 
> > Thanks :-). �I've looked into it, but it seems the output functions in
> > svn_diff.h are oriented to diff2/diff3 only (they don't have an
> > 'ancestor' enum or parameters); and in a quick test, I couldn't get
> > tools/diff/diff4 to output anything sensible:
> >
> > % for i in 1 2 3 4; do echo $i>$i; done
> > % ./tools/diff/diff4 1 2 3 4
> > 3
> > %
> 
> I think it's more or less sensible, although I'm not completely sure.
> If you look at the "usage" of diff4, you see:
> 
>     Usage: /path/to/diff4 <mine> <older> <yours> <ancestor>
> 
> I think what you did is: take the difference between 2 and 3 (older
> and yours), and apply that to 1 (mine), using 4 as the common
> ancestor. Apparently that yields "3" :-).
> 
> Compare that with the use of diff3, with the files 1, 2 and 3:
> 
>     Usage: /path/to/diff3 <mine> <older> <yours>
> 
> $ diff3 1 2 3
> <<<<<<< 1
> 1
> =======
> 3
> >>>>>>> 3
> 
> Hmmm, that looks like a merge conflict, which seems logical (trying to
> apply the diff between 2 and 3, to the file 1 which doesn't contain a
> line '2').
> 

Agreed.

> So now I also don't really understand why diff4 successfully merges
> it, given the ancestor 4. Maybe the reasoning is as follows, given
> that 4 is the common ancestor:
> 
> - 1 has evolved out of 4 (so '1' is the mine's replacement for '4')
> 
> - 2 has evolved out of 4 (so in the merge source, '2' is the
> replacement for '4')
> 
> - So with "variance-adjusted patching", the diff between 2 and 3
> translates into: "replace whatever '4' was transformed into (in this
> case '1') into '3'". That would yield '3'.
> 
> Not sure though.
> 
> > How can we test diff4 then? �Do we have to add public diff4 APIs in
> > order to be able to test svn_diff_diff4_2()?
> 
> Maybe we should come up with a better example. In the meantime, I'm
> trying to understand diff4 (reading kfogel's article at [1], as
> referenced in diff4.c).
> 

That article was very helpful --- thanks for the pointer!

> > Daniel
> > (on the one hand I'd much prefer to test the API changes before
> > releasing them; on the other hand, adding API's related to an API
> > that no one (to our knowledge) uses seems odd)
> 
> Yes. I think we should give it some effort to come up with some
> minimal testing. But if it takes too long, we should probably not let
> this be blocking....
> 

I've went ahead and made a patch out of the second example in that
article.  However, I admit that I got it to work by fiddling with the
order of the four parameters until the test passed :-)

Could you have a look? (attached)

Thanks,

Daniel

> Cheers,
> -- 
> Johan
> 
> [1] http://svn.apache.org/repos/asf/subversion/trunk/notes/variance-adjusted-patching.html

Re: diff4-optimization-bytes

Posted by Johan Corveleyn <jc...@gmail.com>.
On Fri, Jan 28, 2011 at 7:58 PM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> Johan Corveleyn wrote on Fri, Jan 28, 2011 at 14:04:07 +0100:
>> On Fri, Jan 28, 2011 at 9:29 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
>> > May I suggest that, if this code is to be released, then you validate
>> > its correctnss?  For example, a minimal regression test that is written
>> > to record trunk's pre-branch behaviour might be sufficient.
>>
>> ... I'll accept your suggestion. I'll try to take some time for that
>> this weekend. If anyone beats me to it, that would be fine as well of
>> course :).
>>

Sorry, didn't get around to it yet. Maybe sometime during next week.

> Thanks :-).  I've looked into it, but it seems the output functions in
> svn_diff.h are oriented to diff2/diff3 only (they don't have an
> 'ancestor' enum or parameters); and in a quick test, I couldn't get
> tools/diff/diff4 to output anything sensible:
>
> % for i in 1 2 3 4; do echo $i>$i; done
> % ./tools/diff/diff4 1 2 3 4
> 3
> %

I think it's more or less sensible, although I'm not completely sure.
If you look at the "usage" of diff4, you see:

    Usage: /path/to/diff4 <mine> <older> <yours> <ancestor>

I think what you did is: take the difference between 2 and 3 (older
and yours), and apply that to 1 (mine), using 4 as the common
ancestor. Apparently that yields "3" :-).

Compare that with the use of diff3, with the files 1, 2 and 3:

    Usage: /path/to/diff3 <mine> <older> <yours>

$ diff3 1 2 3
<<<<<<< 1
1
=======
3
>>>>>>> 3

Hmmm, that looks like a merge conflict, which seems logical (trying to
apply the diff between 2 and 3, to the file 1 which doesn't contain a
line '2').

So now I also don't really understand why diff4 successfully merges
it, given the ancestor 4. Maybe the reasoning is as follows, given
that 4 is the common ancestor:

- 1 has evolved out of 4 (so '1' is the mine's replacement for '4')

- 2 has evolved out of 4 (so in the merge source, '2' is the
replacement for '4')

- So with "variance-adjusted patching", the diff between 2 and 3
translates into: "replace whatever '4' was transformed into (in this
case '1') into '3'". That would yield '3'.

Not sure though.

> How can we test diff4 then?  Do we have to add public diff4 APIs in
> order to be able to test svn_diff_diff4_2()?

Maybe we should come up with a better example. In the meantime, I'm
trying to understand diff4 (reading kfogel's article at [1], as
referenced in diff4.c).

> Daniel
> (on the one hand I'd much prefer to test the API changes before
> releasing them; on the other hand, adding API's related to an API
> that no one (to our knowledge) uses seems odd)

Yes. I think we should give it some effort to come up with some
minimal testing. But if it takes too long, we should probably not let
this be blocking....

Cheers,
-- 
Johan

[1] http://svn.apache.org/repos/asf/subversion/trunk/notes/variance-adjusted-patching.html

Re: diff4-optimization-bytes

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
Johan Corveleyn wrote on Fri, Jan 28, 2011 at 14:04:07 +0100:
> On Fri, Jan 28, 2011 at 9:29 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> > May I suggest that, if this code is to be released, then you validate
> > its correctnss?  For example, a minimal regression test that is written
> > to record trunk's pre-branch behaviour might be sufficient.
> 
> ... I'll accept your suggestion. I'll try to take some time for that
> this weekend. If anyone beats me to it, that would be fine as well of
> course :).
> 

Thanks :-).  I've looked into it, but it seems the output functions in
svn_diff.h are oriented to diff2/diff3 only (they don't have an
'ancestor' enum or parameters); and in a quick test, I couldn't get
tools/diff/diff4 to output anything sensible:

% for i in 1 2 3 4; do echo $i>$i; done
% ./tools/diff/diff4 1 2 3 4
3
% 

How can we test diff4 then?  Do we have to add public diff4 APIs in
order to be able to test svn_diff_diff4_2()?

Daniel
(on the one hand I'd much prefer to test the API changes before
releasing them; on the other hand, adding API's related to an API
that no one (to our knowledge) uses seems odd)

> Cheers,
> Johan

Re: diff4-optimization-bytes

Posted by Johan Corveleyn <jc...@gmail.com>.
On Fri, Jan 28, 2011 at 9:29 AM, Daniel Shahaf <d....@daniel.shahaf.name> wrote:
> Johan,
>
> I'm concerned about this change: on the one hand, it's untested and
> no one claims to be understanding the code; on the other hand, it
> doesn't exactly parallel the diff3 change:
>
> specifically, the last hunk of the diff3 patch (which is also included
> below) has no equivalent in the diff4 patch.  A wild guess is that the
> parameters to svn_diff__diff() might need to be adjusted.
>
> All in all, given your warnings in the log message, I have zero reason
> to believe the change is correct, and I just cited one reason I have to
> believe it's wrong :-).

I admit it's kind of a vulnerable spot in the work I did, because of
the absence of tests. I made those changes simply based on some
generic understanding about the code structure, and with analogy to
the other code.

However, I'm pretty sure that it's correct. That last hunk in the
diff3 patch is not there in diff4, because the analogous piece of code
isn't there in diff4 (and it also isn't in diff2). That's because
diff3 does something quite special: it tries to manually "synchronize"
between the lcs information from "original-modified" and
"original-latest". That isn't done in diff2 nor in diff4. So AFAIU the
changes to diff4 more or less parallel the changes to diff2, and it's
diff3 that is 'special'.

However, I also don't feel quite at easy with this situation, so ...

> May I suggest that, if this code is to be released, then you validate
> its correctnss?  For example, a minimal regression test that is written
> to record trunk's pre-branch behaviour might be sufficient.

... I'll accept your suggestion. I'll try to take some time for that
this weekend. If anyone beats me to it, that would be fine as well of
course :).

Cheers,
Johan

>
> Best,
>
> Daniel
>
>
> jcorvel@apache.org wrote on Thu, Jan 13, 2011 at 22:02:38 -0000:
>> Author: jcorvel
>> Date: Thu Jan 13 22:02:38 2011
>> New Revision: 1058759
>>
>> URL: http://svn.apache.org/viewvc?rev=1058759&view=rev
>> Log:
>> On the diff-optimizations-bytes branch:
>>
>> Adapt the diff4 algorithm, as far as I understand it, to make use of the
>> prefix/suffix optimization.
>>
>> Note:
>> - diff4 isn't used in svn core, only in tools/diff/diff4.
>> - This change is untested, because there is currently no test in the test
>>   suite exercising this algorithm, and I don't understand it well enough to
>>   write tests for it. Review and/or adding tests for this code is thus very
>>   welcome.
>>
>> * subversion/libsvn_diff/diff4.c
>>   (svn_diff_diff4): Open the 4 datasources together with datasources_open, to
>>   let it eliminate identical prefix and suffix.
>>
>> Modified:
>>     subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c
>>
>> Modified: subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c
>> URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c?rev=1058759&r1=1058758&r2=1058759&view=diff
>> ==============================================================================
>> --- subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c (original)
>> +++ subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c Thu Jan 13 22:02:38 2011
>> @@ -173,6 +173,10 @@ svn_diff_diff4(svn_diff_t **diff,
>>  {
>>    svn_diff__tree_t *tree;
>>    svn_diff__position_t *position_list[4];
>> +  svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
>> +                                        svn_diff_datasource_modified,
>> +                                        svn_diff_datasource_latest,
>> +                                        svn_diff_datasource_ancestor};
>>    svn_diff__lcs_t *lcs_ol;
>>    svn_diff__lcs_t *lcs_adjust;
>>    svn_diff_t *diff_ol;
>> @@ -181,6 +185,7 @@ svn_diff_diff4(svn_diff_t **diff,
>>    apr_pool_t *subpool;
>>    apr_pool_t *subpool2;
>>    apr_pool_t *subpool3;
>> +  apr_off_t prefix_lines = 0;
>>
>>    *diff = NULL;
>>
>> @@ -190,36 +195,38 @@ svn_diff_diff4(svn_diff_t **diff,
>>
>>    svn_diff__tree_create(&tree, subpool3);
>>
>> +  SVN_ERR(vtable->datasources_open(diff_baton, &prefix_lines, datasource, 4));
>> +
>>    SVN_ERR(svn_diff__get_tokens(&position_list[0],
>>                                 tree,
>>                                 diff_baton, vtable,
>>                                 svn_diff_datasource_original,
>> -                               FALSE,
>> -                               0,
>> +                               TRUE,
>> +                               prefix_lines,
>>                                 subpool2));
>>
>>    SVN_ERR(svn_diff__get_tokens(&position_list[1],
>>                                 tree,
>>                                 diff_baton, vtable,
>>                                 svn_diff_datasource_modified,
>> -                               FALSE,
>> -                               0,
>> +                               TRUE,
>> +                               prefix_lines,
>>                                 subpool));
>>
>>    SVN_ERR(svn_diff__get_tokens(&position_list[2],
>>                                 tree,
>>                                 diff_baton, vtable,
>>                                 svn_diff_datasource_latest,
>> -                               FALSE,
>> -                               0,
>> +                               TRUE,
>> +                               prefix_lines,
>>                                 subpool));
>>
>>    SVN_ERR(svn_diff__get_tokens(&position_list[3],
>>                                 tree,
>>                                 diff_baton, vtable,
>>                                 svn_diff_datasource_ancestor,
>> -                               FALSE,
>> -                               0,
>> +                               TRUE,
>> +                               prefix_lines,
>>                                 subpool2));
>>
>>    /* Get rid of the tokens, we don't need them to calc the diff */
>> @@ -230,7 +237,8 @@ svn_diff_diff4(svn_diff_t **diff,
>>    svn_pool_clear(subpool3);
>>
>>    /* Get the lcs for original - latest */
>> -  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], 0, subpool3);
>> +  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
>> +                         subpool3);
>>    diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool);
>>
>>    svn_pool_clear(subpool3);
>> @@ -251,7 +259,8 @@ svn_diff_diff4(svn_diff_t **diff,
>>    /* Get the lcs for common ancestor - original
>>     * Do reverse adjustements
>>     */
>> -  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], 0, subpool3);
>> +  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], prefix_lines,
>> +                             subpool3);
>>    diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
>>    adjust_diff(diff_ol, diff_adjust);
>>
>> @@ -260,7 +269,8 @@ svn_diff_diff4(svn_diff_t **diff,
>>    /* Get the lcs for modified - common ancestor
>>     * Do forward adjustments
>>     */
>> -  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], 0, subpool3);
>> +  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], prefix_lines,
>> +                             subpool3);
>>    diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
>>    adjust_diff(diff_ol, diff_adjust);
>>
>>
>>
>
> jcorvel@apache.org wrote on Thu, Jan 13, 2011 at 21:38:16 -0000:
>> Author: jcorvel
>> Date: Thu Jan 13 21:38:16 2011
>> New Revision: 1058753
>>
>> URL: http://svn.apache.org/viewvc?rev=1058753&view=rev
>> Log:
>> On the diff-optimizations-bytes branch:
>>
>> Adapt the diff3 algorithm to make use of the prefix/suffix optimization.
>>
>> * subversion/libsvn_diff/diff3.c
>>   (svn_diff_diff3): Open the 3 datasources together with datasources_open, to
>>    let it eliminate identical prefix and suffix. Don't forget to adjust the
>>    sentinel_positions with the prefix_lines, in the case of an empty
>>    position_list.
>>
>> Modified:
>>     subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c
>>
>> Modified: subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c
>> URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c?rev=1058753&r1=1058752&r2=1058753&view=diff
>> ==============================================================================
>> --- subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c (original)
>> +++ subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c Thu Jan 13 21:38:16 2011
>> @@ -251,10 +251,14 @@ svn_diff_diff3(svn_diff_t **diff,
>>  {
>>    svn_diff__tree_t *tree;
>>    svn_diff__position_t *position_list[3];
>> +  svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
>> +                                        svn_diff_datasource_modified,
>> +                                        svn_diff_datasource_latest};
>>    svn_diff__lcs_t *lcs_om;
>>    svn_diff__lcs_t *lcs_ol;
>>    apr_pool_t *subpool;
>>    apr_pool_t *treepool;
>> +  apr_off_t prefix_lines = 0;
>>
>>    *diff = NULL;
>>
>> @@ -263,28 +267,30 @@ svn_diff_diff3(svn_diff_t **diff,
>>
>>    svn_diff__tree_create(&tree, treepool);
>>
>> +  SVN_ERR(vtable->datasources_open(diff_baton, &prefix_lines, datasource, 3));
>> +
>>    SVN_ERR(svn_diff__get_tokens(&position_list[0],
>>                                 tree,
>>                                 diff_baton, vtable,
>>                                 svn_diff_datasource_original,
>> -                               FALSE,
>> -                               0,
>> +                               TRUE,
>> +                               prefix_lines,
>>                                 subpool));
>>
>>    SVN_ERR(svn_diff__get_tokens(&position_list[1],
>>                                 tree,
>>                                 diff_baton, vtable,
>>                                 svn_diff_datasource_modified,
>> -                               FALSE,
>> -                               0,
>> +                               TRUE,
>> +                               prefix_lines,
>>                                 subpool));
>>
>>    SVN_ERR(svn_diff__get_tokens(&position_list[2],
>>                                 tree,
>>                                 diff_baton, vtable,
>>                                 svn_diff_datasource_latest,
>> -                               FALSE,
>> -                               0,
>> +                               TRUE,
>> +                               prefix_lines,
>>                                 subpool));
>>
>>    /* Get rid of the tokens, we don't need them to calc the diff */
>> @@ -295,9 +301,9 @@ svn_diff_diff3(svn_diff_t **diff,
>>    svn_pool_destroy(treepool);
>>
>>    /* Get the lcs for original-modified and original-latest */
>> -  lcs_om = svn_diff__lcs(position_list[0], position_list[1], 0,
>> +  lcs_om = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
>>                           subpool);
>> -  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], 0,
>> +  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
>>                           subpool);
>>
>>    /* Produce a merged diff */
>> @@ -330,7 +336,7 @@ svn_diff_diff3(svn_diff_t **diff,
>>        }
>>      else
>>        {
>> -        sentinel_position[0].offset = 1;
>> +        sentinel_position[0].offset = prefix_lines + 1;
>>          sentinel_position[0].next = NULL;
>>          position_list[1] = &sentinel_position[0];
>>        }
>> @@ -344,7 +350,7 @@ svn_diff_diff3(svn_diff_t **diff,
>>        }
>>      else
>>        {
>> -        sentinel_position[1].offset = 1;
>> +        sentinel_position[1].offset = prefix_lines + 1;
>>          sentinel_position[1].next = NULL;
>>          position_list[2] = &sentinel_position[1];
>>        }
>>
>>
>
>

diff4-optimization-bytes

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
Johan,

I'm concerned about this change: on the one hand, it's untested and
no one claims to be understanding the code; on the other hand, it
doesn't exactly parallel the diff3 change:

specifically, the last hunk of the diff3 patch (which is also included
below) has no equivalent in the diff4 patch.  A wild guess is that the
parameters to svn_diff__diff() might need to be adjusted.

All in all, given your warnings in the log message, I have zero reason
to believe the change is correct, and I just cited one reason I have to
believe it's wrong :-).


May I suggest that, if this code is to be released, then you validate
its correctnss?  For example, a minimal regression test that is written
to record trunk's pre-branch behaviour might be sufficient.


Best,

Daniel


jcorvel@apache.org wrote on Thu, Jan 13, 2011 at 22:02:38 -0000:
> Author: jcorvel
> Date: Thu Jan 13 22:02:38 2011
> New Revision: 1058759
> 
> URL: http://svn.apache.org/viewvc?rev=1058759&view=rev
> Log:
> On the diff-optimizations-bytes branch:
> 
> Adapt the diff4 algorithm, as far as I understand it, to make use of the
> prefix/suffix optimization.
> 
> Note:
> - diff4 isn't used in svn core, only in tools/diff/diff4.
> - This change is untested, because there is currently no test in the test
>   suite exercising this algorithm, and I don't understand it well enough to
>   write tests for it. Review and/or adding tests for this code is thus very
>   welcome.
> 
> * subversion/libsvn_diff/diff4.c
>   (svn_diff_diff4): Open the 4 datasources together with datasources_open, to
>   let it eliminate identical prefix and suffix.
> 
> Modified:
>     subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c
> 
> Modified: subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c
> URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c?rev=1058759&r1=1058758&r2=1058759&view=diff
> ==============================================================================
> --- subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c (original)
> +++ subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c Thu Jan 13 22:02:38 2011
> @@ -173,6 +173,10 @@ svn_diff_diff4(svn_diff_t **diff,
>  {
>    svn_diff__tree_t *tree;
>    svn_diff__position_t *position_list[4];
> +  svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
> +                                        svn_diff_datasource_modified,
> +                                        svn_diff_datasource_latest,
> +                                        svn_diff_datasource_ancestor};
>    svn_diff__lcs_t *lcs_ol;
>    svn_diff__lcs_t *lcs_adjust;
>    svn_diff_t *diff_ol;
> @@ -181,6 +185,7 @@ svn_diff_diff4(svn_diff_t **diff,
>    apr_pool_t *subpool;
>    apr_pool_t *subpool2;
>    apr_pool_t *subpool3;
> +  apr_off_t prefix_lines = 0;
>  
>    *diff = NULL;
>  
> @@ -190,36 +195,38 @@ svn_diff_diff4(svn_diff_t **diff,
>  
>    svn_diff__tree_create(&tree, subpool3);
>  
> +  SVN_ERR(vtable->datasources_open(diff_baton, &prefix_lines, datasource, 4));
> +
>    SVN_ERR(svn_diff__get_tokens(&position_list[0],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_original,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool2));
>  
>    SVN_ERR(svn_diff__get_tokens(&position_list[1],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_modified,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool));
>  
>    SVN_ERR(svn_diff__get_tokens(&position_list[2],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_latest,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool));
>  
>    SVN_ERR(svn_diff__get_tokens(&position_list[3],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_ancestor,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool2));
>  
>    /* Get rid of the tokens, we don't need them to calc the diff */
> @@ -230,7 +237,8 @@ svn_diff_diff4(svn_diff_t **diff,
>    svn_pool_clear(subpool3);
>  
>    /* Get the lcs for original - latest */
> -  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], 0, subpool3);
> +  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
> +                         subpool3);
>    diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool);
>  
>    svn_pool_clear(subpool3);
> @@ -251,7 +259,8 @@ svn_diff_diff4(svn_diff_t **diff,
>    /* Get the lcs for common ancestor - original
>     * Do reverse adjustements
>     */
> -  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], 0, subpool3);
> +  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], prefix_lines,
> +                             subpool3);
>    diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
>    adjust_diff(diff_ol, diff_adjust);
>  
> @@ -260,7 +269,8 @@ svn_diff_diff4(svn_diff_t **diff,
>    /* Get the lcs for modified - common ancestor
>     * Do forward adjustments
>     */
> -  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], 0, subpool3);
> +  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], prefix_lines,
> +                             subpool3);
>    diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
>    adjust_diff(diff_ol, diff_adjust);
>  
> 
> 

jcorvel@apache.org wrote on Thu, Jan 13, 2011 at 21:38:16 -0000:
> Author: jcorvel
> Date: Thu Jan 13 21:38:16 2011
> New Revision: 1058753
> 
> URL: http://svn.apache.org/viewvc?rev=1058753&view=rev
> Log:
> On the diff-optimizations-bytes branch:
> 
> Adapt the diff3 algorithm to make use of the prefix/suffix optimization.
> 
> * subversion/libsvn_diff/diff3.c
>   (svn_diff_diff3): Open the 3 datasources together with datasources_open, to
>    let it eliminate identical prefix and suffix. Don't forget to adjust the
>    sentinel_positions with the prefix_lines, in the case of an empty
>    position_list.
> 
> Modified:
>     subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c
> 
> Modified: subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c
> URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c?rev=1058753&r1=1058752&r2=1058753&view=diff
> ==============================================================================
> --- subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c (original)
> +++ subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c Thu Jan 13 21:38:16 2011
> @@ -251,10 +251,14 @@ svn_diff_diff3(svn_diff_t **diff,
>  {
>    svn_diff__tree_t *tree;
>    svn_diff__position_t *position_list[3];
> +  svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
> +                                        svn_diff_datasource_modified,
> +                                        svn_diff_datasource_latest};
>    svn_diff__lcs_t *lcs_om;
>    svn_diff__lcs_t *lcs_ol;
>    apr_pool_t *subpool;
>    apr_pool_t *treepool;
> +  apr_off_t prefix_lines = 0;
>  
>    *diff = NULL;
>  
> @@ -263,28 +267,30 @@ svn_diff_diff3(svn_diff_t **diff,
>  
>    svn_diff__tree_create(&tree, treepool);
>  
> +  SVN_ERR(vtable->datasources_open(diff_baton, &prefix_lines, datasource, 3));
> +
>    SVN_ERR(svn_diff__get_tokens(&position_list[0],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_original,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool));
>  
>    SVN_ERR(svn_diff__get_tokens(&position_list[1],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_modified,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool));
>  
>    SVN_ERR(svn_diff__get_tokens(&position_list[2],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_latest,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool));
>  
>    /* Get rid of the tokens, we don't need them to calc the diff */
> @@ -295,9 +301,9 @@ svn_diff_diff3(svn_diff_t **diff,
>    svn_pool_destroy(treepool);
>  
>    /* Get the lcs for original-modified and original-latest */
> -  lcs_om = svn_diff__lcs(position_list[0], position_list[1], 0,
> +  lcs_om = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
>                           subpool);
> -  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], 0,
> +  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
>                           subpool);
>  
>    /* Produce a merged diff */
> @@ -330,7 +336,7 @@ svn_diff_diff3(svn_diff_t **diff,
>        }
>      else
>        {
> -        sentinel_position[0].offset = 1;
> +        sentinel_position[0].offset = prefix_lines + 1;
>          sentinel_position[0].next = NULL;
>          position_list[1] = &sentinel_position[0];
>        }
> @@ -344,7 +350,7 @@ svn_diff_diff3(svn_diff_t **diff,
>        }
>      else
>        {
> -        sentinel_position[1].offset = 1;
> +        sentinel_position[1].offset = prefix_lines + 1;
>          sentinel_position[1].next = NULL;
>          position_list[2] = &sentinel_position[1];
>        }
> 
> 


diff4-optimization-bytes

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.
Johan,

I'm concerned about this change: on the one hand, it's untested and
no one claims to be understanding the code; on the other hand, it
doesn't exactly parallel the diff3 change:

specifically, the last hunk of the diff3 patch (which is also included
below) has no equivalent in the diff4 patch.  A wild guess is that the
parameters to svn_diff__diff() might need to be adjusted.

All in all, given your warnings in the log message, I have zero reason
to believe the change is correct, and I just cited one reason I have to
believe it's wrong :-).


May I suggest that, if this code is to be released, then you validate
its correctnss?  For example, a minimal regression test that is written
to record trunk's pre-branch behaviour might be sufficient.


Best,

Daniel


jcorvel@apache.org wrote on Thu, Jan 13, 2011 at 22:02:38 -0000:
> Author: jcorvel
> Date: Thu Jan 13 22:02:38 2011
> New Revision: 1058759
> 
> URL: http://svn.apache.org/viewvc?rev=1058759&view=rev
> Log:
> On the diff-optimizations-bytes branch:
> 
> Adapt the diff4 algorithm, as far as I understand it, to make use of the
> prefix/suffix optimization.
> 
> Note:
> - diff4 isn't used in svn core, only in tools/diff/diff4.
> - This change is untested, because there is currently no test in the test
>   suite exercising this algorithm, and I don't understand it well enough to
>   write tests for it. Review and/or adding tests for this code is thus very
>   welcome.
> 
> * subversion/libsvn_diff/diff4.c
>   (svn_diff_diff4): Open the 4 datasources together with datasources_open, to
>   let it eliminate identical prefix and suffix.
> 
> Modified:
>     subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c
> 
> Modified: subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c
> URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c?rev=1058759&r1=1058758&r2=1058759&view=diff
> ==============================================================================
> --- subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c (original)
> +++ subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff4.c Thu Jan 13 22:02:38 2011
> @@ -173,6 +173,10 @@ svn_diff_diff4(svn_diff_t **diff,
>  {
>    svn_diff__tree_t *tree;
>    svn_diff__position_t *position_list[4];
> +  svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
> +                                        svn_diff_datasource_modified,
> +                                        svn_diff_datasource_latest,
> +                                        svn_diff_datasource_ancestor};
>    svn_diff__lcs_t *lcs_ol;
>    svn_diff__lcs_t *lcs_adjust;
>    svn_diff_t *diff_ol;
> @@ -181,6 +185,7 @@ svn_diff_diff4(svn_diff_t **diff,
>    apr_pool_t *subpool;
>    apr_pool_t *subpool2;
>    apr_pool_t *subpool3;
> +  apr_off_t prefix_lines = 0;
>  
>    *diff = NULL;
>  
> @@ -190,36 +195,38 @@ svn_diff_diff4(svn_diff_t **diff,
>  
>    svn_diff__tree_create(&tree, subpool3);
>  
> +  SVN_ERR(vtable->datasources_open(diff_baton, &prefix_lines, datasource, 4));
> +
>    SVN_ERR(svn_diff__get_tokens(&position_list[0],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_original,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool2));
>  
>    SVN_ERR(svn_diff__get_tokens(&position_list[1],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_modified,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool));
>  
>    SVN_ERR(svn_diff__get_tokens(&position_list[2],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_latest,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool));
>  
>    SVN_ERR(svn_diff__get_tokens(&position_list[3],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_ancestor,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool2));
>  
>    /* Get rid of the tokens, we don't need them to calc the diff */
> @@ -230,7 +237,8 @@ svn_diff_diff4(svn_diff_t **diff,
>    svn_pool_clear(subpool3);
>  
>    /* Get the lcs for original - latest */
> -  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], 0, subpool3);
> +  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
> +                         subpool3);
>    diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool);
>  
>    svn_pool_clear(subpool3);
> @@ -251,7 +259,8 @@ svn_diff_diff4(svn_diff_t **diff,
>    /* Get the lcs for common ancestor - original
>     * Do reverse adjustements
>     */
> -  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], 0, subpool3);
> +  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], prefix_lines,
> +                             subpool3);
>    diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
>    adjust_diff(diff_ol, diff_adjust);
>  
> @@ -260,7 +269,8 @@ svn_diff_diff4(svn_diff_t **diff,
>    /* Get the lcs for modified - common ancestor
>     * Do forward adjustments
>     */
> -  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], 0, subpool3);
> +  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], prefix_lines,
> +                             subpool3);
>    diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
>    adjust_diff(diff_ol, diff_adjust);
>  
> 
> 

jcorvel@apache.org wrote on Thu, Jan 13, 2011 at 21:38:16 -0000:
> Author: jcorvel
> Date: Thu Jan 13 21:38:16 2011
> New Revision: 1058753
> 
> URL: http://svn.apache.org/viewvc?rev=1058753&view=rev
> Log:
> On the diff-optimizations-bytes branch:
> 
> Adapt the diff3 algorithm to make use of the prefix/suffix optimization.
> 
> * subversion/libsvn_diff/diff3.c
>   (svn_diff_diff3): Open the 3 datasources together with datasources_open, to
>    let it eliminate identical prefix and suffix. Don't forget to adjust the
>    sentinel_positions with the prefix_lines, in the case of an empty
>    position_list.
> 
> Modified:
>     subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c
> 
> Modified: subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c
> URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c?rev=1058753&r1=1058752&r2=1058753&view=diff
> ==============================================================================
> --- subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c (original)
> +++ subversion/branches/diff-optimizations-bytes/subversion/libsvn_diff/diff3.c Thu Jan 13 21:38:16 2011
> @@ -251,10 +251,14 @@ svn_diff_diff3(svn_diff_t **diff,
>  {
>    svn_diff__tree_t *tree;
>    svn_diff__position_t *position_list[3];
> +  svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
> +                                        svn_diff_datasource_modified,
> +                                        svn_diff_datasource_latest};
>    svn_diff__lcs_t *lcs_om;
>    svn_diff__lcs_t *lcs_ol;
>    apr_pool_t *subpool;
>    apr_pool_t *treepool;
> +  apr_off_t prefix_lines = 0;
>  
>    *diff = NULL;
>  
> @@ -263,28 +267,30 @@ svn_diff_diff3(svn_diff_t **diff,
>  
>    svn_diff__tree_create(&tree, treepool);
>  
> +  SVN_ERR(vtable->datasources_open(diff_baton, &prefix_lines, datasource, 3));
> +
>    SVN_ERR(svn_diff__get_tokens(&position_list[0],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_original,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool));
>  
>    SVN_ERR(svn_diff__get_tokens(&position_list[1],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_modified,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool));
>  
>    SVN_ERR(svn_diff__get_tokens(&position_list[2],
>                                 tree,
>                                 diff_baton, vtable,
>                                 svn_diff_datasource_latest,
> -                               FALSE,
> -                               0,
> +                               TRUE,
> +                               prefix_lines,
>                                 subpool));
>  
>    /* Get rid of the tokens, we don't need them to calc the diff */
> @@ -295,9 +301,9 @@ svn_diff_diff3(svn_diff_t **diff,
>    svn_pool_destroy(treepool);
>  
>    /* Get the lcs for original-modified and original-latest */
> -  lcs_om = svn_diff__lcs(position_list[0], position_list[1], 0,
> +  lcs_om = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
>                           subpool);
> -  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], 0,
> +  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
>                           subpool);
>  
>    /* Produce a merged diff */
> @@ -330,7 +336,7 @@ svn_diff_diff3(svn_diff_t **diff,
>        }
>      else
>        {
> -        sentinel_position[0].offset = 1;
> +        sentinel_position[0].offset = prefix_lines + 1;
>          sentinel_position[0].next = NULL;
>          position_list[1] = &sentinel_position[0];
>        }
> @@ -344,7 +350,7 @@ svn_diff_diff3(svn_diff_t **diff,
>        }
>      else
>        {
> -        sentinel_position[1].offset = 1;
> +        sentinel_position[1].offset = prefix_lines + 1;
>          sentinel_position[1].next = NULL;
>          position_list[2] = &sentinel_position[1];
>        }
> 
>