You are viewing a plain text version of this content. The canonical link for it is here.

Posted to dev@subversion.apache.org by Karl Fogel <kf...@red-bean.com> on 2008/06/22 22:19:10 UTC

Re: svn commit: r31504 - trunk/subversion/libsvn_client

cmpilato@tigris.org writes:
> Log:
> Optimize file merges performed with ancestrally related sources by
> using the svn_ra_get_log2() interface to figure out in which revisions
> the source really changed, and using that information to filter out
> no-op revision ranges prior to merging.  We still *record* the
> original set of revision ranges, of course.
>
> * subversion/libsvn_client/merge.c
>   (log_changed_revs, remove_noop_merge_ranges): New functions.
>   (do_file_merge): Selective use remove_noop_merge_ranges() to reduce
>     the amount of work this function needs to do.
>
> --- trunk/subversion/libsvn_client/merge.c (r31503)
> +++ trunk/subversion/libsvn_client/merge.c (r31504)
> @@ -3543,6 +3543,89 @@ get_mergeinfo_paths(apr_array_header_t *
>  }
>  
>  
> +/* Implements the svn_log_entry_receiver_t interface. */
> +static svn_error_t *
> +log_changed_revs(void *baton,
> +                  svn_log_entry_t *log_entry,
> +                  apr_pool_t *pool)
> +{
> +  apr_array_header_t *revs = baton;
> +  svn_revnum_t *revision = apr_palloc(revs->pool, sizeof(*revision));
> +  *revision = log_entry->revision;
> +  APR_ARRAY_PUSH(revs, svn_revnum_t *) = revision;
> +  return SVN_NO_ERROR;
> +}

We should also document what the function actually does -- the fact that
it implements a type is important, but that's not all there is to say
(we have to say what BATON is, for example).  I committed a doc string
tweak in r31845.

> +/* Set *OPERATIVE_RANGES_P to an array of svn_merge_range_t * merge
> +   range objects copied wholesale from RANGES which have the property
> +   that in some revision within that range the object identified by
> +   RA_SESSION was modified (if by "modified" we mean "'svn log' would
> +   return that revision).  *OPERATIVE_RANGES_P is allocated from the
> +   same pool as RANGES, and the ranges within it are shared with
> +   RANGES, too.  Use POOL for temporary allocations.  */
> +static svn_error_t *
> +remove_noop_merge_ranges(apr_array_header_t **operative_ranges_p,
> +                         svn_ra_session_t *ra_session,
> +                         apr_array_header_t *ranges,
> +                         apr_pool_t *pool)
> +{
> +  int i;
> +  svn_revnum_t oldest_rev = SVN_INVALID_REVNUM;
> +  svn_revnum_t youngest_rev = SVN_INVALID_REVNUM;
> +  apr_array_header_t *changed_revs =
> +    apr_array_make(pool, ranges->nelts, sizeof(svn_revnum_t *));
> +  apr_array_header_t *operative_ranges =
> +    apr_array_make(ranges->pool, ranges->nelts, ranges->elt_size);
> +  apr_array_header_t *log_targets = 
> +    apr_array_make(pool, 1, sizeof(const char *));
> +  APR_ARRAY_PUSH(log_targets, const char *) = "";
> +
> +  /* Find the revision extremes of the RANGES we have. */
> +  for (i = 0; i < ranges->nelts; i++)
> +    {
> +      svn_merge_range_t *r = APR_ARRAY_IDX(ranges, i, svn_merge_range_t *);
> +      svn_revnum_t max_rev = MAX(r->start, r->end);
> +      svn_revnum_t min_rev = MIN(r->start, r->end) + 1;
> +
> +      if ((! SVN_IS_VALID_REVNUM(youngest_rev)) || (max_rev > youngest_rev))
> +        youngest_rev = max_rev;
> +      if ((! SVN_IS_VALID_REVNUM(oldest_rev)) || (min_rev < oldest_rev))
> +        oldest_rev = min_rev;
> +    }
> +
> +  /* Get logs across those ranges, recording which revisions hold
> +     changes to our object's history. */
> +  SVN_ERR(svn_ra_get_log2(ra_session, log_targets, youngest_rev, 
> +                          oldest_rev, 0, FALSE, FALSE, FALSE, 
> +                          apr_array_make(pool, 0, sizeof(const char *)),
> +                          log_changed_revs, changed_revs, pool));
> +
> +  /* Now, copy from RANGES to *OPERATIVE_RANGES, filtering out ranges
> +     that aren't operative (by virtue of not having any revisions
> +     represented in the CHANGED_REVS array). */
> +  for (i = 0; i < ranges->nelts; i++)
> +    {
> +      svn_merge_range_t *range = APR_ARRAY_IDX(ranges, i, svn_merge_range_t *);
> +      int j;
> +
> +      for (j = 0; j < changed_revs->nelts; j++)
> +        {
> +          svn_revnum_t *changed_rev = 
> +            APR_ARRAY_IDX(changed_revs, j, svn_revnum_t *);
> +          if ((*changed_rev > MIN(range->start, range->end))
> +              && (*changed_rev <= MAX(range->start, range->end)))
> +            {
> +              APR_ARRAY_PUSH(operative_ranges, svn_merge_range_t *) = range;
> +              break;
> +            }
> +        }
> +    }
> +  *operative_ranges_p = operative_ranges;
> +  return SVN_NO_ERROR;
> +}

Hmmm, a question about that final (innermost) for-loop:

   for (j = 0; j < changed_revs->nelts; j++)
   {
      ...
   }

Do we really have to loop over (on average) half of changed_revs each
time to evaluate whether or not the range in question is operative?
After all, changed_revs is in sorted order already -- I don't know which
way, but svn_ra_get_log2()'s doc string has details on that.  So instead
of looping every time, we could just compare 

    APR_ARRAY_IDX(changed_revs, 0, svn_revnum_t *)
    APR_ARRAY_IDX(changed_revs, changed_revs->nelts, svn_revnum_t *)

against the appropriate MIN|MAX of (range->start, range->end), and know
in constant time whether we're even interested in pursuing it further.
And once we know we *are* interested in pursuing it further -- i.e.,
that there could be a changed_rev that would make this range interesting
-- *then* we could loop, or we could be really fancy and binary search.

I'm hand-waving some on the details here, but I think you get the idea?

>  /*-----------------------------------------------------------------------*/
>  
>  /*** Merge Source Normalization ***/
> @@ -4104,7 +4187,28 @@ do_file_merge(const char *url1,
>  
>    if (!merge_b->record_only)
>      {
> -      for (i = 0; i < remaining_ranges->nelts; i++)
> +      apr_array_header_t *ranges_to_merge = remaining_ranges;
> +      
> +      /* If we have ancestrally related sources and more than one
> +         range to merge, eliminate no-op ranges before going through
> +         the effort of downloading the many copies of the file
> +         required to do these merges (two copies per range). */
> +      if (merge_b->sources_ancestral && (remaining_ranges->nelts > 1))
> +        {
> +          const char *old_sess_url = NULL;
> +          SVN_ERR(svn_client__ensure_ra_session_url(&old_sess_url, 
> +                                                    merge_b->ra_session1,
> +                                                    primary_url, subpool));
> +          SVN_ERR(remove_noop_merge_ranges(&ranges_to_merge, 
> +                                           merge_b->ra_session1, 
> +                                           remaining_ranges, subpool));
> +          if (old_sess_url)
> +            SVN_ERR(svn_ra_reparent(merge_b->ra_session1, old_sess_url, 
> +                                    subpool));
> +          svn_pool_clear(subpool);
> +        }
> +
> +      for (i = 0; i < ranges_to_merge->nelts; i++)
>          {
>            svn_wc_notify_t *n;
>            svn_boolean_t header_sent = FALSE;

*nod* Looks like we effectively depend on remove_noop_merge_ranges()
being able to take the same value (or a reference thereto) for input and
output.  So I updated that doc as well in r31845.

-Karl

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@subversion.tigris.org
For additional commands, e-mail: dev-help@subversion.tigris.org

[PATCH] single-file-merge optimization (was: svn commit: r31504 - trunk/subversion/libsvn_client)

Posted by "C. Michael Pilato" <cm...@collab.net>.

C. Michael Pilato wrote:
> Karl Fogel wrote:
>> Do we really have to loop over (on average) half of changed_revs each
>> time to evaluate whether or not the range in question is operative?
>> After all, changed_revs is in sorted order already -- I don't know which
>> way, but svn_ra_get_log2()'s doc string has details on that.  So instead
>> of looping every time, we could just compare
>>     APR_ARRAY_IDX(changed_revs, 0, svn_revnum_t *)
>>     APR_ARRAY_IDX(changed_revs, changed_revs->nelts, svn_revnum_t *)
> 
> changed_revs->nelts-1 here, but yes, I see what you're saying.

Attaching a patch which should do this, for extra eyes and backup purposes.

-- 
C. Michael Pilato <cm...@collab.net>
CollabNet   <>   www.collab.net   <>   Distributed Development On Demand

Re: svn commit: r31504 - trunk/subversion/libsvn_client

Posted by "C. Michael Pilato" <cm...@collab.net>.

Karl Fogel wrote:
> cmpilato@tigris.org writes:
>> Log:
>> Optimize file merges performed with ancestrally related sources by
>> using the svn_ra_get_log2() interface to figure out in which revisions
>> the source really changed, and using that information to filter out
>> no-op revision ranges prior to merging.  We still *record* the
>> original set of revision ranges, of course.
>>
>> * subversion/libsvn_client/merge.c
>>   (log_changed_revs, remove_noop_merge_ranges): New functions.
>>   (do_file_merge): Selective use remove_noop_merge_ranges() to reduce
>>     the amount of work this function needs to do.
>>
>> --- trunk/subversion/libsvn_client/merge.c (r31503)
>> +++ trunk/subversion/libsvn_client/merge.c (r31504)
>> @@ -3543,6 +3543,89 @@ get_mergeinfo_paths(apr_array_header_t *
>>  }
>>  
>>  
>> +/* Implements the svn_log_entry_receiver_t interface. */
>> +static svn_error_t *
>> +log_changed_revs(void *baton,
>> +                  svn_log_entry_t *log_entry,
>> +                  apr_pool_t *pool)
>> +{
>> +  apr_array_header_t *revs = baton;
>> +  svn_revnum_t *revision = apr_palloc(revs->pool, sizeof(*revision));
>> +  *revision = log_entry->revision;
>> +  APR_ARRAY_PUSH(revs, svn_revnum_t *) = revision;
>> +  return SVN_NO_ERROR;
>> +}
> 
> We should also document what the function actually does -- the fact that
> it implements a type is important, but that's not all there is to say
> (we have to say what BATON is, for example).  I committed a doc string
> tweak in r31845.
> 
>> +/* Set *OPERATIVE_RANGES_P to an array of svn_merge_range_t * merge
>> +   range objects copied wholesale from RANGES which have the property
>> +   that in some revision within that range the object identified by
>> +   RA_SESSION was modified (if by "modified" we mean "'svn log' would
>> +   return that revision).  *OPERATIVE_RANGES_P is allocated from the
>> +   same pool as RANGES, and the ranges within it are shared with
>> +   RANGES, too.  Use POOL for temporary allocations.  */
>> +static svn_error_t *
>> +remove_noop_merge_ranges(apr_array_header_t **operative_ranges_p,
>> +                         svn_ra_session_t *ra_session,
>> +                         apr_array_header_t *ranges,
>> +                         apr_pool_t *pool)
>> +{
>> +  int i;
>> +  svn_revnum_t oldest_rev = SVN_INVALID_REVNUM;
>> +  svn_revnum_t youngest_rev = SVN_INVALID_REVNUM;
>> +  apr_array_header_t *changed_revs =
>> +    apr_array_make(pool, ranges->nelts, sizeof(svn_revnum_t *));
>> +  apr_array_header_t *operative_ranges =
>> +    apr_array_make(ranges->pool, ranges->nelts, ranges->elt_size);
>> +  apr_array_header_t *log_targets = 
>> +    apr_array_make(pool, 1, sizeof(const char *));
>> +  APR_ARRAY_PUSH(log_targets, const char *) = "";
>> +
>> +  /* Find the revision extremes of the RANGES we have. */
>> +  for (i = 0; i < ranges->nelts; i++)
>> +    {
>> +      svn_merge_range_t *r = APR_ARRAY_IDX(ranges, i, svn_merge_range_t *);
>> +      svn_revnum_t max_rev = MAX(r->start, r->end);
>> +      svn_revnum_t min_rev = MIN(r->start, r->end) + 1;
>> +
>> +      if ((! SVN_IS_VALID_REVNUM(youngest_rev)) || (max_rev > youngest_rev))
>> +        youngest_rev = max_rev;
>> +      if ((! SVN_IS_VALID_REVNUM(oldest_rev)) || (min_rev < oldest_rev))
>> +        oldest_rev = min_rev;
>> +    }
>> +
>> +  /* Get logs across those ranges, recording which revisions hold
>> +     changes to our object's history. */
>> +  SVN_ERR(svn_ra_get_log2(ra_session, log_targets, youngest_rev, 
>> +                          oldest_rev, 0, FALSE, FALSE, FALSE, 
>> +                          apr_array_make(pool, 0, sizeof(const char *)),
>> +                          log_changed_revs, changed_revs, pool));
>> +
>> +  /* Now, copy from RANGES to *OPERATIVE_RANGES, filtering out ranges
>> +     that aren't operative (by virtue of not having any revisions
>> +     represented in the CHANGED_REVS array). */
>> +  for (i = 0; i < ranges->nelts; i++)
>> +    {
>> +      svn_merge_range_t *range = APR_ARRAY_IDX(ranges, i, svn_merge_range_t *);
>> +      int j;
>> +
>> +      for (j = 0; j < changed_revs->nelts; j++)
>> +        {
>> +          svn_revnum_t *changed_rev = 
>> +            APR_ARRAY_IDX(changed_revs, j, svn_revnum_t *);
>> +          if ((*changed_rev > MIN(range->start, range->end))
>> +              && (*changed_rev <= MAX(range->start, range->end)))
>> +            {
>> +              APR_ARRAY_PUSH(operative_ranges, svn_merge_range_t *) = range;
>> +              break;
>> +            }
>> +        }
>> +    }
>> +  *operative_ranges_p = operative_ranges;
>> +  return SVN_NO_ERROR;
>> +}
> 
> Hmmm, a question about that final (innermost) for-loop:
> 
>    for (j = 0; j < changed_revs->nelts; j++)
>    {
>       ...
>    }
> 
> Do we really have to loop over (on average) half of changed_revs each
> time to evaluate whether or not the range in question is operative?
> After all, changed_revs is in sorted order already -- I don't know which
> way, but svn_ra_get_log2()'s doc string has details on that.  So instead
> of looping every time, we could just compare 
> 
>     APR_ARRAY_IDX(changed_revs, 0, svn_revnum_t *)
>     APR_ARRAY_IDX(changed_revs, changed_revs->nelts, svn_revnum_t *)

changed_revs->nelts-1 here, but yes, I see what you're saying.

-- 
C. Michael Pilato <cm...@collab.net>
CollabNet   <>   www.collab.net   <>   Distributed Development On Demand