You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@subversion.apache.org by jc...@apache.org on 2010/11/24 01:03:07 UTC

svn commit: r1038390 - in /subversion/branches/diff-optimizations-tokens/subversion: include/ libsvn_diff/

Author: jcorvel
Date: Wed Nov 24 00:03:07 2010
New Revision: 1038390

URL: http://svn.apache.org/viewvc?rev=1038390&view=rev
Log:
On the diff-optimizations-tokens branch:

Make svn_diff skip identical prefix to make diff and blame faster.

* subversion/include/svn_diff.h
  (svn_diff_fns_t): Add new function type token_pushback_prefix to the
   vtable. This function is needed to push back the last token that has been
   read during prefix scanning (the first non-matching line), so it can be
   read again (and checksummed) during the get_next_token phase.

* subversion/libsvn_diff/diff_memory.c
  (datasource_get_next_token): Make the checksum calculation optional (only
   calculate checksum if HASH is not NULL).
  (token_pushback_prefix): New function.
  (svn_diff__mem_vtable): Add new function token_pushback_prefix.

* subversion/libsvn_diff/diff_file.c
  (svn_diff__file_baton_t): Add member PREFIX_TOKENS, a linked list of pushed
   back prefix tokens.
  (datasource_get_next_token): Make the checksum calculation optional (only
   calculate checksum if HASH is not NULL). If applicable, use a pushed back
   prefix token to adjust the raw_length of a token that is being reread.
  (token_pushback_prefix): New function.
  (svn_diff__mem_vtable): Add new function token_pushback_prefix.

* subversion/libsvn_diff/diff.h
  (svn_diff__lcs): Add argument "prefix_lines".
  (svn_diff__get_all_tokens): New function declaration.

* subversion/libsvn_diff/token.c
  (find_identical_prefix): New function.
  (svn_diff__get_all_tokens): New function. The purpose of this function is to
   read the tokens from all datasources at once, so it can first scan for
   identical prefix and suffix.

* subversion/libsvn_diff/lcs.c
  (svn_diff__lcs): Added argument "prefix_lines". Use this to correctly set
   the offset of the sentinel position for EOF, even if one of the files
   became empty after eliminating the identical prefix.

* subversion/libsvn_diff/diff.c
  (svn_diff__diff): Add a chunk of "common" diff for identical prefix.
  (svn_diff_diff): Use new function svn_diff__get_all_tokens to get the tokens
   from original and modified at once (while scanning for identical prefix).
   Pass prefix_lines to svn_diff__lcs and svn_diff__diff.

* subversion/libsvn_diff/diff3.c
  (svn_diff_diff3): For now, don't use svn_diff__get_all_tokens yet.
   Pass prefix_lines = 0 to svn_diff__lcs.

* subversion/libsvn_diff/diff4.c
  (svn_diff_diff4): For now, don't use svn_diff__get_all_tokens yet.
   Pass prefix_lines = 0 to svn_diff__lcs.

Modified:
    subversion/branches/diff-optimizations-tokens/subversion/include/svn_diff.h
    subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.c
    subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.h
    subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff3.c
    subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff4.c
    subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_file.c
    subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_memory.c
    subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/lcs.c
    subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/token.c

Modified: subversion/branches/diff-optimizations-tokens/subversion/include/svn_diff.h
URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/include/svn_diff.h?rev=1038390&r1=1038389&r2=1038390&view=diff
==============================================================================
--- subversion/branches/diff-optimizations-tokens/subversion/include/svn_diff.h (original)
+++ subversion/branches/diff-optimizations-tokens/subversion/include/svn_diff.h Wed Nov 24 00:03:07 2010
@@ -135,6 +135,10 @@ typedef struct svn_diff_fns_t
                                 void *rtoken,
                                 int *compare);
 
+  svn_error_t *(*token_pushback_prefix)(void *diff_baton,
+                                        void *token,
+                                        svn_diff_datasource_e datasource);
+
   /** Free @a token from memory, the diff algorithm is done with it. */
   void (*token_discard)(void *diff_baton,
                         void *token);

Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.c
URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.c?rev=1038390&r1=1038389&r2=1038390&view=diff
==============================================================================
--- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.c (original)
+++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.c Wed Nov 24 00:03:07 2010
@@ -43,6 +43,22 @@ svn_diff__diff(svn_diff__lcs_t *lcs,
   svn_diff_t *diff;
   svn_diff_t **diff_ref = &diff;
 
+  if (want_common && (original_start > 1))
+    {
+      /* we have a prefix to skip */
+      (*diff_ref) = apr_palloc(pool, sizeof(**diff_ref));
+
+      (*diff_ref)->type = svn_diff__type_common;
+      (*diff_ref)->original_start = 0;
+      (*diff_ref)->original_length = original_start - 1;
+      (*diff_ref)->modified_start = 0;
+      (*diff_ref)->modified_length = modified_start - 1;
+      (*diff_ref)->latest_start = 0;
+      (*diff_ref)->latest_length = 0;
+
+      diff_ref = &(*diff_ref)->next;
+    }
+
   while (1)
     {
       if (original_start < lcs->position[0]->offset
@@ -105,11 +121,17 @@ svn_diff_diff(svn_diff_t **diff,
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[2];
+  svn_diff__position_t **position_list_ref[2];
+  svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
+                                        svn_diff_datasource_modified};
   svn_diff__lcs_t *lcs;
   apr_pool_t *subpool;
   apr_pool_t *treepool;
+  apr_off_t prefix_lines = 0;
 
   *diff = NULL;
+  position_list_ref[0] = &position_list[0];
+  position_list_ref[1] = &position_list[1];
 
   subpool = svn_pool_create(pool);
   treepool = svn_pool_create(pool);
@@ -117,17 +139,12 @@ svn_diff_diff(svn_diff_t **diff,
   svn_diff__tree_create(&tree, treepool);
 
   /* Insert the data into the tree */
-  SVN_ERR(svn_diff__get_tokens(&position_list[0],
-                               tree,
-                               diff_baton, vtable,
-                               svn_diff_datasource_original,
-                               subpool));
-
-  SVN_ERR(svn_diff__get_tokens(&position_list[1],
-                               tree,
-                               diff_baton, vtable,
-                               svn_diff_datasource_modified,
-                               subpool));
+  SVN_ERR(svn_diff__get_all_tokens(position_list_ref,
+                                   &prefix_lines,
+                                   tree,
+                                   diff_baton, vtable,
+                                   datasource, 2,
+                                   subpool));
 
   /* The cool part is that we don't need the tokens anymore.
    * Allow the app to clean them up if it wants to.
@@ -139,10 +156,10 @@ svn_diff_diff(svn_diff_t **diff,
   svn_pool_destroy(treepool);
 
   /* Get the lcs */
-  lcs = svn_diff__lcs(position_list[0], position_list[1], subpool);
+  lcs = svn_diff__lcs(position_list[0], position_list[1], prefix_lines, subpool);
 
   /* Produce the diff */
-  *diff = svn_diff__diff(lcs, 1, 1, TRUE, pool);
+  *diff = svn_diff__diff(lcs, prefix_lines + 1, prefix_lines + 1, TRUE, pool);
 
   /* Get rid of all the data we don't have a use for anymore */
   svn_pool_destroy(subpool);

Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.h
URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.h?rev=1038390&r1=1038389&r2=1038390&view=diff
==============================================================================
--- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.h (original)
+++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.h Wed Nov 24 00:03:07 2010
@@ -91,6 +91,7 @@ typedef enum svn_diff__normalize_state_t
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) */
               svn_diff__position_t *position_list2, /* pointer to tail (ring) */
+              apr_off_t prefix_lines,
               apr_pool_t *pool);
 
 
@@ -113,6 +114,20 @@ svn_diff__get_tokens(svn_diff__position_
                      svn_diff_datasource_e datasource,
                      apr_pool_t *pool);
 
+/*
+ * Get all tokens from all datasources (skipping identical prefix and suffix).
+ * Return the last item in the (circular) list.
+ */
+svn_error_t *
+svn_diff__get_all_tokens(svn_diff__position_t **position_list[],
+                         apr_off_t *prefix_lines,
+                         svn_diff__tree_t *tree,
+                         void *diff_baton,
+                         const svn_diff_fns_t *vtable,
+                         svn_diff_datasource_e datasource[],
+                         int datasource_len,
+                         apr_pool_t *pool);
+
 
 /* Morph a svn_lcs_t into a svn_diff_t. */
 svn_diff_t *

Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff3.c
URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff3.c?rev=1038390&r1=1038389&r2=1038390&view=diff
==============================================================================
--- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff3.c (original)
+++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff3.c Wed Nov 24 00:03:07 2010
@@ -173,7 +173,7 @@ svn_diff__resolve_conflict(svn_diff_t *h
         position[1]->next = start_position[1];
       }
 
-    *lcs_ref = svn_diff__lcs(position[0], position[1],
+    *lcs_ref = svn_diff__lcs(position[0], position[1], 0,
                              subpool);
 
     /* Fix up the EOF lcs element in case one of
@@ -289,9 +289,9 @@ svn_diff_diff3(svn_diff_t **diff,
   svn_pool_destroy(treepool);
 
   /* Get the lcs for original-modified and original-latest */
-  lcs_om = svn_diff__lcs(position_list[0], position_list[1],
+  lcs_om = svn_diff__lcs(position_list[0], position_list[1], 0,
                          subpool);
-  lcs_ol = svn_diff__lcs(position_list[0], position_list[2],
+  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], 0,
                          subpool);
 
   /* Produce a merged diff */

Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff4.c
URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff4.c?rev=1038390&r1=1038389&r2=1038390&view=diff
==============================================================================
--- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff4.c (original)
+++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff4.c Wed Nov 24 00:03:07 2010
@@ -222,7 +222,7 @@ svn_diff_diff4(svn_diff_t **diff,
   svn_pool_clear(subpool3);
 
   /* Get the lcs for original - latest */
-  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], subpool3);
+  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], 0, subpool3);
   diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool);
 
   svn_pool_clear(subpool3);
@@ -243,7 +243,7 @@ svn_diff_diff4(svn_diff_t **diff,
   /* Get the lcs for common ancestor - original
    * Do reverse adjustements
    */
-  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], subpool3);
+  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], 0, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
 
@@ -252,7 +252,7 @@ svn_diff_diff4(svn_diff_t **diff,
   /* Get the lcs for modified - common ancestor
    * Do forward adjustments
    */
-  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], subpool3);
+  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], 0, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
 

Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_file.c
URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_file.c?rev=1038390&r1=1038389&r2=1038390&view=diff
==============================================================================
--- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_file.c (original)
+++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_file.c Wed Nov 24 00:03:07 2010
@@ -82,6 +82,9 @@ typedef struct svn_diff__file_baton_t
     char *endp;    /* next memory address after the current chunk */
 
     svn_diff__normalize_state_t normalize_state;
+
+    /* Linked list of pushed back prefix tokens */
+    svn_diff__file_token_t *prefix_tokens;
   } files[4];
 
   /* List of free tokens that may be reused. */
@@ -330,7 +333,8 @@ datasource_get_next_token(apr_uint32_t *
                                  &file->normalize_state,
                                  curp, file_baton->options);
       file_token->length += length;
-      h = svn_diff__adler32(h, curp, length);
+      if (hash)
+        h = svn_diff__adler32(h, curp, length);
 
       curp = endp = file->buffer;
       file->chunk++;
@@ -377,10 +381,48 @@ datasource_get_next_token(apr_uint32_t *
 
       file_token->length += length;
 
-      *hash = svn_diff__adler32(h, c, length);
+      if (hash)
+        *hash = svn_diff__adler32(h, c, length);
       *token = file_token;
     }
 
+  /* If there is a pushed back prefix_token available, that means we have
+     read (and normalized in-place) this token before. Adjust our new token
+     with the original raw_length of the prefix_token. */
+  if (file->prefix_tokens)
+    {
+      svn_diff__file_token_t *prefix_token = file->prefix_tokens;
+      file->prefix_tokens = file->prefix_tokens->next;
+
+      if (file_token->raw_length < prefix_token->raw_length)
+        {
+          int length_difference = prefix_token->raw_length 
+                                  - file_token->raw_length;
+          if (length_difference < file->endp - file->curp)
+            {
+              file->curp += length_difference;
+            }
+          else
+            {
+              /* ### TODO: rework this so it can handle crossing multiple 
+                 chunks, instead of only one. */
+              length_difference -= file->endp - file->curp;
+              file->curp = file->buffer;
+              file->chunk++;
+              length = file->chunk == last_chunk ?
+                offset_in_chunk(file->size) : CHUNK_SIZE;
+              file->endp = file->curp + length;
+
+              SVN_ERR(read_chunk(file->file, file->path,
+                                 file->curp, length,
+                                 chunk_to_offset(file->chunk),
+                                 file_baton->pool));
+              file->curp += length_difference;
+            }
+          file_token->raw_length = prefix_token->raw_length;
+        }
+    }
+
   return SVN_NO_ERROR;
 }
 
@@ -506,6 +548,37 @@ token_compare(void *baton, void *token1,
   return SVN_NO_ERROR;
 }
 
+static svn_error_t *
+token_pushback_prefix(void *baton,
+                      void *token,
+                      svn_diff_datasource_e datasource)
+{
+  svn_diff__file_baton_t *file_baton = baton;
+  svn_diff__file_token_t *file_token = token;
+  struct file_info *file =
+    &file_baton->files[datasource_to_index(datasource)];
+
+  file_token->next = file->prefix_tokens;
+  file->prefix_tokens = file_token;
+
+  if (offset_to_chunk(file_token->norm_offset) == file->chunk)
+    {
+      file->curp = file->buffer + offset_in_chunk(file_token->norm_offset);
+    }
+  else
+    {
+      /* ### TODO: rework this so it can handle crossing multiple 
+         chunks, instead of only one. */
+      file->chunk--;
+      SVN_ERR(read_chunk(file->file, file->path, file->buffer,
+                         CHUNK_SIZE, chunk_to_offset(file->chunk),
+                         file_baton->pool));
+      file->endp = file->buffer + CHUNK_SIZE;
+      file->curp = file->buffer + offset_in_chunk(file_token->norm_offset);
+    }
+
+  return SVN_NO_ERROR;
+}
 
 /* Implements svn_diff_fns_t::token_discard */
 static void
@@ -536,6 +609,7 @@ static const svn_diff_fns_t svn_diff__fi
   datasource_close,
   datasource_get_next_token,
   token_compare,
+  token_pushback_prefix,
   token_discard,
   token_discard_all
 };

Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_memory.c
URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_memory.c?rev=1038390&r1=1038389&r2=1038390&view=diff
==============================================================================
--- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_memory.c (original)
+++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_memory.c Wed Nov 24 00:03:07 2010
@@ -128,7 +128,8 @@ datasource_get_next_token(apr_uint32_t *
 
       svn_diff__normalize_buffer(&buf, &len, &state, tok->data,
                                  mem_baton->normalization_options);
-      *hash = svn_diff__adler32(0, buf, len);
+      if (hash)
+        *hash = svn_diff__adler32(0, buf, len);
       src->next_token++;
     }
   else
@@ -166,6 +167,19 @@ token_compare(void *baton, void *token1,
   return SVN_NO_ERROR;
 }
 
+static svn_error_t *
+token_pushback_prefix(void *baton,
+                      void *token,
+                      svn_diff_datasource_e datasource)
+{
+  diff_mem_baton_t *mem_baton = baton;
+  source_tokens_t *src = &(mem_baton->sources[datasource_to_index(datasource)]);
+
+  src->next_token--;  
+  
+  return SVN_NO_ERROR;
+}
+
 /* Implements svn_diff_fns_t::token_discard */
 static void
 token_discard(void *baton, void *token)
@@ -192,6 +206,7 @@ static const svn_diff_fns_t svn_diff__me
   datasource_close,
   datasource_get_next_token,
   token_compare,
+  token_pushback_prefix,
   token_discard,
   token_discard_all
 };

Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/lcs.c
URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/lcs.c?rev=1038390&r1=1038389&r2=1038390&view=diff
==============================================================================
--- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/lcs.c (original)
+++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/lcs.c Wed Nov 24 00:03:07 2010
@@ -163,6 +163,7 @@ svn_diff__lcs_reverse(svn_diff__lcs_t *l
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) */
               svn_diff__position_t *position_list2, /* pointer to tail (ring) */
+              apr_off_t prefix_lines,
               apr_pool_t *pool)
 {
   int idx;
@@ -180,9 +181,11 @@ svn_diff__lcs(svn_diff__position_t *posi
    */
   lcs = apr_palloc(pool, sizeof(*lcs));
   lcs->position[0] = apr_pcalloc(pool, sizeof(*lcs->position[0]));
-  lcs->position[0]->offset = position_list1 ? position_list1->offset + 1 : 1;
+  lcs->position[0]->offset = position_list1 ? 
+    position_list1->offset + 1 : prefix_lines + 1;
   lcs->position[1] = apr_pcalloc(pool, sizeof(*lcs->position[1]));
-  lcs->position[1]->offset = position_list2 ? position_list2->offset + 1 : 1;
+  lcs->position[1]->offset = position_list2 ?
+    position_list2->offset + 1 : prefix_lines + 1;
   lcs->length = 0;
   lcs->refcount = 1;
   lcs->next = NULL;

Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/token.c
URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/token.c?rev=1038390&r1=1038389&r2=1038390&view=diff
==============================================================================
--- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/token.c (original)
+++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/token.c Wed Nov 24 00:03:07 2010
@@ -187,3 +187,148 @@ svn_diff__get_tokens(svn_diff__position_
 
   return SVN_NO_ERROR;
 }
+
+/* Find identical prefix between all datasources
+ */
+static svn_error_t *
+find_identical_prefix(svn_boolean_t *reached_one_eof,
+                      apr_off_t *prefix_lines,
+                      void *diff_baton,
+                      const svn_diff_fns_t *vtable,
+                      svn_diff_datasource_e datasource[],
+                      int datasource_len)
+{
+  void *token[4];
+  svn_boolean_t is_match, reached_all_eof;
+  int i, rv;
+
+  *prefix_lines = 0;
+  *reached_one_eof = FALSE;
+
+  /* Read first token from every datasource, and check if it matches. */
+  for (i = 0; i < datasource_len; i++)
+    {
+      SVN_ERR(vtable->datasource_get_next_token(NULL, &token[i],
+                                                diff_baton, datasource[i]));
+      *reached_one_eof = *reached_one_eof || token[i] == NULL;
+    }
+  if (*reached_one_eof)
+    {
+      /* We're done. Push back tokens that we may have read too much. */
+      for (i = 0; i < datasource_len; i++)
+        if (token[i] != NULL)
+          SVN_ERR(vtable->token_pushback_prefix(diff_baton, token[i], datasource[i]));
+      return SVN_NO_ERROR;
+    }
+  for (i = 1, is_match = TRUE; is_match && i < datasource_len; i++)
+    {
+      SVN_ERR(vtable->token_compare(diff_baton, token[0], token[i], &rv));
+      is_match = is_match && rv == 0;
+    }
+
+  /* While tokens match up, continue getting tokens and matching. */
+  while (is_match)
+    {
+      (*prefix_lines)++;
+      for (i = 0; i < datasource_len; i++)
+        {
+          SVN_ERR(vtable->datasource_get_next_token(NULL, &token[i],
+                                                    diff_baton, datasource[i]));
+          *reached_one_eof = *reached_one_eof || token[i] == NULL;
+        }
+      if (*reached_one_eof)
+        break;
+      else
+        for (i = 1, is_match = TRUE; is_match && i < datasource_len; i++)
+          {
+            SVN_ERR(vtable->token_compare(diff_baton, token[0], token[i], &rv));
+            is_match = is_match && rv == 0;
+          }
+    }
+
+  /* If all files reached their end (i.e. are fully identical), we're done. */
+  for (i = 0, reached_all_eof = TRUE; i < datasource_len; i++)
+    reached_all_eof = reached_all_eof && token[i] == NULL;
+  if (reached_all_eof)
+    return SVN_NO_ERROR;
+
+  /* Push back the non-matching token we read. */
+  for (i = 0; i < datasource_len; i++)
+    if (token[i] != NULL)
+      SVN_ERR(vtable->token_pushback_prefix(diff_baton, token[i], datasource[i]));
+
+  return SVN_NO_ERROR;
+}
+
+/*
+ * Get all tokens from the datasources.  For each datasource, return the
+ * last item in the (circular) list.
+ */
+svn_error_t *
+svn_diff__get_all_tokens(svn_diff__position_t **position_list[],
+                         apr_off_t *prefix_lines,
+                         svn_diff__tree_t *tree,
+                         void *diff_baton,
+                         const svn_diff_fns_t *vtable,
+                         svn_diff_datasource_e datasource[],
+                         int datasource_len,
+                         apr_pool_t *pool)
+{
+  svn_diff__position_t *start_position;
+  svn_diff__position_t *position = NULL;
+  svn_diff__position_t **position_ref;
+  svn_diff__node_t *node;
+  void *token;
+  apr_off_t offset;
+  apr_uint32_t hash;
+  svn_boolean_t reached_one_eof;
+  int i;
+
+  for (i = 0; i < datasource_len; i++)
+    {
+      *position_list[i] = NULL;
+      SVN_ERR(vtable->datasource_open(diff_baton, datasource[i]));
+    }
+
+  /* find identical prefix */
+  SVN_ERR(find_identical_prefix(&reached_one_eof, prefix_lines, 
+                                diff_baton, vtable, datasource, datasource_len));
+
+  /* ### TODO: find identical suffix (if not eof) */
+
+  for (i = 0; i < datasource_len; i++)
+    {
+      position_ref = &start_position;
+      offset = *prefix_lines;
+      hash = 0; /* The callback fn doesn't need to touch it per se */
+      while (1)
+        {
+          SVN_ERR(vtable->datasource_get_next_token(&hash, &token,
+                                                    diff_baton, datasource[i]));
+          if (token == NULL)
+            break;
+
+          offset++;
+          SVN_ERR(svn_diff__tree_insert_token(&node, tree,
+                                              diff_baton, vtable,
+                                              hash, token));
+
+          /* Create a new position */
+          position = apr_palloc(pool, sizeof(*position));
+          position->next = NULL;
+          position->node = node;
+          position->offset = offset;
+
+          *position_ref = position;
+          position_ref = &position->next;
+        }
+
+      *position_ref = start_position;
+
+      SVN_ERR(vtable->datasource_close(diff_baton, datasource[i]));
+
+      *position_list[i] = position;
+    }
+
+  return SVN_NO_ERROR;
+}