You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@subversion.apache.org by da...@apache.org on 2010/06/30 19:55:25 UTC

svn commit: r959388 - in /subversion/trunk/subversion: libsvn_diff/parse-diff.c tests/libsvn_diff/parse-diff-test.c

Author: dannas
Date: Wed Jun 30 17:55:25 2010
New Revision: 959388

URL: http://svn.apache.org/viewvc?rev=959388&view=rev
Log:
Add support for parsing git diffs. The parser is implemented as a lookup
table. The idea is that we call different callbacks depending on what line
we have just read and or current state. There is one flag though, to help us
distinguish between the case when we have tree modifications and when we
have both tree and text modifications. In the former case, the last read
line will be put back into the stream so that it can be re-read when we
start scanning for a new patch.

Currently the parser isn't very strict about how a 'git diff' header must
look like.  For instance it doesn't ..

.. handle paths with spaces, tabs or newlines.
.. check if the lines are continous, e.g. we could have a line between 
   'copy from ' and 'copy to ' and the parser would still say that the patch is
   valid.
.. compare the paths found in 'git --diff a/path b/path' with 'copy from
   path' and 'copy to path'

But it's a start and something to work from!

* subversion/libsvn_diff/parse-diff.c
  (parse_next_hunk): Add check for 'git --diff'. If we see a line like that
    when looking for a hunk header, we bail out.
  (parse_state): The different states our state machine can be in.
  (transition): The struct descibing transitions in the state machine.
  (grab_filename): New.
  (diff_minus,
   diff_plus,
   git_start,
   git_minus,
   git_plus,
   git_move_from,
   git_move_to,
   git_copy_from,
   git_copy_to,
   git_new_file,
   git_deleted_file): Callbacks for the parser.
  (svn_diff_parse_next_patch): Add a table of transitions and loop over
    those transitions until we've either find a regular unidiff or a
    git diff or reached eof.

* subversion/tests/libsvn_diff/parse-diff-tests.c
  (test_funcs): Remove XFail for test_parse_git_diff() and
    test_parse_git_tree_and_text_diff().

Modified:
    subversion/trunk/subversion/libsvn_diff/parse-diff.c
    subversion/trunk/subversion/tests/libsvn_diff/parse-diff-test.c

Modified: subversion/trunk/subversion/libsvn_diff/parse-diff.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_diff/parse-diff.c?rev=959388&r1=959387&r2=959388&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_diff/parse-diff.c (original)
+++ subversion/trunk/subversion/libsvn_diff/parse-diff.c Wed Jun 30 17:55:25 2010
@@ -479,7 +479,8 @@ parse_next_hunk(svn_hunk_t **hunk,
               SVN_ERR(parse_prop_name(prop_name, line->data, "Modified: ",
                                       result_pool));
             }
-          else if (starts_with(line->data, minus))
+          else if (starts_with(line->data, minus)
+                   || starts_with(line->data, "git --diff "))
             /* This could be a header of another patch. Bail out. */
             break;
         }
@@ -578,6 +579,268 @@ close_hunk(const svn_hunk_t *hunk)
   return SVN_NO_ERROR;
 }
 
+enum parse_state
+{
+   state_start,
+   state_git_diff_seen,
+  /* if we have an add || del || cp src+dst || mv src+dst */
+   state_git_tree_seen, 
+   state_git_minus_seen,
+   state_git_plus_seen,
+   state_move_from_seen,
+   state_copy_from_seen,
+   state_minus_seen,
+   state_unidiff_found,
+   state_add_seen,
+   state_del_seen,
+   state_git_header_found
+};
+
+struct transition
+{
+  const char *line;
+  enum parse_state state;
+  svn_error_t *(*fn)(enum parse_state *state, const char *line, 
+                     svn_patch_t *patch, apr_pool_t *result_pool,
+                     apr_pool_t *scratch_pool);
+};
+
+/* UTF-8 encode and canonicalize the content of LINE as FILE_NAME. */
+static svn_error_t *
+grab_filename(const char **file_name, const char *line, apr_pool_t *result_pool,
+              apr_pool_t *scratch_pool)
+{
+  const char *utf8_path;
+  const char *canon_path;
+
+  /* Grab the filename and encode it in UTF-8. */
+  /* TODO: Allow specifying the patch file's encoding.
+   *       For now, we assume its encoding is native. */
+  SVN_ERR(svn_utf_cstring_to_utf8(&utf8_path,
+                                  line,
+                                  scratch_pool));
+
+  /* Canonicalize the path name. */
+  canon_path = svn_dirent_canonicalize(utf8_path, scratch_pool);
+
+  *file_name = apr_pstrdup(result_pool, canon_path);
+
+  return SVN_NO_ERROR;
+}
+
+/* Parse the '--- ' line of a regular unidiff. */
+static svn_error_t *
+diff_minus(enum parse_state *state, const char *line, svn_patch_t *patch,
+           apr_pool_t *result_pool, apr_pool_t *scratch_pool)
+{
+  /* If we can find a tab, it separates the filename from
+   * the rest of the line which we can discard. */
+  char *tab = strchr(line, '\t');
+  if (tab)
+    *tab = '\0';
+
+  SVN_ERR(grab_filename(&patch->old_filename, line + strlen("--- "),
+                        result_pool, scratch_pool));
+
+  *state = state_minus_seen;
+
+  return SVN_NO_ERROR;
+}
+
+/* Parse the '+++ ' line of a regular unidiff. */
+static svn_error_t *
+diff_plus(enum parse_state *state, const char *line, svn_patch_t *patch,
+           apr_pool_t *result_pool, apr_pool_t *scratch_pool)
+{
+  /* If we can find a tab, it separates the filename from
+   * the rest of the line which we can discard. */
+  char *tab = strchr(line, '\t');
+  if (tab)
+    *tab = '\0';
+
+  SVN_ERR(grab_filename(&patch->new_filename, line + strlen("+++ "),
+                        result_pool, scratch_pool));
+
+  *state = state_unidiff_found;
+
+  return SVN_NO_ERROR;
+}
+
+/* Parse the first line of a git extended unidiff. */
+static svn_error_t *
+git_start(enum parse_state *state, const char *line, svn_patch_t *patch,
+          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
+{
+  const char *old_path;
+  const char *new_path;
+  char *end_old_path;
+  char *slash;
+
+  /* ### Add handling of escaped paths
+   * http://www.kernel.org/pub/software/scm/git/docs/git-diff.html: 
+   *
+   * TAB, LF, double quote and backslash characters in pathnames are
+   * represented as \t, \n, \" and \\, respectively. If there is need for
+   * such substitution then the whole pathname is put in double quotes.
+   */
+
+  /* Our line should look like this: 'git --diff a/path b/path'. 
+   * If we find any deviations from that format, we return with state reset
+   * to start.
+   *
+   * ### We can't handle paths with spaces!
+   */
+  slash = strchr(line, '/');
+
+  if (! slash)
+    {
+      *state = state_start;
+      return SVN_NO_ERROR;
+    }
+
+  old_path = slash + 1;
+
+  if (! *old_path)
+    {
+      *state = state_start;
+      return SVN_NO_ERROR;
+    }
+
+  end_old_path = strchr(old_path, ' ');
+
+  if (end_old_path)
+    *end_old_path = '\0';
+  else
+    {
+      *state = state_start;
+      return SVN_NO_ERROR;
+    }
+
+  /* The new path begins after the first slash after the old path. */
+  slash = strchr(end_old_path + 1, '/');
+
+  if (! slash)
+    {
+      *state = state_start;
+      return SVN_NO_ERROR;
+    }
+
+  /* The path starts after the slash */
+  new_path = slash + 1;
+
+  if (! *new_path)
+    {
+      *state = state_start;
+      return SVN_NO_ERROR;
+    }
+
+  SVN_ERR(grab_filename(&patch->old_filename, old_path,
+                        result_pool, scratch_pool));
+
+  SVN_ERR(grab_filename(&patch->new_filename, new_path,
+                        result_pool, scratch_pool));
+
+  /* We assume that the path is only modified until we've found a 'tree'
+   * header */
+  patch->operation = svn_diff_op_modified;
+
+  *state = state_git_diff_seen;
+  return SVN_NO_ERROR;
+}
+
+/* Parse the '--- ' line of a git extended unidiff. */
+static svn_error_t *
+git_minus(enum parse_state *state, const char *line, svn_patch_t *patch,
+          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
+{
+  /* ### Check that the path is consistent with the 'git --diff ' line. */
+
+  *state = state_git_minus_seen;
+  return SVN_NO_ERROR;
+}
+
+/* Parse the '+++ ' line of a git extended unidiff. */
+static svn_error_t *
+git_plus(enum parse_state *state, const char *line, svn_patch_t *patch,
+          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
+{
+  /* ### Check that the path is consistent with the 'git --diff ' line. */
+
+  *state = state_git_header_found;
+  return SVN_NO_ERROR;
+}
+
+/* Parse the 'move from ' line of a git extended unidiff. */
+static svn_error_t *
+git_move_from(enum parse_state *state, const char *line, svn_patch_t *patch,
+              apr_pool_t *result_pool, apr_pool_t *scratch_pool)
+{
+  /* ### Check that the path is consistent with the 'git --diff ' line. */
+
+  *state = state_move_from_seen;
+  return SVN_NO_ERROR;
+}
+
+/* Parse the 'move to ' line fo a git extended unidiff. */
+static svn_error_t *
+git_move_to(enum parse_state *state, const char *line, svn_patch_t *patch,
+            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
+{
+  /* ### Check that the path is consistent with the 'git --diff ' line. */
+
+  patch->operation = svn_diff_op_moved;
+
+  *state = state_git_tree_seen;
+  return SVN_NO_ERROR;
+}
+
+/* Parse the 'copy from ' line of a git extended unidiff. */
+static svn_error_t *
+git_copy_from(enum parse_state *state, const char *line, svn_patch_t *patch,
+              apr_pool_t *result_pool, apr_pool_t *scratch_pool)
+{
+  /* ### Check that the path is consistent with the 'git --diff ' line. */
+
+  *state = state_copy_from_seen; 
+  return SVN_NO_ERROR;
+}
+
+/* Parse the 'copy to ' line of a git extended unidiff. */
+static svn_error_t *
+git_copy_to(enum parse_state *state, const char *line, svn_patch_t *patch,
+            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
+{
+  /* ### Check that the path is consistent with the 'git --diff ' line. */
+
+  patch->operation = svn_diff_op_copied;
+
+  *state = state_git_tree_seen;
+  return SVN_NO_ERROR;
+}
+
+/* Parse the 'new file ' line of a git extended unidiff. */
+static svn_error_t *
+git_new_file(enum parse_state *state, const char *line, svn_patch_t *patch,
+             apr_pool_t *result_pool, apr_pool_t *scratch_pool)
+{
+  patch->operation = svn_diff_op_added;
+
+  *state = state_git_header_found;
+  return SVN_NO_ERROR;
+}
+
+/* Parse the 'deleted file ' line of a git extended unidiff. */
+static svn_error_t *
+git_deleted_file(enum parse_state *state, const char *line, svn_patch_t *patch,
+                 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
+{
+  patch->operation = svn_diff_op_deleted;
+
+  *state = state_git_header_found;
+  return SVN_NO_ERROR;
+}
+
+
 svn_error_t *
 svn_diff_parse_next_patch(svn_patch_t **patch,
                           apr_file_t *patch_file,
@@ -586,15 +849,39 @@ svn_diff_parse_next_patch(svn_patch_t **
                           apr_pool_t *result_pool,
                           apr_pool_t *scratch_pool)
 {
-  static const char * const minus = "--- ";
-  static const char * const plus = "+++ ";
-  const char *indicator;
   const char *fname;
   svn_stream_t *stream;
-  svn_boolean_t eof, in_header;
+  apr_off_t pos, last_line;
+  svn_boolean_t eof;
+  svn_boolean_t line_after_tree_header_read = FALSE;
 
   apr_pool_t *iterpool;
 
+  enum parse_state state = state_start;
+
+  /* ### dannas: As I've understood the git diff format, the first line
+   * ### contains both paths and the paths in the headers that follow are only
+   * ### there to ensure that the path is valid. Not sure though, the
+   * ### research continues... */
+
+  /* Our table consisting of:
+   * Input             Required state           function to call */
+  struct transition transitions[] = 
+    {
+      {"--- ",          state_start,            diff_minus},
+      {"+++ ",          state_minus_seen,       diff_plus},
+      {"git --diff",    state_start,            git_start},
+      {"--- a/",        state_git_diff_seen,    git_minus},
+      {"--- a/",        state_git_tree_seen,    git_minus},
+      {"+++ b/",        state_git_minus_seen,   git_plus},
+      {"move from ",    state_git_diff_seen,    git_move_from},
+      {"move to ",      state_move_from_seen,   git_move_to},
+      {"copy from ",    state_git_diff_seen,    git_copy_from},
+      {"copy to ",      state_copy_from_seen,   git_copy_to},
+      {"new file ",     state_git_diff_seen,    git_new_file},
+      {"deleted file ", state_git_diff_seen,    git_deleted_file},
+    };
+
   if (apr_file_eof(patch_file) == APR_EOF)
     {
       /* No more patches here. */
@@ -615,66 +902,74 @@ svn_diff_parse_next_patch(svn_patch_t **
    * make sure it is disowned. */
   stream = svn_stream_from_aprfile2(patch_file, TRUE, scratch_pool);
 
-  indicator = minus;
-  in_header = FALSE;
+  /* Get current seek position -- APR has no ftell() :( */
+  pos = 0;
+  SVN_ERR(svn_io_file_seek((*patch)->patch_file, APR_CUR, &pos, scratch_pool));
+
   iterpool = svn_pool_create(scratch_pool);
+
   do
     {
       svn_stringbuf_t *line;
+      int i;
 
       svn_pool_clear(iterpool);
 
-      /* Read a line from the stream. */
+      /* Remember the current line's offset, and read the line. */
+      last_line = pos;
       SVN_ERR(svn_stream_readline_detect_eol(stream, &line, NULL, &eof,
                                              iterpool));
 
-      /* See if we have a diff header. */
-      if (line->len > strlen(indicator) && starts_with(line->data, indicator))
+      if (! eof)
         {
-          const char *utf8_path;
-          const char *canon_path;
-
-          /* If we can find a tab, it separates the filename from
-           * the rest of the line which we can discard. */
-          char *tab = strchr(line->data, '\t');
-          if (tab)
-            *tab = '\0';
-
-          /* Grab the filename and encode it in UTF-8. */
-          /* TODO: Allow specifying the patch file's encoding.
-           *       For now, we assume its encoding is native. */
-          SVN_ERR(svn_utf_cstring_to_utf8(&utf8_path,
-                                          line->data + strlen(indicator),
-                                          iterpool));
-
-          /* Canonicalize the path name. */
-          canon_path = svn_dirent_canonicalize(utf8_path, iterpool);
+          /* Update line offset for next iteration.
+           * APR has no ftell() :( */
+          pos = 0;
+          SVN_ERR(svn_io_file_seek((*patch)->patch_file, APR_CUR, &pos, iterpool));
+        }
 
-          if ((! in_header) && strcmp(indicator, minus) == 0)
-            {
-              /* First line of header contains old filename. */
-              if (reverse)
-                (*patch)->new_filename = apr_pstrdup(result_pool, canon_path);
-              else
-                (*patch)->old_filename = apr_pstrdup(result_pool, canon_path);
-              indicator = plus;
-              in_header = TRUE;
-            }
-          else if (in_header && strcmp(indicator, plus) == 0)
+      /* Run the state machine. */
+      for (i = 0; i < sizeof(transitions)/sizeof(transitions[0]); i++)
+        {
+          if (line->len > strlen(transitions[i].line) 
+              && starts_with(line->data, transitions[i].line)
+              && state == transitions[i].state)
             {
-              /* Second line of header contains new filename. */
-              if (reverse)
-                (*patch)->old_filename = apr_pstrdup(result_pool, canon_path);
-              else
-                (*patch)->new_filename = apr_pstrdup(result_pool, canon_path);
-              in_header = FALSE;
-              break; /* All good! */
+              SVN_ERR(transitions[i].fn(&state, line->data, *patch,
+                                        result_pool, iterpool));
+              break;
             }
-          else
-            in_header = FALSE;
         }
+
+      if (state == state_unidiff_found
+          || state == state_git_header_found)
+        {
+          /* We have a valid diff header, yay! */
+          break; 
+        }
+      else if (state == state_git_tree_seen 
+               && line_after_tree_header_read)
+        {
+          /* We have a valid diff header for a patch with only tree changes.
+           * Rewind to the start of the line just read, so subsequent calls
+           * to this function don't end up skipping the line -- it may
+           * contain a patch. */
+          SVN_ERR(svn_io_file_seek((*patch)->patch_file, APR_SET, &last_line,
+                                   scratch_pool));
+          break;
+        }
+      else if (state == state_git_tree_seen)
+          line_after_tree_header_read = TRUE;
+
+    } while (! eof);
+
+  if (reverse)
+    {
+      const char *temp;
+      temp = (*patch)->old_filename;
+      (*patch)->old_filename = (*patch)->new_filename;
+      (*patch)->new_filename = temp;
     }
-  while (! eof);
 
   if ((*patch)->old_filename == NULL || (*patch)->new_filename == NULL)
     {

Modified: subversion/trunk/subversion/tests/libsvn_diff/parse-diff-test.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/tests/libsvn_diff/parse-diff-test.c?rev=959388&r1=959387&r2=959388&view=diff
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_diff/parse-diff-test.c (original)
+++ subversion/trunk/subversion/tests/libsvn_diff/parse-diff-test.c Wed Jun 30 17:55:25 2010
@@ -566,9 +566,9 @@ struct svn_test_descriptor_t test_funcs[
     SVN_TEST_NULL,
     SVN_TEST_PASS2(test_parse_unidiff,
                    "test unidiff parsing"),
-    SVN_TEST_XFAIL2(test_parse_git_diff,
+    SVN_TEST_PASS2(test_parse_git_diff,
                     "test git unidiff parsing"),
-    SVN_TEST_XFAIL2(test_parse_git_tree_and_text_diff,
+    SVN_TEST_PASS2(test_parse_git_tree_and_text_diff,
                     "test git unidiff parsing of tree and text changes"),
     SVN_TEST_PASS2(test_parse_property_diff,
                    "test property unidiff parsing"),