You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@subversion.apache.org by br...@apache.org on 2012/11/22 06:43:57 UTC

svn commit: r1412425 - in /subversion/trunk/subversion: include/private/svn_string_private.h libsvn_subr/string.c tests/libsvn_subr/string-test.c

Author: brane
Date: Thu Nov 22 05:43:56 2012
New Revision: 1412425

URL: http://svn.apache.org/viewvc?rev=1412425&view=rev
Log:
Groundwork for issue #4261. Utility for calculating string similarity.

* subversion/include/private/svn_string_private.h
  (svn_cstring__similarity): New prototype.
* subversion/libsvn_subr/string.c
  (svn_cstring__similarity): Implement.

* subversion/tests/libsvn_subr/string-test.c
  (test_string_similarity): New test case.
  (test_funcs): Register test_string_similarity.

Modified:
    subversion/trunk/subversion/include/private/svn_string_private.h
    subversion/trunk/subversion/libsvn_subr/string.c
    subversion/trunk/subversion/tests/libsvn_subr/string-test.c

Modified: subversion/trunk/subversion/include/private/svn_string_private.h
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/include/private/svn_string_private.h?rev=1412425&r1=1412424&r2=1412425&view=diff
==============================================================================
--- subversion/trunk/subversion/include/private/svn_string_private.h (original)
+++ subversion/trunk/subversion/include/private/svn_string_private.h Thu Nov 22 05:43:56 2012
@@ -167,6 +167,39 @@ svn__ui64toa_sep(apr_uint64_t number, ch
 char *
 svn__i64toa_sep(apr_int64_t number, char seperator, apr_pool_t *pool);
 
+/**
+ * Computes the similarity score STRA and STRB, given as the ratio
+ * between the length of their longest common subsequence and the
+ * length of the strings, normalized to the range [0..1000].
+ * The result is equivalent to Python's
+ *
+ *   difflib.SequenceMatcher.ratio
+ *
+ * Optionally sets *RLCS to the length of the longest common
+ * subsequence of STRA and STRB. Using BUFFER for temporary storage,
+ * requires memory proportional to the length of the shorter string.
+ *
+ * The LCS algorithm used is described in, e.g.,
+ *
+ *   http://en.wikipedia.org/wiki/Longest_common_subsequence_problem
+ *
+ * Q: Why another LCS when we already have one in libsvn_diff?
+ * A: svn_diff__lcs is too heavyweight and too generic for the
+ *    purposes of similarity testing. Whilst it would be possible
+ *    to use a characte-based tokenizer with it, we really only need
+ *    the *length* of the LCS for the similarity score, not all the
+ *    other information that svn_diff__lcs produces in order to
+ *    make printing diffs possible.
+ *
+ * Q: Is there a limit on the length of the string parameters?
+ * A: Only available memory. But note that the LCS algorithm used
+ *    has O(strlen(STRA) * strlen(STRB)) worst-case performance,
+ *    so do keep a rein on your enthusiasm.
+ */
+unsigned int
+svn_cstring__similarity(const char *stra, const char *strb,
+                        svn_membuf_t *buffer, apr_size_t *rlcs);
+
 /** @} */
 
 /** @} */

Modified: subversion/trunk/subversion/libsvn_subr/string.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/string.c?rev=1412425&r1=1412424&r2=1412425&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/string.c (original)
+++ subversion/trunk/subversion/libsvn_subr/string.c Thu Nov 22 05:43:56 2012
@@ -1181,3 +1181,90 @@ svn__i64toa_sep(apr_int64_t number, char
   return apr_pstrdup(pool, buffer);
 }
 
+unsigned int
+svn_cstring__similarity(const char *stra, const char *strb,
+                        svn_membuf_t *buffer, apr_size_t *rlcs)
+{
+  const apr_size_t lena = strlen(stra);
+  const apr_size_t lenb = strlen(strb);
+  const apr_size_t total = lena + lenb;
+  const char *enda = stra + lena;
+  const char *endb = strb + lenb;
+  apr_size_t lcs = 0;
+
+  /* Skip the common prefix ... */
+  while (stra < enda && strb < endb && *stra == *strb)
+    {
+      ++stra; ++strb;
+      ++lcs;
+    }
+
+  /* ... and the common suffix */
+  while (stra < enda && strb < endb && *enda == *endb)
+    {
+      --enda; --endb;
+      ++lcs;
+    }
+
+  if (stra < enda && strb < endb)
+    {
+      /* Move the end pointers past the non-matching part */
+      const apr_size_t resta = ++enda - stra;
+      const apr_size_t restb = ++endb - strb;
+      const apr_size_t slots = (resta > restb ? restb : resta);
+      apr_size_t *curr, *prev;
+      const char *pstr;
+
+      /* The outer loop must iterate on the longer string. */
+      if (resta < restb)
+        {
+          pstr = stra;
+          stra = strb;
+          strb = pstr;
+
+          pstr = enda;
+          enda = endb;
+          endb = pstr;
+        }
+
+      /* Allocate two columns in the LCS matrix
+         ### Optimize this to (slots + 2) instesd of 2 * (slots + 1) */
+      svn_membuf__ensure(buffer, 2 * (slots + 1) * sizeof(apr_size_t));
+      svn_membuf__zero(buffer);
+      curr = buffer->data;
+      prev = curr + slots + 1;
+
+      /* Calculate LCS length of the remainder */
+      for (pstr = stra; pstr < enda; ++pstr)
+        {
+          int i;
+          for (i = 1; i <= slots; ++i)
+            {
+              if (*pstr == strb[i])
+                curr[i] = prev[i-1] + 1;
+              else
+                curr[i] = (curr[i-1] > prev[i] ? curr[i-1] : prev[i]);
+            }
+
+          /* Swap the buffers, making the previous one current */
+          {
+            apr_size_t *const temp = prev;
+            prev = curr;
+            curr = temp;
+          }
+        }
+
+      /* The common suffix matcher always finds the nul terminator,
+         so subtract 1 from the result. */
+      lcs += prev[slots] - 1;
+    }
+
+  if (rlcs)
+    *rlcs = lcs;
+
+  /* Return similarity ratio rounded to 4 significant digits */
+  if (total)
+    return(unsigned int)((2000 * lcs + total/2) / total);
+  else
+    return 1000;
+}

Modified: subversion/trunk/subversion/tests/libsvn_subr/string-test.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/tests/libsvn_subr/string-test.c?rev=1412425&r1=1412424&r2=1412425&view=diff
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_subr/string-test.c (original)
+++ subversion/trunk/subversion/tests/libsvn_subr/string-test.c Thu Nov 22 05:43:56 2012
@@ -612,6 +612,96 @@ test_stringbuf_replace(apr_pool_t *pool)
   return expect_stringbuf_equal(a, "test hello, world!!!", pool);
 }
 
+static svn_error_t *
+test_string_similarity(apr_pool_t *pool)
+{
+  const struct sim_score_test_t
+  {
+    const char *stra;
+    const char *strb;
+    apr_size_t lcs;
+    int score;
+  } tests[] =
+      {
+#define SCORE(lcs, len) ((2000 * (lcs) + (len)/2) / (len))
+
+        /* Equality */
+        {"",       "",          0, 1000},
+        {"quoth",  "quoth",     5, SCORE(5, 5+5)},
+
+        /* Deletion at start */
+        {"quoth",  "uoth",      4, SCORE(4, 5+4)},
+        {"uoth",   "quoth",     4, SCORE(4, 4+5)},
+
+        /* Deletion at end */
+        {"quoth",  "quot",      4, SCORE(4, 5+4)},
+        {"quot",   "quoth",     4, SCORE(4, 4+5)},
+
+        /* Insertion at start */
+        {"quoth",  "Xquoth",    5, SCORE(5, 5+6)},
+        {"Xquoth", "quoth",     5, SCORE(5, 6+5)},
+
+        /* Insertion at end */
+        {"quoth",  "quothX",    5, SCORE(5, 5+6)},
+        {"quothX", "quoth",     5, SCORE(5, 6+5)},
+
+        /* Insertion in middle */
+        {"quoth",  "quoXth",    5, SCORE(5, 5+6)},
+        {"quoXth", "quoth",     5, SCORE(5, 6+5)},
+
+        /* Transposition at start */
+        {"quoth",  "uqoth",     4, SCORE(4, 5+5)},
+        {"uqoth",  "quoth",     4, SCORE(4, 5+5)},
+
+        /* Transposition at end */
+        {"quoth",  "quoht",     4, SCORE(4, 5+5)},
+        {"quoht",  "quoth",     4, SCORE(4, 5+5)},
+
+        /* Transposition in middle */
+        {"quoth",  "qutoh",     4, SCORE(4, 5+5)},
+        {"qutoh",  "quoth",     4, SCORE(4, 5+5)},
+
+        /* Difference */
+        {"quoth",  "raven",     0, SCORE(0, 5+5)},
+        {"raven",  "quoth",     0, SCORE(0, 5+5)},
+        {"x",      "",          0, SCORE(0, 1+0)},
+        {"",       "x",         0, SCORE(0, 0+1)},
+        {"",       "quoth",     0, SCORE(0, 0+5)},
+        {"quoth",  "",          0, SCORE(0, 5+0)},
+        {"quoth",  "the raven", 2, SCORE(2, 5+9)},
+        {"the raven",  "quoth", 2, SCORE(2, 5+9)},
+        {NULL, NULL}
+      };
+
+  const struct sim_score_test_t *t;
+  svn_membuf_t buffer;
+
+  svn_membuf__create(&buffer, 0, pool);
+  for (t = tests; t->stra; ++t)
+    {
+      apr_size_t lcs;
+      const unsigned int score =
+        svn_cstring__similarity(t->stra, t->strb, &buffer, &lcs);
+      /*
+      fprintf(stderr,
+              "lcs %s ~ %s score %.3f (%"APR_SIZE_T_FMT
+              ") expected %.3f (%"APR_SIZE_T_FMT"))\n",
+              t->stra, t->strb, score/1000.0, lcs, t->score/1000.0, t->lcs);
+      */
+      if (score != t->score)
+        return fail(pool, "%s ~ %s score %.3f <> expected %.3f",
+                    t->stra, t->strb, score/1000.0, t->score/1000.0);
+
+      if (lcs != t->lcs)
+        return fail(pool,
+                    "%s ~ %s lcs %"APR_SIZE_T_FMT
+                    " <> expected "APR_SIZE_T_FMT,
+                    t->stra, t->strb, lcs, t->lcs);
+    }
+
+  return SVN_NO_ERROR;
+}
+
 /*
    ====================================================================
    If you add a new test to this file, update this array.
@@ -677,5 +767,7 @@ struct svn_test_descriptor_t test_funcs[
                    "check deletion from svn_stringbuf_t"),
     SVN_TEST_PASS2(test_stringbuf_replace,
                    "check replacement in svn_stringbuf_t"),
+    SVN_TEST_PASS2(test_string_similarity,
+                   "test string similarity scores"),
     SVN_TEST_NULL
   };

Re: svn commit: r1412425 - in /subversion/trunk/subversion: include/private/svn_string_private.h libsvn_subr/string.c tests/libsvn_subr/string-test.c

Posted by Branko Čibej <br...@wandisco.com>.

On 22.11.2012 18:29, Branko Čibej wrote:
> On 22.11.2012 17:01, Julian Foad wrote:
>> Just use 'float' and [0..1]?  I'm thinking we may want to use this for comparing diff hunks, and there may be more than 500 characters in each hunk and it matters whether just one character has been changed.
> Could do that, too. Harder to test though. :) It's trivial to extend the
> range to [0..10**6] or even 10**9 if that's a problem. Frankly it feels
> nice to keep the whole thing integral and not leave floating-point
> messes around.

Actually I'm being silly. The function already provides the actual LCS
length, so if 3 significant digits aren't enough, you can always compare
that to the length of the hunk.

-- Brane


-- 
Branko Čibej
Director of Subversion | WANdisco | www.wandisco.com

Re: svn commit: r1412425 - in /subversion/trunk/subversion: include/private/svn_string_private.h libsvn_subr/string.c tests/libsvn_subr/string-test.c

Posted by Branko Čibej <br...@wandisco.com>.

On 22.11.2012 17:01, Julian Foad wrote:
>> Author: brane
>> Modified: subversion/trunk/subversion/include/private/svn_string_private.h
>> +/**
>> + * Computes the similarity score STRA and STRB, given as the ratio
>> + * between the length of their longest common subsequence and the
>> + * length of the strings, normalized to the range [0..1000].
> "longest common non-contiguous subsequence"

Yes, we know it is, but that's not what the name of the algorithm is.

> "average length of the two strings"

This is true; however, saying "average" always sounds, to me, as if
we're dealing with a whole bunch of values, rather than just two.

> Just use 'float' and [0..1]?  I'm thinking we may want to use this for comparing diff hunks, and there may be more than 500 characters in each hunk and it matters whether just one character has been changed.

Could do that, too. Harder to test though. :) It's trivial to extend the
range to [0..10**6] or even 10**9 if that's a problem. Frankly it feels
nice to keep the whole thing integral and not leave floating-point
messes around.

> [...]
>
>> Modified: subversion/trunk/subversion/tests/libsvn_subr/string-test.c
>> +static svn_error_t *
>> +test_string_similarity(apr_pool_t *pool)
>> +{
>> +  const struct sim_score_test_t
>> +  {
>> +    const char *stra;
>> +    const char *strb;
>> +    apr_size_t lcs;
>> +    int score;
>> +  } tests[] =
>> +      {
>> +#define SCORE(lcs, len) ((2000 * (lcs) + (len)/2) / (len))
>> +
>> +        /* Equality */
>> +        {"",       "",          0, 1000},
>> +        {"quoth",  "quoth",     5, SCORE(5, 5+5)},
>> +
>> +        /* Deletion at start */
>> +        {"quoth",  "uoth",      4, SCORE(4, 5+4)},
>> +        {"uoth",   "quoth",     4, SCORE(4, 4+5)},
>> +
>> +        /* Deletion at end */
>> +        {"quoth",  "quot",      4, SCORE(4, 5+4)},
>> +        {"quot",   "quoth",     4, SCORE(4, 4+5)},
>> +
>> +        /* Insertion at start */
>> +        {"quoth",  "Xquoth",    5, SCORE(5, 5+6)},
>> +        {"Xquoth", "quoth",     5, SCORE(5, 6+5)},
>> +
>> +        /* Insertion at end */
>> +        {"quoth",  "quothX",    5, SCORE(5, 5+6)},
>> +        {"quothX", "quoth",     5, SCORE(5, 6+5)},
> Half of the examples in each of these four pairs are redundant.  
> (The word "deletion" implies an ordering of string-A -> string-B, so
>  by that terminology "uoth" -> "quoth" is an Insertion, same as 
> "quoth" -> "Xquoth".)

Theoretically and academically speaking, yes, they're redundant.
Practically, however, this helped me find a bug that only popped up when
exactly one of these examples was removed. I love symmetric algorithms
but my bugs are always asymmetric and some of them break causality, too. :)

So in order to avoid falling into the incompleteness trap, I just
mirrored all the test cases. It's less of a problem to have duplicate
tests than it is to miss one.

-- Brane

-- 
Branko Čibej
Director of Subversion | WANdisco | www.wandisco.com

Re: svn commit: r1412425 - in /subversion/trunk/subversion: include/private/svn_string_private.h libsvn_subr/string.c tests/libsvn_subr/string-test.c

Posted by Julian Foad <ju...@btopenworld.com>.

> Author: brane

> Date: Thu Nov 22 05:43:56 2012
> New Revision: 1412425
> 
> URL: http://svn.apache.org/viewvc?rev=1412425&view=rev
> Log:
> Groundwork for issue #4261. Utility for calculating string similarity.
> 
> * subversion/include/private/svn_string_private.h
>   (svn_cstring__similarity): New prototype.
> * subversion/libsvn_subr/string.c
>   (svn_cstring__similarity): Implement.
> 
> * subversion/tests/libsvn_subr/string-test.c
>   (test_string_similarity): New test case.
>   (test_funcs): Register test_string_similarity.

> Modified: subversion/trunk/subversion/include/private/svn_string_private.h
> +/**
> + * Computes the similarity score STRA and STRB, given as the ratio
> + * between the length of their longest common subsequence and the
> + * length of the strings, normalized to the range [0..1000].

"longest common non-contiguous subsequence"

"average length of the two strings"

Just use 'float' and [0..1]?  I'm thinking we may want to use this for comparing diff hunks, and there may be more than 500 characters in each hunk and it matters whether just one character has been changed.

> + * The result is equivalent to Python's
> + *
> + *   difflib.SequenceMatcher.ratio
> + *
> + * Optionally sets *RLCS to the length of the longest common
> + * subsequence of STRA and STRB. Using BUFFER for temporary storage,
> + * requires memory proportional to the length of the shorter string.
[...]

> Modified: subversion/trunk/subversion/tests/libsvn_subr/string-test.c
> +static svn_error_t *
> +test_string_similarity(apr_pool_t *pool)
> +{
> +  const struct sim_score_test_t
> +  {
> +    const char *stra;
> +    const char *strb;
> +    apr_size_t lcs;
> +    int score;
> +  } tests[] =
> +      {
> +#define SCORE(lcs, len) ((2000 * (lcs) + (len)/2) / (len))
> +
> +        /* Equality */
> +        {"",       "",          0, 1000},
> +        {"quoth",  "quoth",     5, SCORE(5, 5+5)},
> +
> +        /* Deletion at start */
> +        {"quoth",  "uoth",      4, SCORE(4, 5+4)},
> +        {"uoth",   "quoth",     4, SCORE(4, 4+5)},
> +
> +        /* Deletion at end */
> +        {"quoth",  "quot",      4, SCORE(4, 5+4)},
> +        {"quot",   "quoth",     4, SCORE(4, 4+5)},
> +
> +        /* Insertion at start */
> +        {"quoth",  "Xquoth",    5, SCORE(5, 5+6)},
> +        {"Xquoth", "quoth",     5, SCORE(5, 6+5)},
> +
> +        /* Insertion at end */
> +        {"quoth",  "quothX",    5, SCORE(5, 5+6)},
> +        {"quothX", "quoth",     5, SCORE(5, 6+5)},

Half of the examples in each of these four pairs are redundant.  
(The word "deletion" implies an ordering of string-A -> string-B, so
 by that terminology "uoth" -> "quoth" is an Insertion, same as 
"quoth" -> "Xquoth".)

> +        /* Insertion in middle */

(That is, "Insertion/deletion in middle".)

> +        {"quoth",  "quoXth",    5, SCORE(5, 5+6)},
> +        {"quoXth", "quoth",     5, SCORE(5, 6+5)},
> +
> +        /* Transposition at start */
> +        {"quoth",  "uqoth",     4, SCORE(4, 5+5)},
> +        {"uqoth",  "quoth",     4, SCORE(4, 5+5)},
> +
> +        /* Transposition at end */
> +        {"quoth",  "quoht",     4, SCORE(4, 5+5)},
> +        {"quoht",  "quoth",     4, SCORE(4, 5+5)},
> +
> +        /* Transposition in middle */
> +        {"quoth",  "qutoh",     4, SCORE(4, 5+5)},
> +        {"qutoh",  "quoth",     4, SCORE(4, 5+5)},

Same redundance in the three "transposition" pairs.

> +        /* Difference */
> +        {"quoth",  "raven",     0, SCORE(0, 5+5)},
> +        {"raven",  "quoth",     0, SCORE(0, 5+5)},

And this pair.

> +        {"x",      "",          0, SCORE(0, 1+0)},
> +        {"",       "x",         0, SCORE(0, 0+1)},
> +        {"",       "quoth",     0, SCORE(0, 0+5)},
> +        {"quoth",  "",          0, SCORE(0, 5+0)},
> +        {"quoth",  "the raven", 2, SCORE(2, 5+9)},
> +        {"the raven",  "quoth", 2, SCORE(2, 5+9)},

And this one.

> +        {NULL, NULL}
> +      };


- Julian