You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@subversion.apache.org by ko...@apache.org on 2016/03/18 16:29:52 UTC
svn commit: r1735614 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_subr/utf8proc.c svn/log-cmd.c svn/svn.c tests/libsvn_subr/utf-test.c

Author: kotkov
Date: Fri Mar 18 15:29:38 2016
New Revision: 1735614

URL: http://svn.apache.org/viewvc?rev=1735614&view=rev
Log:
Ignore diacriticals in svn log --search, so that e.g., `müssen' would
match against `mussen'.

* subversion/include/private/svn_utf_private.h
  (svn_utf__casefold): Remove, is now replaced by ...
  (svn_utf__xfrm): ...this new function that transforms a given string into
   shape suitable for case- and accent-insensitive comparison.

* subversion/libsvn_subr/utf8proc.c
  (normalize_cstring): Add a `stripmark' argument that corresponds
   to the UTF8PROC_STRIPMARK transformation flag.
  (svn_utf__normalize): Adjust call to normalize_cstring().
  (svn_utf__casefold): Remove.
  (svn_utf__xfrm): Implement this function by calling normalize_cstring()
   with appropriate arguments.
  (svn_utf__is_normalized): Adjust call to normalize_cstring().

* subversion/svn/log-cmd.c
  (match): Prepare the string for case- and accent-insensitive comparison
   by calling svn_utf__xfrm().

* subversion/svn/svn.c
  (sub_main): Prepare the pattern for case- and accent-insensitive comparison
   by calling svn_utf__xfrm().

* subversion/tests/libsvn_subr/utf-test.c
  (test_utf_casefold): Replaced with ...
  (test_utf_xfrm): ...this new test.  Test the behavior with non-synthetic
   examples, such as with `İstanbul'.
  (test_funcs): Track the test changes.

Modified:
    subversion/trunk/subversion/include/private/svn_utf_private.h
    subversion/trunk/subversion/libsvn_subr/utf8proc.c
    subversion/trunk/subversion/svn/log-cmd.c
    subversion/trunk/subversion/svn/svn.c
    subversion/trunk/subversion/tests/libsvn_subr/utf-test.c

Modified: subversion/trunk/subversion/include/private/svn_utf_private.h
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/include/private/svn_utf_private.h?rev=1735614&r1=1735613&r2=1735614&view=diff
==============================================================================
--- subversion/trunk/subversion/include/private/svn_utf_private.h (original)
+++ subversion/trunk/subversion/include/private/svn_utf_private.h Fri Mar 18 15:29:38 2016
@@ -150,22 +150,27 @@ svn_utf__normalize(const char **result,
                    const char *str, apr_size_t len,
                    svn_membuf_t *buf);
 
-/* Normalize the UTF-8 string STR to form C and remove case distinctions
- * with Unicode's Default Caseless Matching algorithm. Use BUF as a
- * temporary storage. If LEN is SVN_UTF__UNKNOWN_LENGTH, assume STR
- * is null-terminated; otherwise, consider the string only up to the
- * given length.
+/* Transform the UTF-8 string to a shape suitable for comparison with
+ * strcmp(). The tranformation is defined by CASE_INSENSITIVE and
+ * ACCENT_INSENSITIVE arguments. If CASE_INSENSITIVE is non-zero,
+ * remove case distinctions from the string. If ACCENT_INSENSITIVE
+ * is non-zero, remove diacritical marks from the string.
  *
- * Return the resulting string in *RESULT, which shares storage with
- * BUF and is valid only until the next time BUF is modified.
+ * Use BUF as a temporary storage. If LEN is SVN_UTF__UNKNOWN_LENGTH,
+ * assume STR is null-terminated; otherwise, consider the string only
+ * up to the given length. Place the tranformed string in *RESULT, which
+ * shares storage with BUF and is valid only until the next time BUF is
+ * modified.
  *
  * A returned error may indicate that STRING contains invalid UTF-8 or
  * invalid Unicode codepoints.
  */
 svn_error_t *
-svn_utf__casefold(const char **result,
-                  const char *str, apr_size_t len,
-                  svn_membuf_t *buf);
+svn_utf__xfrm(const char **result,
+              const char *str, apr_size_t len,
+              svn_boolean_t case_insensitive,
+              svn_boolean_t accent_insensitive,
+              svn_membuf_t *buf);
 
 /* Check if STRING is a valid, NFC-normalized UTF-8 string.  Note that
  * a FALSE return value may indicate that STRING is not valid UTF-8 at

Modified: subversion/trunk/subversion/libsvn_subr/utf8proc.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf8proc.c?rev=1735614&r1=1735613&r2=1735614&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf8proc.c (original)
+++ subversion/trunk/subversion/libsvn_subr/utf8proc.c Fri Mar 18 15:29:38 2016
@@ -127,7 +127,8 @@ decompose_normalized(apr_size_t *result_
  * of UTF-8 characters.
  *
  * If CASEFOLD is non-zero, perform Unicode case folding, e.g., for
- * case-insensitive string comparison.
+ * case-insensitive string comparison. If STRIPMARK is non-zero, strip
+ * all diacritical marks (e.g., accents) from the string.
  *
  * A returned error may indicate that STRING contains invalid UTF-8 or
  * invalid Unicode codepoints. Any error message comes from utf8proc.
@@ -136,10 +137,19 @@ static svn_error_t *
 normalize_cstring(apr_size_t *result_length,
                   const char *string, apr_size_t length,
                   svn_boolean_t casefold,
+                  svn_boolean_t stripmark,
                   svn_membuf_t *buffer)
 {
-  ssize_t result = unicode_decomposition(casefold ? UTF8PROC_CASEFOLD : 0,
-                                         string, length, buffer);
+  int flags = 0;
+  ssize_t result;
+
+  if (casefold)
+    flags |= UTF8PROC_CASEFOLD;
+
+  if (stripmark)
+    flags |= UTF8PROC_STRIPMARK;
+
+  result = unicode_decomposition(flags, string, length, buffer);
   if (result >= 0)
     {
       svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
@@ -207,18 +217,21 @@ svn_utf__normalize(const char **result,
                    svn_membuf_t *buf)
 {
   apr_size_t result_length;
-  SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, buf));
+  SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, FALSE, buf));
   *result = (const char*)(buf->data);
   return SVN_NO_ERROR;
 }
 
 svn_error_t *
-svn_utf__casefold(const char **result,
-                  const char *str, apr_size_t len,
-                  svn_membuf_t *buf)
+svn_utf__xfrm(const char **result,
+              const char *str, apr_size_t len,
+              svn_boolean_t case_insensitive,
+              svn_boolean_t accent_insensitive,
+              svn_membuf_t *buf)
 {
   apr_size_t result_length;
-  SVN_ERR(normalize_cstring(&result_length, str, len, TRUE, buf));
+  SVN_ERR(normalize_cstring(&result_length, str, len,
+                            case_insensitive, accent_insensitive, buf));
   *result = (const char*)(buf->data);
   return SVN_NO_ERROR;
 }
@@ -375,7 +388,8 @@ svn_utf__is_normalized(const char *strin
   apr_size_t result_length;
   const apr_size_t length = strlen(string);
   svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
-  err = normalize_cstring(&result_length, string, length, FALSE, &buffer);
+  err = normalize_cstring(&result_length, string, length,
+                          FALSE, FALSE, &buffer);
   if (err)
     {
       svn_error_clear(err);

Modified: subversion/trunk/subversion/svn/log-cmd.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/svn/log-cmd.c?rev=1735614&r1=1735613&r2=1735614&view=diff
==============================================================================
--- subversion/trunk/subversion/svn/log-cmd.c (original)
+++ subversion/trunk/subversion/svn/log-cmd.c Fri Mar 18 15:29:38 2016
@@ -112,14 +112,14 @@ display_diff(const svn_log_entry_t *log_
 }
 
 /* Return TRUE if STR matches PATTERN. Else, return FALSE. Assumes that
- * PATTERN is a UTF-8 string normalized to form C with case folding
- * applied. Use BUF for temporary allocations. */
+ * PATTERN is a UTF-8 string prepared for case- and accent-insensitive
+ * comparison via svn_utf__xfrm(). */
 static svn_boolean_t
 match(const char *pattern, const char *str, svn_membuf_t *buf)
 {
   svn_error_t *err;
 
-  err = svn_utf__casefold(&str, str, strlen(str), buf);
+  err = svn_utf__xfrm(&str, str, strlen(str), TRUE, TRUE, buf);
   if (err)
     {
       /* Can't match invalid data. */

Modified: subversion/trunk/subversion/svn/svn.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/svn/svn.c?rev=1735614&r1=1735613&r2=1735614&view=diff
==============================================================================
--- subversion/trunk/subversion/svn/svn.c (original)
+++ subversion/trunk/subversion/svn/svn.c Fri Mar 18 15:29:38 2016
@@ -2397,16 +2397,16 @@ sub_main(int *exit_code, int argc, const
         break;
       case opt_search:
         SVN_ERR(svn_utf_cstring_to_utf8(&utf8_opt_arg, opt_arg, pool));
-        SVN_ERR(svn_utf__casefold(&utf8_opt_arg, utf8_opt_arg,
-                                  strlen(utf8_opt_arg), &buf));
+        SVN_ERR(svn_utf__xfrm(&utf8_opt_arg, utf8_opt_arg,
+                              strlen(utf8_opt_arg), TRUE, TRUE, &buf));
         add_search_pattern_group(&opt_state,
                                  apr_pstrdup(pool, utf8_opt_arg),
                                  pool);
         break;
       case opt_search_and:
         SVN_ERR(svn_utf_cstring_to_utf8(&utf8_opt_arg, opt_arg, pool));
-        SVN_ERR(svn_utf__casefold(&utf8_opt_arg, utf8_opt_arg,
-                                  strlen(utf8_opt_arg), &buf));
+        SVN_ERR(svn_utf__xfrm(&utf8_opt_arg, utf8_opt_arg,
+                              strlen(utf8_opt_arg), TRUE, TRUE, &buf));
         add_search_pattern_to_latest_group(&opt_state,
                                            apr_pstrdup(pool, utf8_opt_arg),
                                            pool);

Modified: subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/tests/libsvn_subr/utf-test.c?rev=1735614&r1=1735613&r2=1735614&view=diff
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_subr/utf-test.c (original)
+++ subversion/trunk/subversion/tests/libsvn_subr/utf-test.c Fri Mar 18 15:29:38 2016
@@ -898,86 +898,75 @@ test_utf_normalize(apr_pool_t *pool)
 
 
 static svn_error_t *
-test_utf_casefold(apr_pool_t *pool)
+test_utf_xfrm(apr_pool_t *pool)
 {
-  /* Normalized: NFC */
-  static const char nfc[] =
-    "\xe1\xb9\xa8"              /* S with dot above and below */
-    "\xc5\xaf"                  /* u with ring */
-    "\xe1\xb8\x87"              /* b with macron below */
-    "\xe1\xb9\xbd"              /* v with tilde */
-    "\xe1\xb8\x9d"              /* e with breve and cedilla */
-    "\xc8\x91"                  /* r with double grave */
-    "\xc5\xa1"                  /* s with caron */
-    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
-    "\xe1\xbb\x9d"              /* o with grave and hook */
-    "\xe1\xb9\x8b";             /* n with circumflex below */
-
-  /* Normalized: NFC, case folded */
-  static const char nfc_casefold[] =
-    "\xe1\xb9\xa9"              /* s with dot above and below */
-    "\xc5\xaf"                  /* u with ring */
-    "\xe1\xb8\x87"              /* b with macron below */
-    "\xe1\xb9\xbd"              /* v with tilde */
-    "\xe1\xb8\x9d"              /* e with breve and cedilla */
-    "\xc8\x91"                  /* r with double grave */
-    "\xc5\xa1"                  /* s with caron */
-    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
-    "\xe1\xbb\x9d"              /* o with grave and hook */
-    "\xe1\xb9\x8b";             /* n with circumflex below */
-
-  /* Normalized: NFD */
-  static const char nfd[] =
-    "S\xcc\xa3\xcc\x87"         /* S with dot above and below */
-    "u\xcc\x8a"                 /* u with ring */
-    "b\xcc\xb1"                 /* b with macron below */
-    "v\xcc\x83"                 /* v with tilde */
-    "e\xcc\xa7\xcc\x86"         /* e with breve and cedilla */
-    "r\xcc\x8f"                 /* r with double grave */
-    "s\xcc\x8c"                 /* s with caron */
-    "i\xcc\x88\xcc\x81"         /* i with diaeresis and acute */
-    "o\xcc\x9b\xcc\x80"         /* o with grave and hook */
-    "n\xcc\xad";                /* n with circumflex below */
-
-  /* Mixed, denormalized */
-  static const char mixup[] =
-    "S\xcc\x87\xcc\xa3"         /* S with dot above and below */
-    "\xc5\xaf"                  /* u with ring */
-    "b\xcc\xb1"                 /* b with macron below */
-    "\xe1\xb9\xbd"              /* v with tilde */
-    "e\xcc\xa7\xcc\x86"         /* e with breve and cedilla */
-    "\xc8\x91"                  /* r with double grave */
-    "s\xcc\x8c"                 /* s with caron */
-    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
-    "o\xcc\x80\xcc\x9b"         /* o with grave and hook */
-    "\xe1\xb9\x8b";             /* n with circumflex below */
-
-  /* Invalid UTF-8 */
-  static const char invalid[] =
-    "\xe1\xb9\xa8"              /* S with dot above and below */
-    "\xc5\xaf"                  /* u with ring */
-    "\xe1\xb8\x87"              /* b with macron below */
-    "\xe1\xb9\xbd"              /* v with tilde */
-    "\xe1\xb8\x9d"              /* e with breve and cedilla */
-    "\xc8\x91"                  /* r with double grave */
-    "\xc5\xa1"                  /* s with caron */
-    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
-    "\xe6"                      /* Invalid byte */
-    "\xe1\xb9\x8b";             /* n with circumflex below */
-
+  const char *str;
   const char *result;
   svn_membuf_t buf;
 
   svn_membuf__create(&buf, 0, pool);
-  SVN_ERR(svn_utf__casefold(&result, nfc, strlen(nfc), &buf));
-  SVN_TEST_STRING_ASSERT(result, nfc_casefold);
-  SVN_ERR(svn_utf__casefold(&result, nfd, strlen(nfd), &buf));
-  SVN_TEST_STRING_ASSERT(result, nfc_casefold);
-  SVN_ERR(svn_utf__casefold(&result, mixup, strlen(mixup), &buf));
-  SVN_TEST_STRING_ASSERT(result, nfc_casefold);
 
-  SVN_TEST_ASSERT_ERROR(svn_utf__casefold(&result, invalid, strlen(invalid),
-                                          &buf),
+  /* ASCII string */
+  str = "Subversion";
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Subversion");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "subversion");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Subversion");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "subversion");
+
+  /* M (u with diaeresis) (sharp s) en */
+  str = "M" "\xc3\xbc" "\xc3\x9f" "en";
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "M" "\xc3\xbc" "\xc3\x9f" "en");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "m" "\xc3\xbc" "ssen");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Mu" "\xc3\x9f" "en");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "mussen");
+
+  /* Na (i with diaeresis) vet (e with acute), decomposed */
+  str = "Nai" "\xcc\x88" "vete" "\xcc\x81";
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Na" "\xc3\xaf" "vet" "\xc3\xa9");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "na" "\xc3\xaf" "vet" "\xc3\xa9");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Naivete");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "naivete");
+
+  /* (I with dot above) stanbul */
+  str = "\xc4\xb0" "stanbul";
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "\xc4\xb0" "stanbul");
+
+  /* The Latin Capital Letter I with Dot Above (0130) should fold into
+     Latin Small Letter I (0069) with Combining Dot Above (0307) per full
+     mapping in http://www.unicode.org/Public/UNIDATA/CaseFolding.txt */
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "i" "\xcc\x87" "stanbul");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "Istanbul");
+  SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, "istanbul");
+
+  /* Invalid UTF-8 */
+  str = "a" "\xe6" "bc";
+  SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+                                      FALSE, FALSE, &buf),
+                        SVN_ERR_UTF8PROC_ERROR);
+  SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+                                      TRUE, FALSE, &buf),
+                        SVN_ERR_UTF8PROC_ERROR);
+  SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+                                      FALSE, TRUE, &buf),
+                        SVN_ERR_UTF8PROC_ERROR);
+  SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+                                      TRUE, TRUE, &buf),
                         SVN_ERR_UTF8PROC_ERROR);
 
   return SVN_NO_ERROR;
@@ -1011,8 +1000,8 @@ static struct svn_test_descriptor_t test
                    "test svn_utf__utf{16,32}_to_utf8"),
     SVN_TEST_PASS2(test_utf_normalize,
                    "test svn_utf__normalize"),
-    SVN_TEST_PASS2(test_utf_casefold,
-                   "test svn_utf__casefold"),
+    SVN_TEST_PASS2(test_utf_xfrm,
+                   "test svn_utf__xfrm"),
     SVN_TEST_NULL
   };