You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@subversion.apache.org by ko...@apache.org on 2016/02/19 23:11:11 UTC

svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c

Author: kotkov
Date: Fri Feb 19 22:11:11 2016
New Revision: 1731300

URL: http://svn.apache.org/viewvc?rev=1731300&view=rev
Log:
Make svn log --search case-insensitive.

Use utf8proc to do the normalization and locale-independent case folding
(UTF8PROC_CASEFOLD) for both the search pattern and the input strings.

Related discussion is in http://svn.haxx.se/dev/archive-2013-04/0374.shtml
(Subject: "log --search test failures on trunk and 1.8.x").

* subversion/include/private/svn_utf_private.h
  (svn_utf__normalize): Add new boolean argument to perform case folding.

* subversion/libsvn_subr/utf8proc.c
  (normalize_cstring): Add new boolean argument to perform case folding.
   In case it is non-zero, set the flags to UTF8PROC_CASEFOLD when
   doing the Unicode decomposition.
  (svn_utf__normalize): Pass new argument to normalize_cstring().
  (svn_utf__is_normalized): Adjust call to normalize_cstring().

* subversion/libsvn_repos/dump.c
  (extract_mergeinfo_paths, verify_mergeinfo_normalization,
   check_name_collision): Update callers of svn_utf__normalize().

* subversion/svn/cl-log.h
  (): Include svn_string_private for svn_membuf_t.
  (svn_cl__log_receiver_baton): Add an svn_membuf_t for the case folding
   and normalization purposes.

* subversion/svn/log-cmd.c
  (): Include svn_utf_private.h.
  (match): New helper that normalizes, folds case of the input, and matches
   it against the specified pattern.
  (match_search_pattern): Now accepts an svn_membuf_t.  Call the new helper
   function to perform the pattern matching.
  (svn_cl__log_entry_receiver, svn_cl__log_entry_receiver_xml): Pass the
   svn_membuf_t from the baton when calling match_search_pattern().
  (svn_cl__log): Initialize the svn_membuf_t in the log receiver baton.

* subversion/svn/svn.c
  (): Include svn_utf_private.h.
  (sub_main): Normalize and fold case of --search and --search-and arguments.

* subversion/tests/cmdline/log_tests.py
  (log_search): Adjust expectations, since --search is now case-insensitive.

* subversion/tests/libsvn_subr/utf-test.c
  (test_utf_normalize): New test for svn_utf__normalize().
  (test_funcs): Add new test.

Modified:
    subversion/trunk/subversion/include/private/svn_utf_private.h
    subversion/trunk/subversion/libsvn_repos/dump.c
    subversion/trunk/subversion/libsvn_subr/utf8proc.c
    subversion/trunk/subversion/svn/cl-log.h
    subversion/trunk/subversion/svn/log-cmd.c
    subversion/trunk/subversion/svn/svn.c
    subversion/trunk/subversion/tests/cmdline/log_tests.py
    subversion/trunk/subversion/tests/libsvn_subr/utf-test.c

Modified: subversion/trunk/subversion/include/private/svn_utf_private.h
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/include/private/svn_utf_private.h?rev=1731300&r1=1731299&r2=1731300&view=diff
==============================================================================
--- subversion/trunk/subversion/include/private/svn_utf_private.h (original)
+++ subversion/trunk/subversion/include/private/svn_utf_private.h Fri Feb 19 22:11:11 2016
@@ -139,6 +139,9 @@ svn_utf__normcmp(int *result,
  * null-terminated; otherwise, consider the string only up to the
  * given length.
  *
+ * If CASEFOLD is non-zero, perform Unicode case folding, e.g., for
+ * case-insensitive string comparison.
+ *
  * Return the normalized string in *RESULT, which shares storage with
  * BUF and is valid only until the next time BUF is modified.
  *
@@ -148,6 +151,7 @@ svn_utf__normcmp(int *result,
 svn_error_t*
 svn_utf__normalize(const char **result,
                    const char *str, apr_size_t len,
+                   svn_boolean_t casefold,
                    svn_membuf_t *buf);
 
 /* Check if STRING is a valid, NFC-normalized UTF-8 string.  Note that

Modified: subversion/trunk/subversion/libsvn_repos/dump.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_repos/dump.c?rev=1731300&r1=1731299&r2=1731300&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_repos/dump.c (original)
+++ subversion/trunk/subversion/libsvn_repos/dump.c Fri Feb 19 22:11:11 2016
@@ -897,7 +897,7 @@ extract_mergeinfo_paths(void *baton, con
   if (xb->normalize)
     {
       const char *normkey;
-      SVN_ERR(svn_utf__normalize(&normkey, key, klen, &xb->buffer));
+      SVN_ERR(svn_utf__normalize(&normkey, key, klen, FALSE, &xb->buffer));
       svn_hash_sets(xb->result,
                     apr_pstrdup(xb->buffer.pool, normkey),
                     normalized_unique);
@@ -951,7 +951,7 @@ verify_mergeinfo_normalization(void *bat
   const char *normpath;
   const char *found;
 
-  SVN_ERR(svn_utf__normalize(&normpath, path, klen, &vb->buffer));
+  SVN_ERR(svn_utf__normalize(&normpath, path, klen, FALSE, &vb->buffer));
   found = svn_hash_gets(vb->normalized_paths, normpath);
   if (!found)
       svn_hash_sets(vb->normalized_paths,
@@ -2233,7 +2233,7 @@ check_name_collision(void *baton, const
   const char *name;
   const char *found;
 
-  SVN_ERR(svn_utf__normalize(&name, key, klen, &cb->buffer));
+  SVN_ERR(svn_utf__normalize(&name, key, klen, FALSE, &cb->buffer));
 
   found = svn_hash_gets(cb->normalized, name);
   if (!found)
@@ -2252,7 +2252,7 @@ check_name_collision(void *baton, const
 
       SVN_ERR(svn_utf__normalize(
                   &normpath, svn_relpath_join(db->path, name, iterpool),
-                  SVN_UTF__UNKNOWN_LENGTH, &cb->buffer));
+                  SVN_UTF__UNKNOWN_LENGTH, FALSE, &cb->buffer));
       notify_warning(iterpool, eb->notify_func, eb->notify_baton,
                      svn_repos_notify_warning_name_collision,
                      _("Duplicate representation of path '%s'"), normpath);

Modified: subversion/trunk/subversion/libsvn_subr/utf8proc.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf8proc.c?rev=1731300&r1=1731299&r2=1731300&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf8proc.c (original)
+++ subversion/trunk/subversion/libsvn_subr/utf8proc.c Fri Feb 19 22:11:11 2016
@@ -126,15 +126,20 @@ decompose_normalized(apr_size_t *result_
  * STRING. Upon return, BUFFER->data points at a NUL-terminated string
  * of UTF-8 characters.
  *
+ * If CASEFOLD is non-zero, perform Unicode case folding, e.g., for
+ * case-insensitive string comparison.
+ *
  * A returned error may indicate that STRING contains invalid UTF-8 or
  * invalid Unicode codepoints. Any error message comes from utf8proc.
  */
 static svn_error_t *
 normalize_cstring(apr_size_t *result_length,
                   const char *string, apr_size_t length,
+                  svn_boolean_t casefold,
                   svn_membuf_t *buffer)
 {
-  ssize_t result = unicode_decomposition(0, string, length, buffer);
+  ssize_t result = unicode_decomposition(casefold ? UTF8PROC_CASEFOLD : 0,
+                                         string, length, buffer);
   if (result >= 0)
     {
       svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
@@ -199,10 +204,11 @@ svn_utf__normcmp(int *result,
 svn_error_t*
 svn_utf__normalize(const char **result,
                    const char *str, apr_size_t len,
+                   svn_boolean_t casefold,
                    svn_membuf_t *buf)
 {
   apr_size_t result_length;
-  SVN_ERR(normalize_cstring(&result_length, str, len, buf));
+  SVN_ERR(normalize_cstring(&result_length, str, len, casefold, buf));
   *result = (const char*)(buf->data);
   return SVN_NO_ERROR;
 }
@@ -359,7 +365,7 @@ svn_utf__is_normalized(const char *strin
   apr_size_t result_length;
   const apr_size_t length = strlen(string);
   svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
-  err = normalize_cstring(&result_length, string, length, &buffer);
+  err = normalize_cstring(&result_length, string, length, FALSE, &buffer);
   if (err)
     {
       svn_error_clear(err);

Modified: subversion/trunk/subversion/svn/cl-log.h
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/svn/cl-log.h?rev=1731300&r1=1731299&r2=1731300&view=diff
==============================================================================
--- subversion/trunk/subversion/svn/cl-log.h (original)
+++ subversion/trunk/subversion/svn/cl-log.h Fri Feb 19 22:11:11 2016
@@ -31,6 +31,8 @@
 
 #include "svn_types.h"
 
+#include "private/svn_string_private.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */
@@ -70,6 +72,9 @@ typedef struct svn_cl__log_receiver_bato
    * the log message, or a changed path matches one of these patterns. */
   apr_array_header_t *search_patterns;
 
+  /* Buffer for Unicode normalization and case folding. */
+  svn_membuf_t buffer;
+
   /* Pool for persistent allocations. */
   apr_pool_t *pool;
 } svn_cl__log_receiver_baton;

Modified: subversion/trunk/subversion/svn/log-cmd.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/svn/log-cmd.c?rev=1731300&r1=1731299&r2=1731300&view=diff
==============================================================================
--- subversion/trunk/subversion/svn/log-cmd.c (original)
+++ subversion/trunk/subversion/svn/log-cmd.c Fri Feb 19 22:11:11 2016
@@ -38,6 +38,7 @@
 
 #include "private/svn_cmdline_private.h"
 #include "private/svn_sorts_private.h"
+#include "private/svn_utf_private.h"
 
 #include "cl.h"
 #include "cl-log.h"
@@ -110,6 +111,24 @@ display_diff(const svn_log_entry_t *log_
   return SVN_NO_ERROR;
 }
 
+/* Return TRUE if STR matches PATTERN. Else, return FALSE. Assumes that
+ * PATTERN is a UTF-8 string normalized to form C with case folding
+ * applied. Use BUF for temporary allocations. */
+static svn_boolean_t
+match(const char *pattern, const char *str, svn_membuf_t *buf)
+{
+  svn_error_t *err;
+
+  err = svn_utf__normalize(&str, str, strlen(str), TRUE, buf);
+  if (err)
+    {
+      /* Can't match invalid data. */
+      svn_error_clear(err);
+      return FALSE;
+    }
+
+  return apr_fnmatch(pattern, str, 0) == APR_SUCCESS;
+}
 
 /* Return TRUE if SEARCH_PATTERN matches the AUTHOR, DATE, LOG_MESSAGE,
  * or a path in the set of keys of the CHANGED_PATHS hash. Else, return FALSE.
@@ -120,22 +139,22 @@ match_search_pattern(const char *search_
                      const char *date,
                      const char *log_message,
                      apr_hash_t *changed_paths,
+                     svn_membuf_t *buf,
                      apr_pool_t *pool)
 {
   /* Match any substring containing the pattern, like UNIX 'grep' does. */
   const char *pattern = apr_psprintf(pool, "*%s*", search_pattern);
-  int flags = 0;
 
   /* Does the author match the search pattern? */
-  if (author && apr_fnmatch(pattern, author, flags) == APR_SUCCESS)
+  if (author && match(pattern, author, buf))
     return TRUE;
 
   /* Does the date the search pattern? */
-  if (date && apr_fnmatch(pattern, date, flags) == APR_SUCCESS)
+  if (date && match(pattern, date, buf))
     return TRUE;
 
   /* Does the log message the search pattern? */
-  if (log_message && apr_fnmatch(pattern, log_message, flags) == APR_SUCCESS)
+  if (log_message && match(pattern, log_message, buf))
     return TRUE;
 
   if (changed_paths)
@@ -150,15 +169,14 @@ match_search_pattern(const char *search_
           const char *path = apr_hash_this_key(hi);
           svn_log_changed_path2_t *log_item;
 
-          if (apr_fnmatch(pattern, path, flags) == APR_SUCCESS)
+          if (match(pattern, path, buf))
             return TRUE;
 
           /* Match copy-from paths, too. */
           log_item = apr_hash_this_val(hi);
           if (log_item->copyfrom_path
               && SVN_IS_VALID_REVNUM(log_item->copyfrom_rev)
-              && apr_fnmatch(pattern,
-                             log_item->copyfrom_path, flags) == APR_SUCCESS)
+              && match(pattern, log_item->copyfrom_path, buf))
             return TRUE;
         }
     }
@@ -168,13 +186,14 @@ match_search_pattern(const char *search_
 
 /* Match all search patterns in SEARCH_PATTERNS against AUTHOR, DATE, MESSAGE,
  * and CHANGED_PATHS. Return TRUE if any pattern matches, else FALSE.
- * SCRACH_POOL is used for temporary allocations. */
+ * BUF and SCRATCH_POOL are used for temporary allocations. */
 static svn_boolean_t
 match_search_patterns(apr_array_header_t *search_patterns,
                       const char *author,
                       const char *date,
                       const char *message,
                       apr_hash_t *changed_paths,
+                      svn_membuf_t *buf,
                       apr_pool_t *scratch_pool)
 {
   int i;
@@ -197,7 +216,7 @@ match_search_patterns(apr_array_header_t
 
           pattern = APR_ARRAY_IDX(pattern_group, j, const char *);
           match = match_search_pattern(pattern, author, date, message,
-                                       changed_paths, iterpool);
+                                       changed_paths, buf, iterpool);
           if (!match)
             break;
         }
@@ -331,7 +350,7 @@ svn_cl__log_entry_receiver(void *baton,
 
   if (lb->search_patterns &&
       ! match_search_patterns(lb->search_patterns, author, date, message,
-                              log_entry->changed_paths2, pool))
+                              log_entry->changed_paths2, &lb->buffer, pool))
     {
       if (log_entry->has_children)
         {
@@ -535,7 +554,7 @@ svn_cl__log_entry_receiver_xml(void *bat
   /* Match search pattern before XML-escaping. */
   if (lb->search_patterns &&
       ! match_search_patterns(lb->search_patterns, author, date, message,
-                              log_entry->changed_paths2, pool))
+                              log_entry->changed_paths2, &lb->buffer, pool))
     {
       if (log_entry->has_children)
         {
@@ -795,6 +814,7 @@ svn_cl__log(apr_getopt_t *os,
   lb.diff_extensions = opt_state->extensions;
   lb.merge_stack = NULL;
   lb.search_patterns = opt_state->search_patterns;
+  svn_membuf__create(&lb.buffer, 0, pool);
   lb.pool = pool;
 
   if (opt_state->xml)

Modified: subversion/trunk/subversion/svn/svn.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/svn/svn.c?rev=1731300&r1=1731299&r2=1731300&view=diff
==============================================================================
--- subversion/trunk/subversion/svn/svn.c (original)
+++ subversion/trunk/subversion/svn/svn.c Fri Feb 19 22:11:11 2016
@@ -56,6 +56,7 @@
 #include "private/svn_opt_private.h"
 #include "private/svn_cmdline_private.h"
 #include "private/svn_subr_private.h"
+#include "private/svn_utf_private.h"
 
 #include "svn_private_config.h"
 
@@ -1868,6 +1869,7 @@ sub_main(int *exit_code, int argc, const
   svn_boolean_t reading_file_from_stdin = FALSE;
   apr_hash_t *changelists;
   apr_hash_t *cfg_hash;
+  svn_membuf_t buf;
 
   received_opts = apr_array_make(pool, SVN_OPT_MAX_OPTIONS, sizeof(int));
 
@@ -1888,6 +1890,9 @@ sub_main(int *exit_code, int argc, const
   /* Init our changelists hash. */
   changelists = apr_hash_make(pool);
 
+  /* Init the temporary buffer. */
+  svn_membuf__create(&buf, 0, pool);
+
   /* Begin processing arguments. */
   opt_state.start_revision.kind = svn_opt_revision_unspecified;
   opt_state.end_revision.kind = svn_opt_revision_unspecified;
@@ -2392,11 +2397,19 @@ sub_main(int *exit_code, int argc, const
         break;
       case opt_search:
         SVN_ERR(svn_utf_cstring_to_utf8(&utf8_opt_arg, opt_arg, pool));
-        add_search_pattern_group(&opt_state, utf8_opt_arg, pool);
+        SVN_ERR(svn_utf__normalize(&utf8_opt_arg, utf8_opt_arg,
+                                   strlen(utf8_opt_arg), TRUE, &buf));
+        add_search_pattern_group(&opt_state,
+                                 apr_pstrdup(pool, utf8_opt_arg),
+                                 pool);
         break;
       case opt_search_and:
         SVN_ERR(svn_utf_cstring_to_utf8(&utf8_opt_arg, opt_arg, pool));
-        add_search_pattern_to_latest_group(&opt_state, utf8_opt_arg, pool);
+        SVN_ERR(svn_utf__normalize(&utf8_opt_arg, utf8_opt_arg,
+                                   strlen(utf8_opt_arg), TRUE, &buf));
+        add_search_pattern_to_latest_group(&opt_state,
+                                           apr_pstrdup(pool, utf8_opt_arg),
+                                           pool);
       case opt_remove_unversioned:
         opt_state.remove_unversioned = TRUE;
         break;

Modified: subversion/trunk/subversion/tests/cmdline/log_tests.py
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/tests/cmdline/log_tests.py?rev=1731300&r1=1731299&r2=1731300&view=diff
==============================================================================
--- subversion/trunk/subversion/tests/cmdline/log_tests.py (original)
+++ subversion/trunk/subversion/tests/cmdline/log_tests.py Fri Feb 19 22:11:11 2016
@@ -2288,13 +2288,13 @@ def log_search(sbox):
   log_chain = parse_log_output(output)
   check_log_chain(log_chain, [7, 6, 3])
 
-  # search is case-sensitive
+  # search is case-insensitive
   exit_code, output, err = svntest.actions.run_and_verify_svn(
                              None, [], 'log', '--search',
                              'FOR REVISION [367]')
 
   log_chain = parse_log_output(output)
-  check_log_chain(log_chain, [])
+  check_log_chain(log_chain, [7, 6, 3])
 
   # multi-pattern search
   exit_code, output, err = svntest.actions.run_and_verify_svn(

Modified: subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/tests/libsvn_subr/utf-test.c?rev=1731300&r1=1731299&r2=1731300&view=diff
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_subr/utf-test.c (original)
+++ subversion/trunk/subversion/tests/libsvn_subr/utf-test.c Fri Feb 19 22:11:11 2016
@@ -823,6 +823,97 @@ test_utf_conversions(apr_pool_t *pool)
 }
 
 
+static svn_error_t *
+test_utf_normalize(apr_pool_t *pool)
+{
+  /* Normalized: NFC */
+  static const char nfc[] =
+    "\xe1\xb9\xa8"              /* S with dot above and below */
+    "\xc5\xaf"                  /* u with ring */
+    "\xe1\xb8\x87"              /* b with macron below */
+    "\xe1\xb9\xbd"              /* v with tilde */
+    "\xe1\xb8\x9d"              /* e with breve and cedilla */
+    "\xc8\x91"                  /* r with double grave */
+    "\xc5\xa1"                  /* s with caron */
+    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
+    "\xe1\xbb\x9d"              /* o with grave and hook */
+    "\xe1\xb9\x8b";             /* n with circumflex below */
+
+  /* Normalized: NFC, case folded */
+  static const char nfc_casefold[] =
+    "\xe1\xb9\xa9"              /* s with dot above and below */
+    "\xc5\xaf"                  /* u with ring */
+    "\xe1\xb8\x87"              /* b with macron below */
+    "\xe1\xb9\xbd"              /* v with tilde */
+    "\xe1\xb8\x9d"              /* e with breve and cedilla */
+    "\xc8\x91"                  /* r with double grave */
+    "\xc5\xa1"                  /* s with caron */
+    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
+    "\xe1\xbb\x9d"              /* o with grave and hook */
+    "\xe1\xb9\x8b";             /* n with circumflex below */
+
+  /* Normalized: NFD */
+  static const char nfd[] =
+    "S\xcc\xa3\xcc\x87"         /* S with dot above and below */
+    "u\xcc\x8a"                 /* u with ring */
+    "b\xcc\xb1"                 /* b with macron below */
+    "v\xcc\x83"                 /* v with tilde */
+    "e\xcc\xa7\xcc\x86"         /* e with breve and cedilla */
+    "r\xcc\x8f"                 /* r with double grave */
+    "s\xcc\x8c"                 /* s with caron */
+    "i\xcc\x88\xcc\x81"         /* i with diaeresis and acute */
+    "o\xcc\x9b\xcc\x80"         /* o with grave and hook */
+    "n\xcc\xad";                /* n with circumflex below */
+
+  /* Mixed, denormalized */
+  static const char mixup[] =
+    "S\xcc\x87\xcc\xa3"         /* S with dot above and below */
+    "\xc5\xaf"                  /* u with ring */
+    "b\xcc\xb1"                 /* b with macron below */
+    "\xe1\xb9\xbd"              /* v with tilde */
+    "e\xcc\xa7\xcc\x86"         /* e with breve and cedilla */
+    "\xc8\x91"                  /* r with double grave */
+    "s\xcc\x8c"                 /* s with caron */
+    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
+    "o\xcc\x80\xcc\x9b"         /* o with grave and hook */
+    "\xe1\xb9\x8b";             /* n with circumflex below */
+
+  /* Invalid UTF-8 */
+  static const char invalid[] =
+    "\xe1\xb9\xa8"              /* S with dot above and below */
+    "\xc5\xaf"                  /* u with ring */
+    "\xe1\xb8\x87"              /* b with macron below */
+    "\xe1\xb9\xbd"              /* v with tilde */
+    "\xe1\xb8\x9d"              /* e with breve and cedilla */
+    "\xc8\x91"                  /* r with double grave */
+    "\xc5\xa1"                  /* s with caron */
+    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
+    "\xe6"                      /* Invalid byte */
+    "\xe1\xb9\x8b";             /* n with circumflex below */
+
+  const char *result;
+  svn_membuf_t buf;
+
+  svn_membuf__create(&buf, 0, pool);
+  SVN_ERR(svn_utf__normalize(&result, nfd, strlen(nfd), FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, nfc);
+  SVN_ERR(svn_utf__normalize(&result, nfd, strlen(nfd), TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, nfc_casefold);
+  SVN_ERR(svn_utf__normalize(&result, mixup, strlen(mixup), FALSE, &buf));
+  SVN_TEST_STRING_ASSERT(result, nfc);
+  SVN_ERR(svn_utf__normalize(&result, mixup, strlen(mixup), TRUE, &buf));
+  SVN_TEST_STRING_ASSERT(result, nfc_casefold);
+
+  SVN_TEST_ASSERT_ERROR(svn_utf__normalize(&result, invalid, strlen(invalid),
+                                           FALSE, &buf),
+                        SVN_ERR_UTF8PROC_ERROR);
+  SVN_TEST_ASSERT_ERROR(svn_utf__normalize(&result, invalid, strlen(invalid),
+                                           TRUE, &buf),
+                        SVN_ERR_UTF8PROC_ERROR);
+
+  return SVN_NO_ERROR;
+}
+
 
 /* The test table.  */
 
@@ -849,6 +940,8 @@ static struct svn_test_descriptor_t test
                    "test svn_utf__is_normalized"),
     SVN_TEST_PASS2(test_utf_conversions,
                    "test svn_utf__utf{16,32}_to_utf8"),
+    SVN_TEST_PASS2(test_utf_normalize,
+                   "test svn_utf__normalize"),
     SVN_TEST_NULL
   };

Re: 'svn log --search': forcing case sensitivity?

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.

Branko Čibej wrote on Tue, Mar 15, 2016 at 06:13:28 +0100:
> On 15.03.2016 01:08, Daniel Shahaf wrote:
> > kotkov@apache.org wrote on Fri, Feb 19, 2016 at 22:11:11 -0000:
> >> +/* Return TRUE if STR matches PATTERN. Else, return FALSE. Assumes that
> >> + * PATTERN is a UTF-8 string normalized to form C with case folding
> >> + * applied. Use BUF for temporary allocations. */
> >> +static svn_boolean_t
> >> +match(const char *pattern, const char *str, svn_membuf_t *buf)
> >> +{
> >> +  svn_error_t *err;
> >> +
> >> +  err = svn_utf__normalize(&str, str, strlen(str), TRUE /* casefold */, buf);
> >> +  if (err)
> >> +    {
> >> +      /* Can't match invalid data. */
> >> +      svn_error_clear(err);
> >> +      return FALSE;
> >> +    }
> >> +
> >> +  return apr_fnmatch(pattern, str, 0) == APR_SUCCESS;
> > Should there be a command-line flag to disable casefolding?
> >
> > E.g., to allow users to grep for identifiers (function/variable/file
> > names) using their exact case?  Do people who use 'log --search' need it
> > to be case-sensitive?  (I don't use 'log --search' often.)
> 
> I'd prefer to keep things simple.
>

Fair enough.  I was concerned that users might perceive removing
case-sensitive search as a regression.

(I don't like having new knobs any more than you do.)

> And as I recall, this whole discussion began because apr_fnmatch
> doesn't like non-ASCII characters?

s/non-ASCII/multibyte/, but yes.

> > Even if casefolding is disabled, we should still apply Unicode
> > normalization to form C.
> 
> There's no particular reason it has to be form C, as long as both the
> pattern and the string are normalized to the same form.

Agreed, that's what I meant to say.

> Using form D is possibly even a bit faster, since that's the internal
> 32-bit representation used by utf8proc. It's a pity we don't have
> a 32-bit-char fnmatch implementation.

> Still, as you note below, normalizing a glob pattern isn't entirely
> trivial to do correctly.
> 
> > P.S. This patch introduces a minor behaviour change: before this patch,
> > the search pattern «foo[A-z]bar» would match the log message «foo_bar»,
> > whereas after this change it would not.  (This is because the pattern is
> > now casefolded between being passed to APR, and '_' is between 'A'
> > and 'z' but not between 'A' and 'Z', when compared as C chars.)  I doubt
> > anyone will notice this behaviour change; I'm just mentioning it for
> > completeness.
> 
> Mmhh ... this is what comes of 'obviously trivial' solutions. :)

The important thing is that there is no _other_ apr_fnmatch() syntax
that changes meaning through case-folding the pattern, at least in
apr-1.5 with flags==0.

Cheers,

Daniel

Re: 'svn log --search': forcing case sensitivity?

Posted by Branko Čibej <br...@apache.org>.

On 15.03.2016 01:08, Daniel Shahaf wrote:
> kotkov@apache.org wrote on Fri, Feb 19, 2016 at 22:11:11 -0000:
>> Author: kotkov
>> Date: Fri Feb 19 22:11:11 2016
>> New Revision: 1731300
>>
>> URL: http://svn.apache.org/viewvc?rev=1731300&view=rev
>> Log:
>> Make svn log --search case-insensitive.
>>
>> Use utf8proc to do the normalization and locale-independent case folding
>> (UTF8PROC_CASEFOLD) for both the search pattern and the input strings.
>>
>> Related discussion is in http://svn.haxx.se/dev/archive-2013-04/0374.shtml
>> (Subject: "log --search test failures on trunk and 1.8.x").
>>
>> +++ subversion/trunk/subversion/svn/log-cmd.c Fri Feb 19 22:11:11 2016
>> @@ -38,6 +38,7 @@
>> @@ -110,6 +111,24 @@
>> +/* Return TRUE if STR matches PATTERN. Else, return FALSE. Assumes that
>> + * PATTERN is a UTF-8 string normalized to form C with case folding
>> + * applied. Use BUF for temporary allocations. */
>> +static svn_boolean_t
>> +match(const char *pattern, const char *str, svn_membuf_t *buf)
>> +{
>> +  svn_error_t *err;
>> +
>> +  err = svn_utf__normalize(&str, str, strlen(str), TRUE /* casefold */, buf);
>> +  if (err)
>> +    {
>> +      /* Can't match invalid data. */
>> +      svn_error_clear(err);
>> +      return FALSE;
>> +    }
>> +
>> +  return apr_fnmatch(pattern, str, 0) == APR_SUCCESS;
> Should there be a command-line flag to disable casefolding?
>
> E.g., to allow users to grep for identifiers (function/variable/file
> names) using their exact case?  Do people who use 'log --search' need it
> to be case-sensitive?  (I don't use 'log --search' often.)

I'd prefer to keep things simple. And as I recall, this whole discussion
began because apr_fnmatch doesn't like non-ASCII characters?

> Even if casefolding is disabled, we should still apply Unicode
> normalization to form C.

There's no particular reason it has to be form C, as long as both the
pattern and the string are normalized to the same form. Using form D is
possibly even a bit faster, since that's the internal 32-bit
representation used by utf8proc. It's a pity we don't have a 32-bit-char
fnmatch implementation.

Still, as you note below, normalizing a glob pattern isn't entirely
trivial to do correctly.

> P.S. This patch introduces a minor behaviour change: before this patch,
> the search pattern «foo[A-z]bar» would match the log message «foo_bar»,
> whereas after this change it would not.  (This is because the pattern is
> now casefolded between being passed to APR, and '_' is between 'A'
> and 'z' but not between 'A' and 'Z', when compared as C chars.)  I doubt
> anyone will notice this behaviour change; I'm just mentioning it for
> completeness.

Mmhh ... this is what comes of 'obviously trivial' solutions. :)

-- Brane

Re: 'svn log --search': forcing case sensitivity?

Posted by Evgeny Kotkov <ev...@visualsvn.com>.

Julian Foad <ju...@gmail.com> writes:

>> Should there be a command-line flag to disable casefolding?
>
> No. Not unless we also want to provide all the other kinds of control for
> specialist searching, such as regexes, specifying which of the metadata
> fields to look in, boolean operators, etc. -- and I think it would be silly
> for the project to go that way. Like Brane said, keep it simple.

+1

Daniel Shahaf <d....@daniel.shahaf.name> writes:

> P.S. This patch introduces a minor behaviour change: before this patch,
> the search pattern «foo[A-z]bar» would match the log message «foo_bar»,
> whereas after this change it would not.  (This is because the pattern is
> now casefolded between being passed to APR, and '_' is between 'A'
> and 'z' but not between 'A' and 'Z', when compared as C chars.)  I doubt
> anyone will notice this behaviour change; I'm just mentioning it for
> completeness.

That's a good point.  However, I am not sure that there is an easy way
to solve this (IMHO, edge) case.


Regards,
Evgeny Kotkov

Re: 'svn log --search': forcing case sensitivity?

Posted by Julian Foad <ju...@gmail.com>.

Daniel Shahaf wrote:
> Should there be a command-line flag to disable casefolding?

No. Not unless we also want to provide all the other kinds of control 
for specialist searching, such as regexes, specifying which of the 
metadata fields to look in, boolean operators, etc. -- and I think it 
would be silly for the project to go that way. Like Brane said, keep it 
simple.

- Julian

'svn log --search': forcing case sensitivity? (was: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c)

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.

kotkov@apache.org wrote on Fri, Feb 19, 2016 at 22:11:11 -0000:
> Author: kotkov
> Date: Fri Feb 19 22:11:11 2016
> New Revision: 1731300
> 
> URL: http://svn.apache.org/viewvc?rev=1731300&view=rev
> Log:
> Make svn log --search case-insensitive.
> 
> Use utf8proc to do the normalization and locale-independent case folding
> (UTF8PROC_CASEFOLD) for both the search pattern and the input strings.
> 
> Related discussion is in http://svn.haxx.se/dev/archive-2013-04/0374.shtml
> (Subject: "log --search test failures on trunk and 1.8.x").
> 
> +++ subversion/trunk/subversion/svn/log-cmd.c Fri Feb 19 22:11:11 2016
> @@ -38,6 +38,7 @@
> @@ -110,6 +111,24 @@
> +/* Return TRUE if STR matches PATTERN. Else, return FALSE. Assumes that
> + * PATTERN is a UTF-8 string normalized to form C with case folding
> + * applied. Use BUF for temporary allocations. */
> +static svn_boolean_t
> +match(const char *pattern, const char *str, svn_membuf_t *buf)
> +{
> +  svn_error_t *err;
> +
> +  err = svn_utf__normalize(&str, str, strlen(str), TRUE /* casefold */, buf);
> +  if (err)
> +    {
> +      /* Can't match invalid data. */
> +      svn_error_clear(err);
> +      return FALSE;
> +    }
> +
> +  return apr_fnmatch(pattern, str, 0) == APR_SUCCESS;

Should there be a command-line flag to disable casefolding?

E.g., to allow users to grep for identifiers (function/variable/file
names) using their exact case?  Do people who use 'log --search' need it
to be case-sensitive?  (I don't use 'log --search' often.)

Even if casefolding is disabled, we should still apply Unicode
normalization to form C.

Cheers,

Daniel

P.S. This patch introduces a minor behaviour change: before this patch,
the search pattern «foo[A-z]bar» would match the log message «foo_bar»,
whereas after this change it would not.  (This is because the pattern is
now casefolded between being passed to APR, and '_' is between 'A'
and 'z' but not between 'A' and 'Z', when compared as C chars.)  I doubt
anyone will notice this behaviour change; I'm just mentioning it for
completeness.

> +}

Re: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c

Posted by Stefan Sperling <st...@elego.de>.

On Sun, Feb 21, 2016 at 02:22:45PM +0100, Branko Čibej wrote:
> Personally I'd much prefer the svn_utf__casefold() you propose (i.e.,
> normalize plus casefold) as a separate API. Internally, it can be
> implemented with that extra flag, but even for a private API, I think
> it's better to make each function do one thing.

+1

Re: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c

Posted by Evgeny Kotkov <ev...@visualsvn.com>.

Evgeny Kotkov <ev...@visualsvn.com> writes:

> We can drop the `normalize' argument, since keeping denormalized strings
> around is dangerous and unnecessary, but I'd leave the other two and let the
> caller specify the wanted behavior:
>
>     svn_error_t *
>     svn_utf__xfrm(const char **result,
>                   const char *str,
>                   apr_size_t len,
>                   svn_boolean_t case_insensitive,
>                   svn_boolean_t accent_insensitive,
>                   svn_membuf_t *buf);
>
> I attached the patch that does that.  What do you think?

I committed the patch in r1735614.


Regards,
Evgeny Kotkov

Re: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c

Posted by Evgeny Kotkov <ev...@visualsvn.com>.

Branko Čibej <br...@apache.org> writes:

> The big question here is what we'll use the API for. Currently we have a
> 'normalize' function that's used by svn_fs_verify (IIRC). Since we're
> talking about a funciton that transforms a UTF-8 string to a shape
> suitable for stuff-insensitive comparison, we could follow the example
> of the standard strxfrm() -> svn_utf__xfrm(); but if that's too ugly, my
> preference is for svn_utf__fold().
>
> However, I'd not add arguments for normalization/case folding/etc; I'd
> just make this function DTRT without any additional flags, because
> otherwise we'll always be second-guessing the correct invocation.

One use case that I keep in mind is doing server-side search or filtering,
where a client tells the server what kind of comparison and matching she
expects to get.

The strxfrm() function doesn't define the transformation in terms of
preserving case or diacritical marks.  Hence, we can't have svn_utf__xfrm()
doing the right thing for svn log --search, as that would mean that a
libsvn_subr function controls the behavior of the command-line client.
And while a private function somewhere around svn.c could be doing that,
hardcoding this kind of behavior in libsvn_subr doesn't sound proper to me.

We can drop the `normalize' argument, since keeping denormalized strings
around is dangerous and unnecessary, but I'd leave the other two and let the
caller specify the wanted behavior:

    svn_error_t *
    svn_utf__xfrm(const char **result,
                  const char *str,
                  apr_size_t len,
                  svn_boolean_t case_insensitive,
                  svn_boolean_t accent_insensitive,
                  svn_membuf_t *buf);

I attached the patch that does that.  What do you think?

Regards,
Evgeny Kotkov

Re: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c

Posted by Branko Čibej <br...@apache.org>.

On 24.02.2016 14:30, Evgeny Kotkov wrote:
> Branko Čibej <br...@apache.org> writes:
>> Instead of relying on the Unicode spec, I propose a different approach:
>> to treat accented letters as if they don't have diacriticals at all.
>> This should be fairly easy to do with utf8proc: in the intermediate,
>> 32-bit NFD string, remove any character that's in the
>> combining-diacritical group, and then convert the result to NFC UTF-8.
>> I've done this before with fairly good results; it's also much easier to
>> explain this behaviour to users than to tell them, "read the Unicode spec".
> I see that utf8proc has UTF8PROC_STRIPMARK flag that does something
> similar to what you describe.  The difference is that this option strips the
> codepoints that fall into either Mn (Nonspacing_Mark), Mc (Spacing_Mark) or
> Me (Enclosing_Mark) categories [1].
>
> Although that's more than just removing the characters that are marked as
> Combining Diacritical Marks [2,3,4,5], I am thinking that we could just use
> this flag.  How does this cope with what you propose?

This is probably even better than just removing combining diacriticals,
because it should work well with non-latin/cyrillic scripts, too.

> Another question is about exposing this ability in the API.  I'd say that we
> could do something like this:
>
>   svn_utf__transform(svn_boolean_t normalize,
>                      svn_boolean_t casefold,
>                      svn_boolean_t remove_diacritics)
>
>   (or maybe svn_utf__map / svn_utf__alter / svn_utf__fold?)
>
> Do you have an opinion or suggestions about that?

The big question here is what we'll use the API for. Currently we have a
'normalize' function that's used by svn_fs_verify (IIRC). Since we're
talking about a funciton that transforms a UTF-8 string to a shape
suitable for stuff-insensitive comparison, we could follow the example
of the standard strxfrm() -> svn_utf__xfrm(); but if that's too ugly, my
preference is for svn_utf__fold().

However, I'd not add arguments for normalization/case folding/etc; I'd
just make this function DTRT without any additional flags, because
otherwise we'll always be second-guessing the correct invocation.

If there's a use case for case-folding vs. non-case folding, then make
two functions: svn_utf__xfrm and svn_utf__xfrm_casefold.

(Again, obviously, all of these -- including svn_utf__normalize -- need
only one private impltmentation in the source.)

-- Brane

Re: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c

Posted by Evgeny Kotkov <ev...@visualsvn.com>.

Branko Čibej <br...@apache.org> writes:

> Personally I'd much prefer the svn_utf__casefold() you propose (i.e.,
> normalize plus casefold) as a separate API. Internally, it can be
> implemented with that extra flag, but even for a private API, I think
> it's better to make each function do one thing.

After giving it more thought, I agree that a separate API is a better choice
here.  For now, I added svn_utf__casefold() in r1732152.

> Instead of relying on the Unicode spec, I propose a different approach:
> to treat accented letters as if they don't have diacriticals at all.
> This should be fairly easy to do with utf8proc: in the intermediate,
> 32-bit NFD string, remove any character that's in the
> combining-diacritical group, and then convert the result to NFC UTF-8.
> I've done this before with fairly good results; it's also much easier to
> explain this behaviour to users than to tell them, "read the Unicode spec".

I see that utf8proc has UTF8PROC_STRIPMARK flag that does something
similar to what you describe.  The difference is that this option strips the
codepoints that fall into either Mn (Nonspacing_Mark), Mc (Spacing_Mark) or
Me (Enclosing_Mark) categories [1].

Although that's more than just removing the characters that are marked as
Combining Diacritical Marks [2,3,4,5], I am thinking that we could just use
this flag.  How does this cope with what you propose?

Another question is about exposing this ability in the API.  I'd say that we
could do something like this:

  svn_utf__transform(svn_boolean_t normalize,
                     svn_boolean_t casefold,
                     svn_boolean_t remove_diacritics)

  (or maybe svn_utf__map / svn_utf__alter / svn_utf__fold?)

Do you have an opinion or suggestions about that?

[1] http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
[2] http://www.unicode.org/charts/PDF/U0300.pdf
[3] http://www.unicode.org/charts/PDF/U1AB0.pdf
[4] http://www.unicode.org/charts/PDF/U1DC0.pdf
[5] http://www.unicode.org/charts/PDF/U20D0.pdf

Regards,
Evgeny Kotkov

Re: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c

Posted by Branko Čibej <br...@apache.org>.

On 20.02.2016 22:16, Evgeny Kotkov wrote:
> Branko Čibej <br...@apache.org> writes:
>
>> Not really. For example, 'á' and 'A' are equivalent, but 'ß' and 'SS'
>> are not — whereas the latter should be equivalent in German, but I doubt
>> very much that utf8proc does that right. Case-insensitive comparison
>> must *always* be done in the context of a well-defined locale. Anything
>> that calls itself "locale-independent" is likely to be wrong in a really
>> huge number of cases.
> The Unicode Standard (Section 3.13 Default Case Algorithms) is quite clear
> on how case-insensitive matching should be done [1]:
>
>     Default caseless matching is the process of comparing two strings for
>     case-insensitive equality. The definitions of Unicode Default Caseless
>     Matching build on the definitions of Unicode Default Case Folding.
>
>        Default Caseless Matching uses full case folding:
>
>        A string X is a caseless match for a string Y if and only if:
>        toCasefold(X) = toCasefold(Y)
>
>        toCasefold(X): Map each character C in X to Case_Folding(C).
>
>        Case_Folding(C) uses the mappings with the status field value “C” or
>        “F” in the data file CaseFolding.txt in the Unicode Character Database.
>
>     When comparing strings for case-insensitive equality, the strings should
>     also be normalized for most correct results.
>
> The behavior we get with this patch is well-defined and follows the spec,
> since we normalize and fold the case of the strings with utf8proc.  (The
> UTF8PROC_CASEFOLD flag results in full C + F case folding as per [2],
> omitting special case T.)

It may be well-defined and follow the spec, but the important question
IMO is whether the behaviour makes sense for our users. Bert mentioned
Turkish (i ∼ İ and I ∼ ı), I mentioned German, where ß ∼ SS, and French,
where ò ∼ O but that doesn't hold in Italian, where ò ∼ Ò ... these are
just off the top of my head, there must be tens or even hundreds of
examples where locale-independent case folding can give unexpected results.

Instead of relying on the Unicode spec, I propose a different approach:
to treat accented letters as if they don't have diacriticals at all.
This should be fairly easy to do with utf8proc: in the intermediate,
32-bit NFD string, remove any character that's in the
combining-diacritical group, and then convert the result to NFC UTF-8.
I've done this before with fairly good results; it's also much easier to
explain this behaviour to users than to tell them, "read the Unicode spec".

>>> But I'm wondering why you added this feature to an existing function?
>>>
>>> I don't think it is recommended practice to perform the normalization this
>>> way and adding a boolean to an existing function makes it easier to do
>>> perform things in a not recommended way.
>> Adding flags that drastically change the semantics of a function is just
>> broken API design, period.
> I don't think that we expose this functionality in a broken way.  There aren't
> that many options to choose from, since we need to perform the normalization
> and the case folding in a single call to utf8proc, with appropriate flags set.
> We could add an svn_utf__casefold() function that does both, but I'd rather
> prefer what we have now.
>
> After all, the maintainers of utf8proc expose its features in a quite similar
> fashion [3] — with a normalize_string(..., casefold=true/false) function.

Heh, quoting bad examples doesn't make the API change any better. :)

Personally I'd much prefer the svn_utf__casefold() you propose (i.e.,
normalize plus casefold) as a separate API. Internally, it can be
implemented with that extra flag, but even for a private API, I think
it's better to make each function do one thing.

-- Brane

Re: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c

Posted by Evgeny Kotkov <ev...@visualsvn.com>.

Branko Čibej <br...@apache.org> writes:

> Not really. For example, 'á' and 'A' are equivalent, but 'ß' and 'SS'
> are not — whereas the latter should be equivalent in German, but I doubt
> very much that utf8proc does that right. Case-insensitive comparison
> must *always* be done in the context of a well-defined locale. Anything
> that calls itself "locale-independent" is likely to be wrong in a really
> huge number of cases.

The Unicode Standard (Section 3.13 Default Case Algorithms) is quite clear
on how case-insensitive matching should be done [1]:

    Default caseless matching is the process of comparing two strings for
    case-insensitive equality. The definitions of Unicode Default Caseless
    Matching build on the definitions of Unicode Default Case Folding.

       Default Caseless Matching uses full case folding:

       A string X is a caseless match for a string Y if and only if:
       toCasefold(X) = toCasefold(Y)

       toCasefold(X): Map each character C in X to Case_Folding(C).

       Case_Folding(C) uses the mappings with the status field value “C” or
       “F” in the data file CaseFolding.txt in the Unicode Character Database.

    When comparing strings for case-insensitive equality, the strings should
    also be normalized for most correct results.

The behavior we get with this patch is well-defined and follows the spec,
since we normalize and fold the case of the strings with utf8proc.  (The
UTF8PROC_CASEFOLD flag results in full C + F case folding as per [2],
omitting special case T.)

>> But I'm wondering why you added this feature to an existing function?
>>
>> I don't think it is recommended practice to perform the normalization this
>> way and adding a boolean to an existing function makes it easier to do
>> perform things in a not recommended way.
>
> Adding flags that drastically change the semantics of a function is just
> broken API design, period.

I don't think that we expose this functionality in a broken way.  There aren't
that many options to choose from, since we need to perform the normalization
and the case folding in a single call to utf8proc, with appropriate flags set.
We could add an svn_utf__casefold() function that does both, but I'd rather
prefer what we have now.

After all, the maintainers of utf8proc expose its features in a quite similar
fashion [3] — with a normalize_string(..., casefold=true/false) function.

[1] http://www.unicode.org/versions/Unicode8.0.0/ch03.pdf
[2] http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
[3] https://julia.readthedocs.org/en/latest/stdlib/strings/#Base.normalize_string

Regards,
Evgeny Kotkov

Re: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c

Posted by Branko Čibej <br...@apache.org>.

On 20.02.2016 12:09, Bert Huijben wrote:
>
>> -----Original Message-----
>> From: kotkov@apache.org [mailto:kotkov@apache.org]
>> Sent: vrijdag 19 februari 2016 23:11
>> To: commits@subversion.apache.org
>> Subject: svn commit: r1731300 - in /subversion/trunk/subversion:
>> include/private/svn_utf_private.h libsvn_repos/dump.c
>> libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c
>> tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c
>>
>> Author: kotkov
>> Date: Fri Feb 19 22:11:11 2016
>> New Revision: 1731300
>>
>> URL: http://svn.apache.org/viewvc?rev=1731300&view=rev
>> Log:
>> Make svn log --search case-insensitive.
>>
>> Use utf8proc to do the normalization and locale-independent case folding
>> (UTF8PROC_CASEFOLD) for both the search pattern and the input strings.
>>
>> Related discussion is in http://svn.haxx.se/dev/archive-2013-04/0374.shtml
>> (Subject: "log --search test failures on trunk and 1.8.x").
>>
>> * subversion/include/private/svn_utf_private.h
>>   (svn_utf__normalize): Add new boolean argument to perform case folding.
> Usually it is far more efficient to perform the comparison on the unnormalized strings using the apis, than to normalize and perform the operation later.  I'm not sure if utf8proc supports this feature though
>
> But I'm wondering why you added this feature to an existing function?
>
> I don't think it is recommended practice to perform the normalization this way and adding a boolean to an existing function makes it easier to do perform things in a not recommended way.

Adding flags that drastically change the semantics of a function is just
broken API design, period.

> Locale independent case folding is not that well defined... Things like the Turkish 'i' that doesn't fold, so any decision on that makes it locale dependent. (n this case probably by choosing not Turkish, but that doesn't make it 'locale independent'.
>
> Just folding the western European characters is much easier to explain/document.

Not really. For example, 'á' and 'A' are equivalent, but 'ß' and 'SS'
are not — whereas the latter should be equivalent in German, but I doubt
very much that utf8proc does that right. Case-insensitive comparison
must *always* be done in the context of a well-defined locale. Anything
that calls itself "locale-independent" is likely to be wrong in a really
huge number of cases.

Furthermore, this fix is not in any way related to the discussion that
the log message points to ... it's pretty clear that the bug is in
apr_fnmatch and should be fixed there.

In light of the above, I'd like to see some more discussion about a
realistic solution to the problem. So-called "locale-independent" case
folding isn't locale-independent, and I don't see how it can solve the
original problem without introducing a bunch of new bugs. It's scary
enough that the test cases don't try anything other than accented latin
characters.

-- Brane

RE: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c

Posted by Bert Huijben <be...@qqmail.nl>.


> -----Original Message-----
> From: kotkov@apache.org [mailto:kotkov@apache.org]
> Sent: vrijdag 19 februari 2016 23:11
> To: commits@subversion.apache.org
> Subject: svn commit: r1731300 - in /subversion/trunk/subversion:
> include/private/svn_utf_private.h libsvn_repos/dump.c
> libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c
> tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c
> 
> Author: kotkov
> Date: Fri Feb 19 22:11:11 2016
> New Revision: 1731300
> 
> URL: http://svn.apache.org/viewvc?rev=1731300&view=rev
> Log:
> Make svn log --search case-insensitive.
> 
> Use utf8proc to do the normalization and locale-independent case folding
> (UTF8PROC_CASEFOLD) for both the search pattern and the input strings.
> 
> Related discussion is in http://svn.haxx.se/dev/archive-2013-04/0374.shtml
> (Subject: "log --search test failures on trunk and 1.8.x").
> 
> * subversion/include/private/svn_utf_private.h
>   (svn_utf__normalize): Add new boolean argument to perform case folding.

Usually it is far more efficient to perform the comparison on the unnormalized strings using the apis, than to normalize and perform the operation later.  I'm not sure if utf8proc supports this feature though

But I'm wondering why you added this feature to an existing function?

I don't think it is recommended practice to perform the normalization this way and adding a boolean to an existing function makes it easier to do perform things in a not recommended way.



Locale independent case folding is not that well defined... Things like the Turkish 'i' that doesn't fold, so any decision on that makes it locale dependent. (n this case probably by choosing not Turkish, but that doesn't make it 'locale independent'.

Just folding the western European characters is much easier to explain/document.

	Bert

'svn log --search': forcing case sensitivity? (was: svn commit: r1731300 - in /subversion/trunk/subversion: include/private/svn_utf_private.h libsvn_repos/dump.c libsvn_subr/utf8proc.c svn/cl-log.h svn/log-cmd.c svn/svn.c tests/cmdline/log_tests.py tests/libsvn_subr/utf-test.c)

Posted by Daniel Shahaf <d....@daniel.shahaf.name>.

kotkov@apache.org wrote on Fri, Feb 19, 2016 at 22:11:11 -0000:
> Author: kotkov
> Date: Fri Feb 19 22:11:11 2016
> New Revision: 1731300
> 
> URL: http://svn.apache.org/viewvc?rev=1731300&view=rev
> Log:
> Make svn log --search case-insensitive.
> 
> Use utf8proc to do the normalization and locale-independent case folding
> (UTF8PROC_CASEFOLD) for both the search pattern and the input strings.
> 
> Related discussion is in http://svn.haxx.se/dev/archive-2013-04/0374.shtml
> (Subject: "log --search test failures on trunk and 1.8.x").
> 
> +++ subversion/trunk/subversion/svn/log-cmd.c Fri Feb 19 22:11:11 2016
> @@ -38,6 +38,7 @@
> @@ -110,6 +111,24 @@
> +/* Return TRUE if STR matches PATTERN. Else, return FALSE. Assumes that
> + * PATTERN is a UTF-8 string normalized to form C with case folding
> + * applied. Use BUF for temporary allocations. */
> +static svn_boolean_t
> +match(const char *pattern, const char *str, svn_membuf_t *buf)
> +{
> +  svn_error_t *err;
> +
> +  err = svn_utf__normalize(&str, str, strlen(str), TRUE /* casefold */, buf);
> +  if (err)
> +    {
> +      /* Can't match invalid data. */
> +      svn_error_clear(err);
> +      return FALSE;
> +    }
> +
> +  return apr_fnmatch(pattern, str, 0) == APR_SUCCESS;

Should there be a command-line flag to disable casefolding?

E.g., to allow users to grep for identifiers (function/variable/file
names) using their exact case?  Do people who use 'log --search' need it
to be case-sensitive?  (I don't use 'log --search' often.)

Even if casefolding is disabled, we should still apply Unicode
normalization to form C.

Cheers,

Daniel

P.S. This patch introduces a minor behaviour change: before this patch,
the search pattern «foo[A-z]bar» would match the log message «foo_bar»,
whereas after this change it would not.  (This is because the pattern is
now casefolded between being passed to APR, and '_' is between 'A'
and 'z' but not between 'A' and 'Z', when compared as C chars.)  I doubt
anyone will notice this behaviour change; I'm just mentioning it for
completeness.

> +}