You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@subversion.apache.org by ko...@apache.org on 2016/03/18 16:29:52 UTC
svn commit: r1735614 - in /subversion/trunk/subversion:
include/private/svn_utf_private.h libsvn_subr/utf8proc.c svn/log-cmd.c
svn/svn.c tests/libsvn_subr/utf-test.c
Author: kotkov
Date: Fri Mar 18 15:29:38 2016
New Revision: 1735614
URL: http://svn.apache.org/viewvc?rev=1735614&view=rev
Log:
Ignore diacriticals in svn log --search, so that e.g., `müssen' would
match against `mussen'.
* subversion/include/private/svn_utf_private.h
(svn_utf__casefold): Remove, is now replaced by ...
(svn_utf__xfrm): ...this new function that transforms a given string into
a shape suitable for case- and accent-insensitive comparison.
* subversion/libsvn_subr/utf8proc.c
(normalize_cstring): Add a `stripmark' argument that corresponds
to the UTF8PROC_STRIPMARK transformation flag.
(svn_utf__normalize): Adjust call to normalize_cstring().
(svn_utf__casefold): Remove.
(svn_utf__xfrm): Implement this function by calling normalize_cstring()
with appropriate arguments.
(svn_utf__is_normalized): Adjust call to normalize_cstring().
* subversion/svn/log-cmd.c
(match): Prepare the string for case- and accent-insensitive comparison
by calling svn_utf__xfrm().
* subversion/svn/svn.c
(sub_main): Prepare the pattern for case- and accent-insensitive comparison
by calling svn_utf__xfrm().
* subversion/tests/libsvn_subr/utf-test.c
(test_utf_casefold): Replaced with ...
(test_utf_xfrm): ...this new test. Test the behavior with non-synthetic
examples, such as with `İstanbul'.
(test_funcs): Track the test changes.
Modified:
subversion/trunk/subversion/include/private/svn_utf_private.h
subversion/trunk/subversion/libsvn_subr/utf8proc.c
subversion/trunk/subversion/svn/log-cmd.c
subversion/trunk/subversion/svn/svn.c
subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
Modified: subversion/trunk/subversion/include/private/svn_utf_private.h
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/include/private/svn_utf_private.h?rev=1735614&r1=1735613&r2=1735614&view=diff
==============================================================================
--- subversion/trunk/subversion/include/private/svn_utf_private.h (original)
+++ subversion/trunk/subversion/include/private/svn_utf_private.h Fri Mar 18 15:29:38 2016
@@ -150,22 +150,27 @@ svn_utf__normalize(const char **result,
const char *str, apr_size_t len,
svn_membuf_t *buf);
-/* Normalize the UTF-8 string STR to form C and remove case distinctions
- * with Unicode's Default Caseless Matching algorithm. Use BUF as a
- * temporary storage. If LEN is SVN_UTF__UNKNOWN_LENGTH, assume STR
- * is null-terminated; otherwise, consider the string only up to the
- * given length.
+/* Transform the UTF-8 string to a shape suitable for comparison with
+ * strcmp(). The transformation is defined by CASE_INSENSITIVE and
+ * ACCENT_INSENSITIVE arguments. If CASE_INSENSITIVE is non-zero,
+ * remove case distinctions from the string. If ACCENT_INSENSITIVE
+ * is non-zero, remove diacritical marks from the string.
*
- * Return the resulting string in *RESULT, which shares storage with
- * BUF and is valid only until the next time BUF is modified.
+ * Use BUF as a temporary storage. If LEN is SVN_UTF__UNKNOWN_LENGTH,
+ * assume STR is null-terminated; otherwise, consider the string only
+ * up to the given length. Place the transformed string in *RESULT, which
+ * shares storage with BUF and is valid only until the next time BUF is
+ * modified.
*
* A returned error may indicate that STRING contains invalid UTF-8 or
* invalid Unicode codepoints.
*/
svn_error_t *
-svn_utf__casefold(const char **result,
- const char *str, apr_size_t len,
- svn_membuf_t *buf);
+svn_utf__xfrm(const char **result,
+ const char *str, apr_size_t len,
+ svn_boolean_t case_insensitive,
+ svn_boolean_t accent_insensitive,
+ svn_membuf_t *buf);
/* Check if STRING is a valid, NFC-normalized UTF-8 string. Note that
* a FALSE return value may indicate that STRING is not valid UTF-8 at
Modified: subversion/trunk/subversion/libsvn_subr/utf8proc.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf8proc.c?rev=1735614&r1=1735613&r2=1735614&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf8proc.c (original)
+++ subversion/trunk/subversion/libsvn_subr/utf8proc.c Fri Mar 18 15:29:38 2016
@@ -127,7 +127,8 @@ decompose_normalized(apr_size_t *result_
* of UTF-8 characters.
*
* If CASEFOLD is non-zero, perform Unicode case folding, e.g., for
- * case-insensitive string comparison.
+ * case-insensitive string comparison. If STRIPMARK is non-zero, strip
+ * all diacritical marks (e.g., accents) from the string.
*
* A returned error may indicate that STRING contains invalid UTF-8 or
* invalid Unicode codepoints. Any error message comes from utf8proc.
@@ -136,10 +137,19 @@ static svn_error_t *
normalize_cstring(apr_size_t *result_length,
const char *string, apr_size_t length,
svn_boolean_t casefold,
+ svn_boolean_t stripmark,
svn_membuf_t *buffer)
{
- ssize_t result = unicode_decomposition(casefold ? UTF8PROC_CASEFOLD : 0,
- string, length, buffer);
+ int flags = 0;
+ ssize_t result;
+
+ if (casefold)
+ flags |= UTF8PROC_CASEFOLD;
+
+ if (stripmark)
+ flags |= UTF8PROC_STRIPMARK;
+
+ result = unicode_decomposition(flags, string, length, buffer);
if (result >= 0)
{
svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
@@ -207,18 +217,21 @@ svn_utf__normalize(const char **result,
svn_membuf_t *buf)
{
apr_size_t result_length;
- SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, buf));
+ SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, FALSE, buf));
*result = (const char*)(buf->data);
return SVN_NO_ERROR;
}
svn_error_t *
-svn_utf__casefold(const char **result,
- const char *str, apr_size_t len,
- svn_membuf_t *buf)
+svn_utf__xfrm(const char **result,
+ const char *str, apr_size_t len,
+ svn_boolean_t case_insensitive,
+ svn_boolean_t accent_insensitive,
+ svn_membuf_t *buf)
{
apr_size_t result_length;
- SVN_ERR(normalize_cstring(&result_length, str, len, TRUE, buf));
+ SVN_ERR(normalize_cstring(&result_length, str, len,
+ case_insensitive, accent_insensitive, buf));
*result = (const char*)(buf->data);
return SVN_NO_ERROR;
}
@@ -375,7 +388,8 @@ svn_utf__is_normalized(const char *strin
apr_size_t result_length;
const apr_size_t length = strlen(string);
svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
- err = normalize_cstring(&result_length, string, length, FALSE, &buffer);
+ err = normalize_cstring(&result_length, string, length,
+ FALSE, FALSE, &buffer);
if (err)
{
svn_error_clear(err);
Modified: subversion/trunk/subversion/svn/log-cmd.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/svn/log-cmd.c?rev=1735614&r1=1735613&r2=1735614&view=diff
==============================================================================
--- subversion/trunk/subversion/svn/log-cmd.c (original)
+++ subversion/trunk/subversion/svn/log-cmd.c Fri Mar 18 15:29:38 2016
@@ -112,14 +112,14 @@ display_diff(const svn_log_entry_t *log_
}
/* Return TRUE if STR matches PATTERN. Else, return FALSE. Assumes that
- * PATTERN is a UTF-8 string normalized to form C with case folding
- * applied. Use BUF for temporary allocations. */
+ * PATTERN is a UTF-8 string prepared for case- and accent-insensitive
+ * comparison via svn_utf__xfrm(). */
static svn_boolean_t
match(const char *pattern, const char *str, svn_membuf_t *buf)
{
svn_error_t *err;
- err = svn_utf__casefold(&str, str, strlen(str), buf);
+ err = svn_utf__xfrm(&str, str, strlen(str), TRUE, TRUE, buf);
if (err)
{
/* Can't match invalid data. */
Modified: subversion/trunk/subversion/svn/svn.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/svn/svn.c?rev=1735614&r1=1735613&r2=1735614&view=diff
==============================================================================
--- subversion/trunk/subversion/svn/svn.c (original)
+++ subversion/trunk/subversion/svn/svn.c Fri Mar 18 15:29:38 2016
@@ -2397,16 +2397,16 @@ sub_main(int *exit_code, int argc, const
break;
case opt_search:
SVN_ERR(svn_utf_cstring_to_utf8(&utf8_opt_arg, opt_arg, pool));
- SVN_ERR(svn_utf__casefold(&utf8_opt_arg, utf8_opt_arg,
- strlen(utf8_opt_arg), &buf));
+ SVN_ERR(svn_utf__xfrm(&utf8_opt_arg, utf8_opt_arg,
+ strlen(utf8_opt_arg), TRUE, TRUE, &buf));
add_search_pattern_group(&opt_state,
apr_pstrdup(pool, utf8_opt_arg),
pool);
break;
case opt_search_and:
SVN_ERR(svn_utf_cstring_to_utf8(&utf8_opt_arg, opt_arg, pool));
- SVN_ERR(svn_utf__casefold(&utf8_opt_arg, utf8_opt_arg,
- strlen(utf8_opt_arg), &buf));
+ SVN_ERR(svn_utf__xfrm(&utf8_opt_arg, utf8_opt_arg,
+ strlen(utf8_opt_arg), TRUE, TRUE, &buf));
add_search_pattern_to_latest_group(&opt_state,
apr_pstrdup(pool, utf8_opt_arg),
pool);
Modified: subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/tests/libsvn_subr/utf-test.c?rev=1735614&r1=1735613&r2=1735614&view=diff
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_subr/utf-test.c (original)
+++ subversion/trunk/subversion/tests/libsvn_subr/utf-test.c Fri Mar 18 15:29:38 2016
@@ -898,86 +898,75 @@ test_utf_normalize(apr_pool_t *pool)
static svn_error_t *
-test_utf_casefold(apr_pool_t *pool)
+test_utf_xfrm(apr_pool_t *pool)
{
- /* Normalized: NFC */
- static const char nfc[] =
- "\xe1\xb9\xa8" /* S with dot above and below */
- "\xc5\xaf" /* u with ring */
- "\xe1\xb8\x87" /* b with macron below */
- "\xe1\xb9\xbd" /* v with tilde */
- "\xe1\xb8\x9d" /* e with breve and cedilla */
- "\xc8\x91" /* r with double grave */
- "\xc5\xa1" /* s with caron */
- "\xe1\xb8\xaf" /* i with diaeresis and acute */
- "\xe1\xbb\x9d" /* o with grave and hook */
- "\xe1\xb9\x8b"; /* n with circumflex below */
-
- /* Normalized: NFC, case folded */
- static const char nfc_casefold[] =
- "\xe1\xb9\xa9" /* s with dot above and below */
- "\xc5\xaf" /* u with ring */
- "\xe1\xb8\x87" /* b with macron below */
- "\xe1\xb9\xbd" /* v with tilde */
- "\xe1\xb8\x9d" /* e with breve and cedilla */
- "\xc8\x91" /* r with double grave */
- "\xc5\xa1" /* s with caron */
- "\xe1\xb8\xaf" /* i with diaeresis and acute */
- "\xe1\xbb\x9d" /* o with grave and hook */
- "\xe1\xb9\x8b"; /* n with circumflex below */
-
- /* Normalized: NFD */
- static const char nfd[] =
- "S\xcc\xa3\xcc\x87" /* S with dot above and below */
- "u\xcc\x8a" /* u with ring */
- "b\xcc\xb1" /* b with macron below */
- "v\xcc\x83" /* v with tilde */
- "e\xcc\xa7\xcc\x86" /* e with breve and cedilla */
- "r\xcc\x8f" /* r with double grave */
- "s\xcc\x8c" /* s with caron */
- "i\xcc\x88\xcc\x81" /* i with diaeresis and acute */
- "o\xcc\x9b\xcc\x80" /* o with grave and hook */
- "n\xcc\xad"; /* n with circumflex below */
-
- /* Mixed, denormalized */
- static const char mixup[] =
- "S\xcc\x87\xcc\xa3" /* S with dot above and below */
- "\xc5\xaf" /* u with ring */
- "b\xcc\xb1" /* b with macron below */
- "\xe1\xb9\xbd" /* v with tilde */
- "e\xcc\xa7\xcc\x86" /* e with breve and cedilla */
- "\xc8\x91" /* r with double grave */
- "s\xcc\x8c" /* s with caron */
- "\xe1\xb8\xaf" /* i with diaeresis and acute */
- "o\xcc\x80\xcc\x9b" /* o with grave and hook */
- "\xe1\xb9\x8b"; /* n with circumflex below */
-
- /* Invalid UTF-8 */
- static const char invalid[] =
- "\xe1\xb9\xa8" /* S with dot above and below */
- "\xc5\xaf" /* u with ring */
- "\xe1\xb8\x87" /* b with macron below */
- "\xe1\xb9\xbd" /* v with tilde */
- "\xe1\xb8\x9d" /* e with breve and cedilla */
- "\xc8\x91" /* r with double grave */
- "\xc5\xa1" /* s with caron */
- "\xe1\xb8\xaf" /* i with diaeresis and acute */
- "\xe6" /* Invalid byte */
- "\xe1\xb9\x8b"; /* n with circumflex below */
-
+ const char *str;
const char *result;
svn_membuf_t buf;
svn_membuf__create(&buf, 0, pool);
- SVN_ERR(svn_utf__casefold(&result, nfc, strlen(nfc), &buf));
- SVN_TEST_STRING_ASSERT(result, nfc_casefold);
- SVN_ERR(svn_utf__casefold(&result, nfd, strlen(nfd), &buf));
- SVN_TEST_STRING_ASSERT(result, nfc_casefold);
- SVN_ERR(svn_utf__casefold(&result, mixup, strlen(mixup), &buf));
- SVN_TEST_STRING_ASSERT(result, nfc_casefold);
- SVN_TEST_ASSERT_ERROR(svn_utf__casefold(&result, invalid, strlen(invalid),
- &buf),
+ /* ASCII string */
+ str = "Subversion";
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "Subversion");
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "subversion");
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "Subversion");
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "subversion");
+
+ /* M (u with diaeresis) (sharp s) en */
+ str = "M" "\xc3\xbc" "\xc3\x9f" "en";
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "M" "\xc3\xbc" "\xc3\x9f" "en");
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "m" "\xc3\xbc" "ssen");
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "Mu" "\xc3\x9f" "en");
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "mussen");
+
+ /* Na (i with diaeresis) vet (e with acute), decomposed */
+ str = "Nai" "\xcc\x88" "vete" "\xcc\x81";
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "Na" "\xc3\xaf" "vet" "\xc3\xa9");
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "na" "\xc3\xaf" "vet" "\xc3\xa9");
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "Naivete");
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "naivete");
+
+ /* (I with dot above) stanbul */
+ str = "\xc4\xb0" "stanbul";
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, FALSE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "\xc4\xb0" "stanbul");
+
+ /* The Latin Capital Letter I with Dot Above (0130) should fold into
+ Latin Small Letter I (0069) with Combining Dot Above (0307) per full
+ mapping in http://www.unicode.org/Public/UNIDATA/CaseFolding.txt */
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, FALSE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "i" "\xcc\x87" "stanbul");
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), FALSE, TRUE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "Istanbul");
+ SVN_ERR(svn_utf__xfrm(&result, str, strlen(str), TRUE, TRUE, &buf));
+ SVN_TEST_STRING_ASSERT(result, "istanbul");
+
+ /* Invalid UTF-8 */
+ str = "a" "\xe6" "bc";
+ SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+ FALSE, FALSE, &buf),
+ SVN_ERR_UTF8PROC_ERROR);
+ SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+ TRUE, FALSE, &buf),
+ SVN_ERR_UTF8PROC_ERROR);
+ SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+ FALSE, TRUE, &buf),
+ SVN_ERR_UTF8PROC_ERROR);
+ SVN_TEST_ASSERT_ERROR(svn_utf__xfrm(&result, str, strlen(str),
+ TRUE, TRUE, &buf),
SVN_ERR_UTF8PROC_ERROR);
return SVN_NO_ERROR;
@@ -1011,8 +1000,8 @@ static struct svn_test_descriptor_t test
"test svn_utf__utf{16,32}_to_utf8"),
SVN_TEST_PASS2(test_utf_normalize,
"test svn_utf__normalize"),
- SVN_TEST_PASS2(test_utf_casefold,
- "test svn_utf__casefold"),
+ SVN_TEST_PASS2(test_utf_xfrm,
+ "test svn_utf__xfrm"),
SVN_TEST_NULL
};