You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@subversion.apache.org by br...@apache.org on 2012/11/10 17:01:13 UTC

svn commit: r1407841 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/utf8proc.c

Author: brane
Date: Sat Nov 10 16:01:12 2012
New Revision: 1407841

URL: http://svn.apache.org/viewvc?rev=1407841&view=rev
Log:
On the wc-collate-path branch: Speed up UCS-4 to UTF-8 encoding.

* subversion/libsvn_subr/utf8proc.c (encode_ucs4): Perform one fewer memcpy
   and don't bother nul-terminating the stringbuf after each encode.
  (encode_ucs4_string): New. Use encode_ucs4 to encode a whole UCS-4 string
   and nul-terminate the result.
  (svn_utf__glob): Update to use encode_ucs4_string and new encode_ucs4.

Modified:
    subversion/branches/wc-collate-path/subversion/libsvn_subr/utf8proc.c

Modified: subversion/branches/wc-collate-path/subversion/libsvn_subr/utf8proc.c
URL: http://svn.apache.org/viewvc/subversion/branches/wc-collate-path/subversion/libsvn_subr/utf8proc.c?rev=1407841&r1=1407840&r2=1407841&view=diff
==============================================================================
--- subversion/branches/wc-collate-path/subversion/libsvn_subr/utf8proc.c (original)
+++ subversion/branches/wc-collate-path/subversion/libsvn_subr/utf8proc.c Sat Nov 10 16:01:12 2012
@@ -145,23 +145,43 @@ svn_utf__normcmp(const char *str1, apr_s
 
 
 /* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER.
+ * This function does *not* nul-terminate the stringbuf!
+ *
  * A returned error indicates that the codepoint is invalud.
  */
 static svn_error_t *
 encode_ucs4(svn_stringbuf_t *buffer, apr_int32_t ucs4chr)
 {
-  char utf8buf[8];     /* The longest UTF-8 sequence has 4 bytes */
-  const apr_size_t utf8len = utf8proc_encode_char(ucs4chr, (void *)utf8buf);
+  apr_size_t utf8len;
 
-  if (utf8len)
-    {
-      svn_stringbuf_appendbytes(buffer, utf8buf, utf8len);
-      return SVN_NO_ERROR;
-    }
+  if (buffer->blocksize - buffer->len < 4)
+    svn_stringbuf_ensure(buffer, 2 * buffer->blocksize - 1);
 
-  return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
-                           "Invalid Unicode character U+%04lX",
-                           (long)ucs4chr);
+  utf8len = utf8proc_encode_char(ucs4chr,
+                                 (void *)(buffer->data + buffer->len));
+  if (!utf8len)
+    return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
+                             "Invalid Unicode character U+%04lX",
+                             (long)ucs4chr);
+  buffer->len += utf8len;
+  return SVN_NO_ERROR;
+}
+
+/* Encode a UCS-4 string as UTF-8, placing the result into BUFFER.
+ * While utf8proc does have a similar function, it does more checking
+ * and processing than we want here.
+ *
+ * A returned error indicates that a code point in the string is invalid.
+ */
+static svn_error_t *
+encode_ucs4_string(svn_stringbuf_t *buffer,
+                   apr_int32_t *ucs4str, apr_size_t len)
+{
+  svn_stringbuf_setempty(buffer);
+  while (len-- > 0)
+    SVN_ERR(encode_ucs4(buffer, *ucs4str++));
+  buffer->data[buffer->len] = '\0';
+  return SVN_NO_ERROR;
 }
 
 
@@ -188,14 +208,10 @@ svn_utf__glob(const char *pattern, apr_s
   /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result
      because apr_fnmatch can't handle it.*/
   SVN_ERR(decompose_normalized(pattern, pattern_len, temp_buf));
-  svn_stringbuf_setempty(pattern_buf);
   if (!sql_like)
-    {
-      const apr_int32_t *const glob = (void *)temp_buf->data;
-      apr_size_t i;
-      for (i = 0; i < temp_buf->len; ++i)
-        SVN_ERR(encode_ucs4(pattern_buf, glob[i]));
-    }
+    SVN_ERR(encode_ucs4_string(pattern_buf,
+                               (void *)temp_buf->data,
+                               temp_buf->len));
   else
     {
       /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
@@ -226,6 +242,7 @@ svn_utf__glob(const char *pattern, apr_s
           SVN_DBG(("<esc  : %c (U+%04lX)\n", (char)(ucs4esc & 0xFF), (long)ucs4esc));
         }
 
+      svn_stringbuf_setempty(pattern_buf);
       for (i = 0, escaped = FALSE; i < temp_buf->len; ++i, ++like)
         {
           if (*like == ucs4esc && !escaped)
@@ -259,18 +276,15 @@ svn_utf__glob(const char *pattern, apr_s
                 SVN_ERR(encode_ucs4(pattern_buf, *like));
             }
         }
+      pattern_buf->data[pattern_buf->len] = '\0';
     }
   SVN_DBG(("glob  : %s\n", pattern_buf->data));
 
   /* Now normalize the string */
   SVN_ERR(decompose_normalized(string, string_len, temp_buf));
-  svn_stringbuf_setempty(string_buf);
-  {
-    const apr_int32_t *const ucs4nfd = (void *)temp_buf->data;
-    apr_size_t i;
-    for (i = 0; i < temp_buf->len; ++i)
-      SVN_ERR(encode_ucs4(string_buf, ucs4nfd[i]));
-  }
+    SVN_ERR(encode_ucs4_string(string_buf,
+                               (void *)temp_buf->data,
+                               temp_buf->len));
   SVN_DBG(("string: %s\n", string_buf->data));
 
   *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);