You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@subversion.apache.org by st...@apache.org on 2012/09/08 00:59:10 UTC

svn commit: r1382204 - /subversion/trunk/subversion/libsvn_subr/utf_validate.c

Author: stefan2
Date: Fri Sep  7 22:59:09 2012
New Revision: 1382204

URL: http://svn.apache.org/viewvc?rev=1382204&view=rev
Log:
Make clear that this is, in fact, the *UTF*_validate file.
So, we are not dealing with true ASCII representations here
but 1-byte encoded UTF-8 codepoints.

Also, support platforms with unsigned chars being default
without giving away any of the efficiency on the others.

* subversion/libsvn_subr/utf_validate.c
  (first_non_ascii_char,
   first_non_ascii_char_cstring): rename to ...
  (first_non_fsm_start_char,
   first_non_fsm_start_char_cstring): ... this; support platforms
   on which chars are not signed by default
  (svn_utf__last_valid,
   svn_utf__cstring_is_valid,
   svn_utf__is_valid,
   svn_utf__last_valid2): update callers

Modified:
    subversion/trunk/subversion/libsvn_subr/utf_validate.c

Modified: subversion/trunk/subversion/libsvn_subr/utf_validate.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf_validate.c?rev=1382204&r1=1382203&r2=1382204&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf_validate.c (original)
+++ subversion/trunk/subversion/libsvn_subr/utf_validate.c Fri Sep  7 22:59:09 2012
@@ -250,11 +250,12 @@ static const char machine [9][14] = {
    FSM_ERROR},        /* 0xf5-0xff */
 };
 
-/* Scan MAX_LEN bytes in *DATA for non-ASCII chars. Return the position
- * of the first non-ASCII char or DATA + MAX_LEN if all were ASCII.
+/* Scan MAX_LEN bytes in *DATA for chars that are not in the octet
+ * category 0 (FSM_START).  Return the position of the first such char
+ * or DATA + MAX_LEN if all were cat 0.
  */
 static const char *
-first_non_ascii_char(const char *data, apr_size_t max_len)
+first_non_fsm_start_char(const char *data, apr_size_t max_len)
 {
 #if !SVN_UNALIGNED_ACCESS_IS_OK
 
@@ -269,8 +270,8 @@ first_non_ascii_char(const char *data, a
       max_len -= len;
 
       for (; len > 0; ++data, --len)
-          if (*data < 0)
-            return data;
+        if (*data < 0 || *data >= 0x80)
+          return data;
     }
     
 #endif
@@ -283,17 +284,18 @@ first_non_ascii_char(const char *data, a
 
   /* The remaining odd bytes will be examined the naive way: */
   for (; max_len > 0; ++data, --max_len)
-    if (*data < 0)
+    if (*data < 0 || *data >= 0x80)
       return data;
 
   return data;
 }
 
-/* Scan the C string in *DATA for non-ASCII chars. Return the position
- * of either the first non-ASCII char or the terminating NUL.
+/* Scan the C string in *DATA for chars that are not in the octet
+ * category 0 (FSM_START).  Return the position of either the first
+ * such char or of the terminating NUL.
  */
 static const char *
-first_non_ascii_char_cstring(const char *data)
+first_non_fsm_start_char_cstring(const char *data)
 {
   /* We need to make sure that BUF is properly aligned for chunky data
    * access because we don't know the string's length. Unaligned chunk
@@ -301,7 +303,7 @@ first_non_ascii_char_cstring(const char 
    * segfault.
    */
   for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data)
-    if (*data <= 0)
+    if (*data <= 0 || *data >= 0x80)
       return data;
 
   /* Scan the input one machine word at a time. */
@@ -320,7 +322,7 @@ first_non_ascii_char_cstring(const char 
 
   /* The remaining odd bytes will be examined the naive way: */
   for (; ; ++data)
-    if (*data <= 0)
+    if (*data <= 0 || *data >= 0x80)
       return data;
 
   return data;
@@ -329,7 +331,7 @@ first_non_ascii_char_cstring(const char 
 const char *
 svn_utf__last_valid(const char *data, apr_size_t len)
 {
-  const char *start = first_non_ascii_char(data, len);
+  const char *start = first_non_fsm_start_char(data, len);
   const char *end = data + len;
   int state = FSM_START;
 
@@ -349,7 +351,7 @@ svn_boolean_t
 svn_utf__cstring_is_valid(const char *data)
 {
   int state = FSM_START;
-  data = first_non_ascii_char_cstring(data);
+  data = first_non_fsm_start_char_cstring(data);
 
   while (*data)
     {
@@ -365,7 +367,7 @@ svn_utf__is_valid(const char *data, apr_
 {
   const char *end = data + len;
   int state = FSM_START;
-  data = first_non_ascii_char(data, len);
+  data = first_non_fsm_start_char(data, len);
 
   while (data < end)
     {
@@ -379,7 +381,7 @@ svn_utf__is_valid(const char *data, apr_
 const char *
 svn_utf__last_valid2(const char *data, apr_size_t len)
 {
-  const char *start = first_non_ascii_char(data, len);
+  const char *start = first_non_fsm_start_char(data, len);
   const char *end = data + len;
   int state = FSM_START;

Re: svn commit: r1382204 - /subversion/trunk/subversion/libsvn_subr/utf_validate.c

Posted by Stefan Fuhrmann <st...@wandisco.com>.

On Sat, Sep 8, 2012 at 3:59 AM, Peter Samuelson <pe...@p12n.org> wrote:

>
> >        for (; len > 0; ++data, --len)
> > -          if (*data < 0)
> > -            return data;
> > +        if (*data < 0 || *data >= 0x80)
> > +          return data;
>
> A reasonable compiler will collapse it anyway, but this is shorter and
> more direct:
>
>         if (*data & 0x80)
>

You are right. I'm just using the same pattern
as for the "<=" variant.

-- Stefan^2.

-- 
*Join us this October at Subversion Live 2012
<http://www.wandisco.com/svn-live-2012> for two days of best practice SVN
training, networking, live demos, committer meet and greet, and more!
Space is limited, so get signed up today
<http://www.wandisco.com/svn-live-2012>!*

Re: svn commit: r1382204 - /subversion/trunk/subversion/libsvn_subr/utf_validate.c

Posted by Branko Čibej <br...@wandisco.com>.

On 08.09.2012 03:59, Peter Samuelson wrote:
>>        for (; len > 0; ++data, --len)
>> -          if (*data < 0)
>> -            return data;
>> +        if (*data < 0 || *data >= 0x80)
>> +          return data;
> A reasonable compiler will collapse it anyway, but this is shorter and
> more direct:
>
>         if (*data & 0x80)

Also less obvious to the casual reader. Let's just leave it to
reasonable compilers to optimize the code for us. I can live with
unreasonable compilers causing the CPU to do more grunt work. :)

-- Brane

-- 
Certified & Supported Apache Subversion Downloads:
http://www.wandisco.com/subversion/download

Re: svn commit: r1382204 - /subversion/trunk/subversion/libsvn_subr/utf_validate.c

Posted by Peter Samuelson <pe...@p12n.org>.

>        for (; len > 0; ++data, --len)
> -          if (*data < 0)
> -            return data;
> +        if (*data < 0 || *data >= 0x80)
> +          return data;

A reasonable compiler will collapse it anyway, but this is shorter and
more direct:

        if (*data & 0x80)

Peter