You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@apr.apache.org by wr...@apache.org on 2008/08/07 19:53:39 UTC
svn commit: r683665 - /apr/apr/trunk/misc/win32/utf8.c

Author: wrowe
Date: Thu Aug  7 10:53:39 2008
New Revision: 683665

URL: http://svn.apache.org/viewvc?rev=683665&view=rev
Log:
Improve explanations, reference appropriate RFC's and add some
exploratory math for the limits.

Modified:
    apr/apr/trunk/misc/win32/utf8.c

Modified: apr/apr/trunk/misc/win32/utf8.c
URL: http://svn.apache.org/viewvc/apr/apr/trunk/misc/win32/utf8.c?rev=683665&r1=683664&r2=683665&view=diff
==============================================================================
--- apr/apr/trunk/misc/win32/utf8.c (original)
+++ apr/apr/trunk/misc/win32/utf8.c Thu Aug  7 10:53:39 2008
@@ -19,31 +19,32 @@
 #include "apr_errno.h"
 #include "apr_arch_utf8.h"
 
-/* Implement the design principal specified by RFC 2718 2.2.5 
- * Guidelines for new URL Schemes - within the APR.
+/* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646"
+ * with particular attention to canonical translation forms (see section 10
+ * "Security Considerations" of the RFC for more info).
+ *
+ * Since several architectures including Windows support unicode, with UCS2
+ * used as the actual storage convention by that architecture, these functions
+ * exist to transform or validate UCS2 strings into APR's 'char' type
+ * convention.  It is left up to the operating system to determine the
+ * validity of the string, e.g. normative forms, in the context of
+ * its native language support.  Other file systems which support filename 
+ * characters of 0x80-0xff but have no explicit requirement for Unicode
+ * will find this function useful only for validating the character sequences 
+ * and rejecting poorly encoded UTF8 sequences.
+ *
+ * Len UCS-4 range (hex) UTF-8 octet sequence (binary)
+ * 1:2 00000000-0000007F 0xxxxxxx
+ * 2:2 00000080-000007FF 110XXXXx 10xxxxxx
+ * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx
+ * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
+ *     00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *     04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  *
- * Since many architectures support unicode, and UCS2 is the most
- * efficient storage used by those archictures, these functions
- * exist to validate a UCS string.  It is up to the operating system
- * to determine the validitity of the string in the context of it's
- * native language support.  File systems that support filename 
- * characters of 0x80-0xff but have no support of Unicode will find 
- * this function useful only for validating the character sequences 
- * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is
- * desired.
- *
- * from RFC 2279 UTF-8, a transformation format of ISO 10646
- *
- *     UCS-4 range (hex.)    UTF-8 octet sequence (binary)
- * 1:2 0000 0000-0000 007F   0xxxxxxx
- * 2:2 0000 0080-0000 07FF   110XXXXx 10xxxxxx
- * 3:2 0000 0800-0000 FFFF   1110XXXX 10Xxxxxx 10xxxxxx
- * 4:4 0001 0000-001F FFFF   11110zXX 10XXxxxx 10xxxxxx 10xxxxxx
- * inv 0020 0000-03FF FFFF   111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
- * inv 0400 0000-7FFF FFFF   1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * One of the X bits must be 1 to avoid overlong representation of ucs2 values. 
  *
- * One of the X values must be one for the encoding length to be legit.
- * Neither the z bit, nor the final two forms, are used for ucs-2
+ * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF,
+ * and the final two forms are used only by full ucs4, per RFC 3629;
  *
  *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in 
  *   Unicode parlance), being actually UCS-4 characters transformed 
@@ -51,16 +52,20 @@
  *   must be undone, yielding a UCS-4 character that is then transformed 
  *   as above."
  *
- * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
+ * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
  *
  *  U' = U - 0x10000
- *  U' = 000000000000yyyyyyyyyyxxxxxxxxxx
- *                  W1 = 110110yyyyyyyyyy
- *                  W2 = 110111xxxxxxxxxx
+ *  U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
+ *                    W1 = 110110yy yyyyyyyy
+ *                    W2 = 110111xx xxxxxxxx
+ *  Max U' = 0000 00001111 11111111 11111111
+ *  Max U  = 0000 00010000 11111111 11111111
  *
- * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
+ * Len in the table above is a mapping of bytes used for utf8:ucs2 values,
+ * which leads to these conclusions about maximum allocations:
  *
- * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
+ *  apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
+ *  apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
  */
 
 APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,