You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by mt...@apache.org on 2009/07/01 09:13:06 UTC

svn commit: r790049 - in /commons/sandbox/runtime/trunk/src: main/native/include/acr_string.h main/native/os/unix/main.c main/native/shared/string.c main/native/test/testcase.c test/org/apache/commons/runtime/TestPrivate.java

Author: mturk
Date: Wed Jul  1 07:13:06 2009
New Revision: 790049

URL: http://svn.apache.org/viewvc?rev=790049&view=rev
Log:
Use APR ucs-utf8 conversion and add multi-part string support

Modified:
    commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h
    commons/sandbox/runtime/trunk/src/main/native/os/unix/main.c
    commons/sandbox/runtime/trunk/src/main/native/shared/string.c
    commons/sandbox/runtime/trunk/src/main/native/test/testcase.c
    commons/sandbox/runtime/trunk/src/test/org/apache/commons/runtime/TestPrivate.java

Modified: commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h?rev=790049&r1=790048&r2=790049&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h (original)
+++ commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h Wed Jul  1 07:13:06 2009
@@ -166,6 +166,62 @@
 ACR_DECLARE(wchar_t *) ACR_StrdupW(JNIEnv *env, const char *file, int line,
                                    const wchar_t *s);
 
+/**
+ * Apache's "replacement" for the strtok_r() function that uses
+ * a single char instead of a set of delimiters.
+ * @param str String to tokenize, or NULL to continue from @a last.
+ * @param sep Token delimiting character.
+ * @param last Caller supplied pointer used for maintaining the state.
+ * @return Next token, or NULL if there are no more tokens.
+ */
+ACR_DECLARE(char *) ACR_strtok_c(char *str, int sep, char **last);
+
+/**
+ * Apache's "replacement" for the wcstok_r() function that uses
+ * a single char instead of a set of delimiters.
+ * @param str String to tokenize, or NULL to continue from @a last.
+ * @param sep Token delimiting character.
+ * @param last Caller supplied pointer used for maintaining the state.
+ * @return Next token, or NULL if there are no more tokens.
+ */
+ACR_DECLARE(wchar_t *) ACR_wcstok_c(wchar_t *str, int sep, wchar_t **last);
+
+/**
+ * Count the number of string parts in a multi string. Ansi version.
+ * <p> Multi strings are zero separated, double zero
+ * terminated strings.
+ * </p>
+ * @param s Multi string to use. May be NULL.
+ * @return Number of string parts, or zero if @a s is NULL or empty.
+ */
+ACR_DECLARE(size_t) ACR_MszStrCountA(const char *s);
+
+/**
+ * Count the number of string parts in a multi string. Unicode version.
+ * <p> Multi strings are zero separated, double zero
+ * terminated strings.
+ * </p>
+ * @param s Multi string to use. May be NULL.
+ * @return Number of string parts, or zero if @a s is NULL or empty.
+ */
+ACR_DECLARE(size_t) ACR_MszStrCountW(const wchar_t *s);
+
+/**
+ * Convert the multipart string to Java String array. Ansi version.
+ * @param env Current JNI environment.
+ * @param s Multi string to use.
+ * @return Java string array, or NULL if no array was created.
+ */
+ACR_DECLARE(jobjectArray) ACR_MszStrToStringArrayA(JNIEnv *env, const char *s);
+
+/**
+ * Convert the multipart string to Java String array. Unicode version.
+ * @param env Current JNI environment.
+ * @param s Multi string to use.
+ * @return Java string array, or NULL if no array was created.
+ */
+ACR_DECLARE(jobjectArray) ACR_MszStrToStringArrayW(JNIEnv *env, const wchar_t *s);
+
 #ifdef __cplusplus
 }
 #endif

Modified: commons/sandbox/runtime/trunk/src/main/native/os/unix/main.c
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/os/unix/main.c?rev=790049&r1=790048&r2=790049&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/os/unix/main.c (original)
+++ commons/sandbox/runtime/trunk/src/main/native/os/unix/main.c Wed Jul  1 07:13:06 2009
@@ -117,3 +117,4 @@
     return tlsd->env;
 }
 
+

Modified: commons/sandbox/runtime/trunk/src/main/native/shared/string.c
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/shared/string.c?rev=790049&r1=790048&r2=790049&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/shared/string.c (original)
+++ commons/sandbox/runtime/trunk/src/main/native/shared/string.c Wed Jul  1 07:13:06 2009
@@ -128,28 +128,279 @@
     return rv;
 }
 
+/* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646"
+ * with particular attention to canonical translation forms (see section 10
+ * "Security Considerations" of the RFC for more info).
+ *
+ * Since several architectures including Windows support unicode, with UCS2
+ * used as the actual storage conventions by that architecture, these functions
+ * exist to transform or validate UCS2 strings into APR's 'char' type
+ * convention.  It is left up to the operating system to determine the
+ * validity of the string, e.g. normative forms, in the context of
+ * its native language support.  Other file systems which support filename
+ * characters of 0x80-0xff but have no explicit requirement for Unicode
+ * will find this function useful only for validating the character sequences
+ * and rejecting poorly encoded UTF8 sequences.
+ *
+ * Len UCS-4 range (hex) UTF-8 octet sequence (binary)
+ * 1:2 00000000-0000007F 0xxxxxxx
+ * 2:2 00000080-000007FF 110XXXXx 10xxxxxx
+ * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx
+ * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
+ *     00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *     04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * One of the X bits must be 1 to avoid overlong representation of ucs2 values.
+ *
+ * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF,
+ * and the final two forms are used only by full ucs4, per RFC 3629;
+ *
+ *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
+ *   Unicode parlance), being actually UCS-4 characters transformed
+ *   through UTF-16, need special treatment: the UTF-16 transformation
+ *   must be undone, yielding a UCS-4 character that is then transformed
+ *   as above."
+ *
+ * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
+ *
+ *  U' = U - 0x10000
+ *  U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
+ *                    W1 = 110110yy yyyyyyyy
+ *                    W2 = 110111xx xxxxxxxx
+ *  Max U' = 0000 00001111 11111111 11111111
+ *  Max U  = 0000 00010000 11111111 11111111
+ *
+ * Len in the table above is a mapping of bytes used for utf8:ucs2 values,
+ * which results in these conclusions of maximum allocations;
+ *
+ *  conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
+ *  conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
+ */
+static int conv_utf8_to_ucs2(const char *in, jsize inbytes,
+                             jchar *out, jsize *outwords)
+{
+    acr_int64_t newch, mask;
+    jsize expect, eating;
+    int ch;
+
+    while (inbytes && *outwords) {
+        ch = (unsigned char)(*in++);
+        if (!(ch & 0200)) {
+            /* US-ASCII-7 plain text, one input byte gives one output word
+             */
+            --inbytes;
+            --*outwords;
+            *(out++) = ch;
+        }
+        else {
+            if ((ch & 0300) != 0300) {
+                /* Multibyte Continuation is out of place
+                 */
+                return ACR_EINVAL;
+            }
+            else {
+                /* Multibyte Sequence Lead Character
+                 *
+                 * Compute the expected continuation bytes while adjusting
+                 * our lead byte and leading zeros mask.
+                 */
+                mask = 0340;
+                expect = 1;
+                while ((ch & mask) == mask) {
+                    mask |= mask >> 1;
+                    if (++expect > 3) /* (truly 5 for ucs-4) */
+                        return ACR_EINVAL;
+                }
+                newch = ch & ~mask;
+                eating = expect + 1;
+                if (inbytes <= expect)
+                    return ACR_INCOMPLETE;
+                /* Reject values of excessive leading 0 bits
+                 * utf-8 _demands_ the shortest possible byte length
+                 */
+                if (expect == 1) {
+                    if (!(newch & 0036))
+                        return ACR_EINVAL;
+                }
+                else {
+                    /* Reject values of excessive leading 0 bits
+                     */
+                    if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
+                        return ACR_EINVAL;
+                    if (expect == 2) {
+                        /* Reject values D800-DFFF when not utf16 encoded
+                         * (may not be an appropriate restriction for ucs-4)
+                         */
+                        if (newch == 0015 && ((unsigned char)*in & 0040))
+                            return ACR_EINVAL;
+                    }
+                    else if (expect == 3) {
+                        /* Short circuit values above 0x10FFFF
+                         */
+                        if (newch > 4)
+                            return ACR_EINVAL;
+                        if (newch == 4 && ((unsigned char)*in & 0060))
+                            return ACR_EINVAL;
+                    }
+                }
+                /* Where the boolean (expect > 2) is true, we will need
+                 * an extra word for the output.
+                 */
+                if (*outwords < (jsize)(expect > 2) + 1)
+                    break; /* buffer full */
+                while (expect--) {
+                    /* Multibyte Continuation must be legal */
+                    if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
+                        return ACR_EINVAL;
+                    newch <<= 6;
+                    newch |= (ch & 0077);
+                }
+                inbytes -= eating;
+                /* newch is now a true ucs-4 character
+                 *
+                 * now we need to fold to ucs-2
+                 */
+                if (newch < 0x10000) {
+                    --*outwords;
+                    *(out++) = (jchar) newch;
+                }
+                else {
+                    *outwords -= 2;
+                    newch -= 0x10000;
+                    *(out++) = (jchar) (0xD800 | (newch >> 10));
+                    *(out++) = (jchar) (0xDC00 | (newch & 0x03FF));
+                }
+            }
+        }
+    }
+    /* Buffer full 'errors' aren't errors; since inbytes is passed by value
+     * the caller can only inspect the updated *outwords value
+     */
+    return ACR_SUCCESS;
+}
+
+/* Java implementation of GetStringUTF is bogus.
+ * It breaks on embedded NUL in strings.
+ * Use the APR implementation instead.
+ */
+static int conv_ucs2_to_utf8(const jchar *in, jsize inwords,
+                             char *out, jsize *outbytes)
+{
+    acr_int64_t newch, require;
+    jsize need;
+    char *invout;
+    int ch;
+
+    while (inwords && *outbytes) {
+        ch = (unsigned short)(*in++);
+        if (ch < 0x80) {
+            --inwords;
+            --*outbytes;
+            *(out++) = (unsigned char) ch;
+        }
+        else  {
+            if ((ch & 0xFC00) == 0xDC00) {
+                /* Invalid Leading ucs-2 Multiword Continuation Character
+                 */
+                return ACR_EINVAL;
+            }
+            if ((ch & 0xFC00) == 0xD800) {
+                /* Leading ucs-2 Multiword Character
+                 */
+                if (inwords < 2) {
+                    /* Missing ucs-2 Multiword Continuation Character
+                     */
+                    return ACR_INCOMPLETE;
+                }
+                if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
+                    /* Invalid ucs-2 Multiword Continuation Character
+                     */
+                    return ACR_EINVAL;
+                }
+                newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
+                newch += 0x10000;
+            }
+            else {
+                /* ucs-2 Single Word Character
+                 */
+                newch = ch;
+            }
+            /* Determine the absolute minimum utf-8 bytes required
+             */
+            require = newch >> 11;
+            need = 1;
+            while (require)
+                require >>= 5, ++need;
+            if (need >= *outbytes)
+                break; /* Insufficient buffer */
+            inwords   -= (need > 2) + 1;
+            *outbytes -=  need + 1;
+            /* Compute the utf-8 characters in last to first order,
+             * calculating the lead character length bits along the way.
+             */
+            ch = 0200;
+            out += need + 1;
+            invout = out;
+            while (need--) {
+                ch |= ch >> 1;
+                *(--invout) = (unsigned char)(0200 | (newch & 0077));
+                newch >>= 6;
+            }
+            /* Compute the lead utf-8 character and move the dest offset
+             */
+            *(--invout) = (unsigned char)(ch | newch);
+        }
+    }
+    /* Buffer full 'errors' aren't errors; since inwords is passed by value
+     * the caller can only inspect the updated *outbytes value
+     */
+    return ACR_SUCCESS;
+}
+
 static char *get_string_utf_8(JNIEnv *_E, jstring str, char *b)
 {
-    const char *sr;
+    jsize sl, nl, ol;
+    const jchar *sr;
     char *rv = NULL;
-    size_t sl;
 
-    sr = (const char *)(*_E)->GetStringUTFChars(_E, str, NULL);
-    if (!sr) {
+    if (!str) {
         return NULL;
     }
-    sl = strlen(sr);
-    if (b && sl < ACR_PBUFF_LEN)
+    if ((*_E)->EnsureLocalCapacity(_E, 2) < 0) {
+        /* JNI out of memory error */
+        return NULL;
+    }
+    sl = (*_E)->GetStringLength(_E, str);
+    nl = ol = sl * 3;   /* worst case: three UTF-8 bytes per UCS-2 word */
+    if (b && nl < ACR_MBUFF_LEN)
         rv = b;
     else {
-        rv = (char *)ACR_Malloc(_E, THROW_FMARK, sl + 1);
-        if (rv == NULL) {
-            (*_E)->ReleaseStringUTFChars(_E, str, sr);
+        rv = (char *)ACR_Malloc(_E, THROW_FMARK, nl + 1);
+        if (!rv) {
+            /* Exception has already been thrown from ACR_Malloc
+             */
+            return NULL;
+        }
+    }
+    sr = (*_E)->GetStringCritical(_E, str, NULL);
+    if (!sr) {
+        if (rv != b)
+            free(rv);
+        return NULL;
+    }
+    else {
+        int rc = conv_ucs2_to_utf8(sr, sl, rv, &nl);
+        /* Always leave the critical region, also on a failed conversion */
+        (*_E)->ReleaseStringCritical(_E, str, sr);
+        if (rc != ACR_SUCCESS) {
+            /* XXX: Throw some exception ?
+             */
+            if (rv != b)
+                free(rv);
             return NULL;
         }
     }
-    strcpy(rv, sr);
-    (*_E)->ReleaseStringUTFChars(_E, str, sr);
+    rv[ol - nl] = '\0';     /* ol - nl is the number of bytes written */
     return rv;
 }
 
@@ -226,6 +477,30 @@
     return rs;
 }
 
+static jstring new_string_utf_8(JNIEnv *_E, const char *s)
+{
+    jstring rs = NULL;      /* stays NULL for NULL input or on any failure */
+    if (s) {
+        jsize sl = (jsize)strlen(s);
+        if (sl < ACR_MBUFF_SIZ) {
+            jchar  cc[ACR_MBUFF_SIZ];   /* short input converts on the stack */
+            jsize  wl = ACR_MBUFF_LEN;  /* in: room in cc, out: room left */
+            if (conv_utf8_to_ucs2(s, sl, cc, &wl) == ACR_SUCCESS)
+                rs = (*_E)->NewString(_E, cc, ACR_MBUFF_LEN - wl);
+        }
+        else {
+            jchar  *cc;
+            if ((cc = ACR_Malloc(_E, THROW_FMARK, (sl + 1) * sizeof(jchar)))) {
+                jsize wl = sl;  /* words written is sl - wl, not the byte count */
+                if (conv_utf8_to_ucs2(s, sl, cc, &wl) == ACR_SUCCESS)
+                    rs = (*_E)->NewString(_E, cc, sl - wl);
+                free(cc);
+            }
+        }
+    }
+    return rs;
+}
+
 
 /*
  * Apache's "replacement" for the strncpy() function. We roll our
@@ -505,7 +780,7 @@
             return new_string_iso_8859_1(_E, str);
         break;
         case ACR_CP_UTF_8:
-            return (*_E)->NewStringUTF(_E, str);
+            return new_string_utf_8(_E, str);
         break;
         default:
             return new_string_default(_E, str);
@@ -588,3 +863,121 @@
     return (str[x] != L'\0');
 }
 
+ACR_DECLARE(char *) ACR_strtok_c(char *str, int sep, char **last)
+{
+    char *sp;
+    if (!str)           /* subsequent call */
+        str = *last;    /* start where we left off */
+    if (!str)           /* no more tokens */
+        return NULL;
+    if ((sp = strchr(str, sep))) {
+        *sp++ = '\0';   /* terminate the token in place */
+        *last = sp;     /* the next call resumes after the separator */
+        return str;
+    }
+    else {
+        *last = NULL;   /* final token: make the next call return NULL */
+        return *str ? str : NULL;   /* an empty tail is not a token */
+    }
+}
+
+ACR_DECLARE(wchar_t *) ACR_wcstok_c(wchar_t *str, int sep, wchar_t **last)
+{
+    wchar_t *sp;
+    if (!str)           /* subsequent call */
+        str = *last;    /* start where we left off */
+    if (!str)           /* no more tokens */
+        return NULL;
+    if ((sp = wcschr(str, (wchar_t)sep))) {
+        *sp++ = L'\0';  /* terminate the token in place */
+        *last = sp;     /* the next call resumes after the separator */
+        return str;
+    }
+    else {
+        *last = NULL;   /* final token: make the next call return NULL */
+        return *str ? str : NULL;   /* an empty tail is not a token */
+    }
+}
+
+ACR_DECLARE(size_t) ACR_MszStrCountA(const char *s)
+{
+    size_t parts = 0;
+
+    /* An empty part (the extra terminating NUL) ends the list */
+    while (s && *s) {
+        ++parts;
+        while (*s++)
+            ;   /* step past this part and its NUL */
+    }
+    return parts;
+}
+
+ACR_DECLARE(size_t) ACR_MszStrCountW(const wchar_t *s)
+{
+    size_t parts = 0;
+
+    /* An empty part (the extra terminating NUL) ends the list */
+    while (s && *s) {
+        ++parts;
+        while (*s++)
+            ;   /* step past this part and its NUL */
+    }
+    return parts;
+}
+
+ACR_DECLARE(jobjectArray) ACR_MszStrToStringArrayA(JNIEnv *_E,
+                                                   const char *str)
+{
+    jobjectArray result;
+    jsize count, idx = 0;
+
+    /* A NULL or empty multi string yields no array */
+    if (!str || (count = ACR_MszStrCountA(str)) <= 0)
+        return NULL;
+    result = ACR_NewCoreObjectArray(_E, ACR_CC_STRING, count);
+    if (!result)
+        return NULL;
+
+    /* One Java string per part; a part that could not be converted
+     * leaves its slot unset and the walk goes on with the next part. */
+    while (*str) {
+        jstring js = ACR_NewJavaStringA(_E, str);
+        if (js) {
+            (*_E)->SetObjectArrayElement(_E, result, idx, js);
+            (*_E)->DeleteLocalRef(_E, js);
+        }
+        idx++;
+        while (*str++)
+            ;   /* step past this part and its NUL */
+    }
+    return result;
+}
+
+ACR_DECLARE(jobjectArray) ACR_MszStrToStringArrayW(JNIEnv *_E,
+                                                   const wchar_t *str)
+{
+    jobjectArray result;
+    jsize count, idx = 0;
+
+    /* A NULL or empty multi string yields no array */
+    if (!str || (count = ACR_MszStrCountW(str)) <= 0)
+        return NULL;
+    result = ACR_NewCoreObjectArray(_E, ACR_CC_STRING, count);
+    if (!result)
+        return NULL;
+
+    /* One Java string per part; a part that could not be converted
+     * leaves its slot unset and the walk goes on with the next part. */
+    while (*str) {
+        jstring js = ACR_NewJavaStringW(_E, str);
+        if (js) {
+            (*_E)->SetObjectArrayElement(_E, result, idx, js);
+            (*_E)->DeleteLocalRef(_E, js);
+        }
+        idx++;
+        while (*str++)
+            ;   /* step past this part and its NUL */
+    }
+    return result;
+}
+

Modified: commons/sandbox/runtime/trunk/src/main/native/test/testcase.c
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/test/testcase.c?rev=790049&r1=790048&r2=790049&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/test/testcase.c (original)
+++ commons/sandbox/runtime/trunk/src/main/native/test/testcase.c Wed Jul  1 07:13:06 2009
@@ -326,6 +326,28 @@
     }
 }
 
+ACR_JNI_EXPORT_DECLARE(jobjectArray, TestPrivate, test031)(ACR_JNISTDARGS, jstring s)
+{
+    jobjectArray a = NULL;  /* returned as is when nothing below assigns it */
+
+    WITH_CSTR(s) {  /* presumably exposes s as a C string for J2S() - confirm */
+        a = ACR_MszStrToStringArrayA(_E, J2S(s));
+    } END_WITH_CSTR(s);
+
+	return a;
+}
+
+ACR_JNI_EXPORT_DECLARE(jobjectArray, TestPrivate, test032)(ACR_JNISTDARGS, jstring s)
+{
+    jobjectArray a = NULL;  /* returned as is when nothing below assigns it */
+
+    WITH_WSTR(s) {  /* presumably exposes s as a wide string for J2W() - confirm */
+        a = ACR_MszStrToStringArrayW(_E, J2W(s));
+    } END_WITH_WSTR(s);
+
+	return a;
+}
+
 
 ACR_JNI_EXPORT_DECLARE(jint, TestFile, ftest00)(ACR_JNISTDARGS, jint d)
 {

Modified: commons/sandbox/runtime/trunk/src/test/org/apache/commons/runtime/TestPrivate.java
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/test/org/apache/commons/runtime/TestPrivate.java?rev=790049&r1=790048&r2=790049&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/test/org/apache/commons/runtime/TestPrivate.java (original)
+++ commons/sandbox/runtime/trunk/src/test/org/apache/commons/runtime/TestPrivate.java Wed Jul  1 07:13:06 2009
@@ -80,6 +80,10 @@
     private static native File   test029(int d);
 
     private static native void   test030(int d);
+    private static native String[] test031(String s); // ACR_MszStrToStringArrayA
+    private static native String[] test032(String s); // ACR_MszStrToStringArrayW
+
+
     private static native String test100(String s);
 
 
@@ -600,6 +604,21 @@
         }
     }
 
+    public void testMszAnsi()
+        throws Throwable
+    {
+        byte[] b = { 'A', 0, 'B', 0, 'C', 0, 0 }; // parts A, B, C + final NUL
+        String[] a = test031(new String(b, "US-ASCII")); // no platform default
+        assertEquals("Length", 3, a.length);
+    }
+
+    public void testMszWide()
+        throws Throwable
+    {
+        final String[] parts = test032("A\0B\0C\0\0"); // parts A, B, C
+        assertEquals("Length", 3, parts.length);
+    }
+
     public void testModuleSSL()
         throws Throwable
     {