You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by mt...@apache.org on 2009/07/01 09:13:06 UTC
svn commit: r790049 - in /commons/sandbox/runtime/trunk/src:
main/native/include/acr_string.h main/native/os/unix/main.c
main/native/shared/string.c main/native/test/testcase.c
test/org/apache/commons/runtime/TestPrivate.java
Author: mturk
Date: Wed Jul 1 07:13:06 2009
New Revision: 790049
URL: http://svn.apache.org/viewvc?rev=790049&view=rev
Log:
Use APR ucs-utf8 conversion and add multi-part string support
Modified:
commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h
commons/sandbox/runtime/trunk/src/main/native/os/unix/main.c
commons/sandbox/runtime/trunk/src/main/native/shared/string.c
commons/sandbox/runtime/trunk/src/main/native/test/testcase.c
commons/sandbox/runtime/trunk/src/test/org/apache/commons/runtime/TestPrivate.java
Modified: commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h?rev=790049&r1=790048&r2=790049&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h (original)
+++ commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h Wed Jul 1 07:13:06 2009
@@ -166,6 +166,62 @@
ACR_DECLARE(wchar_t *) ACR_StrdupW(JNIEnv *env, const char *file, int line,
const wchar_t *s);
+/**
+ * Apache's "replacement" for the strtok_r() function that uses
+ * a single char instead of a set of delimiters.
+ * @param str string to tokenize.
+ * @param sep Token delimiting character.
+ * @param last internal buffer for maintaining the state.
+ * @return Token.
+ */
+ACR_DECLARE(char *) ACR_strtok_c(char *str, int sep, char **last);
+
+/**
+ * Apache's "replacement" for the wcstok_r() function that uses
+ * a single char instead of a set of delimiters.
+ * @param str string to tokenize.
+ * @param sep Token delimiting character.
+ * @param last internal buffer for maintaining the state.
+ * @return Token.
+ */
+ACR_DECLARE(wchar_t *) ACR_wcstok_c(wchar_t *str, int sep, wchar_t **last);
+
+/**
+ * Count the number of string parts in a multi-string. ANSI version.
+ * <p>
+ * Multi-strings are zero-separated, double-zero-terminated strings.
+ * </p>
+ * @param s Multi-string to examine.
+ * @return Number of string parts.
+ */
+ACR_DECLARE(size_t) ACR_MszStrCountA(const char *s);
+
+/**
+ * Count the number of string parts in a multi-string. Unicode version.
+ * <p>
+ * Multi-strings are zero-separated, double-zero-terminated strings.
+ * </p>
+ * @param s Multi-string to examine.
+ * @return Number of string parts.
+ */
+ACR_DECLARE(size_t) ACR_MszStrCountW(const wchar_t *s);
+
+/**
+ * Convert the multipart string to Java String array.
+ * @param env Current JNI environment.
+ * @param s Multi-string (zero-separated, double-zero-terminated) to convert.
+ * @return Java string array.
+ */
+ACR_DECLARE(jobjectArray) ACR_MszStrToStringArrayA(JNIEnv *env, const char *s);
+
+/**
+ * Convert the multipart string to Java String array.
+ * @param env Current JNI environment.
+ * @param s Multi-string (zero-separated, double-zero-terminated) to convert.
+ * @return Java string array.
+ */
+ACR_DECLARE(jobjectArray) ACR_MszStrToStringArrayW(JNIEnv *env, const wchar_t *s);
+
#ifdef __cplusplus
}
#endif
Modified: commons/sandbox/runtime/trunk/src/main/native/os/unix/main.c
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/os/unix/main.c?rev=790049&r1=790048&r2=790049&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/os/unix/main.c (original)
+++ commons/sandbox/runtime/trunk/src/main/native/os/unix/main.c Wed Jul 1 07:13:06 2009
@@ -117,3 +117,4 @@
return tlsd->env;
}
+
Modified: commons/sandbox/runtime/trunk/src/main/native/shared/string.c
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/shared/string.c?rev=790049&r1=790048&r2=790049&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/shared/string.c (original)
+++ commons/sandbox/runtime/trunk/src/main/native/shared/string.c Wed Jul 1 07:13:06 2009
@@ -128,28 +128,279 @@
return rv;
}
+/* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646"
+ * with particular attention to canonical translation forms (see section 10
+ * "Security Considerations" of the RFC for more info).
+ *
+ * Since several architectures including Windows support unicode, with UCS2
+ * used as the actual storage conventions by that architecture, these functions
+ * exist to transform or validate UCS2 strings into APR's 'char' type
+ * convention. It is left up to the operating system to determine the
+ * validity of the string, e.g. normative forms, in the context of
+ * its native language support. Other file systems which support filename
+ * characters of 0x80-0xff but have no explicit requirement for Unicode
+ * will find this function useful only for validating the character sequences
+ * and rejecting poorly encoded UTF8 sequences.
+ *
+ * Len UCS-4 range (hex) UTF-8 octet sequence (binary)
+ * 1:2 00000000-0000007F 0xxxxxxx
+ * 2:2 00000080-000007FF 110XXXXx 10xxxxxx
+ * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx
+ * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
+ * 00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * One of the X bits must be 1 to avoid overlong representation of ucs2 values.
+ *
+ * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF,
+ * and the final two forms are used only by full ucs4, per RFC 3629;
+ *
+ * "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
+ * Unicode parlance), being actually UCS-4 characters transformed
+ * through UTF-16, need special treatment: the UTF-16 transformation
+ * must be undone, yielding a UCS-4 character that is then transformed
+ * as above."
+ *
+ * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
+ *
+ * U' = U - 0x10000
+ * U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
+ * W1 = 110110yy yyyyyyyy
+ * W2 = 110111xx xxxxxxxx
+ * Max U' = 0000 00001111 11111111 11111111
+ * Max U = 0000 00010000 11111111 11111111
+ *
+ * Len in the table above is a mapping of bytes used for utf8:ucs2 values,
+ * which results in these conclusions of maximum allocations;
+ *
+ * conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
+ * conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
+ *
+ * conv_utf8_to_ucs2 reads at most inbytes bytes from in and stores the
+ * resulting 16-bit words through out. On entry *outwords is the number of
+ * words available in out; it is decremented for every word stored, so on
+ * return it holds the number of words left unused. No terminator is
+ * appended to the output.
+ *
+ * Returns ACR_EINVAL for a malformed or non-canonical sequence,
+ * ACR_INCOMPLETE when the input ends inside a multibyte sequence, and
+ * ACR_SUCCESS otherwise, which includes the case where the output buffer
+ * filled up before all of the input was consumed.
+ */
+static int conv_utf8_to_ucs2(const char *in, jsize inbytes,
+ jchar *out, jsize *outwords)
+{
+ acr_int64_t newch, mask;
+ jsize expect, eating;
+ int ch;
+
+ while (inbytes && *outwords) {
+ ch = (unsigned char)(*in++);
+ if (!(ch & 0200)) {
+ /* US-ASCII-7 plain text
+ */
+ --inbytes;
+ --*outwords;
+ *(out++) = ch;
+ }
+ else {
+ if ((ch & 0300) != 0300) {
+ /* Multibyte Continuation is out of place
+ */
+ return ACR_EINVAL;
+ }
+ else {
+ /* Multibyte Sequence Lead Character
+ *
+ * Compute the expected bytes while adjusting
+ * our lead byte and leading zeros mask.
+ * expect counts the continuation bytes that must follow.
+ */
+ mask = 0340;
+ expect = 1;
+ while ((ch & mask) == mask) {
+ mask |= mask >> 1;
+ if (++expect > 3) /* (truly 5 for ucs-4) */
+ return ACR_EINVAL;
+ }
+ newch = ch & ~mask;
+ eating = expect + 1;
+ if (inbytes <= expect)
+ return ACR_INCOMPLETE;
+ /* Reject values of excessive leading 0 bits
+ * utf-8 _demands_ the shortest possible byte length
+ */
+ if (expect == 1) {
+ if (!(newch & 0036))
+ return ACR_EINVAL;
+ }
+ else {
+ /* Reject values of excessive leading 0 bits
+ */
+ if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
+ return ACR_EINVAL;
+ if (expect == 2) {
+ /* Reject values D800-DFFF when not utf16 encoded
+ * (may not be an appropriate restriction for ucs-4)
+ */
+ if (newch == 0015 && ((unsigned char)*in & 0040))
+ return ACR_EINVAL;
+ }
+ else if (expect == 3) {
+ /* Short circuit values >= 0x110000
+ */
+ if (newch > 4)
+ return ACR_EINVAL;
+ if (newch == 4 && ((unsigned char)*in & 0060))
+ return ACR_EINVAL;
+ }
+ }
+ /* Where the boolean (expect > 2) is true, we will need
+ * an extra word for the output.
+ */
+ if (*outwords < (jsize)(expect > 2) + 1)
+ break; /* buffer full */
+ while (expect--) {
+ /* Multibyte Continuation must be legal */
+ if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
+ return ACR_EINVAL;
+ newch <<= 6;
+ newch |= (ch & 0077);
+ }
+ inbytes -= eating;
+ /* newch is now a true ucs-4 character
+ *
+ * now we need to fold to ucs-2
+ */
+ if (newch < 0x10000) {
+ --*outwords;
+ *(out++) = (jchar) newch;
+ }
+ else {
+ *outwords -= 2;
+ newch -= 0x10000;
+ *(out++) = (jchar) (0xD800 | (newch >> 10));
+ *(out++) = (jchar) (0xDC00 | (newch & 0x03FF));
+ }
+ }
+ }
+ }
+ /* Buffer full 'errors' aren't errors, the client must inspect both
+ * the inbytes and outwords values
+ * NOTE: inbytes is passed by value in this version, so only *outwords
+ * is visible to the caller after the call.
+ */
+ return ACR_SUCCESS;
+}
+
+/* Java implementation of GetStringUTF is bogus.
+ * It breaks on embedded NUL in strings.
+ * Use the APR implementation instead.
+ *
+ * conv_ucs2_to_utf8 reads at most inwords 16-bit words from in and stores
+ * the resulting UTF-8 bytes through out. On entry *outbytes is the number
+ * of bytes available in out; it is decremented for every byte stored, so
+ * on return it holds the number of bytes left unused. No terminator is
+ * appended to the output.
+ *
+ * Returns ACR_EINVAL for a low surrogate without a preceding high
+ * surrogate or a high surrogate not followed by a low one, ACR_INCOMPLETE
+ * when the input ends on a high surrogate, and ACR_SUCCESS otherwise,
+ * which includes the case where the output buffer was too small to hold
+ * the whole conversion.
+ */
+static int conv_ucs2_to_utf8(const jchar *in, jsize inwords,
+ char *out, jsize *outbytes)
+{
+ acr_int64_t newch, require;
+ jsize need;
+ char *invout;
+ int ch;
+
+ while (inwords && *outbytes) {
+ ch = (unsigned short)(*in++);
+ if (ch < 0x80) {
+ --inwords;
+ --*outbytes;
+ *(out++) = (unsigned char) ch;
+ }
+ else {
+ if ((ch & 0xFC00) == 0xDC00) {
+ /* Invalid Leading ucs-2 Multiword Continuation Character
+ */
+ return ACR_EINVAL;
+ }
+ if ((ch & 0xFC00) == 0xD800) {
+ /* Leading ucs-2 Multiword Character
+ */
+ if (inwords < 2) {
+ /* Missing ucs-2 Multiword Continuation Character
+ */
+ return ACR_INCOMPLETE;
+ }
+ if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
+ /* Invalid ucs-2 Multiword Continuation Character
+ */
+ return ACR_EINVAL;
+ }
+ newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
+ newch += 0x10000;
+ }
+ else {
+ /* ucs-2 Single Word Character
+ */
+ newch = ch;
+ }
+ /* Determine the absolute minimum utf-8 bytes required
+ * (need counts the continuation bytes; the lead byte is extra)
+ */
+ require = newch >> 11;
+ need = 1;
+ while (require)
+ require >>= 5, ++need;
+ if (need >= *outbytes)
+ break; /* Insufficient buffer */
+ inwords -= (need > 2) + 1;
+ *outbytes -= need + 1;
+ /* Compute the utf-8 characters in last to first order,
+ * calculating the lead character length bits along the way.
+ */
+ ch = 0200;
+ out += need + 1;
+ invout = out;
+ while (need--) {
+ ch |= ch >> 1;
+ *(--invout) = (unsigned char)(0200 | (newch & 0077));
+ newch >>= 6;
+ }
+ /* Compute the lead utf-8 character and move the dest offset
+ */
+ *(--invout) = (unsigned char)(ch | newch);
+ }
+ }
+ /* Buffer full 'errors' aren't errors, the client must inspect both
+ * the inwords and outbytes values
+ * NOTE: inwords is passed by value in this version, so only *outbytes
+ * is visible to the caller after the call.
+ */
+ return ACR_SUCCESS;
+}
+
static char *get_string_utf_8(JNIEnv *_E, jstring str, char *b)
{
- const char *sr;
+ jsize sl, nl;
+ const jchar *sr;
char *rv = NULL;
- size_t sl;
- sr = (const char *)(*_E)->GetStringUTFChars(_E, str, NULL);
- if (!sr) {
+ if (!str) {
return NULL;
}
- sl = strlen(sr);
- if (b && sl < ACR_PBUFF_LEN)
+ if ((*_E)->EnsureLocalCapacity(_E, 2) < 0) {
+ /* JNI out of memory error */
+ return NULL;
+ }
+ sl = (*_E)->GetStringLength(_E, str);
+ nl = sl * 3 / 2;
+ if (b && nl < ACR_MBUFF_LEN)
rv = b;
else {
- rv = (char *)ACR_Malloc(_E, THROW_FMARK, sl + 1);
- if (rv == NULL) {
- (*_E)->ReleaseStringUTFChars(_E, str, sr);
+ rv = (char *)ACR_Malloc(_E, THROW_FMARK, nl + 1);
+ if (!rv) {
+ /* Exception has already neen throw from ACR_Malloc
+ */
+ return NULL;
+ }
+ }
+ sr = (*_E)->GetStringCritical(_E, str, NULL);
+ if (!sr) {
+ if (rv != b)
+ free(rv);
+ return NULL;
+ }
+ else {
+ jsize ol = nl;
+ if (conv_ucs2_to_utf8(sr, sl, rv, &nl) == ACR_SUCCESS)
+ rv[ol - nl] = '\0';
+ else {
+ /* XXX: Throw some exception ?
+ */
+ if (rv != b)
+ free(rv);
return NULL;
}
}
- strcpy(rv, sr);
- (*_E)->ReleaseStringUTFChars(_E, str, sr);
+ (*_E)->ReleaseStringCritical(_E, str, sr);
return rv;
}
@@ -226,6 +477,30 @@
return rs;
}
+/* Create a Java string from a NUL terminated UTF-8 C string.
+ *
+ * _E is the current JNI environment.
+ * s  is the UTF-8 text; NULL yields NULL.
+ *
+ * Short strings are converted in a stack buffer, longer ones in a
+ * temporary heap buffer. Returns NULL if the allocation, the conversion
+ * or NewString fail.
+ */
+static jstring new_string_utf_8(JNIEnv *_E, const char *s)
+{
+    jstring rs = NULL;
+    if (s) {
+        jsize sl = (jsize)strlen(s);
+        if (sl < ACR_MBUFF_SIZ) {
+            jchar cc[ACR_MBUFF_SIZ];
+            /* UTF-8 never produces more UCS-2 words than it has bytes,
+             * so sl words are always enough and always fit in cc.
+             */
+            jsize wl = sl;
+            if (conv_utf8_to_ucs2(s, sl, cc, &wl) == ACR_SUCCESS) {
+                /* sl - wl is the number of words actually written;
+                 * it is smaller than sl for multibyte input.
+                 */
+                rs = (*_E)->NewString(_E, cc, sl - wl);
+            }
+        }
+        else {
+            jchar *cc;
+            if ((cc = ACR_Malloc(_E, THROW_FMARK, (sl + 1) * sizeof(jchar)))) {
+                jsize wl = sl;
+                if (conv_utf8_to_ucs2(s, sl, cc, &wl) == ACR_SUCCESS)
+                    rs = (*_E)->NewString(_E, cc, sl - wl);
+                free(cc);
+            }
+        }
+    }
+    return rs;
+}
+
/*
* Apache's "replacement" for the strncpy() function. We roll our
@@ -505,7 +780,7 @@
return new_string_iso_8859_1(_E, str);
break;
case ACR_CP_UTF_8:
- return (*_E)->NewStringUTF(_E, str);
+ return new_string_utf_8(_E, str);
break;
default:
return new_string_default(_E, str);
@@ -588,3 +863,121 @@
return (str[x] != L'\0');
}
+/* Split a string into tokens separated by the single character sep.
+ *
+ * str  is the string to tokenize on the first call and NULL on every
+ *      following call. The string is modified in place.
+ * sep  is the delimiting character.
+ * last holds the scan position between calls.
+ *
+ * Returns the next token, or NULL when there are no more tokens.
+ */
+ACR_DECLARE(char *) ACR_strtok_c(char *str, int sep, char **last)
+{
+    char *sp;
+    if (!str)           /* subsequent call */
+        str = *last;    /* start where we left off */
+    if (!str)           /* no more tokens */
+        return NULL;
+    if ((sp = strchr(str, sep))) {
+        *sp++ = '\0';
+        *last = sp;
+        return str;
+    }
+    else {
+        /* Nothing is left after this token. Record that, otherwise the
+         * next call would hand out the very same token again.
+         */
+        *last = NULL;
+        /* Check for last empty token */
+        return *str ? str : NULL;
+    }
+}
+
+/* Split a wide string into tokens separated by the single character sep.
+ *
+ * str  is the string to tokenize on the first call and NULL on every
+ *      following call. The string is modified in place.
+ * sep  is the delimiting character.
+ * last holds the scan position between calls.
+ *
+ * Returns the next token, or NULL when there are no more tokens.
+ */
+ACR_DECLARE(wchar_t *) ACR_wcstok_c(wchar_t *str, int sep, wchar_t **last)
+{
+    wchar_t *sp;
+    if (!str)           /* subsequent call */
+        str = *last;    /* start where we left off */
+    if (!str)           /* no more tokens */
+        return NULL;
+    if ((sp = wcschr(str, (wchar_t)sep))) {
+        *sp++ = L'\0';
+        *last = sp;
+        return str;
+    }
+    else {
+        /* Nothing is left after this token. Record that, otherwise the
+         * next call would hand out the very same token again.
+         */
+        *last = NULL;
+        /* Check for last empty token */
+        return *str ? str : NULL;
+    }
+}
+
+/* Count the parts of a zero separated, double zero terminated string.
+ * A NULL or empty input has no parts.
+ */
+ACR_DECLARE(size_t) ACR_MszStrCountA(const char *s)
+{
+    size_t parts = 0;
+    const char *cur = s;
+
+    if (cur == NULL)
+        return 0;
+    while (*cur != '\0') {
+        ++parts;
+        /* Walk to the end of this part and step over its terminator */
+        while (*cur++ != '\0')
+            ;
+    }
+    return parts;
+}
+
+/* Count the parts of a zero separated, double zero terminated wide
+ * string. A NULL or empty input has no parts.
+ */
+ACR_DECLARE(size_t) ACR_MszStrCountW(const wchar_t *s)
+{
+    size_t parts = 0;
+    const wchar_t *cur = s;
+
+    if (cur == NULL)
+        return 0;
+    while (*cur != L'\0') {
+        ++parts;
+        /* Walk to the end of this part and step over its terminator */
+        while (*cur++ != L'\0')
+            ;
+    }
+    return parts;
+}
+
+/* Build a Java String array holding one element per part of a zero
+ * separated, double zero terminated string.
+ *
+ * Returns NULL when str is NULL, has no parts, or the array cannot be
+ * created. A part whose Java string cannot be created leaves its array
+ * slot untouched.
+ */
+ACR_DECLARE(jobjectArray) ACR_MszStrToStringArrayA(JNIEnv *_E,
+                                                   const char *str)
+{
+    jobjectArray result;
+    jsize count;
+    jsize idx = 0;
+    const char *part;
+
+    if (str == NULL)
+        return NULL;
+    count = ACR_MszStrCountA(str);
+    if (count <= 0)
+        return NULL;
+    result = ACR_NewCoreObjectArray(_E, ACR_CC_STRING, count);
+    if (!result)
+        return NULL;
+
+    part = str;
+    while (*part != '\0') {
+        jstring js = ACR_NewJavaStringA(_E, part);
+        if (js) {
+            (*_E)->SetObjectArrayElement(_E, result, idx, js);
+            (*_E)->DeleteLocalRef(_E, js);
+        }
+        idx++;
+        /* Advance past this part and its terminating zero */
+        while (*part++ != '\0')
+            ;
+    }
+    return result;
+}
+
+/* Build a Java String array holding one element per part of a zero
+ * separated, double zero terminated wide string.
+ *
+ * Returns NULL when str is NULL, has no parts, or the array cannot be
+ * created. A part whose Java string cannot be created leaves its array
+ * slot untouched.
+ */
+ACR_DECLARE(jobjectArray) ACR_MszStrToStringArrayW(JNIEnv *_E,
+                                                   const wchar_t *str)
+{
+    jobjectArray result;
+    jsize count;
+    jsize idx = 0;
+    const wchar_t *part;
+
+    if (str == NULL)
+        return NULL;
+    count = ACR_MszStrCountW(str);
+    if (count <= 0)
+        return NULL;
+    result = ACR_NewCoreObjectArray(_E, ACR_CC_STRING, count);
+    if (!result)
+        return NULL;
+
+    part = str;
+    while (*part != L'\0') {
+        jstring js = ACR_NewJavaStringW(_E, part);
+        if (js) {
+            (*_E)->SetObjectArrayElement(_E, result, idx, js);
+            (*_E)->DeleteLocalRef(_E, js);
+        }
+        idx++;
+        /* Advance past this part and its terminating zero */
+        while (*part++ != L'\0')
+            ;
+    }
+    return result;
+}
+
Modified: commons/sandbox/runtime/trunk/src/main/native/test/testcase.c
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/test/testcase.c?rev=790049&r1=790048&r2=790049&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/test/testcase.c (original)
+++ commons/sandbox/runtime/trunk/src/main/native/test/testcase.c Wed Jul 1 07:13:06 2009
@@ -326,6 +326,28 @@
}
}
+/* Native side of TestPrivate.test031: passes the C string form of s to
+ * ACR_MszStrToStringArrayA and returns the array it produces.
+ */
+ACR_JNI_EXPORT_DECLARE(jobjectArray, TestPrivate, test031)(ACR_JNISTDARGS, jstring s)
+{
+    jobjectArray parts = NULL;
+
+    WITH_CSTR(s) {
+        parts = ACR_MszStrToStringArrayA(_E, J2S(s));
+    } END_WITH_CSTR(s);
+
+    return parts;
+}
+
+/* Native side of TestPrivate.test032: passes the wide string form of s to
+ * ACR_MszStrToStringArrayW and returns the array it produces.
+ */
+ACR_JNI_EXPORT_DECLARE(jobjectArray, TestPrivate, test032)(ACR_JNISTDARGS, jstring s)
+{
+    jobjectArray parts = NULL;
+
+    WITH_WSTR(s) {
+        parts = ACR_MszStrToStringArrayW(_E, J2W(s));
+    } END_WITH_WSTR(s);
+
+    return parts;
+}
+
ACR_JNI_EXPORT_DECLARE(jint, TestFile, ftest00)(ACR_JNISTDARGS, jint d)
{
Modified: commons/sandbox/runtime/trunk/src/test/org/apache/commons/runtime/TestPrivate.java
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/test/org/apache/commons/runtime/TestPrivate.java?rev=790049&r1=790048&r2=790049&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/test/org/apache/commons/runtime/TestPrivate.java (original)
+++ commons/sandbox/runtime/trunk/src/test/org/apache/commons/runtime/TestPrivate.java Wed Jul 1 07:13:06 2009
@@ -80,6 +80,10 @@
private static native File test029(int d);
private static native void test030(int d);
+ private static native String[] test031(String s);
+ private static native String[] test032(String s);
+
+
private static native String test100(String s);
@@ -600,6 +604,21 @@
}
}
+    /**
+     * Passes a zero separated, double zero terminated string with three
+     * parts to the native ANSI converter and checks the array length.
+     */
+    public void testMszAnsi()
+        throws Throwable
+    {
+        byte[] b = { 'A', 0, 'B', 0, 'C', 0, 0 };
+        // Decode with an explicit charset so the input does not depend
+        // on the platform default encoding.
+        String[] a = test031(new String(b, "US-ASCII"));
+        assertEquals("Length", 3, a.length);
+    }
+
+    /**
+     * Passes a zero separated, double zero terminated string with three
+     * parts to the native wide converter and checks the array length.
+     */
+    public void testMszWide()
+        throws Throwable
+    {
+        final String multi = "A\0B\0C\0\0";
+        final String[] parts = test032(multi);
+        assertEquals("Length", 3, parts.length);
+    }
+
public void testModuleSSL()
throws Throwable
{