You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by mt...@apache.org on 2009/08/27 18:10:38 UTC

svn commit: r808482 - in /commons/sandbox/runtime/trunk/src/main/native: include/acr_string.h shared/string.c test/testcase.c

Author: mturk
Date: Thu Aug 27 16:10:36 2009
New Revision: 808482

URL: http://svn.apache.org/viewvc?rev=808482&view=rev
Log:
Add few more handy string functions

Modified:
    commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h
    commons/sandbox/runtime/trunk/src/main/native/shared/string.c
    commons/sandbox/runtime/trunk/src/main/native/test/testcase.c

Modified: commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h?rev=808482&r1=808481&r2=808482&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h (original)
+++ commons/sandbox/runtime/trunk/src/main/native/include/acr_string.h Thu Aug 27 16:10:36 2009
@@ -204,7 +204,7 @@
  * @param last internal buffer for maintaining the state.
  * @return Token.
  */
-ACR_DECLARE(char *) ACR_strtok_c(char *str, int sep, char **last);
+ACR_DECLARE(char *) ACR_strctok(char *str, int sep, char **last);
 
 /**
  * Apache's "replacement" for the wcstok_r() function that uses
@@ -214,7 +214,18 @@
  * @param last internal buffer for maintaining the state.
  * @return Token.
  */
-ACR_DECLARE(wchar_t *) ACR_wcstok_c(wchar_t *str, int sep, wchar_t **last);
+ACR_DECLARE(wchar_t *) ACR_wcsctok(wchar_t *str, int sep, wchar_t **last);
+
+/**
+ * Apache's "replacement" for the strtok_r() function that uses
+ * a single char instead of a set of delimiters and doesn't tokenize
+ * the strings inside double quotes.
+ * @param str string to tokenize.
+ * @param sep Token delimiting character.
+ * @param last internal buffer for maintaining the state.
+ * @return Token.
+ */
+ACR_DECLARE(char *) ACR_strqctok(char *str, int sep, char **last);
 
 /**
  * Determine the number of tokens in string without
@@ -223,7 +234,7 @@
  * @param sep Token delimiting character.
  * @return Number of tokens.
  */
-ACR_DECLARE(int) ACR_StrTokensA(const char *str, int sep);
+ACR_DECLARE(int)ACR_strnctok(const char *str, int sep);
 
 /**
  * Determine the number of tokens in string without
@@ -232,7 +243,65 @@
  * @param sep Token delimiting character.
  * @return Number of tokens.
  */
-ACR_DECLARE(int) ACR_StrTokensW(const wchar_t *str, int sep);
+ACR_DECLARE(int) ACR_wcsnctok(const wchar_t *str, int sep);
+
+/**
+ * Determine the number of tokens in string without
+ * modifying it and not breaking the quoted strings.
+ * @param str string to tokenize.
+ * @param sep Token delimiting character.
+ * @return Number of tokens.
+ */
+ACR_DECLARE(int)ACR_strnqctok(const char *str, int sep);
+
+/**
+ * Determine the number of tokens in string without
+ * modifying it and not breaking the quoted strings.
+ * @param str string to tokenize.
+ * @param sep Token delimiting character.
+ * @return Number of tokens.
+ */
+ACR_DECLARE(int) ACR_wcsnqctok(const wchar_t *str, int sep);
+
+/**
+ * Determine the number of tokens in string without
+ * modifying it using space characters as delimiters and
+ * not breaking the quoted strings.
+ * @param str string to tokenize.
+ * @return Number of tokens.
+ */
+ACR_DECLARE(int) ACR_strnqtok(const char *str);
+
+/**
+ * Determine the number of tokens in string without
+ * modifying it using space characters as delimiters and
+ * not breaking the quoted strings.
+ * @param str string to tokenize.
+ * @return Number of tokens.
+ */
+ACR_DECLARE(int) ACR_wcsnqtok(const wchar_t *str);
+
+/**
+ * Apache's "replacement" for the strtok_r() function that uses
+ * space characters instead of a set of delimiters and doesn't tokenize
+ * the strings inside double quotes.
+ * @param str string to tokenize, or NULL to continue with the
+ *            string saved in last.
+ * @param last internal buffer for maintaining the state.
+ * @return Token.
+ */
+ACR_DECLARE(char *) ACR_strqtok(char *str, char **last);
+
+/**
+ * Apache's "replacement" for the wcstok_r() function that uses
+ * space characters instead of a set of delimiters and doesn't tokenize
+ * the strings inside double quotes.
+ * @param str string to tokenize, or NULL to continue with the
+ *            string saved in last.
+ * @param last internal buffer for maintaining the state.
+ * @return Token.
+ */
+ACR_DECLARE(wchar_t *) ACR_wcsqtok(wchar_t *str, wchar_t **last);
 
 /**
  * Count the number of string parts in multi string. Ansi version.
@@ -270,9 +339,28 @@
  */
 ACR_DECLARE(jobjectArray) ACR_MszStrToStringArrayW(JNIEnv *env, const wchar_t *s);
 
+/**
+ * Convert each tab character in a string to a single space character.
+ * Tabs inside quotes are not converted.
+ * @param str String to convert; it is modified in place.
+ * @return pointer to the start of the converted string
+ * @note This function is useful to prepare the strings for tokenizing
+ *       where the string can contain both tabs and spaces.
+ */
+ACR_DECLARE(char *) ACR_strqtab2ss(char *str);
+
+/**
+ * Convert each tab character in a string to a single space character.
+ * Tabs inside quotes are not converted.
+ * @param str String to convert; it is modified in place.
+ * @return pointer to the start of the converted string
+ * @note This function is useful to prepare the strings for tokenizing
+ *       where the string can contain both tabs and spaces.
+ */
+ACR_DECLARE(wchar_t *) ACR_wcsqtab2ss(wchar_t *str);
+
 #ifdef __cplusplus
 }
 #endif
 
 #endif /* _ACR_STRING_H */
-

Modified: commons/sandbox/runtime/trunk/src/main/native/shared/string.c
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/shared/string.c?rev=808482&r1=808481&r2=808482&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/shared/string.c (original)
+++ commons/sandbox/runtime/trunk/src/main/native/shared/string.c Thu Aug 27 16:10:36 2009
@@ -928,7 +928,39 @@
     return (str[x] != L'\0');
 }
 
-ACR_DECLARE(char *) ACR_strtok_c(char *str, int sep, char **last)
+static char *strchr_q(const char *s1, int ch)
+{
+    int s0;
+
+    /* Some early sanity check */
+    if (!s1 || !*s1)
+        return NULL;
+    while ((s0 = *s1++) != 0) {
+        if (s0 == ch)
+            return (char *)(s1 - 1);
+        if (s0 == '\\' && *s1)
+            s1++; /* We have something escaped. Advance */
+    }
+    return NULL;
+}
+
+static wchar_t *wcschr_q(const wchar_t *s1, int ch)
+{
+    int s0;
+
+    /* Some early sanity check */
+    if (!s1 || !*s1)
+        return NULL;
+    while ((s0 = *s1++) != 0) {
+        if (s0 == ch)
+            return (wchar_t *)(s1 - 1);
+        if (s0 == L'\\' && *s1)
+            s1++; /* We have something escaped. Advance */
+    }
+    return NULL;
+}
+
+ACR_DECLARE(char *) ACR_strctok(char *str, int sep, char **last)
 {
     char *tok;
 
@@ -950,7 +982,95 @@
     }
 }
 
-ACR_DECLARE(int) ACR_StrTokensA(const char *str, int sep)
+ACR_DECLARE(char *) ACR_strqctok(char *str, int sep, char **last)
+{
+    char *tok;
+
+    if (!str)                       /* subsequent call */
+        str = *last;                /* start where we left off */
+    if (!str)                       /* no more tokens */
+        return NULL;
+    while (*str == sep)             /* skip leading delimiters */
+         str++;
+    if (*str == '"') {
+        int ch;
+        /* Advance to the first unescaped quote */
+        tok = str + 1;
+        while ((ch = *tok++) != 0) {
+            if (ch == '"') {
+                if (*tok) {
+                    *tok++ = '\0';
+                    *last  = tok;
+                }
+                else
+                    *last  = NULL;
+                return str;
+            }
+            if (ch == '\\' && *tok)
+                tok++;
+        }
+        /* Unterminated quote */
+        *last = NULL;
+        /* Check for last empty token */
+        return *str ? str : NULL;
+    }
+    if ((tok = strchr_q(str, sep))) {
+        *tok++ = '\0';
+        *last  = tok;
+        return str;
+    }
+    else {
+        *last = NULL;
+        /* Check for last empty token */
+        return *str ? str : NULL;
+    }
+}
+
+ACR_DECLARE(char *) ACR_strqtok(char *str, char **last)
+{
+    char *tok;
+
+    if (!str)                           /* subsequent call */
+        str = *last;                    /* start where we left off */
+    if (!str)                           /* no more tokens */
+        return NULL;
+    while (*str && acr_isspace(*str))   /* skip leading delimiters */
+         str++;
+    if (*str == '"') {
+        int ch;
+        /* Advance to the first unescaped quote */
+        tok = str + 1;
+        while ((ch = *tok++) != 0) {
+            if (ch == '"') {
+                if (*tok) {
+                    *tok++ = '\0';
+                    *last  = tok;
+                }
+                else
+                    *last  = NULL;
+                return str;
+            }
+            if (ch == '\\' && *tok)
+                tok++;
+        }
+        /* Unterminated quote */
+        *last = NULL;
+        /* Check for last empty token */
+        return *str ? str : NULL;
+    }
+    if ((tok = strpbrk(str, " \t"))) {
+        *tok++ = '\0';
+        *last  = tok;
+        return str;
+    }
+    else {
+        *last = NULL;
+        /* Check for last empty token */
+        return *str ? str : NULL;
+    }
+}
+
+ACR_DECLARE(int) ACR_strnctok(const char *str, int sep)
 {
     int cnt = 1;
 
@@ -969,13 +1089,27 @@
     return cnt;
 }
 
-ACR_DECLARE(int) ACR_StrTokensW(const wchar_t *str, int sep)
+ACR_DECLARE(int) ACR_strnqctok(const char *str, int sep)
 {
     int cnt = 1;
 
-    while (*str && *str == (wchar_t)sep) /* skip leading delimiters */
+    while (*str == sep)          /* skip leading delimiters */
         str++;
     while (*str) {
+        if (*str == '"') {
+            int ch;
+            str++;
+            /* Advance to the first unescaped quote */
+            while ((ch = *str++) != 0) {
+                if (ch == '"') {
+                    cnt++;
+                    break;
+                }
+                if (ch == '\\' && *str)
+                    str++;
+            }
+            continue;
+        }
         if (*str == sep) {
             while (*str == sep)
                 str++;
@@ -988,7 +1122,59 @@
     return cnt;
 }
 
-ACR_DECLARE(wchar_t *) ACR_wcstok_c(wchar_t *str, int sep, wchar_t **last)
+ACR_DECLARE(int) ACR_strnqtok(const char *str)
+{
+    int cnt = 1;
+
+    while (*str && acr_isspace(*str))          /* skip leading delimiters */
+        str++;
+    while (*str) {
+        if (*str == '"') {
+            int ch;
+            str++;
+            /* Advance to the first unescaped quote */
+            while ((ch = *str++) != 0) {
+                if (ch == '"') {
+                    cnt++;
+                    break;
+                }
+                if (ch == '\\' && *str)
+                    str++;
+            }
+            continue;
+        }
+        if (acr_isspace(*str)) {
+            while (acr_isspace(*str))
+                str++;
+            if (*str)
+                cnt++;
+        }
+        else
+            str++;
+    }
+    return cnt;
+}
+
+ACR_DECLARE(int) ACR_wcsnctok(const wchar_t *str, int sep)
+{
+    int cnt = 1;
+
+    while (*str && *str == (wchar_t)sep)    /* skip leading delimiters */
+        str++;
+    while (*str) {
+        if (*str == (wchar_t)sep) {
+            while (*str == (wchar_t)sep)
+                str++;
+            if (*str)
+                cnt++;
+        }
+        else
+            str++;
+    }
+    return cnt;
+}
+
+ACR_DECLARE(wchar_t *) ACR_wcsctok(wchar_t *str, int sep, wchar_t **last)
 {
     wchar_t *tok;
 
@@ -1010,6 +1196,127 @@
     }
 }
 
+ACR_DECLARE(int) ACR_wcsnqtok(const wchar_t *str)
+{
+    int cnt = 1;
+
+    while (*str && iswspace(*str))          /* skip leading delimiters */
+        str++;
+    while (*str) {
+        if (*str == L'"') {
+            wchar_t ch;
+            str++;
+            /* Advance to the first unescaped quote */
+            while ((ch = *str++) != 0) {
+                if (ch == L'"') {
+                    cnt++;
+                    break;
+                }
+                if (ch == L'\\' && *str)
+                    str++;
+            }
+            continue;
+        }
+        if (iswspace(*str)) {
+            while (iswspace(*str))
+                str++;
+            if (*str)
+                cnt++;
+        }
+        else
+            str++;
+    }
+    return cnt;
+}
+
+ACR_DECLARE(wchar_t *) ACR_wcsqctok(wchar_t *str, int sep, wchar_t **last)
+{
+    wchar_t *tok;
+
+    if (!str)                       /* subsequent call */
+        str = *last;                /* start where we left off */
+    if (!str)                       /* no more tokens */
+        return NULL;
+    while (*str == sep)             /* skip leading delimiters */
+         str++;
+    if (*str == L'"') {
+        int ch;
+        /* Advance to the first unescaped quote */
+        tok = str + 1;
+        while ((ch = *tok++) != 0) {
+            if (ch == L'"') {
+                if (*tok) {
+                    *tok++ = L'\0';
+                    *last  = tok;
+                }
+                else
+                    *last  = NULL;
+                return str;
+            }
+            if (ch == L'\\' && *tok)
+                tok++;
+        }
+        /* Unterminated quote */
+        *last = NULL;
+        /* Check for last empty token */
+        return *str ? str : NULL;
+    }
+    if ((tok = wcschr_q(str, sep))) {
+        *tok++ = L'\0';
+        *last  = tok;
+        return str;
+    }
+    else {
+        *last = NULL;
+        /* Check for last empty token */
+        return *str ? str : NULL;
+    }
+}
+
+ACR_DECLARE(wchar_t *) ACR_wcsqtok(wchar_t *str, wchar_t **last)
+{
+    wchar_t *tok;
+
+    if (!str)                           /* subsequent call */
+        str = *last;                    /* start where we left off */
+    if (!str)                           /* no more tokens */
+        return NULL;
+    while (*str && acr_isspace(*str))   /* skip leading delimiters */
+         str++;
+    if (*str == L'"') {
+        int ch;
+        /* Advance to the first unescaped quote */
+        tok = str + 1;
+        while ((ch = *tok++) != 0) {
+            if (ch == L'"') {
+                if (*tok) {
+                    *tok++ = L'\0';
+                    *last  = tok;
+                }
+                else
+                    *last  = NULL;
+                return str;
+            }
+            if (ch == L'\\' && *tok)
+                tok++;
+        }
+        /* Unterminated quote */
+        *last = NULL;
+        /* Check for last empty token */
+        return *str ? str : NULL;
+    }
+    if ((tok = wcspbrk(str, L" \t"))) {
+        *tok++ = L'\0';
+        *last  = tok;
+        return str;
+    }
+    else {
+        *last = NULL;
+        /* Check for last empty token */
+        return *str ? str : NULL;
+    }
+}
+
 ACR_DECLARE(size_t) ACR_MszStrCountA(const char *s)
 {
     size_t n = 0;
@@ -1092,3 +1399,52 @@
     return arr;
 }
 
+ACR_DECLARE(char *) ACR_strqtab2ss(char *str)
+{
+    char *ptr = str;
+
+    while (*str) {
+        if (*str == '"') {
+            int ch;
+            str++;
+            /* Advance to the first unescaped quote */
+            while ((ch = *str++) != 0) {
+                if (ch == '"') {
+                    break;
+                }
+                if (ch == '\\' && *str)
+                    str++;
+            }
+            continue;
+        }
+        if (*str == '\t')
+            *str =  ' ';
+        str++;
+    }
+    return ptr;
+}
+
+ACR_DECLARE(wchar_t *) ACR_wcsqtab2ss(wchar_t *str)
+{
+    wchar_t *ptr = str;
+
+    while (*str) {
+        if (*str == L'"') {
+            wchar_t ch;
+            str++;
+            /* Advance to the first unescaped quote */
+            while ((ch = *str++) != 0) {
+                if (ch == L'"') {
+                    break;
+                }
+                if (ch == L'\\' && *str)
+                    str++;
+            }
+            continue;
+        }
+        if (*str == L'\t')
+            *str =  L' ';
+        str++;
+    }
+    return ptr;
+}

Modified: commons/sandbox/runtime/trunk/src/main/native/test/testcase.c
URL: http://svn.apache.org/viewvc/commons/sandbox/runtime/trunk/src/main/native/test/testcase.c?rev=808482&r1=808481&r2=808482&view=diff
==============================================================================
--- commons/sandbox/runtime/trunk/src/main/native/test/testcase.c (original)
+++ commons/sandbox/runtime/trunk/src/main/native/test/testcase.c Thu Aug 27 16:10:36 2009
@@ -474,11 +474,11 @@
     char *state;
 
     sprintf(buf, "    1 22   3333  4");
-    e = ACR_StrTokensA(buf, ' ');
-    token = ACR_strtok_c(buf, ' ', &state);
+    e = ACR_strnctok(buf, ' ');
+    token = ACR_strctok(buf, ' ', &state);
     if (token) {
         n++;
-        while ((token = ACR_strtok_c(NULL, ' ', &state))) {
+        while ((token = ACR_strctok(NULL, ' ', &state))) {
             n++;
             if (n > 20)
                 break;