You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pr...@apache.org on 2021/05/25 04:06:48 UTC
[arrow] branch master updated: ARROW-11565: [C++][Gandiva] Modify upper()/lower() to work with UTF8 and add INIT_CAP function

This is an automated email from the ASF dual-hosted git repository.

praveenbingo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0dc9bc2  ARROW-11565: [C++][Gandiva] Modify upper()/lower() to work with UTF8 and add INIT_CAP function
0dc9bc2 is described below

commit 0dc9bc21c968e38ee2fa47b41213d33c2d2a7d1f
Author: Anthony Louis <an...@simbioseventures.com>
AuthorDate: Tue May 25 09:35:00 2021 +0530

    ARROW-11565: [C++][Gandiva] Modify upper()/lower() to work with UTF8 and add INIT_CAP function
    
    It finishes the implementation that started in the https://github.com/apache/arrow/pull/9450 pull request
    
    Closes #10040 from anthonylouisbsb/feature/fix-upper-lower-for-utf8 and squashes the following commits:
    
    52c11f8d6 <Anthony Louis> Add missing ;
    2aaa7d891 <Anthony Louis> Remove utf8proc libs from wheel scripts
    253426c04 <Anthony Louis> Add changes for initcap function
    5fa60c1eb <Anthony Louis> Change the name of the UTF8PROC dir to include
    3c2b25ee1 <Anthony Louis> Remove unnecessary lib references
    a8f5e1fc9 <Anthony Louis> Remove gandiva aliases
    5f5ec7f53 <Anthony Louis> Add missing function in global engine mapping
    1f06fa758 <Anthony Louis> Fix cmake formatting
    39dd7712e <Anthony Louis> Fix way library is imported
    2d6157bc6 <Anthony Louis> Apply formatter changes
    5cb1f8092 <Anthony Louis> Move function to a stub
    1d3b7c48b <Sagnik Chakraborty> ARROW-11565:  Modify upper()/lower() logic to make them work for utf8 strings
    
    Lead-authored-by: Anthony Louis <an...@simbioseventures.com>
    Co-authored-by: Sagnik Chakraborty <sa...@dremio.com>
    Signed-off-by: Praveen <pr...@dremio.com>
---
 cpp/cmake_modules/DefineOptions.cmake              |   2 +-
 cpp/cmake_modules/ThirdpartyToolchain.cmake        |   4 +-
 cpp/src/gandiva/CMakeLists.txt                     |   4 +-
 cpp/src/gandiva/function_registry_string.cc        |   8 +-
 cpp/src/gandiva/gdv_function_stubs.cc              | 318 +++++++++++++++++++++
 cpp/src/gandiva/gdv_function_stubs.h               |  21 ++
 cpp/src/gandiva/gdv_function_stubs_test.cc         | 194 +++++++++++++
 cpp/src/gandiva/precompiled/CMakeLists.txt         |   1 +
 cpp/src/gandiva/precompiled/string_ops.cc          |  61 +---
 cpp/src/gandiva/precompiled/string_ops_test.cc     |  22 --
 cpp/src/gandiva/precompiled/types.h                |   3 -
 .../arrow/gandiva/evaluator/ProjectorTest.java     |  72 ++++-
 12 files changed, 618 insertions(+), 92 deletions(-)

diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake
index 0e92811..033076e 100644
--- a/cpp/cmake_modules/DefineOptions.cmake
+++ b/cpp/cmake_modules/DefineOptions.cmake
@@ -365,7 +365,7 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
 
   define_option(
     ARROW_WITH_UTF8PROC
-    "Build with support for Unicode properties using the utf8proc library;(only used if ARROW_COMPUTE is ON)"
+    "Build with support for Unicode properties using the utf8proc library;(only used if ARROW_COMPUTE is ON or ARROW_GANDIVA is ON)"
     ON)
   define_option(
     ARROW_WITH_RE2
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 01e818b..18941df 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -287,10 +287,10 @@ if(ARROW_S3)
   set(ARROW_WITH_ZLIB ON)
 endif()
 
-if(NOT ARROW_COMPUTE)
-  # utf8proc is only potentially used in kernels for now
+if((NOT ARROW_COMPUTE) AND (NOT ARROW_GANDIVA))
   set(ARROW_WITH_UTF8PROC OFF)
 endif()
+
 if((NOT ARROW_COMPUTE) AND (NOT ARROW_GANDIVA) AND (NOT ARROW_WITH_GRPC))
   set(ARROW_WITH_RE2 OFF)
 endif()
diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt
index fcdaf97..44b6fab 100644
--- a/cpp/src/gandiva/CMakeLists.txt
+++ b/cpp/src/gandiva/CMakeLists.txt
@@ -138,6 +138,7 @@ add_arrow_lib(gandiva
               EXTRA_INCLUDES
               $<TARGET_PROPERTY:LLVM::LLVM_INTERFACE,INTERFACE_INCLUDE_DIRECTORIES>
               ${GANDIVA_OPENSSL_INCLUDE_DIR}
+              ${UTF8PROC_INCLUDE_DIR}
               SHARED_LINK_FLAGS
               ${GANDIVA_SHARED_LINK_FLAGS}
               SHARED_LINK_LIBS
@@ -239,7 +240,8 @@ add_gandiva_test(internals-test
                  EXTRA_INCLUDES
                  $<TARGET_PROPERTY:LLVM::LLVM_INTERFACE,INTERFACE_INCLUDE_DIRECTORIES>
                  ${GANDIVA_INTERNALS_TEST_ARGUMENTS}
-                 ${GANDIVA_OPENSSL_INCLUDE_DIR})
+                 ${GANDIVA_OPENSSL_INCLUDE_DIR}
+                 ${UTF8PROC_INCLUDE_DIR})
 
 if(ARROW_GANDIVA_JAVA)
   add_subdirectory(jni)
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index e50069e..cbc7006 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -63,10 +63,14 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
       UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull, {}),
 
       NativeFunction("upper", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
-                     "upper_utf8", NativeFunction::kNeedsContext),
+                     "gdv_fn_upper_utf8", NativeFunction::kNeedsContext),
 
       NativeFunction("lower", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
-                     "lower_utf8", NativeFunction::kNeedsContext),
+                     "gdv_fn_lower_utf8", NativeFunction::kNeedsContext),
+
+      NativeFunction("initcap", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
+                     "gdv_fn_initcap_utf8",
+                     NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
 
       NativeFunction("castINT", {}, DataTypeVector{utf8()}, int32(), kResultNullIfNull,
                      "gdv_fn_castINT_utf8",
diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc
index acf3f56..a890775 100644
--- a/cpp/src/gandiva/gdv_function_stubs.cc
+++ b/cpp/src/gandiva/gdv_function_stubs.cc
@@ -17,10 +17,13 @@
 
 #include "gandiva/gdv_function_stubs.h"
 
+#include <utf8proc.h>
+
 #include <string>
 #include <vector>
 
 #include "arrow/util/formatting.h"
+#include "arrow/util/utf8.h"
 #include "arrow/util/value_parsing.h"
 #include "gandiva/engine.h"
 #include "gandiva/exported_funcs.h"
@@ -402,6 +405,286 @@ GDV_FN_CAST_VARCHAR_REAL(float64, DoubleType)
 
 #undef GDV_FN_CAST_VARCHAR_INTEGER
 #undef GDV_FN_CAST_VARCHAR_REAL
+
+GANDIVA_EXPORT
+int32_t gdv_fn_utf8_char_length(char c) {
+  if ((signed char)c >= 0) {  // 1-byte char (0x00 ~ 0x7F)
+    return 1;
+  } else if ((c & 0xE0) == 0xC0) {  // 2-byte char
+    return 2;
+  } else if ((c & 0xF0) == 0xE0) {  // 3-byte char
+    return 3;
+  } else if ((c & 0xF8) == 0xF0) {  // 4-byte char
+    return 4;
+  }
+  // invalid char
+  return 0;
+}
+
+GANDIVA_EXPORT
+void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val) {
+  char const* fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string";
+  int size = static_cast<int>(strlen(fmt)) + 64;
+  char* error = reinterpret_cast<char*>(malloc(size));
+  snprintf(error, size, fmt, (unsigned char)val);
+  gdv_fn_context_set_error_msg(execution_context, error);
+  free(error);
+}
+
+// Convert an utf8 string to its corresponding uppercase string
+GANDIVA_EXPORT
+const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_len,
+                              int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // If it is a single-byte character (ASCII), corresponding uppercase is always 1-byte
+  // long; if it is >= 2 bytes long, uppercase can be at most 4 bytes long, so length of
+  // the output can be at most twice the length of the input
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len, out_char_len, out_idx = 0;
+  uint32_t char_codepoint;
+
+  for (int32_t i = 0; i < data_len; i += char_len) {
+    char_len = gdv_fn_utf8_char_length(data[i]);
+    // For single byte characters:
+    // If it is a lowercase ASCII character, set the output to its corresponding uppercase
+    // character; else, set the output to the read character
+    if (char_len == 1) {
+      char cur = data[i];
+      // 'A' - 'Z' : 0x41 - 0x5a
+      // 'a' - 'z' : 0x61 - 0x7a
+      if (cur >= 0x61 && cur <= 0x7a) {
+        out[out_idx++] = static_cast<char>(cur - 0x20);
+      } else {
+        out[out_idx++] = cur;
+      }
+      continue;
+    }
+
+    // Control reaches here when we encounter a multibyte character
+    const auto* in_char = (const uint8_t*)(data + i);
+
+    // Decode the multibyte character
+    bool is_valid_utf8_char =
+        arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);
+
+    // If it is an invalid utf8 character, UTF8Decode evaluates to false
+    if (!is_valid_utf8_char) {
+      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+      *out_len = 0;
+      return "";
+    }
+
+    // Convert the encoded codepoint to its uppercase codepoint
+    int32_t upper_codepoint = utf8proc_toupper(char_codepoint);
+
+    // UTF8Encode advances the pointer by the number of bytes present in the uppercase
+    // character
+    auto* out_char = (uint8_t*)(out + out_idx);
+    uint8_t* out_char_start = out_char;
+
+    // Encode the uppercase character
+    out_char = arrow::util::UTF8Encode(out_char, upper_codepoint);
+
+    out_char_len = static_cast<int32_t>(out_char - out_char_start);
+    out_idx += out_char_len;
+  }
+
+  *out_len = out_idx;
+  return out;
+}
+
+// Convert an utf8 string to its corresponding lowercase string
+GANDIVA_EXPORT
+const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_len,
+                              int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // If it is a single-byte character (ASCII), corresponding lowercase is always 1-byte
+  // long; if it is >= 2 bytes long, lowercase can be at most 4 bytes long, so length of
+  // the output can be at most twice the length of the input
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len, out_char_len, out_idx = 0;
+  uint32_t char_codepoint;
+
+  for (int32_t i = 0; i < data_len; i += char_len) {
+    char_len = gdv_fn_utf8_char_length(data[i]);
+    // For single byte characters:
+    // If it is an uppercase ASCII character, set the output to its corresponding
+    // lowercase character; else, set the output to the read character
+    if (char_len == 1) {
+      char cur = data[i];
+      // 'A' - 'Z' : 0x41 - 0x5a
+      // 'a' - 'z' : 0x61 - 0x7a
+      if (cur >= 0x41 && cur <= 0x5a) {
+        out[out_idx++] = static_cast<char>(cur + 0x20);
+      } else {
+        out[out_idx++] = cur;
+      }
+      continue;
+    }
+
+    // Control reaches here when we encounter a multibyte character
+    const auto* in_char = (const uint8_t*)(data + i);
+
+    // Decode the multibyte character
+    bool is_valid_utf8_char =
+        arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);
+
+    // If it is an invalid utf8 character, UTF8Decode evaluates to false
+    if (!is_valid_utf8_char) {
+      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+      *out_len = 0;
+      return "";
+    }
+
+    // Convert the encoded codepoint to its lowercase codepoint
+    int32_t lower_codepoint = utf8proc_tolower(char_codepoint);
+
+    // UTF8Encode advances the pointer by the number of bytes present in the lowercase
+    // character
+    auto* out_char = (uint8_t*)(out + out_idx);
+    uint8_t* out_char_start = out_char;
+
+    // Encode the lowercase character
+    out_char = arrow::util::UTF8Encode(out_char, lower_codepoint);
+
+    out_char_len = static_cast<int32_t>(out_char - out_char_start);
+    out_idx += out_char_len;
+  }
+
+  *out_len = out_idx;
+  return out;
+}
+
+// Checks if the character is a whitespace by its code point. To check the list
+// of the existent whitespaces characters in UTF8, take a look at this link
+// https://en.wikipedia.org/wiki/Whitespace_character#Unicode
+//
+// The Unicode characters also are divided between categories. This link
+// https://en.wikipedia.org/wiki/Unicode_character_property#General_Category shows
+// more information about characters categories.
+GANDIVA_EXPORT
+bool gdv_fn_is_codepoint_for_space(uint32_t val) {
+  auto category = utf8proc_category(val);
+
+  return category == utf8proc_category_t::UTF8PROC_CATEGORY_ZS ||
+         category == utf8proc_category_t::UTF8PROC_CATEGORY_ZL ||
+         category == utf8proc_category_t::UTF8PROC_CATEGORY_ZP;
+}
+
+// For a given text, initialize the first letter of each word, e.g:
+//     - "it is a text str" -> "It Is A Text Str"
+GANDIVA_EXPORT
+const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
+                                int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = data_len;
+    return "";
+  }
+
+  // If it is a single-byte character (ASCII), corresponding uppercase is always 1-byte
+  // long; if it is >= 2 bytes long, uppercase can be at most 4 bytes long, so length of
+  // the output can be at most twice the length of the input
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len = 0;
+  int32_t out_char_len = 0;
+  int32_t out_idx = 0;
+  uint32_t char_codepoint;
+  bool last_char_was_space = true;
+
+  for (int32_t i = 0; i < data_len; i += char_len) {
+    char_len = gdv_fn_utf8_char_length(data[i]);
+    // For single byte characters:
+    // If it is a lowercase ASCII character, set the output to its corresponding uppercase
+    // character; else, set the output to the read character
+    if (char_len == 1) {
+      char cur = data[i];
+
+      if (cur >= 0x61 && cur <= 0x7a && last_char_was_space) {
+        // 'A' - 'Z' : 0x41 - 0x5a
+        // 'a' - 'z' : 0x61 - 0x7a
+        out[out_idx++] = static_cast<char>(cur - 0x20);
+        last_char_was_space = false;
+      } else {
+        // Check if the ASCII character is one of these:
+        // - space : 0x20
+        // - character tabulation : 0x9
+        // - line feed : 0xA
+        // - line tabulation : 0xB
+        // - form feed : 0xC
+        // - carriage return : 0xD
+        last_char_was_space = cur <= 0x20;
+        out[out_idx++] = cur;
+      }
+      continue;
+    }
+
+    // Control reaches here when we encounter a multibyte character
+    const auto* in_char = (const uint8_t*)(data + i);
+
+    // Decode the multibyte character
+    bool is_valid_utf8_char =
+        arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);
+
+    // If it is an invalid utf8 character, UTF8Decode evaluates to false
+    if (!is_valid_utf8_char) {
+      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+      *out_len = 0;
+      return "";
+    }
+
+    bool is_char_space = gdv_fn_is_codepoint_for_space(char_codepoint);
+
+    int32_t formatted_codepoint;
+    if (last_char_was_space && !is_char_space) {
+      // Convert the encoded codepoint to its uppercase codepoint
+      formatted_codepoint = utf8proc_toupper(char_codepoint);
+    } else {
+      // Leave the codepoint as is
+      formatted_codepoint = char_codepoint;
+    }
+
+    // UTF8Encode advances the pointer by the number of bytes present in the character
+    auto* out_char = (uint8_t*)(out + out_idx);
+    uint8_t* out_char_start = out_char;
+
+    // Encode the uppercase character
+    out_char = arrow::util::UTF8Encode(out_char, formatted_codepoint);
+
+    out_char_len = static_cast<int32_t>(out_char - out_char_start);
+    out_idx += out_char_len;
+
+    last_char_was_space = is_char_space;
+  }
+
+  *out_len = out_idx;
+  return out;
+}
 }
 
 namespace gandiva {
@@ -1031,5 +1314,40 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
   engine->AddGlobalMappingForFunc("gdv_fn_sha256_decimal128",
                                   types->i8_ptr_type() /*return_type*/, args,
                                   reinterpret_cast<void*>(gdv_fn_sha256_decimal128));
+
+  // gdv_fn_upper_utf8
+  args = {
+      types->i64_type(),      // context
+      types->i8_ptr_type(),   // data
+      types->i32_type(),      // data_len
+      types->i32_ptr_type(),  // out_len
+  };
+
+  engine->AddGlobalMappingForFunc("gdv_fn_upper_utf8",
+                                  types->i8_ptr_type() /*return_type*/, args,
+                                  reinterpret_cast<void*>(gdv_fn_upper_utf8));
+  // gdv_fn_lower_utf8
+  args = {
+      types->i64_type(),      // context
+      types->i8_ptr_type(),   // data
+      types->i32_type(),      // data_len
+      types->i32_ptr_type(),  // out_len
+  };
+
+  engine->AddGlobalMappingForFunc("gdv_fn_lower_utf8",
+                                  types->i8_ptr_type() /*return_type*/, args,
+                                  reinterpret_cast<void*>(gdv_fn_lower_utf8));
+
+  // gdv_fn_initcap_utf8
+  args = {
+      types->i64_type(),     // context
+      types->i8_ptr_type(),  // const char*
+      types->i32_type(),     // value_length
+      types->i32_ptr_type()  // out_length
+  };
+
+  engine->AddGlobalMappingForFunc("gdv_fn_initcap_utf8",
+                                  types->i8_ptr_type() /*return_type*/, args,
+                                  reinterpret_cast<void*>(gdv_fn_initcap_utf8));
 }
 }  // namespace gandiva
diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h
index 0a6cd70..847772b 100644
--- a/cpp/src/gandiva/gdv_function_stubs.h
+++ b/cpp/src/gandiva/gdv_function_stubs.h
@@ -108,4 +108,25 @@ const char* gdv_fn_castVARCHAR_float32_int64(int64_t context, float value, int64
 GANDIVA_EXPORT
 const char* gdv_fn_castVARCHAR_float64_int64(int64_t context, double value, int64_t len,
                                              int32_t* out_len);
+
+GANDIVA_EXPORT
+int32_t gdv_fn_utf8_char_length(char c);
+
+GANDIVA_EXPORT
+void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val);
+
+GANDIVA_EXPORT
+const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_len,
+                              int32_t* out_len);
+
+GANDIVA_EXPORT
+const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_len,
+                              int32_t* out_len);
+
+GANDIVA_EXPORT
+bool gdv_fn_is_codepoint_for_space(uint32_t val);
+
+GANDIVA_EXPORT
+const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
+                                int32_t* out_len);
 }
diff --git a/cpp/src/gandiva/gdv_function_stubs_test.cc b/cpp/src/gandiva/gdv_function_stubs_test.cc
index 8f44ce2..6cfff5b 100644
--- a/cpp/src/gandiva/gdv_function_stubs_test.cc
+++ b/cpp/src/gandiva/gdv_function_stubs_test.cc
@@ -290,4 +290,198 @@ TEST(TestGdvFnStubs, TestCastVARCHARFromDouble) {
   EXPECT_FALSE(ctx.has_error());
 }
 
+TEST(TestGdvFnStubs, TestUpper) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+
+  const char* out_str = gdv_fn_upper_utf8(ctx_ptr, "AbcDEfGh", 8, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ABCDEFGH");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_upper_utf8(ctx_ptr, "asdfj", 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ASDFJ");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_upper_utf8(ctx_ptr, "s;dcGS,jO!l", 11, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "S;DCGS,JO!L");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_upper_utf8(ctx_ptr, "münchen", 8, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "MÜNCHEN");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_upper_utf8(ctx_ptr, "CITROËN", 8, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "CITROËN");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_upper_utf8(ctx_ptr, "âBćDëFGH", 11, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ÂBĆDËFGH");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_upper_utf8(ctx_ptr, "øhpqRšvñ", 11, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ØHPQRŠVÑ");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_upper_utf8(ctx_ptr, "Möbelträgerfüße", 19, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "MÖBELTRÄGERFÜẞE");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_upper_utf8(ctx_ptr, "{õhp,PQŚv}ń+", 15, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "{ÕHP,PQŚV}Ń+");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_upper_utf8(ctx_ptr, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  std::string d("AbOJjÜoß\xc3");
+  out_str = gdv_fn_upper_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_THAT(ctx.get_error(),
+              ::testing::HasSubstr(
+                  "unexpected byte \\c3 encountered while decoding utf8 string"));
+  ctx.Reset();
+
+  std::string e(
+      "åbÑg\xe0\xa0"
+      "åBUå");
+  out_str = gdv_fn_upper_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_THAT(ctx.get_error(),
+              ::testing::HasSubstr(
+                  "unexpected byte \\e0 encountered while decoding utf8 string"));
+  ctx.Reset();
+}
+
+TEST(TestGdvFnStubs, TestLower) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+
+  const char* out_str = gdv_fn_lower_utf8(ctx_ptr, "AbcDEfGh", 8, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "abcdefgh");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_lower_utf8(ctx_ptr, "asdfj", 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "asdfj");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_lower_utf8(ctx_ptr, "S;DCgs,Jo!L", 11, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "s;dcgs,jo!l");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_lower_utf8(ctx_ptr, "MÜNCHEN", 8, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "münchen");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_lower_utf8(ctx_ptr, "citroën", 8, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "citroën");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_lower_utf8(ctx_ptr, "ÂbĆDËFgh", 11, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "âbćdëfgh");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_lower_utf8(ctx_ptr, "ØHPQrŠvÑ", 11, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "øhpqršvñ");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_lower_utf8(ctx_ptr, "MÖBELTRÄGERFÜẞE", 20, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "möbelträgerfüße");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_lower_utf8(ctx_ptr, "{ÕHP,pqśv}Ń+", 15, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "{õhp,pqśv}ń+");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_lower_utf8(ctx_ptr, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  std::string d("AbOJjÜoß\xc3");
+  out_str = gdv_fn_lower_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_THAT(ctx.get_error(),
+              ::testing::HasSubstr(
+                  "unexpected byte \\c3 encountered while decoding utf8 string"));
+  ctx.Reset();
+
+  std::string e(
+      "åbÑg\xe0\xa0"
+      "åBUå");
+  out_str = gdv_fn_lower_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_THAT(ctx.get_error(),
+              ::testing::HasSubstr(
+                  "unexpected byte \\e0 encountered while decoding utf8 string"));
+  ctx.Reset();
+}
+
+TEST(TestGdvFnStubs, TestInitCap) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+
+  const char* out_str = gdv_fn_initcap_utf8(ctx_ptr, "test string", 11, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test String");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "asdfj\nhlqf", 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Asdfj\nHlqf");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "s;DCgs,Jo!L", 11, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "S;DCgs,Jo!L");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, " mÜNCHEN", 9, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), " MÜNCHEN");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "citroën CaR", 12, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Citroën CaR");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "ÂbĆDËFgh\néll", 16, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ÂbĆDËFgh\nÉll");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "  øhpqršvñ  \n\n", 17, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "  Øhpqršvñ  \n\n");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str =
+      gdv_fn_initcap_utf8(ctx_ptr, "möbelträgerfüße   \nmöbelträgerfüße", 42, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Möbelträgerfüße   \nMöbelträgerfüße");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "{ÕHP,pqśv}Ń+", 15, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "{ÕHP,pqśv}Ń+");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  std::string d("AbOJjÜoß\xc3");
+  out_str =
+      gdv_fn_initcap_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_THAT(ctx.get_error(),
+              ::testing::HasSubstr(
+                  "unexpected byte \\c3 encountered while decoding utf8 string"));
+  ctx.Reset();
+
+  std::string e(
+      "åbÑg\xe0\xa0"
+      "åBUå");
+  out_str =
+      gdv_fn_initcap_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_THAT(ctx.get_error(),
+              ::testing::HasSubstr(
+                  "unexpected byte \\e0 encountered while decoding utf8 string"));
+  ctx.Reset();
+}
 }  // namespace gandiva
diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt
index 7343bc0..176b047 100644
--- a/cpp/src/gandiva/precompiled/CMakeLists.txt
+++ b/cpp/src/gandiva/precompiled/CMakeLists.txt
@@ -77,6 +77,7 @@ foreach(SRC_FILE ${PRECOMPILED_SRCS})
            ${ARROW_GANDIVA_PC_CXX_FLAGS}
            -I${CMAKE_SOURCE_DIR}/src
            -I${ARROW_BINARY_DIR}/src)
+
   if(NOT ARROW_USE_NATIVE_INT128)
     list(APPEND PRECOMPILE_COMMAND -I${Boost_INCLUDE_DIR})
   endif()
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index b35062d..ac50633 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -17,6 +17,7 @@
 
 // String functions
 #include "arrow/util/value_parsing.h"
+
 extern "C" {
 
 #include <algorithm>
@@ -221,66 +222,6 @@ UTF8_LENGTH(char_length, utf8)
 UTF8_LENGTH(length, utf8)
 UTF8_LENGTH(lengthUtf8, binary)
 
-// Convert a utf8 sequence to upper case.
-// TODO : This handles only ascii characters.
-FORCE_INLINE
-const char* upper_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
-                       int32_t* out_len) {
-  if (data_len == 0) {
-    *out_len = 0;
-    return "";
-  }
-
-  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, data_len));
-  if (ret == nullptr) {
-    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
-    *out_len = 0;
-    return "";
-  }
-  for (gdv_int32 i = 0; i < data_len; ++i) {
-    char cur = data[i];
-
-    // 'A- - 'Z' : 0x41 - 0x5a
-    // 'a' - 'z' : 0x61 - 0x7a
-    if (cur >= 0x61 && cur <= 0x7a) {
-      cur = static_cast<char>(cur - 0x20);
-    }
-    ret[i] = cur;
-  }
-  *out_len = data_len;
-  return ret;
-}
-
-// Convert a utf8 sequence to lower case.
-// TODO : This handles only ascii characters.
-FORCE_INLINE
-const char* lower_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
-                       int32_t* out_len) {
-  if (data_len == 0) {
-    *out_len = 0;
-    return "";
-  }
-
-  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, data_len));
-  if (ret == nullptr) {
-    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
-    *out_len = 0;
-    return "";
-  }
-  for (gdv_int32 i = 0; i < data_len; ++i) {
-    char cur = data[i];
-
-    // 'A' - 'Z' : 0x41 - 0x5a
-    // 'a' - 'z' : 0x61 - 0x7a
-    if (cur >= 0x41 && cur <= 0x5a) {
-      cur = static_cast<char>(cur + 0x20);
-    }
-    ret[i] = cur;
-  }
-  *out_len = data_len;
-  return ret;
-}
-
 // Reverse a utf8 sequence
 FORCE_INLINE
 const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index fd36665..ae3c0f2 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -695,28 +695,6 @@ TEST(TestStringOps, TestConcat) {
   EXPECT_FALSE(ctx.has_error());
 }
 
-TEST(TestStringOps, TestLower) {
-  gandiva::ExecutionContext ctx;
-  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
-  gdv_int32 out_len = 0;
-
-  const char* out_str = lower_utf8(ctx_ptr, "AsDfJ", 5, &out_len);
-  EXPECT_EQ(std::string(out_str, out_len), "asdfj");
-  EXPECT_FALSE(ctx.has_error());
-
-  out_str = lower_utf8(ctx_ptr, "asdfj", 5, &out_len);
-  EXPECT_EQ(std::string(out_str, out_len), "asdfj");
-  EXPECT_FALSE(ctx.has_error());
-
-  out_str = lower_utf8(ctx_ptr, "Ç††AbD", 11, &out_len);
-  EXPECT_EQ(std::string(out_str, out_len), "Ç††abd");
-  EXPECT_FALSE(ctx.has_error());
-
-  out_str = lower_utf8(ctx_ptr, "", 0, &out_len);
-  EXPECT_EQ(std::string(out_str, out_len), "");
-  EXPECT_FALSE(ctx.has_error());
-}
-
 TEST(TestStringOps, TestReverse) {
   gandiva::ExecutionContext ctx;
   uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 170cf92..b8c7aa9 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -378,9 +378,6 @@ const char* castVARCHAR_utf8_int64(gdv_int64 context, const char* data,
                                    gdv_int32 data_len, int64_t out_len,
                                    int32_t* out_length);
 
-const char* lower_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
-                       int32_t* out_length);
-
 const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
                          int32_t* out_len);
 
diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java
index 9844d02..80d4281 100644
--- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java
+++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java
@@ -73,7 +73,13 @@ public class ProjectorTest extends BaseEvaluatorTest {
 
   List<ArrowBuf> varBufs(String[] strings, Charset charset) {
     ArrowBuf offsetsBuffer = allocator.buffer((strings.length + 1) * 4);
-    ArrowBuf dataBuffer = allocator.buffer(strings.length * 8);
+    
+    long dataBufferSize = 0L;
+    for (String string : strings) {
+      dataBufferSize += string.getBytes(charset).length;
+    }
+
+    ArrowBuf dataBuffer = allocator.buffer(dataBufferSize);
 
     int startOffset = 0;
     for (int i = 0; i < strings.length; i++) {
@@ -2284,4 +2290,68 @@ public class ProjectorTest extends BaseEvaluatorTest {
     releaseValueVectors(output);
   }
 
+  @Test
+  public void testInitCap() throws Exception {
+
+    Field x = Field.nullable("x", new ArrowType.Utf8());
+
+    Field retType = Field.nullable("c", new ArrowType.Utf8());
+
+    TreeNode cond =
+            TreeBuilder.makeFunction(
+                    "initcap",
+                    Lists.newArrayList(TreeBuilder.makeField(x)),
+                    new ArrowType.Utf8());
+    ExpressionTree expr = TreeBuilder.makeExpression(cond, retType);
+    Schema schema = new Schema(Lists.newArrayList(x));
+    Projector eval = Projector.make(schema, Lists.newArrayList(expr));
+
+    int numRows = 5;
+    byte[] validity = new byte[]{(byte) 15, 0};
+    String[] valuesX = new String[]{
+        "  øhpqršvñ  \n\n",
+        "möbelträgerfüße   \nmöbelträgerfüße",
+        "ÂbĆDËFgh\néll",
+        "citroën CaR",
+        "kjk"
+    };
+
+    String[] expected = new String[]{
+        "  Øhpqršvñ  \n\n",
+        "Möbelträgerfüße   \nMöbelträgerfüße",
+        "ÂbĆDËFgh\nÉll",
+        "Citroën CaR",
+        null
+    };
+
+    ArrowBuf validityX = buf(validity);
+    List<ArrowBuf> dataBufsX = stringBufs(valuesX);
+
+    ArrowRecordBatch batch =
+            new ArrowRecordBatch(
+                    numRows,
+                    Lists.newArrayList(new ArrowFieldNode(numRows, 0)),
+                    Lists.newArrayList(validityX, dataBufsX.get(0), dataBufsX.get(1)));
+
+    // allocate data for output vector.
+    VarCharVector outVector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator);
+    outVector.allocateNew(numRows * 100, numRows);
+
+    // evaluate expression
+    List<ValueVector> output = new ArrayList<>();
+    output.add(outVector);
+    eval.evaluate(batch, output);
+    eval.close();
+
+    // match expected output.
+    for (int i = 0; i < numRows - 1; i++) {
+      assertFalse("Expect none value equals null", outVector.isNull(i));
+      assertEquals(expected[i], new String(outVector.get(i)));
+    }
+
+    assertTrue("Last value must be null", outVector.isNull(numRows - 1));
+
+    releaseRecordBatch(batch);
+    releaseValueVectors(output);
+  }
 }