You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pr...@apache.org on 2021/05/25 04:06:48 UTC
[arrow] branch master updated: ARROW-11565: [C++][Gandiva] Modify
upper()/lower() to work with UTF8 and add INIT_CAP function
This is an automated email from the ASF dual-hosted git repository.
praveenbingo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 0dc9bc2 ARROW-11565: [C++][Gandiva] Modify upper()/lower() to work with UTF8 and add INIT_CAP function
0dc9bc2 is described below
commit 0dc9bc21c968e38ee2fa47b41213d33c2d2a7d1f
Author: Anthony Louis <an...@simbioseventures.com>
AuthorDate: Tue May 25 09:35:00 2021 +0530
ARROW-11565: [C++][Gandiva] Modify upper()/lower() to work with UTF8 and add INIT_CAP function
This finishes the implementation that was started in pull request https://github.com/apache/arrow/pull/9450
Closes #10040 from anthonylouisbsb/feature/fix-upper-lower-for-utf8 and squashes the following commits:
52c11f8d6 <Anthony Louis> Add missing ;
2aaa7d891 <Anthony Louis> Remove utf8proc libs from wheel scripts
253426c04 <Anthony Louis> Add changes for initcap function
5fa60c1eb <Anthony Louis> Change the name of the UTF8PROC dir to include
3c2b25ee1 <Anthony Louis> Remove unnecessary lib references
a8f5e1fc9 <Anthony Louis> Remove gandiva aliases
5f5ec7f53 <Anthony Louis> Add missing function in global engine mapping
1f06fa758 <Anthony Louis> Fix cmake formatting
39dd7712e <Anthony Louis> Fix way library is imported
2d6157bc6 <Anthony Louis> Apply formatter changes
5cb1f8092 <Anthony Louis> Move function to a stub
1d3b7c48b <Sagnik Chakraborty> ARROW-11565: Modify upper()/lower() logic to make them work for utf8 strings
Lead-authored-by: Anthony Louis <an...@simbioseventures.com>
Co-authored-by: Sagnik Chakraborty <sa...@dremio.com>
Signed-off-by: Praveen <pr...@dremio.com>
---
cpp/cmake_modules/DefineOptions.cmake | 2 +-
cpp/cmake_modules/ThirdpartyToolchain.cmake | 4 +-
cpp/src/gandiva/CMakeLists.txt | 4 +-
cpp/src/gandiva/function_registry_string.cc | 8 +-
cpp/src/gandiva/gdv_function_stubs.cc | 318 +++++++++++++++++++++
cpp/src/gandiva/gdv_function_stubs.h | 21 ++
cpp/src/gandiva/gdv_function_stubs_test.cc | 194 +++++++++++++
cpp/src/gandiva/precompiled/CMakeLists.txt | 1 +
cpp/src/gandiva/precompiled/string_ops.cc | 61 +---
cpp/src/gandiva/precompiled/string_ops_test.cc | 22 --
cpp/src/gandiva/precompiled/types.h | 3 -
.../arrow/gandiva/evaluator/ProjectorTest.java | 72 ++++-
12 files changed, 618 insertions(+), 92 deletions(-)
diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake
index 0e92811..033076e 100644
--- a/cpp/cmake_modules/DefineOptions.cmake
+++ b/cpp/cmake_modules/DefineOptions.cmake
@@ -365,7 +365,7 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
define_option(
ARROW_WITH_UTF8PROC
- "Build with support for Unicode properties using the utf8proc library;(only used if ARROW_COMPUTE is ON)"
+ "Build with support for Unicode properties using the utf8proc library;(only used if ARROW_COMPUTE is ON or ARROW_GANDIVA is ON)"
ON)
define_option(
ARROW_WITH_RE2
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 01e818b..18941df 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -287,10 +287,10 @@ if(ARROW_S3)
set(ARROW_WITH_ZLIB ON)
endif()
-if(NOT ARROW_COMPUTE)
- # utf8proc is only potentially used in kernels for now
+if((NOT ARROW_COMPUTE) AND (NOT ARROW_GANDIVA))
set(ARROW_WITH_UTF8PROC OFF)
endif()
+
if((NOT ARROW_COMPUTE) AND (NOT ARROW_GANDIVA) AND (NOT ARROW_WITH_GRPC))
set(ARROW_WITH_RE2 OFF)
endif()
diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt
index fcdaf97..44b6fab 100644
--- a/cpp/src/gandiva/CMakeLists.txt
+++ b/cpp/src/gandiva/CMakeLists.txt
@@ -138,6 +138,7 @@ add_arrow_lib(gandiva
EXTRA_INCLUDES
$<TARGET_PROPERTY:LLVM::LLVM_INTERFACE,INTERFACE_INCLUDE_DIRECTORIES>
${GANDIVA_OPENSSL_INCLUDE_DIR}
+ ${UTF8PROC_INCLUDE_DIR}
SHARED_LINK_FLAGS
${GANDIVA_SHARED_LINK_FLAGS}
SHARED_LINK_LIBS
@@ -239,7 +240,8 @@ add_gandiva_test(internals-test
EXTRA_INCLUDES
$<TARGET_PROPERTY:LLVM::LLVM_INTERFACE,INTERFACE_INCLUDE_DIRECTORIES>
${GANDIVA_INTERNALS_TEST_ARGUMENTS}
- ${GANDIVA_OPENSSL_INCLUDE_DIR})
+ ${GANDIVA_OPENSSL_INCLUDE_DIR}
+ ${UTF8PROC_INCLUDE_DIR})
if(ARROW_GANDIVA_JAVA)
add_subdirectory(jni)
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index e50069e..cbc7006 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -63,10 +63,14 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull, {}),
NativeFunction("upper", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
- "upper_utf8", NativeFunction::kNeedsContext),
+ "gdv_fn_upper_utf8", NativeFunction::kNeedsContext),
NativeFunction("lower", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
- "lower_utf8", NativeFunction::kNeedsContext),
+ "gdv_fn_lower_utf8", NativeFunction::kNeedsContext),
+
+ NativeFunction("initcap", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
+ "gdv_fn_initcap_utf8",
+ NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
NativeFunction("castINT", {}, DataTypeVector{utf8()}, int32(), kResultNullIfNull,
"gdv_fn_castINT_utf8",
diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc
index acf3f56..a890775 100644
--- a/cpp/src/gandiva/gdv_function_stubs.cc
+++ b/cpp/src/gandiva/gdv_function_stubs.cc
@@ -17,10 +17,13 @@
#include "gandiva/gdv_function_stubs.h"
+#include <utf8proc.h>
+
#include <string>
#include <vector>
#include "arrow/util/formatting.h"
+#include "arrow/util/utf8.h"
#include "arrow/util/value_parsing.h"
#include "gandiva/engine.h"
#include "gandiva/exported_funcs.h"
@@ -402,6 +405,286 @@ GDV_FN_CAST_VARCHAR_REAL(float64, DoubleType)
#undef GDV_FN_CAST_VARCHAR_INTEGER
#undef GDV_FN_CAST_VARCHAR_REAL
+
+GANDIVA_EXPORT
+int32_t gdv_fn_utf8_char_length(char c) { // byte length implied by lead byte c, or 0 if c cannot start a character
+ if ((signed char)c >= 0) { // high bit clear: 1-byte (ASCII) char, 0x00 ~ 0x7F
+ return 1;
+ } else if ((c & 0xE0) == 0xC0) { // bit pattern 110xxxxx: lead byte of a 2-byte char
+ return 2;
+ } else if ((c & 0xF0) == 0xE0) { // bit pattern 1110xxxx: lead byte of a 3-byte char
+ return 3;
+ } else if ((c & 0xF8) == 0xF0) { // bit pattern 11110xxx: lead byte of a 4-byte char
+ return 4;
+ }
+ // Remaining patterns (10xxxxxx and 11111xxx) are not valid lead bytes; report 0
+ return 0;
+}
+
+GANDIVA_EXPORT
+void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val) { // reports byte val as an error on execution_context
+ char const* fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string"; // %02hhx renders val as two hex digits after a literal backslash
+ int size = static_cast<int>(strlen(fmt)) + 64; // format length plus 64 bytes of slack for the expanded message
+ char* error = reinterpret_cast<char*>(malloc(size)); // NOTE(review): result is not checked for nullptr before snprintf writes to it
+ snprintf(error, size, fmt, (unsigned char)val); // cast keeps bytes >= 0x80 from being passed as negative values
+ gdv_fn_context_set_error_msg(execution_context, error);
+ free(error); // freed right after the call; presumably the context copies the message (TODO confirm)
+}
+
+// Convert a UTF-8 string to uppercase; on failure set an error on context and return "" with *out_len = 0
+GANDIVA_EXPORT
+const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_len,
+ int32_t* out_len) {
+ if (data_len == 0) {
+ *out_len = 0;
+ return "";
+ }
+
+ // The output buffer is sized at twice the input length. The reasoning given for that
+ // bound: an ASCII byte always maps to a single byte, and a character of >= 2 bytes
+ // encodes to at most 4 bytes once uppercased, so 2 * data_len bytes are enough
+ char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
+ if (out == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+ *out_len = 0;
+ return "";
+ }
+
+ int32_t char_len, out_char_len, out_idx = 0;
+ uint32_t char_codepoint;
+
+ for (int32_t i = 0; i < data_len; i += char_len) {
+ char_len = gdv_fn_utf8_char_length(data[i]);
+ // Single-byte (ASCII) fast path: bytes 'a'..'z' are shifted down by 0x20 to give
+ // 'A'..'Z' and every other single byte is copied through unchanged. Nothing is
+ // decoded on this path
+ if (char_len == 1) {
+ char cur = data[i];
+ // 'A' - 'Z' : 0x41 - 0x5a
+ // 'a' - 'z' : 0x61 - 0x7a
+ if (cur >= 0x61 && cur <= 0x7a) {
+ out[out_idx++] = static_cast<char>(cur - 0x20);
+ } else {
+ out[out_idx++] = cur;
+ }
+ continue;
+ }
+
+ // Not a single byte: a multibyte lead byte, or an invalid byte (char_len == 0)
+ const auto* in_char = (const uint8_t*)(data + i);
+
+ // Decode one codepoint at data[i]. NOTE(review): no end-of-input bound is passed; confirm a truncated tail cannot be over-read
+ bool is_valid_utf8_char =
+ arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);
+
+ // A false result from UTF8Decode is treated as invalid input: report data[i] and bail out
+ if (!is_valid_utf8_char) {
+ gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+ *out_len = 0;
+ return "";
+ }
+
+ // Map the decoded codepoint to its uppercase counterpart via utf8proc
+ int32_t upper_codepoint = utf8proc_toupper(char_codepoint);
+
+ // UTF8Encode returns a pointer just past the bytes it wrote, so keep the start
+ // position in order to measure the encoded length afterwards
+ auto* out_char = (uint8_t*)(out + out_idx);
+ uint8_t* out_char_start = out_char;
+
+ // Encode the uppercase codepoint into the output buffer
+ out_char = arrow::util::UTF8Encode(out_char, upper_codepoint);
+
+ out_char_len = static_cast<int32_t>(out_char - out_char_start);
+ out_idx += out_char_len;
+ }
+
+ *out_len = out_idx;
+ return out;
+}
+
+// Convert a UTF-8 string to lowercase; on failure set an error on context and return "" with *out_len = 0
+GANDIVA_EXPORT
+const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_len,
+ int32_t* out_len) {
+ if (data_len == 0) {
+ *out_len = 0;
+ return "";
+ }
+
+ // The output buffer is sized at twice the input length. The reasoning given for that
+ // bound: an ASCII byte always maps to a single byte, and a character of >= 2 bytes
+ // encodes to at most 4 bytes once lowercased, so 2 * data_len bytes are enough
+ char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
+ if (out == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+ *out_len = 0;
+ return "";
+ }
+
+ int32_t char_len, out_char_len, out_idx = 0;
+ uint32_t char_codepoint;
+
+ for (int32_t i = 0; i < data_len; i += char_len) {
+ char_len = gdv_fn_utf8_char_length(data[i]);
+ // Single-byte (ASCII) fast path: bytes 'A'..'Z' are shifted up by 0x20 to give
+ // 'a'..'z' and every other single byte is copied through unchanged. Nothing is
+ // decoded on this path
+ if (char_len == 1) {
+ char cur = data[i];
+ // 'A' - 'Z' : 0x41 - 0x5a
+ // 'a' - 'z' : 0x61 - 0x7a
+ if (cur >= 0x41 && cur <= 0x5a) {
+ out[out_idx++] = static_cast<char>(cur + 0x20);
+ } else {
+ out[out_idx++] = cur;
+ }
+ continue;
+ }
+
+ // Not a single byte: a multibyte lead byte, or an invalid byte (char_len == 0)
+ const auto* in_char = (const uint8_t*)(data + i);
+
+ // Decode one codepoint at data[i]. NOTE(review): no end-of-input bound is passed; confirm a truncated tail cannot be over-read
+ bool is_valid_utf8_char =
+ arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);
+
+ // A false result from UTF8Decode is treated as invalid input: report data[i] and bail out
+ if (!is_valid_utf8_char) {
+ gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+ *out_len = 0;
+ return "";
+ }
+
+ // Map the decoded codepoint to its lowercase counterpart via utf8proc
+ int32_t lower_codepoint = utf8proc_tolower(char_codepoint);
+
+ // UTF8Encode returns a pointer just past the bytes it wrote, so keep the start
+ // position in order to measure the encoded length afterwards
+ auto* out_char = (uint8_t*)(out + out_idx);
+ uint8_t* out_char_start = out_char;
+
+ // Encode the lowercase codepoint into the output buffer
+ out_char = arrow::util::UTF8Encode(out_char, lower_codepoint);
+
+ out_char_len = static_cast<int32_t>(out_char - out_char_start);
+ out_idx += out_char_len;
+ }
+
+ *out_len = out_idx;
+ return out;
+}
+
+// Returns true if the codepoint is whitespace, judged by its Unicode general
+// category. For the list of existing Unicode whitespace characters see
+// https://en.wikipedia.org/wiki/Whitespace_character#Unicode
+//
+// Unicode characters are grouped into general categories; the page at
+// https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
+// describes them. Only the ZS, ZL and ZP categories count as space here.
+GANDIVA_EXPORT
+bool gdv_fn_is_codepoint_for_space(uint32_t val) {
+ auto category = utf8proc_category(val);
+
+ return category == utf8proc_category_t::UTF8PROC_CATEGORY_ZS || // Zs: space separator
+ category == utf8proc_category_t::UTF8PROC_CATEGORY_ZL || // Zl: line separator
+ category == utf8proc_category_t::UTF8PROC_CATEGORY_ZP; // Zp: paragraph separator
+}
+
+// Uppercase the first letter of each whitespace-separated word and copy every other
+// character unchanged, e.g. "it is a text str" -> "It Is A Text Str"
+GANDIVA_EXPORT
+const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
+ int32_t* out_len) {
+ if (data_len == 0) {
+ *out_len = data_len;
+ return "";
+ }
+
+ // The output buffer is sized at twice the input length. The reasoning given for that
+ // bound: an ASCII byte always maps to a single byte, and a character of >= 2 bytes
+ // encodes to at most 4 bytes once uppercased, so 2 * data_len bytes are enough
+ char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
+ if (out == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+ *out_len = 0;
+ return "";
+ }
+
+ int32_t char_len = 0;
+ int32_t out_char_len = 0;
+ int32_t out_idx = 0;
+ uint32_t char_codepoint;
+ bool last_char_was_space = true; // starts true so the very first character is treated as a word start
+
+ for (int32_t i = 0; i < data_len; i += char_len) {
+ char_len = gdv_fn_utf8_char_length(data[i]);
+ // Single-byte (ASCII) path: a lowercase letter is uppercased only when the
+ // previous character counted as whitespace (or it is the first character);
+ // any other single byte is copied to the output unchanged
+ if (char_len == 1) {
+ char cur = data[i];
+
+ if (cur >= 0x61 && cur <= 0x7a && last_char_was_space) {
+ // 'A' - 'Z' : 0x41 - 0x5a
+ // 'a' - 'z' : 0x61 - 0x7a
+ out[out_idx++] = static_cast<char>(cur - 0x20);
+ last_char_was_space = false;
+ } else {
+ // Every byte <= 0x20 (not only the ones listed) sets the whitespace flag, e.g.:
+ // - space : 0x20
+ // - character tabulation : 0x9
+ // - line feed : 0xA
+ // - line tabulation : 0xB
+ // - form feed : 0xC
+ // - carriage return : 0xD
+ last_char_was_space = cur <= 0x20;
+ out[out_idx++] = cur;
+ }
+ continue;
+ }
+
+ // Not a single byte: a multibyte lead byte, or an invalid byte (char_len == 0)
+ const auto* in_char = (const uint8_t*)(data + i);
+
+ // Decode one codepoint at data[i]. NOTE(review): no end-of-input bound is passed; confirm a truncated tail cannot be over-read
+ bool is_valid_utf8_char =
+ arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);
+
+ // A false result from UTF8Decode is treated as invalid input: report data[i] and bail out
+ if (!is_valid_utf8_char) {
+ gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+ *out_len = 0;
+ return "";
+ }
+
+ bool is_char_space = gdv_fn_is_codepoint_for_space(char_codepoint);
+
+ int32_t formatted_codepoint;
+ if (last_char_was_space && !is_char_space) {
+ // First non-space character after whitespace: convert it to uppercase
+ formatted_codepoint = utf8proc_toupper(char_codepoint);
+ } else {
+ // Whitespace, or not at the start of a word: leave the codepoint as is
+ formatted_codepoint = char_codepoint;
+ }
+
+ // UTF8Encode returns a pointer just past the bytes it wrote; keep the start to measure them
+ auto* out_char = (uint8_t*)(out + out_idx);
+ uint8_t* out_char_start = out_char;
+
+ // Encode the codepoint, uppercased or unchanged as decided above
+ out_char = arrow::util::UTF8Encode(out_char, formatted_codepoint);
+
+ out_char_len = static_cast<int32_t>(out_char - out_char_start);
+ out_idx += out_char_len;
+
+ last_char_was_space = is_char_space;
+ }
+
+ *out_len = out_idx;
+ return out;
+}
}
namespace gandiva {
@@ -1031,5 +1314,40 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
engine->AddGlobalMappingForFunc("gdv_fn_sha256_decimal128",
types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_sha256_decimal128));
+
+ // gdv_fn_upper_utf8
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // data
+ types->i32_type(), // data_len
+ types->i32_ptr_type(), // out_len
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_upper_utf8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_upper_utf8));
+ // gdv_fn_lower_utf8
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // data
+ types->i32_type(), // data_len
+ types->i32_ptr_type(), // out_len
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_lower_utf8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_lower_utf8));
+
+ // gdv_fn_initcap_utf8
+ args = {
+ types->i64_type(), // context
+ types->i8_ptr_type(), // const char*
+ types->i32_type(), // value_length
+ types->i32_ptr_type() // out_length
+ };
+
+ engine->AddGlobalMappingForFunc("gdv_fn_initcap_utf8",
+ types->i8_ptr_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_initcap_utf8));
}
} // namespace gandiva
diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h
index 0a6cd70..847772b 100644
--- a/cpp/src/gandiva/gdv_function_stubs.h
+++ b/cpp/src/gandiva/gdv_function_stubs.h
@@ -108,4 +108,25 @@ const char* gdv_fn_castVARCHAR_float32_int64(int64_t context, float value, int64
GANDIVA_EXPORT
const char* gdv_fn_castVARCHAR_float64_int64(int64_t context, double value, int64_t len,
int32_t* out_len);
+
+GANDIVA_EXPORT
+int32_t gdv_fn_utf8_char_length(char c);
+
+GANDIVA_EXPORT
+void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val);
+
+GANDIVA_EXPORT
+const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_len,
+ int32_t* out_len);
+
+GANDIVA_EXPORT
+const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_len,
+ int32_t* out_len);
+
+GANDIVA_EXPORT
+bool gdv_fn_is_codepoint_for_space(uint32_t val);
+
+GANDIVA_EXPORT
+const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
+ int32_t* out_len);
}
diff --git a/cpp/src/gandiva/gdv_function_stubs_test.cc b/cpp/src/gandiva/gdv_function_stubs_test.cc
index 8f44ce2..6cfff5b 100644
--- a/cpp/src/gandiva/gdv_function_stubs_test.cc
+++ b/cpp/src/gandiva/gdv_function_stubs_test.cc
@@ -290,4 +290,198 @@ TEST(TestGdvFnStubs, TestCastVARCHARFromDouble) {
EXPECT_FALSE(ctx.has_error());
}
+TEST(TestGdvFnStubs, TestUpper) { // gdv_fn_upper_utf8 on ASCII, multibyte, empty and malformed input
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+
+ const char* out_str = gdv_fn_upper_utf8(ctx_ptr, "AbcDEfGh", 8, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "ABCDEFGH");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_upper_utf8(ctx_ptr, "asdfj", 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "ASDFJ");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_upper_utf8(ctx_ptr, "s;dcGS,jO!l", 11, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "S;DCGS,JO!L");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_upper_utf8(ctx_ptr, "münchen", 8, &out_len); // length is in bytes: 7 characters, "ü" takes 2
+ EXPECT_EQ(std::string(out_str, out_len), "MÜNCHEN");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_upper_utf8(ctx_ptr, "CITROËN", 8, &out_len); // already uppercase: expected unchanged
+ EXPECT_EQ(std::string(out_str, out_len), "CITROËN");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_upper_utf8(ctx_ptr, "âBćDëFGH", 11, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "ÂBĆDËFGH");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_upper_utf8(ctx_ptr, "øhpqRšvñ", 11, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "ØHPQRŠVÑ");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_upper_utf8(ctx_ptr, "Möbelträgerfüße", 19, &out_len); // "ß" is expected to become capital "ẞ"
+ EXPECT_EQ(std::string(out_str, out_len), "MÖBELTRÄGERFÜẞE");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_upper_utf8(ctx_ptr, "{õhp,PQŚv}ń+", 15, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "{ÕHP,PQŚV}Ń+");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_upper_utf8(ctx_ptr, "", 0, &out_len); // empty input yields empty output without an error
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ std::string d("AbOJjÜoß\xc3"); // ends with a lone 0xc3 lead byte that has no continuation byte
+ out_str = gdv_fn_upper_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr(
+ "unexpected byte \\c3 encountered while decoding utf8 string"));
+ ctx.Reset();
+
+ std::string e( // 0xe0 starts a 3-byte sequence that is never completed
+ "åbÑg\xe0\xa0"
+ "åBUå");
+ out_str = gdv_fn_upper_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr(
+ "unexpected byte \\e0 encountered while decoding utf8 string"));
+ ctx.Reset();
+}
+
+TEST(TestGdvFnStubs, TestLower) { // gdv_fn_lower_utf8 on ASCII, multibyte, empty and malformed input
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+
+ const char* out_str = gdv_fn_lower_utf8(ctx_ptr, "AbcDEfGh", 8, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abcdefgh");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_lower_utf8(ctx_ptr, "asdfj", 5, &out_len); // already lowercase: expected unchanged
+ EXPECT_EQ(std::string(out_str, out_len), "asdfj");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_lower_utf8(ctx_ptr, "S;DCgs,Jo!L", 11, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "s;dcgs,jo!l");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_lower_utf8(ctx_ptr, "MÜNCHEN", 8, &out_len); // length is in bytes: 7 characters, "Ü" takes 2
+ EXPECT_EQ(std::string(out_str, out_len), "münchen");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_lower_utf8(ctx_ptr, "citroën", 8, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "citroën");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_lower_utf8(ctx_ptr, "ÂbĆDËFgh", 11, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "âbćdëfgh");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_lower_utf8(ctx_ptr, "ØHPQrŠvÑ", 11, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "øhpqršvñ");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_lower_utf8(ctx_ptr, "MÖBELTRÄGERFÜẞE", 20, &out_len); // 20 bytes: capital "ẞ" takes 3 bytes in UTF-8
+ EXPECT_EQ(std::string(out_str, out_len), "möbelträgerfüße");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_lower_utf8(ctx_ptr, "{ÕHP,pqśv}Ń+", 15, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "{õhp,pqśv}ń+");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_lower_utf8(ctx_ptr, "", 0, &out_len); // empty input yields empty output without an error
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ std::string d("AbOJjÜoß\xc3"); // ends with a lone 0xc3 lead byte that has no continuation byte
+ out_str = gdv_fn_lower_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr(
+ "unexpected byte \\c3 encountered while decoding utf8 string"));
+ ctx.Reset();
+
+ std::string e( // 0xe0 starts a 3-byte sequence that is never completed
+ "åbÑg\xe0\xa0"
+ "åBUå");
+ out_str = gdv_fn_lower_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr(
+ "unexpected byte \\e0 encountered while decoding utf8 string"));
+ ctx.Reset();
+}
+
+TEST(TestGdvFnStubs, TestInitCap) { // gdv_fn_initcap_utf8 on ASCII, multibyte, empty and malformed input
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+
+ const char* out_str = gdv_fn_initcap_utf8(ctx_ptr, "test string", 11, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Test String");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_initcap_utf8(ctx_ptr, "asdfj\nhlqf", 10, &out_len); // a line feed also separates words
+ EXPECT_EQ(std::string(out_str, out_len), "Asdfj\nHlqf");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_initcap_utf8(ctx_ptr, "s;DCgs,Jo!L", 11, &out_len); // punctuation does not start a new word
+ EXPECT_EQ(std::string(out_str, out_len), "S;DCgs,Jo!L");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_initcap_utf8(ctx_ptr, " mÜNCHEN", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), " MÜNCHEN");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_initcap_utf8(ctx_ptr, "citroën CaR", 12, &out_len); // letters after the first are left as is ("CaR" stays "CaR")
+ EXPECT_EQ(std::string(out_str, out_len), "Citroën CaR");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_initcap_utf8(ctx_ptr, "ÂbĆDËFgh\néll", 16, &out_len); // a multibyte first letter ("é") is uppercased too
+ EXPECT_EQ(std::string(out_str, out_len), "ÂbĆDËFgh\nÉll");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_initcap_utf8(ctx_ptr, " øhpqršvñ \n\n", 17, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), " Øhpqršvñ \n\n");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str =
+ gdv_fn_initcap_utf8(ctx_ptr, "möbelträgerfüße \nmöbelträgerfüße", 42, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Möbelträgerfüße \nMöbelträgerfüße");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_initcap_utf8(ctx_ptr, "{ÕHP,pqśv}Ń+", 15, &out_len); // first character is "{", so nothing is uppercased
+ EXPECT_EQ(std::string(out_str, out_len), "{ÕHP,pqśv}Ń+");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = gdv_fn_initcap_utf8(ctx_ptr, "", 0, &out_len); // empty input yields empty output without an error
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ std::string d("AbOJjÜoß\xc3"); // ends with a lone 0xc3 lead byte that has no continuation byte
+ out_str =
+ gdv_fn_initcap_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr(
+ "unexpected byte \\c3 encountered while decoding utf8 string"));
+ ctx.Reset();
+
+ std::string e( // 0xe0 starts a 3-byte sequence that is never completed
+ "åbÑg\xe0\xa0"
+ "åBUå");
+ out_str =
+ gdv_fn_initcap_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_THAT(ctx.get_error(),
+ ::testing::HasSubstr(
+ "unexpected byte \\e0 encountered while decoding utf8 string"));
+ ctx.Reset();
+}
} // namespace gandiva
diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt
index 7343bc0..176b047 100644
--- a/cpp/src/gandiva/precompiled/CMakeLists.txt
+++ b/cpp/src/gandiva/precompiled/CMakeLists.txt
@@ -77,6 +77,7 @@ foreach(SRC_FILE ${PRECOMPILED_SRCS})
${ARROW_GANDIVA_PC_CXX_FLAGS}
-I${CMAKE_SOURCE_DIR}/src
-I${ARROW_BINARY_DIR}/src)
+
if(NOT ARROW_USE_NATIVE_INT128)
list(APPEND PRECOMPILE_COMMAND -I${Boost_INCLUDE_DIR})
endif()
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index b35062d..ac50633 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -17,6 +17,7 @@
// String functions
#include "arrow/util/value_parsing.h"
+
extern "C" {
#include <algorithm>
@@ -221,66 +222,6 @@ UTF8_LENGTH(char_length, utf8)
UTF8_LENGTH(length, utf8)
UTF8_LENGTH(lengthUtf8, binary)
-// Convert a utf8 sequence to upper case.
-// TODO : This handles only ascii characters.
-FORCE_INLINE
-const char* upper_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
- int32_t* out_len) {
- if (data_len == 0) {
- *out_len = 0;
- return "";
- }
-
- char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, data_len));
- if (ret == nullptr) {
- gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
- *out_len = 0;
- return "";
- }
- for (gdv_int32 i = 0; i < data_len; ++i) {
- char cur = data[i];
-
- // 'A- - 'Z' : 0x41 - 0x5a
- // 'a' - 'z' : 0x61 - 0x7a
- if (cur >= 0x61 && cur <= 0x7a) {
- cur = static_cast<char>(cur - 0x20);
- }
- ret[i] = cur;
- }
- *out_len = data_len;
- return ret;
-}
-
-// Convert a utf8 sequence to lower case.
-// TODO : This handles only ascii characters.
-FORCE_INLINE
-const char* lower_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
- int32_t* out_len) {
- if (data_len == 0) {
- *out_len = 0;
- return "";
- }
-
- char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, data_len));
- if (ret == nullptr) {
- gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
- *out_len = 0;
- return "";
- }
- for (gdv_int32 i = 0; i < data_len; ++i) {
- char cur = data[i];
-
- // 'A' - 'Z' : 0x41 - 0x5a
- // 'a' - 'z' : 0x61 - 0x7a
- if (cur >= 0x41 && cur <= 0x5a) {
- cur = static_cast<char>(cur + 0x20);
- }
- ret[i] = cur;
- }
- *out_len = data_len;
- return ret;
-}
-
// Reverse a utf8 sequence
FORCE_INLINE
const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index fd36665..ae3c0f2 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -695,28 +695,6 @@ TEST(TestStringOps, TestConcat) {
EXPECT_FALSE(ctx.has_error());
}
-TEST(TestStringOps, TestLower) {
- gandiva::ExecutionContext ctx;
- uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
- gdv_int32 out_len = 0;
-
- const char* out_str = lower_utf8(ctx_ptr, "AsDfJ", 5, &out_len);
- EXPECT_EQ(std::string(out_str, out_len), "asdfj");
- EXPECT_FALSE(ctx.has_error());
-
- out_str = lower_utf8(ctx_ptr, "asdfj", 5, &out_len);
- EXPECT_EQ(std::string(out_str, out_len), "asdfj");
- EXPECT_FALSE(ctx.has_error());
-
- out_str = lower_utf8(ctx_ptr, "dž†AbD", 11, &out_len);
- EXPECT_EQ(std::string(out_str, out_len), "dž†abd");
- EXPECT_FALSE(ctx.has_error());
-
- out_str = lower_utf8(ctx_ptr, "", 0, &out_len);
- EXPECT_EQ(std::string(out_str, out_len), "");
- EXPECT_FALSE(ctx.has_error());
-}
-
TEST(TestStringOps, TestReverse) {
gandiva::ExecutionContext ctx;
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 170cf92..b8c7aa9 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -378,9 +378,6 @@ const char* castVARCHAR_utf8_int64(gdv_int64 context, const char* data,
gdv_int32 data_len, int64_t out_len,
int32_t* out_length);
-const char* lower_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
- int32_t* out_length);
-
const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
int32_t* out_len);
diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java
index 9844d02..80d4281 100644
--- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java
+++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java
@@ -73,7 +73,13 @@ public class ProjectorTest extends BaseEvaluatorTest {
List<ArrowBuf> varBufs(String[] strings, Charset charset) {
ArrowBuf offsetsBuffer = allocator.buffer((strings.length + 1) * 4);
- ArrowBuf dataBuffer = allocator.buffer(strings.length * 8);
+
+ long dataBufferSize = 0L;
+ for (String string : strings) {
+ dataBufferSize += string.getBytes(charset).length;
+ }
+
+ ArrowBuf dataBuffer = allocator.buffer(dataBufferSize);
int startOffset = 0;
for (int i = 0; i < strings.length; i++) {
@@ -2284,4 +2290,68 @@ public class ProjectorTest extends BaseEvaluatorTest {
releaseValueVectors(output);
}
+ @Test
+ public void testInitCap() throws Exception { // projects initcap(x) over five rows, the last of which is null
+
+ Field x = Field.nullable("x", new ArrowType.Utf8());
+
+ Field retType = Field.nullable("c", new ArrowType.Utf8());
+
+ TreeNode cond =
+ TreeBuilder.makeFunction(
+ "initcap",
+ Lists.newArrayList(TreeBuilder.makeField(x)),
+ new ArrowType.Utf8());
+ ExpressionTree expr = TreeBuilder.makeExpression(cond, retType);
+ Schema schema = new Schema(Lists.newArrayList(x));
+ Projector eval = Projector.make(schema, Lists.newArrayList(expr));
+
+ int numRows = 5;
+ byte[] validity = new byte[]{(byte) 15, 0}; // 15 = 0b00001111: rows 0-3 are valid, row 4 is null
+ String[] valuesX = new String[]{
+ " øhpqršvñ \n\n",
+ "möbelträgerfüße \nmöbelträgerfüße",
+ "ÂbĆDËFgh\néll",
+ "citroën CaR",
+ "kjk"
+ };
+
+ String[] expected = new String[]{
+ " Øhpqršvñ \n\n",
+ "Möbelträgerfüße \nMöbelträgerfüße",
+ "ÂbĆDËFgh\nÉll",
+ "Citroën CaR",
+ null
+ };
+
+ ArrowBuf validityX = buf(validity);
+ List<ArrowBuf> dataBufsX = stringBufs(valuesX);
+
+ ArrowRecordBatch batch =
+ new ArrowRecordBatch(
+ numRows,
+ Lists.newArrayList(new ArrowFieldNode(numRows, 0)),
+ Lists.newArrayList(validityX, dataBufsX.get(0), dataBufsX.get(1)));
+
+ // Allocate the output vector that receives the projected strings.
+ VarCharVector outVector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator);
+ outVector.allocateNew(numRows * 100, numRows);
+
+ // Evaluate the expression over the batch, then close the projector.
+ List<ValueVector> output = new ArrayList<>();
+ output.add(outVector);
+ eval.evaluate(batch, output);
+ eval.close();
+
+ // Rows 0 .. numRows - 2 must be non-null and equal to the expected strings.
+ for (int i = 0; i < numRows - 1; i++) {
+ assertFalse("Expect none value equals null", outVector.isNull(i));
+ assertEquals(expected[i], new String(outVector.get(i)));
+ }
+
+ assertTrue("Last value must be null", outVector.isNull(numRows - 1)); // row 4 was marked null by the validity bitmap
+
+ releaseRecordBatch(batch);
+ releaseValueVectors(output);
+ }
}