You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pr...@apache.org on 2021/07/06 06:52:18 UTC
[arrow] branch master updated: ARROW-12856: [C++][Gandiva]
Implement castBIT and castBOOLEAN functions
This is an automated email from the ASF dual-hosted git repository.
praveenbingo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 27be94f ARROW-12856: [C++][Gandiva] Implement castBIT and castBOOLEAN functions
27be94f is described below
commit 27be94f39e988e6461d6900ca9b7ae28cfc65ea9
Author: João Pedro <jo...@simbioseventures.com>
AuthorDate: Tue Jul 6 12:21:15 2021 +0530
ARROW-12856: [C++][Gandiva] Implement castBIT and castBOOLEAN functions
Implement castBIT and castBOOLEAN functions for UTF8 input values
Closes #10382 from jpedroantunes/feature/cast-bit and squashes the following commits:
9aea576e1 <João Pedro> Apply linter corrections
5ed3c64da <João Pedro> Add projector tests and registry for castbit function
51918f8ad <João Pedro> Add base files for castBIT and castBOOLEAN functions
Authored-by: João Pedro <jo...@simbioseventures.com>
Signed-off-by: Praveen <pr...@dremio.com>
---
cpp/src/gandiva/function_registry_string.cc | 3 ++
cpp/src/gandiva/precompiled/string_ops.cc | 60 ++++++++++++++++++++++++++
cpp/src/gandiva/precompiled/string_ops_test.cc | 55 +++++++++++++++++++++++
cpp/src/gandiva/precompiled/types.h | 2 +
cpp/src/gandiva/tests/projector_test.cc | 31 +++++++++++++
5 files changed, 151 insertions(+)
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index 7491e44..9235a3e 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -75,6 +75,9 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
"gdv_fn_initcap_utf8",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+ NativeFunction("castBIT", {"castBOOLEAN"}, DataTypeVector{utf8()}, boolean(),
+ kResultNullIfNull, "castBIT_utf8", NativeFunction::kNeedsContext),
+
NativeFunction("castINT", {}, DataTypeVector{utf8()}, int32(), kResultNullIfNull,
"gdv_fn_castINT_utf8",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index 3b475be..0820114 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -481,6 +481,66 @@ const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext,
}
FORCE_INLINE
+gdv_boolean compare_lower_strings(const char* base_str, gdv_int32 base_str_len,  // base_str is compared byte-for-byte and is never case-folded
+ const char* str, gdv_int32 str_len) {  // str has ASCII 'A'-'Z' lower-cased before each comparison
+ if (base_str_len != str_len) {  // strings of different length cannot match
+ return false;
+ }
+ for (int i = 0; i < str_len; i++) {  // never entered when str_len is 0, so two empty strings yield false
+ // Work on a copy of str[i] so the input buffer is left untouched.
+ char cur = str[i];
+ // ASCII upper-case 'A' - 'Z' is 0x41 - 0x5a; adding 0x20 maps it onto
+ // ASCII lower-case 'a' - 'z', which is 0x61 - 0x7a. Other bytes pass through.
+ if (cur >= 0x41 && cur <= 0x5a) {
+ cur = static_cast<char>(cur + 0x20);
+ }
+ // First mismatch ends the loop and falls through to the final `return false`.
+ if (cur != base_str[i]) break;
+ // Every position up to and including the last one matched: the strings are equal.
+ if (i == str_len - 1) return true;
+ }
+ return false;  // reached after a mismatch, or when both lengths are 0
+}
+
+// Convert a string to a boolean: "1" or "true" gives true, "0" or "false" gives false.
+// Leading/trailing ' ' are ignored and the words match in any letter case; anything else sets an error.
+FORCE_INLINE
+gdv_boolean castBIT_utf8(gdv_int64 context, const char* data, gdv_int32 data_len) {
+ if (data_len <= 0) {  // empty (or negative-length) input is rejected before any byte is read
+ gdv_fn_context_set_error_msg(context, "Invalid value for boolean.");
+ return false;  // false doubles as the error return; the message set on the context tells the two apart
+ }
+
+ // Find the sub-range [start, end] left after dropping ' ' (0x20 only) from both ends.
+ int32_t trimmed_len;
+ int32_t start = 0, end = data_len - 1;
+ while (start <= end && data[start] == ' ') {
+ ++start;
+ }
+ while (end >= start && data[end] == ' ') {
+ --end;
+ }
+ trimmed_len = end - start + 1;  // 0 when the input is all spaces; no branch below matches, so the error path runs
+ const char* trimmed_data = data + start;
+
+ // Dispatch on the trimmed length: only lengths 1, 4 and 5 can hold an accepted value.
+ if (trimmed_len == 1) {
+ // Single character: exactly '1' or '0'.
+ if (trimmed_data[0] == '1') return true;
+ if (trimmed_data[0] == '0') return false;
+ } else if (trimmed_len == 4) {
+ // Four characters: "true" in any mix of upper and lower case.
+ if (compare_lower_strings("true", 4, trimmed_data, trimmed_len)) return true;
+ } else if (trimmed_len == 5) {
+ // Five characters: "false" in any mix of upper and lower case.
+ if (compare_lower_strings("false", 5, trimmed_data, trimmed_len)) return false;
+ }
+ // Nothing matched (wrong length, or right length with other content): report the failure.
+ gdv_fn_context_set_error_msg(context, "Invalid value for boolean.");  // NOTE(review): the castBIT registry entry in this commit passes only kNeedsContext, whereas the adjacent castINT entry also passes kCanReturnErrors - confirm this error is surfaced
+ return false;
+}
+
+FORCE_INLINE
const char* castVARCHAR_bool_int64(gdv_int64 context, gdv_boolean value,
gdv_int64 out_len, gdv_int32* out_length) {
gdv_int32 len = static_cast<gdv_int32>(out_len);
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index e85e0ee..c4854c5 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -221,6 +221,61 @@ TEST(TestStringOps, TestCastBoolToVarchar) {
ctx.Reset();
}
+TEST(TestStringOps, TestCastVarcharToBool) {  // exercises castBIT_utf8 directly: accepted spellings first, one rejected input last
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);  // address of ctx, handed to castBIT_utf8 as its integer context argument
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "true", 4), true);
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, " true ", 14), true);  // NOTE(review): length 14 exceeds the visible literal; this archived text appears to have collapsed runs of spaces (same for the 9, 10, 3 and 4 cases below) - confirm against the repository
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "true ", 9), true);  // trailing padding only
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, " true", 9), true);  // leading padding only
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "TRUE", 4), true);  // all upper case
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "TrUe", 4), true);  // mixed case
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "1", 1), true);
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, " 1", 3), true);
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "false", 5), false);  // for false results only the has_error() check separates success from failure
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "false ", 10), false);
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, " false", 10), false);
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "0", 1), false);
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "0 ", 4), false);
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "FALSE", 5), false);
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "FaLsE", 5), false);
+ EXPECT_FALSE(ctx.has_error());
+
+ EXPECT_EQ(castBIT_utf8(ctx_ptr, "test", 4), false);  // four characters that are not "true": returns false and sets an error
+ EXPECT_TRUE(ctx.has_error());
+ EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Invalid value for boolean"));
+ ctx.Reset();  // the only Reset() in this test, placed after the single failing case
+}
+
TEST(TestStringOps, TestCastVarchar) {
gandiva::ExecutionContext ctx;
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 047586b..543a00f 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -114,6 +114,8 @@ gdv_int64 date_sub_timestamp_int32(gdv_timestamp, gdv_int32);
gdv_int64 subtract_timestamp_int32(gdv_timestamp, gdv_int32);
gdv_int64 date_diff_timestamp_int64(gdv_timestamp, gdv_int64);
+gdv_boolean castBIT_utf8(gdv_int64 context, const char* data, gdv_int32 data_len);
+
bool is_distinct_from_timestamp_timestamp(gdv_int64, bool, gdv_int64, bool);
bool is_not_distinct_from_int32_int32(gdv_int32, bool, gdv_int32, bool);
diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc
index 04fa7a6..dcdeeb4 100644
--- a/cpp/src/gandiva/tests/projector_test.cc
+++ b/cpp/src/gandiva/tests/projector_test.cc
@@ -1029,6 +1029,37 @@ TEST_F(TestProjector, TestCastFunction) {
EXPECT_ARROW_ARRAY_EQUALS(out_int8, outputs.at(3));
}
+TEST_F(TestProjector, TestCastBitFunction) {  // evaluates castBIT through a Projector on a four-row utf8 column
+ auto field0 = field("f0", arrow::utf8());
+ auto schema = arrow::schema({field0});
+
+ // Output field: a single boolean column holding the cast result.
+ auto res_bit = field("res_bit", arrow::boolean());
+
+ // Expression under test: res_bit = castBIT(f0).
+ auto cast_bit = TreeExprBuilder::MakeExpression("castBIT", {field0}, res_bit);
+
+ std::shared_ptr<Projector> projector;
+
+ auto status = Projector::Make(schema, {cast_bit}, TestConfiguration(), &projector);
+ EXPECT_TRUE(status.ok());
+
+ // Input batch: the four accepted spellings, every slot marked valid.
+ int num_records = 4;
+ auto arr = MakeArrowArrayUtf8({"1", "true", "false", "0"}, {true, true, true, true});
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {arr});
+
+ auto out = MakeArrowArrayBool({true, true, false, false}, {true, true, true, true});  // expected values, row for row; no invalid or null input is covered here
+
+ arrow::ArrayVector outputs;
+
+ // Run the projector over the batch and compare its only output column with `out`.
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ EXPECT_TRUE(status.ok());
+
+ EXPECT_ARROW_ARRAY_EQUALS(out, outputs.at(0));
+}
+
TEST_F(TestProjector, TestToDate) {
// schema for input fields
auto field0 = field("f0", arrow::utf8());