You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pr...@apache.org on 2020/06/24 06:04:16 UTC
[arrow] branch master updated: ARROW-9099: [C++][Gandiva] Implement
trim function for string
This is an automated email from the ASF dual-hosted git repository.
praveenbingo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 4b8cbee ARROW-9099: [C++][Gandiva] Implement trim function for string
4b8cbee is described below
commit 4b8cbee11735d7a97a6150938fbee8fba95d8d9e
Author: Sagnik Chakraborty <sa...@dremio.com>
AuthorDate: Wed Jun 24 11:33:42 2020 +0530
ARROW-9099: [C++][Gandiva] Implement trim function for string
Closes #7402 from sagnikc-dremio/master and squashes the following commits:
60923b82a <Sagnik Chakraborty> ARROW-9099: Implement trim function for string
Authored-by: Sagnik Chakraborty <sa...@dremio.com>
Signed-off-by: Praveen <pr...@dremio.com>
---
cpp/src/gandiva/function_registry_string.cc | 1 +
cpp/src/gandiva/precompiled/string_ops.cc | 36 ++++++++++++++++++++++++++
cpp/src/gandiva/precompiled/string_ops_test.cc | 27 +++++++++++++++++++
cpp/src/gandiva/precompiled/types.h | 3 +++
4 files changed, 67 insertions(+)
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index b4b02ef..6f2d54d 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -55,6 +55,7 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
UNARY_UNSAFE_NULL_IF_NULL(length, {}, utf8, int32),
UNARY_UNSAFE_NULL_IF_NULL(lengthUtf8, {}, binary, int32),
UNARY_UNSAFE_NULL_IF_NULL(reverse, {}, utf8, utf8),
+ UNARY_UNSAFE_NULL_IF_NULL(trim, {}, utf8, utf8),
UNARY_SAFE_NULL_NEVER_BOOL_FN(isnull, {}),
UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull, {}),
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index 0b2d934..6ee02dd 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -284,6 +284,42 @@ const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len
return ret;
}
+// Trim leading/trailing ' ' (0x20) bytes from a utf8 sequence of data_len bytes; context is unused
+FORCE_INLINE
+const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len) {
+ if (data_len == 0) {
+ *out_len = 0;
+ return "";
+ }
+
+ gdv_int32 start = 0, end = data_len - 1;
+ // start and end are moved inwards until they sit on the first and last
+ // non-space bytes of the input; start > end afterwards means all spaces
+ while (start <= end && data[start] == ' ') {
+ ++start;
+ }
+ while (end >= start && data[end] == ' ') {
+ --end;
+ }
+
+ // nothing was trimmed: hand back the caller's pointer and length untouched
+ if (start == 0 && end == data_len - 1) {
+ *out_len = data_len;
+ return data;
+ }
+
+ // input consisted only of spaces: result is the empty string literal
+ if (start > end) {
+ *out_len = 0;
+ return "";
+ }
+
+ // otherwise return the trimmed range as a pointer into data (no copy is made)
+ *out_len = end - start + 1;
+ return data + start;
+}
+
// Truncates the string to given length
FORCE_INLINE
const char* castVARCHAR_utf8_int64(gdv_int64 context, const char* data,
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 75be26d..20a3d27 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -426,6 +426,33 @@ TEST(TestStringOps, TestReverse) {
ctx.Reset();
}
+TEST(TestStringOps, TestTrim) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+  // Each length argument is the literal's exact byte count (NUL excluded); no case may set an error.
+  out_str = trim_utf8(ctx_ptr, "TestString", 10, &out_len);  // nothing to trim
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+  // 6 leading + 10 payload + 2 trailing bytes = 18 (the padding split itself is arbitrary).
+  out_str = trim_utf8(ctx_ptr, "      TestString  ", 18, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+  // Multi-byte payload is 16 bytes (ç, å = 2 bytes each, † = 3); 2 leading + 3 trailing spaces = 21.
+  out_str = trim_utf8(ctx_ptr, "  Test çåå†bD   ", 21, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test çåå†bD");
+  EXPECT_FALSE(ctx.has_error());
+  // Empty input yields an empty result.
+  out_str = trim_utf8(ctx_ptr, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+  // Six spaces and nothing else: everything is trimmed away.
+  out_str = trim_utf8(ctx_ptr, "      ", 6, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+}
+
TEST(TestStringOps, TestLocate) {
gandiva::ExecutionContext ctx;
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 09ee596..051f522 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -199,6 +199,9 @@ const char* lower_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
int32_t* out_len);
+const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len);
+
gdv_int32 locate_utf8_utf8(gdv_int64 context, const char* sub_str, gdv_int32 sub_str_len,
const char* str, gdv_int32 str_len);