You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ra...@apache.org on 2019/08/01 07:07:02 UTC

[arrow] branch master updated: ARROW-6034: [C++][Gandiva] Add string functions in Gandiva

This is an automated email from the ASF dual-hosted git repository.

ravindra pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9c14739  ARROW-6034: [C++][Gandiva] Add string functions in Gandiva
9c14739 is described below

commit 9c14739251480f4192af613e1df022ac58b86616
Author: Prudhvi Porandla <pr...@icloud.com>
AuthorDate: Thu Aug 1 12:36:37 2019 +0530

    ARROW-6034: [C++][Gandiva] Add string functions in Gandiva
    
    Add the following functions to Gandiva:
    substr(str, offset, len), substr(str, offset), concat(str1, str2), castVARCHAR(timestamp, len), convert_fromUTF8(binary)
    
    Closes #4942 from pprudhvi/utf8-funcs and squashes the following commits:
    
    f88773ece <Prudhvi Porandla> add len 0 substr unittest
    3900f8e8d <Prudhvi Porandla> static cast size_t to int32
    208224119 <Prudhvi Porandla> add convert_fromUTF8 method
    112c933cf <Prudhvi Porandla> add castVARCHAR(timestamp) method
    77d3cdd0d <Prudhvi Porandla> add concatOperator
    9e2623fa1 <Prudhvi Porandla> add unittests for substr
    48c4d0823 <Prudhvi Porandla> add substr methods
    
    Authored-by: Prudhvi Porandla <pr...@icloud.com>
    Signed-off-by: Pindikura Ravindra <ra...@dremio.com>
---
 cpp/src/gandiva/function_registry_datetime.cc  |  4 ++
 cpp/src/gandiva/function_registry_string.cc    | 20 +++++++-
 cpp/src/gandiva/precompiled/string_ops.cc      | 71 ++++++++++++++++++++++++++
 cpp/src/gandiva/precompiled/string_ops_test.cc | 71 ++++++++++++++++++++++++++
 cpp/src/gandiva/precompiled/time.cc            | 17 ++++++
 cpp/src/gandiva/precompiled/types.h            |  6 +++
 6 files changed, 188 insertions(+), 1 deletion(-)

diff --git a/cpp/src/gandiva/function_registry_datetime.cc b/cpp/src/gandiva/function_registry_datetime.cc
index b9c61e5..cfe8185 100644
--- a/cpp/src/gandiva/function_registry_datetime.cc
+++ b/cpp/src/gandiva/function_registry_datetime.cc
@@ -57,6 +57,10 @@ std::vector<NativeFunction> GetDateTimeFunctionRegistry() {
                      kResultNullIfNull, "castTIMESTAMP_utf8",
                      NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
 
+      NativeFunction("castVARCHAR", {}, DataTypeVector{timestamp(), int64()}, utf8(),
+                     kResultNullIfNull, "castVARCHAR_timestamp_int64",
+                     NativeFunction::kNeedsContext),
+
       NativeFunction("to_date", {}, DataTypeVector{utf8(), utf8(), int32()}, date64(),
                      kResultNullInternal, "gdv_fn_to_date_utf8_utf8_int32",
                      NativeFunction::kNeedsContext |
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index bd2fe18..bdc7438 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -68,7 +68,25 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
 
       NativeFunction("like", {}, DataTypeVector{utf8(), utf8()}, boolean(),
                      kResultNullIfNull, "gdv_fn_like_utf8_utf8",
-                     NativeFunction::kNeedsFunctionHolder)};
+                     NativeFunction::kNeedsFunctionHolder),
+
+      NativeFunction("substr", {"substring"},
+                     DataTypeVector{utf8(), int64() /*offset*/, int64() /*length*/},
+                     utf8(), kResultNullIfNull, "substr_utf8_int64_int64",
+                     NativeFunction::kNeedsContext),
+
+      NativeFunction("substr", {"substring"}, DataTypeVector{utf8(), int64() /*offset*/},
+                     utf8(), kResultNullIfNull, "substr_utf8_int64",
+                     NativeFunction::kNeedsContext),
+
+      NativeFunction("concatOperator", {"concat"}, DataTypeVector{utf8(), utf8()}, utf8(),
+                     kResultNullIfNull, "concatOperator_utf8_utf8",
+                     NativeFunction::kNeedsContext),
+
+      NativeFunction("convert_fromUTF8", {"convert_fromutf8"}, DataTypeVector{binary()},
+                     utf8(), kResultNullIfNull, "convert_fromUTF8_binary",
+                     NativeFunction::kNeedsContext),
+  };
 
   return string_fn_registry_;
 }
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index e65ca2d..2213925 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -189,4 +189,75 @@ VAR_LEN_TYPES(IS_NOT_NULL, isnotnull)
 
 #undef IS_NOT_NULL
 
+FORCE_INLINE
+char* substr_utf8_int64_int64(int64 context, const char* input, int32 in_len,
+                              int64 offset64, int64 length, int32* out_len) {
+  if (length <= 0 || input == nullptr || in_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  int32 offset = static_cast<int32>(offset64);
+  int32 startIndex = offset - 1;  // offset is 1 for first char
+  if (offset < 0) {
+    startIndex = in_len + offset;
+  } else if (offset == 0) {
+    startIndex = 0;
+  }
+
+  if (startIndex < 0 || startIndex >= in_len) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  *out_len = static_cast<int32>(length);
+  if (length > in_len - startIndex) {
+    *out_len = in_len - startIndex;
+  }
+
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return nullptr;
+  }
+  memcpy(ret, input + startIndex, *out_len);
+  return ret;
+}
+
+FORCE_INLINE
+char* substr_utf8_int64(int64 context, const char* input, int32 in_len, int64 offset64,
+                        int32* out_len) {
+  return substr_utf8_int64_int64(context, input, in_len, offset64, in_len, out_len);
+}
+
+FORCE_INLINE
+char* concatOperator_utf8_utf8(int64 context, const char* left, int32 left_len,
+                               const char* right, int32 right_len, int32* out_len) {
+  *out_len = left_len + right_len;
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return nullptr;
+  }
+  memcpy(ret, left, left_len);
+  memcpy(ret + left_len, right, right_len);
+  return ret;
+}
+
+FORCE_INLINE
+char* convert_fromUTF8_binary(int64 context, const char* bin_in, int32 len,
+                              int32* out_len) {
+  *out_len = len;
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return nullptr;
+  }
+  memcpy(ret, bin_in, *out_len);
+  return ret;
+}
+
 }  // extern "C"
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 78d7e9e..617bb70 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -74,4 +74,75 @@ TEST(TestStringOps, TestCharLength) {
       << ctx.get_error();
 }
 
+TEST(TestStringOps, TestSubstring) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<int64>(&ctx);
+  int32 out_len = 0;
+
+  char* out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, 1, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, 1, 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "as");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, 1, 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "asdf");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, 0, 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "asdf");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, -2, 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "df");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int64_int64(ctx_ptr, "asdf", 4, -5, 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int64_int64(ctx_ptr, "afg", 4, 0, -5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int64_int64(ctx_ptr, "", 0, 5, 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int64(ctx_ptr, "abcd", 4, 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "bcd");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = substr_utf8_int64(ctx_ptr, "abcd", 4, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "abcd");
+  EXPECT_FALSE(ctx.has_error());
+}
+
+TEST(TestStringOps, TestConcat) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<int64>(&ctx);
+  int32 out_len = 0;
+
+  char* out_str = concatOperator_utf8_utf8(ctx_ptr, "asdf", 4, "jkl", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "asdfjkl");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = concatOperator_utf8_utf8(ctx_ptr, "asdf", 4, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "asdf");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = concatOperator_utf8_utf8(ctx_ptr, "", 0, "jkl", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "jkl");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = concatOperator_utf8_utf8(ctx_ptr, "", 0, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = concatOperator_utf8_utf8(ctx_ptr, "abcd\n", 5, "a", 1, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "abcd\na");
+  EXPECT_FALSE(ctx.has_error());
+}
 }  // namespace gandiva
diff --git a/cpp/src/gandiva/precompiled/time.cc b/cpp/src/gandiva/precompiled/time.cc
index b24e015..a1792d1 100644
--- a/cpp/src/gandiva/precompiled/time.cc
+++ b/cpp/src/gandiva/precompiled/time.cc
@@ -689,4 +689,21 @@ timestamp castTIMESTAMP_utf8(int64_t context, const char* input, int32 length) {
 
 timestamp castTIMESTAMP_date64(date64 date_in_millis) { return date_in_millis; }
 
+char* castVARCHAR_timestamp_int64(int64 context, timestamp in, int64 length,
+                                  int32* out_len) {
+  std::string timestamp_str = std::to_string(in);
+  *out_len = static_cast<int32>(length);
+  int32 timestamp_str_len = static_cast<int32>(timestamp_str.length());
+  if (length > timestamp_str_len) {
+    *out_len = timestamp_str_len;
+  }
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return nullptr;
+  }
+  memcpy(ret, timestamp_str.data(), *out_len);
+  return ret;
+}
 }  // extern "C"
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 6931efa..f4fec59 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -168,6 +168,12 @@ timestamp castTIMESTAMP_date64(date64);
 
 int64 truncate_int64_int32(int64 in, int32 out_scale);
 
+char* substr_utf8_int64_int64(int64 context, const char* input, int32 in_len,
+                              int64 offset64, int64 length, int32* out_len);
+char* substr_utf8_int64(int64 context, const char* input, int32 in_len, int64 offset64,
+                        int32* out_len);
+char* concatOperator_utf8_utf8(int64 context, const char* left, int32 left_len,
+                               const char* right, int32 right_len, int32* out_len);
 }  // extern "C"
 
 #endif  // PRECOMPILED_TYPES_H