You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pr...@apache.org on 2021/07/06 06:50:49 UTC

[arrow] branch master updated: ARROW-12567: [C++][Gandiva] Implement ILIKE SQL function

This is an automated email from the ASF dual-hosted git repository.

praveenbingo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0072c67  ARROW-12567: [C++][Gandiva] Implement ILIKE SQL function
0072c67 is described below

commit 0072c677fbbc85832fa7a90ab49daf7c1f99a373
Author: frank400 <j....@gmail.com>
AuthorDate: Tue Jul 6 12:19:05 2021 +0530

    ARROW-12567: [C++][Gandiva] Implement ILIKE SQL function
    
    Closes #10179 from jvictorhuguenin/feature/implement-sql-ilike and squashes the following commits:
    
    f160880d2 <frank400> Optimize holder constructor call
    97e6e2d83 <frank400> Remove unnecessary Make method
    c2363b10f <frank400> Disable TryOptimize for ilike
    a48414931 <frank400> Fix checkstyle on cmake file
    c6a8372cd <frank400> Delete unnecessary holder
    4be6cc611 <frank400> Fix redefined function
    b78085a14 <frank400> Fix miss include
    2efd43e2b <frank400> Implement ilike function
    
    Authored-by: frank400 <j....@gmail.com>
    Signed-off-by: Praveen <pr...@dremio.com>
---
 cpp/src/gandiva/function_holder_registry.h  |  1 +
 cpp/src/gandiva/function_registry_string.cc |  4 ++
 cpp/src/gandiva/gdv_function_stubs.cc       | 17 +++++++
 cpp/src/gandiva/gdv_function_stubs.h        |  3 ++
 cpp/src/gandiva/like_holder.cc              | 21 ++++++++
 cpp/src/gandiva/like_holder.h               |  6 +++
 cpp/src/gandiva/like_holder_test.cc         | 75 +++++++++++++++++++++++++++--
 7 files changed, 123 insertions(+), 4 deletions(-)

diff --git a/cpp/src/gandiva/function_holder_registry.h b/cpp/src/gandiva/function_holder_registry.h
index e1c5630..225c732 100644
--- a/cpp/src/gandiva/function_holder_registry.h
+++ b/cpp/src/gandiva/function_holder_registry.h
@@ -62,6 +62,7 @@ class FunctionHolderRegistry {
   static map_type& makers() {
     static map_type maker_map = {
         {"like", LAMBDA_MAKER(LikeHolder)},
+        {"ilike", LAMBDA_MAKER(LikeHolder)},
         {"to_date", LAMBDA_MAKER(ToDateHolder)},
         {"random", LAMBDA_MAKER(RandomGeneratorHolder)},
         {"rand", LAMBDA_MAKER(RandomGeneratorHolder)},
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index 90e2231..7491e44 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -131,6 +131,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
                      kResultNullIfNull, "gdv_fn_like_utf8_utf8_utf8",
                      NativeFunction::kNeedsFunctionHolder),
 
+      NativeFunction("ilike", {}, DataTypeVector{utf8(), utf8()}, boolean(),
+                     kResultNullIfNull, "gdv_fn_ilike_utf8_utf8",
+                     NativeFunction::kNeedsFunctionHolder),
+
       NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
                      kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext),
 
diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc
index 38c31a8..3c27804 100644
--- a/cpp/src/gandiva/gdv_function_stubs.cc
+++ b/cpp/src/gandiva/gdv_function_stubs.cc
@@ -52,6 +52,12 @@ bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len,
   return (*holder)(std::string(data, data_len));
 }
 
+bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len,
+                            const char* pattern, int pattern_len) {
+  gandiva::LikeHolder* holder = reinterpret_cast<gandiva::LikeHolder*>(ptr);
+  return (*holder)(std::string(data, data_len));
+}
+
 double gdv_fn_random(int64_t ptr) {
   gandiva::RandomGeneratorHolder* holder =
       reinterpret_cast<gandiva::RandomGeneratorHolder*>(ptr);
@@ -807,6 +813,17 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
                                   types->i1_type() /*return_type*/, args,
                                   reinterpret_cast<void*>(gdv_fn_like_utf8_utf8_utf8));
 
+  // gdv_fn_ilike_utf8_utf8
+  args = {types->i64_type(),     // int64_t ptr
+          types->i8_ptr_type(),  // const char* data
+          types->i32_type(),     // int data_len
+          types->i8_ptr_type(),  // const char* pattern
+          types->i32_type()};    // int pattern_len
+
+  engine->AddGlobalMappingForFunc("gdv_fn_ilike_utf8_utf8",
+                                  types->i1_type() /*return_type*/, args,
+                                  reinterpret_cast<void*>(gdv_fn_ilike_utf8_utf8));
+
   // gdv_fn_to_date_utf8_utf8
   args = {types->i64_type(),                   // int64_t execution_context
           types->i64_type(),                   // int64_t holder_ptr
diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h
index ee22c3f..043e940 100644
--- a/cpp/src/gandiva/gdv_function_stubs.h
+++ b/cpp/src/gandiva/gdv_function_stubs.h
@@ -50,6 +50,9 @@ bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len,
                                 const char* pattern, int pattern_len,
                                 const char* escape_char, int escape_char_len);
 
+bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len,
+                            const char* pattern, int pattern_len);
+
 int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data,
                                        int data_len, bool in1_validity,
                                        const char* pattern, int pattern_len,
diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc
index 5a3510e..af9ac67 100644
--- a/cpp/src/gandiva/like_holder.cc
+++ b/cpp/src/gandiva/like_holder.cc
@@ -80,6 +80,13 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr<LikeHolder>* h
       !IsArrowStringLiteral(literal_type),
       Status::Invalid(
           "'like' function requires a string literal as the second parameter"));
+
+  RE2::Options regex_op;
+  if (node.descriptor()->name() == "ilike") {
+    regex_op.set_case_sensitive(false);  // set case-insensitive for ilike function.
+
+    return Make(arrow::util::get<std::string>(literal->holder()), holder, regex_op);
+  }
   if (node.children().size() == 2) {
     return Make(arrow::util::get<std::string>(literal->holder()), holder);
   } else {
@@ -132,4 +139,18 @@ Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escap
   return Status::OK();
 }
 
+Status LikeHolder::Make(const std::string& sql_pattern,
+                        std::shared_ptr<LikeHolder>* holder, RE2::Options regex_op) {
+  std::string pcre_pattern;
+  ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern));
+
+  std::shared_ptr<LikeHolder> lholder;
+  lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern, regex_op));
+
+  ARROW_RETURN_IF(!lholder->regex_.ok(),
+                  Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
+
+  *holder = lholder;
+  return Status::OK();
+}
 }  // namespace gandiva
diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h
index c7982e9..73e5801 100644
--- a/cpp/src/gandiva/like_holder.h
+++ b/cpp/src/gandiva/like_holder.h
@@ -42,6 +42,9 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder {
   static Status Make(const std::string& sql_pattern, const std::string& escape_char,
                      std::shared_ptr<LikeHolder>* holder);
 
+  static Status Make(const std::string& sql_pattern, std::shared_ptr<LikeHolder>* holder,
+                     RE2::Options regex_op);
+
   // Try and optimise a function node with a "like" pattern.
   static const FunctionNode TryOptimize(const FunctionNode& node);
 
@@ -51,6 +54,9 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder {
  private:
   explicit LikeHolder(const std::string& pattern) : pattern_(pattern), regex_(pattern) {}
 
+  LikeHolder(const std::string& pattern, RE2::Options regex_op)
+      : pattern_(pattern), regex_(pattern, regex_op) {}
+
   std::string pattern_;  // posix pattern string, to help debugging
   RE2 regex_;            // compiled regex for the pattern
 
diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc
index 18e585f..a52533a 100644
--- a/cpp/src/gandiva/like_holder_test.cc
+++ b/cpp/src/gandiva/like_holder_test.cc
@@ -27,6 +27,7 @@ namespace gandiva {
 
 class TestLikeHolder : public ::testing::Test {
  public:
+  RE2::Options regex_op;
   FunctionNode BuildLike(std::string pattern) {
     auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
     auto pattern_node =
@@ -48,7 +49,7 @@ class TestLikeHolder : public ::testing::Test {
 TEST_F(TestLikeHolder, TestMatchAny) {
   std::shared_ptr<LikeHolder> like_holder;
 
-  auto status = LikeHolder::Make("ab%", &like_holder);
+  auto status = LikeHolder::Make("ab%", &like_holder, regex_op);
   EXPECT_EQ(status.ok(), true) << status.message();
 
   auto& like = *like_holder;
@@ -63,7 +64,7 @@ TEST_F(TestLikeHolder, TestMatchAny) {
 TEST_F(TestLikeHolder, TestMatchOne) {
   std::shared_ptr<LikeHolder> like_holder;
 
-  auto status = LikeHolder::Make("ab_", &like_holder);
+  auto status = LikeHolder::Make("ab_", &like_holder, regex_op);
   EXPECT_EQ(status.ok(), true) << status.message();
 
   auto& like = *like_holder;
@@ -78,7 +79,7 @@ TEST_F(TestLikeHolder, TestMatchOne) {
 TEST_F(TestLikeHolder, TestPcreSpecial) {
   std::shared_ptr<LikeHolder> like_holder;
 
-  auto status = LikeHolder::Make(".*ab_", &like_holder);
+  auto status = LikeHolder::Make(".*ab_", &like_holder, regex_op);
   EXPECT_EQ(status.ok(), true) << status.message();
 
   auto& like = *like_holder;
@@ -97,7 +98,7 @@ TEST_F(TestLikeHolder, TestRegexEscape) {
 TEST_F(TestLikeHolder, TestDot) {
   std::shared_ptr<LikeHolder> like_holder;
 
-  auto status = LikeHolder::Make("abc.", &like_holder);
+  auto status = LikeHolder::Make("abc.", &like_holder, regex_op);
   EXPECT_EQ(status.ok(), true) << status.message();
 
   auto& like = *like_holder;
@@ -211,4 +212,70 @@ TEST_F(TestLikeHolder, TestMultipleEscapeChar) {
   auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder);
   EXPECT_EQ(status.ok(), false) << status.message();
 }
+class TestILikeHolder : public ::testing::Test {
+ public:
+  RE2::Options regex_op;
+  FunctionNode BuildILike(std::string pattern) {
+    auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+    auto pattern_node =
+        std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
+    return FunctionNode("ilike", {field, pattern_node}, arrow::boolean());
+  }
+};
+
+TEST_F(TestILikeHolder, TestMatchAny) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  regex_op.set_case_sensitive(false);
+  auto status = LikeHolder::Make("ab%", &like_holder, regex_op);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_TRUE(like("ab"));
+  EXPECT_TRUE(like("aBc"));
+  EXPECT_TRUE(like("ABCD"));
+
+  EXPECT_FALSE(like("a"));
+  EXPECT_FALSE(like("cab"));
+}
+
+TEST_F(TestILikeHolder, TestMatchOne) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  regex_op.set_case_sensitive(false);
+  auto status = LikeHolder::Make("Ab_", &like_holder, regex_op);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_TRUE(like("abc"));
+  EXPECT_TRUE(like("aBd"));
+
+  EXPECT_FALSE(like("A"));
+  EXPECT_FALSE(like("Abcd"));
+  EXPECT_FALSE(like("DaBc"));
+}
+
+TEST_F(TestILikeHolder, TestPcreSpecial) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  regex_op.set_case_sensitive(false);
+  auto status = LikeHolder::Make(".*aB_", &like_holder, regex_op);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_TRUE(like(".*Abc"));  // . and * aren't special in sql regex
+  EXPECT_FALSE(like("xxAbc"));
+}
+
+TEST_F(TestILikeHolder, TestDot) {
+  std::shared_ptr<LikeHolder> like_holder;
+
+  regex_op.set_case_sensitive(false);
+  auto status = LikeHolder::Make("aBc.", &like_holder, regex_op);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  auto& like = *like_holder;
+  EXPECT_FALSE(like("abcd"));
+}
+
 }  // namespace gandiva