You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ra...@apache.org on 2022/02/14 13:08:36 UTC

[arrow] branch master updated: ARROW-15674: [C++][Gandiva] Like function doesn't properly handle patterns with special characters in certain cases

This is an automated email from the ASF dual-hosted git repository.

ravindra pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5f590e9  ARROW-15674: [C++][Gandiva] Like function doesn't properly handle patterns with special characters in certain cases
5f590e9 is described below

commit 5f590e9e64d880e2290dacc76ac85b4cd0d5f40a
Author: Projjal Chanda <ia...@pchanda.com>
AuthorDate: Mon Feb 14 18:35:59 2022 +0530

    ARROW-15674: [C++][Gandiva] Like function doesn't properly handle patterns with special characters in certain cases
    
    For example, the pattern 'abc-xyz%' doesn't work, whereas 'abc-xyz' does. This is because special characters are escaped so that they work with the regex matcher, but when the pattern is optimised into a starts_with/ends_with/is_substr call, these escape characters are currently not removed in the default case ('\\' escape char).
    
    Closes #12417 from projjal/fixlike
    
    Authored-by: Projjal Chanda <ia...@pchanda.com>
    Signed-off-by: Pindikura Ravindra <ra...@dremio.com>
---
 cpp/src/gandiva/like_holder.cc       |  3 ++-
 cpp/src/gandiva/like_holder_test.cc  |  6 +++---
 cpp/src/gandiva/tests/filter_test.cc | 39 ++++++++++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc
index eeb4527..3391c7e 100644
--- a/cpp/src/gandiva/like_holder.cc
+++ b/cpp/src/gandiva/like_holder.cc
@@ -32,7 +32,8 @@ std::string& RemovePatternEscapeChars(const FunctionNode& node, std::string& pat
     pattern.erase(std::remove(pattern.begin(), pattern.end(),
                               arrow::util::get<std::string>(escape_char->holder()).at(0)),
                   pattern.end());  // remove escape chars
-    return pattern;
+  } else {
+    pattern.erase(std::remove(pattern.begin(), pattern.end(), '\\'), pattern.end());
   }
   return pattern;
 }
diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc
index a2b9560..76a77542 100644
--- a/cpp/src/gandiva/like_holder_test.cc
+++ b/cpp/src/gandiva/like_holder_test.cc
@@ -142,21 +142,21 @@ TEST_F(TestLikeHolder, TestOptimise) {
   // optimise for 'is_substr with special characters'
   fnode = LikeHolder::TryOptimize(BuildLike("%ab-c%"));
   EXPECT_EQ(fnode.descriptor()->name(), "is_substr");
-  EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) 'ab\\-c')");
+  EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) 'ab-c')");
 
   // optimise for 'ends_with with special characters'
   fnode = LikeHolder::TryOptimize(BuildLike("%ab-c"));
   EXPECT_EQ(fnode.descriptor()->name(), "ends_with");
   EXPECT_EQ(fnode.ToString(),
             "bool ends_with((string) in, (const string) "
-            "'ab\\-c')");
+            "'ab-c')");
 
   // optimise for 'starts_with with special characters'
   fnode = LikeHolder::TryOptimize(BuildLike("ab-c%"));
   EXPECT_EQ(fnode.descriptor()->name(), "starts_with");
   EXPECT_EQ(fnode.ToString(),
             "bool starts_with((string) in, (const string) "
-            "'ab\\-c')");
+            "'ab-c')");
 
   // no optimisation for others.
   fnode = LikeHolder::TryOptimize(BuildLike("xyz_"));
diff --git a/cpp/src/gandiva/tests/filter_test.cc b/cpp/src/gandiva/tests/filter_test.cc
index 161b32a..54d3aa0 100644
--- a/cpp/src/gandiva/tests/filter_test.cc
+++ b/cpp/src/gandiva/tests/filter_test.cc
@@ -377,4 +377,43 @@ TEST_F(TestFilter, TestOffset) {
   EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
 }
 
+TEST_F(TestFilter, TestLike) {
+  // schema for input fields
+  auto field0 = field("f0", utf8());
+  auto schema = arrow::schema({field0});
+
+  auto node_f0 = TreeExprBuilder::MakeField(field0);
+  auto literal_pattern = TreeExprBuilder::MakeStringLiteral("abc-xyz%");
+  auto like_func =
+      TreeExprBuilder::MakeFunction("like", {node_f0, literal_pattern}, boolean());
+
+  auto condition = TreeExprBuilder::MakeCondition(like_func);
+
+  std::shared_ptr<Filter> filter;
+  auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+  EXPECT_TRUE(status.ok());
+
+  // Create a row-batch with some sample data
+  int num_records = 5;
+  auto array0 = MakeArrowArrayUtf8({"abc-xyz", "hello", "bye", "abc-x", "abc-xyzw"},
+                                   {true, true, true, true, true});
+
+  // expected output (indices for which condition matches)
+  auto exp = MakeArrowArrayUint16({0, 4});
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+  std::shared_ptr<SelectionVector> selection_vector;
+  status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+  EXPECT_TRUE(status.ok());
+
+  // Evaluate expression
+  status = filter->Evaluate(*in_batch, selection_vector);
+  EXPECT_TRUE(status.ok());
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+}
+
 }  // namespace gandiva