You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ra...@apache.org on 2022/02/14 13:08:36 UTC
[arrow] branch master updated: ARROW-15674: [C++][Gandiva] Like function doesn't properly handle patterns with special characters in certain cases
This is an automated email from the ASF dual-hosted git repository.
ravindra pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5f590e9 ARROW-15674: [C++][Gandiva] Like function doesn't properly handle patterns with special characters in certain cases
5f590e9 is described below
commit 5f590e9e64d880e2290dacc76ac85b4cd0d5f40a
Author: Projjal Chanda <ia...@pchanda.com>
AuthorDate: Mon Feb 14 18:35:59 2022 +0530
ARROW-15674: [C++][Gandiva] Like function doesn't properly handle patterns with special characters in certain cases
For example, the pattern 'abc-xyz%' doesn't work, whereas 'abc-xyz' does. This is because special characters are escaped so that they work with the regex matcher, but when the pattern is optimised into a starts_with/ends_with/is_substr call, these escape characters are currently not removed in the default case ('\\' escape char).
Closes #12417 from projjal/fixlike
Authored-by: Projjal Chanda <ia...@pchanda.com>
Signed-off-by: Pindikura Ravindra <ra...@dremio.com>
---
cpp/src/gandiva/like_holder.cc | 3 ++-
cpp/src/gandiva/like_holder_test.cc | 6 +++---
cpp/src/gandiva/tests/filter_test.cc | 39 ++++++++++++++++++++++++++++++++++++
3 files changed, 44 insertions(+), 4 deletions(-)
diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc
index eeb4527..3391c7e 100644
--- a/cpp/src/gandiva/like_holder.cc
+++ b/cpp/src/gandiva/like_holder.cc
@@ -32,7 +32,8 @@ std::string& RemovePatternEscapeChars(const FunctionNode& node, std::string& pat
pattern.erase(std::remove(pattern.begin(), pattern.end(),
arrow::util::get<std::string>(escape_char->holder()).at(0)),
pattern.end()); // remove escape chars
- return pattern;
+ } else {
+ pattern.erase(std::remove(pattern.begin(), pattern.end(), '\\'), pattern.end());
}
return pattern;
}
diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc
index a2b9560..76a77542 100644
--- a/cpp/src/gandiva/like_holder_test.cc
+++ b/cpp/src/gandiva/like_holder_test.cc
@@ -142,21 +142,21 @@ TEST_F(TestLikeHolder, TestOptimise) {
// optimise for 'is_substr with special characters'
fnode = LikeHolder::TryOptimize(BuildLike("%ab-c%"));
EXPECT_EQ(fnode.descriptor()->name(), "is_substr");
- EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) 'ab\\-c')");
+ EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) 'ab-c')");
// optimise for 'ends_with with special characters'
fnode = LikeHolder::TryOptimize(BuildLike("%ab-c"));
EXPECT_EQ(fnode.descriptor()->name(), "ends_with");
EXPECT_EQ(fnode.ToString(),
"bool ends_with((string) in, (const string) "
- "'ab\\-c')");
+ "'ab-c')");
// optimise for 'starts_with with special characters'
fnode = LikeHolder::TryOptimize(BuildLike("ab-c%"));
EXPECT_EQ(fnode.descriptor()->name(), "starts_with");
EXPECT_EQ(fnode.ToString(),
"bool starts_with((string) in, (const string) "
- "'ab\\-c')");
+ "'ab-c')");
// no optimisation for others.
fnode = LikeHolder::TryOptimize(BuildLike("xyz_"));
diff --git a/cpp/src/gandiva/tests/filter_test.cc b/cpp/src/gandiva/tests/filter_test.cc
index 161b32a..54d3aa0 100644
--- a/cpp/src/gandiva/tests/filter_test.cc
+++ b/cpp/src/gandiva/tests/filter_test.cc
@@ -377,4 +377,43 @@ TEST_F(TestFilter, TestOffset) {
EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
}
+TEST_F(TestFilter, TestLike) {
+ // schema for input fields
+ auto field0 = field("f0", utf8());
+ auto schema = arrow::schema({field0});
+
+ auto node_f0 = TreeExprBuilder::MakeField(field0);
+ auto literal_pattern = TreeExprBuilder::MakeStringLiteral("abc-xyz%");
+ auto like_func =
+ TreeExprBuilder::MakeFunction("like", {node_f0, literal_pattern}, boolean());
+
+ auto condition = TreeExprBuilder::MakeCondition(like_func);
+
+ std::shared_ptr<Filter> filter;
+ auto status = Filter::Make(schema, condition, TestConfiguration(), &filter);
+ EXPECT_TRUE(status.ok());
+
+ // Create a row-batch with some sample data
+ int num_records = 5;
+ auto array0 = MakeArrowArrayUtf8({"abc-xyz", "hello", "bye", "abc-x", "abc-xyzw"},
+ {true, true, true, true, true});
+
+ // expected output (indices for which condition matches)
+ auto exp = MakeArrowArrayUint16({0, 4});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+ std::shared_ptr<SelectionVector> selection_vector;
+ status = SelectionVector::MakeInt16(num_records, pool_, &selection_vector);
+ EXPECT_TRUE(status.ok());
+
+ // Evaluate expression
+ status = filter->Evaluate(*in_batch, selection_vector);
+ EXPECT_TRUE(status.ok());
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
+}
+
} // namespace gandiva