You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2021/06/04 06:13:25 UTC
[arrow] branch master updated: ARROW-11960: [C++][Gandiva] Support escape in LIKE
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new ca66567 ARROW-11960: [C++][Gandiva] Support escape in LIKE
ca66567 is described below
commit ca66567febd283fc635da68361a72877a23d0c26
Author: crystrix <ch...@live.com>
AuthorDate: Fri Jun 4 15:11:54 2021 +0900
ARROW-11960: [C++][Gandiva] Support escape in LIKE
Add gdv_fn_like_utf8_utf8_utf8 function in Gandiva to support escape char in LIKE. The escape char is passed as a utf8 string literal (the third parameter of 'like') whose length must not exceed one char; an empty string means that no escape char is applied.
Closes #9700 from Crystrix/arrow-11960
Authored-by: crystrix <ch...@live.com>
Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
cpp/src/gandiva/function_registry_string.cc | 4 ++
cpp/src/gandiva/gdv_function_stubs.cc | 20 +++++++
cpp/src/gandiva/gdv_function_stubs.h | 4 ++
cpp/src/gandiva/like_holder.cc | 43 +++++++++++++--
cpp/src/gandiva/like_holder.h | 3 ++
cpp/src/gandiva/like_holder_test.cc | 84 +++++++++++++++++++++++++++++
cpp/src/gandiva/tests/utf8_test.cc | 43 +++++++++++++++
7 files changed, 197 insertions(+), 4 deletions(-)
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index cbc7006..35ef2df 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -124,6 +124,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
kResultNullIfNull, "gdv_fn_like_utf8_utf8",
NativeFunction::kNeedsFunctionHolder),
+ NativeFunction("like", {}, DataTypeVector{utf8(), utf8(), utf8()}, boolean(),
+ kResultNullIfNull, "gdv_fn_like_utf8_utf8_utf8",
+ NativeFunction::kNeedsFunctionHolder),
+
NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext),
diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc
index a890775..26b8654 100644
--- a/cpp/src/gandiva/gdv_function_stubs.cc
+++ b/cpp/src/gandiva/gdv_function_stubs.cc
@@ -45,6 +45,13 @@ bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
return (*holder)(std::string(data, data_len));
}
+bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len,
+ const char* pattern, int pattern_len,
+ const char* escape_char, int escape_char_len) {
+ gandiva::LikeHolder* holder = reinterpret_cast<gandiva::LikeHolder*>(ptr);
+ return (*holder)(std::string(data, data_len));
+}
+
double gdv_fn_random(int64_t ptr) {
gandiva::RandomGeneratorHolder* holder =
reinterpret_cast<gandiva::RandomGeneratorHolder*>(ptr);
@@ -732,6 +739,19 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
types->i1_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_like_utf8_utf8));
+ // gdv_fn_like_utf8_utf8_utf8
+ args = {types->i64_type(), // int64_t ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type(), // int data_len
+ types->i8_ptr_type(), // const char* pattern
+ types->i32_type(), // int pattern_len
+ types->i8_ptr_type(), // const char* escape_char
+ types->i32_type()}; // int escape_char_len
+
+ engine->AddGlobalMappingForFunc("gdv_fn_like_utf8_utf8_utf8",
+ types->i1_type() /*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_like_utf8_utf8_utf8));
+
// gdv_fn_to_date_utf8_utf8
args = {types->i64_type(), // int64_t execution_context
types->i64_type(), // int64_t holder_ptr
diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h
index 847772b..d4a127d 100644
--- a/cpp/src/gandiva/gdv_function_stubs.h
+++ b/cpp/src/gandiva/gdv_function_stubs.h
@@ -46,6 +46,10 @@ using gdv_day_time_interval = int64_t;
bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
const char* pattern, int pattern_len);
+bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len,
+ const char* pattern, int pattern_len,
+ const char* escape_char, int escape_char_len);
+
int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data,
int data_len, bool in1_validity,
const char* pattern, int pattern_len,
diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc
index 688a4ff..5a3510e 100644
--- a/cpp/src/gandiva/like_holder.cc
+++ b/cpp/src/gandiva/like_holder.cc
@@ -67,8 +67,8 @@ static bool IsArrowStringLiteral(arrow::Type::type type) {
}
Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr<LikeHolder>* holder) {
- ARROW_RETURN_IF(node.children().size() != 2,
- Status::Invalid("'like' function requires two parameters"));
+ ARROW_RETURN_IF(node.children().size() != 2 && node.children().size() != 3,
+ Status::Invalid("'like' function requires two or three parameters"));
auto literal = dynamic_cast<LiteralNode*>(node.children().at(1).get());
ARROW_RETURN_IF(
@@ -80,8 +80,22 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr<LikeHolder>* h
!IsArrowStringLiteral(literal_type),
Status::Invalid(
"'like' function requires a string literal as the second parameter"));
-
- return Make(arrow::util::get<std::string>(literal->holder()), holder);
+ if (node.children().size() == 2) {
+ return Make(arrow::util::get<std::string>(literal->holder()), holder);
+ } else {
+ auto escape_char = dynamic_cast<LiteralNode*>(node.children().at(2).get());
+ ARROW_RETURN_IF(
+ escape_char == nullptr,
+ Status::Invalid("'like' function requires a literal as the third parameter"));
+
+ auto escape_char_type = escape_char->return_type()->id();
+ ARROW_RETURN_IF(
+ !IsArrowStringLiteral(escape_char_type),
+ Status::Invalid(
+ "'like' function requires a string literal as the third parameter"));
+ return Make(arrow::util::get<std::string>(literal->holder()),
+ arrow::util::get<std::string>(escape_char->holder()), holder);
+ }
}
Status LikeHolder::Make(const std::string& sql_pattern,
@@ -97,4 +111,25 @@ Status LikeHolder::Make(const std::string& sql_pattern,
return Status::OK();
}
+Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char,
+ std::shared_ptr<LikeHolder>* holder) {
+ ARROW_RETURN_IF(escape_char.length() > 1,
+ Status::Invalid("The length of escape char ", escape_char,
+ " in 'like' function is greater than 1"));
+ std::string pcre_pattern;
+ if (escape_char.length() == 1) {
+ ARROW_RETURN_NOT_OK(
+ RegexUtil::SqlLikePatternToPcre(sql_pattern, escape_char.at(0), pcre_pattern));
+ } else {
+ ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern));
+ }
+
+ auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern));
+ ARROW_RETURN_IF(!lholder->regex_.ok(),
+ Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
+
+ *holder = lholder;
+ return Status::OK();
+}
+
} // namespace gandiva
diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h
index 82c9e3b..c7982e9 100644
--- a/cpp/src/gandiva/like_holder.h
+++ b/cpp/src/gandiva/like_holder.h
@@ -39,6 +39,9 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder {
static Status Make(const std::string& sql_pattern, std::shared_ptr<LikeHolder>* holder);
+ static Status Make(const std::string& sql_pattern, const std::string& escape_char,
+ std::shared_ptr<LikeHolder>* holder);
+
// Try and optimise a function node with a "like" pattern.
static const FunctionNode TryOptimize(const FunctionNode& node);
diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc
index ce6697e..18e585f 100644
--- a/cpp/src/gandiva/like_holder_test.cc
+++ b/cpp/src/gandiva/like_holder_test.cc
@@ -33,6 +33,16 @@ class TestLikeHolder : public ::testing::Test {
std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
return FunctionNode("like", {field, pattern_node}, arrow::boolean());
}
+
+ FunctionNode BuildLike(std::string pattern, char escape_char) {
+ auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+ auto pattern_node =
+ std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
+ auto escape_char_node = std::make_shared<LiteralNode>(
+ arrow::int8(), LiteralHolder((int8_t)escape_char), false);
+ return FunctionNode("like", {field, pattern_node, escape_char_node},
+ arrow::boolean());
+ }
};
TEST_F(TestLikeHolder, TestMatchAny) {
@@ -125,6 +135,80 @@ TEST_F(TestLikeHolder, TestOptimise) {
fnode = LikeHolder::TryOptimize(BuildLike("x_yz%"));
EXPECT_EQ(fnode.descriptor()->name(), "like");
+
+ // no optimisation for escaped pattern.
+ fnode = LikeHolder::TryOptimize(BuildLike("\\%xyz", '\\'));
+ EXPECT_EQ(fnode.descriptor()->name(), "like");
+ EXPECT_EQ(fnode.ToString(),
+ "bool like((string) in, (const string) \\%xyz, (const int8) \\)");
+}
+
+TEST_F(TestLikeHolder, TestMatchOneEscape) {
+ std::shared_ptr<LikeHolder> like_holder;
+
+ auto status = LikeHolder::Make("ab\\_", "\\", &like_holder);
+ EXPECT_EQ(status.ok(), true) << status.message();
+
+ auto& like = *like_holder;
+
+ EXPECT_TRUE(like("ab_"));
+
+ EXPECT_FALSE(like("abc"));
+ EXPECT_FALSE(like("abd"));
+ EXPECT_FALSE(like("a"));
+ EXPECT_FALSE(like("abcd"));
+ EXPECT_FALSE(like("dabc"));
+}
+
+TEST_F(TestLikeHolder, TestMatchManyEscape) {
+ std::shared_ptr<LikeHolder> like_holder;
+
+ auto status = LikeHolder::Make("ab\\%", "\\", &like_holder);
+ EXPECT_EQ(status.ok(), true) << status.message();
+
+ auto& like = *like_holder;
+
+ EXPECT_TRUE(like("ab%"));
+
+ EXPECT_FALSE(like("abc"));
+ EXPECT_FALSE(like("abd"));
+ EXPECT_FALSE(like("a"));
+ EXPECT_FALSE(like("abcd"));
+ EXPECT_FALSE(like("dabc"));
+}
+
+TEST_F(TestLikeHolder, TestMatchEscape) {
+ std::shared_ptr<LikeHolder> like_holder;
+
+ auto status = LikeHolder::Make("ab\\\\", "\\", &like_holder);
+ EXPECT_EQ(status.ok(), true) << status.message();
+
+ auto& like = *like_holder;
+
+ EXPECT_TRUE(like("ab\\"));
+
+ EXPECT_FALSE(like("abc"));
}
+TEST_F(TestLikeHolder, TestEmptyEscapeChar) {
+ std::shared_ptr<LikeHolder> like_holder;
+
+ auto status = LikeHolder::Make("ab\\_", "", &like_holder);
+ EXPECT_EQ(status.ok(), true) << status.message();
+
+ auto& like = *like_holder;
+
+ EXPECT_TRUE(like("ab\\c"));
+ EXPECT_TRUE(like("ab\\_"));
+
+ EXPECT_FALSE(like("ab\\_d"));
+ EXPECT_FALSE(like("ab__"));
+}
+
+TEST_F(TestLikeHolder, TestMultipleEscapeChar) {
+ std::shared_ptr<LikeHolder> like_holder;
+
+ auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder);
+ EXPECT_EQ(status.ok(), false) << status.message();
+}
} // namespace gandiva
diff --git a/cpp/src/gandiva/tests/utf8_test.cc b/cpp/src/gandiva/tests/utf8_test.cc
index 29ce81f..01e62a5 100644
--- a/cpp/src/gandiva/tests/utf8_test.cc
+++ b/cpp/src/gandiva/tests/utf8_test.cc
@@ -221,6 +221,49 @@ TEST_F(TestUtf8, TestLike) {
EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
}
+TEST_F(TestUtf8, TestLikeWithEscape) {
+ // schema for input fields
+ auto field_a = field("a", utf8());
+ auto schema = arrow::schema({field_a});
+
+ // output fields
+ auto res = field("res", boolean());
+
+ // build expressions.
+ // like(a, literal(s), '\')
+
+ auto node_a = TreeExprBuilder::MakeField(field_a);
+ auto literal_s = TreeExprBuilder::MakeStringLiteral("%pa\\%rk%");
+ auto escape_char = TreeExprBuilder::MakeStringLiteral("\\");
+ auto is_like =
+ TreeExprBuilder::MakeFunction("like", {node_a, literal_s, escape_char}, boolean());
+ auto expr = TreeExprBuilder::MakeExpression(is_like, res);
+
+ // Build a projector for the expressions.
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ // Create a row-batch with some sample data
+ int num_records = 4;
+ auto array_a = MakeArrowArrayUtf8(
+ {"park", "spa%rkle", "bright spa%rk and fire", "spark"}, {true, true, true, true});
+
+ // expected output
+ auto exp = MakeArrowArrayBool({false, true, true, false}, {true, true, true, true});
+
+ // prepare input record batch
+ auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
+
+ // Evaluate expression
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in_batch, pool_, &outputs);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ // Validate results
+ EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
+}
+
TEST_F(TestUtf8, TestBeginsEnds) {
// schema for input fields
auto field_a = field("a", utf8());