You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2023/01/22 08:33:51 UTC
[arrow] branch master updated: GH-33723: [C++] re2::RE2::RE2() result must be checked (#33806)
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new cb0f0178da GH-33723: [C++] re2::RE2::RE2() result must be checked (#33806)
cb0f0178da is described below
commit cb0f0178dacd922f355255a16c85820fb4e390c8
Author: Jin Shang <sh...@gmail.com>
AuthorDate: Sun Jan 22 16:33:45 2023 +0800
GH-33723: [C++] re2::RE2::RE2() result must be checked (#33806)
### Rationale for this change
RE2 construction needs to be checked.
### What changes are included in this PR?
Check all RE2 object status.
### Are these changes tested?
Tested with existing tests.
### Are there any user-facing changes?
No.
* Closes: #33723
Authored-by: Jin Shang <sh...@gmail.com>
Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
.../arrow/compute/kernels/scalar_string_ascii.cc | 29 ++++++++++--
.../arrow/compute/kernels/scalar_string_test.cc | 11 +++++
cpp/src/gandiva/regex_functions_holder.cc | 53 +++++++++++++++-------
cpp/src/gandiva/regex_functions_holder.h | 4 --
4 files changed, 71 insertions(+), 26 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
index d3d0ac3201..d36ef26d9a 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
@@ -18,6 +18,7 @@
#include <algorithm>
#include <cctype>
#include <iterator>
+#include <memory>
#include <string>
#ifdef ARROW_WITH_RE2
@@ -26,6 +27,8 @@
#include "arrow/array/builder_nested.h"
#include "arrow/compute/kernels/scalar_string_internal.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
#include "arrow/util/string.h"
#include "arrow/util/value_parsing.h"
@@ -1505,6 +1508,13 @@ struct MatchLike {
static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)", kRE2Options);
// A LIKE pattern matching this regex can be translated into a suffix search.
static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))", kRE2Options);
+ static bool global_checked = false;
+ if (ARROW_PREDICT_FALSE(!global_checked)) {
+ RETURN_NOT_OK(RegexStatus(kLikePatternIsSubstringMatch));
+ RETURN_NOT_OK(RegexStatus(kLikePatternIsStartsWith));
+ RETURN_NOT_OK(RegexStatus(kLikePatternIsEndsWith));
+ global_checked = true;
+ }
auto original_options = MatchSubstringState::Get(ctx);
auto original_state = ctx->state();
@@ -1669,14 +1679,21 @@ struct FindSubstring {
struct FindSubstringRegex {
std::unique_ptr<RE2> regex_match_;
+ static Result<FindSubstringRegex> Make(const MatchSubstringOptions& options,
+ bool is_utf8 = true, bool literal = false) {
+ auto matcher = FindSubstringRegex(options, is_utf8, literal);
+ RETURN_NOT_OK(RegexStatus(*matcher.regex_match_));
+ return std::move(matcher);
+ }
+
explicit FindSubstringRegex(const MatchSubstringOptions& options, bool is_utf8 = true,
bool literal = false) {
std::string regex = "(";
regex.reserve(options.pattern.length() + 2);
regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;
regex += ")";
- regex_match_.reset(
- new RE2(regex, MakeRE2Options(is_utf8, options.ignore_case, /*literal=*/false)));
+ regex_match_ = std::make_unique<RE2>(
+ regex, MakeRE2Options(is_utf8, options.ignore_case, /*literal=*/false));
}
template <typename OutValue, typename... Ignored>
@@ -1698,9 +1715,10 @@ struct FindSubstringExec {
const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
if (options.ignore_case) {
#ifdef ARROW_WITH_RE2
+ ARROW_ASSIGN_OR_RAISE(auto matcher,
+ FindSubstringRegex::Make(options, InputType::is_utf8, true));
applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
- kernel{FindSubstringRegex(options, /*is_utf8=*/InputType::is_utf8,
- /*literal=*/true)};
+ kernel{std::move(matcher)};
return kernel.Exec(ctx, batch, out);
#else
return Status::NotImplemented("ignore_case requires RE2");
@@ -1725,8 +1743,9 @@ struct FindSubstringRegexExec {
using OffsetType = typename TypeTraits<InputType>::OffsetType;
static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ ARROW_ASSIGN_OR_RAISE(auto matcher, FindSubstringRegex::Make(options, false));
applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
- kernel{FindSubstringRegex(options, /*literal=*/false)};
+ kernel{std::move(matcher)};
return kernel.Exec(ctx, batch, out);
}
};
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index ad71c23f2e..ba43191fe0 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -28,6 +28,7 @@
#endif
#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/exec.h"
#include "arrow/compute/kernels/codegen_internal.h"
#include "arrow/compute/kernels/test_util.h"
#include "arrow/testing/gtest_util.h"
@@ -494,6 +495,16 @@ TYPED_TEST(TestBaseBinaryKernels, FindSubstringRegex) {
this->CheckUnary("find_substring_regex", R"(["a", "A", "baaa", null, "", "AaaA"])",
this->offset_type(), "[0, 0, 1, null, -1, 0]", &options);
}
+
+TYPED_TEST(TestBaseBinaryKernels, FindSubstringRegexWrongPattern) {
+ MatchSubstringOptions options{"(a", /*ignore_case=*/false};
+
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid, ::testing::HasSubstr("Invalid regular expression"),
+ CallFunction("find_substring_regex",
+ {Datum(R"(["a", "A", "baaa", null, "", "AaaA"])")}, &options));
+}
+
#else
TYPED_TEST(TestBaseBinaryKernels, FindSubstringIgnoreCase) {
MatchSubstringOptions options{"a+", /*ignore_case=*/true};
diff --git a/cpp/src/gandiva/regex_functions_holder.cc b/cpp/src/gandiva/regex_functions_holder.cc
index f2be7bd27e..30d68cbc87 100644
--- a/cpp/src/gandiva/regex_functions_holder.cc
+++ b/cpp/src/gandiva/regex_functions_holder.cc
@@ -17,15 +17,12 @@
#include "gandiva/regex_functions_holder.h"
#include <regex>
+#include "arrow/util/macros.h"
#include "gandiva/node.h"
#include "gandiva/regex_util.h"
namespace gandiva {
-RE2 LikeHolder::starts_with_regex_(R"(([^\.\*])*\.\*)");
-RE2 LikeHolder::ends_with_regex_(R"(\.\*([^\.\*])*)");
-RE2 LikeHolder::is_substr_regex_(R"(\.\*([^\.\*])*\.\*)");
-
std::string& RemovePatternEscapeChars(const FunctionNode& node, std::string& pattern) {
pattern.erase(std::remove(pattern.begin(), pattern.end(), '\\'), pattern.end());
return pattern;
@@ -34,27 +31,44 @@ std::string& RemovePatternEscapeChars(const FunctionNode& node, std::string& pat
// Short-circuit pattern matches for the following common sub cases :
// - starts_with, ends_with and is_substr
const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) {
+ // NOTE: avoid making those constants global to avoid compiling regexes at startup
+ // pre-compiled pattern for matching starts_with
+ static const RE2 starts_with_regex(R"(([^\.\*])*\.\*)");
+ // pre-compiled pattern for matching ends_with
+ static const RE2 ends_with_regex(R"(\.\*([^\.\*])*)");
+ // pre-compiled pattern for matching is_substr
+ static const RE2 is_substr_regex(R"(\.\*([^\.\*])*\.\*)");
+
+ static bool global_checked = false;
+ if (ARROW_PREDICT_FALSE(!global_checked)) {
+ if (ARROW_PREDICT_FALSE(
+ !(starts_with_regex.ok() && ends_with_regex.ok() && is_substr_regex.ok()))) {
+ return node;
+ }
+ global_checked = true;
+ }
+
std::shared_ptr<LikeHolder> holder;
auto status = Make(node, &holder);
if (status.ok()) {
std::string& pattern = holder->pattern_;
auto literal_type = node.children().at(1)->return_type();
- if (RE2::FullMatch(pattern, starts_with_regex_)) {
+ if (RE2::FullMatch(pattern, starts_with_regex)) {
auto prefix = pattern.substr(0, pattern.length() - 2); // trim .*
auto parsed_prefix = RemovePatternEscapeChars(node, prefix);
auto prefix_node = std::make_shared<LiteralNode>(
literal_type, LiteralHolder(parsed_prefix), false);
return FunctionNode("starts_with", {node.children().at(0), prefix_node},
node.return_type());
- } else if (RE2::FullMatch(pattern, ends_with_regex_)) {
+ } else if (RE2::FullMatch(pattern, ends_with_regex)) {
auto suffix = pattern.substr(2); // skip .*
auto parsed_suffix = RemovePatternEscapeChars(node, suffix);
auto suffix_node = std::make_shared<LiteralNode>(
literal_type, LiteralHolder(parsed_suffix), false);
return FunctionNode("ends_with", {node.children().at(0), suffix_node},
node.return_type());
- } else if (RE2::FullMatch(pattern, is_substr_regex_)) {
+ } else if (RE2::FullMatch(pattern, is_substr_regex)) {
auto substr =
pattern.substr(2, pattern.length() - 4); // trim starting and ending .*
auto parsed_substr = RemovePatternEscapeChars(node, substr);
@@ -115,9 +129,10 @@ Status LikeHolder::Make(const std::string& sql_pattern,
auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern));
ARROW_RETURN_IF(!lholder->regex_.ok(),
- Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
+ Status::Invalid("Building RE2 pattern '", pcre_pattern,
+ "' failed with: ", lholder->regex_.error()));
- *holder = lholder;
+ *holder = std::move(lholder);
return Status::OK();
}
@@ -136,9 +151,10 @@ Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escap
auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern));
ARROW_RETURN_IF(!lholder->regex_.ok(),
- Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
+ Status::Invalid("Building RE2 pattern '", pcre_pattern,
+ "' failed with: ", lholder->regex_.error()));
- *holder = lholder;
+ *holder = std::move(lholder);
return Status::OK();
}
@@ -151,9 +167,10 @@ Status LikeHolder::Make(const std::string& sql_pattern,
lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern, regex_op));
ARROW_RETURN_IF(!lholder->regex_.ok(),
- Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
+ Status::Invalid("Building RE2 pattern '", pcre_pattern,
+ "' failed with: ", lholder->regex_.error()));
- *holder = lholder;
+ *holder = std::move(lholder);
return Status::OK();
}
@@ -180,9 +197,10 @@ Status ReplaceHolder::Make(const std::string& sql_pattern,
std::shared_ptr<ReplaceHolder>* holder) {
auto lholder = std::shared_ptr<ReplaceHolder>(new ReplaceHolder(sql_pattern));
ARROW_RETURN_IF(!lholder->regex_.ok(),
- Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed"));
+ Status::Invalid("Building RE2 pattern '", sql_pattern,
+ "' failed with: ", lholder->regex_.error()));
- *holder = lholder;
+ *holder = std::move(lholder);
return Status::OK();
}
@@ -210,9 +228,10 @@ Status ExtractHolder::Make(const std::string& sql_pattern,
std::shared_ptr<ExtractHolder>* holder) {
auto lholder = std::shared_ptr<ExtractHolder>(new ExtractHolder(sql_pattern));
ARROW_RETURN_IF(!lholder->regex_.ok(),
- Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed"));
+ Status::Invalid("Building RE2 pattern '", sql_pattern,
+ "' failed with: ", lholder->regex_.error()));
- *holder = lholder;
+ *holder = std::move(lholder);
return Status::OK();
}
diff --git a/cpp/src/gandiva/regex_functions_holder.h b/cpp/src/gandiva/regex_functions_holder.h
index 8e0fe44269..ecf4095f3d 100644
--- a/cpp/src/gandiva/regex_functions_holder.h
+++ b/cpp/src/gandiva/regex_functions_holder.h
@@ -59,10 +59,6 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder {
std::string pattern_; // posix pattern string, to help debugging
RE2 regex_; // compiled regex for the pattern
-
- static RE2 starts_with_regex_; // pre-compiled pattern for matching starts_with
- static RE2 ends_with_regex_; // pre-compiled pattern for matching ends_with
- static RE2 is_substr_regex_; // pre-compiled pattern for matching is_substr
};
/// Function Holder for 'replace'