You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2023/01/22 08:33:51 UTC

[arrow] branch master updated: GH-33723: [C++] re2::RE2::RE2() result must be checked (#33806)

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new cb0f0178da GH-33723: [C++] re2::RE2::RE2() result must be checked (#33806)
cb0f0178da is described below

commit cb0f0178dacd922f355255a16c85820fb4e390c8
Author: Jin Shang <sh...@gmail.com>
AuthorDate: Sun Jan 22 16:33:45 2023 +0800

    GH-33723: [C++] re2::RE2::RE2() result must be checked (#33806)
    
    
    
    ### Rationale for this change
    
    RE2 construction needs to be checked.
    
    ### What changes are included in this PR?
    
    Check all RE2 object status.
    
    ### Are these changes tested?
    
    Tested with existing tests.
    
    ### Are there any user-facing changes?
    
    No.
    * Closes: #33723
    
    Authored-by: Jin Shang <sh...@gmail.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 .../arrow/compute/kernels/scalar_string_ascii.cc   | 29 ++++++++++--
 .../arrow/compute/kernels/scalar_string_test.cc    | 11 +++++
 cpp/src/gandiva/regex_functions_holder.cc          | 53 +++++++++++++++-------
 cpp/src/gandiva/regex_functions_holder.h           |  4 --
 4 files changed, 71 insertions(+), 26 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
index d3d0ac3201..d36ef26d9a 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
@@ -18,6 +18,7 @@
 #include <algorithm>
 #include <cctype>
 #include <iterator>
+#include <memory>
 #include <string>
 
 #ifdef ARROW_WITH_RE2
@@ -26,6 +27,8 @@
 
 #include "arrow/array/builder_nested.h"
 #include "arrow/compute/kernels/scalar_string_internal.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
 #include "arrow/util/string.h"
 #include "arrow/util/value_parsing.h"
 
@@ -1505,6 +1508,13 @@ struct MatchLike {
     static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)", kRE2Options);
     // A LIKE pattern matching this regex can be translated into a suffix search.
     static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))", kRE2Options);
+    static bool global_checked = false;
+    if (ARROW_PREDICT_FALSE(!global_checked)) {
+      RETURN_NOT_OK(RegexStatus(kLikePatternIsSubstringMatch));
+      RETURN_NOT_OK(RegexStatus(kLikePatternIsStartsWith));
+      RETURN_NOT_OK(RegexStatus(kLikePatternIsEndsWith));
+      global_checked = true;
+    }
 
     auto original_options = MatchSubstringState::Get(ctx);
     auto original_state = ctx->state();
@@ -1669,14 +1679,21 @@ struct FindSubstring {
 struct FindSubstringRegex {
   std::unique_ptr<RE2> regex_match_;
 
+  static Result<FindSubstringRegex> Make(const MatchSubstringOptions& options,
+                                         bool is_utf8 = true, bool literal = false) {
+    auto matcher = FindSubstringRegex(options, is_utf8, literal);
+    RETURN_NOT_OK(RegexStatus(*matcher.regex_match_));
+    return std::move(matcher);
+  }
+
   explicit FindSubstringRegex(const MatchSubstringOptions& options, bool is_utf8 = true,
                               bool literal = false) {
     std::string regex = "(";
     regex.reserve(options.pattern.length() + 2);
     regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;
     regex += ")";
-    regex_match_.reset(
-        new RE2(regex, MakeRE2Options(is_utf8, options.ignore_case, /*literal=*/false)));
+    regex_match_ = std::make_unique<RE2>(
+        regex, MakeRE2Options(is_utf8, options.ignore_case, /*literal=*/false));
   }
 
   template <typename OutValue, typename... Ignored>
@@ -1698,9 +1715,10 @@ struct FindSubstringExec {
     const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
     if (options.ignore_case) {
 #ifdef ARROW_WITH_RE2
+      ARROW_ASSIGN_OR_RAISE(auto matcher,
+                            FindSubstringRegex::Make(options, InputType::is_utf8, true));
       applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
-          kernel{FindSubstringRegex(options, /*is_utf8=*/InputType::is_utf8,
-                                    /*literal=*/true)};
+          kernel{std::move(matcher)};
       return kernel.Exec(ctx, batch, out);
 #else
       return Status::NotImplemented("ignore_case requires RE2");
@@ -1725,8 +1743,9 @@ struct FindSubstringRegexExec {
   using OffsetType = typename TypeTraits<InputType>::OffsetType;
   static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
     const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+    ARROW_ASSIGN_OR_RAISE(auto matcher, FindSubstringRegex::Make(options, false));
     applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
-        kernel{FindSubstringRegex(options, /*literal=*/false)};
+        kernel{std::move(matcher)};
     return kernel.Exec(ctx, batch, out);
   }
 };
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index ad71c23f2e..ba43191fe0 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -28,6 +28,7 @@
 #endif
 
 #include "arrow/compute/api_scalar.h"
+#include "arrow/compute/exec.h"
 #include "arrow/compute/kernels/codegen_internal.h"
 #include "arrow/compute/kernels/test_util.h"
 #include "arrow/testing/gtest_util.h"
@@ -494,6 +495,16 @@ TYPED_TEST(TestBaseBinaryKernels, FindSubstringRegex) {
   this->CheckUnary("find_substring_regex", R"(["a", "A", "baaa", null, "", "AaaA"])",
                    this->offset_type(), "[0, 0, 1, null, -1, 0]", &options);
 }
+
+TYPED_TEST(TestBaseBinaryKernels, FindSubstringRegexWrongPattern) {
+  MatchSubstringOptions options{"(a", /*ignore_case=*/false};
+
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("Invalid regular expression"),
+      CallFunction("find_substring_regex",
+                   {Datum(R"(["a", "A", "baaa", null, "", "AaaA"])")}, &options));
+}
+
 #else
 TYPED_TEST(TestBaseBinaryKernels, FindSubstringIgnoreCase) {
   MatchSubstringOptions options{"a+", /*ignore_case=*/true};
diff --git a/cpp/src/gandiva/regex_functions_holder.cc b/cpp/src/gandiva/regex_functions_holder.cc
index f2be7bd27e..30d68cbc87 100644
--- a/cpp/src/gandiva/regex_functions_holder.cc
+++ b/cpp/src/gandiva/regex_functions_holder.cc
@@ -17,15 +17,12 @@
 
 #include "gandiva/regex_functions_holder.h"
 #include <regex>
+#include "arrow/util/macros.h"
 #include "gandiva/node.h"
 #include "gandiva/regex_util.h"
 
 namespace gandiva {
 
-RE2 LikeHolder::starts_with_regex_(R"(([^\.\*])*\.\*)");
-RE2 LikeHolder::ends_with_regex_(R"(\.\*([^\.\*])*)");
-RE2 LikeHolder::is_substr_regex_(R"(\.\*([^\.\*])*\.\*)");
-
 std::string& RemovePatternEscapeChars(const FunctionNode& node, std::string& pattern) {
   pattern.erase(std::remove(pattern.begin(), pattern.end(), '\\'), pattern.end());
   return pattern;
@@ -34,27 +31,44 @@ std::string& RemovePatternEscapeChars(const FunctionNode& node, std::string& pat
 // Short-circuit pattern matches for the following common sub cases :
 // - starts_with, ends_with and is_substr
 const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) {
+  // NOTE: avoid making those constants global to avoid compiling regexes at startup
+  // pre-compiled pattern for matching starts_with
+  static const RE2 starts_with_regex(R"(([^\.\*])*\.\*)");
+  // pre-compiled pattern for matching ends_with
+  static const RE2 ends_with_regex(R"(\.\*([^\.\*])*)");
+  // pre-compiled pattern for matching is_substr
+  static const RE2 is_substr_regex(R"(\.\*([^\.\*])*\.\*)");
+
+  static bool global_checked = false;
+  if (ARROW_PREDICT_FALSE(!global_checked)) {
+    if (ARROW_PREDICT_FALSE(
+            !(starts_with_regex.ok() && ends_with_regex.ok() && is_substr_regex.ok()))) {
+      return node;
+    }
+    global_checked = true;
+  }
+
   std::shared_ptr<LikeHolder> holder;
   auto status = Make(node, &holder);
   if (status.ok()) {
     std::string& pattern = holder->pattern_;
     auto literal_type = node.children().at(1)->return_type();
 
-    if (RE2::FullMatch(pattern, starts_with_regex_)) {
+    if (RE2::FullMatch(pattern, starts_with_regex)) {
       auto prefix = pattern.substr(0, pattern.length() - 2);  // trim .*
       auto parsed_prefix = RemovePatternEscapeChars(node, prefix);
       auto prefix_node = std::make_shared<LiteralNode>(
           literal_type, LiteralHolder(parsed_prefix), false);
       return FunctionNode("starts_with", {node.children().at(0), prefix_node},
                           node.return_type());
-    } else if (RE2::FullMatch(pattern, ends_with_regex_)) {
+    } else if (RE2::FullMatch(pattern, ends_with_regex)) {
       auto suffix = pattern.substr(2);  // skip .*
       auto parsed_suffix = RemovePatternEscapeChars(node, suffix);
       auto suffix_node = std::make_shared<LiteralNode>(
           literal_type, LiteralHolder(parsed_suffix), false);
       return FunctionNode("ends_with", {node.children().at(0), suffix_node},
                           node.return_type());
-    } else if (RE2::FullMatch(pattern, is_substr_regex_)) {
+    } else if (RE2::FullMatch(pattern, is_substr_regex)) {
       auto substr =
           pattern.substr(2, pattern.length() - 4);  // trim starting and ending .*
       auto parsed_substr = RemovePatternEscapeChars(node, substr);
@@ -115,9 +129,10 @@ Status LikeHolder::Make(const std::string& sql_pattern,
 
   auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern));
   ARROW_RETURN_IF(!lholder->regex_.ok(),
-                  Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
+                  Status::Invalid("Building RE2 pattern '", pcre_pattern,
+                                  "' failed with: ", lholder->regex_.error()));
 
-  *holder = lholder;
+  *holder = std::move(lholder);
   return Status::OK();
 }
 
@@ -136,9 +151,10 @@ Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escap
 
   auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern));
   ARROW_RETURN_IF(!lholder->regex_.ok(),
-                  Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
+                  Status::Invalid("Building RE2 pattern '", pcre_pattern,
+                                  "' failed with: ", lholder->regex_.error()));
 
-  *holder = lholder;
+  *holder = std::move(lholder);
   return Status::OK();
 }
 
@@ -151,9 +167,10 @@ Status LikeHolder::Make(const std::string& sql_pattern,
   lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern, regex_op));
 
   ARROW_RETURN_IF(!lholder->regex_.ok(),
-                  Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
+                  Status::Invalid("Building RE2 pattern '", pcre_pattern,
+                                  "' failed with: ", lholder->regex_.error()));
 
-  *holder = lholder;
+  *holder = std::move(lholder);
   return Status::OK();
 }
 
@@ -180,9 +197,10 @@ Status ReplaceHolder::Make(const std::string& sql_pattern,
                            std::shared_ptr<ReplaceHolder>* holder) {
   auto lholder = std::shared_ptr<ReplaceHolder>(new ReplaceHolder(sql_pattern));
   ARROW_RETURN_IF(!lholder->regex_.ok(),
-                  Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed"));
+                  Status::Invalid("Building RE2 pattern '", sql_pattern,
+                                  "' failed with: ", lholder->regex_.error()));
 
-  *holder = lholder;
+  *holder = std::move(lholder);
   return Status::OK();
 }
 
@@ -210,9 +228,10 @@ Status ExtractHolder::Make(const std::string& sql_pattern,
                            std::shared_ptr<ExtractHolder>* holder) {
   auto lholder = std::shared_ptr<ExtractHolder>(new ExtractHolder(sql_pattern));
   ARROW_RETURN_IF(!lholder->regex_.ok(),
-                  Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed"));
+                  Status::Invalid("Building RE2 pattern '", sql_pattern,
+                                  "' failed with: ", lholder->regex_.error()));
 
-  *holder = lholder;
+  *holder = std::move(lholder);
   return Status::OK();
 }
 
diff --git a/cpp/src/gandiva/regex_functions_holder.h b/cpp/src/gandiva/regex_functions_holder.h
index 8e0fe44269..ecf4095f3d 100644
--- a/cpp/src/gandiva/regex_functions_holder.h
+++ b/cpp/src/gandiva/regex_functions_holder.h
@@ -59,10 +59,6 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder {
 
   std::string pattern_;  // posix pattern string, to help debugging
   RE2 regex_;            // compiled regex for the pattern
-
-  static RE2 starts_with_regex_;  // pre-compiled pattern for matching starts_with
-  static RE2 ends_with_regex_;    // pre-compiled pattern for matching ends_with
-  static RE2 is_substr_regex_;    // pre-compiled pattern for matching is_substr
 };
 
 /// Function Holder for 'replace'