You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2021/06/07 16:07:40 UTC

[arrow] branch master updated: ARROW-12969: [C++] Fix match_substring with empty haystack

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new e6d632e  ARROW-12969: [C++] Fix match_substring with empty haystack
e6d632e is described below

commit e6d632ed2f7f97445a25a61bff8fe9dd126e550e
Author: David Li <li...@gmail.com>
AuthorDate: Mon Jun 7 18:06:37 2021 +0200

    ARROW-12969: [C++] Fix match_substring with empty haystack
    
    Closes #10453 from lidavidm/arrow-12969
    
    Authored-by: David Li <li...@gmail.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/compute/kernels/scalar_string.cc     | 34 +++++++++++++---------
 .../arrow/compute/kernels/scalar_string_test.cc    | 10 +++++++
 2 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 1d87bd8..9db16e2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -475,6 +475,7 @@ struct PlainSubstringMatcher {
     const auto pattern_length = options_.pattern.size();
     int64_t pattern_pos = 0;
     int64_t pos = 0;
+    if (pattern_length == 0) return 0;
     for (const auto c : current) {
       while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) {
         pattern_pos = prefix_table[pattern_pos];
@@ -737,12 +738,14 @@ struct FindSubstring {
 };
 
 template <typename InputType>
-Status FindSubstringExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-  using offset_type = typename TypeTraits<InputType>::OffsetType;
-  applicator::ScalarUnaryNotNullStateful<offset_type, InputType, FindSubstring> kernel{
-      FindSubstring(PlainSubstringMatcher(MatchSubstringState::Get(ctx)))};
-  return kernel.Exec(ctx, batch, out);
-}
+struct FindSubstringExec {
+  using OffsetType = typename TypeTraits<InputType>::OffsetType;
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstring> kernel{
+        FindSubstring(PlainSubstringMatcher(MatchSubstringState::Get(ctx)))};
+    return kernel.Exec(ctx, batch, out);
+  }
+};
 
 const FunctionDoc find_substring_doc(
     "Find first occurrence of substring",
@@ -754,14 +757,17 @@ const FunctionDoc find_substring_doc(
 void AddFindSubstring(FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
                                                &find_substring_doc);
-  DCHECK_OK(func->AddKernel({binary()}, int32(), FindSubstringExec<BinaryType>,
-                            MatchSubstringState::Init));
-  DCHECK_OK(func->AddKernel({utf8()}, int32(), FindSubstringExec<StringType>,
-                            MatchSubstringState::Init));
-  DCHECK_OK(func->AddKernel({large_binary()}, int64(), FindSubstringExec<LargeBinaryType>,
-                            MatchSubstringState::Init));
-  DCHECK_OK(func->AddKernel({large_utf8()}, int64(), FindSubstringExec<LargeStringType>,
-                            MatchSubstringState::Init));
+  for (const auto& ty : BaseBinaryTypes()) {
+    std::shared_ptr<DataType> offset_type;
+    if (ty->id() == Type::type::LARGE_BINARY || ty->id() == Type::type::LARGE_STRING) {
+      offset_type = int64();
+    } else {
+      offset_type = int32();
+    }
+    DCHECK_OK(func->AddKernel({ty}, offset_type,
+                              GenerateTypeAgnosticVarBinaryBase<FindSubstringExec>(ty),
+                              MatchSubstringState::Init));
+  }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index fe06981..bd5c8ee 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -91,6 +91,10 @@ TYPED_TEST(TestBinaryKernels, FindSubstring) {
   MatchSubstringOptions options_double_char_2{"bbcaa"};
   this->CheckUnary("find_substring", R"(["abcbaabbbcaabccabaab"])", this->offset_type(),
                    "[7]", &options_double_char_2);
+
+  MatchSubstringOptions options_empty{""};
+  this->CheckUnary("find_substring", R"(["", "a", null])", this->offset_type(),
+                   "[0, 0, null]", &options_empty);
 }
 
 template <typename TestType>
@@ -391,6 +395,12 @@ TYPED_TEST(TestStringKernels, MatchSubstring) {
   MatchSubstringOptions options_double_char_2{"bbcaa"};
   this->CheckUnary("match_substring", R"(["abcbaabbbcaabccabaab"])", boolean(), "[true]",
                    &options_double_char_2);
+
+  MatchSubstringOptions options_empty{""};
+  this->CheckUnary("match_substring", "[]", boolean(), "[]", &options);
+  this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac", "AB", ""])",
+                   boolean(), "[true, true, true, null, true, true, true]",
+                   &options_empty);
 }
 
 #ifdef ARROW_WITH_RE2