You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2021/06/07 16:07:40 UTC
[arrow] branch master updated: ARROW-12969: [C++] Fix
match_substring with empty haystack
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e6d632e ARROW-12969: [C++] Fix match_substring with empty haystack
e6d632e is described below
commit e6d632ed2f7f97445a25a61bff8fe9dd126e550e
Author: David Li <li...@gmail.com>
AuthorDate: Mon Jun 7 18:06:37 2021 +0200
ARROW-12969: [C++] Fix match_substring with empty haystack
Closes #10453 from lidavidm/arrow-12969
Authored-by: David Li <li...@gmail.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/arrow/compute/kernels/scalar_string.cc | 34 +++++++++++++---------
.../arrow/compute/kernels/scalar_string_test.cc | 10 +++++++
2 files changed, 30 insertions(+), 14 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 1d87bd8..9db16e2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -475,6 +475,7 @@ struct PlainSubstringMatcher {
const auto pattern_length = options_.pattern.size();
int64_t pattern_pos = 0;
int64_t pos = 0;
+ if (pattern_length == 0) return 0;
for (const auto c : current) {
while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) {
pattern_pos = prefix_table[pattern_pos];
@@ -737,12 +738,14 @@ struct FindSubstring {
};
template <typename InputType>
-Status FindSubstringExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- using offset_type = typename TypeTraits<InputType>::OffsetType;
- applicator::ScalarUnaryNotNullStateful<offset_type, InputType, FindSubstring> kernel{
- FindSubstring(PlainSubstringMatcher(MatchSubstringState::Get(ctx)))};
- return kernel.Exec(ctx, batch, out);
-}
+struct FindSubstringExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstring> kernel{
+ FindSubstring(PlainSubstringMatcher(MatchSubstringState::Get(ctx)))};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
const FunctionDoc find_substring_doc(
"Find first occurrence of substring",
@@ -754,14 +757,17 @@ const FunctionDoc find_substring_doc(
void AddFindSubstring(FunctionRegistry* registry) {
auto func = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
&find_substring_doc);
- DCHECK_OK(func->AddKernel({binary()}, int32(), FindSubstringExec<BinaryType>,
- MatchSubstringState::Init));
- DCHECK_OK(func->AddKernel({utf8()}, int32(), FindSubstringExec<StringType>,
- MatchSubstringState::Init));
- DCHECK_OK(func->AddKernel({large_binary()}, int64(), FindSubstringExec<LargeBinaryType>,
- MatchSubstringState::Init));
- DCHECK_OK(func->AddKernel({large_utf8()}, int64(), FindSubstringExec<LargeStringType>,
- MatchSubstringState::Init));
+ for (const auto& ty : BaseBinaryTypes()) {
+ std::shared_ptr<DataType> offset_type;
+ if (ty->id() == Type::type::LARGE_BINARY || ty->id() == Type::type::LARGE_STRING) {
+ offset_type = int64();
+ } else {
+ offset_type = int32();
+ }
+ DCHECK_OK(func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<FindSubstringExec>(ty),
+ MatchSubstringState::Init));
+ }
DCHECK_OK(registry->AddFunction(std::move(func)));
}
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index fe06981..bd5c8ee 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -91,6 +91,10 @@ TYPED_TEST(TestBinaryKernels, FindSubstring) {
MatchSubstringOptions options_double_char_2{"bbcaa"};
this->CheckUnary("find_substring", R"(["abcbaabbbcaabccabaab"])", this->offset_type(),
"[7]", &options_double_char_2);
+
+ MatchSubstringOptions options_empty{""};
+ this->CheckUnary("find_substring", R"(["", "a", null])", this->offset_type(),
+ "[0, 0, null]", &options_empty);
}
template <typename TestType>
@@ -391,6 +395,12 @@ TYPED_TEST(TestStringKernels, MatchSubstring) {
MatchSubstringOptions options_double_char_2{"bbcaa"};
this->CheckUnary("match_substring", R"(["abcbaabbbcaabccabaab"])", boolean(), "[true]",
&options_double_char_2);
+
+ MatchSubstringOptions options_empty{""};
+ this->CheckUnary("match_substring", "[]", boolean(), "[]", &options);
+ this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac", "AB", ""])",
+ boolean(), "[true, true, true, null, true, true, true]",
+ &options_empty);
}
#ifdef ARROW_WITH_RE2