You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/12/18 08:12:24 UTC

[GitHub] [arrow] maartenbreddels commented on a change in pull request #8468: ARROW-10306: [C++] Add string replacement kernel

maartenbreddels commented on a change in pull request #8468:
URL: https://github.com/apache/arrow/pull/8468#discussion_r545637801



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1194,6 +1198,197 @@ void AddSplit(FunctionRegistry* registry) {
 #endif
 }
 
+// ----------------------------------------------------------------------
+// replace substring
+
+// CRTP base shared by the "replace substring" kernels.
+//
+// `Type` is the Arrow string/binary type being processed and `Derived` is the
+// concrete replacer. As used below, `Derived` must be constructible as
+// `Derived(ctx, State::Get(ctx))` and must provide a
+// `ReplaceString(util::string_view, ValueDataBuilder*)` member returning a
+// Status, which appends the transformed bytes of one input string to the
+// builder.
+template <typename Type, typename Derived>
+struct ReplaceSubStringBase {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ScalarType = typename TypeTraits<Type>::ScalarType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+  using offset_type = typename Type::offset_type;
+  using ValueDataBuilder = TypedBufferBuilder<uint8_t>;
+  using OffsetBuilder = TypedBufferBuilder<offset_type>;
+  using State = OptionsWrapper<ReplaceSubstringOptions>;
+
+  // Kernel entry point: builds a `Derived` from the kernel options and runs the
+  // replacement. The status check skips execution when the context already
+  // carries an error after `Derived` has been constructed.
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Derived derived(ctx, State::Get(ctx));
+    if (ctx->status().ok()) {
+      derived.Replace(ctx, batch, out);
+    }
+  }
+
+  // Applies `Derived::ReplaceString` to every valid string of `batch[0]`
+  // (array or scalar) and stores the result in `out`. Errors are reported
+  // through `ctx`.
+  //
+  // For array input only the offsets (buffers[1]) and value data (buffers[2])
+  // of the output are written here.
+  // NOTE(review): the validity bitmap is presumably set up by the kernel
+  // framework before this runs -- confirm against the kernel registration.
+  void Replace(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // The builders never escape this function, so they live on the stack.
+    ValueDataBuilder value_data_builder;
+    OffsetBuilder offset_builder;
+
+    if (batch[0].kind() == Datum::ARRAY) {
+      const ArrayData& input = *batch[0].array();
+      // We already know how many strings we have, so we can use
+      // Reserve/UnsafeAppend. N strings need N + 1 offsets (the leading 0 plus
+      // one end offset per element), so reserve `length + 1` slots.
+      KERNEL_RETURN_IF_ERROR(ctx, offset_builder.Reserve(input.length + 1));
+      offset_builder.UnsafeAppend(0);  // offsets start at 0
+
+      // NOTE(review): the casts to offset_type below are unchecked; with
+      // 32-bit offsets an output larger than the offset range would be
+      // truncated unless `ReplaceString` guards against it -- confirm.
+      KERNEL_RETURN_IF_ERROR(
+          ctx, VisitArrayDataInline<Type>(
+                   input,
+                   [&](util::string_view s) {
+                     RETURN_NOT_OK(static_cast<Derived&>(*this).ReplaceString(
+                         s, &value_data_builder));
+                     offset_builder.UnsafeAppend(
+                         static_cast<offset_type>(value_data_builder.length()));
+                     return Status::OK();
+                   },
+                   [&]() {
+                     // A null contributes no bytes: repeat the current end
+                     // offset so the slot has zero length.
+                     offset_builder.UnsafeAppend(
+                         static_cast<offset_type>(value_data_builder.length()));
+                     return Status::OK();
+                   }));
+      ArrayData* output = out->mutable_array();
+      KERNEL_RETURN_IF_ERROR(ctx, value_data_builder.Finish(&output->buffers[2]));
+      KERNEL_RETURN_IF_ERROR(ctx, offset_builder.Finish(&output->buffers[1]));
+    } else {
+      const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
+      // A default-constructed scalar is emitted unchanged when the input is
+      // null; only a valid input gets a value and `is_valid = true`.
+      auto result = std::make_shared<ScalarType>();
+      if (input.is_valid) {
+        util::string_view s = static_cast<util::string_view>(*input.value);
+        KERNEL_RETURN_IF_ERROR(
+            ctx, static_cast<Derived&>(*this).ReplaceString(s, &value_data_builder));
+        KERNEL_RETURN_IF_ERROR(ctx, value_data_builder.Finish(&result->value));
+        result->is_valid = true;
+      }
+      out->value = result;
+    }
+  }
+};
+
+template <typename Type>
+struct ReplaceSubString : ReplaceSubStringBase<Type, ReplaceSubString<Type>> {

Review comment:
       Maybe I misunderstand, but via `offset_type` we are not independent of `Type`, right?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org