You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/07/01 12:37:07 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #7593: ARROW-9160: [C++] Implement contains for exact matches

pitrou commented on a change in pull request #7593:
URL: https://github.com/apache/arrow/pull/7593#discussion_r448326108



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -297,6 +297,116 @@ void AddAsciiLength(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// ----------------------------------------------------------------------
+// exact pattern detection
+
+using StrToBoolTransformFunc =
+    std::function<void(const void*, const uint8_t*, int64_t, int64_t, uint8_t*)>;
+
+// Apply `transform` to input character data- this function cannot change the
+// length
+template <typename Type>
+void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch,
+                         StrToBoolTransformFunc transform, Datum* out) {
+  using offset_type = typename Type::offset_type;
+
+  if (batch[0].kind() == Datum::ARRAY) {
+    const ArrayData& input = *batch[0].array();
+    ArrayData* out_arr = out->mutable_array();
+    if (input.length > 0) {
+      transform(
+          reinterpret_cast<const offset_type*>(input.buffers[1]->data()) + input.offset,
+          input.buffers[2]->data(), input.length, out_arr->offset,
+          out_arr->buffers[1]->mutable_data());
+    }
+  } else {
+    const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
+    auto result = checked_pointer_cast<BooleanScalar>(MakeNullScalar(out->type()));
+    uint8_t result_value = 0;
+    if (input.is_valid) {
+      result->is_valid = true;
+      std::array<offset_type, 2> offsets{0,
+                                         static_cast<offset_type>(input.value->size())};
+      transform(offsets.data(), input.value->data(), 1, /*output_offset=*/0,
+                &result_value);
+      out->value = std::make_shared<BooleanScalar>(result_value > 0);
+    }
+  }
+}
+
+template <typename offset_type>
+void TransformContainsExact(const uint8_t* pattern, int64_t pattern_length,
+                            const offset_type* offsets, const uint8_t* data,
+                            int64_t length, int64_t output_offset, uint8_t* output) {
+  // This is an implementation of the Knuth-Morris-Pratt algorithm
+
+  // Phase 1: Build the prefix table
+  std::vector<offset_type> prefix_table(pattern_length + 1);
+  offset_type prefix_length = -1;
+  prefix_table[0] = -1;
+  for (offset_type pos = 0; pos < pattern_length; ++pos) {
+    // The prefix cannot be expanded, reset.
+    if (prefix_length >= 0 && pattern[pos] != pattern[prefix_length]) {
+      prefix_length = prefix_table[prefix_length];
+    }
+    prefix_length++;
+    prefix_table[pos + 1] = prefix_length;
+  }
+
+  // Phase 2: Find the prefix in the data
+  FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
+  for (int64_t i = 0; i < length; ++i) {
+    const uint8_t* current_data = data + offsets[i];
+    int64_t current_length = offsets[i + 1] - offsets[i];
+
+    int64_t pattern_pos = 0;
+    for (int64_t k = 0; k < current_length; k++) {
+      if (pattern[pattern_pos] == current_data[k]) {
+        pattern_pos++;
+        if (pattern_pos == pattern_length) {
+          bitmap_writer.Set();
+          break;
+        }
+      } else {
+        pattern_pos = std::max<offset_type>(0, prefix_table[pattern_pos]);
+      }
+    }
+    bitmap_writer.Next();
+  }
+  bitmap_writer.Finish();

Review comment:
       It is me, or you're never clearing the output bit when the pattern isn't found?

##########
File path: python/pyarrow/compute.py
##########
@@ -113,6 +113,24 @@ def func(left, right):
 multiply = _simple_binary_function('multiply')
 
 
+def contains_exact(array, pattern):
+    """
+    Check whether a pattern occurs as part of the values of the array.

Review comment:
       That's rather vague. Does `contains_exact(pa.array([1,2,3]), pa.array([1,2]))` return true? I don't think so.

##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -259,6 +259,15 @@ Result<Datum> IsValid(const Datum& values, ExecContext* ctx = NULLPTR);
 ARROW_EXPORT
 Result<Datum> IsNull(const Datum& values, ExecContext* ctx = NULLPTR);
 
+// ----------------------------------------------------------------------
+// String functions
+
+struct ARROW_EXPORT ContainsExactOptions : public FunctionOptions {
+  explicit ContainsExactOptions(std::string pattern = "") : pattern(pattern) {}

Review comment:
       I don't think defaulting to the empty string makes sense here.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org