You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "felipecrv (via GitHub)" <gi...@apache.org> on 2023/04/10 14:12:28 UTC

[GitHub] [arrow] felipecrv opened a new pull request, #35003: GH-34569: [C++] Diffing of Run-End Encoded arrays

felipecrv opened a new pull request, #35003:
URL: https://github.com/apache/arrow/pull/35003

   ### What changes are included in this PR?
   
   Ability to diff run-end encoded arrays without expanding them first. This is useful for debugging and understanding of issues in Arrow.
   
   ### Are these changes tested?
   
   Yes.
   
   ### Are there any user-facing changes?
   
   No.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1370840253


##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) {
   return UnitSlice{&array, index};
 }
 
-using ValueComparator = std::function<bool(const Array&, int64_t, const Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+  virtual ~ValueComparator() = default;
+
+  /// \brief Compare the validity and values at the given indices in the base and target
+  /// arrays.
+  ///
+  /// \param base_index The index in the base array.
+  /// \param target_index The index in the target array.
+  /// \return true if the values at the given indices are equal, false otherwise.
+  /// \pre base_index and target_index are valid indices in their respective arrays.
+  virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+  /// \brief Return the run length of equal values starting at the given indices in the
+  /// base and target arrays.
+  ///
+  /// \param base_index The starting index in the base array.
+  /// \param base_length The length of the base array.
+  /// \param target_index The starting index in the target array.
+  /// \param target_length The length of the target array.
+  /// \return The run length of equal values starting at the given indices in the base
+  /// and target arrays.
+  virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                        int64_t target_index, int64_t target_length) {
+    int64_t run_length_of_equals = 0;
+    while (base_index < base_length && target_index < target_length) {
+      if (!Equals(base_index, target_index)) {
+        break;
+      }
+      base_index += 1;
+      target_index += 1;
+      run_length_of_equals += 1;
+    }
+    return run_length_of_equals;
+  }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+  const ArrayType& base;
+  const ArrayType& target;
+
+  DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+      : base(base), target(target) {}
+
+  ~DefaultValueComparator() override = default;
+
+  /// \brief Compare validity bits of base[base_index] and target[target_index]
+  ///
+  /// \return std::nullopt if the validity bits differ, otherwise a bool
+  /// containing the validity bit found in both arrays.
+  static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t base_index,
+                                              const ArrayType& target,
+                                              int64_t target_index) {
+    const bool base_valid = base.IsValid(base_index);
+    const bool target_valid = target.IsValid(target_index);
+    return base_valid ^ target_valid ? std::nullopt : std::optional<bool>{base_valid};
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const auto valid = ValidityIfEquals(base, base_index, target, target_index);
+    if (valid.has_value()) {
+      return *valid ? GetView(base, base_index) == GetView(target, target_index) : true;
+    }
+    return false;
+  }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+  const RunEndEncodedArray& base_;
+  const RunEndEncodedArray& target_;
+  std::unique_ptr<ValueComparator> inner_value_comparator_;
+  ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+  ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+  REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray& target,
+                     std::unique_ptr<ValueComparator>&& inner_value_comparator)
+      : base_(base),
+        target_(target),
+        inner_value_comparator_(std::move(inner_value_comparator)),
+        base_physical_index_finder_(*base_.data()),
+        target_physical_index_finder_(*target_.data()) {
+    DCHECK_EQ(*base_.type(), *target_.type());
+  }
+
+  ~REEValueComparator() override = default;
 
-struct ValueComparatorVisitor {
+ private:
+  /// \pre 0 <= i < base_.length()
+  inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+    return base_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  /// \pre 0 <= i < target_.length()
+  inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+    return target_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  const RunEndCType* base_run_ends() { return base_physical_index_finder_.run_ends; }
+
+  const RunEndCType* target_run_ends() { return target_physical_index_finder_.run_ends; }
+
+ public:
+  int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                int64_t target_index, int64_t target_length) override {
+    // Ensure the first search for physical index on the values arrays is safe.
+    if (base_index >= base_length || target_index >= target_length) {
+      // Without values on either side, there is no run of equal values.
+      return 0;
+    }
+
+    // Translate the two logical indices into physical indices.
+    int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+    int64_t run_length_of_equals = 0;
+    // The loop invariant (base_index < base_length && target_index < target_length)
+    // is valid when the loop starts because of the check above.
+    for (;;) {
+      const auto base_run_end =
+          static_cast<int64_t>(base_run_ends()[physical_base_index]) - base_.offset();
+      const auto target_run_end =
+          static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+          target_.offset();
+      // The end of the runs containing the logical indices, by definition, ends
+      // after the logical indices.
+      DCHECK_LT(base_index, base_run_end);
+      DCHECK_LT(target_index, target_run_end);
+
+      // Compare the physical values that make up the runs containing base_index
+      // and target_index.
+      if (!inner_value_comparator_->Equals(physical_base_index, physical_target_index)) {
+        // First difference found, stop because the run of equal values cannot
+        // be extended further.
+        break;
+      }
+
+      const int64_t base_run = std::min(base_run_end, base_length) - base_index;
+      const int64_t target_run = std::min(target_run_end, target_length) - target_index;
+      // Due to the loop-invariant (base_index < base_length && target_index <
+      // target_length) and properties of the run-ends asserted above, both base_run and
+      // target_run are strictly greater than zero.
+      DCHECK_GT(base_run, 0);
+      DCHECK_GT(target_run, 0);
+
+      int64_t increment = 0;
+      if (base_run < target_run) {
+        increment = base_run;
+        physical_base_index += 1;  // skip to next run on base
+      } else if (base_run > target_run) {
+        increment = target_run;
+        physical_target_index += 1;  // skip to next run on target
+      } else {
+        increment = base_run;
+        // skip to next run on both base and target
+        physical_base_index += 1;
+        physical_target_index += 1;
+      }
+      // Since both base_run and target_run are greater than zero,
+      // increment is also greater than zero...
+      DCHECK_GT(increment, 0);
+      // ...which implies that the loop will make progress and eventually terminate
+      // because base_index or target_index will equal base_length or target_length,
+      // respectively.
+      base_index += increment;
+      target_index += increment;
+      // The value representing the two runs are equal, so we can assume that at
+      // least `increment` (size of smallest run) values are equal.
+      run_length_of_equals += increment;
+
+      if (base_index >= base_length || target_index >= target_length) {
+        break;
+      }
+    }
+
+    return run_length_of_equals;
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    const int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+    return inner_value_comparator_->Equals(physical_base_index, physical_target_index);
+  }
+};
+
+class CreateValueComparator {
+ private:
+  std::unique_ptr<ValueComparator> comparator_;
+
+ public:
   template <typename T>
-  Status Visit(const T&) {
+  Status Visit(const T&, const Array& base, const Array& target) {
     using ArrayType = typename TypeTraits<T>::ArrayType;
-    out = [](const Array& base, int64_t base_index, const Array& target,
-             int64_t target_index) {
-      return (GetView(checked_cast<const ArrayType&>(base), base_index) ==
-              GetView(checked_cast<const ArrayType&>(target), target_index));
-    };
+    comparator_ = std::make_unique<DefaultValueComparator<ArrayType>>(
+        checked_cast<const ArrayType&>(base), checked_cast<const ArrayType&>(target));
     return Status::OK();
   }
 
-  Status Visit(const NullType&) { return Status::NotImplemented("null type"); }
+  Status Visit(const NullType&, const Array&, const Array&) {
+    return Status::NotImplemented("null type");
+  }
 
-  Status Visit(const ExtensionType&) { return Status::NotImplemented("extension type"); }
+  Status Visit(const ExtensionType&, const Array&, const Array&) {
+    return Status::NotImplemented("extension type");
+  }
 
-  Status Visit(const DictionaryType&) {
+  Status Visit(const DictionaryType&, const Array& base, const Array& target) {
     return Status::NotImplemented("dictionary type");
   }
 
-  Status Visit(const RunEndEncodedType&) {
-    return Status::NotImplemented("run-end encoded type");
+  Status Visit(const RunEndEncodedType& ree_type, const Array& base,
+               const Array& target) {
+    const auto& base_ree = checked_cast<const RunEndEncodedArray&>(base);
+    const auto& target_ree = checked_cast<const RunEndEncodedArray&>(target);
+    ARROW_ASSIGN_OR_RAISE(
+        auto inner_values_comparator,
+        (*this)(*ree_type.value_type(), *base_ree.values(), *target_ree.values()));
+    auto* ree_value_comparator =
+        ([&ree_type, &base_ree, &target_ree,
+          inner_values_comparator =
+              std::move(inner_values_comparator)]() mutable -> ValueComparator* {
+          const auto run_end_type_id = ree_type.run_end_type()->id();
+          switch (run_end_type_id) {
+            case Type::INT16:
+              return new REEValueComparator<int16_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+            case Type::INT32:
+              return new REEValueComparator<int32_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+            default:
+              DCHECK_EQ(run_end_type_id, Type::INT64);
+              return new REEValueComparator<int64_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+          }
+        }());
+    comparator_ = std::unique_ptr<ValueComparator>{ree_value_comparator};
+    return Status::OK();
   }
 
-  ValueComparator Create(const DataType& type) {
-    DCHECK_OK(VisitTypeInline(type, this));
-    return out;
+  Result<std::unique_ptr<ValueComparator>> operator()(const DataType& type,

Review Comment:
   Sure. I will do it and rename the class from `CreateValueComparator` to ` ValueComparatorFactory`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1373367436


##########
cpp/src/arrow/array/diff.cc:
##########
@@ -164,56 +382,38 @@ class QuadraticSpaceMyersDiff {
       : base_(base),
         target_(target),
         pool_(pool),
-        value_comparator_(GetValueComparator(*base.type())),
-        base_begin_(0),
         base_end_(base.length()),
-        target_begin_(0),
         target_end_(target.length()),
-        endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}),
         insert_({true}) {
-    if ((base_end_ - base_begin_ == target_end_ - target_begin_) &&
-        endpoint_base_[0] == base_end_) {
-      // trivial case: base == target
-      finish_index_ = 0;
-    }
-  }
-
-  bool ValuesEqual(int64_t base_index, int64_t target_index) const {
-    bool base_null = base_.IsNull(base_index);
-    bool target_null = target_.IsNull(target_index);
-    if (base_null || target_null) {
-      // If only one is null, then this is false, otherwise true
-      return base_null && target_null;
-    }
-    return value_comparator_(base_, base_index, target_, target_index);
+    // endpoint_base_ is initialized when Diff() is called to start the algorithm.

Review Comment:
   And since we've got a single entry point which can create value comparator once, I think it should be a member rather than being passed explicitly



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1373362658


##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
   return LogicalNullCount<int64_t>(span);
 }
 
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t i) {
+  DCHECK_LT(i, self.array_span.length);
+  const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+  DCHECK_LT(self.last_physical_index, run_ends_size);
+  // This access to self.run_ends_[last_physical_index_] is always safe because:
+  // 1. 0 <= i < array_span_.length() implies there is at least one run and the initial
+  //    value 0 will be safe to index with.
+  // 2. last_physical_index_ > 0 is always the result of a valid call to
+  //    internal::FindPhysicalIndex.
+  if (ARROW_PREDICT_TRUE(self.array_span.offset + i <
+                         self.run_ends[self.last_physical_index])) {
+    // The cached value is an upper-bound, but is it the least upper-bound?
+    if (self.last_physical_index == 0 ||
+        self.array_span.offset + i >= self.run_ends[self.last_physical_index - 1]) {
+      return self.last_physical_index;
+    }
+    // last_physical_index_ - 1 is a candidate for the least upper-bound,
+    // so search for the least upper-bound in the range that includes it.
+    const int64_t j = ree_util::internal::FindPhysicalIndex<RunEndCType>(
+        self.run_ends, /*run_ends_size=*/self.last_physical_index, i,
+        self.array_span.offset);
+    DCHECK_LT(j, self.last_physical_index);
+    return self.last_physical_index = j;

Review Comment:
   that seems fine, then



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1373522512


##########
cpp/src/arrow/array/diff.cc:
##########
@@ -164,56 +382,38 @@ class QuadraticSpaceMyersDiff {
       : base_(base),
         target_(target),
         pool_(pool),
-        value_comparator_(GetValueComparator(*base.type())),
-        base_begin_(0),
         base_end_(base.length()),
-        target_begin_(0),
         target_end_(target.length()),
-        endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}),
         insert_({true}) {
-    if ((base_end_ - base_begin_ == target_end_ - target_begin_) &&
-        endpoint_base_[0] == base_end_) {
-      // trivial case: base == target
-      finish_index_ = 0;
-    }
-  }
-
-  bool ValuesEqual(int64_t base_index, int64_t target_index) const {
-    bool base_null = base_.IsNull(base_index);
-    bool target_null = target_.IsNull(target_index);
-    if (base_null || target_null) {
-      // If only one is null, then this is false, otherwise true
-      return base_null && target_null;
-    }
-    return value_comparator_(base_, base_index, target_, target_index);
+    // endpoint_base_ is initialized when Diff() is called to start the algorithm.

Review Comment:
   Pushed a new commit doing that.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1370618960


##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
   return LogicalNullCount<int64_t>(span);
 }
 
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t i) {
+  DCHECK_LT(i, self.array_span.length);
+  const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+  DCHECK_LT(self.last_physical_index, run_ends_size);
+  // This access to self.run_ends_[last_physical_index_] is always safe because:
+  // 1. 0 <= i < array_span_.length() implies there is at least one run and the initial
+  //    value 0 will be safe to index with.
+  // 2. last_physical_index_ > 0 is always the result of a valid call to
+  //    internal::FindPhysicalIndex.
+  if (ARROW_PREDICT_TRUE(self.array_span.offset + i <
+                         self.run_ends[self.last_physical_index])) {
+    // The cached value is an upper-bound, but is it the least upper-bound?
+    if (self.last_physical_index == 0 ||
+        self.array_span.offset + i >= self.run_ends[self.last_physical_index - 1]) {
+      return self.last_physical_index;
+    }
+    // last_physical_index_ - 1 is a candidate for the least upper-bound,
+    // so search for the least upper-bound in the range that includes it.
+    const int64_t j = ree_util::internal::FindPhysicalIndex<RunEndCType>(
+        self.run_ends, /*run_ends_size=*/self.last_physical_index, i,
+        self.array_span.offset);
+    DCHECK_LT(j, self.last_physical_index);
+    return self.last_physical_index = j;

Review Comment:
   Since the return value is always `last_physical_index`, please change this function to return void and have `FindPhysicalIndexImplN` return that explicitly



##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
   return LogicalNullCount<int64_t>(span);
 }
 
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t i) {
+  DCHECK_LT(i, self.array_span.length);
+  const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+  DCHECK_LT(self.last_physical_index, run_ends_size);
+  // This access to self.run_ends_[last_physical_index_] is always safe because:
+  // 1. 0 <= i < array_span_.length() implies there is at least one run and the initial
+  //    value 0 will be safe to index with.
+  // 2. last_physical_index_ > 0 is always the result of a valid call to

Review Comment:
   Nit: please ensure the identifiers in comments are consistent with the source
   ```suggestion
     // This access to self.run_ends[last_physical_index] is always safe because:
     // 1. 0 <= i < array_span.length implies there is at least one run and the initial
     //    value 0 will be safe to index with.
     // 2. last_physical_index > 0 is always the result of a valid call to
   ```



##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
   return LogicalNullCount<int64_t>(span);
 }
 
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t i) {
+  DCHECK_LT(i, self.array_span.length);
+  const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+  DCHECK_LT(self.last_physical_index, run_ends_size);
+  // This access to self.run_ends_[last_physical_index_] is always safe because:
+  // 1. 0 <= i < array_span_.length() implies there is at least one run and the initial
+  //    value 0 will be safe to index with.
+  // 2. last_physical_index_ > 0 is always the result of a valid call to
+  //    internal::FindPhysicalIndex.
+  if (ARROW_PREDICT_TRUE(self.array_span.offset + i <
+                         self.run_ends[self.last_physical_index])) {
+    // The cached value is an upper-bound, but is it the least upper-bound?
+    if (self.last_physical_index == 0 ||
+        self.array_span.offset + i >= self.run_ends[self.last_physical_index - 1]) {
+      return self.last_physical_index;
+    }
+    // last_physical_index_ - 1 is a candidate for the least upper-bound,
+    // so search for the least upper-bound in the range that includes it.
+    const int64_t j = ree_util::internal::FindPhysicalIndex<RunEndCType>(
+        self.run_ends, /*run_ends_size=*/self.last_physical_index, i,
+        self.array_span.offset);
+    DCHECK_LT(j, self.last_physical_index);
+    return self.last_physical_index = j;
+  }
+
+  // last_physical_index_ is not an upper-bound, and the logical index i MUST be
+  // in the runs that follow it. Since i is a valid logical index, we know that at least
+  // one extra run is present.
+  DCHECK_LT(self.last_physical_index + 1, run_ends_size);
+  const int64_t lower_physical_index = self.last_physical_index + 1;

Review Comment:
   nit: I understand you mean "lower bound on physical index", but a casual reader would probably find "min" more obvious
   ```suggestion
     const int64_t min_physical_index = self.last_physical_index + 1;
   ```



##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) {
   return UnitSlice{&array, index};
 }
 
-using ValueComparator = std::function<bool(const Array&, int64_t, const Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+  virtual ~ValueComparator() = default;
+
+  /// \brief Compare the validity and values at the given indices in the base and target
+  /// arrays.
+  ///
+  /// \param base_index The index in the base array.
+  /// \param target_index The index in the target array.
+  /// \return true if the values at the given indices are equal, false otherwise.
+  /// \pre base_index and target_index are valid indices in their respective arrays.
+  virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+  /// \brief Return the run length of equal values starting at the given indices in the
+  /// base and target arrays.
+  ///
+  /// \param base_index The starting index in the base array.
+  /// \param base_length The length of the base array.
+  /// \param target_index The starting index in the target array.
+  /// \param target_length The length of the target array.
+  /// \return The run length of equal values starting at the given indices in the base
+  /// and target arrays.
+  virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                        int64_t target_index, int64_t target_length) {
+    int64_t run_length_of_equals = 0;
+    while (base_index < base_length && target_index < target_length) {
+      if (!Equals(base_index, target_index)) {
+        break;
+      }
+      base_index += 1;
+      target_index += 1;
+      run_length_of_equals += 1;
+    }
+    return run_length_of_equals;
+  }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+  const ArrayType& base;
+  const ArrayType& target;
+
+  DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+      : base(base), target(target) {}
+
+  ~DefaultValueComparator() override = default;
+
+  /// \brief Compare validity bits of base[base_index] and target[target_index]
+  ///
+  /// \return std::nullopt if the validity bits differ, otherwise a bool
+  /// containing the validity bit found in both arrays.
+  static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t base_index,
+                                              const ArrayType& target,
+                                              int64_t target_index) {
+    const bool base_valid = base.IsValid(base_index);
+    const bool target_valid = target.IsValid(target_index);
+    return base_valid ^ target_valid ? std::nullopt : std::optional<bool>{base_valid};
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const auto valid = ValidityIfEquals(base, base_index, target, target_index);
+    if (valid.has_value()) {
+      return *valid ? GetView(base, base_index) == GetView(target, target_index) : true;
+    }
+    return false;
+  }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+  const RunEndEncodedArray& base_;
+  const RunEndEncodedArray& target_;
+  std::unique_ptr<ValueComparator> inner_value_comparator_;
+  ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+  ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+  REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray& target,
+                     std::unique_ptr<ValueComparator>&& inner_value_comparator)
+      : base_(base),
+        target_(target),
+        inner_value_comparator_(std::move(inner_value_comparator)),
+        base_physical_index_finder_(*base_.data()),
+        target_physical_index_finder_(*target_.data()) {
+    DCHECK_EQ(*base_.type(), *target_.type());
+  }
+
+  ~REEValueComparator() override = default;
 
-struct ValueComparatorVisitor {
+ private:
+  /// \pre 0 <= i < base_.length()
+  inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+    return base_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  /// \pre 0 <= i < target_.length()
+  inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+    return target_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  const RunEndCType* base_run_ends() { return base_physical_index_finder_.run_ends; }
+
+  const RunEndCType* target_run_ends() { return target_physical_index_finder_.run_ends; }
+
+ public:
+  int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                int64_t target_index, int64_t target_length) override {
+    // Ensure the first search for physical index on the values arrays is safe.
+    if (base_index >= base_length || target_index >= target_length) {
+      // Without values on either side, there is no run of equal values.
+      return 0;
+    }
+
+    // Translate the two logical indices into physical indices.
+    int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+    int64_t run_length_of_equals = 0;
+    // The loop invariant (base_index < base_length && target_index < target_length)
+    // is valid when the loop starts because of the check above.
+    for (;;) {
+      const auto base_run_end =
+          static_cast<int64_t>(base_run_ends()[physical_base_index]) - base_.offset();
+      const auto target_run_end =
+          static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+          target_.offset();
+      // The end of the runs containing the logical indices, by definition, ends
+      // after the logical indices.
+      DCHECK_LT(base_index, base_run_end);
+      DCHECK_LT(target_index, target_run_end);
+
+      // Compare the physical values that make up the runs containing base_index
+      // and target_index.
+      if (!inner_value_comparator_->Equals(physical_base_index, physical_target_index)) {
+        // First difference found, stop because the run of equal values cannot
+        // be extended further.
+        break;
+      }
+
+      const int64_t base_run = std::min(base_run_end, base_length) - base_index;
+      const int64_t target_run = std::min(target_run_end, target_length) - target_index;
+      // Due to the loop-invariant (base_index < base_length && target_index <
+      // target_length) and properties of the run-ends asserted above, both base_run and
+      // target_run are strictly greater than zero.
+      DCHECK_GT(base_run, 0);
+      DCHECK_GT(target_run, 0);
+
+      int64_t increment = 0;
+      if (base_run < target_run) {
+        increment = base_run;
+        physical_base_index += 1;  // skip to next run on base
+      } else if (base_run > target_run) {
+        increment = target_run;
+        physical_target_index += 1;  // skip to next run on target
+      } else {
+        increment = base_run;
+        // skip to next run on both base and target
+        physical_base_index += 1;
+        physical_target_index += 1;
+      }

Review Comment:
   ```suggestion
         int64_t increment = std::min(base_run, target_run);
         physical_base_index += base_run == increment;
         physical_target_index += target_run == increment;
   ```



##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) {
   return UnitSlice{&array, index};
 }
 
-using ValueComparator = std::function<bool(const Array&, int64_t, const Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+  virtual ~ValueComparator() = default;
+
+  /// \brief Compare the validity and values at the given indices in the base and target
+  /// arrays.
+  ///
+  /// \param base_index The index in the base array.
+  /// \param target_index The index in the target array.
+  /// \return true if the values at the given indices are equal, false otherwise.
+  /// \pre base_index and target_index are valid indices in their respective arrays.
+  virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+  /// \brief Return the run length of equal values starting at the given indices in the
+  /// base and target arrays.
+  ///
+  /// \param base_index The starting index in the base array.
+  /// \param base_length The length of the base array.
+  /// \param target_index The starting index in the target array.
+  /// \param target_length The length of the target array.
+  /// \return The run length of equal values starting at the given indices in the base
+  /// and target arrays.
+  virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                        int64_t target_index, int64_t target_length) {
+    int64_t run_length_of_equals = 0;
+    while (base_index < base_length && target_index < target_length) {
+      if (!Equals(base_index, target_index)) {
+        break;
+      }
+      base_index += 1;
+      target_index += 1;
+      run_length_of_equals += 1;
+    }
+    return run_length_of_equals;
+  }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+  const ArrayType& base;
+  const ArrayType& target;
+
+  DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+      : base(base), target(target) {}
+
+  ~DefaultValueComparator() override = default;
+
+  /// \brief Compare validity bits of base[base_index] and target[target_index]
+  ///
+  /// \return std::nullopt if the validity bits differ, otherwise a bool
+  /// containing the validity bit found in both arrays.
+  static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t base_index,
+                                              const ArrayType& target,
+                                              int64_t target_index) {
+    const bool base_valid = base.IsValid(base_index);
+    const bool target_valid = target.IsValid(target_index);
+    return base_valid ^ target_valid ? std::nullopt : std::optional<bool>{base_valid};
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const auto valid = ValidityIfEquals(base, base_index, target, target_index);
+    if (valid.has_value()) {
+      return *valid ? GetView(base, base_index) == GetView(target, target_index) : true;
+    }
+    return false;
+  }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+  const RunEndEncodedArray& base_;
+  const RunEndEncodedArray& target_;
+  std::unique_ptr<ValueComparator> inner_value_comparator_;
+  ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+  ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+  REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray& target,
+                     std::unique_ptr<ValueComparator>&& inner_value_comparator)
+      : base_(base),
+        target_(target),
+        inner_value_comparator_(std::move(inner_value_comparator)),
+        base_physical_index_finder_(*base_.data()),
+        target_physical_index_finder_(*target_.data()) {
+    DCHECK_EQ(*base_.type(), *target_.type());
+  }
+
+  ~REEValueComparator() override = default;
 
-struct ValueComparatorVisitor {
+ private:
+  /// \pre 0 <= i < base_.length()
+  inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+    return base_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  /// \pre 0 <= i < target_.length()
+  inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+    return target_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  const RunEndCType* base_run_ends() { return base_physical_index_finder_.run_ends; }
+
+  const RunEndCType* target_run_ends() { return target_physical_index_finder_.run_ends; }
+
+ public:
+  int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                int64_t target_index, int64_t target_length) override {
+    // Ensure the first search for physical index on the values arrays is safe.
+    if (base_index >= base_length || target_index >= target_length) {
+      // Without values on either side, there is no run of equal values.
+      return 0;
+    }
+
+    // Translate the two logical indices into physical indices.
+    int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+    int64_t run_length_of_equals = 0;
+    // The loop invariant (base_index < base_length && target_index < target_length)
+    // is valid when the loop starts because of the check above.
+    for (;;) {
+      const auto base_run_end =
+          static_cast<int64_t>(base_run_ends()[physical_base_index]) - base_.offset();
+      const auto target_run_end =
+          static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+          target_.offset();
+      // The end of the runs containing the logical indices, by definition, ends
+      // after the logical indices.
+      DCHECK_LT(base_index, base_run_end);
+      DCHECK_LT(target_index, target_run_end);
+
+      // Compare the physical values that make up the runs containing base_index
+      // and target_index.
+      if (!inner_value_comparator_->Equals(physical_base_index, physical_target_index)) {
+        // First difference found, stop because the run of equal values cannot
+        // be extended further.
+        break;
+      }
+
+      const int64_t base_run = std::min(base_run_end, base_length) - base_index;
+      const int64_t target_run = std::min(target_run_end, target_length) - target_index;
+      // Due to the loop-invariant (base_index < base_length && target_index <
+      // target_length) and properties of the run-ends asserted above, both base_run and
+      // target_run are strictly greater than zero.
+      DCHECK_GT(base_run, 0);
+      DCHECK_GT(target_run, 0);
+
+      int64_t increment = 0;
+      if (base_run < target_run) {
+        increment = base_run;
+        physical_base_index += 1;  // skip to next run on base
+      } else if (base_run > target_run) {
+        increment = target_run;
+        physical_target_index += 1;  // skip to next run on target
+      } else {
+        increment = base_run;
+        // skip to next run on both base and target
+        physical_base_index += 1;
+        physical_target_index += 1;
+      }
+      // Since both base_run and target_run are greater than zero,
+      // increment is also greater than zero...
+      DCHECK_GT(increment, 0);
+      // ...which implies that the loop will make progress and eventually terminate
+      // because base_index or target_index will equal base_length or target_length,
+      // respectively.
+      base_index += increment;
+      target_index += increment;
+      // The value representing the two runs are equal, so we can assume that at
+      // least `increment` (size of smallest run) values are equal.
+      run_length_of_equals += increment;
+
+      if (base_index >= base_length || target_index >= target_length) {
+        break;
+      }
+    }
+
+    return run_length_of_equals;
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    const int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+    return inner_value_comparator_->Equals(physical_base_index, physical_target_index);
+  }
+};
+
+class CreateValueComparator {
+ private:
+  std::unique_ptr<ValueComparator> comparator_;
+
+ public:
   template <typename T>
-  Status Visit(const T&) {
+  Status Visit(const T&, const Array& base, const Array& target) {
     using ArrayType = typename TypeTraits<T>::ArrayType;
-    out = [](const Array& base, int64_t base_index, const Array& target,
-             int64_t target_index) {
-      return (GetView(checked_cast<const ArrayType&>(base), base_index) ==
-              GetView(checked_cast<const ArrayType&>(target), target_index));
-    };
+    comparator_ = std::make_unique<DefaultValueComparator<ArrayType>>(
+        checked_cast<const ArrayType&>(base), checked_cast<const ArrayType&>(target));
     return Status::OK();
   }
 
-  Status Visit(const NullType&) { return Status::NotImplemented("null type"); }
+  Status Visit(const NullType&, const Array&, const Array&) {
+    return Status::NotImplemented("null type");
+  }
 
-  Status Visit(const ExtensionType&) { return Status::NotImplemented("extension type"); }
+  Status Visit(const ExtensionType&, const Array&, const Array&) {
+    return Status::NotImplemented("extension type");
+  }
 
-  Status Visit(const DictionaryType&) {
+  Status Visit(const DictionaryType&, const Array& base, const Array& target) {
     return Status::NotImplemented("dictionary type");
   }
 
-  Status Visit(const RunEndEncodedType&) {
-    return Status::NotImplemented("run-end encoded type");
+  Status Visit(const RunEndEncodedType& ree_type, const Array& base,
+               const Array& target) {
+    const auto& base_ree = checked_cast<const RunEndEncodedArray&>(base);
+    const auto& target_ree = checked_cast<const RunEndEncodedArray&>(target);
+    ARROW_ASSIGN_OR_RAISE(
+        auto inner_values_comparator,
+        (*this)(*ree_type.value_type(), *base_ree.values(), *target_ree.values()));
+    auto* ree_value_comparator =
+        ([&ree_type, &base_ree, &target_ree,
+          inner_values_comparator =
+              std::move(inner_values_comparator)]() mutable -> ValueComparator* {
+          const auto run_end_type_id = ree_type.run_end_type()->id();
+          switch (run_end_type_id) {
+            case Type::INT16:
+              return new REEValueComparator<int16_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+            case Type::INT32:
+              return new REEValueComparator<int32_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+            default:
+              DCHECK_EQ(run_end_type_id, Type::INT64);
+              return new REEValueComparator<int64_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+          }
+        }());
+    comparator_ = std::unique_ptr<ValueComparator>{ree_value_comparator};
+    return Status::OK();

Review Comment:
   ```suggestion
             switch (ree_type.run_end_type()->id()) {
               case Type::INT16:
                 comparator_.reset(new REEValueComparator<int16_t>(base_ree, target_ree,
                                                        std::move(inner_values_comparator)));
                 return Status::OK();
               case Type::INT32:
                 comparator_.reset(new REEValueComparator<int32_t>(base_ree, target_ree,
                                                        std::move(inner_values_comparator)));
                 return Status::OK();
               case Type::INT64:
                 comparator_.reset(new REEValueComparator<int64_t>(base_ree, target_ree,
                                                        std::move(inner_values_comparator)));
                 return Status::OK();
               default:
                 Unreachable();
             }
   ```



##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) {
   return UnitSlice{&array, index};
 }
 
-using ValueComparator = std::function<bool(const Array&, int64_t, const Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+  virtual ~ValueComparator() = default;
+
+  /// \brief Compare the validity and values at the given indices in the base and target
+  /// arrays.
+  ///
+  /// \param base_index The index in the base array.
+  /// \param target_index The index in the target array.
+  /// \return true if the values at the given indices are equal, false otherwise.
+  /// \pre base_index and target_index are valid indices in their respective arrays.
+  virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+  /// \brief Return the run length of equal values starting at the given indices in the
+  /// base and target arrays.
+  ///
+  /// \param base_index The starting index in the base array.
+  /// \param base_length The length of the base array.
+  /// \param target_index The starting index in the target array.
+  /// \param target_length The length of the target array.
+  /// \return The run length of equal values starting at the given indices in the base
+  /// and target arrays.
+  virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                        int64_t target_index, int64_t target_length) {
+    int64_t run_length_of_equals = 0;
+    while (base_index < base_length && target_index < target_length) {
+      if (!Equals(base_index, target_index)) {
+        break;
+      }
+      base_index += 1;
+      target_index += 1;
+      run_length_of_equals += 1;
+    }
+    return run_length_of_equals;
+  }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+  const ArrayType& base;
+  const ArrayType& target;
+
+  DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+      : base(base), target(target) {}
+
+  ~DefaultValueComparator() override = default;
+
+  /// \brief Compare validity bits of base[base_index] and target[target_index]
+  ///
+  /// \return std::nullopt if the validity bits differ, otherwise a bool
+  /// containing the validity bit found in both arrays.
+  static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t base_index,
+                                              const ArrayType& target,
+                                              int64_t target_index) {
+    const bool base_valid = base.IsValid(base_index);
+    const bool target_valid = target.IsValid(target_index);
+    return base_valid ^ target_valid ? std::nullopt : std::optional<bool>{base_valid};
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const auto valid = ValidityIfEquals(base, base_index, target, target_index);

Review Comment:
   Please make this more obvious/explicit:
   ```suggestion
       bool is_valid = base.IsValid(base_index);
       if (target.IsValid(target_index) != is_valid) return false;
       
       if (!is_valid) return true;
       
       return GetView(base, base_index) == GetView(target, target_index);
   ```



##########
cpp/src/arrow/array/diff.cc:
##########
@@ -164,56 +382,38 @@ class QuadraticSpaceMyersDiff {
       : base_(base),
         target_(target),
         pool_(pool),
-        value_comparator_(GetValueComparator(*base.type())),
-        base_begin_(0),
         base_end_(base.length()),
-        target_begin_(0),
         target_end_(target.length()),
-        endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}),
         insert_({true}) {
-    if ((base_end_ - base_begin_ == target_end_ - target_begin_) &&
-        endpoint_base_[0] == base_end_) {
-      // trivial case: base == target
-      finish_index_ = 0;
-    }
-  }
-
-  bool ValuesEqual(int64_t base_index, int64_t target_index) const {
-    bool base_null = base_.IsNull(base_index);
-    bool target_null = target_.IsNull(target_index);
-    if (base_null || target_null) {
-      // If only one is null, then this is false, otherwise true
-      return base_null && target_null;
-    }
-    return value_comparator_(base_, base_index, target_, target_index);
+    // endpoint_base_ is initialized when Diff() is called to start the algorithm.

Review Comment:
   Could we have the comparator also be a member, initialized at the same time?



##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) {
   return UnitSlice{&array, index};
 }
 
-using ValueComparator = std::function<bool(const Array&, int64_t, const Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+  virtual ~ValueComparator() = default;
+
+  /// \brief Compare the validity and values at the given indices in the base and target
+  /// arrays.
+  ///
+  /// \param base_index The index in the base array.
+  /// \param target_index The index in the target array.
+  /// \return true if the values at the given indices are equal, false otherwise.
+  /// \pre base_index and target_index are valid indices in their respective arrays.
+  virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+  /// \brief Return the run length of equal values starting at the given indices in the
+  /// base and target arrays.
+  ///
+  /// \param base_index The starting index in the base array.
+  /// \param base_length The length of the base array.
+  /// \param target_index The starting index in the target array.
+  /// \param target_length The length of the target array.
+  /// \return The run length of equal values starting at the given indices in the base
+  /// and target arrays.
+  virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                        int64_t target_index, int64_t target_length) {
+    int64_t run_length_of_equals = 0;
+    while (base_index < base_length && target_index < target_length) {
+      if (!Equals(base_index, target_index)) {
+        break;
+      }
+      base_index += 1;
+      target_index += 1;
+      run_length_of_equals += 1;
+    }
+    return run_length_of_equals;
+  }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+  const ArrayType& base;
+  const ArrayType& target;
+
+  DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+      : base(base), target(target) {}
+
+  ~DefaultValueComparator() override = default;
+
+  /// \brief Compare validity bits of base[base_index] and target[target_index]
+  ///
+  /// \return std::nullopt if the validity bits differ, otherwise a bool
+  /// containing the validity bit found in both arrays.
+  static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t base_index,
+                                              const ArrayType& target,
+                                              int64_t target_index) {
+    const bool base_valid = base.IsValid(base_index);
+    const bool target_valid = target.IsValid(target_index);
+    return base_valid ^ target_valid ? std::nullopt : std::optional<bool>{base_valid};
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const auto valid = ValidityIfEquals(base, base_index, target, target_index);
+    if (valid.has_value()) {
+      return *valid ? GetView(base, base_index) == GetView(target, target_index) : true;
+    }
+    return false;
+  }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+  const RunEndEncodedArray& base_;
+  const RunEndEncodedArray& target_;
+  std::unique_ptr<ValueComparator> inner_value_comparator_;
+  ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+  ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+  REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray& target,
+                     std::unique_ptr<ValueComparator>&& inner_value_comparator)
+      : base_(base),
+        target_(target),
+        inner_value_comparator_(std::move(inner_value_comparator)),
+        base_physical_index_finder_(*base_.data()),
+        target_physical_index_finder_(*target_.data()) {
+    DCHECK_EQ(*base_.type(), *target_.type());
+  }
+
+  ~REEValueComparator() override = default;
 
-struct ValueComparatorVisitor {
+ private:
+  /// \pre 0 <= i < base_.length()
+  inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+    return base_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  /// \pre 0 <= i < target_.length()
+  inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+    return target_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  const RunEndCType* base_run_ends() { return base_physical_index_finder_.run_ends; }
+
+  const RunEndCType* target_run_ends() { return target_physical_index_finder_.run_ends; }
+
+ public:
+  int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                int64_t target_index, int64_t target_length) override {
+    // Ensure the first search for physical index on the values arrays is safe.
+    if (base_index >= base_length || target_index >= target_length) {
+      // Without values on either side, there is no run of equal values.
+      return 0;
+    }
+
+    // Translate the two logical indices into physical indices.
+    int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+    int64_t run_length_of_equals = 0;
+    // The loop invariant (base_index < base_length && target_index < target_length)
+    // is valid when the loop starts because of the check above.
+    for (;;) {
+      const auto base_run_end =
+          static_cast<int64_t>(base_run_ends()[physical_base_index]) - base_.offset();
+      const auto target_run_end =
+          static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+          target_.offset();
+      // The end of the runs containing the logical indices, by definition, ends
+      // after the logical indices.
+      DCHECK_LT(base_index, base_run_end);
+      DCHECK_LT(target_index, target_run_end);
+
+      // Compare the physical values that make up the runs containing base_index
+      // and target_index.
+      if (!inner_value_comparator_->Equals(physical_base_index, physical_target_index)) {
+        // First difference found, stop because the run of equal values cannot
+        // be extended further.
+        break;
+      }
+
+      const int64_t base_run = std::min(base_run_end, base_length) - base_index;
+      const int64_t target_run = std::min(target_run_end, target_length) - target_index;
+      // Due to the loop-invariant (base_index < base_length && target_index <
+      // target_length) and properties of the run-ends asserted above, both base_run and
+      // target_run are strictly greater than zero.
+      DCHECK_GT(base_run, 0);
+      DCHECK_GT(target_run, 0);
+
+      int64_t increment = 0;
+      if (base_run < target_run) {
+        increment = base_run;
+        physical_base_index += 1;  // skip to next run on base
+      } else if (base_run > target_run) {
+        increment = target_run;
+        physical_target_index += 1;  // skip to next run on target
+      } else {
+        increment = base_run;
+        // skip to next run on both base and target
+        physical_base_index += 1;
+        physical_target_index += 1;
+      }
+      // Since both base_run and target_run are greater than zero,
+      // increment is also greater than zero...
+      DCHECK_GT(increment, 0);
+      // ...which implies that the loop will make progress and eventually terminate
+      // because base_index or target_index will equal base_length or target_length,
+      // respectively.
+      base_index += increment;
+      target_index += increment;
+      // The value representing the two runs are equal, so we can assume that at
+      // least `increment` (size of smallest run) values are equal.
+      run_length_of_equals += increment;
+
+      if (base_index >= base_length || target_index >= target_length) {
+        break;
+      }
+    }
+
+    return run_length_of_equals;
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    const int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+    return inner_value_comparator_->Equals(physical_base_index, physical_target_index);
+  }
+};
+
+class CreateValueComparator {
+ private:
+  std::unique_ptr<ValueComparator> comparator_;
+
+ public:
   template <typename T>
-  Status Visit(const T&) {
+  Status Visit(const T&, const Array& base, const Array& target) {
     using ArrayType = typename TypeTraits<T>::ArrayType;
-    out = [](const Array& base, int64_t base_index, const Array& target,
-             int64_t target_index) {
-      return (GetView(checked_cast<const ArrayType&>(base), base_index) ==
-              GetView(checked_cast<const ArrayType&>(target), target_index));
-    };
+    comparator_ = std::make_unique<DefaultValueComparator<ArrayType>>(
+        checked_cast<const ArrayType&>(base), checked_cast<const ArrayType&>(target));
     return Status::OK();
   }
 
-  Status Visit(const NullType&) { return Status::NotImplemented("null type"); }
+  Status Visit(const NullType&, const Array&, const Array&) {
+    return Status::NotImplemented("null type");
+  }
 
-  Status Visit(const ExtensionType&) { return Status::NotImplemented("extension type"); }
+  Status Visit(const ExtensionType&, const Array&, const Array&) {
+    return Status::NotImplemented("extension type");
+  }
 
-  Status Visit(const DictionaryType&) {
+  Status Visit(const DictionaryType&, const Array& base, const Array& target) {
     return Status::NotImplemented("dictionary type");
   }
 
-  Status Visit(const RunEndEncodedType&) {
-    return Status::NotImplemented("run-end encoded type");
+  Status Visit(const RunEndEncodedType& ree_type, const Array& base,
+               const Array& target) {
+    const auto& base_ree = checked_cast<const RunEndEncodedArray&>(base);
+    const auto& target_ree = checked_cast<const RunEndEncodedArray&>(target);
+    ARROW_ASSIGN_OR_RAISE(
+        auto inner_values_comparator,
+        (*this)(*ree_type.value_type(), *base_ree.values(), *target_ree.values()));
+    auto* ree_value_comparator =
+        ([&ree_type, &base_ree, &target_ree,
+          inner_values_comparator =
+              std::move(inner_values_comparator)]() mutable -> ValueComparator* {
+          const auto run_end_type_id = ree_type.run_end_type()->id();
+          switch (run_end_type_id) {
+            case Type::INT16:
+              return new REEValueComparator<int16_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+            case Type::INT32:
+              return new REEValueComparator<int32_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+            default:
+              DCHECK_EQ(run_end_type_id, Type::INT64);
+              return new REEValueComparator<int64_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+          }
+        }());
+    comparator_ = std::unique_ptr<ValueComparator>{ree_value_comparator};
+    return Status::OK();
   }
 
-  ValueComparator Create(const DataType& type) {
-    DCHECK_OK(VisitTypeInline(type, this));
-    return out;
+  Result<std::unique_ptr<ValueComparator>> operator()(const DataType& type,

Review Comment:
   Could this be a static member function named Create instead?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1372155714


##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
   return LogicalNullCount<int64_t>(span);
 }
 
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t i) {
+  DCHECK_LT(i, self.array_span.length);
+  const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+  DCHECK_LT(self.last_physical_index, run_ends_size);
+  // This access to self.run_ends_[last_physical_index_] is always safe because:
+  // 1. 0 <= i < array_span_.length() implies there is at least one run and the initial
+  //    value 0 will be safe to index with.
+  // 2. last_physical_index_ > 0 is always the result of a valid call to
+  //    internal::FindPhysicalIndex.
+  if (ARROW_PREDICT_TRUE(self.array_span.offset + i <
+                         self.run_ends[self.last_physical_index])) {
+    // The cached value is an upper-bound, but is it the least upper-bound?
+    if (self.last_physical_index == 0 ||
+        self.array_span.offset + i >= self.run_ends[self.last_physical_index - 1]) {
+      return self.last_physical_index;
+    }
+    // last_physical_index_ - 1 is a candidate for the least upper-bound,
+    // so search for the least upper-bound in the range that includes it.
+    const int64_t j = ree_util::internal::FindPhysicalIndex<RunEndCType>(
+        self.run_ends, /*run_ends_size=*/self.last_physical_index, i,
+        self.array_span.offset);
+    DCHECK_LT(j, self.last_physical_index);
+    return self.last_physical_index = j;

Review Comment:
   I'm treating this as a "pure function" that memoizes some of the work between calls. Is it OK if I just break this into two statements?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz merged PR #35003:
URL: https://github.com/apache/arrow/pull/35003


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "conbench-apache-arrow[bot] (via GitHub)" <gi...@apache.org>.
conbench-apache-arrow[bot] commented on PR #35003:
URL: https://github.com/apache/arrow/pull/35003#issuecomment-1783367774

   After merging your PR, Conbench analyzed the 5 benchmarking runs that have been run so far on merge-commit fcbcb7dab71b14ff3436af7b21bd23132877e2fa.
   
   There was 1 benchmark result indicating a performance regression:
   
   - Commit Run on `ursa-i9-9960x` at [2023-10-27 01:21:47Z](https://conbench.ursa.dev/compare/runs/bae16e8d8e4c49118fd0d423d511e29f...7ef851a2901a47188086c78d65600901/)
     - [`tpch` (R) with engine=arrow, format=native, language=R, memory_map=False, query_id=TPCH-20, scale_factor=1](https://conbench.ursa.dev/compare/benchmarks/0653b02dc6ed78cd80002eb4d8621cef...0653b2db15af7fba800093c7aad97482)
   
   The [full Conbench report](https://github.com/apache/arrow/runs/18132442214) has more details.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1370830276


##########
cpp/src/arrow/array/diff.cc:
##########
@@ -164,56 +382,38 @@ class QuadraticSpaceMyersDiff {
       : base_(base),
         target_(target),
         pool_(pool),
-        value_comparator_(GetValueComparator(*base.type())),
-        base_begin_(0),
         base_end_(base.length()),
-        target_begin_(0),
         target_end_(target.length()),
-        endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}),
         insert_({true}) {
-    if ((base_end_ - base_begin_ == target_end_ - target_begin_) &&
-        endpoint_base_[0] == base_end_) {
-      // trivial case: base == target
-      finish_index_ = 0;
-    }
-  }
-
-  bool ValuesEqual(int64_t base_index, int64_t target_index) const {
-    bool base_null = base_.IsNull(base_index);
-    bool target_null = target_.IsNull(target_index);
-    if (base_null || target_null) {
-      // If only one is null, then this is false, otherwise true
-      return base_null && target_null;
-    }
-    return value_comparator_(base_, base_index, target_, target_index);
+    // endpoint_base_ is initialized when Diff() is called to start the algorithm.

Review Comment:
   That is how it was before, but in [diff.cc: Remove the unsafe creation of value comparators](https://github.com/apache/arrow/pull/35003/commits/cdbe50ec07699aa4f267e3061e7186c979aadcbe) I refactored to make it possible to do proper error handling of the comparator creation. The comparator is created on `Diff` (allowed to return a `Status` unlike a class constructor) and then injected to all the other member functions as an argument. To make it more explicit that `Diff` is the only valid entry point into the class, I've made all the other functions `private` except for `Diff`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1370822464


##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
   return LogicalNullCount<int64_t>(span);
 }
 
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t i) {
+  DCHECK_LT(i, self.array_span.length);
+  const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+  DCHECK_LT(self.last_physical_index, run_ends_size);
+  // This access to self.run_ends_[last_physical_index_] is always safe because:
+  // 1. 0 <= i < array_span_.length() implies there is at least one run and the initial
+  //    value 0 will be safe to index with.
+  // 2. last_physical_index_ > 0 is always the result of a valid call to
+  //    internal::FindPhysicalIndex.
+  if (ARROW_PREDICT_TRUE(self.array_span.offset + i <
+                         self.run_ends[self.last_physical_index])) {
+    // The cached value is an upper-bound, but is it the least upper-bound?
+    if (self.last_physical_index == 0 ||
+        self.array_span.offset + i >= self.run_ends[self.last_physical_index - 1]) {
+      return self.last_physical_index;
+    }
+    // last_physical_index_ - 1 is a candidate for the least upper-bound,
+    // so search for the least upper-bound in the range that includes it.
+    const int64_t j = ree_util::internal::FindPhysicalIndex<RunEndCType>(
+        self.run_ends, /*run_ends_size=*/self.last_physical_index, i,
+        self.array_span.offset);
+    DCHECK_LT(j, self.last_physical_index);
+    return self.last_physical_index = j;

Review Comment:
   The advantage being? Binary size reduciton?



##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
   return LogicalNullCount<int64_t>(span);
 }
 
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t i) {
+  DCHECK_LT(i, self.array_span.length);
+  const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+  DCHECK_LT(self.last_physical_index, run_ends_size);
+  // This access to self.run_ends_[last_physical_index_] is always safe because:
+  // 1. 0 <= i < array_span_.length() implies there is at least one run and the initial
+  //    value 0 will be safe to index with.
+  // 2. last_physical_index_ > 0 is always the result of a valid call to
+  //    internal::FindPhysicalIndex.
+  if (ARROW_PREDICT_TRUE(self.array_span.offset + i <
+                         self.run_ends[self.last_physical_index])) {
+    // The cached value is an upper-bound, but is it the least upper-bound?
+    if (self.last_physical_index == 0 ||
+        self.array_span.offset + i >= self.run_ends[self.last_physical_index - 1]) {
+      return self.last_physical_index;
+    }
+    // last_physical_index_ - 1 is a candidate for the least upper-bound,
+    // so search for the least upper-bound in the range that includes it.
+    const int64_t j = ree_util::internal::FindPhysicalIndex<RunEndCType>(
+        self.run_ends, /*run_ends_size=*/self.last_physical_index, i,
+        self.array_span.offset);
+    DCHECK_LT(j, self.last_physical_index);
+    return self.last_physical_index = j;
+  }
+
+  // last_physical_index_ is not an upper-bound, and the logical index i MUST be
+  // in the runs that follow it. Since i is a valid logical index, we know that at least
+  // one extra run is present.
+  DCHECK_LT(self.last_physical_index + 1, run_ends_size);
+  const int64_t lower_physical_index = self.last_physical_index + 1;

Review Comment:
   Indeed. I will rename.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1370822185


##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
   return LogicalNullCount<int64_t>(span);
 }
 
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t i) {
+  DCHECK_LT(i, self.array_span.length);
+  const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+  DCHECK_LT(self.last_physical_index, run_ends_size);
+  // This access to self.run_ends_[last_physical_index_] is always safe because:
+  // 1. 0 <= i < array_span_.length() implies there is at least one run and the initial
+  //    value 0 will be safe to index with.
+  // 2. last_physical_index_ > 0 is always the result of a valid call to

Review Comment:
   Oops. Missed these when I renamed the struct members.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1370839520


##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) {
   return UnitSlice{&array, index};
 }
 
-using ValueComparator = std::function<bool(const Array&, int64_t, const Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+  virtual ~ValueComparator() = default;
+
+  /// \brief Compare the validity and values at the given indices in the base and target
+  /// arrays.
+  ///
+  /// \param base_index The index in the base array.
+  /// \param target_index The index in the target array.
+  /// \return true if the values at the given indices are equal, false otherwise.
+  /// \pre base_index and target_index are valid indices in their respective arrays.
+  virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+  /// \brief Return the run length of equal values starting at the given indices in the
+  /// base and target arrays.
+  ///
+  /// \param base_index The starting index in the base array.
+  /// \param base_length The length of the base array.
+  /// \param target_index The starting index in the target array.
+  /// \param target_length The length of the target array.
+  /// \return The run length of equal values starting at the given indices in the base
+  /// and target arrays.
+  virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                        int64_t target_index, int64_t target_length) {
+    int64_t run_length_of_equals = 0;
+    while (base_index < base_length && target_index < target_length) {
+      if (!Equals(base_index, target_index)) {
+        break;
+      }
+      base_index += 1;
+      target_index += 1;
+      run_length_of_equals += 1;
+    }
+    return run_length_of_equals;
+  }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+  const ArrayType& base;
+  const ArrayType& target;
+
+  DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+      : base(base), target(target) {}
+
+  ~DefaultValueComparator() override = default;
+
+  /// \brief Compare validity bits of base[base_index] and target[target_index]
+  ///
+  /// \return std::nullopt if the validity bits differ, otherwise a bool
+  /// containing the validity bit found in both arrays.
+  static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t base_index,
+                                              const ArrayType& target,
+                                              int64_t target_index) {
+    const bool base_valid = base.IsValid(base_index);
+    const bool target_valid = target.IsValid(target_index);
+    return base_valid ^ target_valid ? std::nullopt : std::optional<bool>{base_valid};
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const auto valid = ValidityIfEquals(base, base_index, target, target_index);
+    if (valid.has_value()) {
+      return *valid ? GetView(base, base_index) == GetView(target, target_index) : true;
+    }
+    return false;
+  }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+  const RunEndEncodedArray& base_;
+  const RunEndEncodedArray& target_;
+  std::unique_ptr<ValueComparator> inner_value_comparator_;
+  ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+  ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+  REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray& target,
+                     std::unique_ptr<ValueComparator>&& inner_value_comparator)
+      : base_(base),
+        target_(target),
+        inner_value_comparator_(std::move(inner_value_comparator)),
+        base_physical_index_finder_(*base_.data()),
+        target_physical_index_finder_(*target_.data()) {
+    DCHECK_EQ(*base_.type(), *target_.type());
+  }
+
+  ~REEValueComparator() override = default;
 
-struct ValueComparatorVisitor {
+ private:
+  /// \pre 0 <= i < base_.length()
+  inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+    return base_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  /// \pre 0 <= i < target_.length()
+  inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+    return target_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  const RunEndCType* base_run_ends() { return base_physical_index_finder_.run_ends; }
+
+  const RunEndCType* target_run_ends() { return target_physical_index_finder_.run_ends; }
+
+ public:
+  int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                int64_t target_index, int64_t target_length) override {
+    // Ensure the first search for physical index on the values arrays is safe.
+    if (base_index >= base_length || target_index >= target_length) {
+      // Without values on either side, there is no run of equal values.
+      return 0;
+    }
+
+    // Translate the two logical indices into physical indices.
+    int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+    int64_t run_length_of_equals = 0;
+    // The loop invariant (base_index < base_length && target_index < target_length)
+    // is valid when the loop starts because of the check above.
+    for (;;) {
+      const auto base_run_end =
+          static_cast<int64_t>(base_run_ends()[physical_base_index]) - base_.offset();
+      const auto target_run_end =
+          static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+          target_.offset();
+      // The end of the runs containing the logical indices, by definition, ends
+      // after the logical indices.
+      DCHECK_LT(base_index, base_run_end);
+      DCHECK_LT(target_index, target_run_end);
+
+      // Compare the physical values that make up the runs containing base_index
+      // and target_index.
+      if (!inner_value_comparator_->Equals(physical_base_index, physical_target_index)) {
+        // First difference found, stop because the run of equal values cannot
+        // be extended further.
+        break;
+      }
+
+      const int64_t base_run = std::min(base_run_end, base_length) - base_index;
+      const int64_t target_run = std::min(target_run_end, target_length) - target_index;
+      // Due to the loop-invariant (base_index < base_length && target_index <
+      // target_length) and properties of the run-ends asserted above, both base_run and
+      // target_run are strictly greater than zero.
+      DCHECK_GT(base_run, 0);
+      DCHECK_GT(target_run, 0);
+
+      int64_t increment = 0;
+      if (base_run < target_run) {
+        increment = base_run;
+        physical_base_index += 1;  // skip to next run on base
+      } else if (base_run > target_run) {
+        increment = target_run;
+        physical_target_index += 1;  // skip to next run on target
+      } else {
+        increment = base_run;
+        // skip to next run on both base and target
+        physical_base_index += 1;
+        physical_target_index += 1;
+      }

Review Comment:
   This is very clever, but drops all the comments explaining why each increment is justified.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] github-actions[bot] commented on pull request #35003: GH-34569: [C++] Diffing of Run-End Encoded arrays

Posted by "github-actions[bot] (via GitHub)" <gi...@apache.org>.
github-actions[bot] commented on PR #35003:
URL: https://github.com/apache/arrow/pull/35003#issuecomment-1501866449

   * Closes: #34569


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1372132861


##########
cpp/src/arrow/util/ree_util.cc:
##########
@@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) {
   return LogicalNullCount<int64_t>(span);
 }
 
+namespace internal {
+
+/// \pre 0 <= i < array_span_.length()
+template <typename RunEndCType>
+int64_t FindPhysicalIndexImpl(PhysicalIndexFinder<RunEndCType>& self, int64_t i) {
+  DCHECK_LT(i, self.array_span.length);
+  const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length;
+  DCHECK_LT(self.last_physical_index, run_ends_size);
+  // This access to self.run_ends_[last_physical_index_] is always safe because:
+  // 1. 0 <= i < array_span_.length() implies there is at least one run and the initial
+  //    value 0 will be safe to index with.
+  // 2. last_physical_index_ > 0 is always the result of a valid call to
+  //    internal::FindPhysicalIndex.
+  if (ARROW_PREDICT_TRUE(self.array_span.offset + i <
+                         self.run_ends[self.last_physical_index])) {
+    // The cached value is an upper-bound, but is it the least upper-bound?
+    if (self.last_physical_index == 0 ||
+        self.array_span.offset + i >= self.run_ends[self.last_physical_index - 1]) {
+      return self.last_physical_index;
+    }
+    // last_physical_index_ - 1 is a candidate for the least upper-bound,
+    // so search for the least upper-bound in the range that includes it.
+    const int64_t j = ree_util::internal::FindPhysicalIndex<RunEndCType>(
+        self.run_ends, /*run_ends_size=*/self.last_physical_index, i,
+        self.array_span.offset);
+    DCHECK_LT(j, self.last_physical_index);
+    return self.last_physical_index = j;

Review Comment:
   style; avoiding return of an assignment expression and making it clear that the return is not distinct from the assignment



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1373365479


##########
cpp/src/arrow/array/diff.cc:
##########
@@ -164,56 +382,38 @@ class QuadraticSpaceMyersDiff {
       : base_(base),
         target_(target),
         pool_(pool),
-        value_comparator_(GetValueComparator(*base.type())),
-        base_begin_(0),
         base_end_(base.length()),
-        target_begin_(0),
         target_end_(target.length()),
-        endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}),
         insert_({true}) {
-    if ((base_end_ - base_begin_ == target_end_ - target_begin_) &&
-        endpoint_base_[0] == base_end_) {
-      // trivial case: base == target
-      finish_index_ = 0;
-    }
-  }
-
-  bool ValuesEqual(int64_t base_index, int64_t target_index) const {
-    bool base_null = base_.IsNull(base_index);
-    bool target_null = target_.IsNull(target_index);
-    if (base_null || target_null) {
-      // If only one is null, then this is false, otherwise true
-      return base_null && target_null;
-    }
-    return value_comparator_(base_, base_index, target_, target_index);
+    // endpoint_base_ is initialized when Diff() is called to start the algorithm.

Review Comment:
   I think `Diff` being the only entry point is good, but in either case I'd like to avoid splitting initialization as much as possible. Would you mind moving everything except the reference member init into Diff?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1370830276


##########
cpp/src/arrow/array/diff.cc:
##########
@@ -164,56 +382,38 @@ class QuadraticSpaceMyersDiff {
       : base_(base),
         target_(target),
         pool_(pool),
-        value_comparator_(GetValueComparator(*base.type())),
-        base_begin_(0),
         base_end_(base.length()),
-        target_begin_(0),
         target_end_(target.length()),
-        endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}),
         insert_({true}) {
-    if ((base_end_ - base_begin_ == target_end_ - target_begin_) &&
-        endpoint_base_[0] == base_end_) {
-      // trivial case: base == target
-      finish_index_ = 0;
-    }
-  }
-
-  bool ValuesEqual(int64_t base_index, int64_t target_index) const {
-    bool base_null = base_.IsNull(base_index);
-    bool target_null = target_.IsNull(target_index);
-    if (base_null || target_null) {
-      // If only one is null, then this is false, otherwise true
-      return base_null && target_null;
-    }
-    return value_comparator_(base_, base_index, target_, target_index);
+    // endpoint_base_ is initialized when Diff() is called to start the algorithm.

Review Comment:
   That is how it was before, but in [diff.cc: Remove the unsafe creation of value comparators](https://github.com/apache/arrow/pull/35003/commits/cdbe50ec07699aa4f267e3061e7186c979aadcbe) I refactored to make it possible to do proper error handling of the comparator creation. The comparator is created on `Diff` (allowed to return a `Status` unlike a class constructor) and then injected to all the other member functions as an argument. To make it more explicit that `Diff` is the only valid entry point into the class, I've made all the other functions `private` making `Diff` the only `public` one.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1370860449


##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) {
   return UnitSlice{&array, index};
 }
 
-using ValueComparator = std::function<bool(const Array&, int64_t, const Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+  virtual ~ValueComparator() = default;
+
+  /// \brief Compare the validity and values at the given indices in the base and target
+  /// arrays.
+  ///
+  /// \param base_index The index in the base array.
+  /// \param target_index The index in the target array.
+  /// \return true if the values at the given indices are equal, false otherwise.
+  /// \pre base_index and target_index are valid indices in their respective arrays.
+  virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+  /// \brief Return the run length of equal values starting at the given indices in the
+  /// base and target arrays.
+  ///
+  /// \param base_index The starting index in the base array.
+  /// \param base_length The length of the base array.
+  /// \param target_index The starting index in the target array.
+  /// \param target_length The length of the target array.
+  /// \return The run length of equal values starting at the given indices in the base
+  /// and target arrays.
+  virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                        int64_t target_index, int64_t target_length) {
+    int64_t run_length_of_equals = 0;
+    while (base_index < base_length && target_index < target_length) {
+      if (!Equals(base_index, target_index)) {
+        break;
+      }
+      base_index += 1;
+      target_index += 1;
+      run_length_of_equals += 1;
+    }
+    return run_length_of_equals;
+  }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+  const ArrayType& base;
+  const ArrayType& target;
+
+  DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+      : base(base), target(target) {}
+
+  ~DefaultValueComparator() override = default;
+
+  /// \brief Compare validity bits of base[base_index] and target[target_index]
+  ///
+  /// \return std::nullopt if the validity bits differ, otherwise a bool
+  /// containing the validity bit found in both arrays.
+  static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t base_index,
+                                              const ArrayType& target,
+                                              int64_t target_index) {
+    const bool base_valid = base.IsValid(base_index);
+    const bool target_valid = target.IsValid(target_index);
+    return base_valid ^ target_valid ? std::nullopt : std::optional<bool>{base_valid};
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const auto valid = ValidityIfEquals(base, base_index, target, target_index);
+    if (valid.has_value()) {
+      return *valid ? GetView(base, base_index) == GetView(target, target_index) : true;
+    }
+    return false;
+  }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+  const RunEndEncodedArray& base_;
+  const RunEndEncodedArray& target_;
+  std::unique_ptr<ValueComparator> inner_value_comparator_;
+  ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+  ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+  REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray& target,
+                     std::unique_ptr<ValueComparator>&& inner_value_comparator)
+      : base_(base),
+        target_(target),
+        inner_value_comparator_(std::move(inner_value_comparator)),
+        base_physical_index_finder_(*base_.data()),
+        target_physical_index_finder_(*target_.data()) {
+    DCHECK_EQ(*base_.type(), *target_.type());
+  }
+
+  ~REEValueComparator() override = default;
 
-struct ValueComparatorVisitor {
+ private:
+  /// \pre 0 <= i < base_.length()
+  inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+    return base_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  /// \pre 0 <= i < target_.length()
+  inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+    return target_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  const RunEndCType* base_run_ends() { return base_physical_index_finder_.run_ends; }
+
+  const RunEndCType* target_run_ends() { return target_physical_index_finder_.run_ends; }
+
+ public:
+  int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                int64_t target_index, int64_t target_length) override {
+    // Ensure the first search for physical index on the values arrays is safe.
+    if (base_index >= base_length || target_index >= target_length) {
+      // Without values on either side, there is no run of equal values.
+      return 0;
+    }
+
+    // Translate the two logical indices into physical indices.
+    int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+    int64_t run_length_of_equals = 0;
+    // The loop invariant (base_index < base_length && target_index < target_length)
+    // is valid when the loop starts because of the check above.
+    for (;;) {
+      const auto base_run_end =
+          static_cast<int64_t>(base_run_ends()[physical_base_index]) - base_.offset();
+      const auto target_run_end =
+          static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+          target_.offset();
+      // The end of the runs containing the logical indices, by definition, ends
+      // after the logical indices.
+      DCHECK_LT(base_index, base_run_end);
+      DCHECK_LT(target_index, target_run_end);
+
+      // Compare the physical values that make up the runs containing base_index
+      // and target_index.
+      if (!inner_value_comparator_->Equals(physical_base_index, physical_target_index)) {
+        // First difference found, stop because the run of equal values cannot
+        // be extended further.
+        break;
+      }
+
+      const int64_t base_run = std::min(base_run_end, base_length) - base_index;
+      const int64_t target_run = std::min(target_run_end, target_length) - target_index;
+      // Due to the loop-invariant (base_index < base_length && target_index <
+      // target_length) and properties of the run-ends asserted above, both base_run and
+      // target_run are strictly greater than zero.
+      DCHECK_GT(base_run, 0);
+      DCHECK_GT(target_run, 0);
+
+      int64_t increment = 0;
+      if (base_run < target_run) {
+        increment = base_run;
+        physical_base_index += 1;  // skip to next run on base
+      } else if (base_run > target_run) {
+        increment = target_run;
+        physical_target_index += 1;  // skip to next run on target
+      } else {
+        increment = base_run;
+        // skip to next run on both base and target
+        physical_base_index += 1;
+        physical_target_index += 1;
+      }
+      // Since both base_run and target_run are greater than zero,
+      // increment is also greater than zero...
+      DCHECK_GT(increment, 0);
+      // ...which implies that the loop will make progress and eventually terminate
+      // because base_index or target_index will equal base_length or target_length,
+      // respectively.
+      base_index += increment;
+      target_index += increment;
+      // The value representing the two runs are equal, so we can assume that at
+      // least `increment` (size of smallest run) values are equal.
+      run_length_of_equals += increment;
+
+      if (base_index >= base_length || target_index >= target_length) {
+        break;
+      }
+    }
+
+    return run_length_of_equals;
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    const int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+    return inner_value_comparator_->Equals(physical_base_index, physical_target_index);
+  }
+};
+
+class CreateValueComparator {
+ private:
+  std::unique_ptr<ValueComparator> comparator_;
+
+ public:
   template <typename T>
-  Status Visit(const T&) {
+  Status Visit(const T&, const Array& base, const Array& target) {
     using ArrayType = typename TypeTraits<T>::ArrayType;
-    out = [](const Array& base, int64_t base_index, const Array& target,
-             int64_t target_index) {
-      return (GetView(checked_cast<const ArrayType&>(base), base_index) ==
-              GetView(checked_cast<const ArrayType&>(target), target_index));
-    };
+    comparator_ = std::make_unique<DefaultValueComparator<ArrayType>>(
+        checked_cast<const ArrayType&>(base), checked_cast<const ArrayType&>(target));
     return Status::OK();
   }
 
-  Status Visit(const NullType&) { return Status::NotImplemented("null type"); }
+  Status Visit(const NullType&, const Array&, const Array&) {
+    return Status::NotImplemented("null type");
+  }
 
-  Status Visit(const ExtensionType&) { return Status::NotImplemented("extension type"); }
+  Status Visit(const ExtensionType&, const Array&, const Array&) {
+    return Status::NotImplemented("extension type");
+  }
 
-  Status Visit(const DictionaryType&) {
+  Status Visit(const DictionaryType&, const Array& base, const Array& target) {
     return Status::NotImplemented("dictionary type");
   }
 
-  Status Visit(const RunEndEncodedType&) {
-    return Status::NotImplemented("run-end encoded type");
+  Status Visit(const RunEndEncodedType& ree_type, const Array& base,
+               const Array& target) {
+    const auto& base_ree = checked_cast<const RunEndEncodedArray&>(base);
+    const auto& target_ree = checked_cast<const RunEndEncodedArray&>(target);
+    ARROW_ASSIGN_OR_RAISE(
+        auto inner_values_comparator,
+        (*this)(*ree_type.value_type(), *base_ree.values(), *target_ree.values()));
+    auto* ree_value_comparator =
+        ([&ree_type, &base_ree, &target_ree,
+          inner_values_comparator =
+              std::move(inner_values_comparator)]() mutable -> ValueComparator* {
+          const auto run_end_type_id = ree_type.run_end_type()->id();
+          switch (run_end_type_id) {
+            case Type::INT16:
+              return new REEValueComparator<int16_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+            case Type::INT32:
+              return new REEValueComparator<int32_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+            default:
+              DCHECK_EQ(run_end_type_id, Type::INT64);
+              return new REEValueComparator<int64_t>(base_ree, target_ree,
+                                                     std::move(inner_values_comparator));
+          }
+        }());
+    comparator_ = std::unique_ptr<ValueComparator>{ree_value_comparator};
+    return Status::OK();

Review Comment:
   This exact change cause `clang-tidy` to ask for `make_unique`, but I found a way around that.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1370859925


##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) {
   return UnitSlice{&array, index};
 }
 
-using ValueComparator = std::function<bool(const Array&, int64_t, const Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+  virtual ~ValueComparator() = default;
+
+  /// \brief Compare the validity and values at the given indices in the base and target
+  /// arrays.
+  ///
+  /// \param base_index The index in the base array.
+  /// \param target_index The index in the target array.
+  /// \return true if the values at the given indices are equal, false otherwise.
+  /// \pre base_index and target_index are valid indices in their respective arrays.
+  virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+  /// \brief Return the run length of equal values starting at the given indices in the
+  /// base and target arrays.
+  ///
+  /// \param base_index The starting index in the base array.
+  /// \param base_length The length of the base array.
+  /// \param target_index The starting index in the target array.
+  /// \param target_length The length of the target array.
+  /// \return The run length of equal values starting at the given indices in the base
+  /// and target arrays.
+  virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                        int64_t target_index, int64_t target_length) {
+    int64_t run_length_of_equals = 0;
+    while (base_index < base_length && target_index < target_length) {
+      if (!Equals(base_index, target_index)) {
+        break;
+      }
+      base_index += 1;
+      target_index += 1;
+      run_length_of_equals += 1;
+    }
+    return run_length_of_equals;
+  }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+  const ArrayType& base;
+  const ArrayType& target;
+
+  DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+      : base(base), target(target) {}
+
+  ~DefaultValueComparator() override = default;
+
+  /// \brief Compare validity bits of base[base_index] and target[target_index]
+  ///
+  /// \return std::nullopt if the validity bits differ, otherwise a bool
+  /// containing the validity bit found in both arrays.
+  static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t base_index,
+                                              const ArrayType& target,
+                                              int64_t target_index) {
+    const bool base_valid = base.IsValid(base_index);
+    const bool target_valid = target.IsValid(target_index);
+    return base_valid ^ target_valid ? std::nullopt : std::optional<bool>{base_valid};
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const auto valid = ValidityIfEquals(base, base_index, target, target_index);

Review Comment:
   `ValidityIfEquals` is now removed. Pushing soon.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.
bkietz commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1373553556


##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) {
   return UnitSlice{&array, index};
 }
 
-using ValueComparator = std::function<bool(const Array&, int64_t, const Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+  virtual ~ValueComparator() = default;
+
+  /// \brief Compare the validity and values at the given indices in the base and target
+  /// arrays.
+  ///
+  /// \param base_index The index in the base array.
+  /// \param target_index The index in the target array.
+  /// \return true if the values at the given indices are equal, false otherwise.
+  /// \pre base_index and target_index are valid indices in their respective arrays.
+  virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+  /// \brief Return the run length of equal values starting at the given indices in the
+  /// base and target arrays.
+  ///
+  /// \param base_index The starting index in the base array.
+  /// \param base_length The length of the base array.
+  /// \param target_index The starting index in the target array.
+  /// \param target_length The length of the target array.
+  /// \return The run length of equal values starting at the given indices in the base
+  /// and target arrays.
+  virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                        int64_t target_index, int64_t target_length) {
+    int64_t run_length_of_equals = 0;
+    while (base_index < base_length && target_index < target_length) {
+      if (!Equals(base_index, target_index)) {
+        break;
+      }
+      base_index += 1;
+      target_index += 1;
+      run_length_of_equals += 1;
+    }
+    return run_length_of_equals;
+  }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+  const ArrayType& base;
+  const ArrayType& target;
+
+  DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+      : base(base), target(target) {}
+
+  ~DefaultValueComparator() override = default;
+
+  /// \brief Compare validity bits of base[base_index] and target[target_index]
+  ///
+  /// \return std::nullopt if the validity bits differ, otherwise a bool
+  /// containing the validity bit found in both arrays.
+  static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t base_index,
+                                              const ArrayType& target,
+                                              int64_t target_index) {
+    const bool base_valid = base.IsValid(base_index);
+    const bool target_valid = target.IsValid(target_index);
+    return base_valid ^ target_valid ? std::nullopt : std::optional<bool>{base_valid};
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const auto valid = ValidityIfEquals(base, base_index, target, target_index);
+    if (valid.has_value()) {
+      return *valid ? GetView(base, base_index) == GetView(target, target_index) : true;
+    }
+    return false;
+  }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+  const RunEndEncodedArray& base_;
+  const RunEndEncodedArray& target_;
+  std::unique_ptr<ValueComparator> inner_value_comparator_;
+  ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+  ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+  REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray& target,
+                     std::unique_ptr<ValueComparator>&& inner_value_comparator)
+      : base_(base),
+        target_(target),
+        inner_value_comparator_(std::move(inner_value_comparator)),
+        base_physical_index_finder_(*base_.data()),
+        target_physical_index_finder_(*target_.data()) {
+    DCHECK_EQ(*base_.type(), *target_.type());
+  }
+
+  ~REEValueComparator() override = default;
 
-struct ValueComparatorVisitor {
+ private:
+  /// \pre 0 <= i < base_.length()
+  inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+    return base_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  /// \pre 0 <= i < target_.length()
+  inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+    return target_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  const RunEndCType* base_run_ends() { return base_physical_index_finder_.run_ends; }
+
+  const RunEndCType* target_run_ends() { return target_physical_index_finder_.run_ends; }
+
+ public:
+  int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                int64_t target_index, int64_t target_length) override {
+    // Ensure the first search for physical index on the values arrays is safe.
+    if (base_index >= base_length || target_index >= target_length) {
+      // Without values on either side, there is no run of equal values.
+      return 0;
+    }
+
+    // Translate the two logical indices into physical indices.
+    int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+    int64_t run_length_of_equals = 0;
+    // The loop invariant (base_index < base_length && target_index < target_length)
+    // is valid when the loop starts because of the check above.
+    for (;;) {
+      const auto base_run_end =
+          static_cast<int64_t>(base_run_ends()[physical_base_index]) - base_.offset();
+      const auto target_run_end =
+          static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+          target_.offset();
+      // The end of the runs containing the logical indices, by definition, ends
+      // after the logical indices.
+      DCHECK_LT(base_index, base_run_end);
+      DCHECK_LT(target_index, target_run_end);
+
+      // Compare the physical values that make up the runs containing base_index
+      // and target_index.
+      if (!inner_value_comparator_->Equals(physical_base_index, physical_target_index)) {
+        // First difference found, stop because the run of equal values cannot
+        // be extended further.
+        break;
+      }
+
+      const int64_t base_run = std::min(base_run_end, base_length) - base_index;
+      const int64_t target_run = std::min(target_run_end, target_length) - target_index;
+      // Due to the loop-invariant (base_index < base_length && target_index <
+      // target_length) and properties of the run-ends asserted above, both base_run and
+      // target_run are strictly greater than zero.
+      DCHECK_GT(base_run, 0);
+      DCHECK_GT(target_run, 0);
+
+      int64_t increment = 0;
+      if (base_run < target_run) {
+        increment = base_run;
+        physical_base_index += 1;  // skip to next run on base
+      } else if (base_run > target_run) {
+        increment = target_run;
+        physical_target_index += 1;  // skip to next run on target
+      } else {
+        increment = base_run;
+        // skip to next run on both base and target
+        physical_base_index += 1;
+        physical_target_index += 1;
+      }

Review Comment:
   ```suggestion
         /// Advance to the end of whichever run is shorter
         int64_t increment = std::min(base_run, target_run);
         // If we reached the end of the base run, advance to the next base run
         physical_base_index += base_run == increment;
         // If we reached the end of the target run, advance to the next target run
         physical_target_index += target_run == increment;
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] GH-34569: [C++] Diffing of Run-End Encoded arrays [arrow]

Posted by "felipecrv (via GitHub)" <gi...@apache.org>.
felipecrv commented on code in PR #35003:
URL: https://github.com/apache/arrow/pull/35003#discussion_r1373691004


##########
cpp/src/arrow/array/diff.cc:
##########
@@ -96,45 +99,260 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) {
   return UnitSlice{&array, index};
 }
 
-using ValueComparator = std::function<bool(const Array&, int64_t, const Array&, int64_t)>;
+/// \brief A simple virtual comparator interface for two arrays.
+///
+/// The base and target array ara bound at construction time. Then
+/// Equals(base_index, target_index) should return true if the values
+/// at the given indices are equal.
+struct ValueComparator {
+  virtual ~ValueComparator() = default;
+
+  /// \brief Compare the validity and values at the given indices in the base and target
+  /// arrays.
+  ///
+  /// \param base_index The index in the base array.
+  /// \param target_index The index in the target array.
+  /// \return true if the values at the given indices are equal, false otherwise.
+  /// \pre base_index and target_index are valid indices in their respective arrays.
+  virtual bool Equals(int64_t base_index, int64_t target_index) = 0;
+
+  /// \brief Return the run length of equal values starting at the given indices in the
+  /// base and target arrays.
+  ///
+  /// \param base_index The starting index in the base array.
+  /// \param base_length The length of the base array.
+  /// \param target_index The starting index in the target array.
+  /// \param target_length The length of the target array.
+  /// \return The run length of equal values starting at the given indices in the base
+  /// and target arrays.
+  virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                        int64_t target_index, int64_t target_length) {
+    int64_t run_length_of_equals = 0;
+    while (base_index < base_length && target_index < target_length) {
+      if (!Equals(base_index, target_index)) {
+        break;
+      }
+      base_index += 1;
+      target_index += 1;
+      run_length_of_equals += 1;
+    }
+    return run_length_of_equals;
+  }
+};
+
+template <typename ArrayType>
+struct DefaultValueComparator : public ValueComparator {
+  const ArrayType& base;
+  const ArrayType& target;
+
+  DefaultValueComparator(const ArrayType& base, const ArrayType& target)
+      : base(base), target(target) {}
+
+  ~DefaultValueComparator() override = default;
+
+  /// \brief Compare validity bits of base[base_index] and target[target_index]
+  ///
+  /// \return std::nullopt if the validity bits differ, otherwise a bool
+  /// containing the validity bit found in both arrays.
+  static std::optional<bool> ValidityIfEquals(const ArrayType& base, int64_t base_index,
+                                              const ArrayType& target,
+                                              int64_t target_index) {
+    const bool base_valid = base.IsValid(base_index);
+    const bool target_valid = target.IsValid(target_index);
+    return base_valid ^ target_valid ? std::nullopt : std::optional<bool>{base_valid};
+  }
+
+  bool Equals(int64_t base_index, int64_t target_index) override {
+    const auto valid = ValidityIfEquals(base, base_index, target, target_index);
+    if (valid.has_value()) {
+      return *valid ? GetView(base, base_index) == GetView(target, target_index) : true;
+    }
+    return false;
+  }
+};
+
+template <typename RunEndCType>
+class REEValueComparator : public ValueComparator {
+ private:
+  const RunEndEncodedArray& base_;
+  const RunEndEncodedArray& target_;
+  std::unique_ptr<ValueComparator> inner_value_comparator_;
+  ree_util::PhysicalIndexFinder<RunEndCType> base_physical_index_finder_;
+  ree_util::PhysicalIndexFinder<RunEndCType> target_physical_index_finder_;
+
+ public:
+  REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray& target,
+                     std::unique_ptr<ValueComparator>&& inner_value_comparator)
+      : base_(base),
+        target_(target),
+        inner_value_comparator_(std::move(inner_value_comparator)),
+        base_physical_index_finder_(*base_.data()),
+        target_physical_index_finder_(*target_.data()) {
+    DCHECK_EQ(*base_.type(), *target_.type());
+  }
+
+  ~REEValueComparator() override = default;
 
-struct ValueComparatorVisitor {
+ private:
+  /// \pre 0 <= i < base_.length()
+  inline int64_t FindPhysicalIndexOnBase(int64_t i) {
+    return base_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  /// \pre 0 <= i < target_.length()
+  inline int64_t FindPhysicalIndexOnTarget(int64_t i) {
+    return target_physical_index_finder_.FindPhysicalIndex(i);
+  }
+
+  const RunEndCType* base_run_ends() { return base_physical_index_finder_.run_ends; }
+
+  const RunEndCType* target_run_ends() { return target_physical_index_finder_.run_ends; }
+
+ public:
+  int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length,
+                                int64_t target_index, int64_t target_length) override {
+    // Ensure the first search for physical index on the values arrays is safe.
+    if (base_index >= base_length || target_index >= target_length) {
+      // Without values on either side, there is no run of equal values.
+      return 0;
+    }
+
+    // Translate the two logical indices into physical indices.
+    int64_t physical_base_index = FindPhysicalIndexOnBase(base_index);
+    int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index);
+
+    int64_t run_length_of_equals = 0;
+    // The loop invariant (base_index < base_length && target_index < target_length)
+    // is valid when the loop starts because of the check above.
+    for (;;) {
+      const auto base_run_end =
+          static_cast<int64_t>(base_run_ends()[physical_base_index]) - base_.offset();
+      const auto target_run_end =
+          static_cast<int64_t>(target_run_ends()[physical_target_index]) -
+          target_.offset();
+      // The end of the runs containing the logical indices, by definition, ends
+      // after the logical indices.
+      DCHECK_LT(base_index, base_run_end);
+      DCHECK_LT(target_index, target_run_end);
+
+      // Compare the physical values that make up the runs containing base_index
+      // and target_index.
+      if (!inner_value_comparator_->Equals(physical_base_index, physical_target_index)) {
+        // First difference found, stop because the run of equal values cannot
+        // be extended further.
+        break;
+      }
+
+      const int64_t base_run = std::min(base_run_end, base_length) - base_index;
+      const int64_t target_run = std::min(target_run_end, target_length) - target_index;
+      // Due to the loop-invariant (base_index < base_length && target_index <
+      // target_length) and properties of the run-ends asserted above, both base_run and
+      // target_run are strictly greater than zero.
+      DCHECK_GT(base_run, 0);
+      DCHECK_GT(target_run, 0);
+
+      int64_t increment = 0;
+      if (base_run < target_run) {
+        increment = base_run;
+        physical_base_index += 1;  // skip to next run on base
+      } else if (base_run > target_run) {
+        increment = target_run;
+        physical_target_index += 1;  // skip to next run on target
+      } else {
+        increment = base_run;
+        // skip to next run on both base and target
+        physical_base_index += 1;
+        physical_target_index += 1;
+      }

Review Comment:
   Pushing a commit with different language on the comments.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org