You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/11/18 10:01:11 UTC

[GitHub] [arrow] pitrou opened a new pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

pitrou opened a new pull request #8703:
URL: https://github.com/apache/arrow/pull/8703


   ArrayEquals now defers to ArrayRangeEquals under the hood.
   ArrayRangeEquals now allows passing an EqualOptions argument.
   Also adds ArrayRangeApproxEquals.
   
   Comparison speed is massively improved on many input types:
   ```
                                         benchmark            baseline           contender   change %                                                                                                                                                                               counters
   26               ArrayRangeEqualsStruct/32768/0    6.338m items/sec  797.926m items/sec  12490.248                 {'run_name': 'ArrayRangeEqualsStruct/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 135, 'null_percent': 0.0}
   16              ArrayRangeEqualsBoolean/32768/0  839.237m items/sec   51.203b items/sec   6001.168              {'run_name': 'ArrayRangeEqualsBoolean/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 17929, 'null_percent': 0.0}
   28      ArrayRangeEqualsFixedSizeBinary/32768/0  369.542m items/sec   14.798b items/sec   3904.348       {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 8130, 'null_percent': 0.0}
   24           ArrayRangeEqualsStruct/32768/10000    6.251m items/sec  240.453m items/sec   3746.338            {'run_name': 'ArrayRangeEqualsStruct/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 134, 'null_percent': 0.01}
   36               ArrayRangeEqualsStruct/32768/1  412.074m items/sec   13.733b items/sec   3232.616              {'run_name': 'ArrayRangeEqualsStruct/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 8817, 'null_percent': 100.0}
   9                ArrayRangeEqualsString/32768/0   67.419m items/sec    1.937b items/sec   2772.931                {'run_name': 'ArrayRangeEqualsString/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 1441, 'null_percent': 0.0}
   25          ArrayRangeEqualsListOfInt32/32768/1  524.231m items/sec   13.774b items/sec   2527.447        {'run_name': 'ArrayRangeEqualsListOfInt32/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 11179, 'null_percent': 100.0}
   44               ArrayRangeEqualsString/32768/1  577.902m items/sec   13.686b items/sec   2268.185             {'run_name': 'ArrayRangeEqualsString/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 12328, 'null_percent': 100.0}
   53      ArrayRangeEqualsFixedSizeBinary/32768/1  596.284m items/sec   13.140b items/sec   2103.587    {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 12703, 'null_percent': 100.0}
   11           ArrayRangeEqualsString/32768/10000   67.501m items/sec    1.382b items/sec   1947.876           {'run_name': 'ArrayRangeEqualsString/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 1442, 'null_percent': 0.01}
   46          ArrayRangeEqualsListOfInt32/32768/0   42.696m items/sec  833.958m items/sec   1853.229            {'run_name': 'ArrayRangeEqualsListOfInt32/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 911, 'null_percent': 0.0}
   14              ArrayRangeEqualsBoolean/32768/1  698.866m items/sec   13.374b items/sec   1813.652            {'run_name': 'ArrayRangeEqualsBoolean/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 14929, 'null_percent': 100.0}
   38                ArrayRangeEqualsInt32/32768/1  835.824m items/sec   13.688b items/sec   1537.720              {'run_name': 'ArrayRangeEqualsInt32/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 17818, 'null_percent': 100.0}
   29  ArrayRangeEqualsFixedSizeBinary/32768/10000  278.025m items/sec    4.375b items/sec   1473.475  {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5937, 'null_percent': 0.01}
   17                ArrayRangeEqualsInt32/32768/0    2.104b items/sec   32.033b items/sec   1422.785                {'run_name': 'ArrayRangeEqualsInt32/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 44741, 'null_percent': 0.0}
   18              ArrayRangeEqualsFloat32/32768/1  838.934m items/sec   12.432b items/sec   1381.896            {'run_name': 'ArrayRangeEqualsFloat32/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 17895, 'null_percent': 100.0}
   19            ArrayRangeEqualsInt32/32768/10000  757.797m items/sec    6.848b items/sec    803.693           {'run_name': 'ArrayRangeEqualsInt32/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 16177, 'null_percent': 0.01}
   15      ArrayRangeEqualsListOfInt32/32768/10000   28.438m items/sec  210.226m items/sec    639.253       {'run_name': 'ArrayRangeEqualsListOfInt32/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 606, 'null_percent': 0.01}
   39          ArrayRangeEqualsFloat32/32768/10000  694.965m items/sec    4.706b items/sec    577.178         {'run_name': 'ArrayRangeEqualsFloat32/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 14833, 'null_percent': 0.01}
   3           ArrayRangeEqualsBoolean/32768/10000  538.353m items/sec    2.968b items/sec    451.394         {'run_name': 'ArrayRangeEqualsBoolean/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 11475, 'null_percent': 0.01}
   41              ArrayRangeEqualsFloat32/32768/0    2.036b items/sec   10.235b items/sec    402.761              {'run_name': 'ArrayRangeEqualsFloat32/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 43978, 'null_percent': 0.0}
   23             ArrayRangeEqualsStruct/32768/100    6.298m items/sec   26.042m items/sec    313.515               {'run_name': 'ArrayRangeEqualsStruct/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 133, 'null_percent': 1.0}
   1              ArrayRangeEqualsString/32768/100   68.074m items/sec  275.727m items/sec    305.039              {'run_name': 'ArrayRangeEqualsString/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 1453, 'null_percent': 1.0}
   42          ArrayRangeEqualsSparseUnion/32768/0   12.649m items/sec   38.360m items/sec    203.256            {'run_name': 'ArrayRangeEqualsSparseUnion/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 274, 'null_percent': 0.0}
   45           ArrayRangeEqualsDenseUnion/32768/0   12.995m items/sec   38.503m items/sec    196.287             {'run_name': 'ArrayRangeEqualsDenseUnion/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 277, 'null_percent': 0.0}
   7     ArrayRangeEqualsFixedSizeBinary/32768/100  274.991m items/sec  633.253m items/sec    130.281     {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5864, 'null_percent': 1.0}
   13              ArrayRangeEqualsInt32/32768/100  729.634m items/sec    1.644b items/sec    125.256              {'run_name': 'ArrayRangeEqualsInt32/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 15582, 'null_percent': 1.0}
   12              ArrayRangeEqualsStruct/32768/10    6.948m items/sec   14.273m items/sec    105.427               {'run_name': 'ArrayRangeEqualsStruct/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 147, 'null_percent': 10.0}
   34               ArrayRangeEqualsStruct/32768/2   11.923m items/sec   23.845m items/sec     99.993                {'run_name': 'ArrayRangeEqualsStruct/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 255, 'null_percent': 50.0}
   10              ArrayRangeEqualsString/32768/10   73.461m items/sec  146.598m items/sec     99.558              {'run_name': 'ArrayRangeEqualsString/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 1568, 'null_percent': 10.0}
   48            ArrayRangeEqualsBoolean/32768/100  525.635m items/sec    1.026b items/sec     95.102            {'run_name': 'ArrayRangeEqualsBoolean/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 11216, 'null_percent': 1.0}
   21          ArrayRangeEqualsSparseUnion/32768/1   14.032m items/sec   26.225m items/sec     86.889          {'run_name': 'ArrayRangeEqualsSparseUnion/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 300, 'null_percent': 100.0}
   47          ArrayRangeEqualsSparseUnion/32768/2   12.643m items/sec   23.030m items/sec     82.160           {'run_name': 'ArrayRangeEqualsSparseUnion/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 272, 'null_percent': 50.0}
   43      ArrayRangeEqualsSparseUnion/32768/10000   12.596m items/sec   22.801m items/sec     81.018       {'run_name': 'ArrayRangeEqualsSparseUnion/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 264, 'null_percent': 0.01}
   52         ArrayRangeEqualsSparseUnion/32768/10   12.717m items/sec   22.911m items/sec     80.168          {'run_name': 'ArrayRangeEqualsSparseUnion/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 272, 'null_percent': 10.0}
   0         ArrayRangeEqualsSparseUnion/32768/100   12.783m items/sec   22.714m items/sec     77.694          {'run_name': 'ArrayRangeEqualsSparseUnion/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 275, 'null_percent': 1.0}
   31           ArrayRangeEqualsDenseUnion/32768/1   14.509m items/sec   25.576m items/sec     76.279           {'run_name': 'ArrayRangeEqualsDenseUnion/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 309, 'null_percent': 100.0}
   20       ArrayRangeEqualsDenseUnion/32768/10000   13.193m items/sec   22.447m items/sec     70.152        {'run_name': 'ArrayRangeEqualsDenseUnion/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 280, 'null_percent': 0.01}
   35         ArrayRangeEqualsDenseUnion/32768/100   13.204m items/sec   22.256m items/sec     68.561           {'run_name': 'ArrayRangeEqualsDenseUnion/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 282, 'null_percent': 1.0}
   40          ArrayRangeEqualsDenseUnion/32768/10   13.183m items/sec   22.191m items/sec     68.338           {'run_name': 'ArrayRangeEqualsDenseUnion/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 279, 'null_percent': 10.0}
   4            ArrayRangeEqualsDenseUnion/32768/2   13.148m items/sec   21.996m items/sec     67.297            {'run_name': 'ArrayRangeEqualsDenseUnion/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 280, 'null_percent': 50.0}
   22            ArrayRangeEqualsFloat32/32768/100  671.171m items/sec    1.097b items/sec     63.414            {'run_name': 'ArrayRangeEqualsFloat32/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 14325, 'null_percent': 1.0}
   30               ArrayRangeEqualsString/32768/2   94.060m items/sec  149.539m items/sec     58.983               {'run_name': 'ArrayRangeEqualsString/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 2008, 'null_percent': 50.0}
   37     ArrayRangeEqualsFixedSizeBinary/32768/10  250.975m items/sec  305.864m items/sec     21.870     {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5341, 'null_percent': 10.0}
   32               ArrayRangeEqualsInt32/32768/10  607.891m items/sec  676.139m items/sec     11.227              {'run_name': 'ArrayRangeEqualsInt32/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 12895, 'null_percent': 10.0}
   8       ArrayRangeEqualsFixedSizeBinary/32768/2  187.404m items/sec  207.700m items/sec     10.830      {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 3971, 'null_percent': 50.0}
   49             ArrayRangeEqualsBoolean/32768/10  468.989m items/sec  518.521m items/sec     10.561             {'run_name': 'ArrayRangeEqualsBoolean/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 9996, 'null_percent': 10.0}
   5                 ArrayRangeEqualsInt32/32768/2  259.891m items/sec  270.547m items/sec      4.100                {'run_name': 'ArrayRangeEqualsInt32/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5505, 'null_percent': 50.0}
   33              ArrayRangeEqualsFloat32/32768/2  262.994m items/sec  268.155m items/sec      1.962              {'run_name': 'ArrayRangeEqualsFloat32/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5580, 'null_percent': 50.0}
   50             ArrayRangeEqualsFloat32/32768/10  577.200m items/sec  573.157m items/sec     -0.701            {'run_name': 'ArrayRangeEqualsFloat32/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 12270, 'null_percent': 10.0}
   2               ArrayRangeEqualsBoolean/32768/2  242.498m items/sec  230.947m items/sec     -4.763              {'run_name': 'ArrayRangeEqualsBoolean/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5152, 'null_percent': 50.0}
   27        ArrayRangeEqualsListOfInt32/32768/100   28.080m items/sec   25.874m items/sec     -7.855          {'run_name': 'ArrayRangeEqualsListOfInt32/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 598, 'null_percent': 1.0}
   51          ArrayRangeEqualsListOfInt32/32768/2   18.613m items/sec   12.136m items/sec    -34.798           {'run_name': 'ArrayRangeEqualsListOfInt32/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 397, 'null_percent': 50.0}
   6          ArrayRangeEqualsListOfInt32/32768/10   26.194m items/sec   12.610m items/sec    -51.859          {'run_name': 'ArrayRangeEqualsListOfInt32/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 560, 'null_percent': 10.0}
   ```


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528212877



##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeBinaryArray& left) {
-    const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+  Status Visit(const FloatType& type) { return CompareFloating(type); }
 
-    int32_t width = left.byte_width();
+  Status Visit(const DoubleType& type) { return CompareFloating(type); }
 
-    const uint8_t* left_data = nullptr;
-    const uint8_t* right_data = nullptr;
+  // Also matches StringType
+  Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
-    if (left.values()) {
-      left_data = left.raw_values();
-    }
+  // Also matches LargeStringType
+  Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
-    if (right.values()) {
-      right_data = right.raw_values();
-    }
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        result_ = false;
-        return Status::OK();
-      }
-      if (is_null) continue;
+  Status Visit(const FixedSizeBinaryType& type) {
+    const auto byte_width = type.byte_width();
+    const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
 
-      if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
-        result_ = false;
-        return Status::OK();
-      }
+    if (left_data != nullptr && right_data != nullptr) {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+        return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+                      right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+                      length * byte_width) == 0;
+      };
+      VisitValidRuns(compare_runs);
+    } else {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+      VisitValidRuns(compare_runs);
     }
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const Decimal128Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  // Also matches MapType
+  Status Visit(const ListType& type) { return CompareList(type); }
 
-  Status Visit(const Decimal256Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  Status Visit(const LargeListType& type) { return CompareList(type); }
 
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
-    return Status::OK();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
-      const T& left) {
-    return CompareValues<T>(left);
-  }
+  Status Visit(const FixedSizeListType& type) {
+    const auto list_size = type.list_size();
+    const ArrayData& left_data = *left_.child_data[0];
+    const ArrayData& right_data = *right_.child_data[0];
 
-  Status Visit(const ListArray& left) {
-    result_ = CompareLists(left);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+                               (left_start_idx_ + left_.offset + i) * list_size,
+                               (right_start_idx_ + right_.offset + i) * list_size,
+                               length * list_size);
+      return impl.Compare();
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const LargeListArray& left) {
-    result_ = CompareLists(left);
-    return Status::OK();
-  }
+  Status Visit(const StructType& type) {
+    const int32_t num_fields = type.num_fields();
 
-  Status Visit(const FixedSizeListArray& left) {
-    const auto& right = checked_cast<const FixedSizeListArray&>(right_);
-    result_ = left.values()->RangeEquals(
-        left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
-        right.value_offset(right_start_idx_), right.values());
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      for (int32_t f = 0; f < num_fields; ++f) {
+        RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+                                 *right_.child_data[f],
+                                 left_start_idx_ + left_.offset + i,
+                                 right_start_idx_ + right_.offset + i, length);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const MapArray& left) {
-    result_ = CompareMaps(left);
-    return Status::OK();
-  }
+  Status Visit(const SparseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
-  Status Visit(const StructArray& left) {
-    result_ = CompareStructs(left);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        // XXX can we instead detect runs of same-child union values?
+        RangeDataEqualsImpl impl(
+            options_, floating_approximate_, *left_.child_data[child_num],
+            *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+            right_start_idx_ + right_.offset + j, 1);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    });
     return Status::OK();
   }
 
-  Status Visit(const UnionArray& left) {
-    result_ = CompareUnions(left);
+  Status Visit(const DenseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
+    const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+    const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        RangeDataEqualsImpl impl(
+            options_, floating_approximate_, *left_.child_data[child_num],
+            *right_.child_data[child_num], left_offsets[left_start_idx_ + j],
+            right_offsets[right_start_idx_ + j], 1);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    });
     return Status::OK();
   }
 
-  Status Visit(const DictionaryArray& left) {
-    const auto& right = checked_cast<const DictionaryArray&>(right_);
-    if (!left.dictionary()->Equals(right.dictionary())) {
-      result_ = false;
-      return Status::OK();
+  Status Visit(const DictionaryType& type) {
+    // Compare dictionaries
+    result_ &= CompareArrayRanges(
+        *left_.dictionary, *right_.dictionary,
+        /*left_start_idx=*/0,
+        /*left_end_idx=*/std::max(left_.dictionary->length, right_.dictionary->length),
+        /*right_start_idx=*/0, options_, floating_approximate_);
+    if (result_) {
+      // Compare indices
+      result_ &= CompareWithType(*type.index_type());
     }
-    result_ = left.indices()->RangeEquals(left_start_idx_, left_end_idx_,
-                                          right_start_idx_, right.indices());
     return Status::OK();
   }
 
-  Status Visit(const ExtensionArray& left) {
-    result_ = (right_.type()->Equals(*left.type()) &&
-               ArrayRangeEquals(*left.storage(),
-                                *static_cast<const ExtensionArray&>(right_).storage(),
-                                left_start_idx_, left_end_idx_, right_start_idx_));
+  Status Visit(const ExtensionType& type) {
+    // Compare storages
+    result_ &= CompareWithType(*type.storage_type());
     return Status::OK();
   }
 
-  bool result() const { return result_; }
-
  protected:
-  const Array& right_;
-  int64_t left_start_idx_;
-  int64_t left_end_idx_;
-  int64_t right_start_idx_;
-
-  bool result_;
-};
-
-static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& right) {
-  const int byte_width = internal::GetByteWidth(*left.type());
-
-  const uint8_t* left_data = nullptr;
-  const uint8_t* right_data = nullptr;
-
-  if (left.values()) {
-    left_data = left.values()->data() + left.offset() * byte_width;
+  template <typename TypeClass, typename CType = typename TypeClass::c_type>
+  Status ComparePrimitive(const TypeClass&) {
+    const CType* left_values = left_.GetValues<CType>(1);
+    const CType* right_values = right_.GetValues<CType>(1);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      return memcmp(left_values + left_start_idx_ + i,
+                    right_values + right_start_idx_ + i, length * sizeof(CType)) == 0;
+    });
+    return Status::OK();
   }
 
-  if (right.values()) {
-    right_data = right.values()->data() + right.offset() * byte_width;
-  }
+  template <typename TypeClass>
+  Status CompareFloating(const TypeClass&) {
+    using T = typename TypeClass::c_type;
+    const T* left_values = left_.GetValues<T>(1);
+    const T* right_values = right_.GetValues<T>(1);
 
-  if (byte_width == 0) {
-    // Special case 0-width data, as the data pointers may be null
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i) != right.IsNull(i)) {
-        return false;
-      }
-    }
-    return true;
-  } else if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      const bool left_null = left.IsNull(i);
-      const bool right_null = right.IsNull(i);
-      if (left_null != right_null) {
-        return false;
+    if (floating_approximate_) {
+      const T epsilon = static_cast<T>(options_.atol());
+      if (options_.nans_equal()) {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
+        });
+      } else {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return (fabs(x - y) <= epsilon) || (x == y);
+        });
       }
-      if (!left_null && memcmp(left_data, right_data, byte_width) != 0) {
-        return false;
+    } else {
+      if (options_.nans_equal()) {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return (x == y) || (std::isnan(x) && std::isnan(y));
+        });
+      } else {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return x == y;
+        });
       }
-      left_data += byte_width;
-      right_data += byte_width;
     }
-    return true;
-  } else {
-    auto number_of_bytes_to_compare = static_cast<size_t>(byte_width * left.length());
-    return memcmp(left_data, right_data, number_of_bytes_to_compare) == 0;
-  }
-}
-
-// A bit confusing: ArrayEqualsVisitor inherits from RangeEqualsVisitor but
-// doesn't share the same preconditions.
-// When RangeEqualsVisitor is called, we only know the range sizes equal.
-// When ArrayEqualsVisitor is called, we know the sizes and null bitmaps are equal.
-
-class ArrayEqualsVisitor : public RangeEqualsVisitor {
- public:
-  explicit ArrayEqualsVisitor(const Array& right, const EqualOptions& opts)
-      : RangeEqualsVisitor(right, 0, right.length(), 0), opts_(opts) {}
-
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const BooleanArray& left) {
-    const auto& right = checked_cast<const BooleanArray&>(right_);
+  template <typename TypeClass>
+  Status CompareBinary(const TypeClass&) {
+    const uint8_t* left_data = left_.GetValues<uint8_t>(2, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(2, 0);
 
-    if (left.null_count() > 0) {
-      const uint8_t* left_data = left.values()->data();
-      const uint8_t* right_data = right.values()->data();
-
-      for (int64_t i = 0; i < left.length(); ++i) {
-        if (left.IsValid(i) && BitUtil::GetBit(left_data, i + left.offset()) !=
-                                   BitUtil::GetBit(right_data, i + right.offset())) {
-          result_ = false;
-          return Status::OK();
-        }
-      }
-      result_ = true;
+    if (left_data != nullptr && right_data != nullptr) {
+      const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+                                      int64_t length) -> bool {
+        return memcmp(left_data + left_offset, right_data + right_offset, length) == 0;
+      };
+      CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
     } else {
-      result_ = BitmapEquals(left.values()->data(), left.offset(), right.values()->data(),
-                             right.offset(), left.length());
+      // One of the arrays is an array of empty strings and nulls.
+      // We just need to compare the offsets.
+      // (note we must not call memcmp() with null data pointers)
+      const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+                                      int64_t length) -> bool { return true; };
+      CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);

Review comment:
       Yes, `...` can be used to ignore any trivially constructible argument(s).




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210676



##########
File path: cpp/src/arrow/util/bitmap_reader.h
##########
@@ -69,6 +69,77 @@ class BitmapReader {
   int64_t bit_offset_;
 };
 
+// XXX Cannot name it BitmapWordReader because the name is already used
+// in bitmap_ops.cc

Review comment:
       Yeah, I think that would be useful. I should open JIRAs for the various XXX and TODOs here.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528211002



##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeBinaryArray& left) {
-    const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+  Status Visit(const FloatType& type) { return CompareFloating(type); }
 
-    int32_t width = left.byte_width();
+  Status Visit(const DoubleType& type) { return CompareFloating(type); }
 
-    const uint8_t* left_data = nullptr;
-    const uint8_t* right_data = nullptr;
+  // Also matches StringType
+  Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
-    if (left.values()) {
-      left_data = left.raw_values();
-    }
+  // Also matches LargeStringType
+  Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
-    if (right.values()) {
-      right_data = right.raw_values();
-    }
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        result_ = false;
-        return Status::OK();
-      }
-      if (is_null) continue;
+  Status Visit(const FixedSizeBinaryType& type) {
+    const auto byte_width = type.byte_width();
+    const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
 
-      if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
-        result_ = false;
-        return Status::OK();
-      }
+    if (left_data != nullptr && right_data != nullptr) {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+        return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+                      right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+                      length * byte_width) == 0;
+      };
+      VisitValidRuns(compare_runs);
+    } else {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+      VisitValidRuns(compare_runs);
     }
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const Decimal128Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  // Also matches MapType
+  Status Visit(const ListType& type) { return CompareList(type); }
 
-  Status Visit(const Decimal256Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  Status Visit(const LargeListType& type) { return CompareList(type); }
 
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
-    return Status::OK();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
-      const T& left) {
-    return CompareValues<T>(left);
-  }
+  Status Visit(const FixedSizeListType& type) {
+    const auto list_size = type.list_size();
+    const ArrayData& left_data = *left_.child_data[0];
+    const ArrayData& right_data = *right_.child_data[0];
 
-  Status Visit(const ListArray& left) {
-    result_ = CompareLists(left);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+                               (left_start_idx_ + left_.offset + i) * list_size,
+                               (right_start_idx_ + right_.offset + i) * list_size,
+                               length * list_size);
+      return impl.Compare();
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const LargeListArray& left) {
-    result_ = CompareLists(left);
-    return Status::OK();
-  }
+  Status Visit(const StructType& type) {
+    const int32_t num_fields = type.num_fields();
 
-  Status Visit(const FixedSizeListArray& left) {
-    const auto& right = checked_cast<const FixedSizeListArray&>(right_);
-    result_ = left.values()->RangeEquals(
-        left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
-        right.value_offset(right_start_idx_), right.values());
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      for (int32_t f = 0; f < num_fields; ++f) {
+        RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+                                 *right_.child_data[f],
+                                 left_start_idx_ + left_.offset + i,
+                                 right_start_idx_ + right_.offset + i, length);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const MapArray& left) {
-    result_ = CompareMaps(left);
-    return Status::OK();
-  }
+  Status Visit(const SparseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
-  Status Visit(const StructArray& left) {
-    result_ = CompareStructs(left);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        // XXX can we instead detect runs of same-child union values?

Review comment:
       Right, though the question is whether this would improve or worsen common cases (I have no idea how unions are used in the wild).
   In any case, this could be a separate low-priority JIRA (I don't think optimizing unions is in our current priorities).




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#issuecomment-729584943


   https://issues.apache.org/jira/browse/ARROW-10143


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#issuecomment-730355163


   Rebased.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528213657



##########
File path: cpp/src/arrow/array/diff.h
##########
@@ -59,6 +57,27 @@ ARROW_EXPORT
 Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
                                           MemoryPool* pool = default_memory_pool());
 
+/// \brief Compare two array ranges, returning an edit script which expresses the
+/// difference between them
+///
+/// Same as Diff(), but only the ranges defined by the given offsets and lengths
+/// are compared.
+///
+/// \param[in] base baseline for comparison
+/// \param[in] target an array of identical type to base whose elements differ from base's
+/// \param[in] base_offset the start offset of the range to consider inside `base`
+/// \param[in] base_length the length of the range to consider inside `base`
+/// \param[in] target_offset the start offset of the range to consider inside `target`
+/// \param[in] target_length the length of the range to consider inside `target`
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return an edit script array which can be applied to base to produce target
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> DiffRanges(const Array& base, const Array& target,

Review comment:
       Right; if you want to keep this patch smaller, this could be removed.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528211150



##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeBinaryArray& left) {
-    const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+  Status Visit(const FloatType& type) { return CompareFloating(type); }
 
-    int32_t width = left.byte_width();
+  Status Visit(const DoubleType& type) { return CompareFloating(type); }
 
-    const uint8_t* left_data = nullptr;
-    const uint8_t* right_data = nullptr;
+  // Also matches StringType
+  Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
-    if (left.values()) {
-      left_data = left.raw_values();
-    }
+  // Also matches LargeStringType
+  Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
-    if (right.values()) {
-      right_data = right.raw_values();
-    }
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        result_ = false;
-        return Status::OK();
-      }
-      if (is_null) continue;
+  Status Visit(const FixedSizeBinaryType& type) {
+    const auto byte_width = type.byte_width();
+    const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
 
-      if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
-        result_ = false;
-        return Status::OK();
-      }
+    if (left_data != nullptr && right_data != nullptr) {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+        return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+                      right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+                      length * byte_width) == 0;
+      };
+      VisitValidRuns(compare_runs);
+    } else {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+      VisitValidRuns(compare_runs);
     }
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const Decimal128Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  // Also matches MapType
+  Status Visit(const ListType& type) { return CompareList(type); }
 
-  Status Visit(const Decimal256Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  Status Visit(const LargeListType& type) { return CompareList(type); }
 
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
-    return Status::OK();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
-      const T& left) {
-    return CompareValues<T>(left);
-  }
+  Status Visit(const FixedSizeListType& type) {
+    const auto list_size = type.list_size();
+    const ArrayData& left_data = *left_.child_data[0];
+    const ArrayData& right_data = *right_.child_data[0];
 
-  Status Visit(const ListArray& left) {
-    result_ = CompareLists(left);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+                               (left_start_idx_ + left_.offset + i) * list_size,
+                               (right_start_idx_ + right_.offset + i) * list_size,
+                               length * list_size);
+      return impl.Compare();
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const LargeListArray& left) {
-    result_ = CompareLists(left);
-    return Status::OK();
-  }
+  Status Visit(const StructType& type) {
+    const int32_t num_fields = type.num_fields();
 
-  Status Visit(const FixedSizeListArray& left) {
-    const auto& right = checked_cast<const FixedSizeListArray&>(right_);
-    result_ = left.values()->RangeEquals(
-        left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
-        right.value_offset(right_start_idx_), right.values());
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      for (int32_t f = 0; f < num_fields; ++f) {
+        RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+                                 *right_.child_data[f],
+                                 left_start_idx_ + left_.offset + i,
+                                 right_start_idx_ + right_.offset + i, length);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const MapArray& left) {
-    result_ = CompareMaps(left);
-    return Status::OK();
-  }
+  Status Visit(const SparseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
-  Status Visit(const StructArray& left) {
-    result_ = CompareStructs(left);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        // XXX can we instead detect runs of same-child union values?
+        RangeDataEqualsImpl impl(
+            options_, floating_approximate_, *left_.child_data[child_num],
+            *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+            right_start_idx_ + right_.offset + j, 1);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    });
     return Status::OK();
   }
 
-  Status Visit(const UnionArray& left) {
-    result_ = CompareUnions(left);
+  Status Visit(const DenseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
+    const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+    const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];

Review comment:
       I kept the previous semantics of dictionary array comparison, that is: if the dictionaries themselves are unequal, the arrays are considered unequal even if the indices only refer to equal dictionary elements.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] wesm commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
wesm commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r527275663



##########
File path: cpp/src/arrow/ipc/feather_test.cc
##########
@@ -286,10 +286,13 @@ TEST_P(TestFeather, PrimitiveNullRoundTrip) {
     std::vector<std::shared_ptr<Array>> expected_fields;
     for (int i = 0; i < batch->num_columns(); ++i) {
       ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name());
-      StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
-                             batch->column(i)->null_bitmap(),
-                             batch->column(i)->null_count());
-      AssertArraysEqual(str_values, *result->column(i)->chunk(0));
+      ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(utf8(), batch->num_rows()));
+      AssertArraysEqual(*expected, *result->column(i)->chunk(0));
+      //       StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+      //                              batch->column(i)->null_bitmap(),
+      //                              batch->column(i)->null_count());
+      //       AssertArraysEqual(str_values, *result->column(i)->chunk(0),

Review comment:
       I recall this question coming up in the past, but I don't recall the outcome. @xhochy, do you remember?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou closed pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou closed pull request #8703:
URL: https://github.com/apache/arrow/pull/8703


   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210425



##########
File path: cpp/src/arrow/util/bit_run_reader.h
##########
@@ -162,5 +166,7 @@ class ARROW_EXPORT BitRunReader {
 using BitRunReader = BitRunReaderLinear;
 #endif
 
+// TODO SetBitRunReader?
+

Review comment:
       Yes, it would. That would make its logic much simpler and more easily optimized by the compiler (though, of course, numbers will be the final judge).




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r526854313



##########
File path: cpp/src/arrow/ipc/feather_test.cc
##########
@@ -286,10 +286,13 @@ TEST_P(TestFeather, PrimitiveNullRoundTrip) {
     std::vector<std::shared_ptr<Array>> expected_fields;
     for (int i = 0; i < batch->num_columns(); ++i) {
       ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name());
-      StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
-                             batch->column(i)->null_bitmap(),
-                             batch->column(i)->null_count());
-      AssertArraysEqual(str_values, *result->column(i)->chunk(0));
+      ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(utf8(), batch->num_rows()));
+      AssertArraysEqual(*expected, *result->column(i)->chunk(0));
+      //       StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+      //                              batch->column(i)->null_bitmap(),
+      //                              batch->column(i)->null_count());
+      //       AssertArraysEqual(str_values, *result->column(i)->chunk(0),

Review comment:
       @wesm Is constructing a StringArray with null offsets and data supported (it's non-empty but all-nulls)?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528211187



##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeBinaryArray& left) {
-    const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+  Status Visit(const FloatType& type) { return CompareFloating(type); }
 
-    int32_t width = left.byte_width();
+  Status Visit(const DoubleType& type) { return CompareFloating(type); }
 
-    const uint8_t* left_data = nullptr;
-    const uint8_t* right_data = nullptr;
+  // Also matches StringType
+  Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
-    if (left.values()) {
-      left_data = left.raw_values();
-    }
+  // Also matches LargeStringType
+  Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
-    if (right.values()) {
-      right_data = right.raw_values();
-    }
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        result_ = false;
-        return Status::OK();
-      }
-      if (is_null) continue;
+  Status Visit(const FixedSizeBinaryType& type) {
+    const auto byte_width = type.byte_width();
+    const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
 
-      if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
-        result_ = false;
-        return Status::OK();
-      }
+    if (left_data != nullptr && right_data != nullptr) {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+        return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+                      right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+                      length * byte_width) == 0;
+      };
+      VisitValidRuns(compare_runs);
+    } else {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+      VisitValidRuns(compare_runs);
     }
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const Decimal128Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  // Also matches MapType
+  Status Visit(const ListType& type) { return CompareList(type); }
 
-  Status Visit(const Decimal256Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  Status Visit(const LargeListType& type) { return CompareList(type); }
 
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
-    return Status::OK();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
-      const T& left) {
-    return CompareValues<T>(left);
-  }
+  Status Visit(const FixedSizeListType& type) {
+    const auto list_size = type.list_size();
+    const ArrayData& left_data = *left_.child_data[0];
+    const ArrayData& right_data = *right_.child_data[0];
 
-  Status Visit(const ListArray& left) {
-    result_ = CompareLists(left);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+                               (left_start_idx_ + left_.offset + i) * list_size,
+                               (right_start_idx_ + right_.offset + i) * list_size,
+                               length * list_size);
+      return impl.Compare();
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const LargeListArray& left) {
-    result_ = CompareLists(left);
-    return Status::OK();
-  }
+  Status Visit(const StructType& type) {
+    const int32_t num_fields = type.num_fields();
 
-  Status Visit(const FixedSizeListArray& left) {
-    const auto& right = checked_cast<const FixedSizeListArray&>(right_);
-    result_ = left.values()->RangeEquals(
-        left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
-        right.value_offset(right_start_idx_), right.values());
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      for (int32_t f = 0; f < num_fields; ++f) {
+        RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+                                 *right_.child_data[f],
+                                 left_start_idx_ + left_.offset + i,
+                                 right_start_idx_ + right_.offset + i, length);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const MapArray& left) {
-    result_ = CompareMaps(left);
-    return Status::OK();
-  }
+  Status Visit(const SparseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
-  Status Visit(const StructArray& left) {
-    result_ = CompareStructs(left);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        // XXX can we instead detect runs of same-child union values?
+        RangeDataEqualsImpl impl(
+            options_, floating_approximate_, *left_.child_data[child_num],
+            *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+            right_start_idx_ + right_.offset + j, 1);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    });
     return Status::OK();
   }
 
-  Status Visit(const UnionArray& left) {
-    result_ = CompareUnions(left);
+  Status Visit(const DenseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
+    const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+    const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        RangeDataEqualsImpl impl(
+            options_, floating_approximate_, *left_.child_data[child_num],
+            *right_.child_data[child_num], left_offsets[left_start_idx_ + j],
+            right_offsets[right_start_idx_ + j], 1);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    });
     return Status::OK();
   }
 
-  Status Visit(const DictionaryArray& left) {
-    const auto& right = checked_cast<const DictionaryArray&>(right_);
-    if (!left.dictionary()->Equals(right.dictionary())) {
-      result_ = false;
-      return Status::OK();
+  Status Visit(const DictionaryType& type) {
+    // Compare dictionaries
+    result_ &= CompareArrayRanges(
+        *left_.dictionary, *right_.dictionary,
+        /*left_start_idx=*/0,
+        /*left_end_idx=*/std::max(left_.dictionary->length, right_.dictionary->length),
+        /*right_start_idx=*/0, options_, floating_approximate_);
+    if (result_) {
+      // Compare indices
+      result_ &= CompareWithType(*type.index_type());
     }
-    result_ = left.indices()->RangeEquals(left_start_idx_, left_end_idx_,
-                                          right_start_idx_, right.indices());
     return Status::OK();
   }
 
-  Status Visit(const ExtensionArray& left) {
-    result_ = (right_.type()->Equals(*left.type()) &&
-               ArrayRangeEquals(*left.storage(),
-                                *static_cast<const ExtensionArray&>(right_).storage(),
-                                left_start_idx_, left_end_idx_, right_start_idx_));
+  Status Visit(const ExtensionType& type) {
+    // Compare storages
+    result_ &= CompareWithType(*type.storage_type());
     return Status::OK();
   }
 
-  bool result() const { return result_; }
-
  protected:
-  const Array& right_;
-  int64_t left_start_idx_;
-  int64_t left_end_idx_;
-  int64_t right_start_idx_;
-
-  bool result_;
-};
-
-static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& right) {
-  const int byte_width = internal::GetByteWidth(*left.type());
-
-  const uint8_t* left_data = nullptr;
-  const uint8_t* right_data = nullptr;
-
-  if (left.values()) {
-    left_data = left.values()->data() + left.offset() * byte_width;
+  template <typename TypeClass, typename CType = typename TypeClass::c_type>
+  Status ComparePrimitive(const TypeClass&) {
+    const CType* left_values = left_.GetValues<CType>(1);
+    const CType* right_values = right_.GetValues<CType>(1);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      return memcmp(left_values + left_start_idx_ + i,
+                    right_values + right_start_idx_ + i, length * sizeof(CType)) == 0;
+    });
+    return Status::OK();
   }
 
-  if (right.values()) {
-    right_data = right.values()->data() + right.offset() * byte_width;
-  }
+  template <typename TypeClass>
+  Status CompareFloating(const TypeClass&) {
+    using T = typename TypeClass::c_type;
+    const T* left_values = left_.GetValues<T>(1);
+    const T* right_values = right_.GetValues<T>(1);
 
-  if (byte_width == 0) {
-    // Special case 0-width data, as the data pointers may be null
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i) != right.IsNull(i)) {
-        return false;
-      }
-    }
-    return true;
-  } else if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      const bool left_null = left.IsNull(i);
-      const bool right_null = right.IsNull(i);
-      if (left_null != right_null) {
-        return false;
+    if (floating_approximate_) {
+      const T epsilon = static_cast<T>(options_.atol());
+      if (options_.nans_equal()) {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
+        });
+      } else {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return (fabs(x - y) <= epsilon) || (x == y);
+        });
       }
-      if (!left_null && memcmp(left_data, right_data, byte_width) != 0) {
-        return false;
+    } else {
+      if (options_.nans_equal()) {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return (x == y) || (std::isnan(x) && std::isnan(y));
+        });
+      } else {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return x == y;
+        });
       }
-      left_data += byte_width;
-      right_data += byte_width;
     }
-    return true;
-  } else {
-    auto number_of_bytes_to_compare = static_cast<size_t>(byte_width * left.length());
-    return memcmp(left_data, right_data, number_of_bytes_to_compare) == 0;
-  }
-}
-
-// A bit confusing: ArrayEqualsVisitor inherits from RangeEqualsVisitor but
-// doesn't share the same preconditions.
-// When RangeEqualsVisitor is called, we only know the range sizes equal.
-// When ArrayEqualsVisitor is called, we know the sizes and null bitmaps are equal.
-
-class ArrayEqualsVisitor : public RangeEqualsVisitor {
- public:
-  explicit ArrayEqualsVisitor(const Array& right, const EqualOptions& opts)
-      : RangeEqualsVisitor(right, 0, right.length(), 0), opts_(opts) {}
-
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const BooleanArray& left) {
-    const auto& right = checked_cast<const BooleanArray&>(right_);
+  template <typename TypeClass>
+  Status CompareBinary(const TypeClass&) {
+    const uint8_t* left_data = left_.GetValues<uint8_t>(2, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(2, 0);
 
-    if (left.null_count() > 0) {
-      const uint8_t* left_data = left.values()->data();
-      const uint8_t* right_data = right.values()->data();
-
-      for (int64_t i = 0; i < left.length(); ++i) {
-        if (left.IsValid(i) && BitUtil::GetBit(left_data, i + left.offset()) !=
-                                   BitUtil::GetBit(right_data, i + right.offset())) {
-          result_ = false;
-          return Status::OK();
-        }
-      }
-      result_ = true;
+    if (left_data != nullptr && right_data != nullptr) {
+      const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+                                      int64_t length) -> bool {
+        return memcmp(left_data + left_offset, right_data + right_offset, length) == 0;
+      };
+      CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
     } else {
-      result_ = BitmapEquals(left.values()->data(), left.offset(), right.values()->data(),
-                             right.offset(), left.length());
+      // One of the arrays is an array of empty strings and nulls.
+      // We just need to compare the offsets.
+      // (note we must not call memcmp() with null data pointers)
+      const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+                                      int64_t length) -> bool { return true; };
+      CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);

Review comment:
       Does this work in C++11? :-o




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528622512



##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);

Review comment:
       That will be part of the JIRA followup (investigate replacing BitmapWordReader with BitmapUInt64Reader).




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210778



##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }

Review comment:
       It's OK to compute the null count, IMHO, since it is often used for other tasks.
   The reason this heuristic only works when comparing entire arrays (rather than sub-ranges) is that two arrays can have different overall null counts while the compared ranges are still equal.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528213541



##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeBinaryArray& left) {
-    const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+  Status Visit(const FloatType& type) { return CompareFloating(type); }
 
-    int32_t width = left.byte_width();
+  Status Visit(const DoubleType& type) { return CompareFloating(type); }
 
-    const uint8_t* left_data = nullptr;
-    const uint8_t* right_data = nullptr;
+  // Also matches StringType
+  Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
-    if (left.values()) {
-      left_data = left.raw_values();
-    }
+  // Also matches LargeStringType
+  Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
-    if (right.values()) {
-      right_data = right.raw_values();
-    }
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        result_ = false;
-        return Status::OK();
-      }
-      if (is_null) continue;
+  Status Visit(const FixedSizeBinaryType& type) {
+    const auto byte_width = type.byte_width();
+    const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
 
-      if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
-        result_ = false;
-        return Status::OK();
-      }
+    if (left_data != nullptr && right_data != nullptr) {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+        return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+                      right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+                      length * byte_width) == 0;
+      };
+      VisitValidRuns(compare_runs);
+    } else {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+      VisitValidRuns(compare_runs);
     }
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const Decimal128Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  // Also matches MapType
+  Status Visit(const ListType& type) { return CompareList(type); }
 
-  Status Visit(const Decimal256Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  Status Visit(const LargeListType& type) { return CompareList(type); }
 
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
-    return Status::OK();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
-      const T& left) {
-    return CompareValues<T>(left);
-  }
+  Status Visit(const FixedSizeListType& type) {
+    const auto list_size = type.list_size();
+    const ArrayData& left_data = *left_.child_data[0];
+    const ArrayData& right_data = *right_.child_data[0];
 
-  Status Visit(const ListArray& left) {
-    result_ = CompareLists(left);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+                               (left_start_idx_ + left_.offset + i) * list_size,
+                               (right_start_idx_ + right_.offset + i) * list_size,
+                               length * list_size);
+      return impl.Compare();
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const LargeListArray& left) {
-    result_ = CompareLists(left);
-    return Status::OK();
-  }
+  Status Visit(const StructType& type) {
+    const int32_t num_fields = type.num_fields();
 
-  Status Visit(const FixedSizeListArray& left) {
-    const auto& right = checked_cast<const FixedSizeListArray&>(right_);
-    result_ = left.values()->RangeEquals(
-        left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
-        right.value_offset(right_start_idx_), right.values());
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      for (int32_t f = 0; f < num_fields; ++f) {
+        RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+                                 *right_.child_data[f],
+                                 left_start_idx_ + left_.offset + i,
+                                 right_start_idx_ + right_.offset + i, length);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const MapArray& left) {
-    result_ = CompareMaps(left);
-    return Status::OK();
-  }
+  Status Visit(const SparseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
-  Status Visit(const StructArray& left) {
-    result_ = CompareStructs(left);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        // XXX can we instead detect runs of same-child union values?
+        RangeDataEqualsImpl impl(
+            options_, floating_approximate_, *left_.child_data[child_num],
+            *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+            right_start_idx_ + right_.offset + j, 1);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    });
     return Status::OK();
   }
 
-  Status Visit(const UnionArray& left) {
-    result_ = CompareUnions(left);
+  Status Visit(const DenseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
+    const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+    const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];

Review comment:
       I see that you kept the previous semantics; I was just noting that the difference is odd. I would have expected the two cases to be more symmetric, and wondered if you could comment.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r526259632



##########
File path: cpp/src/arrow/ipc/feather_test.cc
##########
@@ -286,10 +286,13 @@ TEST_P(TestFeather, PrimitiveNullRoundTrip) {
     std::vector<std::shared_ptr<Array>> expected_fields;
     for (int i = 0; i < batch->num_columns(); ++i) {
       ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name());
-      StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
-                             batch->column(i)->null_bitmap(),
-                             batch->column(i)->null_count());
-      AssertArraysEqual(str_values, *result->column(i)->chunk(0));
+      ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(utf8(), batch->num_rows()));
+      AssertArraysEqual(*expected, *result->column(i)->chunk(0));
+      //       StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+      //                              batch->column(i)->null_bitmap(),
+      //                              batch->column(i)->null_count());
+      //       AssertArraysEqual(str_values, *result->column(i)->chunk(0),
+      //       /*verbose=*/true);

Review comment:
       Looks like leftovers
   ```suggestion
   ```

##########
File path: cpp/src/arrow/util/bit_run_reader.h
##########
@@ -162,5 +166,7 @@ class ARROW_EXPORT BitRunReader {
 using BitRunReader = BitRunReaderLinear;
 #endif
 
+// TODO SetBitRunReader?
+

Review comment:
       IIUC BitRunReader yields alternating set/unset BitRuns. Would SetBitRunReader yield only the set BitRuns?

##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }

Review comment:
       I'm not sure why this optimization only applies to entire arrays. Additionally, as currently stated it doesn't necessarily compare cached null counts.
   
   If we want to absolutely avoid accessing the null bitmap here, we'll need:
   ```suggestion
       // Compare null bitmaps
       // Try to compare cached null counts first:
       int64_t left_null_count = left_.null_count.load(), right_null_count = right_.null_count.load();
       if (left_null_count != kUnknownNullCount && right_null_count != kUnknownNullCount &&
           left_null_count != right_null_count) {
         return false;
       }
   ```

##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeBinaryArray& left) {
-    const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+  Status Visit(const FloatType& type) { return CompareFloating(type); }
 
-    int32_t width = left.byte_width();
+  Status Visit(const DoubleType& type) { return CompareFloating(type); }
 
-    const uint8_t* left_data = nullptr;
-    const uint8_t* right_data = nullptr;
+  // Also matches StringType
+  Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
-    if (left.values()) {
-      left_data = left.raw_values();
-    }
+  // Also matches LargeStringType
+  Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
-    if (right.values()) {
-      right_data = right.raw_values();
-    }
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        result_ = false;
-        return Status::OK();
-      }
-      if (is_null) continue;
+  Status Visit(const FixedSizeBinaryType& type) {
+    const auto byte_width = type.byte_width();
+    const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
 
-      if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
-        result_ = false;
-        return Status::OK();
-      }
+    if (left_data != nullptr && right_data != nullptr) {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+        return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+                      right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+                      length * byte_width) == 0;
+      };
+      VisitValidRuns(compare_runs);
+    } else {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+      VisitValidRuns(compare_runs);
     }
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const Decimal128Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  // Also matches MapType
+  Status Visit(const ListType& type) { return CompareList(type); }
 
-  Status Visit(const Decimal256Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  Status Visit(const LargeListType& type) { return CompareList(type); }
 
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
-    return Status::OK();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
-      const T& left) {
-    return CompareValues<T>(left);
-  }
+  Status Visit(const FixedSizeListType& type) {
+    const auto list_size = type.list_size();
+    const ArrayData& left_data = *left_.child_data[0];
+    const ArrayData& right_data = *right_.child_data[0];
 
-  Status Visit(const ListArray& left) {
-    result_ = CompareLists(left);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+                               (left_start_idx_ + left_.offset + i) * list_size,
+                               (right_start_idx_ + right_.offset + i) * list_size,
+                               length * list_size);
+      return impl.Compare();
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const LargeListArray& left) {
-    result_ = CompareLists(left);
-    return Status::OK();
-  }
+  Status Visit(const StructType& type) {
+    const int32_t num_fields = type.num_fields();
 
-  Status Visit(const FixedSizeListArray& left) {
-    const auto& right = checked_cast<const FixedSizeListArray&>(right_);
-    result_ = left.values()->RangeEquals(
-        left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
-        right.value_offset(right_start_idx_), right.values());
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      for (int32_t f = 0; f < num_fields; ++f) {
+        RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+                                 *right_.child_data[f],
+                                 left_start_idx_ + left_.offset + i,
+                                 right_start_idx_ + right_.offset + i, length);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const MapArray& left) {
-    result_ = CompareMaps(left);
-    return Status::OK();
-  }
+  Status Visit(const SparseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
-  Status Visit(const StructArray& left) {
-    result_ = CompareStructs(left);
+    VisitValidRuns([&](int64_t i, int64_t length) {

Review comment:
       Since unions don't have top-level nulls, is this worthwhile?

##########
File path: cpp/src/arrow/array/diff.h
##########
@@ -59,6 +57,27 @@ ARROW_EXPORT
 Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
                                           MemoryPool* pool = default_memory_pool());
 
+/// \brief Compare two array ranges, returning an edit script which expresses the
+/// difference between them
+///
+/// Same as Diff(), but only the ranges defined by the given offsets and lengths
+/// are compared.
+///
+/// \param[in] base baseline for comparison
+/// \param[in] target an array of identical type to base whose elements differ from base's
+/// \param[in] base_offset the start offset of the range to consider inside `base`
+/// \param[in] base_length the length of the range to consider inside `base`
+/// \param[in] target_offset the start offset of the range to consider inside `target`
+/// \param[in] target_length the length of the range to consider inside `target`
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return an edit script array which can be applied to base to produce target
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> DiffRanges(const Array& base, const Array& target,

Review comment:
       Not really an objection, but I'm not sure what this adds over applying the ranges to the base and target ArrayData and then running Diff on those.

##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);

Review comment:
       This seems like a generally useful bitmap comparison utility. If it's faster than BitmapEquals, then maybe it could replace that function?

##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeBinaryArray& left) {
-    const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+  Status Visit(const FloatType& type) { return CompareFloating(type); }
 
-    int32_t width = left.byte_width();
+  Status Visit(const DoubleType& type) { return CompareFloating(type); }
 
-    const uint8_t* left_data = nullptr;
-    const uint8_t* right_data = nullptr;
+  // Also matches StringType
+  Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
-    if (left.values()) {
-      left_data = left.raw_values();
-    }
+  // Also matches LargeStringType
+  Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
-    if (right.values()) {
-      right_data = right.raw_values();
-    }
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        result_ = false;
-        return Status::OK();
-      }
-      if (is_null) continue;
+  Status Visit(const FixedSizeBinaryType& type) {
+    const auto byte_width = type.byte_width();
+    const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
 
-      if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
-        result_ = false;
-        return Status::OK();
-      }
+    if (left_data != nullptr && right_data != nullptr) {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+        return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+                      right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+                      length * byte_width) == 0;
+      };
+      VisitValidRuns(compare_runs);
+    } else {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+      VisitValidRuns(compare_runs);
     }
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const Decimal128Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  // Also matches MapType
+  Status Visit(const ListType& type) { return CompareList(type); }
 
-  Status Visit(const Decimal256Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  Status Visit(const LargeListType& type) { return CompareList(type); }
 
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
-    return Status::OK();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
-      const T& left) {
-    return CompareValues<T>(left);
-  }
+  Status Visit(const FixedSizeListType& type) {
+    const auto list_size = type.list_size();
+    const ArrayData& left_data = *left_.child_data[0];
+    const ArrayData& right_data = *right_.child_data[0];
 
-  Status Visit(const ListArray& left) {
-    result_ = CompareLists(left);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+                               (left_start_idx_ + left_.offset + i) * list_size,
+                               (right_start_idx_ + right_.offset + i) * list_size,
+                               length * list_size);
+      return impl.Compare();
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const LargeListArray& left) {
-    result_ = CompareLists(left);
-    return Status::OK();
-  }
+  Status Visit(const StructType& type) {
+    const int32_t num_fields = type.num_fields();
 
-  Status Visit(const FixedSizeListArray& left) {
-    const auto& right = checked_cast<const FixedSizeListArray&>(right_);
-    result_ = left.values()->RangeEquals(
-        left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
-        right.value_offset(right_start_idx_), right.values());
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      for (int32_t f = 0; f < num_fields; ++f) {
+        RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+                                 *right_.child_data[f],
+                                 left_start_idx_ + left_.offset + i,
+                                 right_start_idx_ + right_.offset + i, length);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const MapArray& left) {
-    result_ = CompareMaps(left);
-    return Status::OK();
-  }
+  Status Visit(const SparseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
-  Status Visit(const StructArray& left) {
-    result_ = CompareStructs(left);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        // XXX can we instead detect runs of same-child union values?

Review comment:
       Interesting, this seems very doable:
   - get next type_id of left and right
   - early exit if not equal
   - find length of run in left_codes
   - find length of run in right_codes
   - early exit if run lengths are not equal
   - compare ranges of children

##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeBinaryArray& left) {
-    const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+  Status Visit(const FloatType& type) { return CompareFloating(type); }
 
-    int32_t width = left.byte_width();
+  Status Visit(const DoubleType& type) { return CompareFloating(type); }
 
-    const uint8_t* left_data = nullptr;
-    const uint8_t* right_data = nullptr;
+  // Also matches StringType
+  Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
-    if (left.values()) {
-      left_data = left.raw_values();
-    }
+  // Also matches LargeStringType
+  Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
-    if (right.values()) {
-      right_data = right.raw_values();
-    }
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        result_ = false;
-        return Status::OK();
-      }
-      if (is_null) continue;
+  Status Visit(const FixedSizeBinaryType& type) {
+    const auto byte_width = type.byte_width();
+    const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
 
-      if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
-        result_ = false;
-        return Status::OK();
-      }
+    if (left_data != nullptr && right_data != nullptr) {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+        return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+                      right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+                      length * byte_width) == 0;
+      };
+      VisitValidRuns(compare_runs);
+    } else {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+      VisitValidRuns(compare_runs);
     }
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const Decimal128Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  // Also matches MapType
+  Status Visit(const ListType& type) { return CompareList(type); }
 
-  Status Visit(const Decimal256Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  Status Visit(const LargeListType& type) { return CompareList(type); }
 
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
-    return Status::OK();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
-      const T& left) {
-    return CompareValues<T>(left);
-  }
+  Status Visit(const FixedSizeListType& type) {
+    const auto list_size = type.list_size();
+    const ArrayData& left_data = *left_.child_data[0];
+    const ArrayData& right_data = *right_.child_data[0];
 
-  Status Visit(const ListArray& left) {
-    result_ = CompareLists(left);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+                               (left_start_idx_ + left_.offset + i) * list_size,
+                               (right_start_idx_ + right_.offset + i) * list_size,
+                               length * list_size);
+      return impl.Compare();
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const LargeListArray& left) {
-    result_ = CompareLists(left);
-    return Status::OK();
-  }
+  Status Visit(const StructType& type) {
+    const int32_t num_fields = type.num_fields();
 
-  Status Visit(const FixedSizeListArray& left) {
-    const auto& right = checked_cast<const FixedSizeListArray&>(right_);
-    result_ = left.values()->RangeEquals(
-        left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
-        right.value_offset(right_start_idx_), right.values());
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      for (int32_t f = 0; f < num_fields; ++f) {
+        RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+                                 *right_.child_data[f],
+                                 left_start_idx_ + left_.offset + i,
+                                 right_start_idx_ + right_.offset + i, length);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const MapArray& left) {
-    result_ = CompareMaps(left);
-    return Status::OK();
-  }
+  Status Visit(const SparseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
-  Status Visit(const StructArray& left) {
-    result_ = CompareStructs(left);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        // XXX can we instead detect runs of same-child union values?
+        RangeDataEqualsImpl impl(
+            options_, floating_approximate_, *left_.child_data[child_num],
+            *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+            right_start_idx_ + right_.offset + j, 1);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    });
     return Status::OK();
   }
 
-  Status Visit(const UnionArray& left) {
-    result_ = CompareUnions(left);
+  Status Visit(const DenseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
+    const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+    const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        RangeDataEqualsImpl impl(
+            options_, floating_approximate_, *left_.child_data[child_num],
+            *right_.child_data[child_num], left_offsets[left_start_idx_ + j],
+            right_offsets[right_start_idx_ + j], 1);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    });
     return Status::OK();
   }
 
-  Status Visit(const DictionaryArray& left) {
-    const auto& right = checked_cast<const DictionaryArray&>(right_);
-    if (!left.dictionary()->Equals(right.dictionary())) {
-      result_ = false;
-      return Status::OK();
+  Status Visit(const DictionaryType& type) {
+    // Compare dictionaries
+    result_ &= CompareArrayRanges(
+        *left_.dictionary, *right_.dictionary,
+        /*left_start_idx=*/0,
+        /*left_end_idx=*/std::max(left_.dictionary->length, right_.dictionary->length),
+        /*right_start_idx=*/0, options_, floating_approximate_);
+    if (result_) {
+      // Compare indices
+      result_ &= CompareWithType(*type.index_type());
     }
-    result_ = left.indices()->RangeEquals(left_start_idx_, left_end_idx_,
-                                          right_start_idx_, right.indices());
     return Status::OK();
   }
 
-  Status Visit(const ExtensionArray& left) {
-    result_ = (right_.type()->Equals(*left.type()) &&
-               ArrayRangeEquals(*left.storage(),
-                                *static_cast<const ExtensionArray&>(right_).storage(),
-                                left_start_idx_, left_end_idx_, right_start_idx_));
+  Status Visit(const ExtensionType& type) {
+    // Compare storages
+    result_ &= CompareWithType(*type.storage_type());
     return Status::OK();
   }
 
-  bool result() const { return result_; }
-
  protected:
-  const Array& right_;
-  int64_t left_start_idx_;
-  int64_t left_end_idx_;
-  int64_t right_start_idx_;
-
-  bool result_;
-};
-
-static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& right) {
-  const int byte_width = internal::GetByteWidth(*left.type());
-
-  const uint8_t* left_data = nullptr;
-  const uint8_t* right_data = nullptr;
-
-  if (left.values()) {
-    left_data = left.values()->data() + left.offset() * byte_width;
+  template <typename TypeClass, typename CType = typename TypeClass::c_type>
+  Status ComparePrimitive(const TypeClass&) {
+    const CType* left_values = left_.GetValues<CType>(1);
+    const CType* right_values = right_.GetValues<CType>(1);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      return memcmp(left_values + left_start_idx_ + i,
+                    right_values + right_start_idx_ + i, length * sizeof(CType)) == 0;
+    });
+    return Status::OK();
   }
 
-  if (right.values()) {
-    right_data = right.values()->data() + right.offset() * byte_width;
-  }
+  template <typename TypeClass>
+  Status CompareFloating(const TypeClass&) {
+    using T = typename TypeClass::c_type;
+    const T* left_values = left_.GetValues<T>(1);
+    const T* right_values = right_.GetValues<T>(1);
 
-  if (byte_width == 0) {
-    // Special case 0-width data, as the data pointers may be null
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i) != right.IsNull(i)) {
-        return false;
-      }
-    }
-    return true;
-  } else if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      const bool left_null = left.IsNull(i);
-      const bool right_null = right.IsNull(i);
-      if (left_null != right_null) {
-        return false;
+    if (floating_approximate_) {
+      const T epsilon = static_cast<T>(options_.atol());
+      if (options_.nans_equal()) {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
+        });
+      } else {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return (fabs(x - y) <= epsilon) || (x == y);
+        });
       }
-      if (!left_null && memcmp(left_data, right_data, byte_width) != 0) {
-        return false;
+    } else {
+      if (options_.nans_equal()) {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return (x == y) || (std::isnan(x) && std::isnan(y));
+        });
+      } else {
+        VisitValues([&](int64_t i) {
+          const T x = left_values[i + left_start_idx_];
+          const T y = right_values[i + right_start_idx_];
+          return x == y;
+        });
       }
-      left_data += byte_width;
-      right_data += byte_width;
     }
-    return true;
-  } else {
-    auto number_of_bytes_to_compare = static_cast<size_t>(byte_width * left.length());
-    return memcmp(left_data, right_data, number_of_bytes_to_compare) == 0;
-  }
-}
-
-// A bit confusing: ArrayEqualsVisitor inherits from RangeEqualsVisitor but
-// doesn't share the same preconditions.
-// When RangeEqualsVisitor is called, we only know the range sizes equal.
-// When ArrayEqualsVisitor is called, we know the sizes and null bitmaps are equal.
-
-class ArrayEqualsVisitor : public RangeEqualsVisitor {
- public:
-  explicit ArrayEqualsVisitor(const Array& right, const EqualOptions& opts)
-      : RangeEqualsVisitor(right, 0, right.length(), 0), opts_(opts) {}
-
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const BooleanArray& left) {
-    const auto& right = checked_cast<const BooleanArray&>(right_);
+  template <typename TypeClass>
+  Status CompareBinary(const TypeClass&) {
+    const uint8_t* left_data = left_.GetValues<uint8_t>(2, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(2, 0);
 
-    if (left.null_count() > 0) {
-      const uint8_t* left_data = left.values()->data();
-      const uint8_t* right_data = right.values()->data();
-
-      for (int64_t i = 0; i < left.length(); ++i) {
-        if (left.IsValid(i) && BitUtil::GetBit(left_data, i + left.offset()) !=
-                                   BitUtil::GetBit(right_data, i + right.offset())) {
-          result_ = false;
-          return Status::OK();
-        }
-      }
-      result_ = true;
+    if (left_data != nullptr && right_data != nullptr) {
+      const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+                                      int64_t length) -> bool {
+        return memcmp(left_data + left_offset, right_data + right_offset, length) == 0;
+      };
+      CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
     } else {
-      result_ = BitmapEquals(left.values()->data(), left.offset(), right.values()->data(),
-                             right.offset(), left.length());
+      // One of the arrays is an array of empty strings and nulls.
+      // We just need to compare the offsets.
+      // (note we must not call memcmp() with null data pointers)
+      const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+                                      int64_t length) -> bool { return true; };
+      CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);

Review comment:
       ```suggestion
         CompareWithOffsets<typename TypeClass::offset_type>(1, [](...) { return true; });
   ```

##########
File path: cpp/src/arrow/util/bitmap_reader.h
##########
@@ -69,6 +69,77 @@ class BitmapReader {
   int64_t bit_offset_;
 };
 
+// XXX Cannot name it BitmapWordReader because the name is already used
+// in bitmap_ops.cc

Review comment:
       There's a lot of duplicated logic under the heading of "reading words from bitmaps": VisitWords, BitmapWordReader, and BitmapUInt64Reader. In a follow-up, it would be handy to consolidate these and ensure they're rigorously benchmarked (or, if there are compelling reasons not to consolidate, to add comments indicating when each should be preferred).

##########
File path: cpp/src/arrow/util/bitmap.h
##########
@@ -110,8 +110,8 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
   ///
   /// TODO(bkietz) allow for early termination
   template <size_t N, typename Visitor,
-            typename Word =
-                typename internal::call_traits::argument_type<0, Visitor&&>::value_type>
+            typename Word = typename std::decay<
+                internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>

Review comment:
       I don't think VisitWords is being used in this PR. If I understand correctly, this change would only matter if the visitor took the words array by const reference rather than by value?

##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeBinaryArray& left) {
-    const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+  Status Visit(const FloatType& type) { return CompareFloating(type); }
 
-    int32_t width = left.byte_width();
+  Status Visit(const DoubleType& type) { return CompareFloating(type); }
 
-    const uint8_t* left_data = nullptr;
-    const uint8_t* right_data = nullptr;
+  // Also matches StringType
+  Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
-    if (left.values()) {
-      left_data = left.raw_values();
-    }
+  // Also matches LargeStringType
+  Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
-    if (right.values()) {
-      right_data = right.raw_values();
-    }
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        result_ = false;
-        return Status::OK();
-      }
-      if (is_null) continue;
+  Status Visit(const FixedSizeBinaryType& type) {
+    const auto byte_width = type.byte_width();
+    const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
 
-      if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
-        result_ = false;
-        return Status::OK();
-      }
+    if (left_data != nullptr && right_data != nullptr) {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+        return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+                      right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+                      length * byte_width) == 0;
+      };
+      VisitValidRuns(compare_runs);
+    } else {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+      VisitValidRuns(compare_runs);
     }
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const Decimal128Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  // Also matches MapType
+  Status Visit(const ListType& type) { return CompareList(type); }
 
-  Status Visit(const Decimal256Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  Status Visit(const LargeListType& type) { return CompareList(type); }
 
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
-    return Status::OK();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
-      const T& left) {
-    return CompareValues<T>(left);
-  }
+  Status Visit(const FixedSizeListType& type) {
+    const auto list_size = type.list_size();
+    const ArrayData& left_data = *left_.child_data[0];
+    const ArrayData& right_data = *right_.child_data[0];
 
-  Status Visit(const ListArray& left) {
-    result_ = CompareLists(left);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+                               (left_start_idx_ + left_.offset + i) * list_size,
+                               (right_start_idx_ + right_.offset + i) * list_size,
+                               length * list_size);
+      return impl.Compare();
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const LargeListArray& left) {
-    result_ = CompareLists(left);
-    return Status::OK();
-  }
+  Status Visit(const StructType& type) {
+    const int32_t num_fields = type.num_fields();
 
-  Status Visit(const FixedSizeListArray& left) {
-    const auto& right = checked_cast<const FixedSizeListArray&>(right_);
-    result_ = left.values()->RangeEquals(
-        left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
-        right.value_offset(right_start_idx_), right.values());
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      for (int32_t f = 0; f < num_fields; ++f) {
+        RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+                                 *right_.child_data[f],
+                                 left_start_idx_ + left_.offset + i,
+                                 right_start_idx_ + right_.offset + i, length);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const MapArray& left) {
-    result_ = CompareMaps(left);
-    return Status::OK();
-  }
+  Status Visit(const SparseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
-  Status Visit(const StructArray& left) {
-    result_ = CompareStructs(left);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        // XXX can we instead detect runs of same-child union values?
+        RangeDataEqualsImpl impl(
+            options_, floating_approximate_, *left_.child_data[child_num],
+            *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+            right_start_idx_ + right_.offset + j, 1);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    });
     return Status::OK();
   }
 
-  Status Visit(const UnionArray& left) {
-    result_ = CompareUnions(left);
+  Status Visit(const DenseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
+    const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+    const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];

Review comment:
       Runs of type_id can still be used here, but ranges of children can't be compared directly because of the offsets. It's odd that we "dereference" offsets when comparing dense union arrays but don't do the same with a dictionary array's indices.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210816



##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);

Review comment:
       Will take a look.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528593225



##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeBinaryArray& left) {
-    const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+  Status Visit(const FloatType& type) { return CompareFloating(type); }
 
-    int32_t width = left.byte_width();
+  Status Visit(const DoubleType& type) { return CompareFloating(type); }
 
-    const uint8_t* left_data = nullptr;
-    const uint8_t* right_data = nullptr;
+  // Also matches StringType
+  Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
-    if (left.values()) {
-      left_data = left.raw_values();
-    }
+  // Also matches LargeStringType
+  Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
-    if (right.values()) {
-      right_data = right.raw_values();
-    }
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        result_ = false;
-        return Status::OK();
-      }
-      if (is_null) continue;
+  Status Visit(const FixedSizeBinaryType& type) {
+    const auto byte_width = type.byte_width();
+    const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
 
-      if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
-        result_ = false;
-        return Status::OK();
-      }
+    if (left_data != nullptr && right_data != nullptr) {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+        return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+                      right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+                      length * byte_width) == 0;
+      };
+      VisitValidRuns(compare_runs);
+    } else {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+      VisitValidRuns(compare_runs);
     }
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const Decimal128Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  // Also matches MapType
+  Status Visit(const ListType& type) { return CompareList(type); }
 
-  Status Visit(const Decimal256Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  Status Visit(const LargeListType& type) { return CompareList(type); }
 
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
-    return Status::OK();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
-      const T& left) {
-    return CompareValues<T>(left);
-  }
+  Status Visit(const FixedSizeListType& type) {
+    const auto list_size = type.list_size();
+    const ArrayData& left_data = *left_.child_data[0];
+    const ArrayData& right_data = *right_.child_data[0];
 
-  Status Visit(const ListArray& left) {
-    result_ = CompareLists(left);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+                               (left_start_idx_ + left_.offset + i) * list_size,
+                               (right_start_idx_ + right_.offset + i) * list_size,
+                               length * list_size);
+      return impl.Compare();
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const LargeListArray& left) {
-    result_ = CompareLists(left);
-    return Status::OK();
-  }
+  Status Visit(const StructType& type) {
+    const int32_t num_fields = type.num_fields();
 
-  Status Visit(const FixedSizeListArray& left) {
-    const auto& right = checked_cast<const FixedSizeListArray&>(right_);
-    result_ = left.values()->RangeEquals(
-        left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
-        right.value_offset(right_start_idx_), right.values());
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      for (int32_t f = 0; f < num_fields; ++f) {
+        RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+                                 *right_.child_data[f],
+                                 left_start_idx_ + left_.offset + i,
+                                 right_start_idx_ + right_.offset + i, length);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const MapArray& left) {
-    result_ = CompareMaps(left);
-    return Status::OK();
-  }
+  Status Visit(const SparseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
-  Status Visit(const StructArray& left) {
-    result_ = CompareStructs(left);
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];
+        // XXX can we instead detect runs of same-child union values?
+        RangeDataEqualsImpl impl(
+            options_, floating_approximate_, *left_.child_data[child_num],
+            *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+            right_start_idx_ + right_.offset + j, 1);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    });
     return Status::OK();
   }
 
-  Status Visit(const UnionArray& left) {
-    result_ = CompareUnions(left);
+  Status Visit(const DenseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
+    const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+    const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (int64_t j = i; j < i + length; ++j) {
+        const auto type_id = left_codes[left_start_idx_ + j];
+        if (type_id != right_codes[right_start_idx_ + j]) {
+          return false;
+        }
+        const auto child_num = child_ids[type_id];

Review comment:
       I would defer to @wesm for the historical reasons.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210910



##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }
-    result_ = true;
-    return Status::OK();
+    if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+                              right_.buffers[0], right_.offset + right_start_idx_,
+                              range_length_)) {
+      return false;
+    }
+    // Compare values
+    return CompareWithType(*left_.type);
   }
 
-  template <typename ArrayType, typename CompareValuesFunc>
-  bool CompareWithOffsets(const ArrayType& left,
-                          CompareValuesFunc&& compare_values) const {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        return false;
-      }
-      if (is_null) continue;
-      const auto begin_offset = left.value_offset(i);
-      const auto end_offset = left.value_offset(i + 1);
-      const auto right_begin_offset = right.value_offset(o_i);
-      const auto right_end_offset = right.value_offset(o_i + 1);
-      // Underlying can't be equal if the size isn't equal
-      if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
-        return false;
-      }
-
-      if (!compare_values(left, right, begin_offset, right_begin_offset,
-                          end_offset - begin_offset)) {
-        return false;
-      }
+  bool CompareWithType(const DataType& type) {
+    result_ = true;
+    if (range_length_ != 0) {
+      ARROW_CHECK_OK(VisitTypeInline(type, this));
     }
-    return true;
+    return result_;
   }
 
-  template <typename BinaryArrayType>
-  bool CompareBinaryRange(const BinaryArrayType& left) const {
-    using offset_type = typename BinaryArrayType::offset_type;
+  Status Visit(const NullType&) { return Status::OK(); }
 
-    auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
-                             offset_type left_offset, offset_type right_offset,
-                             offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return std::memcmp(left.value_data()->data() + left_offset,
-                         right.value_data()->data() + right_offset,
-                         static_cast<size_t>(nvalues)) == 0;
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  template <typename ListArrayType>
-  bool CompareLists(const ListArrayType& left) {
-    using offset_type = typename ListArrayType::offset_type;
-    const auto& right = checked_cast<const ListArrayType&>(right_);
-    const std::shared_ptr<Array>& left_values = left.values();
-    const std::shared_ptr<Array>& right_values = right.values();
-
-    auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                      right_values);
-    };
-    return CompareWithOffsets(left, compare_values);
-  }
-
-  bool CompareMaps(const MapArray& left) {
-    // We need a specific comparison helper for maps to avoid comparing
-    // struct field names (which are indifferent for maps)
-    using offset_type = typename MapArray::offset_type;
-    const auto& right = checked_cast<const MapArray&>(right_);
-    const auto left_keys = left.keys();
-    const auto left_items = left.items();
-    const auto right_keys = right.keys();
-    const auto right_items = right.items();
-
-    auto compare_values = [&](const MapArray& left, const MapArray& right,
-                              offset_type left_offset, offset_type right_offset,
-                              offset_type nvalues) {
-      if (nvalues == 0) {
-        return true;
-      }
-      return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                    right_keys) &&
-             left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
-                                     right_items);
-    };
-    return CompareWithOffsets(left, compare_values);
+  template <typename TypeClass>
+  enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+    return ComparePrimitive(type);
   }
 
-  bool CompareStructs(const StructArray& left) {
-    const auto& right = checked_cast<const StructArray&>(right_);
-    bool equal_fields = true;
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      for (int j = 0; j < left.num_fields(); ++j) {
-        // TODO: really we should be comparing stretches of non-null data rather
-        // than looking at one value at a time.
-        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
-        if (!equal_fields) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  bool CompareUnions(const UnionArray& left) const {
-    const auto& right = checked_cast<const UnionArray&>(right_);
-
-    const UnionMode::type union_mode = left.mode();
-    if (union_mode != right.mode()) {
-      return false;
-    }
-
-    const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
-    const std::vector<int>& child_ids = left_type.child_ids();
-
-    const int8_t* left_codes = left.raw_type_codes();
-    const int8_t* right_codes = right.raw_type_codes();
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      if (left.IsNull(i) != right.IsNull(o_i)) {
-        return false;
-      }
-      if (left.IsNull(i)) continue;
-      if (left_codes[i] != right_codes[o_i]) {
-        return false;
-      }
-
-      auto child_num = child_ids[left_codes[i]];
-
-      // TODO(wesm): really we should be comparing stretches of non-null data
-      // rather than looking at one value at a time.
-      if (union_mode == UnionMode::SPARSE) {
-        if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
-          return false;
+  Status Visit(const BooleanType&) {
+    const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      if (length <= 8) {
+        // Avoid the BitmapUInt64Reader overhead for very small runs
+        for (int64_t j = i; j < i + length; ++j) {
+          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+            return false;
+          }
         }
+        return true;
       } else {
-        const int32_t offset =
-            checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
-        const int32_t o_offset =
-            checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
-        if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
-                                                right.field(child_num))) {
-          return false;
+        BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+                                       length);
+        BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+                                        length);
+        while (left_reader.position() < length) {
+          if (left_reader.NextWord() != right_reader.NextWord()) {
+            return false;
+          }
         }
+        DCHECK_EQ(right_reader.position(), length);
       }
-    }
-    return true;
-  }
-
-  Status Visit(const BinaryArray& left) {
-    result_ = CompareBinaryRange(left);
-    return Status::OK();
-  }
-
-  Status Visit(const LargeBinaryArray& left) {
-    result_ = CompareBinaryRange(left);
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeBinaryArray& left) {
-    const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+  Status Visit(const FloatType& type) { return CompareFloating(type); }
 
-    int32_t width = left.byte_width();
+  Status Visit(const DoubleType& type) { return CompareFloating(type); }
 
-    const uint8_t* left_data = nullptr;
-    const uint8_t* right_data = nullptr;
+  // Also matches StringType
+  Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
-    if (left.values()) {
-      left_data = left.raw_values();
-    }
+  // Also matches LargeStringType
+  Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
 
-    if (right.values()) {
-      right_data = right.raw_values();
-    }
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i)) {
-        result_ = false;
-        return Status::OK();
-      }
-      if (is_null) continue;
+  Status Visit(const FixedSizeBinaryType& type) {
+    const auto byte_width = type.byte_width();
+    const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+    const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
 
-      if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
-        result_ = false;
-        return Status::OK();
-      }
+    if (left_data != nullptr && right_data != nullptr) {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+        return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+                      right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+                      length * byte_width) == 0;
+      };
+      VisitValidRuns(compare_runs);
+    } else {
+      auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+      VisitValidRuns(compare_runs);
     }
-    result_ = true;
     return Status::OK();
   }
 
-  Status Visit(const Decimal128Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  // Also matches MapType
+  Status Visit(const ListType& type) { return CompareList(type); }
 
-  Status Visit(const Decimal256Array& left) {
-    return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
-  }
+  Status Visit(const LargeListType& type) { return CompareList(type); }
 
-  Status Visit(const NullArray& left) {
-    ARROW_UNUSED(left);
-    result_ = true;
-    return Status::OK();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
-      const T& left) {
-    return CompareValues<T>(left);
-  }
+  Status Visit(const FixedSizeListType& type) {
+    const auto list_size = type.list_size();
+    const ArrayData& left_data = *left_.child_data[0];
+    const ArrayData& right_data = *right_.child_data[0];
 
-  Status Visit(const ListArray& left) {
-    result_ = CompareLists(left);
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+                               (left_start_idx_ + left_.offset + i) * list_size,
+                               (right_start_idx_ + right_.offset + i) * list_size,
+                               length * list_size);
+      return impl.Compare();
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const LargeListArray& left) {
-    result_ = CompareLists(left);
-    return Status::OK();
-  }
+  Status Visit(const StructType& type) {
+    const int32_t num_fields = type.num_fields();
 
-  Status Visit(const FixedSizeListArray& left) {
-    const auto& right = checked_cast<const FixedSizeListArray&>(right_);
-    result_ = left.values()->RangeEquals(
-        left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
-        right.value_offset(right_start_idx_), right.values());
+    auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+      for (int32_t f = 0; f < num_fields; ++f) {
+        RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+                                 *right_.child_data[f],
+                                 left_start_idx_ + left_.offset + i,
+                                 right_start_idx_ + right_.offset + i, length);
+        if (!impl.Compare()) {
+          return false;
+        }
+      }
+      return true;
+    };
+    VisitValidRuns(compare_runs);
     return Status::OK();
   }
 
-  Status Visit(const MapArray& left) {
-    result_ = CompareMaps(left);
-    return Status::OK();
-  }
+  Status Visit(const SparseUnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const int8_t* left_codes = left_.GetValues<int8_t>(1);
+    const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
-  Status Visit(const StructArray& left) {
-    result_ = CompareStructs(left);
+    VisitValidRuns([&](int64_t i, int64_t length) {

Review comment:
       Ha, I still occasionally forget about that :-)




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#issuecomment-732104009


   I've addressed review comments. I'd like to merge this if it is ok.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210378



##########
File path: cpp/src/arrow/ipc/feather_test.cc
##########
@@ -286,10 +286,13 @@ TEST_P(TestFeather, PrimitiveNullRoundTrip) {
     std::vector<std::shared_ptr<Array>> expected_fields;
     for (int i = 0; i < batch->num_columns(); ++i) {
       ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name());
-      StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
-                             batch->column(i)->null_bitmap(),
-                             batch->column(i)->null_count());
-      AssertArraysEqual(str_values, *result->column(i)->chunk(0));
+      ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(utf8(), batch->num_rows()));
+      AssertArraysEqual(*expected, *result->column(i)->chunk(0));
+      //       StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+      //                              batch->column(i)->null_bitmap(),
+      //                              batch->column(i)->null_count());
+      //       AssertArraysEqual(str_values, *result->column(i)->chunk(0),
+      //       /*verbose=*/true);

Review comment:
       Yes, I was waiting for an answer to the question above.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] xhochy commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
xhochy commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528719927



##########
File path: cpp/src/arrow/ipc/feather_test.cc
##########
@@ -286,10 +286,13 @@ TEST_P(TestFeather, PrimitiveNullRoundTrip) {
     std::vector<std::shared_ptr<Array>> expected_fields;
     for (int i = 0; i < batch->num_columns(); ++i) {
       ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name());
-      StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
-                             batch->column(i)->null_bitmap(),
-                             batch->column(i)->null_count());
-      AssertArraysEqual(str_values, *result->column(i)->chunk(0));
+      ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(utf8(), batch->num_rows()));
+      AssertArraysEqual(*expected, *result->column(i)->chunk(0));
+      //       StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+      //                              batch->column(i)->null_bitmap(),
+      //                              batch->column(i)->null_count());
+      //       AssertArraysEqual(str_values, *result->column(i)->chunk(0),

Review comment:
       I also remember that we had that conversation, but I'm unsure about the outcome, too.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210366



##########
File path: cpp/src/arrow/array/diff.h
##########
@@ -59,6 +57,27 @@ ARROW_EXPORT
 Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
                                           MemoryPool* pool = default_memory_pool());
 
+/// \brief Compare two array ranges, returning an edit script which expresses the
+/// difference between them
+///
+/// Same as Diff(), but only the ranges defined by the given offsets and lengths
+/// are compared.
+///
+/// \param[in] base baseline for comparison
+/// \param[in] target an array of identical type to base whose elements differ from base's
+/// \param[in] base_offset the start offset of the range to consider inside `base`
+/// \param[in] base_length the length of the range to consider inside `base`
+/// \param[in] target_offset the start offset of the range to consider inside `target`
+/// \param[in] target_length the length of the range to consider inside `target`
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return an edit script array which can be applied to base to produce target
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> DiffRanges(const Array& base, const Array& target,

Review comment:
       Hmm... well, there is a performance argument for avoiding slices, but I'm not sure it matters for diffing :-)




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210648



##########
File path: cpp/src/arrow/util/bitmap.h
##########
@@ -110,8 +110,8 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
   ///
   /// TODO(bkietz) allow for early termination
   template <size_t N, typename Visitor,
-            typename Word =
-                typename internal::call_traits::argument_type<0, Visitor&&>::value_type>
+            typename Word = typename std::decay<
+                internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>

Review comment:
       A previous version of this work used `VisitWords`, but it ended up exhibiting bad performance (`VisitWords` might be too complex for the compiler). Yes, this change is for the case where the visitor argument takes a `const std::array<...>&`. Perhaps there's no advantage in doing so, but it's better if it works anyway.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210778



##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
 namespace arrow {
 
 using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
 using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
 
 // ----------------------------------------------------------------------
 // Public method implementations
 
 namespace {
 
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+                        int64_t left_start_idx, int64_t left_end_idx,
+                        int64_t right_start_idx, const EqualOptions& options,
+                        bool floating_approximate);
 
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
-                               const NumericArray<ArrowType>& right,
-                               EqualityFunc&& equals) {
-  using T = typename ArrowType::c_type;
-
-  const T* left_data = left.raw_values();
-  const T* right_data = right.raw_values();
-
-  if (left.null_count() > 0) {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (left.IsNull(i)) continue;
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < left.length(); ++i) {
-      if (!equals(left_data[i], right_data[i])) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
-                           const NumericArray<ArrowType>& right,
-                           const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
-      return (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right,
-                                         [](T x, T y) -> bool { return x == y; });
-  }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
-                                 const NumericArray<ArrowType>& right,
-                                 const EqualOptions& opts) {
-  using T = typename ArrowType::c_type;
-  const T epsilon = static_cast<T>(opts.atol());
-
-  if (opts.nans_equal()) {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
-    });
-  } else {
-    return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
-      return (fabs(x - y) <= epsilon) || (x == y);
-    });
-  }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
  public:
-  RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
-                     int64_t right_start_idx)
-      : right_(right),
+  // PRE-CONDITIONS:
+  // - the types are equal
+  // - the ranges are in bounds
+  RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+                      const ArrayData& left, const ArrayData& right,
+                      int64_t left_start_idx, int64_t right_start_idx,
+                      int64_t range_length)
+      : options_(options),
+        floating_approximate_(floating_approximate),
+        left_(left),
+        right_(right),
         left_start_idx_(left_start_idx),
-        left_end_idx_(left_end_idx),
         right_start_idx_(right_start_idx),
+        range_length_(range_length),
         result_(false) {}
 
-  template <typename ArrayType>
-  inline Status CompareValues(const ArrayType& left) {
-    const auto& right = checked_cast<const ArrayType&>(right_);
-
-    for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
-         ++i, ++o_i) {
-      const bool is_null = left.IsNull(i);
-      if (is_null != right.IsNull(o_i) ||
-          (!is_null && left.Value(i) != right.Value(o_i))) {
-        result_ = false;
-        return Status::OK();
+  bool Compare() {
+    // Compare null bitmaps
+    if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+        range_length_ == right_.length) {
+      // If we're comparing entire arrays, we can first compare the cached null counts
+      if (left_.GetNullCount() != right_.GetNullCount()) {
+        return false;
       }
     }

Review comment:
       It's ok to compute the null count, IMHO; the computed value is often re-used for other tasks.
   The reason why this heuristic only works for non-ranged arrays is that you could have two whole arrays with different null counts, but the compared ranges would still be equal.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org