You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/11/18 10:01:11 UTC
[GitHub] [arrow] pitrou opened a new pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
pitrou opened a new pull request #8703:
URL: https://github.com/apache/arrow/pull/8703
ArrayEquals now defers to ArrayRangeEquals under the hood.
ArrayRangeEquals now allows passing an EqualOptions argument.
This PR also adds ArrayRangeApproxEquals.
Comparison speed is massively improved on many input types:
```
benchmark baseline contender change % counters
26 ArrayRangeEqualsStruct/32768/0 6.338m items/sec 797.926m items/sec 12490.248 {'run_name': 'ArrayRangeEqualsStruct/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 135, 'null_percent': 0.0}
16 ArrayRangeEqualsBoolean/32768/0 839.237m items/sec 51.203b items/sec 6001.168 {'run_name': 'ArrayRangeEqualsBoolean/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 17929, 'null_percent': 0.0}
28 ArrayRangeEqualsFixedSizeBinary/32768/0 369.542m items/sec 14.798b items/sec 3904.348 {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 8130, 'null_percent': 0.0}
24 ArrayRangeEqualsStruct/32768/10000 6.251m items/sec 240.453m items/sec 3746.338 {'run_name': 'ArrayRangeEqualsStruct/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 134, 'null_percent': 0.01}
36 ArrayRangeEqualsStruct/32768/1 412.074m items/sec 13.733b items/sec 3232.616 {'run_name': 'ArrayRangeEqualsStruct/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 8817, 'null_percent': 100.0}
9 ArrayRangeEqualsString/32768/0 67.419m items/sec 1.937b items/sec 2772.931 {'run_name': 'ArrayRangeEqualsString/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 1441, 'null_percent': 0.0}
25 ArrayRangeEqualsListOfInt32/32768/1 524.231m items/sec 13.774b items/sec 2527.447 {'run_name': 'ArrayRangeEqualsListOfInt32/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 11179, 'null_percent': 100.0}
44 ArrayRangeEqualsString/32768/1 577.902m items/sec 13.686b items/sec 2268.185 {'run_name': 'ArrayRangeEqualsString/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 12328, 'null_percent': 100.0}
53 ArrayRangeEqualsFixedSizeBinary/32768/1 596.284m items/sec 13.140b items/sec 2103.587 {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 12703, 'null_percent': 100.0}
11 ArrayRangeEqualsString/32768/10000 67.501m items/sec 1.382b items/sec 1947.876 {'run_name': 'ArrayRangeEqualsString/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 1442, 'null_percent': 0.01}
46 ArrayRangeEqualsListOfInt32/32768/0 42.696m items/sec 833.958m items/sec 1853.229 {'run_name': 'ArrayRangeEqualsListOfInt32/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 911, 'null_percent': 0.0}
14 ArrayRangeEqualsBoolean/32768/1 698.866m items/sec 13.374b items/sec 1813.652 {'run_name': 'ArrayRangeEqualsBoolean/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 14929, 'null_percent': 100.0}
38 ArrayRangeEqualsInt32/32768/1 835.824m items/sec 13.688b items/sec 1537.720 {'run_name': 'ArrayRangeEqualsInt32/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 17818, 'null_percent': 100.0}
29 ArrayRangeEqualsFixedSizeBinary/32768/10000 278.025m items/sec 4.375b items/sec 1473.475 {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5937, 'null_percent': 0.01}
17 ArrayRangeEqualsInt32/32768/0 2.104b items/sec 32.033b items/sec 1422.785 {'run_name': 'ArrayRangeEqualsInt32/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 44741, 'null_percent': 0.0}
18 ArrayRangeEqualsFloat32/32768/1 838.934m items/sec 12.432b items/sec 1381.896 {'run_name': 'ArrayRangeEqualsFloat32/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 17895, 'null_percent': 100.0}
19 ArrayRangeEqualsInt32/32768/10000 757.797m items/sec 6.848b items/sec 803.693 {'run_name': 'ArrayRangeEqualsInt32/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 16177, 'null_percent': 0.01}
15 ArrayRangeEqualsListOfInt32/32768/10000 28.438m items/sec 210.226m items/sec 639.253 {'run_name': 'ArrayRangeEqualsListOfInt32/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 606, 'null_percent': 0.01}
39 ArrayRangeEqualsFloat32/32768/10000 694.965m items/sec 4.706b items/sec 577.178 {'run_name': 'ArrayRangeEqualsFloat32/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 14833, 'null_percent': 0.01}
3 ArrayRangeEqualsBoolean/32768/10000 538.353m items/sec 2.968b items/sec 451.394 {'run_name': 'ArrayRangeEqualsBoolean/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 11475, 'null_percent': 0.01}
41 ArrayRangeEqualsFloat32/32768/0 2.036b items/sec 10.235b items/sec 402.761 {'run_name': 'ArrayRangeEqualsFloat32/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 43978, 'null_percent': 0.0}
23 ArrayRangeEqualsStruct/32768/100 6.298m items/sec 26.042m items/sec 313.515 {'run_name': 'ArrayRangeEqualsStruct/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 133, 'null_percent': 1.0}
1 ArrayRangeEqualsString/32768/100 68.074m items/sec 275.727m items/sec 305.039 {'run_name': 'ArrayRangeEqualsString/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 1453, 'null_percent': 1.0}
42 ArrayRangeEqualsSparseUnion/32768/0 12.649m items/sec 38.360m items/sec 203.256 {'run_name': 'ArrayRangeEqualsSparseUnion/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 274, 'null_percent': 0.0}
45 ArrayRangeEqualsDenseUnion/32768/0 12.995m items/sec 38.503m items/sec 196.287 {'run_name': 'ArrayRangeEqualsDenseUnion/32768/0', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 277, 'null_percent': 0.0}
7 ArrayRangeEqualsFixedSizeBinary/32768/100 274.991m items/sec 633.253m items/sec 130.281 {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5864, 'null_percent': 1.0}
13 ArrayRangeEqualsInt32/32768/100 729.634m items/sec 1.644b items/sec 125.256 {'run_name': 'ArrayRangeEqualsInt32/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 15582, 'null_percent': 1.0}
12 ArrayRangeEqualsStruct/32768/10 6.948m items/sec 14.273m items/sec 105.427 {'run_name': 'ArrayRangeEqualsStruct/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 147, 'null_percent': 10.0}
34 ArrayRangeEqualsStruct/32768/2 11.923m items/sec 23.845m items/sec 99.993 {'run_name': 'ArrayRangeEqualsStruct/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 255, 'null_percent': 50.0}
10 ArrayRangeEqualsString/32768/10 73.461m items/sec 146.598m items/sec 99.558 {'run_name': 'ArrayRangeEqualsString/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 1568, 'null_percent': 10.0}
48 ArrayRangeEqualsBoolean/32768/100 525.635m items/sec 1.026b items/sec 95.102 {'run_name': 'ArrayRangeEqualsBoolean/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 11216, 'null_percent': 1.0}
21 ArrayRangeEqualsSparseUnion/32768/1 14.032m items/sec 26.225m items/sec 86.889 {'run_name': 'ArrayRangeEqualsSparseUnion/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 300, 'null_percent': 100.0}
47 ArrayRangeEqualsSparseUnion/32768/2 12.643m items/sec 23.030m items/sec 82.160 {'run_name': 'ArrayRangeEqualsSparseUnion/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 272, 'null_percent': 50.0}
43 ArrayRangeEqualsSparseUnion/32768/10000 12.596m items/sec 22.801m items/sec 81.018 {'run_name': 'ArrayRangeEqualsSparseUnion/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 264, 'null_percent': 0.01}
52 ArrayRangeEqualsSparseUnion/32768/10 12.717m items/sec 22.911m items/sec 80.168 {'run_name': 'ArrayRangeEqualsSparseUnion/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 272, 'null_percent': 10.0}
0 ArrayRangeEqualsSparseUnion/32768/100 12.783m items/sec 22.714m items/sec 77.694 {'run_name': 'ArrayRangeEqualsSparseUnion/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 275, 'null_percent': 1.0}
31 ArrayRangeEqualsDenseUnion/32768/1 14.509m items/sec 25.576m items/sec 76.279 {'run_name': 'ArrayRangeEqualsDenseUnion/32768/1', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 309, 'null_percent': 100.0}
20 ArrayRangeEqualsDenseUnion/32768/10000 13.193m items/sec 22.447m items/sec 70.152 {'run_name': 'ArrayRangeEqualsDenseUnion/32768/10000', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 280, 'null_percent': 0.01}
35 ArrayRangeEqualsDenseUnion/32768/100 13.204m items/sec 22.256m items/sec 68.561 {'run_name': 'ArrayRangeEqualsDenseUnion/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 282, 'null_percent': 1.0}
40 ArrayRangeEqualsDenseUnion/32768/10 13.183m items/sec 22.191m items/sec 68.338 {'run_name': 'ArrayRangeEqualsDenseUnion/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 279, 'null_percent': 10.0}
4 ArrayRangeEqualsDenseUnion/32768/2 13.148m items/sec 21.996m items/sec 67.297 {'run_name': 'ArrayRangeEqualsDenseUnion/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 280, 'null_percent': 50.0}
22 ArrayRangeEqualsFloat32/32768/100 671.171m items/sec 1.097b items/sec 63.414 {'run_name': 'ArrayRangeEqualsFloat32/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 14325, 'null_percent': 1.0}
30 ArrayRangeEqualsString/32768/2 94.060m items/sec 149.539m items/sec 58.983 {'run_name': 'ArrayRangeEqualsString/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 2008, 'null_percent': 50.0}
37 ArrayRangeEqualsFixedSizeBinary/32768/10 250.975m items/sec 305.864m items/sec 21.870 {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5341, 'null_percent': 10.0}
32 ArrayRangeEqualsInt32/32768/10 607.891m items/sec 676.139m items/sec 11.227 {'run_name': 'ArrayRangeEqualsInt32/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 12895, 'null_percent': 10.0}
8 ArrayRangeEqualsFixedSizeBinary/32768/2 187.404m items/sec 207.700m items/sec 10.830 {'run_name': 'ArrayRangeEqualsFixedSizeBinary/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 3971, 'null_percent': 50.0}
49 ArrayRangeEqualsBoolean/32768/10 468.989m items/sec 518.521m items/sec 10.561 {'run_name': 'ArrayRangeEqualsBoolean/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 9996, 'null_percent': 10.0}
5 ArrayRangeEqualsInt32/32768/2 259.891m items/sec 270.547m items/sec 4.100 {'run_name': 'ArrayRangeEqualsInt32/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5505, 'null_percent': 50.0}
33 ArrayRangeEqualsFloat32/32768/2 262.994m items/sec 268.155m items/sec 1.962 {'run_name': 'ArrayRangeEqualsFloat32/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5580, 'null_percent': 50.0}
50 ArrayRangeEqualsFloat32/32768/10 577.200m items/sec 573.157m items/sec -0.701 {'run_name': 'ArrayRangeEqualsFloat32/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 12270, 'null_percent': 10.0}
2 ArrayRangeEqualsBoolean/32768/2 242.498m items/sec 230.947m items/sec -4.763 {'run_name': 'ArrayRangeEqualsBoolean/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 5152, 'null_percent': 50.0}
27 ArrayRangeEqualsListOfInt32/32768/100 28.080m items/sec 25.874m items/sec -7.855 {'run_name': 'ArrayRangeEqualsListOfInt32/32768/100', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 598, 'null_percent': 1.0}
51 ArrayRangeEqualsListOfInt32/32768/2 18.613m items/sec 12.136m items/sec -34.798 {'run_name': 'ArrayRangeEqualsListOfInt32/32768/2', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 397, 'null_percent': 50.0}
6 ArrayRangeEqualsListOfInt32/32768/10 26.194m items/sec 12.610m items/sec -51.859 {'run_name': 'ArrayRangeEqualsListOfInt32/32768/10', 'run_type': 'iteration', 'repetitions': 0, 'repetition_index': 0, 'threads': 1, 'iterations': 560, 'null_percent': 10.0}
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] bkietz commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528212877
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const FixedSizeBinaryArray& left) {
- const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
- int32_t width = left.byte_width();
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
- if (left.values()) {
- left_data = left.raw_values();
- }
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
- if (right.values()) {
- right_data = right.raw_values();
- }
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- result_ = false;
- return Status::OK();
- }
- if (is_null) continue;
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
- if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
- result_ = false;
- return Status::OK();
- }
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- result_ = true;
return Status::OK();
}
- Status Visit(const Decimal128Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
- Status Visit(const Decimal256Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ Status Visit(const LargeListType& type) { return CompareList(type); }
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
- return Status::OK();
- }
-
- template <typename T>
- typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
- const T& left) {
- return CompareValues<T>(left);
- }
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- Status Visit(const ListArray& left) {
- result_ = CompareLists(left);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const LargeListArray& left) {
- result_ = CompareLists(left);
- return Status::OK();
- }
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
- Status Visit(const FixedSizeListArray& left) {
- const auto& right = checked_cast<const FixedSizeListArray&>(right_);
- result_ = left.values()->RangeEquals(
- left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
- right.value_offset(right_start_idx_), right.values());
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const MapArray& left) {
- result_ = CompareMaps(left);
- return Status::OK();
- }
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- Status Visit(const StructArray& left) {
- result_ = CompareStructs(left);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+ right_start_idx_ + right_.offset + j, 1);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ });
return Status::OK();
}
- Status Visit(const UnionArray& left) {
- result_ = CompareUnions(left);
+ Status Visit(const DenseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+ const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_offsets[left_start_idx_ + j],
+ right_offsets[right_start_idx_ + j], 1);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ });
return Status::OK();
}
- Status Visit(const DictionaryArray& left) {
- const auto& right = checked_cast<const DictionaryArray&>(right_);
- if (!left.dictionary()->Equals(right.dictionary())) {
- result_ = false;
- return Status::OK();
+ Status Visit(const DictionaryType& type) {
+ // Compare dictionaries
+ result_ &= CompareArrayRanges(
+ *left_.dictionary, *right_.dictionary,
+ /*left_start_idx=*/0,
+ /*left_end_idx=*/std::max(left_.dictionary->length, right_.dictionary->length),
+ /*right_start_idx=*/0, options_, floating_approximate_);
+ if (result_) {
+ // Compare indices
+ result_ &= CompareWithType(*type.index_type());
}
- result_ = left.indices()->RangeEquals(left_start_idx_, left_end_idx_,
- right_start_idx_, right.indices());
return Status::OK();
}
- Status Visit(const ExtensionArray& left) {
- result_ = (right_.type()->Equals(*left.type()) &&
- ArrayRangeEquals(*left.storage(),
- *static_cast<const ExtensionArray&>(right_).storage(),
- left_start_idx_, left_end_idx_, right_start_idx_));
+ Status Visit(const ExtensionType& type) {
+ // Compare storages
+ result_ &= CompareWithType(*type.storage_type());
return Status::OK();
}
- bool result() const { return result_; }
-
protected:
- const Array& right_;
- int64_t left_start_idx_;
- int64_t left_end_idx_;
- int64_t right_start_idx_;
-
- bool result_;
-};
-
-static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& right) {
- const int byte_width = internal::GetByteWidth(*left.type());
-
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
-
- if (left.values()) {
- left_data = left.values()->data() + left.offset() * byte_width;
+ template <typename TypeClass, typename CType = typename TypeClass::c_type>
+ Status ComparePrimitive(const TypeClass&) {
+ const CType* left_values = left_.GetValues<CType>(1);
+ const CType* right_values = right_.GetValues<CType>(1);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ return memcmp(left_values + left_start_idx_ + i,
+ right_values + right_start_idx_ + i, length * sizeof(CType)) == 0;
+ });
+ return Status::OK();
}
- if (right.values()) {
- right_data = right.values()->data() + right.offset() * byte_width;
- }
+ template <typename TypeClass>
+ Status CompareFloating(const TypeClass&) {
+ using T = typename TypeClass::c_type;
+ const T* left_values = left_.GetValues<T>(1);
+ const T* right_values = right_.GetValues<T>(1);
- if (byte_width == 0) {
- // Special case 0-width data, as the data pointers may be null
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i) != right.IsNull(i)) {
- return false;
- }
- }
- return true;
- } else if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- const bool left_null = left.IsNull(i);
- const bool right_null = right.IsNull(i);
- if (left_null != right_null) {
- return false;
+ if (floating_approximate_) {
+ const T epsilon = static_cast<T>(options_.atol());
+ if (options_.nans_equal()) {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
+ });
+ } else {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return (fabs(x - y) <= epsilon) || (x == y);
+ });
}
- if (!left_null && memcmp(left_data, right_data, byte_width) != 0) {
- return false;
+ } else {
+ if (options_.nans_equal()) {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return (x == y) || (std::isnan(x) && std::isnan(y));
+ });
+ } else {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return x == y;
+ });
}
- left_data += byte_width;
- right_data += byte_width;
}
- return true;
- } else {
- auto number_of_bytes_to_compare = static_cast<size_t>(byte_width * left.length());
- return memcmp(left_data, right_data, number_of_bytes_to_compare) == 0;
- }
-}
-
-// A bit confusing: ArrayEqualsVisitor inherits from RangeEqualsVisitor but
-// doesn't share the same preconditions.
-// When RangeEqualsVisitor is called, we only know the range sizes equal.
-// When ArrayEqualsVisitor is called, we know the sizes and null bitmaps are equal.
-
-class ArrayEqualsVisitor : public RangeEqualsVisitor {
- public:
- explicit ArrayEqualsVisitor(const Array& right, const EqualOptions& opts)
- : RangeEqualsVisitor(right, 0, right.length(), 0), opts_(opts) {}
-
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
return Status::OK();
}
- Status Visit(const BooleanArray& left) {
- const auto& right = checked_cast<const BooleanArray&>(right_);
+ template <typename TypeClass>
+ Status CompareBinary(const TypeClass&) {
+ const uint8_t* left_data = left_.GetValues<uint8_t>(2, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(2, 0);
- if (left.null_count() > 0) {
- const uint8_t* left_data = left.values()->data();
- const uint8_t* right_data = right.values()->data();
-
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsValid(i) && BitUtil::GetBit(left_data, i + left.offset()) !=
- BitUtil::GetBit(right_data, i + right.offset())) {
- result_ = false;
- return Status::OK();
- }
- }
- result_ = true;
+ if (left_data != nullptr && right_data != nullptr) {
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool {
+ return memcmp(left_data + left_offset, right_data + right_offset, length) == 0;
+ };
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
} else {
- result_ = BitmapEquals(left.values()->data(), left.offset(), right.values()->data(),
- right.offset(), left.length());
+ // One of the arrays is an array of empty strings and nulls.
+ // We just need to compare the offsets.
+ // (note we must not call memcmp() with null data pointers)
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool { return true; };
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
Review comment:
Yes, `...` can be used to ignore any trivially constructible argument(s).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210676
##########
File path: cpp/src/arrow/util/bitmap_reader.h
##########
@@ -69,6 +69,77 @@ class BitmapReader {
int64_t bit_offset_;
};
+// XXX Cannot name it BitmapWordReader because the name is already used
+// in bitmap_ops.cc
Review comment:
Yeah, I think that would be useful. I should open JIRAs for the various XXX and TODOs here.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528211002
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const FixedSizeBinaryArray& left) {
- const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
- int32_t width = left.byte_width();
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
- if (left.values()) {
- left_data = left.raw_values();
- }
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
- if (right.values()) {
- right_data = right.raw_values();
- }
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- result_ = false;
- return Status::OK();
- }
- if (is_null) continue;
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
- if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
- result_ = false;
- return Status::OK();
- }
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- result_ = true;
return Status::OK();
}
- Status Visit(const Decimal128Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
- Status Visit(const Decimal256Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ Status Visit(const LargeListType& type) { return CompareList(type); }
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
- return Status::OK();
- }
-
- template <typename T>
- typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
- const T& left) {
- return CompareValues<T>(left);
- }
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- Status Visit(const ListArray& left) {
- result_ = CompareLists(left);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const LargeListArray& left) {
- result_ = CompareLists(left);
- return Status::OK();
- }
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
- Status Visit(const FixedSizeListArray& left) {
- const auto& right = checked_cast<const FixedSizeListArray&>(right_);
- result_ = left.values()->RangeEquals(
- left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
- right.value_offset(right_start_idx_), right.values());
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const MapArray& left) {
- result_ = CompareMaps(left);
- return Status::OK();
- }
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- Status Visit(const StructArray& left) {
- result_ = CompareStructs(left);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
Review comment:
Right, though the question is whether this would improve or worsen common cases (I have no idea how unions are used in the wild).
In any case, this could be a separate low-priority JIRA (I don't think optimizing unions is in our current priorities).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#issuecomment-729584943
https://issues.apache.org/jira/browse/ARROW-10143
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#issuecomment-730355163
Rebased.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] bkietz commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528213657
##########
File path: cpp/src/arrow/array/diff.h
##########
@@ -59,6 +57,27 @@ ARROW_EXPORT
Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
MemoryPool* pool = default_memory_pool());
+/// \brief Compare two array ranges, returning an edit script which expresses the
+/// difference between them
+///
+/// Same as Diff(), but only the ranges defined by the given offsets and lengths
+/// are compared.
+///
+/// \param[in] base baseline for comparison
+/// \param[in] target an array of identical type to base whose elements differ from base's
+/// \param[in] base_offset the start offset of the range to consider inside `base`
+/// \param[in] base_length the length of the range to consider inside `base`
+/// \param[in] target_offset the start offset of the range to consider inside `target`
+/// \param[in] target_length the length of the range to consider inside `target`
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return an edit script array which can be applied to base to produce target
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> DiffRanges(const Array& base, const Array& target,
Review comment:
Right; if you wanted to minimize this patch somewhat, then this could be removed.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528211150
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const FixedSizeBinaryArray& left) {
- const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
- int32_t width = left.byte_width();
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
- if (left.values()) {
- left_data = left.raw_values();
- }
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
- if (right.values()) {
- right_data = right.raw_values();
- }
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- result_ = false;
- return Status::OK();
- }
- if (is_null) continue;
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
- if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
- result_ = false;
- return Status::OK();
- }
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- result_ = true;
return Status::OK();
}
- Status Visit(const Decimal128Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
- Status Visit(const Decimal256Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ Status Visit(const LargeListType& type) { return CompareList(type); }
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
- return Status::OK();
- }
-
- template <typename T>
- typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
- const T& left) {
- return CompareValues<T>(left);
- }
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- Status Visit(const ListArray& left) {
- result_ = CompareLists(left);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const LargeListArray& left) {
- result_ = CompareLists(left);
- return Status::OK();
- }
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
- Status Visit(const FixedSizeListArray& left) {
- const auto& right = checked_cast<const FixedSizeListArray&>(right_);
- result_ = left.values()->RangeEquals(
- left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
- right.value_offset(right_start_idx_), right.values());
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const MapArray& left) {
- result_ = CompareMaps(left);
- return Status::OK();
- }
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- Status Visit(const StructArray& left) {
- result_ = CompareStructs(left);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+ right_start_idx_ + right_.offset + j, 1);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ });
return Status::OK();
}
- Status Visit(const UnionArray& left) {
- result_ = CompareUnions(left);
+ Status Visit(const DenseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+ const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
Review comment:
I kept the previous semantics of dictionary array comparison, that is: if the dictionaries themselves are unequal, the arrays are considered unequal even if the indices only refer to equal dictionary elements.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] wesm commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
wesm commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r527275663
##########
File path: cpp/src/arrow/ipc/feather_test.cc
##########
@@ -286,10 +286,13 @@ TEST_P(TestFeather, PrimitiveNullRoundTrip) {
std::vector<std::shared_ptr<Array>> expected_fields;
for (int i = 0; i < batch->num_columns(); ++i) {
ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name());
- StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
- batch->column(i)->null_bitmap(),
- batch->column(i)->null_count());
- AssertArraysEqual(str_values, *result->column(i)->chunk(0));
+ ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(utf8(), batch->num_rows()));
+ AssertArraysEqual(*expected, *result->column(i)->chunk(0));
+ // StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+ // batch->column(i)->null_bitmap(),
+ // batch->column(i)->null_count());
+ // AssertArraysEqual(str_values, *result->column(i)->chunk(0),
Review comment:
I recall this question coming up in the past, but I don't recall the outcome. @xhochy, do you remember?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou closed pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou closed pull request #8703:
URL: https://github.com/apache/arrow/pull/8703
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210425
##########
File path: cpp/src/arrow/util/bit_run_reader.h
##########
@@ -162,5 +166,7 @@ class ARROW_EXPORT BitRunReader {
using BitRunReader = BitRunReaderLinear;
#endif
+// TODO SetBitRunReader?
+
Review comment:
Yes, it would. That would make its logic much simpler and more easily optimized by the compiler (though, of course, numbers will be the final judge).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r526854313
##########
File path: cpp/src/arrow/ipc/feather_test.cc
##########
@@ -286,10 +286,13 @@ TEST_P(TestFeather, PrimitiveNullRoundTrip) {
std::vector<std::shared_ptr<Array>> expected_fields;
for (int i = 0; i < batch->num_columns(); ++i) {
ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name());
- StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
- batch->column(i)->null_bitmap(),
- batch->column(i)->null_count());
- AssertArraysEqual(str_values, *result->column(i)->chunk(0));
+ ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(utf8(), batch->num_rows()));
+ AssertArraysEqual(*expected, *result->column(i)->chunk(0));
+ // StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+ // batch->column(i)->null_bitmap(),
+ // batch->column(i)->null_count());
+ // AssertArraysEqual(str_values, *result->column(i)->chunk(0),
Review comment:
@wesm Is constructing a StringArray with null offsets and data buffers supported (i.e. an array that is non-empty but all nulls)?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528211187
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const FixedSizeBinaryArray& left) {
- const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
- int32_t width = left.byte_width();
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
- if (left.values()) {
- left_data = left.raw_values();
- }
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
- if (right.values()) {
- right_data = right.raw_values();
- }
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- result_ = false;
- return Status::OK();
- }
- if (is_null) continue;
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
- if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
- result_ = false;
- return Status::OK();
- }
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- result_ = true;
return Status::OK();
}
- Status Visit(const Decimal128Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
- Status Visit(const Decimal256Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ Status Visit(const LargeListType& type) { return CompareList(type); }
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
- return Status::OK();
- }
-
- template <typename T>
- typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
- const T& left) {
- return CompareValues<T>(left);
- }
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- Status Visit(const ListArray& left) {
- result_ = CompareLists(left);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const LargeListArray& left) {
- result_ = CompareLists(left);
- return Status::OK();
- }
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
- Status Visit(const FixedSizeListArray& left) {
- const auto& right = checked_cast<const FixedSizeListArray&>(right_);
- result_ = left.values()->RangeEquals(
- left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
- right.value_offset(right_start_idx_), right.values());
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const MapArray& left) {
- result_ = CompareMaps(left);
- return Status::OK();
- }
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- Status Visit(const StructArray& left) {
- result_ = CompareStructs(left);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+ right_start_idx_ + right_.offset + j, 1);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ });
return Status::OK();
}
- Status Visit(const UnionArray& left) {
- result_ = CompareUnions(left);
+ Status Visit(const DenseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+ const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_offsets[left_start_idx_ + j],
+ right_offsets[right_start_idx_ + j], 1);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ });
return Status::OK();
}
- Status Visit(const DictionaryArray& left) {
- const auto& right = checked_cast<const DictionaryArray&>(right_);
- if (!left.dictionary()->Equals(right.dictionary())) {
- result_ = false;
- return Status::OK();
+ Status Visit(const DictionaryType& type) {
+ // Compare dictionaries
+ result_ &= CompareArrayRanges(
+ *left_.dictionary, *right_.dictionary,
+ /*left_start_idx=*/0,
+ /*left_end_idx=*/std::max(left_.dictionary->length, right_.dictionary->length),
+ /*right_start_idx=*/0, options_, floating_approximate_);
+ if (result_) {
+ // Compare indices
+ result_ &= CompareWithType(*type.index_type());
}
- result_ = left.indices()->RangeEquals(left_start_idx_, left_end_idx_,
- right_start_idx_, right.indices());
return Status::OK();
}
- Status Visit(const ExtensionArray& left) {
- result_ = (right_.type()->Equals(*left.type()) &&
- ArrayRangeEquals(*left.storage(),
- *static_cast<const ExtensionArray&>(right_).storage(),
- left_start_idx_, left_end_idx_, right_start_idx_));
+ Status Visit(const ExtensionType& type) {
+ // Compare storages
+ result_ &= CompareWithType(*type.storage_type());
return Status::OK();
}
- bool result() const { return result_; }
-
protected:
- const Array& right_;
- int64_t left_start_idx_;
- int64_t left_end_idx_;
- int64_t right_start_idx_;
-
- bool result_;
-};
-
-static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& right) {
- const int byte_width = internal::GetByteWidth(*left.type());
-
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
-
- if (left.values()) {
- left_data = left.values()->data() + left.offset() * byte_width;
+ template <typename TypeClass, typename CType = typename TypeClass::c_type>
+ Status ComparePrimitive(const TypeClass&) {
+ const CType* left_values = left_.GetValues<CType>(1);
+ const CType* right_values = right_.GetValues<CType>(1);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ return memcmp(left_values + left_start_idx_ + i,
+ right_values + right_start_idx_ + i, length * sizeof(CType)) == 0;
+ });
+ return Status::OK();
}
- if (right.values()) {
- right_data = right.values()->data() + right.offset() * byte_width;
- }
+ template <typename TypeClass>
+ Status CompareFloating(const TypeClass&) {
+ using T = typename TypeClass::c_type;
+ const T* left_values = left_.GetValues<T>(1);
+ const T* right_values = right_.GetValues<T>(1);
- if (byte_width == 0) {
- // Special case 0-width data, as the data pointers may be null
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i) != right.IsNull(i)) {
- return false;
- }
- }
- return true;
- } else if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- const bool left_null = left.IsNull(i);
- const bool right_null = right.IsNull(i);
- if (left_null != right_null) {
- return false;
+ if (floating_approximate_) {
+ const T epsilon = static_cast<T>(options_.atol());
+ if (options_.nans_equal()) {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
+ });
+ } else {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return (fabs(x - y) <= epsilon) || (x == y);
+ });
}
- if (!left_null && memcmp(left_data, right_data, byte_width) != 0) {
- return false;
+ } else {
+ if (options_.nans_equal()) {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return (x == y) || (std::isnan(x) && std::isnan(y));
+ });
+ } else {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return x == y;
+ });
}
- left_data += byte_width;
- right_data += byte_width;
}
- return true;
- } else {
- auto number_of_bytes_to_compare = static_cast<size_t>(byte_width * left.length());
- return memcmp(left_data, right_data, number_of_bytes_to_compare) == 0;
- }
-}
-
-// A bit confusing: ArrayEqualsVisitor inherits from RangeEqualsVisitor but
-// doesn't share the same preconditions.
-// When RangeEqualsVisitor is called, we only know the range sizes equal.
-// When ArrayEqualsVisitor is called, we know the sizes and null bitmaps are equal.
-
-class ArrayEqualsVisitor : public RangeEqualsVisitor {
- public:
- explicit ArrayEqualsVisitor(const Array& right, const EqualOptions& opts)
- : RangeEqualsVisitor(right, 0, right.length(), 0), opts_(opts) {}
-
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
return Status::OK();
}
- Status Visit(const BooleanArray& left) {
- const auto& right = checked_cast<const BooleanArray&>(right_);
+ template <typename TypeClass>
+ Status CompareBinary(const TypeClass&) {
+ const uint8_t* left_data = left_.GetValues<uint8_t>(2, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(2, 0);
- if (left.null_count() > 0) {
- const uint8_t* left_data = left.values()->data();
- const uint8_t* right_data = right.values()->data();
-
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsValid(i) && BitUtil::GetBit(left_data, i + left.offset()) !=
- BitUtil::GetBit(right_data, i + right.offset())) {
- result_ = false;
- return Status::OK();
- }
- }
- result_ = true;
+ if (left_data != nullptr && right_data != nullptr) {
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool {
+ return memcmp(left_data + left_offset, right_data + right_offset, length) == 0;
+ };
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
} else {
- result_ = BitmapEquals(left.values()->data(), left.offset(), right.values()->data(),
- right.offset(), left.length());
+ // One of the arrays is an array of empty strings and nulls.
+ // We just need to compare the offsets.
+ // (note we must not call memcmp() with null data pointers)
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool { return true; };
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
Review comment:
Does this work in C++11? :-o
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528622512
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
Review comment:
That will be part of the JIRA follow-up (investigate replacing BitmapWordReader with BitmapUInt64Reader).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210778
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
Review comment:
It's OK to compute the null count, IMHO; it is often used for other tasks.
The reason this heuristic only works when comparing entire arrays (not sub-ranges) is that two whole arrays could have different null counts while the compared ranges are still equal.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] bkietz commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528213541
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const FixedSizeBinaryArray& left) {
- const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
- int32_t width = left.byte_width();
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
- if (left.values()) {
- left_data = left.raw_values();
- }
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
- if (right.values()) {
- right_data = right.raw_values();
- }
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- result_ = false;
- return Status::OK();
- }
- if (is_null) continue;
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
- if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
- result_ = false;
- return Status::OK();
- }
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- result_ = true;
return Status::OK();
}
- Status Visit(const Decimal128Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
- Status Visit(const Decimal256Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ Status Visit(const LargeListType& type) { return CompareList(type); }
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
- return Status::OK();
- }
-
- template <typename T>
- typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
- const T& left) {
- return CompareValues<T>(left);
- }
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- Status Visit(const ListArray& left) {
- result_ = CompareLists(left);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const LargeListArray& left) {
- result_ = CompareLists(left);
- return Status::OK();
- }
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
- Status Visit(const FixedSizeListArray& left) {
- const auto& right = checked_cast<const FixedSizeListArray&>(right_);
- result_ = left.values()->RangeEquals(
- left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
- right.value_offset(right_start_idx_), right.values());
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const MapArray& left) {
- result_ = CompareMaps(left);
- return Status::OK();
- }
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- Status Visit(const StructArray& left) {
- result_ = CompareStructs(left);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+ right_start_idx_ + right_.offset + j, 1);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ });
return Status::OK();
}
- Status Visit(const UnionArray& left) {
- result_ = CompareUnions(left);
+ Status Visit(const DenseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+ const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
Review comment:
I see that you kept the previous semantics; I was just noting that the difference is odd. I would have expected the two cases to be more symmetric, and wondered if you could comment.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] bkietz commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r526259632
##########
File path: cpp/src/arrow/ipc/feather_test.cc
##########
@@ -286,10 +286,13 @@ TEST_P(TestFeather, PrimitiveNullRoundTrip) {
std::vector<std::shared_ptr<Array>> expected_fields;
for (int i = 0; i < batch->num_columns(); ++i) {
ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name());
- StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
- batch->column(i)->null_bitmap(),
- batch->column(i)->null_count());
- AssertArraysEqual(str_values, *result->column(i)->chunk(0));
+ ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(utf8(), batch->num_rows()));
+ AssertArraysEqual(*expected, *result->column(i)->chunk(0));
+ // StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+ // batch->column(i)->null_bitmap(),
+ // batch->column(i)->null_count());
+ // AssertArraysEqual(str_values, *result->column(i)->chunk(0),
+ // /*verbose=*/true);
Review comment:
Looks like leftovers
```suggestion
```
##########
File path: cpp/src/arrow/util/bit_run_reader.h
##########
@@ -162,5 +166,7 @@ class ARROW_EXPORT BitRunReader {
using BitRunReader = BitRunReaderLinear;
#endif
+// TODO SetBitRunReader?
+
Review comment:
IIUC BitRunReader yields alternating set/unset BitRuns. Would SetBitRunReader yield only the set BitRuns?
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
Review comment:
I'm not sure why this optimization only applies to entire arrays. Additionally, as currently stated it doesn't necessarily compare cached null counts.
If we want to absolutely avoid accessing the null bitmap here, we'll need:
```suggestion
// Compare null bitmaps
// Try to compare cached null counts first:
int64_t left_null_count = left_.null_count.load(), right_null_count = right_.null_count.load();
if (left_null_count != kUnknownNullCount && right_null_count != kUnknownNullCount &&
left_null_count != right_null_count) {
return false;
}
```
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const FixedSizeBinaryArray& left) {
- const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
- int32_t width = left.byte_width();
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
- if (left.values()) {
- left_data = left.raw_values();
- }
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
- if (right.values()) {
- right_data = right.raw_values();
- }
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- result_ = false;
- return Status::OK();
- }
- if (is_null) continue;
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
- if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
- result_ = false;
- return Status::OK();
- }
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- result_ = true;
return Status::OK();
}
- Status Visit(const Decimal128Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
- Status Visit(const Decimal256Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ Status Visit(const LargeListType& type) { return CompareList(type); }
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
- return Status::OK();
- }
-
- template <typename T>
- typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
- const T& left) {
- return CompareValues<T>(left);
- }
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- Status Visit(const ListArray& left) {
- result_ = CompareLists(left);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const LargeListArray& left) {
- result_ = CompareLists(left);
- return Status::OK();
- }
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
- Status Visit(const FixedSizeListArray& left) {
- const auto& right = checked_cast<const FixedSizeListArray&>(right_);
- result_ = left.values()->RangeEquals(
- left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
- right.value_offset(right_start_idx_), right.values());
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const MapArray& left) {
- result_ = CompareMaps(left);
- return Status::OK();
- }
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- Status Visit(const StructArray& left) {
- result_ = CompareStructs(left);
+ VisitValidRuns([&](int64_t i, int64_t length) {
Review comment:
Since unions don't have top-level nulls, is this worthwhile?
##########
File path: cpp/src/arrow/array/diff.h
##########
@@ -59,6 +57,27 @@ ARROW_EXPORT
Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
MemoryPool* pool = default_memory_pool());
+/// \brief Compare two array ranges, returning an edit script which expresses the
+/// difference between them
+///
+/// Same as Diff(), but only the ranges defined by the given offsets and lengths
+/// are compared.
+///
+/// \param[in] base baseline for comparison
+/// \param[in] target an array of identical type to base whose elements differ from base's
+/// \param[in] base_offset the start offset of the range to consider inside `base`
+/// \param[in] base_length the length of the range to consider inside `base`
+/// \param[in] target_offset the start offset of the range to consider inside `target`
+/// \param[in] target_length the length of the range to consider inside `target`
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return an edit script array which can be applied to base to produce target
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> DiffRanges(const Array& base, const Array& target,
Review comment:
Not really an objection, but: I'm not sure what this adds over applying the ranges to the base and target ArrayData and then running Diff on those.
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
Review comment:
This seems like a generally useful bitmap comparison utility. If it's faster than BitmapEquals, then maybe it could replace that function?
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const FixedSizeBinaryArray& left) {
- const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
- int32_t width = left.byte_width();
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
- if (left.values()) {
- left_data = left.raw_values();
- }
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
- if (right.values()) {
- right_data = right.raw_values();
- }
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- result_ = false;
- return Status::OK();
- }
- if (is_null) continue;
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
- if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
- result_ = false;
- return Status::OK();
- }
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- result_ = true;
return Status::OK();
}
- Status Visit(const Decimal128Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
- Status Visit(const Decimal256Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ Status Visit(const LargeListType& type) { return CompareList(type); }
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
- return Status::OK();
- }
-
- template <typename T>
- typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
- const T& left) {
- return CompareValues<T>(left);
- }
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- Status Visit(const ListArray& left) {
- result_ = CompareLists(left);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const LargeListArray& left) {
- result_ = CompareLists(left);
- return Status::OK();
- }
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
- Status Visit(const FixedSizeListArray& left) {
- const auto& right = checked_cast<const FixedSizeListArray&>(right_);
- result_ = left.values()->RangeEquals(
- left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
- right.value_offset(right_start_idx_), right.values());
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const MapArray& left) {
- result_ = CompareMaps(left);
- return Status::OK();
- }
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- Status Visit(const StructArray& left) {
- result_ = CompareStructs(left);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
Review comment:
Interesting, this seems very doable:
- get next type_id of left and right
- early exit if not equal
- find length of run in left_codes
- find length of run in right_codes
- early exit if run lengths are not equal
- compare ranges of children
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const FixedSizeBinaryArray& left) {
- const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
- int32_t width = left.byte_width();
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
- if (left.values()) {
- left_data = left.raw_values();
- }
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
- if (right.values()) {
- right_data = right.raw_values();
- }
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- result_ = false;
- return Status::OK();
- }
- if (is_null) continue;
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
- if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
- result_ = false;
- return Status::OK();
- }
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- result_ = true;
return Status::OK();
}
- Status Visit(const Decimal128Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
- Status Visit(const Decimal256Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ Status Visit(const LargeListType& type) { return CompareList(type); }
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
- return Status::OK();
- }
-
- template <typename T>
- typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
- const T& left) {
- return CompareValues<T>(left);
- }
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- Status Visit(const ListArray& left) {
- result_ = CompareLists(left);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const LargeListArray& left) {
- result_ = CompareLists(left);
- return Status::OK();
- }
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
- Status Visit(const FixedSizeListArray& left) {
- const auto& right = checked_cast<const FixedSizeListArray&>(right_);
- result_ = left.values()->RangeEquals(
- left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
- right.value_offset(right_start_idx_), right.values());
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const MapArray& left) {
- result_ = CompareMaps(left);
- return Status::OK();
- }
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- Status Visit(const StructArray& left) {
- result_ = CompareStructs(left);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+ right_start_idx_ + right_.offset + j, 1);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ });
return Status::OK();
}
- Status Visit(const UnionArray& left) {
- result_ = CompareUnions(left);
+ Status Visit(const DenseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+ const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_offsets[left_start_idx_ + j],
+ right_offsets[right_start_idx_ + j], 1);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ });
return Status::OK();
}
- Status Visit(const DictionaryArray& left) {
- const auto& right = checked_cast<const DictionaryArray&>(right_);
- if (!left.dictionary()->Equals(right.dictionary())) {
- result_ = false;
- return Status::OK();
+ Status Visit(const DictionaryType& type) {
+ // Compare dictionaries
+ result_ &= CompareArrayRanges(
+ *left_.dictionary, *right_.dictionary,
+ /*left_start_idx=*/0,
+ /*left_end_idx=*/std::max(left_.dictionary->length, right_.dictionary->length),
+ /*right_start_idx=*/0, options_, floating_approximate_);
+ if (result_) {
+ // Compare indices
+ result_ &= CompareWithType(*type.index_type());
}
- result_ = left.indices()->RangeEquals(left_start_idx_, left_end_idx_,
- right_start_idx_, right.indices());
return Status::OK();
}
- Status Visit(const ExtensionArray& left) {
- result_ = (right_.type()->Equals(*left.type()) &&
- ArrayRangeEquals(*left.storage(),
- *static_cast<const ExtensionArray&>(right_).storage(),
- left_start_idx_, left_end_idx_, right_start_idx_));
+ Status Visit(const ExtensionType& type) {
+ // Compare storages
+ result_ &= CompareWithType(*type.storage_type());
return Status::OK();
}
- bool result() const { return result_; }
-
protected:
- const Array& right_;
- int64_t left_start_idx_;
- int64_t left_end_idx_;
- int64_t right_start_idx_;
-
- bool result_;
-};
-
-static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& right) {
- const int byte_width = internal::GetByteWidth(*left.type());
-
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
-
- if (left.values()) {
- left_data = left.values()->data() + left.offset() * byte_width;
+ template <typename TypeClass, typename CType = typename TypeClass::c_type>
+ Status ComparePrimitive(const TypeClass&) {
+ const CType* left_values = left_.GetValues<CType>(1);
+ const CType* right_values = right_.GetValues<CType>(1);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ return memcmp(left_values + left_start_idx_ + i,
+ right_values + right_start_idx_ + i, length * sizeof(CType)) == 0;
+ });
+ return Status::OK();
}
- if (right.values()) {
- right_data = right.values()->data() + right.offset() * byte_width;
- }
+ template <typename TypeClass>
+ Status CompareFloating(const TypeClass&) {
+ using T = typename TypeClass::c_type;
+ const T* left_values = left_.GetValues<T>(1);
+ const T* right_values = right_.GetValues<T>(1);
- if (byte_width == 0) {
- // Special case 0-width data, as the data pointers may be null
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i) != right.IsNull(i)) {
- return false;
- }
- }
- return true;
- } else if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- const bool left_null = left.IsNull(i);
- const bool right_null = right.IsNull(i);
- if (left_null != right_null) {
- return false;
+ if (floating_approximate_) {
+ const T epsilon = static_cast<T>(options_.atol());
+ if (options_.nans_equal()) {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
+ });
+ } else {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return (fabs(x - y) <= epsilon) || (x == y);
+ });
}
- if (!left_null && memcmp(left_data, right_data, byte_width) != 0) {
- return false;
+ } else {
+ if (options_.nans_equal()) {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return (x == y) || (std::isnan(x) && std::isnan(y));
+ });
+ } else {
+ VisitValues([&](int64_t i) {
+ const T x = left_values[i + left_start_idx_];
+ const T y = right_values[i + right_start_idx_];
+ return x == y;
+ });
}
- left_data += byte_width;
- right_data += byte_width;
}
- return true;
- } else {
- auto number_of_bytes_to_compare = static_cast<size_t>(byte_width * left.length());
- return memcmp(left_data, right_data, number_of_bytes_to_compare) == 0;
- }
-}
-
-// A bit confusing: ArrayEqualsVisitor inherits from RangeEqualsVisitor but
-// doesn't share the same preconditions.
-// When RangeEqualsVisitor is called, we only know the range sizes equal.
-// When ArrayEqualsVisitor is called, we know the sizes and null bitmaps are equal.
-
-class ArrayEqualsVisitor : public RangeEqualsVisitor {
- public:
- explicit ArrayEqualsVisitor(const Array& right, const EqualOptions& opts)
- : RangeEqualsVisitor(right, 0, right.length(), 0), opts_(opts) {}
-
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
return Status::OK();
}
- Status Visit(const BooleanArray& left) {
- const auto& right = checked_cast<const BooleanArray&>(right_);
+ template <typename TypeClass>
+ Status CompareBinary(const TypeClass&) {
+ const uint8_t* left_data = left_.GetValues<uint8_t>(2, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(2, 0);
- if (left.null_count() > 0) {
- const uint8_t* left_data = left.values()->data();
- const uint8_t* right_data = right.values()->data();
-
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsValid(i) && BitUtil::GetBit(left_data, i + left.offset()) !=
- BitUtil::GetBit(right_data, i + right.offset())) {
- result_ = false;
- return Status::OK();
- }
- }
- result_ = true;
+ if (left_data != nullptr && right_data != nullptr) {
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool {
+ return memcmp(left_data + left_offset, right_data + right_offset, length) == 0;
+ };
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
} else {
- result_ = BitmapEquals(left.values()->data(), left.offset(), right.values()->data(),
- right.offset(), left.length());
+ // One of the arrays is an array of empty strings and nulls.
+ // We just need to compare the offsets.
+ // (note we must not call memcmp() with null data pointers)
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool { return true; };
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
Review comment:
```suggestion
CompareWithOffsets<typename TypeClass::offset_type>(1, [](...) { return true; });
```
##########
File path: cpp/src/arrow/util/bitmap_reader.h
##########
@@ -69,6 +69,77 @@ class BitmapReader {
int64_t bit_offset_;
};
+// XXX Cannot name it BitmapWordReader because the name is already used
+// in bitmap_ops.cc
Review comment:
There's a lot of duplication of logic under the heading of "reading words from bitmaps": VisitWords, BitmapWordReader, BitmapUInt64Reader. In a follow-up it'd be handy to consolidate these and ensure they're rigorously benchmarked (or, if there are compelling reasons not to consolidate, to provide comments indicating when each should be preferred).
##########
File path: cpp/src/arrow/util/bitmap.h
##########
@@ -110,8 +110,8 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
///
/// TODO(bkietz) allow for early termination
template <size_t N, typename Visitor,
- typename Word =
- typename internal::call_traits::argument_type<0, Visitor&&>::value_type>
+ typename Word = typename std::decay<
+ internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
Review comment:
I don't think VisitWords is being used in this PR. IIUC, this change would only apply if the visitor took a constant reference to the words array rather than just taking the words array by value?
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const FixedSizeBinaryArray& left) {
- const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
- int32_t width = left.byte_width();
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
- if (left.values()) {
- left_data = left.raw_values();
- }
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
- if (right.values()) {
- right_data = right.raw_values();
- }
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- result_ = false;
- return Status::OK();
- }
- if (is_null) continue;
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
- if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
- result_ = false;
- return Status::OK();
- }
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- result_ = true;
return Status::OK();
}
- Status Visit(const Decimal128Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
- Status Visit(const Decimal256Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ Status Visit(const LargeListType& type) { return CompareList(type); }
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
- return Status::OK();
- }
-
- template <typename T>
- typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
- const T& left) {
- return CompareValues<T>(left);
- }
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- Status Visit(const ListArray& left) {
- result_ = CompareLists(left);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const LargeListArray& left) {
- result_ = CompareLists(left);
- return Status::OK();
- }
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
- Status Visit(const FixedSizeListArray& left) {
- const auto& right = checked_cast<const FixedSizeListArray&>(right_);
- result_ = left.values()->RangeEquals(
- left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
- right.value_offset(right_start_idx_), right.values());
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const MapArray& left) {
- result_ = CompareMaps(left);
- return Status::OK();
- }
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- Status Visit(const StructArray& left) {
- result_ = CompareStructs(left);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+ right_start_idx_ + right_.offset + j, 1);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ });
return Status::OK();
}
- Status Visit(const UnionArray& left) {
- result_ = CompareUnions(left);
+ Status Visit(const DenseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+ const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
Review comment:
Runs of type_id can still be used here, but ranges of children can't be directly compared due to the offsets. It's odd that we "dereference" offsets when comparing dense union arrays but we don't do the same with a dictionary array's indices.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210816
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
Review comment:
Will take a look.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528593225
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const FixedSizeBinaryArray& left) {
- const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
- int32_t width = left.byte_width();
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
- if (left.values()) {
- left_data = left.raw_values();
- }
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
- if (right.values()) {
- right_data = right.raw_values();
- }
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- result_ = false;
- return Status::OK();
- }
- if (is_null) continue;
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
- if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
- result_ = false;
- return Status::OK();
- }
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- result_ = true;
return Status::OK();
}
- Status Visit(const Decimal128Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
- Status Visit(const Decimal256Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ Status Visit(const LargeListType& type) { return CompareList(type); }
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
- return Status::OK();
- }
-
- template <typename T>
- typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
- const T& left) {
- return CompareValues<T>(left);
- }
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- Status Visit(const ListArray& left) {
- result_ = CompareLists(left);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const LargeListArray& left) {
- result_ = CompareLists(left);
- return Status::OK();
- }
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
- Status Visit(const FixedSizeListArray& left) {
- const auto& right = checked_cast<const FixedSizeListArray&>(right_);
- result_ = left.values()->RangeEquals(
- left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
- right.value_offset(right_start_idx_), right.values());
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const MapArray& left) {
- result_ = CompareMaps(left);
- return Status::OK();
- }
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- Status Visit(const StructArray& left) {
- result_ = CompareStructs(left);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_start_idx_ + left_.offset + j,
+ right_start_idx_ + right_.offset + j, 1);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ });
return Status::OK();
}
- Status Visit(const UnionArray& left) {
- result_ = CompareUnions(left);
+ Status Visit(const DenseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+ const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ const auto type_id = left_codes[left_start_idx_ + j];
+ if (type_id != right_codes[right_start_idx_ + j]) {
+ return false;
+ }
+ const auto child_num = child_ids[type_id];
Review comment:
I would defer to @wesm for the historical reasons.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210910
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- result_ = true;
- return Status::OK();
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
}
- template <typename ArrayType, typename CompareValuesFunc>
- bool CompareWithOffsets(const ArrayType& left,
- CompareValuesFunc&& compare_values) const {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- return false;
- }
- if (is_null) continue;
- const auto begin_offset = left.value_offset(i);
- const auto end_offset = left.value_offset(i + 1);
- const auto right_begin_offset = right.value_offset(o_i);
- const auto right_end_offset = right.value_offset(o_i + 1);
- // Underlying can't be equal if the size isn't equal
- if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
- return false;
- }
-
- if (!compare_values(left, right, begin_offset, right_begin_offset,
- end_offset - begin_offset)) {
- return false;
- }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- return true;
+ return result_;
}
- template <typename BinaryArrayType>
- bool CompareBinaryRange(const BinaryArrayType& left) const {
- using offset_type = typename BinaryArrayType::offset_type;
+ Status Visit(const NullType&) { return Status::OK(); }
- auto compare_values = [](const BinaryArrayType& left, const BinaryArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return std::memcmp(left.value_data()->data() + left_offset,
- right.value_data()->data() + right_offset,
- static_cast<size_t>(nvalues)) == 0;
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename ListArrayType>
- bool CompareLists(const ListArrayType& left) {
- using offset_type = typename ListArrayType::offset_type;
- const auto& right = checked_cast<const ListArrayType&>(right_);
- const std::shared_ptr<Array>& left_values = left.values();
- const std::shared_ptr<Array>& right_values = right.values();
-
- auto compare_values = [&](const ListArrayType& left, const ListArrayType& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_values->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_values);
- };
- return CompareWithOffsets(left, compare_values);
- }
-
- bool CompareMaps(const MapArray& left) {
- // We need a specific comparison helper for maps to avoid comparing
- // struct field names (which are indifferent for maps)
- using offset_type = typename MapArray::offset_type;
- const auto& right = checked_cast<const MapArray&>(right_);
- const auto left_keys = left.keys();
- const auto left_items = left.items();
- const auto right_keys = right.keys();
- const auto right_items = right.items();
-
- auto compare_values = [&](const MapArray& left, const MapArray& right,
- offset_type left_offset, offset_type right_offset,
- offset_type nvalues) {
- if (nvalues == 0) {
- return true;
- }
- return left_keys->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_keys) &&
- left_items->RangeEquals(left_offset, left_offset + nvalues, right_offset,
- right_items);
- };
- return CompareWithOffsets(left, compare_values);
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- bool CompareStructs(const StructArray& left) {
- const auto& right = checked_cast<const StructArray&>(right_);
- bool equal_fields = true;
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- for (int j = 0; j < left.num_fields(); ++j) {
- // TODO: really we should be comparing stretches of non-null data rather
- // than looking at one value at a time.
- equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j));
- if (!equal_fields) {
- return false;
- }
- }
- }
- return true;
- }
-
- bool CompareUnions(const UnionArray& left) const {
- const auto& right = checked_cast<const UnionArray&>(right_);
-
- const UnionMode::type union_mode = left.mode();
- if (union_mode != right.mode()) {
- return false;
- }
-
- const auto& left_type = checked_cast<const UnionType&>(*left.type());
-
- const std::vector<int>& child_ids = left_type.child_ids();
-
- const int8_t* left_codes = left.raw_type_codes();
- const int8_t* right_codes = right.raw_type_codes();
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- if (left.IsNull(i) != right.IsNull(o_i)) {
- return false;
- }
- if (left.IsNull(i)) continue;
- if (left_codes[i] != right_codes[o_i]) {
- return false;
- }
-
- auto child_num = child_ids[left_codes[i]];
-
- // TODO(wesm): really we should be comparing stretches of non-null data
- // rather than looking at one value at a time.
- if (union_mode == UnionMode::SPARSE) {
- if (!left.field(child_num)->RangeEquals(i, i + 1, o_i, right.field(child_num))) {
- return false;
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
}
+ return true;
} else {
- const int32_t offset =
- checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
- const int32_t o_offset =
- checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
- if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
- right.field(child_num))) {
- return false;
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
}
+ DCHECK_EQ(right_reader.position(), length);
}
- }
- return true;
- }
-
- Status Visit(const BinaryArray& left) {
- result_ = CompareBinaryRange(left);
- return Status::OK();
- }
-
- Status Visit(const LargeBinaryArray& left) {
- result_ = CompareBinaryRange(left);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const FixedSizeBinaryArray& left) {
- const auto& right = checked_cast<const FixedSizeBinaryArray&>(right_);
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
- int32_t width = left.byte_width();
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
- const uint8_t* left_data = nullptr;
- const uint8_t* right_data = nullptr;
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
- if (left.values()) {
- left_data = left.raw_values();
- }
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
- if (right.values()) {
- right_data = right.raw_values();
- }
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i)) {
- result_ = false;
- return Status::OK();
- }
- if (is_null) continue;
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
- if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) {
- result_ = false;
- return Status::OK();
- }
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- result_ = true;
return Status::OK();
}
- Status Visit(const Decimal128Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
- Status Visit(const Decimal256Array& left) {
- return Visit(checked_cast<const FixedSizeBinaryArray&>(left));
- }
+ Status Visit(const LargeListType& type) { return CompareList(type); }
- Status Visit(const NullArray& left) {
- ARROW_UNUSED(left);
- result_ = true;
- return Status::OK();
- }
-
- template <typename T>
- typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value, Status>::type Visit(
- const T& left) {
- return CompareValues<T>(left);
- }
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- Status Visit(const ListArray& left) {
- result_ = CompareLists(left);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const LargeListArray& left) {
- result_ = CompareLists(left);
- return Status::OK();
- }
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
- Status Visit(const FixedSizeListArray& left) {
- const auto& right = checked_cast<const FixedSizeListArray&>(right_);
- result_ = left.values()->RangeEquals(
- left.value_offset(left_start_idx_), left.value_offset(left_end_idx_),
- right.value_offset(right_start_idx_), right.values());
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
+ return false;
+ }
+ }
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const MapArray& left) {
- result_ = CompareMaps(left);
- return Status::OK();
- }
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- Status Visit(const StructArray& left) {
- result_ = CompareStructs(left);
+ VisitValidRuns([&](int64_t i, int64_t length) {
Review comment:
Ha, I still occasionally forget about that :-)
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#issuecomment-732104009
I've addressed review comments. I'd like to merge this if it is ok.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210378
##########
File path: cpp/src/arrow/ipc/feather_test.cc
##########
@@ -286,10 +286,13 @@ TEST_P(TestFeather, PrimitiveNullRoundTrip) {
std::vector<std::shared_ptr<Array>> expected_fields;
for (int i = 0; i < batch->num_columns(); ++i) {
ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name());
- StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
- batch->column(i)->null_bitmap(),
- batch->column(i)->null_count());
- AssertArraysEqual(str_values, *result->column(i)->chunk(0));
+ ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(utf8(), batch->num_rows()));
+ AssertArraysEqual(*expected, *result->column(i)->chunk(0));
+ // StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+ // batch->column(i)->null_bitmap(),
+ // batch->column(i)->null_count());
+ // AssertArraysEqual(str_values, *result->column(i)->chunk(0),
+ // /*verbose=*/true);
Review comment:
Yes, I was waiting for an answer to the question above.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] xhochy commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
xhochy commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528719927
##########
File path: cpp/src/arrow/ipc/feather_test.cc
##########
@@ -286,10 +286,13 @@ TEST_P(TestFeather, PrimitiveNullRoundTrip) {
std::vector<std::shared_ptr<Array>> expected_fields;
for (int i = 0; i < batch->num_columns(); ++i) {
ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name());
- StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
- batch->column(i)->null_bitmap(),
- batch->column(i)->null_count());
- AssertArraysEqual(str_values, *result->column(i)->chunk(0));
+ ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(utf8(), batch->num_rows()));
+ AssertArraysEqual(*expected, *result->column(i)->chunk(0));
+ // StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+ // batch->column(i)->null_bitmap(),
+ // batch->column(i)->null_count());
+ // AssertArraysEqual(str_values, *result->column(i)->chunk(0),
Review comment:
I can also remember that we had the conversation but I'm unsure about the outcome, too.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210366
##########
File path: cpp/src/arrow/array/diff.h
##########
@@ -59,6 +57,27 @@ ARROW_EXPORT
Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
MemoryPool* pool = default_memory_pool());
+/// \brief Compare two array ranges, returning an edit script which expresses the
+/// difference between them
+///
+/// Same as Diff(), but only the ranges defined by the given offsets and lengths
+/// are compared.
+///
+/// \param[in] base baseline for comparison
+/// \param[in] target an array of identical type to base whose elements differ from base's
+/// \param[in] base_offset the start offset of the range to consider inside `base`
+/// \param[in] base_length the length of the range to consider inside `base`
+/// \param[in] target_offset the start offset of the range to consider inside `target`
+/// \param[in] target_length the length of the range to consider inside `target`
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return an edit script array which can be applied to base to produce target
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> DiffRanges(const Array& base, const Array& target,
Review comment:
Hmm... well, there is a performance argument for avoiding slices, but I'm not sure it matters for diffing :-)
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210648
##########
File path: cpp/src/arrow/util/bitmap.h
##########
@@ -110,8 +110,8 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
///
/// TODO(bkietz) allow for early termination
template <size_t N, typename Visitor,
- typename Word =
- typename internal::call_traits::argument_type<0, Visitor&&>::value_type>
+ typename Word = typename std::decay<
+ internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
Review comment:
A previous version of this work used `VisitWords`, but it ended up exhibiting bad performance (`VisitWords` might be too complex for the compiler). Yes, this change is for the case where the visitor takes its argument as a `const std::array<...>&`. Perhaps there's no advantage in doing so, but it's better if it works anyway.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8703: ARROW-10143: [C++] Rewrite Array(Range)Equals
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8703:
URL: https://github.com/apache/arrow/pull/8703#discussion_r528210778
##########
File path: cpp/src/arrow/compare.cc
##########
@@ -49,700 +51,441 @@
namespace arrow {
using internal::BitmapEquals;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// These helper functions assume we already checked the arrays have equal
-// sizes and null bitmaps.
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-template <typename ArrowType, typename EqualityFunc>
-inline bool BaseFloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- EqualityFunc&& equals) {
- using T = typename ArrowType::c_type;
-
- const T* left_data = left.raw_values();
- const T* right_data = right.raw_values();
-
- if (left.null_count() > 0) {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (left.IsNull(i)) continue;
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- } else {
- for (int64_t i = 0; i < left.length(); ++i) {
- if (!equals(left_data[i], right_data[i])) {
- return false;
- }
- }
- }
- return true;
-}
-
-template <typename ArrowType>
-inline bool FloatingEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [](T x, T y) -> bool {
- return (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right,
- [](T x, T y) -> bool { return x == y; });
- }
-}
-
-template <typename ArrowType>
-inline bool FloatingApproxEquals(const NumericArray<ArrowType>& left,
- const NumericArray<ArrowType>& right,
- const EqualOptions& opts) {
- using T = typename ArrowType::c_type;
- const T epsilon = static_cast<T>(opts.atol());
-
- if (opts.nans_equal()) {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- });
- } else {
- return BaseFloatingEquals<ArrowType>(left, right, [epsilon](T x, T y) -> bool {
- return (fabs(x - y) <= epsilon) || (x == y);
- });
- }
-}
-
-// RangeEqualsVisitor assumes the range sizes are equal
-
-class RangeEqualsVisitor {
+class RangeDataEqualsImpl {
public:
- RangeEqualsVisitor(const Array& right, int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx)
- : right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
- left_end_idx_(left_end_idx),
right_start_idx_(right_start_idx),
+ range_length_(range_length),
result_(false) {}
- template <typename ArrayType>
- inline Status CompareValues(const ArrayType& left) {
- const auto& right = checked_cast<const ArrayType&>(right_);
-
- for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
- ++i, ++o_i) {
- const bool is_null = left.IsNull(i);
- if (is_null != right.IsNull(o_i) ||
- (!is_null && left.Value(i) != right.Value(o_i))) {
- result_ = false;
- return Status::OK();
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
Review comment:
It's ok to compute the null count, IMHO; the computed value is often re-used for other tasks.
The reason why this heuristic only works for non-ranged arrays is that you could have two whole arrays with different null counts, but the compared ranges would still be equal.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org