You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/08/13 17:29:57 UTC
[arrow] branch master updated: ARROW-517: [C++] array comparison,
uses D**2 space Myers
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new cba9c7b ARROW-517: [C++] array comparison, uses D**2 space Myers
cba9c7b is described below
commit cba9c7b3945f2207d74d33ad407b35e6b362d166
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Tue Aug 13 19:29:40 2019 +0200
ARROW-517: [C++] array comparison, uses D**2 space Myers
Adds `arrow::Diff`, which compares two arrays returning an edit script which expresses the difference between them.
The edit script is represented as an array of `struct<insert: bool, run_length: uint64_t>`.
Each element of the "insert" column determines whether an element was inserted into (true) or deleted from (false) base. Each insertion or deletion is followed by a run of elements which are unchanged from base to target; the length of this run is stored in the "run_length" column. (The first element of the "insert" column is always false; the first element of the "run_length" column is the number of leading elements unchanged from base to target.)
For example for base "hlloo" and target "hello", the edit script would be
```js
[
{"insert": true, "run_length": 1}, // leading run of length 1 ("h")
{"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
{"insert": false, "run_length": 0} // delete("o") then an empty run
]
```
Closes #4782 from bkietz/517-Verbose-ArrayEquals and squashes the following commits:
ce358b8b3 <Benjamin Kietzman> fix nits
f09478454 <Benjamin Kietzman> revert expanded filter_probability set
493284062 <Benjamin Kietzman> in extreme cases, implicit target position is not valid
3837ea36e <Benjamin Kietzman> move string utilities from a header into a separate source file
1cae9ba4e <Benjamin Kietzman> remove DiffVisitor, simplify visitation of edit scripts
d56eba425 <Benjamin Kietzman> lint fixes
3f1b9103b <Benjamin Kietzman> clarify (hopefully) storage layout and algorithm flow
68da6be7b <Benjamin Kietzman> add comments clarifying diff implementation
74f657841 <Benjamin Kietzman> add date/time/timestamp/interval formatting
8504ffc8e <Benjamin Kietzman> rename to Storage{Begin, End}
5dcf01d31 <Benjamin Kietzman> add test which prepends some elements
fdba4ba4c <Benjamin Kietzman> use int64_t for lengths
8210a778b <Benjamin Kietzman> trim random tests to run faster, add a random string diff test
4543b19c8 <Benjamin Kietzman> replace Formatter with std::function
c75031174 <Benjamin Kietzman> remove redundant list view range
aa41437eb <Benjamin Kietzman> verify that 8 bit integers are formatted as numbers rather than chars
9df259a6e <Benjamin Kietzman> make diff tests more readable
90ef3c87b <Benjamin Kietzman> move DiffTest.Errors
fcb8db7ba <Benjamin Kietzman> add comment: null EqualOptions::diff_sink
e72a402cd <Benjamin Kietzman> use Result more consistently
29c90ac77 <Benjamin Kietzman> add test with escaped chars
ab3272149 <Benjamin Kietzman> pass std::ostream by pointer rather than reference
2f216868d <Benjamin Kietzman> merge conflict with new StructArray::Make
490633aa0 <Benjamin Kietzman> use iosfwd instead of iostream
f78c3b7de <Benjamin Kietzman> add support for LargeListArray
7cf6e847b <Benjamin Kietzman> lint: explicit constructors
4b55c412b <Benjamin Kietzman> add randomized struct diff test
bd29b553b <Benjamin Kietzman> added a test for formatting diffs of map arrays
bebc9a4d5 <Benjamin Kietzman> add support for formatting unions
efa35fd6b <Benjamin Kietzman> refine list formatting, add struct formatting
88a79f8a1 <Benjamin Kietzman> rename DiffImpl
354af4f6f <Benjamin Kietzman> refactor (|chunked) array asserts to show diffs
46da4eb75 <Benjamin Kietzman> add support for diff(ListArrays)
ea1693e5c <Benjamin Kietzman> add support for diff(DictionaryArrays)
e1527910d <Benjamin Kietzman> add support for diff(ExtensionArray)
1036cadc7 <Benjamin Kietzman> add non-erroring bailouts for unsupported array types in Array::Equals
6f97ba2f5 <Benjamin Kietzman> add support for diff(NullArrays)
8767d1f76 <Benjamin Kietzman> force integer formatting for /(u|)int8_t/
d9f71b23c <Benjamin Kietzman> add support for diffing arrays with nulls
0b5e7c609 <Benjamin Kietzman> add test for Array::Equals with diff
4eb4d1cbb <Benjamin Kietzman> refine & bugfix unified diff formatter
2f27816a5 <Benjamin Kietzman> Add unified format printer and randomized tests
70261c0c2 <Benjamin Kietzman> add DiffVisitor interface
58918bc5d <Benjamin Kietzman> first draft of array comparison, uses D**2 space Myers
Authored-by: Benjamin Kietzman <be...@gmail.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/arrow/CMakeLists.txt | 2 +
cpp/src/arrow/array.cc | 4 +-
cpp/src/arrow/array.h | 5 +-
cpp/src/arrow/array/CMakeLists.txt | 1 +
cpp/src/arrow/array/diff-test.cc | 650 +++++++++++++++++++++++
cpp/src/arrow/array/diff.cc | 819 +++++++++++++++++++++++++++++
cpp/src/arrow/array/diff.h | 75 +++
cpp/src/arrow/compare.cc | 52 +-
cpp/src/arrow/compare.h | 14 +
cpp/src/arrow/testing/gtest_util.cc | 18 +-
cpp/src/arrow/type.h | 1 +
cpp/src/arrow/util/lazy.h | 24 +-
cpp/src/arrow/util/{string.h => string.cc} | 46 +-
cpp/src/arrow/util/string.h | 45 +-
14 files changed, 1684 insertions(+), 72 deletions(-)
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 7d385e0..dc1cec2 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -91,6 +91,7 @@ set(ARROW_SRCS
array/builder_primitive.cc
array/builder_union.cc
array/concatenate.cc
+ array/diff.cc
buffer.cc
compare.cc
extension_type.cc
@@ -136,6 +137,7 @@ set(ARROW_SRCS
util/logging.cc
util/key_value_metadata.cc
util/memory.cc
+ util/string.cc
util/string_builder.cc
util/task-group.cc
util/thread-pool.cc
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 847ed11..3c87280 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -558,7 +558,7 @@ StructArray::StructArray(const std::shared_ptr<DataType>& type, int64_t length,
boxed_fields_.resize(children.size());
}
-Result<std::shared_ptr<Array>> StructArray::Make(
+Result<std::shared_ptr<StructArray>> StructArray::Make(
const std::vector<std::shared_ptr<Array>>& children,
const std::vector<std::shared_ptr<Field>>& fields,
std::shared_ptr<Buffer> null_bitmap, int64_t null_count, int64_t offset) {
@@ -582,7 +582,7 @@ Result<std::shared_ptr<Array>> StructArray::Make(
null_bitmap, null_count, offset);
}
-Result<std::shared_ptr<Array>> StructArray::Make(
+Result<std::shared_ptr<StructArray>> StructArray::Make(
const std::vector<std::shared_ptr<Array>>& children,
const std::vector<std::string>& field_names, std::shared_ptr<Buffer> null_bitmap,
int64_t null_count, int64_t offset) {
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 1b3e47f..fbbe369 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -639,6 +639,7 @@ class ARROW_EXPORT MapArray : public ListArray {
class ARROW_EXPORT FixedSizeListArray : public Array {
public:
using TypeClass = FixedSizeListType;
+ using offset_type = TypeClass::offset_type;
explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);
@@ -912,7 +913,7 @@ class ARROW_EXPORT StructArray : public Array {
///
/// The length and data type are automatically inferred from the arguments.
/// There should be at least one child array.
- static Result<std::shared_ptr<Array>> Make(
+ static Result<std::shared_ptr<StructArray>> Make(
const std::vector<std::shared_ptr<Array>>& children,
const std::vector<std::string>& field_names,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
@@ -923,7 +924,7 @@ class ARROW_EXPORT StructArray : public Array {
/// The length is automatically inferred from the arguments.
/// There should be at least one child array. This method does not
/// check that field types and child array types are consistent.
- static Result<std::shared_ptr<Array>> Make(
+ static Result<std::shared_ptr<StructArray>> Make(
const std::vector<std::shared_ptr<Array>>& children,
const std::vector<std::shared_ptr<Field>>& fields,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
diff --git a/cpp/src/arrow/array/CMakeLists.txt b/cpp/src/arrow/array/CMakeLists.txt
index 88bc4ea..70d0661 100644
--- a/cpp/src/arrow/array/CMakeLists.txt
+++ b/cpp/src/arrow/array/CMakeLists.txt
@@ -16,6 +16,7 @@
# under the License.
add_arrow_test(concatenate-test)
+add_arrow_test(diff-test)
# Headers: top level
arrow_install_all_headers("arrow/array")
diff --git a/cpp/src/arrow/array/diff-test.cc b/cpp/src/arrow/array/diff-test.cc
new file mode 100644
index 0000000..1837de5
--- /dev/null
+++ b/cpp/src/arrow/array/diff-test.cc
@@ -0,0 +1,650 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/array.h"
+#include "arrow/array/diff.h"
+#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/compute/context.h"
+#include "arrow/compute/kernels/filter.h"
+#include "arrow/status.h"
+#include "arrow/testing/gtest_common.h"
+#include "arrow/testing/random.h"
+#include "arrow/testing/util.h"
+#include "arrow/type.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+constexpr random::SeedType kSeed = 0xdeadbeef;
+static const auto edits_type =
+ struct_({field("insert", boolean()), field("run_length", int64())});
+
+Status ValidateEditScript(const Array& edits, const Array& base, const Array& target) {
+ // beginning (in base) of the run before the current hunk
+ int64_t base_run_begin = 0;
+ return VisitEditScript(edits, [&](int64_t delete_begin, int64_t delete_end,
+ int64_t insert_begin, int64_t insert_end) {
+ auto target_run_begin = insert_begin - (delete_begin - base_run_begin);
+ if (!base.RangeEquals(base_run_begin, delete_begin, target_run_begin, target)) {
+ return Status::Invalid("base and target were unequal in a run");
+ }
+
+ base_run_begin = delete_end;
+ for (int64_t i = insert_begin; i < insert_end; ++i) {
+ for (int64_t d = delete_begin; d < delete_end; ++d) {
+ if (target.RangeEquals(i, i + 1, d, base)) {
+ return Status::Invalid("a deleted element was simultaneously inserted");
+ }
+ }
+ }
+
+ return Status::OK();
+ });
+}
+
+class DiffTest : public ::testing::Test {
+ protected:
+ DiffTest() : rng_(kSeed) {}
+
+ void DoDiff() {
+ auto edits = Diff(*base_, *target_, default_memory_pool());
+ ASSERT_OK(edits.status());
+ edits_ = edits.ValueOrDie();
+ ASSERT_OK(ValidateArray(*edits_));
+ ASSERT_TRUE(edits_->type()->Equals(edits_type));
+ insert_ = checked_pointer_cast<BooleanArray>(edits_->field(0));
+ run_lengths_ = checked_pointer_cast<Int64Array>(edits_->field(1));
+ }
+
+ void DoDiffAndFormat(std::stringstream* out) {
+ DoDiff();
+ auto formatter = MakeUnifiedDiffFormatter(*base_->type(), out);
+ ASSERT_OK(formatter.status());
+ ASSERT_OK(formatter.ValueOrDie()(*edits_, *base_, *target_));
+ }
+
+ // validate diff and assert that it formats as expected, both directly
+ // and through Array::Equals
+ void AssertDiffAndFormat(const std::string& formatted_expected) {
+ std::stringstream formatted;
+
+ DoDiffAndFormat(&formatted);
+ ASSERT_EQ(formatted.str(), formatted_expected) << "formatted diff incorrectly";
+ formatted.str("");
+
+ ASSERT_EQ(edits_->length() == 1,
+ base_->Equals(*target_, EqualOptions().diff_sink(&formatted)));
+ ASSERT_EQ(formatted.str(), formatted_expected)
+ << "Array::Equals formatted diff incorrectly";
+ }
+
+ void AssertInsertIs(const std::string& insert_json) {
+ ASSERT_ARRAYS_EQUAL(*ArrayFromJSON(boolean(), insert_json), *insert_);
+ }
+
+ void AssertRunLengthIs(const std::string& run_lengths_json) {
+ ASSERT_ARRAYS_EQUAL(*ArrayFromJSON(int64(), run_lengths_json), *run_lengths_);
+ }
+
+ random::RandomArrayGenerator rng_;
+ std::shared_ptr<StructArray> edits_;
+ std::shared_ptr<Array> base_, target_;
+ std::shared_ptr<BooleanArray> insert_;
+ std::shared_ptr<Int64Array> run_lengths_;
+};
+
+TEST_F(DiffTest, Trivial) {
+ base_ = ArrayFromJSON(int32(), "[]");
+ target_ = ArrayFromJSON(int32(), "[]");
+ DoDiff();
+ AssertInsertIs("[false]");
+ AssertRunLengthIs("[0]");
+
+ base_ = ArrayFromJSON(null(), "[null, null]");
+ target_ = ArrayFromJSON(null(), "[null, null, null, null]");
+ DoDiff();
+ AssertInsertIs("[false, true, true]");
+ AssertRunLengthIs("[2, 0, 0]");
+
+ base_ = ArrayFromJSON(int32(), "[1, 2, 3]");
+ target_ = ArrayFromJSON(int32(), "[1, 2, 3]");
+ DoDiff();
+ AssertInsertIs("[false]");
+ AssertRunLengthIs("[3]");
+}
+
+TEST_F(DiffTest, Errors) {
+ std::stringstream formatted;
+
+ base_ = ArrayFromJSON(int32(), "[]");
+ target_ = ArrayFromJSON(utf8(), "[]");
+ ASSERT_RAISES(TypeError, Diff(*base_, *target_, default_memory_pool()));
+
+ ASSERT_FALSE(base_->Equals(*target_, EqualOptions().diff_sink(&formatted)));
+ ASSERT_EQ(formatted.str(), R"(# Array types differed: int32 vs string)");
+}
+
+template <typename ArrowType>
+class DiffTestWithNumeric : public DiffTest {
+ protected:
+ std::shared_ptr<DataType> type_singleton() {
+ return TypeTraits<ArrowType>::type_singleton();
+ }
+};
+
+TYPED_TEST_CASE(DiffTestWithNumeric, NumericArrowTypes);
+
+TYPED_TEST(DiffTestWithNumeric, Basics) {
+ // insert one
+ this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, null, 5]");
+ this->target_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+ this->DoDiff();
+ this->AssertInsertIs("[false, true]");
+ this->AssertRunLengthIs("[2, 2]");
+
+ // delete one
+ this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+ this->target_ = ArrayFromJSON(this->type_singleton(), "[1, 2, null, 5]");
+ this->DoDiff();
+ this->AssertInsertIs("[false, false]");
+ this->AssertRunLengthIs("[2, 2]");
+
+ // change one
+ this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+ this->target_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 23, null, 5]");
+ this->DoDiff();
+ this->AssertInsertIs("[false, false, true]");
+ this->AssertRunLengthIs("[2, 0, 2]");
+
+ // null out one
+ this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+ this->target_ = ArrayFromJSON(this->type_singleton(), "[1, 2, null, null, 5]");
+ this->DoDiff();
+ this->AssertInsertIs("[false, false, true]");
+ this->AssertRunLengthIs("[2, 1, 1]");
+
+ // append some
+ this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+ this->target_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5, 6, 7, 8, 9]");
+ this->DoDiff();
+ this->AssertInsertIs("[false, true, true, true, true]");
+ this->AssertRunLengthIs("[5, 0, 0, 0, 0]");
+
+ // prepend some
+ this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+ this->target_ = ArrayFromJSON(this->type_singleton(), "[6, 4, 2, 0, 1, 2, 3, null, 5]");
+ this->DoDiff();
+ this->AssertInsertIs("[false, true, true, true, true]");
+ this->AssertRunLengthIs("[0, 0, 0, 0, 5]");
+}
+
+TEST_F(DiffTest, CompareRandomInt64) {
+ compute::FunctionContext ctx;
+ for (auto null_probability : {0.0, 0.25}) {
+ auto values = this->rng_.Int64(1 << 10, 0, 127, null_probability);
+ for (const double filter_probability : {0.99, 0.75, 0.5}) {
+ auto filter_1 = this->rng_.Boolean(values->length(), filter_probability, 0.0);
+ auto filter_2 = this->rng_.Boolean(values->length(), filter_probability, 0.0);
+
+ ASSERT_OK(compute::Filter(&ctx, *values, *filter_1, &this->base_));
+ ASSERT_OK(compute::Filter(&ctx, *values, *filter_2, &this->target_));
+
+ std::stringstream formatted;
+ this->DoDiffAndFormat(&formatted);
+ auto st = ValidateEditScript(*this->edits_, *this->base_, *this->target_);
+ if (!st.ok()) {
+ ASSERT_OK(Status(st.code(), st.message() + "\n" + formatted.str()));
+ }
+ }
+ }
+}
+
+TEST_F(DiffTest, CompareRandomStrings) {
+ compute::FunctionContext ctx;
+ for (auto null_probability : {0.0, 0.25}) {
+ auto values = this->rng_.StringWithRepeats(1 << 10, 1 << 8, 0, 32, null_probability);
+ for (const double filter_probability : {0.99, 0.75, 0.5}) {
+ auto filter_1 = this->rng_.Boolean(values->length(), filter_probability, 0.0);
+ auto filter_2 = this->rng_.Boolean(values->length(), filter_probability, 0.0);
+
+ ASSERT_OK(compute::Filter(&ctx, *values, *filter_1, &this->base_));
+ ASSERT_OK(compute::Filter(&ctx, *values, *filter_2, &this->target_));
+
+ std::stringstream formatted;
+ this->DoDiffAndFormat(&formatted);
+ auto st = ValidateEditScript(*this->edits_, *this->base_, *this->target_);
+ if (!st.ok()) {
+ ASSERT_OK(Status(st.code(), st.message() + "\n" + formatted.str()));
+ }
+ }
+ }
+}
+
+TEST_F(DiffTest, BasicsWithStrings) {
+ // insert one
+ base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+ target_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+ DoDiff();
+ AssertInsertIs("[false, true]");
+ AssertRunLengthIs("[1, 2]");
+
+ // delete one
+ base_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+ target_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+ DoDiff();
+ AssertInsertIs("[false, false]");
+ AssertRunLengthIs("[1, 2]");
+
+ // change one
+ base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+ target_ = ArrayFromJSON(utf8(), R"(["gimme", "a", "break"])");
+ DoDiff();
+ AssertInsertIs("[false, false, true]");
+ AssertRunLengthIs("[0, 0, 2]");
+
+ // null out one
+ base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+ target_ = ArrayFromJSON(utf8(), R"(["give", "a", null])");
+ DoDiff();
+ AssertInsertIs("[false, false, true]");
+ AssertRunLengthIs("[2, 0, 0]");
+}
+
+TEST_F(DiffTest, BasicsWithLists) {
+ // insert one
+ base_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], [13]])");
+ target_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [5, 9], [], [13]])");
+ DoDiff();
+ AssertInsertIs("[false, true]");
+ AssertRunLengthIs("[1, 2]");
+
+ // delete one
+ base_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [5, 9], [], [13]])");
+ target_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], [13]])");
+ DoDiff();
+ AssertInsertIs("[false, false]");
+ AssertRunLengthIs("[1, 2]");
+
+ // change one
+ base_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], [13]])");
+ target_ = ArrayFromJSON(list(int32()), R"([[3, 3, 3], [], [13]])");
+ DoDiff();
+ AssertInsertIs("[false, false, true]");
+ AssertRunLengthIs("[0, 0, 2]");
+
+ // null out one
+ base_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], [13]])");
+ target_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], null])");
+ DoDiff();
+ AssertInsertIs("[false, false, true]");
+ AssertRunLengthIs("[2, 0, 0]");
+}
+
+TEST_F(DiffTest, BasicsWithStructs) {
+ auto type = struct_({field("foo", utf8()), field("bar", int32())});
+
+ // insert one
+ base_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, {"bar": 13}])");
+ target_ =
+ ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {"foo": "?"}, {}, {"bar": 13}])");
+ DoDiff();
+ AssertInsertIs("[false, true]");
+ AssertRunLengthIs("[1, 2]");
+
+ // delete one
+ base_ =
+ ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {"foo": "?"}, {}, {"bar": 13}])");
+ target_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, {"bar": 13}])");
+ DoDiff();
+ AssertInsertIs("[false, false]");
+ AssertRunLengthIs("[1, 2]");
+
+ // change one
+ base_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, {"bar": 13}])");
+ target_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 2}, {}, {"bar": 13}])");
+ DoDiff();
+ AssertInsertIs("[false, false, true]");
+ AssertRunLengthIs("[0, 0, 2]");
+
+ // null out one
+ base_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, {"bar": 13}])");
+ target_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, null])");
+ DoDiff();
+ AssertInsertIs("[false, false, true]");
+ AssertRunLengthIs("[2, 0, 0]");
+}
+
+TEST_F(DiffTest, BasicsWithUnions) {
+ auto type = union_({field("foo", utf8()), field("bar", int32())}, {2, 5});
+
+ // insert one
+ base_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])");
+ target_ = ArrayFromJSON(type, R"([[2, "!"], [2, "?"], [5, 3], [5, 13]])");
+ DoDiff();
+ AssertInsertIs("[false, true]");
+ AssertRunLengthIs("[1, 2]");
+
+ // delete one
+ base_ = ArrayFromJSON(type, R"([[2, "!"], [2, "?"], [5, 3], [5, 13]])");
+ target_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])");
+ DoDiff();
+ AssertInsertIs("[false, false]");
+ AssertRunLengthIs("[1, 2]");
+
+ // change one
+ base_ = ArrayFromJSON(type, R"([[5, 3], [2, "!"], [5, 13]])");
+ target_ = ArrayFromJSON(type, R"([[2, "3"], [2, "!"], [5, 13]])");
+ DoDiff();
+ AssertInsertIs("[false, false, true]");
+ AssertRunLengthIs("[0, 0, 2]");
+
+ // null out one
+ base_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])");
+ target_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], null])");
+ DoDiff();
+ AssertInsertIs("[false, false, true]");
+ AssertRunLengthIs("[2, 0, 0]");
+}
+
+TEST_F(DiffTest, UnifiedDiffFormatter) {
+ // no changes
+ base_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+ target_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+ AssertDiffAndFormat(R"()");
+
+ // insert one
+ base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+ target_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+ AssertDiffAndFormat(R"(
+@@ -1, +1 @@
++"me"
+)");
+
+ // delete one
+ base_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+ target_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+ AssertDiffAndFormat(R"(
+@@ -1, +1 @@
+-"me"
+)");
+
+ // change one
+ base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+ target_ = ArrayFromJSON(utf8(), R"(["gimme", "a", "break"])");
+ AssertDiffAndFormat(R"(
+@@ -0, +0 @@
+-"give"
++"gimme"
+)");
+
+ // null out one
+ base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+ target_ = ArrayFromJSON(utf8(), R"(["give", "a", null])");
+ AssertDiffAndFormat(R"(
+@@ -2, +2 @@
+-"break"
++null
+)");
+
+ // strings with escaped chars
+ base_ = ArrayFromJSON(utf8(), R"(["newline:\n", "quote:'", "backslash:\\"])");
+ target_ =
+ ArrayFromJSON(utf8(), R"(["newline:\n", "tab:\t", "quote:\"", "backslash:\\"])");
+ AssertDiffAndFormat(R"(
+@@ -1, +1 @@
+-"quote:'"
++"tab:\t"
++"quote:\""
+)");
+
+ // date32
+ base_ = ArrayFromJSON(date32(), R"([0, 1, 2, 31, 4])");
+ target_ = ArrayFromJSON(date32(), R"([0, 1, 31, 2, 4])");
+ AssertDiffAndFormat(R"(
+@@ -2, +2 @@
+-1970-01-03
+@@ -4, +3 @@
++1970-01-03
+)");
+
+ // date64
+ constexpr int64_t ms_per_day = 24 * 60 * 60 * 1000;
+ ArrayFromVector<Date64Type>(
+ {0 * ms_per_day, 1 * ms_per_day, 2 * ms_per_day, 31 * ms_per_day, 4 * ms_per_day},
+ &base_);
+ ArrayFromVector<Date64Type>(
+ {0 * ms_per_day, 1 * ms_per_day, 31 * ms_per_day, 2 * ms_per_day, 4 * ms_per_day},
+ &target_);
+ AssertDiffAndFormat(R"(
+@@ -2, +2 @@
+-1970-01-03
+@@ -4, +3 @@
++1970-01-03
+)");
+
+ // timestamp
+ auto x = 678 + 1000000 * (5 + 60 * (4 + 60 * (3 + 24 * int64_t(1))));
+ ArrayFromVector<TimestampType>(timestamp(TimeUnit::MICRO), {0, 1, x, 2, 4}, &base_);
+ ArrayFromVector<TimestampType>(timestamp(TimeUnit::MICRO), {0, 1, 2, x, 4}, &target_);
+ AssertDiffAndFormat(R"(
+@@ -2, +2 @@
+-1970-01-02 03:04:05.000678
+@@ -4, +3 @@
++1970-01-02 03:04:05.000678
+)");
+
+ // lists
+ base_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], [13], []])");
+ target_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [5, 9], [], [13]])");
+ AssertDiffAndFormat(R"(
+@@ -1, +1 @@
++[5, 9]
+@@ -3, +4 @@
+-[]
+)");
+
+ // maps
+ base_ = ArrayFromJSON(map(utf8(), int32()), R"([
+ [["foo", 2], ["bar", 3], ["baz", 1]],
+ [],
+ [["quux", 13]],
+ []
+ ])");
+ target_ = ArrayFromJSON(map(utf8(), int32()), R"([
+ [["foo", 2], ["bar", 3], ["baz", 1]],
+ [["ytho", 11]],
+ [],
+ [["quux", 13]]
+ ])");
+ AssertDiffAndFormat(R"(
+@@ -1, +1 @@
++[{key: "ytho", value: 11}]
+@@ -3, +4 @@
+-[]
+)");
+
+ // structs
+ auto type = struct_({field("foo", utf8()), field("bar", int32())});
+ base_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, {"bar": 13}])");
+ target_ = ArrayFromJSON(type, R"([{"foo": null, "bar": 2}, {}, {"bar": 13}])");
+ AssertDiffAndFormat(R"(
+@@ -0, +0 @@
+-{foo: "!", bar: 3}
++{bar: 2}
+)");
+
+ // unions
+ for (auto mode : {UnionMode::SPARSE, UnionMode::DENSE}) {
+ type = union_({field("foo", utf8()), field("bar", int32())}, {2, 5}, mode);
+ base_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])");
+ target_ = ArrayFromJSON(type, R"([[2, "!"], [2, "3"], [5, 13]])");
+ AssertDiffAndFormat(R"(
+@@ -1, +1 @@
+-{5: 3}
++{2: "3"}
+)");
+ }
+
+ for (auto type : {int8(), uint8(), // verify that these are printed as numbers rather
+ // than their ascii characters
+ int16(), uint16()}) {
+ // small difference
+ base_ = ArrayFromJSON(type, "[0, 1, 2, 3, 5, 8, 11, 13, 17]");
+ target_ = ArrayFromJSON(type, "[2, 3, 5, 7, 11, 13, 17, 19]");
+ AssertDiffAndFormat(R"(
+@@ -0, +0 @@
+-0
+-1
+@@ -5, +3 @@
+-8
++7
+@@ -9, +7 @@
++19
+)");
+
+ // large difference
+ base_ = ArrayFromJSON(type, "[57, 10, 22, 126, 42]");
+ target_ = ArrayFromJSON(type, "[58, 57, 75, 93, 53, 8, 22, 42, 79, 11]");
+ AssertDiffAndFormat(R"(
+@@ -0, +0 @@
++58
+@@ -1, +2 @@
+-10
++75
++93
++53
++8
+@@ -3, +7 @@
+-126
+@@ -5, +8 @@
++79
++11
+)");
+ }
+}
+
+TEST_F(DiffTest, DictionaryDiffFormatter) {
+ std::stringstream formatted;
+
+ // differing indices
+ auto base_dict = ArrayFromJSON(utf8(), R"(["a", "b", "c"])");
+ auto base_indices = ArrayFromJSON(int8(), "[0, 1, 2, 2, 0, 1]");
+ ASSERT_OK(
+ DictionaryArray::FromArrays(dictionary(base_indices->type(), base_dict->type()),
+ base_indices, base_dict, &base_));
+
+ auto target_dict = base_dict;
+ auto target_indices = ArrayFromJSON(int8(), "[0, 1, 2, 2, 1, 1]");
+ ASSERT_OK(
+ DictionaryArray::FromArrays(dictionary(target_indices->type(), target_dict->type()),
+ target_indices, target_dict, &target_));
+
+ base_->Equals(*target_, EqualOptions().diff_sink(&formatted));
+ ASSERT_EQ(formatted.str(), R"(# Dictionary arrays differed
+## dictionary diff
+## indices diff
+@@ -4, +4 @@
+-0
+@@ -6, +5 @@
++1
+)");
+
+ // differing dictionaries
+ target_dict = ArrayFromJSON(utf8(), R"(["b", "c", "a"])");
+ target_indices = base_indices;
+ ASSERT_OK(
+ DictionaryArray::FromArrays(dictionary(target_indices->type(), target_dict->type()),
+ target_indices, target_dict, &target_));
+
+ formatted.str("");
+ base_->Equals(*target_, EqualOptions().diff_sink(&formatted));
+ ASSERT_EQ(formatted.str(), R"(# Dictionary arrays differed
+## dictionary diff
+@@ -0, +0 @@
+-"a"
+@@ -3, +2 @@
++"a"
+## indices diff
+)");
+}
+
+void MakeSameLength(std::shared_ptr<Array>* a, std::shared_ptr<Array>* b) {
+ auto length = std::min((*a)->length(), (*b)->length());
+ *a = (*a)->Slice(0, length);
+ *b = (*b)->Slice(0, length);
+}
+
+TEST_F(DiffTest, CompareRandomStruct) {
+ compute::FunctionContext ctx;
+ for (auto null_probability : {0.0, 0.25}) {
+ constexpr auto length = 1 << 10;
+ auto int32_values = this->rng_.Int32(length, 0, 127, null_probability);
+ auto utf8_values = this->rng_.String(length, 0, 16, null_probability);
+ for (const double filter_probability : {0.9999, 0.75}) {
+ std::shared_ptr<Array> int32_base, int32_target, utf8_base, utf8_target;
+ ASSERT_OK(compute::Filter(&ctx, *int32_values,
+ *this->rng_.Boolean(length, filter_probability, 0.0),
+ &int32_base));
+ ASSERT_OK(compute::Filter(&ctx, *utf8_values,
+ *this->rng_.Boolean(length, filter_probability, 0.0),
+ &utf8_base));
+ MakeSameLength(&int32_base, &utf8_base);
+
+ ASSERT_OK(compute::Filter(&ctx, *int32_values,
+ *this->rng_.Boolean(length, filter_probability, 0.0),
+ &int32_target));
+ ASSERT_OK(compute::Filter(&ctx, *utf8_values,
+ *this->rng_.Boolean(length, filter_probability, 0.0),
+ &utf8_target));
+ MakeSameLength(&int32_target, &utf8_target);
+
+ auto type = struct_({field("i", int32()), field("s", utf8())});
+ auto base_res = StructArray::Make({int32_base, utf8_base}, type->children());
+ ASSERT_OK(base_res.status());
+ base_ = base_res.ValueOrDie();
+ auto target_res = StructArray::Make({int32_target, utf8_target}, type->children());
+ ASSERT_OK(target_res.status());
+ target_ = target_res.ValueOrDie();
+
+ std::stringstream formatted;
+ this->DoDiffAndFormat(&formatted);
+ auto st = ValidateEditScript(*this->edits_, *this->base_, *this->target_);
+ if (!st.ok()) {
+ ASSERT_OK(Status(st.code(), st.message() + "\n" + formatted.str()));
+ }
+ }
+ }
+}
+
+} // namespace arrow
diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc
new file mode 100644
index 0000000..156bd24
--- /dev/null
+++ b/cpp/src/arrow/array/diff.cc
@@ -0,0 +1,819 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/diff.h"
+
+#include <algorithm>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer-builder.h"
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/pretty_print.h"
+#include "arrow/status.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/lazy.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/stl.h"
+#include "arrow/util/string.h"
+#include "arrow/util/variant.h"
+#include "arrow/util/visibility.h"
+#include "arrow/vendored/datetime.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+using internal::MakeLazyRange;
+
+template <typename ArrayType>
+auto GetView(const ArrayType& array, int64_t index) -> decltype(array.GetView(index)) {
+ return array.GetView(index);
+}
+
+struct Slice {
+ const Array* array_;
+ int64_t offset_, length_;
+
+ bool operator==(const Slice& other) const {
+ return length_ == other.length_ &&
+ array_->RangeEquals(offset_, offset_ + length_, other.offset_, *other.array_);
+ }
+ bool operator!=(const Slice& other) const { return !(*this == other); }
+};
+
+template <typename ArrayType, typename T = typename ArrayType::TypeClass,
+ typename =
+ typename std::enable_if<std::is_base_of<BaseListType, T>::value ||
+ std::is_base_of<FixedSizeListType, T>::value>::type>
+static Slice GetView(const ArrayType& array, int64_t index) {
+ return Slice{array.values().get(), array.value_offset(index),
+ array.value_length(index)};
+}
+
+struct UnitSlice {
+ const Array* array_;
+ int64_t offset_;
+
+ bool operator==(const UnitSlice& other) const {
+ return array_->RangeEquals(offset_, offset_ + 1, other.offset_, *other.array_);
+ }
+ bool operator!=(const UnitSlice& other) const { return !(*this == other); }
+};
+
+// FIXME(bkietz) this is inefficient;
+// StructArray's fields can be diffed independently then merged
+static UnitSlice GetView(const StructArray& array, int64_t index) {
+ return UnitSlice{&array, index};
+}
+
+static UnitSlice GetView(const UnionArray& array, int64_t index) {
+ return UnitSlice{&array, index};
+}
+
+struct NullTag {
+ constexpr bool operator==(const NullTag& other) const { return true; }
+ constexpr bool operator!=(const NullTag& other) const { return false; }
+};
+
+template <typename T>
+class NullOr {
+ public:
+ using VariantType = util::variant<NullTag, T>;
+
+ NullOr() : variant_(NullTag{}) {}
+ explicit NullOr(T t) : variant_(std::move(t)) {}
+
+ template <typename ArrayType>
+ NullOr(const ArrayType& array, int64_t index) {
+ if (array.IsNull(index)) {
+ variant_.emplace(NullTag{});
+ } else {
+ variant_.emplace(GetView(array, index));
+ }
+ }
+
+ bool operator==(const NullOr& other) const { return variant_ == other.variant_; }
+ bool operator!=(const NullOr& other) const { return variant_ != other.variant_; }
+
+ private:
+ VariantType variant_;
+};
+
+template <typename ArrayType>
+using ViewType = decltype(GetView(std::declval<ArrayType>(), 0));
+
+template <typename ArrayType>
+class ViewGenerator {
+ public:
+ using View = ViewType<ArrayType>;
+
+ explicit ViewGenerator(const Array& array)
+ : array_(checked_cast<const ArrayType&>(array)) {
+ DCHECK_EQ(array.null_count(), 0);
+ }
+
+ View operator()(int64_t index) const { return GetView(array_, index); }
+
+ private:
+ const ArrayType& array_;
+};
+
+template <typename ArrayType>
+internal::LazyRange<ViewGenerator<ArrayType>> MakeViewRange(const Array& array) {
+ using Generator = ViewGenerator<ArrayType>;
+ return internal::LazyRange<Generator>(Generator(array), array.length());
+}
+
+template <typename ArrayType>
+class NullOrViewGenerator {
+ public:
+ using View = ViewType<ArrayType>;
+
+ explicit NullOrViewGenerator(const Array& array)
+ : array_(checked_cast<const ArrayType&>(array)) {}
+
+ NullOr<View> operator()(int64_t index) const {
+ return array_.IsNull(index) ? NullOr<View>() : NullOr<View>(GetView(array_, index));
+ }
+
+ private:
+ const ArrayType& array_;
+};
+
+template <typename ArrayType>
+internal::LazyRange<NullOrViewGenerator<ArrayType>> MakeNullOrViewRange(
+ const Array& array) {
+ using Generator = NullOrViewGenerator<ArrayType>;
+ return internal::LazyRange<Generator>(Generator(array), array.length());
+}
+
+/// A generic sequence difference algorithm, based on
+///
+/// E. W. Myers, "An O(ND) difference algorithm and its variations,"
+/// Algorithmica, vol. 1, no. 1-4, pp. 251–266, 1986.
+///
+/// To summarize, an edit script is computed by maintaining the furthest set of EditPoints
+/// which are reachable in a given number of edits D. This is used to compute the furthest
+/// set reachable with D+1 edits, and the process continues inductively until a complete
+/// edit script is discovered.
+///
+/// From each edit point a single deletion and insertion is made then as many shared
+/// elements as possible are skipped, recording only the endpoint of the run. This
+/// representation is minimal in the common case where the sequences differ only slightly,
+/// since most of the elements are shared between base and target and are represented
+/// implicitly.
+template <typename Iterator>
+class QuadraticSpaceMyersDiff {
+ public:
+ // represents an intermediate state in the comparison of two arrays
+ struct EditPoint {
+ Iterator base, target;
+
+ bool operator==(EditPoint other) const {
+ return base == other.base && target == other.target;
+ }
+ };
+
+ // increment the position within base (the element pointed to was deleted)
+ // then extend maximally
+ EditPoint DeleteOne(EditPoint p) const {
+ if (p.base != base_end_) {
+ ++p.base;
+ }
+ return ExtendFrom(p);
+ }
+
+ // increment the position within target (the element pointed to was inserted)
+ // then extend maximally
+ EditPoint InsertOne(EditPoint p) const {
+ if (p.target != target_end_) {
+ ++p.target;
+ }
+ return ExtendFrom(p);
+ }
+
+ // increment the position within base and target (the elements skipped in this way were
+ // present in both sequences)
+ EditPoint ExtendFrom(EditPoint p) const {
+ for (; p.base != base_end_ && p.target != target_end_; ++p.base, ++p.target) {
+ if (*p.base != *p.target) {
+ break;
+ }
+ }
+ return p;
+ }
+
+ QuadraticSpaceMyersDiff(Iterator base_begin, Iterator base_end, Iterator target_begin,
+ Iterator target_end)
+ : base_begin_(base_begin),
+ base_end_(base_end),
+ target_begin_(target_begin),
+ target_end_(target_end),
+ endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}),
+ insert_({true}) {
+ if (std::distance(base_begin_, base_end_) ==
+ std::distance(target_begin_, target_end_) &&
+ endpoint_base_[0] == base_end_) {
+ // trivial case: base == target
+ finish_index_ = 0;
+ }
+ }
+
+ // beginning of a range for storing per-edit state in endpoint_base_ and insert_
+ int64_t StorageOffset(int64_t edit_count) const {
+ return edit_count * (edit_count + 1) / 2;
+ }
+
+ // given edit_count and index, augment endpoint_base_[index] with the corresponding
+ // position in target (which is only implicitly represented in edit_count, index)
+ EditPoint GetEditPoint(int64_t edit_count, int64_t index) const {
+ DCHECK_GE(index, StorageOffset(edit_count));
+ DCHECK_LT(index, StorageOffset(edit_count + 1));
+ auto insertions_minus_deletions =
+ 2 * (index - StorageOffset(edit_count)) - edit_count;
+ auto maximal_base = endpoint_base_[index];
+ auto maximal_target = std::min(
+ target_begin_ + ((maximal_base - base_begin_) + insertions_minus_deletions),
+ target_end_);
+ return {maximal_base, maximal_target};
+ }
+
+ void Next() {
+ ++edit_count_;
+ // base_begin_ is used as a dummy value here since Iterator may not be default
+ // constructible. The newly allocated range is completely overwritten below.
+ endpoint_base_.resize(StorageOffset(edit_count_ + 1), base_begin_);
+ insert_.resize(StorageOffset(edit_count_ + 1), false);
+
+ auto previous_offset = StorageOffset(edit_count_ - 1);
+ auto current_offset = StorageOffset(edit_count_);
+
+ // try deleting from base first
+ for (int64_t i = 0, i_out = 0; i < edit_count_; ++i, ++i_out) {
+ auto previous_endpoint = GetEditPoint(edit_count_ - 1, i + previous_offset);
+ endpoint_base_[i_out + current_offset] = DeleteOne(previous_endpoint).base;
+ }
+
+ // check if inserting from target could do better
+ for (int64_t i = 0, i_out = 1; i < edit_count_; ++i, ++i_out) {
+ // retrieve the previously computed best endpoint for (edit_count_, i_out)
+ // for comparison with the best endpoint achievable with an insertion
+ auto endpoint_after_deletion = GetEditPoint(edit_count_, i_out + current_offset);
+
+ auto previous_endpoint = GetEditPoint(edit_count_ - 1, i + previous_offset);
+ auto endpoint_after_insertion = InsertOne(previous_endpoint);
+
+ if (endpoint_after_insertion.base - endpoint_after_deletion.base >= 0) {
+ // insertion was more efficient; keep it and mark the insertion in insert_
+ insert_[i_out + current_offset] = true;
+ endpoint_base_[i_out + current_offset] = endpoint_after_insertion.base;
+ }
+ }
+
+ // check for completion
+ EditPoint finish = {base_end_, target_end_};
+ for (int64_t i_out = 0; i_out < edit_count_ + 1; ++i_out) {
+ if (GetEditPoint(edit_count_, i_out + current_offset) == finish) {
+ finish_index_ = i_out + current_offset;
+ return;
+ }
+ }
+ }
+
+ bool Done() { return finish_index_ != -1; }
+
+ Result<std::shared_ptr<StructArray>> GetEdits(MemoryPool* pool) {
+ DCHECK(Done());
+
+ int64_t length = edit_count_ + 1;
+ std::shared_ptr<Buffer> insert_buf, run_length_buf;
+ RETURN_NOT_OK(AllocateBitmap(pool, length, &insert_buf));
+ RETURN_NOT_OK(AllocateBuffer(pool, length * sizeof(int64_t), &run_length_buf));
+ auto run_length = reinterpret_cast<int64_t*>(run_length_buf->mutable_data());
+
+ auto index = finish_index_;
+ auto endpoint = GetEditPoint(edit_count_, finish_index_);
+
+ for (int64_t i = edit_count_; i > 0; --i) {
+ bool insert = insert_[index];
+ BitUtil::SetBitTo(insert_buf->mutable_data(), i, insert);
+
+ auto insertions_minus_deletions =
+ (endpoint.base - base_begin_) - (endpoint.target - target_begin_);
+ if (insert) {
+ ++insertions_minus_deletions;
+ } else {
+ --insertions_minus_deletions;
+ }
+ index = (i - 1 - insertions_minus_deletions) / 2 + StorageOffset(i - 1);
+
+ // endpoint of previous edit
+ auto previous = GetEditPoint(i - 1, index);
+ run_length[i] = endpoint.base - previous.base - !insert;
+ DCHECK_GE(run_length[i], 0);
+
+ endpoint = previous;
+ }
+ BitUtil::SetBitTo(insert_buf->mutable_data(), 0, false);
+ run_length[0] = endpoint.base - base_begin_;
+
+ return StructArray::Make({std::make_shared<BooleanArray>(length, insert_buf),
+ std::make_shared<Int64Array>(length, run_length_buf)},
+ {field("insert", boolean()), field("run_length", int64())});
+ }
+
+ private:
+ int64_t finish_index_ = -1;
+ int64_t edit_count_ = 0;
+ Iterator base_begin_, base_end_;
+ Iterator target_begin_, target_end_;
+ // each element of endpoint_base_ is the furthest position in base reachable given an
+ // edit_count and (# insertions) - (# deletions). Each bit of insert_ records whether
+ // the corresponding furthest position was reached via an insertion or a deletion
+ // (followed by a run of shared elements). See StorageOffset for the
+ // layout of these vectors
+ std::vector<Iterator> endpoint_base_;
+ std::vector<bool> insert_;
+};
+
+struct DiffImpl {
+ Status Visit(const NullType&) {
+ bool insert = base_.length() < target_.length();
+ auto run_length = std::min(base_.length(), target_.length());
+ auto edit_count = std::max(base_.length(), target_.length()) - run_length;
+
+ TypedBufferBuilder<bool> insert_builder(pool_);
+ RETURN_NOT_OK(insert_builder.Resize(edit_count + 1));
+ insert_builder.UnsafeAppend(false);
+ TypedBufferBuilder<int64_t> run_length_builder(pool_);
+ RETURN_NOT_OK(run_length_builder.Resize(edit_count + 1));
+ run_length_builder.UnsafeAppend(run_length);
+ if (edit_count > 0) {
+ insert_builder.UnsafeAppend(edit_count, insert);
+ run_length_builder.UnsafeAppend(edit_count, 0);
+ }
+
+ std::shared_ptr<Buffer> insert_buf, run_length_buf;
+ RETURN_NOT_OK(insert_builder.Finish(&insert_buf));
+ RETURN_NOT_OK(run_length_builder.Finish(&run_length_buf));
+
+ ARROW_ASSIGN_OR_RAISE(
+ out_,
+ StructArray::Make({std::make_shared<BooleanArray>(edit_count + 1, insert_buf),
+ std::make_shared<Int64Array>(edit_count + 1, run_length_buf)},
+ {field("insert", boolean()), field("run_length", int64())}));
+ return Status::OK();
+ }
+
+ template <typename T>
+ Status Visit(const T&) {
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+ if (base_.null_count() == 0 && target_.null_count() == 0) {
+ auto base = MakeViewRange<ArrayType>(base_);
+ auto target = MakeViewRange<ArrayType>(target_);
+ ARROW_ASSIGN_OR_RAISE(out_,
+ Diff(base.begin(), base.end(), target.begin(), target.end()));
+ } else {
+ auto base = MakeNullOrViewRange<ArrayType>(base_);
+ auto target = MakeNullOrViewRange<ArrayType>(target_);
+ ARROW_ASSIGN_OR_RAISE(out_,
+ Diff(base.begin(), base.end(), target.begin(), target.end()));
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const ExtensionType&) {
+ auto base = checked_cast<const ExtensionArray&>(base_).storage();
+ auto target = checked_cast<const ExtensionArray&>(target_).storage();
+ ARROW_ASSIGN_OR_RAISE(out_, arrow::Diff(*base, *target, pool_));
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& t) {
+ return Status::NotImplemented("diffing arrays of type ", t);
+ }
+
+ Result<std::shared_ptr<StructArray>> Diff() {
+ RETURN_NOT_OK(VisitTypeInline(*base_.type(), this));
+ return out_;
+ }
+
+ template <typename Iterator>
+ Result<std::shared_ptr<StructArray>> Diff(Iterator base_begin, Iterator base_end,
+ Iterator target_begin, Iterator target_end) {
+ QuadraticSpaceMyersDiff<Iterator> impl(base_begin, base_end, target_begin,
+ target_end);
+ while (!impl.Done()) {
+ impl.Next();
+ }
+ return impl.GetEdits(pool_);
+ }
+
+ const Array& base_;
+ const Array& target_;
+ MemoryPool* pool_;
+ std::shared_ptr<StructArray> out_;
+};
+
+Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
+ MemoryPool* pool) {
+ if (!base.type()->Equals(target.type())) {
+ return Status::TypeError("only taking the diff of like-typed arrays is supported.");
+ }
+
+ return DiffImpl{base, target, pool, nullptr}.Diff();
+}
+
+using Formatter = std::function<void(const Array&, int64_t index, std::ostream*)>;
+
+static Result<Formatter> MakeFormatter(const DataType& type);
+
+class MakeFormatterImpl {
+ public:
+ Result<Formatter> Make(const DataType& type) && {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
+ return std::move(impl_);
+ }
+
+ private:
+ template <typename VISITOR>
+ friend Status VisitTypeInline(const DataType&, VISITOR*);
+
+ // factory implementation
+ Status Visit(const BooleanType&) {
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ *os << (checked_cast<const BooleanArray&>(array).Value(index) ? "true" : "false");
+ };
+ return Status::OK();
+ }
+
+ // format Numerics with std::ostream defaults
+ template <typename T>
+ typename std::enable_if<
+ is_number_type<T>::value && !is_date<T>::value && !is_time<T>::value, Status>::type
+ Visit(const T&) {
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ const auto& numeric = checked_cast<const NumericArray<T>&>(array);
+ if (sizeof(decltype(numeric.Value(index))) == sizeof(char)) {
+ // override std::ostream defaults for /(u|)int8_t/ since they are
+ // formatted as potentially unprintable/tty borking characters
+ *os << static_cast<int16_t>(numeric.Value(index));
+ } else {
+ *os << numeric.Value(index);
+ }
+ };
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_date<T, Status> Visit(const T&) {
+ using unit = typename std::conditional<std::is_same<T, Date32Type>::value,
+ arrow_vendored::date::days,
+ std::chrono::milliseconds>::type;
+
+ static arrow_vendored::date::sys_days epoch{arrow_vendored::date::jan / 1 / 1970};
+
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ unit value(checked_cast<const NumericArray<T>&>(array).Value(index));
+ *os << arrow_vendored::date::format("%F", value + epoch);
+ };
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_time<T, Status> Visit(const T&) {
+ impl_ = MakeTimeFormatter<T, false>("%T");
+ return Status::OK();
+ }
+
+ Status Visit(const TimestampType&) {
+ impl_ = MakeTimeFormatter<TimestampType, true>("%F %T");
+ return Status::OK();
+ }
+
+ Status Visit(const DayTimeIntervalType&) {
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ auto day_millis = checked_cast<const DayTimeIntervalArray&>(array).Value(index);
+ *os << day_millis.days << "d" << day_millis.milliseconds << "ms";
+ };
+ return Status::OK();
+ }
+
+ // format Binary and FixedSizeBinary in hexadecimal
+ template <typename T>
+ enable_if_binary_like<T, Status> Visit(const T&) {
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ *os << HexEncode(
+ checked_cast<const typename TypeTraits<T>::ArrayType&>(array).GetView(index));
+ };
+ return Status::OK();
+ }
+
+ // format Strings with \"\n\r\t\\ escaped
+ Status Visit(const StringType&) {
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ *os << "\"" << Escape(checked_cast<const StringArray&>(array).GetView(index))
+ << "\"";
+ };
+ return Status::OK();
+ }
+
+ // format Decimals with Decimal128Array::FormatValue
+ Status Visit(const Decimal128Type&) {
+ impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+ *os << checked_cast<const Decimal128Array&>(array).FormatValue(index);
+ };
+ return Status::OK();
+ }
+
+ Status Visit(const ListType& t) {
+ struct ListImpl {
+ explicit ListImpl(Formatter f) : values_formatter_(std::move(f)) {}
+
+ void operator()(const Array& array, int64_t index, std::ostream* os) {
+ const auto& list_array = checked_cast<const ListArray&>(array);
+ *os << "[";
+ for (int32_t i = 0; i < list_array.value_length(index); ++i) {
+ if (i != 0) {
+ *os << ", ";
+ }
+ values_formatter_(*list_array.values(), i + list_array.value_offset(index), os);
+ }
+ *os << "]";
+ }
+
+ Formatter values_formatter_;
+ };
+
+ ARROW_ASSIGN_OR_RAISE(auto values_formatter, MakeFormatter(*t.value_type()));
+ impl_ = ListImpl(std::move(values_formatter));
+ return Status::OK();
+ }
+
+ // TODO(bkietz) format maps better
+
+ Status Visit(const StructType& t) {
+ struct StructImpl {
+ explicit StructImpl(std::vector<Formatter> f) : field_formatters_(std::move(f)) {}
+
+ void operator()(const Array& array, int64_t index, std::ostream* os) {
+ const auto& struct_array = checked_cast<const StructArray&>(array);
+ *os << "{";
+ for (int i = 0, printed = 0; i < struct_array.num_fields(); ++i) {
+ if (printed != 0) {
+ *os << ", ";
+ }
+ if (struct_array.field(i)->IsNull(index)) {
+ continue;
+ }
+ ++printed;
+ *os << struct_array.struct_type()->child(i)->name() << ": ";
+ field_formatters_[i](*struct_array.field(i), index, os);
+ }
+ *os << "}";
+ }
+
+ std::vector<Formatter> field_formatters_;
+ };
+
+ std::vector<Formatter> field_formatters(t.num_children());
+ for (int i = 0; i < t.num_children(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(field_formatters[i], MakeFormatter(*t.child(i)->type()));
+ }
+
+ impl_ = StructImpl(std::move(field_formatters));
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& t) {
+ struct UnionImpl {
+ UnionImpl(std::vector<Formatter> f, std::vector<int> c)
+ : field_formatters_(std::move(f)), type_id_to_child_index_(std::move(c)) {}
+
+ void DoFormat(const UnionArray& array, int64_t index, int64_t child_index,
+ std::ostream* os) {
+ auto type_id = array.raw_type_ids()[index];
+ const auto& child = *array.child(type_id_to_child_index_[type_id]);
+
+ *os << "{" << static_cast<int16_t>(type_id) << ": ";
+ if (child.IsNull(child_index)) {
+ *os << "null";
+ } else {
+ field_formatters_[type_id](child, child_index, os);
+ }
+ *os << "}";
+ }
+
+ std::vector<Formatter> field_formatters_;
+ std::vector<int> type_id_to_child_index_;
+ };
+
+ struct SparseImpl : UnionImpl {
+ using UnionImpl::UnionImpl;
+
+ void operator()(const Array& array, int64_t index, std::ostream* os) {
+ const auto& union_array = checked_cast<const UnionArray&>(array);
+ DoFormat(union_array, index, index, os);
+ }
+ };
+
+ struct DenseImpl : UnionImpl {
+ using UnionImpl::UnionImpl;
+
+ void operator()(const Array& array, int64_t index, std::ostream* os) {
+ const auto& union_array = checked_cast<const UnionArray&>(array);
+ DoFormat(union_array, index, union_array.raw_value_offsets()[index], os);
+ }
+ };
+
+ std::vector<Formatter> field_formatters(t.max_type_code() + 1);
+ std::vector<int> type_id_to_child_index(field_formatters.size());
+ for (int i = 0; i < t.num_children(); ++i) {
+ auto type_id = t.type_codes()[i];
+ type_id_to_child_index[type_id] = i;
+ ARROW_ASSIGN_OR_RAISE(field_formatters[type_id],
+ MakeFormatter(*t.child(i)->type()));
+ }
+
+ if (t.mode() == UnionMode::SPARSE) {
+ impl_ = SparseImpl(std::move(field_formatters), std::move(type_id_to_child_index));
+ } else {
+ impl_ = DenseImpl(std::move(field_formatters), std::move(type_id_to_child_index));
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& t) {
+ return Status::NotImplemented("formatting diffs between arrays of type ", t);
+ }
+
+ template <typename T, bool AddEpoch>
+ Formatter MakeTimeFormatter(const std::string& fmt_str) {
+ return [fmt_str](const Array& array, int64_t index, std::ostream* os) {
+ auto fmt = fmt_str.c_str();
+ auto unit = checked_cast<const T&>(*array.type()).unit();
+ auto value = checked_cast<const NumericArray<T>&>(array).Value(index);
+ using arrow_vendored::date::format;
+ using std::chrono::nanoseconds;
+ using std::chrono::microseconds;
+ using std::chrono::milliseconds;
+ using std::chrono::seconds;
+ if (AddEpoch) {
+ static arrow_vendored::date::sys_days epoch{arrow_vendored::date::jan / 1 / 1970};
+
+ switch (unit) {
+ case TimeUnit::NANO:
+ *os << format(fmt, static_cast<nanoseconds>(value) + epoch);
+ break;
+ case TimeUnit::MICRO:
+ *os << format(fmt, static_cast<microseconds>(value) + epoch);
+ break;
+ case TimeUnit::MILLI:
+ *os << format(fmt, static_cast<milliseconds>(value) + epoch);
+ break;
+ case TimeUnit::SECOND:
+ *os << format(fmt, static_cast<seconds>(value) + epoch);
+ break;
+ }
+ return;
+ }
+ switch (unit) {
+ case TimeUnit::NANO:
+ *os << format(fmt, static_cast<nanoseconds>(value));
+ break;
+ case TimeUnit::MICRO:
+ *os << format(fmt, static_cast<microseconds>(value));
+ break;
+ case TimeUnit::MILLI:
+ *os << format(fmt, static_cast<milliseconds>(value));
+ break;
+ case TimeUnit::SECOND:
+ *os << format(fmt, static_cast<seconds>(value));
+ break;
+ }
+ };
+ }
+
+ Formatter impl_;
+};
+
+static Result<Formatter> MakeFormatter(const DataType& type) {
+ return MakeFormatterImpl{}.Make(type);
+}
+
+Status VisitEditScript(
+ const Array& edits,
+ const std::function<Status(int64_t delete_begin, int64_t delete_end,
+ int64_t insert_begin, int64_t insert_end)>& visitor) {
+ static const auto edits_type =
+ struct_({field("insert", boolean()), field("run_length", int64())});
+ DCHECK(edits.type()->Equals(*edits_type));
+ DCHECK_GE(edits.length(), 1);
+
+ auto insert = checked_pointer_cast<BooleanArray>(
+ checked_cast<const StructArray&>(edits).field(0));
+ auto run_lengths =
+ checked_pointer_cast<Int64Array>(checked_cast<const StructArray&>(edits).field(1));
+
+ DCHECK(!insert->Value(0));
+
+ auto length = run_lengths->Value(0);
+ int64_t base_begin, base_end, target_begin, target_end;
+ base_begin = base_end = target_begin = target_end = length;
+ for (int64_t i = 1; i < edits.length(); ++i) {
+ if (insert->Value(i)) {
+ ++target_end;
+ } else {
+ ++base_end;
+ }
+ length = run_lengths->Value(i);
+ if (length != 0) {
+ RETURN_NOT_OK(visitor(base_begin, base_end, target_begin, target_end));
+ base_begin = base_end = base_end + length;
+ target_begin = target_end = target_end + length;
+ }
+ }
+ if (length == 0) {
+ return visitor(base_begin, base_end, target_begin, target_end);
+ }
+ return Status::OK();
+}
+
+class UnifiedDiffFormatter {
+ public:
+ UnifiedDiffFormatter(std::ostream* os, Formatter formatter)
+ : os_(os), formatter_(std::move(formatter)) {}
+
+ Status operator()(int64_t delete_begin, int64_t delete_end, int64_t insert_begin,
+ int64_t insert_end) {
+ *os_ << "@@ -" << delete_begin << ", +" << insert_begin << " @@" << std::endl;
+
+ for (int64_t i = delete_begin; i < delete_end; ++i) {
+ *os_ << "-";
+ if (base_->IsValid(i)) {
+ formatter_(*base_, i, &*os_);
+ } else {
+ *os_ << "null";
+ }
+ *os_ << std::endl;
+ }
+
+ for (int64_t i = insert_begin; i < insert_end; ++i) {
+ *os_ << "+";
+ if (target_->IsValid(i)) {
+ formatter_(*target_, i, &*os_);
+ } else {
+ *os_ << "null";
+ }
+ *os_ << std::endl;
+ }
+
+ return Status::OK();
+ }
+
+ Status operator()(const Array& edits, const Array& base, const Array& target) {
+ if (edits.length() == 1) {
+ return Status::OK();
+ }
+ base_ = &base;
+ target_ = ⌖
+ *os_ << std::endl;
+ return VisitEditScript(edits, *this);
+ }
+
+ private:
+ std::ostream* os_ = nullptr;
+ const Array* base_ = nullptr;
+ const Array* target_ = nullptr;
+ Formatter formatter_;
+};
+
+Result<std::function<Status(const Array& edits, const Array& base, const Array& target)>>
+MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os) {
+ ARROW_ASSIGN_OR_RAISE(auto formatter, MakeFormatter(type));
+ return UnifiedDiffFormatter(os, std::move(formatter));
+}
+
+} // namespace arrow
diff --git a/cpp/src/arrow/array/diff.h b/cpp/src/arrow/array/diff.h
new file mode 100644
index 0000000..a7e654f
--- /dev/null
+++ b/cpp/src/arrow/array/diff.h
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <iosfwd>
+#include <memory>
+
+#include "arrow/array.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+/// \brief Compare two arrays, returning an edit script which expresses the difference
+/// between them
+///
+/// An edit script is an array of struct(insert: bool, run_length: int64_t).
+/// Each element of "insert" determines whether an element was inserted into (true)
+/// or deleted from (false) base. Each insertion or deletion is followed by a run of
+/// elements which are unchanged from base to target; the length of this run is stored
+/// in "run_length". (Note that the edit script begins and ends with a run of shared
+/// elements but both fields of the struct must have the same length. To accomodate this
+/// the first element of "insert" should be ignored.)
+///
+/// For example for base "hlloo" and target "hello", the edit script would be
+/// [
+/// {"insert": false, "run_length": 1}, // leading run of length 1 ("h")
+/// {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
+/// {"insert": false, "run_length": 0} // delete("o") then an empty run
+/// ]
+///
+/// Diffing arrays containing nulls is not currently supported.
+///
+/// \param[in] base baseline for comparison
+/// \param[in] target an array of identical type to base whose elements differ from base's
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return an edit script array which can be applied to base to produce target
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
+ MemoryPool* pool);
+
+/// \brief visitor interface for easy traversal of an edit script
+///
+/// visitor will be called for each hunk of insertions and deletions.
+ARROW_EXPORT Status VisitEditScript(
+ const Array& edits,
+ const std::function<Status(int64_t delete_begin, int64_t delete_end,
+ int64_t insert_begin, int64_t insert_end)>& visitor);
+
+/// \brief return a function which will format an edit script in unified
+/// diff format to os, given base and target arrays of type
+ARROW_EXPORT Result<
+ std::function<Status(const Array& edits, const Array& base, const Array& target)>>
+MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os);
+
+} // namespace arrow
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index ff4c2b5..05a1d1f 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -30,6 +30,7 @@
#include <vector>
#include "arrow/array.h"
+#include "arrow/array/diff.h"
#include "arrow/buffer.h"
#include "arrow/scalar.h"
#include "arrow/sparse_tensor.h"
@@ -929,14 +930,61 @@ class ScalarEqualsVisitor {
bool result_;
};
+Status PrintDiff(const Array& left, const Array& right, std::ostream* os) {
+ if (os == nullptr) {
+ return Status::OK();
+ }
+
+ if (!left.type()->Equals(right.type())) {
+ *os << "# Array types differed: " << *left.type() << " vs " << *right.type();
+ return Status::OK();
+ }
+
+ if (left.type()->id() == Type::DICTIONARY) {
+ *os << "# Dictionary arrays differed" << std::endl;
+
+ const auto& left_dict = checked_cast<const DictionaryArray&>(left);
+ const auto& right_dict = checked_cast<const DictionaryArray&>(right);
+
+ *os << "## dictionary diff";
+ auto pos = os->tellp();
+ RETURN_NOT_OK(PrintDiff(*left_dict.dictionary(), *right_dict.dictionary(), os));
+ if (os->tellp() == pos) {
+ *os << std::endl;
+ }
+
+ *os << "## indices diff";
+ pos = os->tellp();
+ RETURN_NOT_OK(PrintDiff(*left_dict.indices(), *right_dict.indices(), os));
+ if (os->tellp() == pos) {
+ *os << std::endl;
+ }
+ return Status::OK();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto edits, Diff(left, right, default_memory_pool()));
+ ARROW_ASSIGN_OR_RAISE(auto formatter, MakeUnifiedDiffFormatter(*left.type(), os));
+ return formatter(*edits, left, right);
+}
+
} // namespace internal
bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts) {
- return internal::ArrayEqualsImpl<internal::ArrayEqualsVisitor>(left, right, opts);
+ bool are_equal =
+ internal::ArrayEqualsImpl<internal::ArrayEqualsVisitor>(left, right, opts);
+ if (!are_equal) {
+ DCHECK_OK(internal::PrintDiff(left, right, opts.diff_sink()));
+ }
+ return are_equal;
}
bool ArrayApproxEquals(const Array& left, const Array& right, const EqualOptions& opts) {
- return internal::ArrayEqualsImpl<internal::ApproxEqualsVisitor>(left, right, opts);
+ bool are_equal =
+ internal::ArrayEqualsImpl<internal::ApproxEqualsVisitor>(left, right, opts);
+ if (!are_equal) {
+ DCHECK_OK(internal::PrintDiff(left, right, opts.diff_sink()));
+ }
+ return are_equal;
}
bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h
index 21da16b..eba5636 100644
--- a/cpp/src/arrow/compare.h
+++ b/cpp/src/arrow/compare.h
@@ -21,7 +21,9 @@
#define ARROW_COMPARE_H
#include <cstdint>
+#include <iosfwd>
+#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -57,11 +59,23 @@ class EqualOptions {
return res;
}
+ /// The ostream to which a diff will be formatted if arrays disagree.
+ /// If this is null (the default) no diff will be formatted.
+ std::ostream* diff_sink() const { return diff_sink_; }
+
+ /// Return a new EqualOptions object with the "diff_sink" property changed.
+ EqualOptions diff_sink(std::ostream* diff_sink) const {
+ auto res = EqualOptions(*this);
+ res.diff_sink_ = diff_sink;
+ return res;
+ }
+
static EqualOptions Defaults() { return EqualOptions(); }
protected:
double atol_ = kDefaultAbsoluteTolerance;
bool nans_equal_ = false;
+ std::ostream* diff_sink_ = NULLPTR;
};
/// Returns true if the arrays are exactly equal
diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc
index 70870a7..4944764 100644
--- a/cpp/src/arrow/testing/gtest_util.cc
+++ b/cpp/src/arrow/testing/gtest_util.cc
@@ -66,7 +66,10 @@ void AssertTsEqual(const T& expected, const T& actual) {
}
void AssertArraysEqual(const Array& expected, const Array& actual) {
- AssertTsEqual(expected, actual);
+ std::stringstream diff;
+ if (!expected.Equals(actual, EqualOptions().diff_sink(&diff))) {
+ FAIL() << diff.str();
+ }
}
void AssertBatchesEqual(const RecordBatch& expected, const RecordBatch& actual) {
@@ -76,19 +79,14 @@ void AssertBatchesEqual(const RecordBatch& expected, const RecordBatch& actual)
void AssertChunkedEqual(const ChunkedArray& expected, const ChunkedArray& actual) {
ASSERT_EQ(expected.num_chunks(), actual.num_chunks()) << "# chunks unequal";
if (!actual.Equals(expected)) {
- std::stringstream pp_result;
- std::stringstream pp_expected;
-
+ std::stringstream diff;
for (int i = 0; i < actual.num_chunks(); ++i) {
auto c1 = actual.chunk(i);
auto c2 = expected.chunk(i);
- if (!c1->Equals(*c2)) {
- ARROW_EXPECT_OK(::arrow::PrettyPrint(*c1, 0, &pp_result));
- ARROW_EXPECT_OK(::arrow::PrettyPrint(*c2, 0, &pp_expected));
- FAIL() << "Chunk " << i << " Got: " << pp_result.str()
- << "\nExpected: " << pp_expected.str();
- }
+ diff << "# chunk " << i << std::endl;
+ ARROW_IGNORE_EXPR(c1->Equals(c2, EqualOptions().diff_sink(&diff)));
}
+ FAIL() << diff.str();
}
}
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 86df73b..2e4e1b3 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -583,6 +583,7 @@ class ARROW_EXPORT MapType : public ListType {
class ARROW_EXPORT FixedSizeListType : public NestedType {
public:
static constexpr Type::type type_id = Type::FIXED_SIZE_LIST;
+ using offset_type = int32_t;
static constexpr const char* type_name() { return "fixed_size_list"; }
diff --git a/cpp/src/arrow/util/lazy.h b/cpp/src/arrow/util/lazy.h
index de32b5f..55c68b3 100644
--- a/cpp/src/arrow/util/lazy.h
+++ b/cpp/src/arrow/util/lazy.h
@@ -67,13 +67,17 @@ class LazyRange {
using _Unchecked_type = typename LazyRange<Generator>::RangeIter;
#endif
+ RangeIter() = delete;
+ RangeIter(const RangeIter& other) = default;
+ RangeIter& operator=(const RangeIter& other) = default;
+
RangeIter(const LazyRange<Generator>& range, int64_t index)
- : range_(range), index_(index) {}
+ : range_(&range), index_(index) {}
- const return_type operator*() { return range_.gen_(index_); }
+ const return_type operator*() const { return range_->gen_(index_); }
- RangeIter operator+(difference_type length) {
- return RangeIter(range_, index_ + length);
+ RangeIter operator+(difference_type length) const {
+ return RangeIter(*range_, index_ + length);
}
// pre-increment
@@ -90,20 +94,24 @@ class LazyRange {
}
bool operator==(const typename LazyRange<Generator>::RangeIter& other) const {
- return this->index_ == other.index_ && &this->range_ == &other.range_;
+ return this->index_ == other.index_ && this->range_ == other.range_;
}
bool operator!=(const typename LazyRange<Generator>::RangeIter& other) const {
- return this->index_ != other.index_ || &this->range_ != &other.range_;
+ return this->index_ != other.index_ || this->range_ != other.range_;
}
- int64_t operator-(const typename LazyRange<Generator>::RangeIter& other) {
+ int64_t operator-(const typename LazyRange<Generator>::RangeIter& other) const {
return this->index_ - other.index_;
}
+ bool operator<(const typename LazyRange<Generator>::RangeIter& other) const {
+ return this->index_ < other.index_;
+ }
+
private:
// parent range reference
- const LazyRange& range_;
+ const LazyRange* range_;
// current index
int64_t index_;
};
diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.cc
similarity index 63%
copy from cpp/src/arrow/util/string.h
copy to cpp/src/arrow/util/string.cc
index 1d716c5..727ec41 100644
--- a/cpp/src/arrow/util/string.h
+++ b/cpp/src/arrow/util/string.cc
@@ -15,20 +15,17 @@
// specific language governing permissions and limitations
// under the License.
-#ifndef ARROW_UTIL_STRING_UTIL_H
-#define ARROW_UTIL_STRING_UTIL_H
+#include "arrow/util/string.h"
#include <algorithm>
-#include <string>
#include "arrow/status.h"
-#include "arrow/util/string_view.h"
namespace arrow {
static const char* kAsciiTable = "0123456789ABCDEF";
-static inline std::string HexEncode(const uint8_t* data, size_t length) {
+std::string HexEncode(const uint8_t* data, size_t length) {
std::string hex_string;
hex_string.reserve(length * 2);
for (size_t j = 0; j < length; ++j) {
@@ -39,15 +36,42 @@ static inline std::string HexEncode(const uint8_t* data, size_t length) {
return hex_string;
}
-static inline std::string HexEncode(const char* data, size_t length) {
- return HexEncode(reinterpret_cast<const uint8_t*>(data), length);
+std::string Escape(const char* data, size_t length) {
+ std::string escaped_string;
+ escaped_string.reserve(length);
+ for (size_t j = 0; j < length; ++j) {
+ switch (data[j]) {
+ case '"':
+ escaped_string += R"(\")";
+ break;
+ case '\\':
+ escaped_string += R"(\\)";
+ break;
+ case '\t':
+ escaped_string += R"(\t)";
+ break;
+ case '\r':
+ escaped_string += R"(\r)";
+ break;
+ case '\n':
+ escaped_string += R"(\n)";
+ break;
+ default:
+ escaped_string.push_back(data[j]);
+ }
+ }
+ return escaped_string;
}
-static inline std::string HexEncode(util::string_view str) {
- return HexEncode(str.data(), str.size());
+std::string HexEncode(const char* data, size_t length) {
+ return HexEncode(reinterpret_cast<const uint8_t*>(data), length);
}
-static inline Status ParseHexValue(const char* data, uint8_t* out) {
+std::string HexEncode(util::string_view str) { return HexEncode(str.data(), str.size()); }
+
+std::string Escape(util::string_view str) { return Escape(str.data(), str.size()); }
+
+Status ParseHexValue(const char* data, uint8_t* out) {
char c1 = data[0];
char c2 = data[1];
@@ -64,5 +88,3 @@ static inline Status ParseHexValue(const char* data, uint8_t* out) {
}
} // namespace arrow
-
-#endif // ARROW_UTIL_STRING_UTIL_H
diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h
index 1d716c5..7a5949f 100644
--- a/cpp/src/arrow/util/string.h
+++ b/cpp/src/arrow/util/string.h
@@ -15,54 +15,27 @@
// specific language governing permissions and limitations
// under the License.
-#ifndef ARROW_UTIL_STRING_UTIL_H
-#define ARROW_UTIL_STRING_UTIL_H
+#pragma once
-#include <algorithm>
#include <string>
-#include "arrow/status.h"
#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
namespace arrow {
-static const char* kAsciiTable = "0123456789ABCDEF";
+class Status;
-static inline std::string HexEncode(const uint8_t* data, size_t length) {
- std::string hex_string;
- hex_string.reserve(length * 2);
- for (size_t j = 0; j < length; ++j) {
- // Convert to 2 base16 digits
- hex_string.push_back(kAsciiTable[data[j] >> 4]);
- hex_string.push_back(kAsciiTable[data[j] & 15]);
- }
- return hex_string;
-}
+ARROW_EXPORT std::string HexEncode(const uint8_t* data, size_t length);
-static inline std::string HexEncode(const char* data, size_t length) {
- return HexEncode(reinterpret_cast<const uint8_t*>(data), length);
-}
+ARROW_EXPORT std::string Escape(const char* data, size_t length);
-static inline std::string HexEncode(util::string_view str) {
- return HexEncode(str.data(), str.size());
-}
+ARROW_EXPORT std::string HexEncode(const char* data, size_t length);
-static inline Status ParseHexValue(const char* data, uint8_t* out) {
- char c1 = data[0];
- char c2 = data[1];
+ARROW_EXPORT std::string HexEncode(util::string_view str);
- const char* pos1 = std::lower_bound(kAsciiTable, kAsciiTable + 16, c1);
- const char* pos2 = std::lower_bound(kAsciiTable, kAsciiTable + 16, c2);
+ARROW_EXPORT std::string Escape(util::string_view str);
- // Error checking
- if (*pos1 != c1 || *pos2 != c2) {
- return Status::Invalid("Encountered non-hex digit");
- }
-
- *out = static_cast<uint8_t>((pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable));
- return Status::OK();
-}
+ARROW_EXPORT Status ParseHexValue(const char* data, uint8_t* out);
} // namespace arrow
-
-#endif // ARROW_UTIL_STRING_UTIL_H