You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/08/13 17:29:57 UTC

[arrow] branch master updated: ARROW-517: [C++] array comparison, uses D**2 space Myers

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new cba9c7b  ARROW-517: [C++] array comparison, uses D**2 space Myers
cba9c7b is described below

commit cba9c7b3945f2207d74d33ad407b35e6b362d166
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Tue Aug 13 19:29:40 2019 +0200

    ARROW-517: [C++] array comparison, uses D**2 space Myers
    
    Adds `arrow::Diff`, which compares two arrays returning an edit script which expresses the difference between them.
    
    The edit script is represented as an array of `struct<insert: bool, run_length: uint64_t>`.
    Each element of the "insert" column determines whether an element was inserted into (true) or deleted from (false) base. Each insertion or deletion is followed by a run of elements which are unchanged from base to target; the length of this run is stored in the "run_length" column. (The first element of the "insert" column is always false; the first element of the "run_length" column is the number of leading elements unchanged from base to target.)
    
    For example for base "hlloo" and target "hello", the edit script would be
    
    ```js
    [
      {"insert": true, "run_length": 1}, // leading run of length 1 ("h")
      {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
      {"insert": false, "run_length": 0} // delete("o") then an empty run
    ]
    ```
    
    Closes #4782 from bkietz/517-Verbose-ArrayEquals and squashes the following commits:
    
    ce358b8b3 <Benjamin Kietzman> fix nits
    f09478454 <Benjamin Kietzman> revert expanded filter_probability set
    493284062 <Benjamin Kietzman> in extreme cases, implicit target position is not valid
    3837ea36e <Benjamin Kietzman> move string utilities from a header into a separate source file
    1cae9ba4e <Benjamin Kietzman> remove DiffVisitor, simplify visitation of edit scripts
    d56eba425 <Benjamin Kietzman> lint fixes
    3f1b9103b <Benjamin Kietzman> clarify (hopefully) storage layout and algorithm flow
    68da6be7b <Benjamin Kietzman> add comments clarifying diff implementation
    74f657841 <Benjamin Kietzman> add date/time/timestamp/interval formatting
    8504ffc8e <Benjamin Kietzman> rename to Storage{Begin, End}
    5dcf01d31 <Benjamin Kietzman> add test which prepends some elements
    fdba4ba4c <Benjamin Kietzman> use int64_t for lengths
    8210a778b <Benjamin Kietzman> trim random tests to run faster, add a random string diff test
    4543b19c8 <Benjamin Kietzman> replace Formatter with std::function
    c75031174 <Benjamin Kietzman> remove redundant list view range
    aa41437eb <Benjamin Kietzman> verify that 8 bit integers are formatted as numbers rather than chars
    9df259a6e <Benjamin Kietzman> make diff tests more readable
    90ef3c87b <Benjamin Kietzman> move DiffTest.Errors
    fcb8db7ba <Benjamin Kietzman> add comment: null EqualOptions::diff_sink
    e72a402cd <Benjamin Kietzman> use Result more consistently
    29c90ac77 <Benjamin Kietzman> add test with escaped chars
    ab3272149 <Benjamin Kietzman> pass std::ostream by pointer rather than reference
    2f216868d <Benjamin Kietzman> merge conflict with new StructArray::Make
    490633aa0 <Benjamin Kietzman> use iosfwd instead of iostream
    f78c3b7de <Benjamin Kietzman> add support for LargeListArray
    7cf6e847b <Benjamin Kietzman> lint: explicit constructors
    4b55c412b <Benjamin Kietzman> add randomized struct diff test
    bd29b553b <Benjamin Kietzman> added a test for formatting diffs of map arrays
    bebc9a4d5 <Benjamin Kietzman> add support for formatting unions
    efa35fd6b <Benjamin Kietzman> refine list formatting, add struct formatting
    88a79f8a1 <Benjamin Kietzman> rename DiffImpl
    354af4f6f <Benjamin Kietzman> refactor (|chunked) array asserts to show diffs
    46da4eb75 <Benjamin Kietzman> add support for diff(ListArrays)
    ea1693e5c <Benjamin Kietzman> add support for diff(DictionaryArrays)
    e1527910d <Benjamin Kietzman> add support for diff(ExtensionArray)
    1036cadc7 <Benjamin Kietzman> add non-erroring bailouts for unsupported array types in Array::Equals
    6f97ba2f5 <Benjamin Kietzman> add support for diff(NullArrays)
    8767d1f76 <Benjamin Kietzman> force integer formatting for /(u|)int8_t/
    d9f71b23c <Benjamin Kietzman> add support for diffing arrays with nulls
    0b5e7c609 <Benjamin Kietzman> add test for Array::Equals with diff
    4eb4d1cbb <Benjamin Kietzman> refine & bugfix unified diff formatter
    2f27816a5 <Benjamin Kietzman> Add unified format printer and randomized tests
    70261c0c2 <Benjamin Kietzman> add DiffVisitor interface
    58918bc5d <Benjamin Kietzman> first draft of array comparison, uses D**2 space Myers
    
    Authored-by: Benjamin Kietzman <be...@gmail.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/CMakeLists.txt               |   2 +
 cpp/src/arrow/array.cc                     |   4 +-
 cpp/src/arrow/array.h                      |   5 +-
 cpp/src/arrow/array/CMakeLists.txt         |   1 +
 cpp/src/arrow/array/diff-test.cc           | 650 +++++++++++++++++++++++
 cpp/src/arrow/array/diff.cc                | 819 +++++++++++++++++++++++++++++
 cpp/src/arrow/array/diff.h                 |  75 +++
 cpp/src/arrow/compare.cc                   |  52 +-
 cpp/src/arrow/compare.h                    |  14 +
 cpp/src/arrow/testing/gtest_util.cc        |  18 +-
 cpp/src/arrow/type.h                       |   1 +
 cpp/src/arrow/util/lazy.h                  |  24 +-
 cpp/src/arrow/util/{string.h => string.cc} |  46 +-
 cpp/src/arrow/util/string.h                |  45 +-
 14 files changed, 1684 insertions(+), 72 deletions(-)

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 7d385e0..dc1cec2 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -91,6 +91,7 @@ set(ARROW_SRCS
     array/builder_primitive.cc
     array/builder_union.cc
     array/concatenate.cc
+    array/diff.cc
     buffer.cc
     compare.cc
     extension_type.cc
@@ -136,6 +137,7 @@ set(ARROW_SRCS
     util/logging.cc
     util/key_value_metadata.cc
     util/memory.cc
+    util/string.cc
     util/string_builder.cc
     util/task-group.cc
     util/thread-pool.cc
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 847ed11..3c87280 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -558,7 +558,7 @@ StructArray::StructArray(const std::shared_ptr<DataType>& type, int64_t length,
   boxed_fields_.resize(children.size());
 }
 
-Result<std::shared_ptr<Array>> StructArray::Make(
+Result<std::shared_ptr<StructArray>> StructArray::Make(
     const std::vector<std::shared_ptr<Array>>& children,
     const std::vector<std::shared_ptr<Field>>& fields,
     std::shared_ptr<Buffer> null_bitmap, int64_t null_count, int64_t offset) {
@@ -582,7 +582,7 @@ Result<std::shared_ptr<Array>> StructArray::Make(
                                        null_bitmap, null_count, offset);
 }
 
-Result<std::shared_ptr<Array>> StructArray::Make(
+Result<std::shared_ptr<StructArray>> StructArray::Make(
     const std::vector<std::shared_ptr<Array>>& children,
     const std::vector<std::string>& field_names, std::shared_ptr<Buffer> null_bitmap,
     int64_t null_count, int64_t offset) {
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 1b3e47f..fbbe369 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -639,6 +639,7 @@ class ARROW_EXPORT MapArray : public ListArray {
 class ARROW_EXPORT FixedSizeListArray : public Array {
  public:
   using TypeClass = FixedSizeListType;
+  using offset_type = TypeClass::offset_type;
 
   explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);
 
@@ -912,7 +913,7 @@ class ARROW_EXPORT StructArray : public Array {
   ///
   /// The length and data type are automatically inferred from the arguments.
   /// There should be at least one child array.
-  static Result<std::shared_ptr<Array>> Make(
+  static Result<std::shared_ptr<StructArray>> Make(
       const std::vector<std::shared_ptr<Array>>& children,
       const std::vector<std::string>& field_names,
       std::shared_ptr<Buffer> null_bitmap = NULLPTR,
@@ -923,7 +924,7 @@ class ARROW_EXPORT StructArray : public Array {
   /// The length is automatically inferred from the arguments.
   /// There should be at least one child array.  This method does not
   /// check that field types and child array types are consistent.
-  static Result<std::shared_ptr<Array>> Make(
+  static Result<std::shared_ptr<StructArray>> Make(
       const std::vector<std::shared_ptr<Array>>& children,
       const std::vector<std::shared_ptr<Field>>& fields,
       std::shared_ptr<Buffer> null_bitmap = NULLPTR,
diff --git a/cpp/src/arrow/array/CMakeLists.txt b/cpp/src/arrow/array/CMakeLists.txt
index 88bc4ea..70d0661 100644
--- a/cpp/src/arrow/array/CMakeLists.txt
+++ b/cpp/src/arrow/array/CMakeLists.txt
@@ -16,6 +16,7 @@
 # under the License.
 
 add_arrow_test(concatenate-test)
+add_arrow_test(diff-test)
 
 # Headers: top level
 arrow_install_all_headers("arrow/array")
diff --git a/cpp/src/arrow/array/diff-test.cc b/cpp/src/arrow/array/diff-test.cc
new file mode 100644
index 0000000..1837de5
--- /dev/null
+++ b/cpp/src/arrow/array/diff-test.cc
@@ -0,0 +1,650 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/array.h"
+#include "arrow/array/diff.h"
+#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/compute/context.h"
+#include "arrow/compute/kernels/filter.h"
+#include "arrow/status.h"
+#include "arrow/testing/gtest_common.h"
+#include "arrow/testing/random.h"
+#include "arrow/testing/util.h"
+#include "arrow/type.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+constexpr random::SeedType kSeed = 0xdeadbeef;
+static const auto edits_type =
+    struct_({field("insert", boolean()), field("run_length", int64())});
+
+Status ValidateEditScript(const Array& edits, const Array& base, const Array& target) {
+  // beginning (in base) of the run before the current hunk
+  int64_t base_run_begin = 0;
+  return VisitEditScript(edits, [&](int64_t delete_begin, int64_t delete_end,
+                                    int64_t insert_begin, int64_t insert_end) {
+    auto target_run_begin = insert_begin - (delete_begin - base_run_begin);
+    if (!base.RangeEquals(base_run_begin, delete_begin, target_run_begin, target)) {
+      return Status::Invalid("base and target were unequal in a run");
+    }
+
+    base_run_begin = delete_end;
+    for (int64_t i = insert_begin; i < insert_end; ++i) {
+      for (int64_t d = delete_begin; d < delete_end; ++d) {
+        if (target.RangeEquals(i, i + 1, d, base)) {
+          return Status::Invalid("a deleted element was simultaneously inserted");
+        }
+      }
+    }
+
+    return Status::OK();
+  });
+}
+
+class DiffTest : public ::testing::Test {
+ protected:
+  DiffTest() : rng_(kSeed) {}
+
+  void DoDiff() {
+    auto edits = Diff(*base_, *target_, default_memory_pool());
+    ASSERT_OK(edits.status());
+    edits_ = edits.ValueOrDie();
+    ASSERT_OK(ValidateArray(*edits_));
+    ASSERT_TRUE(edits_->type()->Equals(edits_type));
+    insert_ = checked_pointer_cast<BooleanArray>(edits_->field(0));
+    run_lengths_ = checked_pointer_cast<Int64Array>(edits_->field(1));
+  }
+
+  void DoDiffAndFormat(std::stringstream* out) {
+    DoDiff();
+    auto formatter = MakeUnifiedDiffFormatter(*base_->type(), out);
+    ASSERT_OK(formatter.status());
+    ASSERT_OK(formatter.ValueOrDie()(*edits_, *base_, *target_));
+  }
+
+  // validate diff and assert that it formats as expected, both directly
+  // and through Array::Equals
+  void AssertDiffAndFormat(const std::string& formatted_expected) {
+    std::stringstream formatted;
+
+    DoDiffAndFormat(&formatted);
+    ASSERT_EQ(formatted.str(), formatted_expected) << "formatted diff incorrectly";
+    formatted.str("");
+
+    ASSERT_EQ(edits_->length() == 1,
+              base_->Equals(*target_, EqualOptions().diff_sink(&formatted)));
+    ASSERT_EQ(formatted.str(), formatted_expected)
+        << "Array::Equals formatted diff incorrectly";
+  }
+
+  void AssertInsertIs(const std::string& insert_json) {
+    ASSERT_ARRAYS_EQUAL(*ArrayFromJSON(boolean(), insert_json), *insert_);
+  }
+
+  void AssertRunLengthIs(const std::string& run_lengths_json) {
+    ASSERT_ARRAYS_EQUAL(*ArrayFromJSON(int64(), run_lengths_json), *run_lengths_);
+  }
+
+  random::RandomArrayGenerator rng_;
+  std::shared_ptr<StructArray> edits_;
+  std::shared_ptr<Array> base_, target_;
+  std::shared_ptr<BooleanArray> insert_;
+  std::shared_ptr<Int64Array> run_lengths_;
+};
+
+TEST_F(DiffTest, Trivial) {
+  base_ = ArrayFromJSON(int32(), "[]");
+  target_ = ArrayFromJSON(int32(), "[]");
+  DoDiff();
+  AssertInsertIs("[false]");
+  AssertRunLengthIs("[0]");
+
+  base_ = ArrayFromJSON(null(), "[null, null]");
+  target_ = ArrayFromJSON(null(), "[null, null, null, null]");
+  DoDiff();
+  AssertInsertIs("[false, true, true]");
+  AssertRunLengthIs("[2, 0, 0]");
+
+  base_ = ArrayFromJSON(int32(), "[1, 2, 3]");
+  target_ = ArrayFromJSON(int32(), "[1, 2, 3]");
+  DoDiff();
+  AssertInsertIs("[false]");
+  AssertRunLengthIs("[3]");
+}
+
+TEST_F(DiffTest, Errors) {
+  std::stringstream formatted;
+
+  base_ = ArrayFromJSON(int32(), "[]");
+  target_ = ArrayFromJSON(utf8(), "[]");
+  ASSERT_RAISES(TypeError, Diff(*base_, *target_, default_memory_pool()));
+
+  ASSERT_FALSE(base_->Equals(*target_, EqualOptions().diff_sink(&formatted)));
+  ASSERT_EQ(formatted.str(), R"(# Array types differed: int32 vs string)");
+}
+
+template <typename ArrowType>
+class DiffTestWithNumeric : public DiffTest {
+ protected:
+  std::shared_ptr<DataType> type_singleton() {
+    return TypeTraits<ArrowType>::type_singleton();
+  }
+};
+
+TYPED_TEST_CASE(DiffTestWithNumeric, NumericArrowTypes);
+
+TYPED_TEST(DiffTestWithNumeric, Basics) {
+  // insert one
+  this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, null, 5]");
+  this->target_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+  this->DoDiff();
+  this->AssertInsertIs("[false, true]");
+  this->AssertRunLengthIs("[2, 2]");
+
+  // delete one
+  this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+  this->target_ = ArrayFromJSON(this->type_singleton(), "[1, 2, null, 5]");
+  this->DoDiff();
+  this->AssertInsertIs("[false, false]");
+  this->AssertRunLengthIs("[2, 2]");
+
+  // change one
+  this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+  this->target_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 23, null, 5]");
+  this->DoDiff();
+  this->AssertInsertIs("[false, false, true]");
+  this->AssertRunLengthIs("[2, 0, 2]");
+
+  // null out one
+  this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+  this->target_ = ArrayFromJSON(this->type_singleton(), "[1, 2, null, null, 5]");
+  this->DoDiff();
+  this->AssertInsertIs("[false, false, true]");
+  this->AssertRunLengthIs("[2, 1, 1]");
+
+  // append some
+  this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+  this->target_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5, 6, 7, 8, 9]");
+  this->DoDiff();
+  this->AssertInsertIs("[false, true, true, true, true]");
+  this->AssertRunLengthIs("[5, 0, 0, 0, 0]");
+
+  // prepend some
+  this->base_ = ArrayFromJSON(this->type_singleton(), "[1, 2, 3, null, 5]");
+  this->target_ = ArrayFromJSON(this->type_singleton(), "[6, 4, 2, 0, 1, 2, 3, null, 5]");
+  this->DoDiff();
+  this->AssertInsertIs("[false, true, true, true, true]");
+  this->AssertRunLengthIs("[0, 0, 0, 0, 5]");
+}
+
+TEST_F(DiffTest, CompareRandomInt64) {
+  compute::FunctionContext ctx;
+  for (auto null_probability : {0.0, 0.25}) {
+    auto values = this->rng_.Int64(1 << 10, 0, 127, null_probability);
+    for (const double filter_probability : {0.99, 0.75, 0.5}) {
+      auto filter_1 = this->rng_.Boolean(values->length(), filter_probability, 0.0);
+      auto filter_2 = this->rng_.Boolean(values->length(), filter_probability, 0.0);
+
+      ASSERT_OK(compute::Filter(&ctx, *values, *filter_1, &this->base_));
+      ASSERT_OK(compute::Filter(&ctx, *values, *filter_2, &this->target_));
+
+      std::stringstream formatted;
+      this->DoDiffAndFormat(&formatted);
+      auto st = ValidateEditScript(*this->edits_, *this->base_, *this->target_);
+      if (!st.ok()) {
+        ASSERT_OK(Status(st.code(), st.message() + "\n" + formatted.str()));
+      }
+    }
+  }
+}
+
+TEST_F(DiffTest, CompareRandomStrings) {
+  compute::FunctionContext ctx;
+  for (auto null_probability : {0.0, 0.25}) {
+    auto values = this->rng_.StringWithRepeats(1 << 10, 1 << 8, 0, 32, null_probability);
+    for (const double filter_probability : {0.99, 0.75, 0.5}) {
+      auto filter_1 = this->rng_.Boolean(values->length(), filter_probability, 0.0);
+      auto filter_2 = this->rng_.Boolean(values->length(), filter_probability, 0.0);
+
+      ASSERT_OK(compute::Filter(&ctx, *values, *filter_1, &this->base_));
+      ASSERT_OK(compute::Filter(&ctx, *values, *filter_2, &this->target_));
+
+      std::stringstream formatted;
+      this->DoDiffAndFormat(&formatted);
+      auto st = ValidateEditScript(*this->edits_, *this->base_, *this->target_);
+      if (!st.ok()) {
+        ASSERT_OK(Status(st.code(), st.message() + "\n" + formatted.str()));
+      }
+    }
+  }
+}
+
+TEST_F(DiffTest, BasicsWithStrings) {
+  // insert one
+  base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+  target_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+  DoDiff();
+  AssertInsertIs("[false, true]");
+  AssertRunLengthIs("[1, 2]");
+
+  // delete one
+  base_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+  target_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+  DoDiff();
+  AssertInsertIs("[false, false]");
+  AssertRunLengthIs("[1, 2]");
+
+  // change one
+  base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+  target_ = ArrayFromJSON(utf8(), R"(["gimme", "a", "break"])");
+  DoDiff();
+  AssertInsertIs("[false, false, true]");
+  AssertRunLengthIs("[0, 0, 2]");
+
+  // null out one
+  base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+  target_ = ArrayFromJSON(utf8(), R"(["give", "a", null])");
+  DoDiff();
+  AssertInsertIs("[false, false, true]");
+  AssertRunLengthIs("[2, 0, 0]");
+}
+
+TEST_F(DiffTest, BasicsWithLists) {
+  // insert one
+  base_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], [13]])");
+  target_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [5, 9], [], [13]])");
+  DoDiff();
+  AssertInsertIs("[false, true]");
+  AssertRunLengthIs("[1, 2]");
+
+  // delete one
+  base_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [5, 9], [], [13]])");
+  target_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], [13]])");
+  DoDiff();
+  AssertInsertIs("[false, false]");
+  AssertRunLengthIs("[1, 2]");
+
+  // change one
+  base_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], [13]])");
+  target_ = ArrayFromJSON(list(int32()), R"([[3, 3, 3], [], [13]])");
+  DoDiff();
+  AssertInsertIs("[false, false, true]");
+  AssertRunLengthIs("[0, 0, 2]");
+
+  // null out one
+  base_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], [13]])");
+  target_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], null])");
+  DoDiff();
+  AssertInsertIs("[false, false, true]");
+  AssertRunLengthIs("[2, 0, 0]");
+}
+
+TEST_F(DiffTest, BasicsWithStructs) {
+  auto type = struct_({field("foo", utf8()), field("bar", int32())});
+
+  // insert one
+  base_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, {"bar": 13}])");
+  target_ =
+      ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {"foo": "?"}, {}, {"bar": 13}])");
+  DoDiff();
+  AssertInsertIs("[false, true]");
+  AssertRunLengthIs("[1, 2]");
+
+  // delete one
+  base_ =
+      ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {"foo": "?"}, {}, {"bar": 13}])");
+  target_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, {"bar": 13}])");
+  DoDiff();
+  AssertInsertIs("[false, false]");
+  AssertRunLengthIs("[1, 2]");
+
+  // change one
+  base_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, {"bar": 13}])");
+  target_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 2}, {}, {"bar": 13}])");
+  DoDiff();
+  AssertInsertIs("[false, false, true]");
+  AssertRunLengthIs("[0, 0, 2]");
+
+  // null out one
+  base_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, {"bar": 13}])");
+  target_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, null])");
+  DoDiff();
+  AssertInsertIs("[false, false, true]");
+  AssertRunLengthIs("[2, 0, 0]");
+}
+
+TEST_F(DiffTest, BasicsWithUnions) {
+  auto type = union_({field("foo", utf8()), field("bar", int32())}, {2, 5});
+
+  // insert one
+  base_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])");
+  target_ = ArrayFromJSON(type, R"([[2, "!"], [2, "?"], [5, 3], [5, 13]])");
+  DoDiff();
+  AssertInsertIs("[false, true]");
+  AssertRunLengthIs("[1, 2]");
+
+  // delete one
+  base_ = ArrayFromJSON(type, R"([[2, "!"], [2, "?"], [5, 3], [5, 13]])");
+  target_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])");
+  DoDiff();
+  AssertInsertIs("[false, false]");
+  AssertRunLengthIs("[1, 2]");
+
+  // change one
+  base_ = ArrayFromJSON(type, R"([[5, 3], [2, "!"], [5, 13]])");
+  target_ = ArrayFromJSON(type, R"([[2, "3"], [2, "!"], [5, 13]])");
+  DoDiff();
+  AssertInsertIs("[false, false, true]");
+  AssertRunLengthIs("[0, 0, 2]");
+
+  // null out one
+  base_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])");
+  target_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], null])");
+  DoDiff();
+  AssertInsertIs("[false, false, true]");
+  AssertRunLengthIs("[2, 0, 0]");
+}
+
+TEST_F(DiffTest, UnifiedDiffFormatter) {
+  // no changes
+  base_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+  target_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+  AssertDiffAndFormat(R"()");
+
+  // insert one
+  base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+  target_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+  AssertDiffAndFormat(R"(
+@@ -1, +1 @@
++"me"
+)");
+
+  // delete one
+  base_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])");
+  target_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+  AssertDiffAndFormat(R"(
+@@ -1, +1 @@
+-"me"
+)");
+
+  // change one
+  base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+  target_ = ArrayFromJSON(utf8(), R"(["gimme", "a", "break"])");
+  AssertDiffAndFormat(R"(
+@@ -0, +0 @@
+-"give"
++"gimme"
+)");
+
+  // null out one
+  base_ = ArrayFromJSON(utf8(), R"(["give", "a", "break"])");
+  target_ = ArrayFromJSON(utf8(), R"(["give", "a", null])");
+  AssertDiffAndFormat(R"(
+@@ -2, +2 @@
+-"break"
++null
+)");
+
+  // strings with escaped chars
+  base_ = ArrayFromJSON(utf8(), R"(["newline:\n", "quote:'", "backslash:\\"])");
+  target_ =
+      ArrayFromJSON(utf8(), R"(["newline:\n", "tab:\t", "quote:\"", "backslash:\\"])");
+  AssertDiffAndFormat(R"(
+@@ -1, +1 @@
+-"quote:'"
++"tab:\t"
++"quote:\""
+)");
+
+  // date32
+  base_ = ArrayFromJSON(date32(), R"([0, 1, 2, 31, 4])");
+  target_ = ArrayFromJSON(date32(), R"([0, 1, 31, 2, 4])");
+  AssertDiffAndFormat(R"(
+@@ -2, +2 @@
+-1970-01-03
+@@ -4, +3 @@
++1970-01-03
+)");
+
+  // date64
+  constexpr int64_t ms_per_day = 24 * 60 * 60 * 1000;
+  ArrayFromVector<Date64Type>(
+      {0 * ms_per_day, 1 * ms_per_day, 2 * ms_per_day, 31 * ms_per_day, 4 * ms_per_day},
+      &base_);
+  ArrayFromVector<Date64Type>(
+      {0 * ms_per_day, 1 * ms_per_day, 31 * ms_per_day, 2 * ms_per_day, 4 * ms_per_day},
+      &target_);
+  AssertDiffAndFormat(R"(
+@@ -2, +2 @@
+-1970-01-03
+@@ -4, +3 @@
++1970-01-03
+)");
+
+  // timestamp
+  auto x = 678 + 1000000 * (5 + 60 * (4 + 60 * (3 + 24 * int64_t(1))));
+  ArrayFromVector<TimestampType>(timestamp(TimeUnit::MICRO), {0, 1, x, 2, 4}, &base_);
+  ArrayFromVector<TimestampType>(timestamp(TimeUnit::MICRO), {0, 1, 2, x, 4}, &target_);
+  AssertDiffAndFormat(R"(
+@@ -2, +2 @@
+-1970-01-02 03:04:05.000678
+@@ -4, +3 @@
++1970-01-02 03:04:05.000678
+)");
+
+  // lists
+  base_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [], [13], []])");
+  target_ = ArrayFromJSON(list(int32()), R"([[2, 3, 1], [5, 9], [], [13]])");
+  AssertDiffAndFormat(R"(
+@@ -1, +1 @@
++[5, 9]
+@@ -3, +4 @@
+-[]
+)");
+
+  // maps
+  base_ = ArrayFromJSON(map(utf8(), int32()), R"([
+    [["foo", 2], ["bar", 3], ["baz", 1]],
+    [],
+    [["quux", 13]],
+    []
+  ])");
+  target_ = ArrayFromJSON(map(utf8(), int32()), R"([
+    [["foo", 2], ["bar", 3], ["baz", 1]],
+    [["ytho", 11]],
+    [],
+    [["quux", 13]]
+  ])");
+  AssertDiffAndFormat(R"(
+@@ -1, +1 @@
++[{key: "ytho", value: 11}]
+@@ -3, +4 @@
+-[]
+)");
+
+  // structs
+  auto type = struct_({field("foo", utf8()), field("bar", int32())});
+  base_ = ArrayFromJSON(type, R"([{"foo": "!", "bar": 3}, {}, {"bar": 13}])");
+  target_ = ArrayFromJSON(type, R"([{"foo": null, "bar": 2}, {}, {"bar": 13}])");
+  AssertDiffAndFormat(R"(
+@@ -0, +0 @@
+-{foo: "!", bar: 3}
++{bar: 2}
+)");
+
+  // unions
+  for (auto mode : {UnionMode::SPARSE, UnionMode::DENSE}) {
+    type = union_({field("foo", utf8()), field("bar", int32())}, {2, 5}, mode);
+    base_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])");
+    target_ = ArrayFromJSON(type, R"([[2, "!"], [2, "3"], [5, 13]])");
+    AssertDiffAndFormat(R"(
+@@ -1, +1 @@
+-{5: 3}
++{2: "3"}
+)");
+  }
+
+  for (auto type : {int8(), uint8(),  // verify that these are printed as numbers rather
+                                      // than their ascii characters
+                    int16(), uint16()}) {
+    // small difference
+    base_ = ArrayFromJSON(type, "[0, 1, 2, 3, 5, 8, 11, 13, 17]");
+    target_ = ArrayFromJSON(type, "[2, 3, 5, 7, 11, 13, 17, 19]");
+    AssertDiffAndFormat(R"(
+@@ -0, +0 @@
+-0
+-1
+@@ -5, +3 @@
+-8
++7
+@@ -9, +7 @@
++19
+)");
+
+    // large difference
+    base_ = ArrayFromJSON(type, "[57, 10, 22, 126, 42]");
+    target_ = ArrayFromJSON(type, "[58, 57, 75, 93, 53, 8, 22, 42, 79, 11]");
+    AssertDiffAndFormat(R"(
+@@ -0, +0 @@
++58
+@@ -1, +2 @@
+-10
++75
++93
++53
++8
+@@ -3, +7 @@
+-126
+@@ -5, +8 @@
++79
++11
+)");
+  }
+}
+
+TEST_F(DiffTest, DictionaryDiffFormatter) {
+  std::stringstream formatted;
+
+  // differing indices
+  auto base_dict = ArrayFromJSON(utf8(), R"(["a", "b", "c"])");
+  auto base_indices = ArrayFromJSON(int8(), "[0, 1, 2, 2, 0, 1]");
+  ASSERT_OK(
+      DictionaryArray::FromArrays(dictionary(base_indices->type(), base_dict->type()),
+                                  base_indices, base_dict, &base_));
+
+  auto target_dict = base_dict;
+  auto target_indices = ArrayFromJSON(int8(), "[0, 1, 2, 2, 1, 1]");
+  ASSERT_OK(
+      DictionaryArray::FromArrays(dictionary(target_indices->type(), target_dict->type()),
+                                  target_indices, target_dict, &target_));
+
+  base_->Equals(*target_, EqualOptions().diff_sink(&formatted));
+  ASSERT_EQ(formatted.str(), R"(# Dictionary arrays differed
+## dictionary diff
+## indices diff
+@@ -4, +4 @@
+-0
+@@ -6, +5 @@
++1
+)");
+
+  // differing dictionaries
+  target_dict = ArrayFromJSON(utf8(), R"(["b", "c", "a"])");
+  target_indices = base_indices;
+  ASSERT_OK(
+      DictionaryArray::FromArrays(dictionary(target_indices->type(), target_dict->type()),
+                                  target_indices, target_dict, &target_));
+
+  formatted.str("");
+  base_->Equals(*target_, EqualOptions().diff_sink(&formatted));
+  ASSERT_EQ(formatted.str(), R"(# Dictionary arrays differed
+## dictionary diff
+@@ -0, +0 @@
+-"a"
+@@ -3, +2 @@
++"a"
+## indices diff
+)");
+}
+
+void MakeSameLength(std::shared_ptr<Array>* a, std::shared_ptr<Array>* b) {
+  auto length = std::min((*a)->length(), (*b)->length());
+  *a = (*a)->Slice(0, length);
+  *b = (*b)->Slice(0, length);
+}
+
+TEST_F(DiffTest, CompareRandomStruct) {
+  compute::FunctionContext ctx;
+  for (auto null_probability : {0.0, 0.25}) {
+    constexpr auto length = 1 << 10;
+    auto int32_values = this->rng_.Int32(length, 0, 127, null_probability);
+    auto utf8_values = this->rng_.String(length, 0, 16, null_probability);
+    for (const double filter_probability : {0.9999, 0.75}) {
+      std::shared_ptr<Array> int32_base, int32_target, utf8_base, utf8_target;
+      ASSERT_OK(compute::Filter(&ctx, *int32_values,
+                                *this->rng_.Boolean(length, filter_probability, 0.0),
+                                &int32_base));
+      ASSERT_OK(compute::Filter(&ctx, *utf8_values,
+                                *this->rng_.Boolean(length, filter_probability, 0.0),
+                                &utf8_base));
+      MakeSameLength(&int32_base, &utf8_base);
+
+      ASSERT_OK(compute::Filter(&ctx, *int32_values,
+                                *this->rng_.Boolean(length, filter_probability, 0.0),
+                                &int32_target));
+      ASSERT_OK(compute::Filter(&ctx, *utf8_values,
+                                *this->rng_.Boolean(length, filter_probability, 0.0),
+                                &utf8_target));
+      MakeSameLength(&int32_target, &utf8_target);
+
+      auto type = struct_({field("i", int32()), field("s", utf8())});
+      auto base_res = StructArray::Make({int32_base, utf8_base}, type->children());
+      ASSERT_OK(base_res.status());
+      base_ = base_res.ValueOrDie();
+      auto target_res = StructArray::Make({int32_target, utf8_target}, type->children());
+      ASSERT_OK(target_res.status());
+      target_ = target_res.ValueOrDie();
+
+      std::stringstream formatted;
+      this->DoDiffAndFormat(&formatted);
+      auto st = ValidateEditScript(*this->edits_, *this->base_, *this->target_);
+      if (!st.ok()) {
+        ASSERT_OK(Status(st.code(), st.message() + "\n" + formatted.str()));
+      }
+    }
+  }
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc
new file mode 100644
index 0000000..156bd24
--- /dev/null
+++ b/cpp/src/arrow/array/diff.cc
@@ -0,0 +1,819 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/diff.h"
+
+#include <algorithm>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer-builder.h"
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/pretty_print.h"
+#include "arrow/status.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/lazy.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/stl.h"
+#include "arrow/util/string.h"
+#include "arrow/util/variant.h"
+#include "arrow/util/visibility.h"
+#include "arrow/vendored/datetime.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+using internal::MakeLazyRange;
+
+template <typename ArrayType>
+auto GetView(const ArrayType& array, int64_t index) -> decltype(array.GetView(index)) {
+  return array.GetView(index);
+}
+
+struct Slice {
+  const Array* array_;
+  int64_t offset_, length_;
+
+  bool operator==(const Slice& other) const {
+    return length_ == other.length_ &&
+           array_->RangeEquals(offset_, offset_ + length_, other.offset_, *other.array_);
+  }
+  bool operator!=(const Slice& other) const { return !(*this == other); }
+};
+
+template <typename ArrayType, typename T = typename ArrayType::TypeClass,
+          typename =
+              typename std::enable_if<std::is_base_of<BaseListType, T>::value ||
+                                      std::is_base_of<FixedSizeListType, T>::value>::type>
+static Slice GetView(const ArrayType& array, int64_t index) {
+  return Slice{array.values().get(), array.value_offset(index),
+               array.value_length(index)};
+}
+
+struct UnitSlice {
+  const Array* array_;
+  int64_t offset_;
+
+  bool operator==(const UnitSlice& other) const {
+    return array_->RangeEquals(offset_, offset_ + 1, other.offset_, *other.array_);
+  }
+  bool operator!=(const UnitSlice& other) const { return !(*this == other); }
+};
+
+// FIXME(bkietz) this is inefficient;
+// StructArray's fields can be diffed independently then merged
+static UnitSlice GetView(const StructArray& array, int64_t index) {
+  return UnitSlice{&array, index};
+}
+
+static UnitSlice GetView(const UnionArray& array, int64_t index) {
+  return UnitSlice{&array, index};
+}
+
+struct NullTag {
+  constexpr bool operator==(const NullTag& other) const { return true; }
+  constexpr bool operator!=(const NullTag& other) const { return false; }
+};
+
+template <typename T>
+class NullOr {
+ public:
+  using VariantType = util::variant<NullTag, T>;
+
+  NullOr() : variant_(NullTag{}) {}
+  explicit NullOr(T t) : variant_(std::move(t)) {}
+
+  template <typename ArrayType>
+  NullOr(const ArrayType& array, int64_t index) {
+    if (array.IsNull(index)) {
+      variant_.emplace(NullTag{});
+    } else {
+      variant_.emplace(GetView(array, index));
+    }
+  }
+
+  bool operator==(const NullOr& other) const { return variant_ == other.variant_; }
+  bool operator!=(const NullOr& other) const { return variant_ != other.variant_; }
+
+ private:
+  VariantType variant_;
+};
+
+template <typename ArrayType>
+using ViewType = decltype(GetView(std::declval<ArrayType>(), 0));
+
+template <typename ArrayType>
+class ViewGenerator {
+ public:
+  using View = ViewType<ArrayType>;
+
+  explicit ViewGenerator(const Array& array)
+      : array_(checked_cast<const ArrayType&>(array)) {
+    DCHECK_EQ(array.null_count(), 0);
+  }
+
+  View operator()(int64_t index) const { return GetView(array_, index); }
+
+ private:
+  const ArrayType& array_;
+};
+
+template <typename ArrayType>
+internal::LazyRange<ViewGenerator<ArrayType>> MakeViewRange(const Array& array) {
+  using Generator = ViewGenerator<ArrayType>;
+  return internal::LazyRange<Generator>(Generator(array), array.length());
+}
+
+template <typename ArrayType>
+class NullOrViewGenerator {
+ public:
+  using View = ViewType<ArrayType>;
+
+  explicit NullOrViewGenerator(const Array& array)
+      : array_(checked_cast<const ArrayType&>(array)) {}
+
+  NullOr<View> operator()(int64_t index) const {
+    return array_.IsNull(index) ? NullOr<View>() : NullOr<View>(GetView(array_, index));
+  }
+
+ private:
+  const ArrayType& array_;
+};
+
+template <typename ArrayType>
+internal::LazyRange<NullOrViewGenerator<ArrayType>> MakeNullOrViewRange(
+    const Array& array) {
+  using Generator = NullOrViewGenerator<ArrayType>;
+  return internal::LazyRange<Generator>(Generator(array), array.length());
+}
+
+/// A generic sequence difference algorithm, based on
+///
+/// E. W. Myers, "An O(ND) difference algorithm and its variations,"
+/// Algorithmica, vol. 1, no. 1-4, pp. 251–266, 1986.
+///
+/// To summarize, an edit script is computed by maintaining the furthest set of EditPoints
+/// which are reachable in a given number of edits D. This is used to compute the furthest
+/// set reachable with D+1 edits, and the process continues inductively until a complete
+/// edit script is discovered.
+///
+/// From each edit point a single deletion and insertion is made then as many shared
+/// elements as possible are skipped, recording only the endpoint of the run. This
+/// representation is minimal in the common case where the sequences differ only slightly,
+/// since most of the elements are shared between base and target and are represented
+/// implicitly.
+template <typename Iterator>
+class QuadraticSpaceMyersDiff {
+ public:
+  // represents an intermediate state in the comparison of two arrays
+  struct EditPoint {
+    Iterator base, target;
+
+    bool operator==(EditPoint other) const {
+      return base == other.base && target == other.target;
+    }
+  };
+
+  // increment the position within base (the element pointed to was deleted)
+  // then extend maximally
+  EditPoint DeleteOne(EditPoint p) const {
+    if (p.base != base_end_) {
+      ++p.base;
+    }
+    return ExtendFrom(p);
+  }
+
+  // increment the position within target (the element pointed to was inserted)
+  // then extend maximally
+  EditPoint InsertOne(EditPoint p) const {
+    if (p.target != target_end_) {
+      ++p.target;
+    }
+    return ExtendFrom(p);
+  }
+
+  // increment the position within base and target (the elements skipped in this way were
+  // present in both sequences)
+  EditPoint ExtendFrom(EditPoint p) const {
+    for (; p.base != base_end_ && p.target != target_end_; ++p.base, ++p.target) {
+      if (*p.base != *p.target) {
+        break;
+      }
+    }
+    return p;
+  }
+
+  QuadraticSpaceMyersDiff(Iterator base_begin, Iterator base_end, Iterator target_begin,
+                          Iterator target_end)
+      : base_begin_(base_begin),
+        base_end_(base_end),
+        target_begin_(target_begin),
+        target_end_(target_end),
+        endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}),
+        insert_({true}) {
+    if (std::distance(base_begin_, base_end_) ==
+            std::distance(target_begin_, target_end_) &&
+        endpoint_base_[0] == base_end_) {
+      // trivial case: base == target
+      finish_index_ = 0;
+    }
+  }
+
+  // beginning of a range for storing per-edit state in endpoint_base_ and insert_
+  int64_t StorageOffset(int64_t edit_count) const {
+    return edit_count * (edit_count + 1) / 2;
+  }
+
+  // given edit_count and index, augment endpoint_base_[index] with the corresponding
+  // position in target (which is only implicitly represented in edit_count, index)
+  EditPoint GetEditPoint(int64_t edit_count, int64_t index) const {
+    DCHECK_GE(index, StorageOffset(edit_count));
+    DCHECK_LT(index, StorageOffset(edit_count + 1));
+    auto insertions_minus_deletions =
+        2 * (index - StorageOffset(edit_count)) - edit_count;
+    auto maximal_base = endpoint_base_[index];
+    auto maximal_target = std::min(
+        target_begin_ + ((maximal_base - base_begin_) + insertions_minus_deletions),
+        target_end_);
+    return {maximal_base, maximal_target};
+  }
+
+  void Next() {
+    ++edit_count_;
+    // base_begin_ is used as a dummy value here since Iterator may not be default
+    // constructible. The newly allocated range is completely overwritten below.
+    endpoint_base_.resize(StorageOffset(edit_count_ + 1), base_begin_);
+    insert_.resize(StorageOffset(edit_count_ + 1), false);
+
+    auto previous_offset = StorageOffset(edit_count_ - 1);
+    auto current_offset = StorageOffset(edit_count_);
+
+    // try deleting from base first
+    for (int64_t i = 0, i_out = 0; i < edit_count_; ++i, ++i_out) {
+      auto previous_endpoint = GetEditPoint(edit_count_ - 1, i + previous_offset);
+      endpoint_base_[i_out + current_offset] = DeleteOne(previous_endpoint).base;
+    }
+
+    // check if inserting from target could do better
+    for (int64_t i = 0, i_out = 1; i < edit_count_; ++i, ++i_out) {
+      // retrieve the previously computed best endpoint for (edit_count_, i_out)
+      // for comparison with the best endpoint achievable with an insertion
+      auto endpoint_after_deletion = GetEditPoint(edit_count_, i_out + current_offset);
+
+      auto previous_endpoint = GetEditPoint(edit_count_ - 1, i + previous_offset);
+      auto endpoint_after_insertion = InsertOne(previous_endpoint);
+
+      if (endpoint_after_insertion.base - endpoint_after_deletion.base >= 0) {
+        // insertion was more efficient; keep it and mark the insertion in insert_
+        insert_[i_out + current_offset] = true;
+        endpoint_base_[i_out + current_offset] = endpoint_after_insertion.base;
+      }
+    }
+
+    // check for completion
+    EditPoint finish = {base_end_, target_end_};
+    for (int64_t i_out = 0; i_out < edit_count_ + 1; ++i_out) {
+      if (GetEditPoint(edit_count_, i_out + current_offset) == finish) {
+        finish_index_ = i_out + current_offset;
+        return;
+      }
+    }
+  }
+
+  bool Done() { return finish_index_ != -1; }
+
+  Result<std::shared_ptr<StructArray>> GetEdits(MemoryPool* pool) {
+    DCHECK(Done());
+
+    int64_t length = edit_count_ + 1;
+    std::shared_ptr<Buffer> insert_buf, run_length_buf;
+    RETURN_NOT_OK(AllocateBitmap(pool, length, &insert_buf));
+    RETURN_NOT_OK(AllocateBuffer(pool, length * sizeof(int64_t), &run_length_buf));
+    auto run_length = reinterpret_cast<int64_t*>(run_length_buf->mutable_data());
+
+    auto index = finish_index_;
+    auto endpoint = GetEditPoint(edit_count_, finish_index_);
+
+    for (int64_t i = edit_count_; i > 0; --i) {
+      bool insert = insert_[index];
+      BitUtil::SetBitTo(insert_buf->mutable_data(), i, insert);
+
+      auto insertions_minus_deletions =
+          (endpoint.base - base_begin_) - (endpoint.target - target_begin_);
+      if (insert) {
+        ++insertions_minus_deletions;
+      } else {
+        --insertions_minus_deletions;
+      }
+      index = (i - 1 - insertions_minus_deletions) / 2 + StorageOffset(i - 1);
+
+      // endpoint of previous edit
+      auto previous = GetEditPoint(i - 1, index);
+      run_length[i] = endpoint.base - previous.base - !insert;
+      DCHECK_GE(run_length[i], 0);
+
+      endpoint = previous;
+    }
+    BitUtil::SetBitTo(insert_buf->mutable_data(), 0, false);
+    run_length[0] = endpoint.base - base_begin_;
+
+    return StructArray::Make({std::make_shared<BooleanArray>(length, insert_buf),
+                              std::make_shared<Int64Array>(length, run_length_buf)},
+                             {field("insert", boolean()), field("run_length", int64())});
+  }
+
+ private:
+  int64_t finish_index_ = -1;
+  int64_t edit_count_ = 0;
+  Iterator base_begin_, base_end_;
+  Iterator target_begin_, target_end_;
+  // each element of endpoint_base_ is the furthest position in base reachable given an
+  // edit_count and (# insertions) - (# deletions). Each bit of insert_ records whether
+  // the corresponding furthest position was reached via an insertion or a deletion
+  // (followed by a run of shared elements). See StorageOffset for the
+  // layout of these vectors
+  std::vector<Iterator> endpoint_base_;
+  std::vector<bool> insert_;
+};
+
+struct DiffImpl {
+  Status Visit(const NullType&) {
+    bool insert = base_.length() < target_.length();
+    auto run_length = std::min(base_.length(), target_.length());
+    auto edit_count = std::max(base_.length(), target_.length()) - run_length;
+
+    TypedBufferBuilder<bool> insert_builder(pool_);
+    RETURN_NOT_OK(insert_builder.Resize(edit_count + 1));
+    insert_builder.UnsafeAppend(false);
+    TypedBufferBuilder<int64_t> run_length_builder(pool_);
+    RETURN_NOT_OK(run_length_builder.Resize(edit_count + 1));
+    run_length_builder.UnsafeAppend(run_length);
+    if (edit_count > 0) {
+      insert_builder.UnsafeAppend(edit_count, insert);
+      run_length_builder.UnsafeAppend(edit_count, 0);
+    }
+
+    std::shared_ptr<Buffer> insert_buf, run_length_buf;
+    RETURN_NOT_OK(insert_builder.Finish(&insert_buf));
+    RETURN_NOT_OK(run_length_builder.Finish(&run_length_buf));
+
+    ARROW_ASSIGN_OR_RAISE(
+        out_,
+        StructArray::Make({std::make_shared<BooleanArray>(edit_count + 1, insert_buf),
+                           std::make_shared<Int64Array>(edit_count + 1, run_length_buf)},
+                          {field("insert", boolean()), field("run_length", int64())}));
+    return Status::OK();
+  }
+
+  template <typename T>
+  Status Visit(const T&) {
+    using ArrayType = typename TypeTraits<T>::ArrayType;
+    if (base_.null_count() == 0 && target_.null_count() == 0) {
+      auto base = MakeViewRange<ArrayType>(base_);
+      auto target = MakeViewRange<ArrayType>(target_);
+      ARROW_ASSIGN_OR_RAISE(out_,
+                            Diff(base.begin(), base.end(), target.begin(), target.end()));
+    } else {
+      auto base = MakeNullOrViewRange<ArrayType>(base_);
+      auto target = MakeNullOrViewRange<ArrayType>(target_);
+      ARROW_ASSIGN_OR_RAISE(out_,
+                            Diff(base.begin(), base.end(), target.begin(), target.end()));
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const ExtensionType&) {
+    auto base = checked_cast<const ExtensionArray&>(base_).storage();
+    auto target = checked_cast<const ExtensionArray&>(target_).storage();
+    ARROW_ASSIGN_OR_RAISE(out_, arrow::Diff(*base, *target, pool_));
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryType& t) {
+    return Status::NotImplemented("diffing arrays of type ", t);
+  }
+
+  Result<std::shared_ptr<StructArray>> Diff() {
+    RETURN_NOT_OK(VisitTypeInline(*base_.type(), this));
+    return out_;
+  }
+
+  template <typename Iterator>
+  Result<std::shared_ptr<StructArray>> Diff(Iterator base_begin, Iterator base_end,
+                                            Iterator target_begin, Iterator target_end) {
+    QuadraticSpaceMyersDiff<Iterator> impl(base_begin, base_end, target_begin,
+                                           target_end);
+    while (!impl.Done()) {
+      impl.Next();
+    }
+    return impl.GetEdits(pool_);
+  }
+
+  const Array& base_;
+  const Array& target_;
+  MemoryPool* pool_;
+  std::shared_ptr<StructArray> out_;
+};
+
+Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
+                                          MemoryPool* pool) {
+  if (!base.type()->Equals(target.type())) {
+    return Status::TypeError("only taking the diff of like-typed arrays is supported.");
+  }
+
+  return DiffImpl{base, target, pool, nullptr}.Diff();
+}
+
+using Formatter = std::function<void(const Array&, int64_t index, std::ostream*)>;
+
+static Result<Formatter> MakeFormatter(const DataType& type);
+
+class MakeFormatterImpl {
+ public:
+  Result<Formatter> Make(const DataType& type) && {
+    RETURN_NOT_OK(VisitTypeInline(type, this));
+    return std::move(impl_);
+  }
+
+ private:
+  template <typename VISITOR>
+  friend Status VisitTypeInline(const DataType&, VISITOR*);
+
+  // factory implementation
+  Status Visit(const BooleanType&) {
+    impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+      *os << (checked_cast<const BooleanArray&>(array).Value(index) ? "true" : "false");
+    };
+    return Status::OK();
+  }
+
+  // format Numerics with std::ostream defaults
+  template <typename T>
+  typename std::enable_if<
+      is_number_type<T>::value && !is_date<T>::value && !is_time<T>::value, Status>::type
+  Visit(const T&) {
+    impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+      const auto& numeric = checked_cast<const NumericArray<T>&>(array);
+      if (sizeof(decltype(numeric.Value(index))) == sizeof(char)) {
+        // override std::ostream defaults for /(u|)int8_t/ since they are
+        // formatted as potentially unprintable/tty borking characters
+        *os << static_cast<int16_t>(numeric.Value(index));
+      } else {
+        *os << numeric.Value(index);
+      }
+    };
+    return Status::OK();
+  }
+
+  template <typename T>
+  enable_if_date<T, Status> Visit(const T&) {
+    using unit = typename std::conditional<std::is_same<T, Date32Type>::value,
+                                           arrow_vendored::date::days,
+                                           std::chrono::milliseconds>::type;
+
+    static arrow_vendored::date::sys_days epoch{arrow_vendored::date::jan / 1 / 1970};
+
+    impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+      unit value(checked_cast<const NumericArray<T>&>(array).Value(index));
+      *os << arrow_vendored::date::format("%F", value + epoch);
+    };
+    return Status::OK();
+  }
+
+  template <typename T>
+  enable_if_time<T, Status> Visit(const T&) {
+    impl_ = MakeTimeFormatter<T, false>("%T");
+    return Status::OK();
+  }
+
+  Status Visit(const TimestampType&) {
+    impl_ = MakeTimeFormatter<TimestampType, true>("%F %T");
+    return Status::OK();
+  }
+
+  Status Visit(const DayTimeIntervalType&) {
+    impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+      auto day_millis = checked_cast<const DayTimeIntervalArray&>(array).Value(index);
+      *os << day_millis.days << "d" << day_millis.milliseconds << "ms";
+    };
+    return Status::OK();
+  }
+
+  // format Binary and FixedSizeBinary in hexadecimal
+  template <typename T>
+  enable_if_binary_like<T, Status> Visit(const T&) {
+    impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+      *os << HexEncode(
+          checked_cast<const typename TypeTraits<T>::ArrayType&>(array).GetView(index));
+    };
+    return Status::OK();
+  }
+
+  // format Strings with \"\n\r\t\\ escaped
+  Status Visit(const StringType&) {
+    impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+      *os << "\"" << Escape(checked_cast<const StringArray&>(array).GetView(index))
+          << "\"";
+    };
+    return Status::OK();
+  }
+
+  // format Decimals with Decimal128Array::FormatValue
+  Status Visit(const Decimal128Type&) {
+    impl_ = [](const Array& array, int64_t index, std::ostream* os) {
+      *os << checked_cast<const Decimal128Array&>(array).FormatValue(index);
+    };
+    return Status::OK();
+  }
+
+  Status Visit(const ListType& t) {
+    struct ListImpl {
+      explicit ListImpl(Formatter f) : values_formatter_(std::move(f)) {}
+
+      void operator()(const Array& array, int64_t index, std::ostream* os) {
+        const auto& list_array = checked_cast<const ListArray&>(array);
+        *os << "[";
+        for (int32_t i = 0; i < list_array.value_length(index); ++i) {
+          if (i != 0) {
+            *os << ", ";
+          }
+          values_formatter_(*list_array.values(), i + list_array.value_offset(index), os);
+        }
+        *os << "]";
+      }
+
+      Formatter values_formatter_;
+    };
+
+    ARROW_ASSIGN_OR_RAISE(auto values_formatter, MakeFormatter(*t.value_type()));
+    impl_ = ListImpl(std::move(values_formatter));
+    return Status::OK();
+  }
+
+  // TODO(bkietz) format maps better
+
+  Status Visit(const StructType& t) {
+    struct StructImpl {
+      explicit StructImpl(std::vector<Formatter> f) : field_formatters_(std::move(f)) {}
+
+      void operator()(const Array& array, int64_t index, std::ostream* os) {
+        const auto& struct_array = checked_cast<const StructArray&>(array);
+        *os << "{";
+        for (int i = 0, printed = 0; i < struct_array.num_fields(); ++i) {
+          if (printed != 0) {
+            *os << ", ";
+          }
+          if (struct_array.field(i)->IsNull(index)) {
+            continue;
+          }
+          ++printed;
+          *os << struct_array.struct_type()->child(i)->name() << ": ";
+          field_formatters_[i](*struct_array.field(i), index, os);
+        }
+        *os << "}";
+      }
+
+      std::vector<Formatter> field_formatters_;
+    };
+
+    std::vector<Formatter> field_formatters(t.num_children());
+    for (int i = 0; i < t.num_children(); ++i) {
+      ARROW_ASSIGN_OR_RAISE(field_formatters[i], MakeFormatter(*t.child(i)->type()));
+    }
+
+    impl_ = StructImpl(std::move(field_formatters));
+    return Status::OK();
+  }
+
+  Status Visit(const UnionType& t) {
+    struct UnionImpl {
+      UnionImpl(std::vector<Formatter> f, std::vector<int> c)
+          : field_formatters_(std::move(f)), type_id_to_child_index_(std::move(c)) {}
+
+      void DoFormat(const UnionArray& array, int64_t index, int64_t child_index,
+                    std::ostream* os) {
+        auto type_id = array.raw_type_ids()[index];
+        const auto& child = *array.child(type_id_to_child_index_[type_id]);
+
+        *os << "{" << static_cast<int16_t>(type_id) << ": ";
+        if (child.IsNull(child_index)) {
+          *os << "null";
+        } else {
+          field_formatters_[type_id](child, child_index, os);
+        }
+        *os << "}";
+      }
+
+      std::vector<Formatter> field_formatters_;
+      std::vector<int> type_id_to_child_index_;
+    };
+
+    struct SparseImpl : UnionImpl {
+      using UnionImpl::UnionImpl;
+
+      void operator()(const Array& array, int64_t index, std::ostream* os) {
+        const auto& union_array = checked_cast<const UnionArray&>(array);
+        DoFormat(union_array, index, index, os);
+      }
+    };
+
+    struct DenseImpl : UnionImpl {
+      using UnionImpl::UnionImpl;
+
+      void operator()(const Array& array, int64_t index, std::ostream* os) {
+        const auto& union_array = checked_cast<const UnionArray&>(array);
+        DoFormat(union_array, index, union_array.raw_value_offsets()[index], os);
+      }
+    };
+
+    std::vector<Formatter> field_formatters(t.max_type_code() + 1);
+    std::vector<int> type_id_to_child_index(field_formatters.size());
+    for (int i = 0; i < t.num_children(); ++i) {
+      auto type_id = t.type_codes()[i];
+      type_id_to_child_index[type_id] = i;
+      ARROW_ASSIGN_OR_RAISE(field_formatters[type_id],
+                            MakeFormatter(*t.child(i)->type()));
+    }
+
+    if (t.mode() == UnionMode::SPARSE) {
+      impl_ = SparseImpl(std::move(field_formatters), std::move(type_id_to_child_index));
+    } else {
+      impl_ = DenseImpl(std::move(field_formatters), std::move(type_id_to_child_index));
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DataType& t) {
+    return Status::NotImplemented("formatting diffs between arrays of type ", t);
+  }
+
+  template <typename T, bool AddEpoch>
+  Formatter MakeTimeFormatter(const std::string& fmt_str) {
+    return [fmt_str](const Array& array, int64_t index, std::ostream* os) {
+      auto fmt = fmt_str.c_str();
+      auto unit = checked_cast<const T&>(*array.type()).unit();
+      auto value = checked_cast<const NumericArray<T>&>(array).Value(index);
+      using arrow_vendored::date::format;
+      using std::chrono::nanoseconds;
+      using std::chrono::microseconds;
+      using std::chrono::milliseconds;
+      using std::chrono::seconds;
+      if (AddEpoch) {
+        static arrow_vendored::date::sys_days epoch{arrow_vendored::date::jan / 1 / 1970};
+
+        switch (unit) {
+          case TimeUnit::NANO:
+            *os << format(fmt, static_cast<nanoseconds>(value) + epoch);
+            break;
+          case TimeUnit::MICRO:
+            *os << format(fmt, static_cast<microseconds>(value) + epoch);
+            break;
+          case TimeUnit::MILLI:
+            *os << format(fmt, static_cast<milliseconds>(value) + epoch);
+            break;
+          case TimeUnit::SECOND:
+            *os << format(fmt, static_cast<seconds>(value) + epoch);
+            break;
+        }
+        return;
+      }
+      switch (unit) {
+        case TimeUnit::NANO:
+          *os << format(fmt, static_cast<nanoseconds>(value));
+          break;
+        case TimeUnit::MICRO:
+          *os << format(fmt, static_cast<microseconds>(value));
+          break;
+        case TimeUnit::MILLI:
+          *os << format(fmt, static_cast<milliseconds>(value));
+          break;
+        case TimeUnit::SECOND:
+          *os << format(fmt, static_cast<seconds>(value));
+          break;
+      }
+    };
+  }
+
+  Formatter impl_;
+};
+
+static Result<Formatter> MakeFormatter(const DataType& type) {
+  return MakeFormatterImpl{}.Make(type);
+}
+
+Status VisitEditScript(
+    const Array& edits,
+    const std::function<Status(int64_t delete_begin, int64_t delete_end,
+                               int64_t insert_begin, int64_t insert_end)>& visitor) {
+  static const auto edits_type =
+      struct_({field("insert", boolean()), field("run_length", int64())});
+  DCHECK(edits.type()->Equals(*edits_type));
+  DCHECK_GE(edits.length(), 1);
+
+  auto insert = checked_pointer_cast<BooleanArray>(
+      checked_cast<const StructArray&>(edits).field(0));
+  auto run_lengths =
+      checked_pointer_cast<Int64Array>(checked_cast<const StructArray&>(edits).field(1));
+
+  DCHECK(!insert->Value(0));
+
+  auto length = run_lengths->Value(0);
+  int64_t base_begin, base_end, target_begin, target_end;
+  base_begin = base_end = target_begin = target_end = length;
+  for (int64_t i = 1; i < edits.length(); ++i) {
+    if (insert->Value(i)) {
+      ++target_end;
+    } else {
+      ++base_end;
+    }
+    length = run_lengths->Value(i);
+    if (length != 0) {
+      RETURN_NOT_OK(visitor(base_begin, base_end, target_begin, target_end));
+      base_begin = base_end = base_end + length;
+      target_begin = target_end = target_end + length;
+    }
+  }
+  if (length == 0) {
+    return visitor(base_begin, base_end, target_begin, target_end);
+  }
+  return Status::OK();
+}
+
+class UnifiedDiffFormatter {
+ public:
+  UnifiedDiffFormatter(std::ostream* os, Formatter formatter)
+      : os_(os), formatter_(std::move(formatter)) {}
+
+  Status operator()(int64_t delete_begin, int64_t delete_end, int64_t insert_begin,
+                    int64_t insert_end) {
+    *os_ << "@@ -" << delete_begin << ", +" << insert_begin << " @@" << std::endl;
+
+    for (int64_t i = delete_begin; i < delete_end; ++i) {
+      *os_ << "-";
+      if (base_->IsValid(i)) {
+        formatter_(*base_, i, &*os_);
+      } else {
+        *os_ << "null";
+      }
+      *os_ << std::endl;
+    }
+
+    for (int64_t i = insert_begin; i < insert_end; ++i) {
+      *os_ << "+";
+      if (target_->IsValid(i)) {
+        formatter_(*target_, i, &*os_);
+      } else {
+        *os_ << "null";
+      }
+      *os_ << std::endl;
+    }
+
+    return Status::OK();
+  }
+
+  Status operator()(const Array& edits, const Array& base, const Array& target) {
+    if (edits.length() == 1) {
+      return Status::OK();
+    }
+    base_ = &base;
+    target_ = &target;
+    *os_ << std::endl;
+    return VisitEditScript(edits, *this);
+  }
+
+ private:
+  std::ostream* os_ = nullptr;
+  const Array* base_ = nullptr;
+  const Array* target_ = nullptr;
+  Formatter formatter_;
+};
+
+Result<std::function<Status(const Array& edits, const Array& base, const Array& target)>>
+MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os) {
+  ARROW_ASSIGN_OR_RAISE(auto formatter, MakeFormatter(type));
+  return UnifiedDiffFormatter(os, std::move(formatter));
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/diff.h b/cpp/src/arrow/array/diff.h
new file mode 100644
index 0000000..a7e654f
--- /dev/null
+++ b/cpp/src/arrow/array/diff.h
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <iosfwd>
+#include <memory>
+
+#include "arrow/array.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+/// \brief Compare two arrays, returning an edit script which expresses the difference
+/// between them
+///
+/// An edit script is an array of struct(insert: bool, run_length: int64_t).
+/// Each element of "insert" determines whether an element was inserted into (true)
+/// or deleted from (false) base. Each insertion or deletion is followed by a run of
+/// elements which are unchanged from base to target; the length of this run is stored
+/// in "run_length". (Note that the edit script begins and ends with a run of shared
+/// elements but both fields of the struct must have the same length. To accomodate this
+/// the first element of "insert" should be ignored.)
+///
+/// For example for base "hlloo" and target "hello", the edit script would be
+/// [
+///   {"insert": false, "run_length": 1}, // leading run of length 1 ("h")
+///   {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
+///   {"insert": false, "run_length": 0} // delete("o") then an empty run
+/// ]
+///
+/// Diffing arrays containing nulls is not currently supported.
+///
+/// \param[in] base baseline for comparison
+/// \param[in] target an array of identical type to base whose elements differ from base's
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return an edit script array which can be applied to base to produce target
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
+                                          MemoryPool* pool);
+
+/// \brief visitor interface for easy traversal of an edit script
+///
+/// visitor will be called for each hunk of insertions and deletions.
+ARROW_EXPORT Status VisitEditScript(
+    const Array& edits,
+    const std::function<Status(int64_t delete_begin, int64_t delete_end,
+                               int64_t insert_begin, int64_t insert_end)>& visitor);
+
+/// \brief return a function which will format an edit script in unified
+/// diff format to os, given base and target arrays of type
+ARROW_EXPORT Result<
+    std::function<Status(const Array& edits, const Array& base, const Array& target)>>
+MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os);
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index ff4c2b5..05a1d1f 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -30,6 +30,7 @@
 #include <vector>
 
 #include "arrow/array.h"
+#include "arrow/array/diff.h"
 #include "arrow/buffer.h"
 #include "arrow/scalar.h"
 #include "arrow/sparse_tensor.h"
@@ -929,14 +930,61 @@ class ScalarEqualsVisitor {
   bool result_;
 };
 
+Status PrintDiff(const Array& left, const Array& right, std::ostream* os) {
+  if (os == nullptr) {
+    return Status::OK();
+  }
+
+  if (!left.type()->Equals(right.type())) {
+    *os << "# Array types differed: " << *left.type() << " vs " << *right.type();
+    return Status::OK();
+  }
+
+  if (left.type()->id() == Type::DICTIONARY) {
+    *os << "# Dictionary arrays differed" << std::endl;
+
+    const auto& left_dict = checked_cast<const DictionaryArray&>(left);
+    const auto& right_dict = checked_cast<const DictionaryArray&>(right);
+
+    *os << "## dictionary diff";
+    auto pos = os->tellp();
+    RETURN_NOT_OK(PrintDiff(*left_dict.dictionary(), *right_dict.dictionary(), os));
+    if (os->tellp() == pos) {
+      *os << std::endl;
+    }
+
+    *os << "## indices diff";
+    pos = os->tellp();
+    RETURN_NOT_OK(PrintDiff(*left_dict.indices(), *right_dict.indices(), os));
+    if (os->tellp() == pos) {
+      *os << std::endl;
+    }
+    return Status::OK();
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto edits, Diff(left, right, default_memory_pool()));
+  ARROW_ASSIGN_OR_RAISE(auto formatter, MakeUnifiedDiffFormatter(*left.type(), os));
+  return formatter(*edits, left, right);
+}
+
 }  // namespace internal
 
 bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts) {
-  return internal::ArrayEqualsImpl<internal::ArrayEqualsVisitor>(left, right, opts);
+  bool are_equal =
+      internal::ArrayEqualsImpl<internal::ArrayEqualsVisitor>(left, right, opts);
+  if (!are_equal) {
+    DCHECK_OK(internal::PrintDiff(left, right, opts.diff_sink()));
+  }
+  return are_equal;
 }
 
 bool ArrayApproxEquals(const Array& left, const Array& right, const EqualOptions& opts) {
-  return internal::ArrayEqualsImpl<internal::ApproxEqualsVisitor>(left, right, opts);
+  bool are_equal =
+      internal::ArrayEqualsImpl<internal::ApproxEqualsVisitor>(left, right, opts);
+  if (!are_equal) {
+    DCHECK_OK(internal::PrintDiff(left, right, opts.diff_sink()));
+  }
+  return are_equal;
 }
 
 bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h
index 21da16b..eba5636 100644
--- a/cpp/src/arrow/compare.h
+++ b/cpp/src/arrow/compare.h
@@ -21,7 +21,9 @@
 #define ARROW_COMPARE_H
 
 #include <cstdint>
+#include <iosfwd>
 
+#include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -57,11 +59,23 @@ class EqualOptions {
     return res;
   }
 
+  /// The ostream to which a diff will be formatted if arrays disagree.
+  /// If this is null (the default) no diff will be formatted.
+  std::ostream* diff_sink() const { return diff_sink_; }
+
+  /// Return a new EqualOptions object with the "diff_sink" property changed.
+  EqualOptions diff_sink(std::ostream* diff_sink) const {
+    auto res = EqualOptions(*this);
+    res.diff_sink_ = diff_sink;
+    return res;
+  }
+
   static EqualOptions Defaults() { return EqualOptions(); }
 
  protected:
   double atol_ = kDefaultAbsoluteTolerance;
   bool nans_equal_ = false;
+  std::ostream* diff_sink_ = NULLPTR;
 };
 
 /// Returns true if the arrays are exactly equal
diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc
index 70870a7..4944764 100644
--- a/cpp/src/arrow/testing/gtest_util.cc
+++ b/cpp/src/arrow/testing/gtest_util.cc
@@ -66,7 +66,10 @@ void AssertTsEqual(const T& expected, const T& actual) {
 }
 
 void AssertArraysEqual(const Array& expected, const Array& actual) {
-  AssertTsEqual(expected, actual);
+  std::stringstream diff;
+  if (!expected.Equals(actual, EqualOptions().diff_sink(&diff))) {
+    FAIL() << diff.str();
+  }
 }
 
 void AssertBatchesEqual(const RecordBatch& expected, const RecordBatch& actual) {
@@ -76,19 +79,14 @@ void AssertBatchesEqual(const RecordBatch& expected, const RecordBatch& actual)
 void AssertChunkedEqual(const ChunkedArray& expected, const ChunkedArray& actual) {
   ASSERT_EQ(expected.num_chunks(), actual.num_chunks()) << "# chunks unequal";
   if (!actual.Equals(expected)) {
-    std::stringstream pp_result;
-    std::stringstream pp_expected;
-
+    std::stringstream diff;
     for (int i = 0; i < actual.num_chunks(); ++i) {
       auto c1 = actual.chunk(i);
       auto c2 = expected.chunk(i);
-      if (!c1->Equals(*c2)) {
-        ARROW_EXPECT_OK(::arrow::PrettyPrint(*c1, 0, &pp_result));
-        ARROW_EXPECT_OK(::arrow::PrettyPrint(*c2, 0, &pp_expected));
-        FAIL() << "Chunk " << i << " Got: " << pp_result.str()
-               << "\nExpected: " << pp_expected.str();
-      }
+      diff << "# chunk " << i << std::endl;
+      ARROW_IGNORE_EXPR(c1->Equals(c2, EqualOptions().diff_sink(&diff)));
     }
+    FAIL() << diff.str();
   }
 }
 
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 86df73b..2e4e1b3 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -583,6 +583,7 @@ class ARROW_EXPORT MapType : public ListType {
 class ARROW_EXPORT FixedSizeListType : public NestedType {
  public:
   static constexpr Type::type type_id = Type::FIXED_SIZE_LIST;
+  using offset_type = int32_t;
 
   static constexpr const char* type_name() { return "fixed_size_list"; }
 
diff --git a/cpp/src/arrow/util/lazy.h b/cpp/src/arrow/util/lazy.h
index de32b5f..55c68b3 100644
--- a/cpp/src/arrow/util/lazy.h
+++ b/cpp/src/arrow/util/lazy.h
@@ -67,13 +67,17 @@ class LazyRange {
     using _Unchecked_type = typename LazyRange<Generator>::RangeIter;
 #endif
 
+    RangeIter() = delete;
+    RangeIter(const RangeIter& other) = default;
+    RangeIter& operator=(const RangeIter& other) = default;
+
     RangeIter(const LazyRange<Generator>& range, int64_t index)
-        : range_(range), index_(index) {}
+        : range_(&range), index_(index) {}
 
-    const return_type operator*() { return range_.gen_(index_); }
+    const return_type operator*() const { return range_->gen_(index_); }
 
-    RangeIter operator+(difference_type length) {
-      return RangeIter(range_, index_ + length);
+    RangeIter operator+(difference_type length) const {
+      return RangeIter(*range_, index_ + length);
     }
 
     // pre-increment
@@ -90,20 +94,24 @@ class LazyRange {
     }
 
     bool operator==(const typename LazyRange<Generator>::RangeIter& other) const {
-      return this->index_ == other.index_ && &this->range_ == &other.range_;
+      return this->index_ == other.index_ && this->range_ == other.range_;
     }
 
     bool operator!=(const typename LazyRange<Generator>::RangeIter& other) const {
-      return this->index_ != other.index_ || &this->range_ != &other.range_;
+      return this->index_ != other.index_ || this->range_ != other.range_;
     }
 
-    int64_t operator-(const typename LazyRange<Generator>::RangeIter& other) {
+    int64_t operator-(const typename LazyRange<Generator>::RangeIter& other) const {
       return this->index_ - other.index_;
     }
 
+    bool operator<(const typename LazyRange<Generator>::RangeIter& other) const {
+      return this->index_ < other.index_;
+    }
+
    private:
     // parent range reference
-    const LazyRange& range_;
+    const LazyRange* range_;
     // current index
     int64_t index_;
   };
diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.cc
similarity index 63%
copy from cpp/src/arrow/util/string.h
copy to cpp/src/arrow/util/string.cc
index 1d716c5..727ec41 100644
--- a/cpp/src/arrow/util/string.h
+++ b/cpp/src/arrow/util/string.cc
@@ -15,20 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef ARROW_UTIL_STRING_UTIL_H
-#define ARROW_UTIL_STRING_UTIL_H
+#include "arrow/util/string.h"
 
 #include <algorithm>
-#include <string>
 
 #include "arrow/status.h"
-#include "arrow/util/string_view.h"
 
 namespace arrow {
 
 static const char* kAsciiTable = "0123456789ABCDEF";
 
-static inline std::string HexEncode(const uint8_t* data, size_t length) {
+std::string HexEncode(const uint8_t* data, size_t length) {
   std::string hex_string;
   hex_string.reserve(length * 2);
   for (size_t j = 0; j < length; ++j) {
@@ -39,15 +36,42 @@ static inline std::string HexEncode(const uint8_t* data, size_t length) {
   return hex_string;
 }
 
-static inline std::string HexEncode(const char* data, size_t length) {
-  return HexEncode(reinterpret_cast<const uint8_t*>(data), length);
+std::string Escape(const char* data, size_t length) {
+  std::string escaped_string;
+  escaped_string.reserve(length);
+  for (size_t j = 0; j < length; ++j) {
+    switch (data[j]) {
+      case '"':
+        escaped_string += R"(\")";
+        break;
+      case '\\':
+        escaped_string += R"(\\)";
+        break;
+      case '\t':
+        escaped_string += R"(\t)";
+        break;
+      case '\r':
+        escaped_string += R"(\r)";
+        break;
+      case '\n':
+        escaped_string += R"(\n)";
+        break;
+      default:
+        escaped_string.push_back(data[j]);
+    }
+  }
+  return escaped_string;
 }
 
-static inline std::string HexEncode(util::string_view str) {
-  return HexEncode(str.data(), str.size());
+std::string HexEncode(const char* data, size_t length) {
+  return HexEncode(reinterpret_cast<const uint8_t*>(data), length);
 }
 
-static inline Status ParseHexValue(const char* data, uint8_t* out) {
+std::string HexEncode(util::string_view str) { return HexEncode(str.data(), str.size()); }
+
+std::string Escape(util::string_view str) { return Escape(str.data(), str.size()); }
+
+Status ParseHexValue(const char* data, uint8_t* out) {
   char c1 = data[0];
   char c2 = data[1];
 
@@ -64,5 +88,3 @@ static inline Status ParseHexValue(const char* data, uint8_t* out) {
 }
 
 }  // namespace arrow
-
-#endif  // ARROW_UTIL_STRING_UTIL_H
diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h
index 1d716c5..7a5949f 100644
--- a/cpp/src/arrow/util/string.h
+++ b/cpp/src/arrow/util/string.h
@@ -15,54 +15,27 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef ARROW_UTIL_STRING_UTIL_H
-#define ARROW_UTIL_STRING_UTIL_H
+#pragma once
 
-#include <algorithm>
 #include <string>
 
-#include "arrow/status.h"
 #include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
 
 namespace arrow {
 
-static const char* kAsciiTable = "0123456789ABCDEF";
+class Status;
 
-static inline std::string HexEncode(const uint8_t* data, size_t length) {
-  std::string hex_string;
-  hex_string.reserve(length * 2);
-  for (size_t j = 0; j < length; ++j) {
-    // Convert to 2 base16 digits
-    hex_string.push_back(kAsciiTable[data[j] >> 4]);
-    hex_string.push_back(kAsciiTable[data[j] & 15]);
-  }
-  return hex_string;
-}
+ARROW_EXPORT std::string HexEncode(const uint8_t* data, size_t length);
 
-static inline std::string HexEncode(const char* data, size_t length) {
-  return HexEncode(reinterpret_cast<const uint8_t*>(data), length);
-}
+ARROW_EXPORT std::string Escape(const char* data, size_t length);
 
-static inline std::string HexEncode(util::string_view str) {
-  return HexEncode(str.data(), str.size());
-}
+ARROW_EXPORT std::string HexEncode(const char* data, size_t length);
 
-static inline Status ParseHexValue(const char* data, uint8_t* out) {
-  char c1 = data[0];
-  char c2 = data[1];
+ARROW_EXPORT std::string HexEncode(util::string_view str);
 
-  const char* pos1 = std::lower_bound(kAsciiTable, kAsciiTable + 16, c1);
-  const char* pos2 = std::lower_bound(kAsciiTable, kAsciiTable + 16, c2);
+ARROW_EXPORT std::string Escape(util::string_view str);
 
-  // Error checking
-  if (*pos1 != c1 || *pos2 != c2) {
-    return Status::Invalid("Encountered non-hex digit");
-  }
-
-  *out = static_cast<uint8_t>((pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable));
-  return Status::OK();
-}
+ARROW_EXPORT Status ParseHexValue(const char* data, uint8_t* out);
 
 }  // namespace arrow
-
-#endif  // ARROW_UTIL_STRING_UTIL_H