You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by fs...@apache.org on 2019/07/12 14:40:24 UTC

[arrow] branch master updated: ARROW-5588: [C++] Better support for building union arrays

This is an automated email from the ASF dual-hosted git repository.

fsaintjacques pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1bcfbe1  ARROW-5588: [C++] Better support for building union arrays
1bcfbe1 is described below

commit 1bcfbe107067af4a6de486623683fe744614f500
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Fri Jul 12 10:40:03 2019 -0400

    ARROW-5588: [C++] Better support for building union arrays
    
    - Simplify DenseUnionBuilder
    - Add SparesUnionBuilder
    - MakeBuilder can now produce a {Sparse,Dense}UnionBuilder
    - ArrayFromJSON can now produce union arrays
    
    Author: Benjamin Kietzman <be...@gmail.com>
    
    Closes #4781 from bkietz/5588-Better-support-for-building-UnionArrays and squashes the following commits:
    
    38e2828a0 <Benjamin Kietzman> iwyu #include <limits>
    0efe91d77 <Benjamin Kietzman> address review comments
    17e6e27aa <Benjamin Kietzman> construct offset_builder_ with a MemoryPool
    4131fe3fb <Benjamin Kietzman> separate child builder array indexable by type_id
    fd64c1ba7 <Benjamin Kietzman> rewrite union builder to share a base class, let children_ be indexed by type_id
    37de5f2d5 <Benjamin Kietzman> explicit uint8_t for msvc
    673916e50 <Benjamin Kietzman> Disable ListOfDictionary test until ListBuilder is updated
    cf1c5bec5 <Benjamin Kietzman> revert changes to reader.cc
    5742db998 <Benjamin Kietzman> debugging: highlight the broken case and a similar one
    5b1ec9361 <Benjamin Kietzman> improve doccomments, dedupe test code
    33fade16e <Benjamin Kietzman> Adding support for DenseUnions to ArrayFromJSON
    6245c82f1 <Benjamin Kietzman> add SparseUnionBuilder and MakeBuilder case
    8d4f36dcf <Benjamin Kietzman> add tests for building lists where the value builder has mutable type
    351905dd9 <Benjamin Kietzman> add test for lazily typed union builder
    7902d12c8 <Benjamin Kietzman> first pass at updating DenseUnionBuilder
    d20055ad9 <Benjamin Kietzman> minor refactors, adding some asserts
---
 cpp/src/arrow/array-dict-test.cc           |  45 ++++++
 cpp/src/arrow/array-union-test.cc          | 215 +++++++++++++++++++++++++++++
 cpp/src/arrow/array.cc                     |   1 +
 cpp/src/arrow/array.h                      |   7 +-
 cpp/src/arrow/array/builder_nested.cc      |   4 +-
 cpp/src/arrow/array/builder_nested.h       |   2 +-
 cpp/src/arrow/array/builder_union.cc       |  95 ++++++++++---
 cpp/src/arrow/array/builder_union.h        | 162 +++++++++++++++-------
 cpp/src/arrow/builder.cc                   |  24 +++-
 cpp/src/arrow/builder.h                    |   1 +
 cpp/src/arrow/compare.cc                   |  44 ++----
 cpp/src/arrow/ipc/json-simple-test.cc      | 190 +++++++++++++++++++++++++
 cpp/src/arrow/ipc/json-simple.cc           |  89 ++++++++++++
 cpp/src/arrow/python/serialize.cc          |  10 +-
 cpp/src/arrow/type.cc                      |   9 ++
 cpp/src/arrow/type.h                       |   2 +
 python/pyarrow/tests/test_serialization.py |   8 ++
 17 files changed, 788 insertions(+), 120 deletions(-)

diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc
index 16d8aac..ed37df3 100644
--- a/cpp/src/arrow/array-dict-test.cc
+++ b/cpp/src/arrow/array-dict-test.cc
@@ -948,4 +948,49 @@ TEST(TestDictionary, TransposeNulls) {
   AssertArraysEqual(*expected, *out);
 }
 
+TEST(TestDictionary, DISABLED_ListOfDictionary) {
+  std::unique_ptr<ArrayBuilder> root_builder;
+  ASSERT_OK(MakeBuilder(default_memory_pool(), list(dictionary(int8(), utf8())),
+                        &root_builder));
+  auto list_builder = checked_cast<ListBuilder*>(root_builder.get());
+  auto dict_builder =
+      checked_cast<DictionaryBuilder<StringType>*>(list_builder->value_builder());
+
+  ASSERT_OK(list_builder->Append());
+  std::vector<std::string> expected;
+  for (char a : "abc") {
+    for (char d : "def") {
+      for (char g : "ghi") {
+        for (char j : "jkl") {
+          for (char m : "mno") {
+            for (char p : "pqr") {
+              if ((static_cast<int>(a) + d + g + j + m + p) % 16 == 0) {
+                ASSERT_OK(list_builder->Append());
+              }
+              // 3**6 distinct strings; too large for int8
+              char str[6] = {a, d, g, j, m, p};
+              ASSERT_OK(dict_builder->Append(str));
+              expected.push_back(str);
+            }
+          }
+        }
+      }
+    }
+  }
+  std::shared_ptr<Array> expected_dict;
+  ArrayFromVector<StringType, std::string>(expected, &expected_dict);
+
+  std::shared_ptr<Array> array;
+  ASSERT_OK(root_builder->Finish(&array));
+  ASSERT_OK(ValidateArray(*array));
+
+  auto expected_type = list(dictionary(int16(), utf8()));
+  ASSERT_EQ(array->type()->ToString(), expected_type->ToString());
+
+  auto list_array = checked_cast<const ListArray*>(array.get());
+  auto actual_dict =
+      checked_cast<const DictionaryArray&>(*list_array->values()).dictionary();
+  ASSERT_ARRAYS_EQUAL(*expected_dict, *actual_dict);
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc
index 86cbeae..62a4e15 100644
--- a/cpp/src/arrow/array-union-test.cc
+++ b/cpp/src/arrow/array-union-test.cc
@@ -188,4 +188,219 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) {
                   type_codes);
 }
 
+template <typename B>
+class UnionBuilderTest : public ::testing::Test {
+ public:
+  uint8_t I8 = 8, STR = 13, DBL = 7;
+
+  virtual void AppendInt(int8_t i) {
+    expected_types_vector.push_back(I8);
+    ASSERT_OK(union_builder->Append(I8));
+    ASSERT_OK(i8_builder->Append(i));
+  }
+
+  virtual void AppendString(const std::string& str) {
+    expected_types_vector.push_back(STR);
+    ASSERT_OK(union_builder->Append(STR));
+    ASSERT_OK(str_builder->Append(str));
+  }
+
+  virtual void AppendDouble(double dbl) {
+    expected_types_vector.push_back(DBL);
+    ASSERT_OK(union_builder->Append(DBL));
+    ASSERT_OK(dbl_builder->Append(dbl));
+  }
+
+  void AppendBasics() {
+    AppendInt(33);
+    AppendString("abc");
+    AppendDouble(1.0);
+    AppendDouble(-1.0);
+    AppendString("");
+    AppendInt(10);
+    AppendString("def");
+    AppendInt(-10);
+    AppendDouble(0.5);
+    ASSERT_OK(union_builder->Finish(&actual));
+    ArrayFromVector<Int8Type, uint8_t>(expected_types_vector, &expected_types);
+  }
+
+  void AppendInferred() {
+    I8 = union_builder->AppendChild(i8_builder, "i8");
+    ASSERT_EQ(I8, 0);
+    AppendInt(33);
+    AppendInt(10);
+
+    STR = union_builder->AppendChild(str_builder, "str");
+    ASSERT_EQ(STR, 1);
+    AppendString("abc");
+    AppendString("");
+    AppendString("def");
+    AppendInt(-10);
+
+    DBL = union_builder->AppendChild(dbl_builder, "dbl");
+    ASSERT_EQ(DBL, 2);
+    AppendDouble(1.0);
+    AppendDouble(-1.0);
+    AppendDouble(0.5);
+    ASSERT_OK(union_builder->Finish(&actual));
+    ArrayFromVector<Int8Type, uint8_t>(expected_types_vector, &expected_types);
+
+    ASSERT_EQ(I8, 0);
+    ASSERT_EQ(STR, 1);
+    ASSERT_EQ(DBL, 2);
+  }
+
+  void AppendListOfInferred(std::shared_ptr<ListArray>* actual) {
+    ListBuilder list_builder(default_memory_pool(), union_builder);
+
+    ASSERT_OK(list_builder.Append());
+    I8 = union_builder->AppendChild(i8_builder, "i8");
+    ASSERT_EQ(I8, 0);
+    AppendInt(10);
+
+    ASSERT_OK(list_builder.Append());
+    STR = union_builder->AppendChild(str_builder, "str");
+    ASSERT_EQ(STR, 1);
+    AppendString("abc");
+    AppendInt(-10);
+
+    ASSERT_OK(list_builder.Append());
+    DBL = union_builder->AppendChild(dbl_builder, "dbl");
+    ASSERT_EQ(DBL, 2);
+    AppendDouble(0.5);
+
+    ASSERT_OK(list_builder.Finish(actual));
+    ArrayFromVector<Int8Type, uint8_t>(expected_types_vector, &expected_types);
+
+    ASSERT_EQ(I8, 0);
+    ASSERT_EQ(STR, 1);
+    ASSERT_EQ(DBL, 2);
+  }
+
+  std::vector<uint8_t> expected_types_vector;
+  std::shared_ptr<Array> expected_types;
+  std::shared_ptr<Int8Builder> i8_builder = std::make_shared<Int8Builder>();
+  std::shared_ptr<StringBuilder> str_builder = std::make_shared<StringBuilder>();
+  std::shared_ptr<DoubleBuilder> dbl_builder = std::make_shared<DoubleBuilder>();
+  std::shared_ptr<B> union_builder{new B(default_memory_pool())};
+  std::shared_ptr<UnionArray> actual;
+};
+
+class DenseUnionBuilderTest : public UnionBuilderTest<DenseUnionBuilder> {};
+class SparseUnionBuilderTest : public UnionBuilderTest<SparseUnionBuilder> {
+ public:
+  using Base = UnionBuilderTest<SparseUnionBuilder>;
+
+  void AppendInt(int8_t i) override {
+    Base::AppendInt(i);
+    ASSERT_OK(str_builder->AppendNull());
+    ASSERT_OK(dbl_builder->AppendNull());
+  }
+
+  void AppendString(const std::string& str) override {
+    Base::AppendString(str);
+    ASSERT_OK(i8_builder->AppendNull());
+    ASSERT_OK(dbl_builder->AppendNull());
+  }
+
+  void AppendDouble(double dbl) override {
+    Base::AppendDouble(dbl);
+    ASSERT_OK(i8_builder->AppendNull());
+    ASSERT_OK(str_builder->AppendNull());
+  }
+};
+
+TEST_F(DenseUnionBuilderTest, Basics) {
+  union_builder.reset(new DenseUnionBuilder(
+      default_memory_pool(), {i8_builder, str_builder, dbl_builder},
+      union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
+             {I8, STR, DBL}, UnionMode::DENSE)));
+  AppendBasics();
+
+  auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]");
+  auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])");
+  auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]");
+
+  auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]");
+
+  std::shared_ptr<Array> expected;
+  ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets,
+                                  {expected_i8, expected_str, expected_dbl},
+                                  {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));
+
+  ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
+  ASSERT_ARRAYS_EQUAL(*expected, *actual);
+}
+
+TEST_F(DenseUnionBuilderTest, InferredType) {
+  AppendInferred();
+
+  auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]");
+  auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])");
+  auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]");
+
+  auto expected_offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2, 0, 1, 2]");
+
+  std::shared_ptr<Array> expected;
+  ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets,
+                                  {expected_i8, expected_str, expected_dbl},
+                                  {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));
+
+  ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
+  ASSERT_ARRAYS_EQUAL(*expected, *actual);
+}
+
+TEST_F(DenseUnionBuilderTest, ListOfInferredType) {
+  std::shared_ptr<ListArray> actual;
+  AppendListOfInferred(&actual);
+
+  auto expected_type =
+      list(union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
+                  {I8, STR, DBL}, UnionMode::DENSE));
+  ASSERT_EQ(expected_type->ToString(), actual->type()->ToString());
+}
+
+TEST_F(SparseUnionBuilderTest, Basics) {
+  union_builder.reset(new SparseUnionBuilder(
+      default_memory_pool(), {i8_builder, str_builder, dbl_builder},
+      union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
+             {I8, STR, DBL}, UnionMode::SPARSE)));
+
+  AppendBasics();
+
+  auto expected_i8 =
+      ArrayFromJSON(int8(), "[33, null, null, null, null, 10, null, -10, null]");
+  auto expected_str =
+      ArrayFromJSON(utf8(), R"([null, "abc", null, null, "",  null, "def", null, null])");
+  auto expected_dbl =
+      ArrayFromJSON(float64(), "[null, null, 1.0, -1.0, null, null, null, null, 0.5]");
+
+  std::shared_ptr<Array> expected;
+  ASSERT_OK(UnionArray::MakeSparse(*expected_types,
+                                   {expected_i8, expected_str, expected_dbl},
+                                   {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));
+
+  ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
+  ASSERT_ARRAYS_EQUAL(*expected, *actual);
+}
+
+TEST_F(SparseUnionBuilderTest, InferredType) {
+  AppendInferred();
+
+  auto expected_i8 =
+      ArrayFromJSON(int8(), "[33, 10, null, null, null, -10, null, null, null]");
+  auto expected_str =
+      ArrayFromJSON(utf8(), R"([null, null, "abc", "", "def",  null, null, null, null])");
+  auto expected_dbl =
+      ArrayFromJSON(float64(), "[null, null, null, null, null, null, 1.0, -1.0, 0.5]");
+
+  std::shared_ptr<Array> expected;
+  ASSERT_OK(UnionArray::MakeSparse(*expected_types,
+                                   {expected_i8, expected_str, expected_dbl},
+                                   {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));
+
+  ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
+  ASSERT_ARRAYS_EQUAL(*expected, *actual);
+}
 }  // namespace arrow
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index bc38559..5f76f08 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -606,6 +606,7 @@ void UnionArray::SetData(const std::shared_ptr<ArrayData>& data) {
 
   ARROW_CHECK_EQ(data->type->id(), Type::UNION);
   ARROW_CHECK_EQ(data->buffers.size(), 3);
+  union_type_ = checked_cast<const UnionType*>(data_->type.get());
 
   auto type_ids = data_->buffers[1];
   auto value_offsets = data_->buffers[2];
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 256bbdc..599a6ea 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -1032,9 +1032,9 @@ class ARROW_EXPORT UnionArray : public Array {
   const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; }
   const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
 
-  UnionMode::type mode() const {
-    return internal::checked_cast<const UnionType&>(*type()).mode();
-  }
+  const UnionType* union_type() const { return union_type_; }
+
+  UnionMode::type mode() const { return union_type_->mode(); }
 
   // Return the given field as an individual array.
   // For sparse unions, the returned array has its offset, length and null
@@ -1047,6 +1047,7 @@ class ARROW_EXPORT UnionArray : public Array {
 
   const type_id_t* raw_type_ids_;
   const int32_t* raw_value_offsets_;
+  const UnionType* union_type_;
 
   // For caching boxed child data
   mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc
index 0bf9171..30b3fc0 100644
--- a/cpp/src/arrow/array/builder_nested.cc
+++ b/cpp/src/arrow/array/builder_nested.cc
@@ -60,7 +60,7 @@ Status ListBuilder::CheckNextOffset() const {
   const int64_t num_values = value_builder_->length();
   ARROW_RETURN_IF(
       num_values > kListMaximumElements,
-      Status::CapacityError("ListArray cannot contain more then 2^31 - 1 child elements,",
+      Status::CapacityError("ListArray cannot contain more than 2^31 - 1 child elements,",
                             " have ", num_values));
   return Status::OK();
 }
@@ -300,7 +300,7 @@ Status FixedSizeListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
 // Struct
 
 StructBuilder::StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
-                             std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders)
+                             std::vector<std::shared_ptr<ArrayBuilder>> field_builders)
     : ArrayBuilder(type, pool) {
   children_ = std::move(field_builders);
 }
diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h
index de03145..8742f2b 100644
--- a/cpp/src/arrow/array/builder_nested.h
+++ b/cpp/src/arrow/array/builder_nested.h
@@ -218,7 +218,7 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
 class ARROW_EXPORT StructBuilder : public ArrayBuilder {
  public:
   StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
-                std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders);
+                std::vector<std::shared_ptr<ArrayBuilder>> field_builders);
 
   Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 
diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc
index f51b7d7..8de786f 100644
--- a/cpp/src/arrow/array/builder_union.cc
+++ b/cpp/src/arrow/array/builder_union.cc
@@ -17,44 +17,101 @@
 
 #include "arrow/array/builder_union.h"
 
+#include <limits>
 #include <utility>
 
+#include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
 
 namespace arrow {
 
-DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool,
-                                     const std::shared_ptr<DataType>& type)
-    : ArrayBuilder(type, pool), types_builder_(pool), offsets_builder_(pool) {}
+using internal::checked_cast;
 
-Status DenseUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  std::shared_ptr<Buffer> types;
+Status BasicUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  std::shared_ptr<Buffer> types, null_bitmap;
+  RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
   RETURN_NOT_OK(types_builder_.Finish(&types));
-  std::shared_ptr<Buffer> offsets;
-  RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
 
-  std::shared_ptr<Buffer> null_bitmap;
-  RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+  // If the type has not been specified in the constructor, gather type_codes
+  std::vector<uint8_t> type_codes;
+  if (type_ == nullptr) {
+    for (size_t i = 0; i < children_.size(); ++i) {
+      if (type_id_to_children_[i] != nullptr) {
+        type_codes.push_back(static_cast<uint8_t>(i));
+      }
+    }
+  } else {
+    type_codes = checked_cast<const UnionType&>(*type_).type_codes();
+  }
+  DCHECK_EQ(type_codes.size(), children_.size());
 
-  std::vector<std::shared_ptr<Field>> fields;
   std::vector<std::shared_ptr<ArrayData>> child_data(children_.size());
-  std::vector<uint8_t> type_ids;
   for (size_t i = 0; i < children_.size(); ++i) {
-    std::shared_ptr<ArrayData> data;
-    RETURN_NOT_OK(children_[i]->FinishInternal(&data));
-    child_data[i] = data;
-    fields.push_back(field(field_names_[i], children_[i]->type()));
-    type_ids.push_back(static_cast<uint8_t>(i));
+    RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i]));
   }
 
   // If the type has not been specified in the constructor, infer it
-  if (!type_) {
-    type_ = union_(fields, type_ids, UnionMode::DENSE);
+  if (type_ == nullptr) {
+    std::vector<std::shared_ptr<Field>> fields;
+    auto field_names_it = field_names_.begin();
+    for (auto&& data : child_data) {
+      fields.push_back(field(*field_names_it++, data->type));
+    }
+    type_ = union_(fields, type_codes, mode_);
   }
 
-  *out = ArrayData::Make(type_, length(), {null_bitmap, types, offsets}, null_count_);
+  *out = ArrayData::Make(type_, length(), {null_bitmap, types, nullptr}, null_count_);
   (*out)->child_data = std::move(child_data);
   return Status::OK();
 }
 
+BasicUnionBuilder::BasicUnionBuilder(
+    MemoryPool* pool, UnionMode::type mode,
+    const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+    const std::shared_ptr<DataType>& type)
+    : ArrayBuilder(type, pool), mode_(mode), types_builder_(pool) {
+  auto union_type = checked_cast<const UnionType*>(type.get());
+  DCHECK_NE(union_type, nullptr);
+  DCHECK_EQ(union_type->mode(), mode);
+
+  children_ = children;
+  type_id_to_children_.resize(union_type->max_type_code() + 1, nullptr);
+  DCHECK_LT(type_id_to_children_.size(), std::numeric_limits<int8_t>::max());
+
+  auto field_it = type->children().begin();
+  auto children_it = children.begin();
+  for (auto type_id : union_type->type_codes()) {
+    type_id_to_children_[type_id] = *children_it++;
+    field_names_.push_back((*field_it++)->name());
+  }
+  DCHECK_EQ(children_it, children.end());
+  DCHECK_EQ(field_it, type->children().end());
+}
+
+int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
+                                      const std::string& field_name) {
+  // force type inferrence in Finish
+  type_ = nullptr;
+
+  field_names_.push_back(field_name);
+  children_.push_back(new_child);
+
+  // Find type_id such that type_id_to_children_[type_id] == nullptr
+  // and use that for the new child. Start searching at dense_type_id_
+  // since type_id_to_children_ is densely packed up at least up to dense_type_id_
+  for (; static_cast<size_t>(dense_type_id_) < type_id_to_children_.size();
+       ++dense_type_id_) {
+    if (type_id_to_children_[dense_type_id_] == nullptr) {
+      type_id_to_children_[dense_type_id_] = new_child;
+      return dense_type_id_++;
+    }
+  }
+
+  DCHECK_LT(type_id_to_children_.size(), std::numeric_limits<int8_t>::max());
+
+  // type_id_to_children_ is already densely packed, so just append the new child
+  type_id_to_children_.push_back(new_child);
+  return dense_type_id_++;
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h
index aac2e54..e04d5d2 100644
--- a/cpp/src/arrow/array/builder_union.h
+++ b/cpp/src/arrow/array/builder_union.h
@@ -27,25 +27,66 @@
 
 namespace arrow {
 
+class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
+ public:
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }
+
+  /// \brief Make a new child builder available to the UnionArray
+  ///
+  /// \param[in] new_child the child builder
+  /// \param[in] field_name the name of the field in the union array type
+  /// if type inference is used
+  /// \return child index, which is the "type" argument that needs
+  /// to be passed to the "Append" method to add a new element to
+  /// the union array.
+  int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
+                     const std::string& field_name = "");
+
+ protected:
+  /// Use this constructor to initialize the UnionBuilder with no child builders,
+  /// allowing type to be inferred. You will need to call AppendChild for each of the
+  /// children builders you want to use.
+  explicit BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode)
+      : ArrayBuilder(NULLPTR, pool), mode_(mode), types_builder_(pool) {}
+
+  /// Use this constructor to specify the type explicitly.
+  /// You can still add child builders to the union after using this constructor
+  BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode,
+                    const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+                    const std::shared_ptr<DataType>& type);
+
+  UnionMode::type mode_;
+  std::vector<std::shared_ptr<ArrayBuilder>> type_id_to_children_;
+  // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr
+  int8_t dense_type_id_ = 0;
+  TypedBufferBuilder<int8_t> types_builder_;
+  std::vector<std::string> field_names_;
+};
+
 /// \class DenseUnionBuilder
 ///
-/// You need to call AppendChild for each of the children builders you want
-/// to use. The function will return an int8_t, which is the type tag
-/// associated with that child. You can then call Append with that tag
-/// (followed by an append on the child builder) to add elements to
-/// the union array.
-///
-/// You can either specify the type when the UnionBuilder is constructed
-/// or let the UnionBuilder infer the type at runtime (by omitting the
-/// type argument from the constructor).
-///
 /// This API is EXPERIMENTAL.
-class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder {
+class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
  public:
-  /// Use this constructor to incrementally build the union array along
-  /// with types, offsets, and null bitmap.
-  explicit DenseUnionBuilder(MemoryPool* pool,
-                             const std::shared_ptr<DataType>& type = NULLPTR);
+  /// Use this constructor to initialize the UnionBuilder with no child builders,
+  /// allowing type to be inferred. You will need to call AppendChild for each of the
+  /// children builders you want to use.
+  explicit DenseUnionBuilder(MemoryPool* pool)
+      : BasicUnionBuilder(pool, UnionMode::DENSE), offsets_builder_(pool) {}
+
+  /// Use this constructor to specify the type explicitly.
+  /// You can still add child builders to the union after using this constructor
+  DenseUnionBuilder(MemoryPool* pool,
+                    const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+                    const std::shared_ptr<DataType>& type)
+      : BasicUnionBuilder(pool, UnionMode::DENSE, children, type),
+        offsets_builder_(pool) {}
 
   Status AppendNull() final {
     ARROW_RETURN_NOT_OK(types_builder_.Append(0));
@@ -54,53 +95,78 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder {
   }
 
   Status AppendNulls(int64_t length) final {
-    ARROW_RETURN_NOT_OK(types_builder_.Reserve(length));
-    ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(length));
-    ARROW_RETURN_NOT_OK(Reserve(length));
-    for (int64_t i = 0; i < length; ++i) {
-      types_builder_.UnsafeAppend(0);
-      offsets_builder_.UnsafeAppend(0);
-    }
+    ARROW_RETURN_NOT_OK(types_builder_.Append(length, 0));
+    ARROW_RETURN_NOT_OK(offsets_builder_.Append(length, 0));
     return AppendToBitmap(length, false);
   }
 
   /// \brief Append an element to the UnionArray. This must be followed
   ///        by an append to the appropriate child builder.
-  /// \param[in] type index of the child the value will be appended
-  /// \param[in] offset offset of the value in that child
-  Status Append(int8_t type, int32_t offset) {
-    ARROW_RETURN_NOT_OK(types_builder_.Append(type));
+  ///
+  /// \param[in] next_type type_id of the child to which the next value will be appended.
+  ///
+  /// The corresponding child builder must be appended to independently after this method
+  /// is called.
+  Status Append(int8_t next_type) {
+    ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
+    if (type_id_to_children_[next_type]->length() == kListMaximumElements) {
+      return Status::CapacityError(
+          "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
+          "child");
+    }
+    auto offset = static_cast<int32_t>(type_id_to_children_[next_type]->length());
     ARROW_RETURN_NOT_OK(offsets_builder_.Append(offset));
     return AppendToBitmap(true);
   }
 
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+    ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out));
+    return offsets_builder_.Finish(&(*out)->buffers[2]);
+  }
 
-  /// \cond FALSE
-  using ArrayBuilder::Finish;
-  /// \endcond
+ private:
+  TypedBufferBuilder<int32_t> offsets_builder_;
+};
 
-  Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }
+/// \class SparseUnionBuilder
+///
+/// This API is EXPERIMENTAL.
+class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
+ public:
+  /// Use this constructor to initialize the UnionBuilder with no child builders,
+  /// allowing type to be inferred. You will need to call AppendChild for each of the
+  /// children builders you want to use.
+  explicit SparseUnionBuilder(MemoryPool* pool)
+      : BasicUnionBuilder(pool, UnionMode::SPARSE) {}
 
-  /// \brief Make a new child builder available to the UnionArray
-  ///
-  /// \param[in] child the child builder
-  /// \param[in] field_name the name of the field in the union array type
-  /// if type inference is used
-  /// \return child index, which is the "type" argument that needs
-  /// to be passed to the "Append" method to add a new element to
-  /// the union array.
-  int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& child,
-                     const std::string& field_name = "") {
-    children_.push_back(child);
-    field_names_.push_back(field_name);
-    return static_cast<int8_t>(children_.size() - 1);
+  /// Use this constructor to specify the type explicitly.
+  /// You can still add child builders to the union after using this constructor
+  SparseUnionBuilder(MemoryPool* pool,
+                     const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+                     const std::shared_ptr<DataType>& type)
+      : BasicUnionBuilder(pool, UnionMode::SPARSE, children, type) {}
+
+  Status AppendNull() final {
+    ARROW_RETURN_NOT_OK(types_builder_.Append(0));
+    return AppendToBitmap(false);
   }
 
- private:
-  TypedBufferBuilder<int8_t> types_builder_;
-  TypedBufferBuilder<int32_t> offsets_builder_;
-  std::vector<std::string> field_names_;
+  Status AppendNulls(int64_t length) final {
+    ARROW_RETURN_NOT_OK(types_builder_.Append(length, 0));
+    return AppendToBitmap(length, false);
+  }
+
+  /// \brief Append an element to the UnionArray. This must be followed
+  ///        by an append to the appropriate child builder.
+  ///
+  /// \param[in] next_type type_id of the child to which the next value will be appended.
+  ///
+  /// The corresponding child builder must be appended to independently after this method
+  /// is called, and all other child builders must have null appended
+  Status Append(int8_t next_type) {
+    ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
+    return AppendToBitmap(true);
+  }
 };
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index f6f8042..cee443c 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -155,14 +155,32 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
 
     case Type::STRUCT: {
       const std::vector<std::shared_ptr<Field>>& fields = type->children();
-      std::vector<std::shared_ptr<ArrayBuilder>> values_builder;
+      std::vector<std::shared_ptr<ArrayBuilder>> field_builders;
 
       for (auto it : fields) {
         std::unique_ptr<ArrayBuilder> builder;
         RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder));
-        values_builder.emplace_back(std::move(builder));
+        field_builders.emplace_back(std::move(builder));
+      }
+      out->reset(new StructBuilder(type, pool, std::move(field_builders)));
+      return Status::OK();
+    }
+
+    case Type::UNION: {
+      const auto& union_type = internal::checked_cast<const UnionType&>(*type);
+      const std::vector<std::shared_ptr<Field>>& fields = type->children();
+      std::vector<std::shared_ptr<ArrayBuilder>> field_builders;
+
+      for (auto it : fields) {
+        std::unique_ptr<ArrayBuilder> builder;
+        RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder));
+        field_builders.emplace_back(std::move(builder));
+      }
+      if (union_type.mode() == UnionMode::DENSE) {
+        out->reset(new DenseUnionBuilder(pool, std::move(field_builders), type));
+      } else {
+        out->reset(new SparseUnionBuilder(pool, std::move(field_builders), type));
       }
-      out->reset(new StructBuilder(type, pool, std::move(values_builder)));
       return Status::OK();
     }
 
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 56c3e2b..223ed58 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -27,6 +27,7 @@
 #include "arrow/array/builder_nested.h"     // IWYU pragma: export
 #include "arrow/array/builder_primitive.h"  // IWYU pragma: export
 #include "arrow/array/builder_time.h"       // IWYU pragma: export
+#include "arrow/array/builder_union.h"      // IWYU pragma: export
 #include "arrow/status.h"
 #include "arrow/util/visibility.h"
 
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index e1525a4..097bc8f 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -234,26 +234,15 @@ class RangeEqualsVisitor {
     const auto& left_type = checked_cast<const UnionType&>(*left.type());
 
     // Define a mapping from the type id to child number
-    uint8_t max_code = 0;
-
     const std::vector<uint8_t>& type_codes = left_type.type_codes();
-    for (size_t i = 0; i < type_codes.size(); ++i) {
-      const uint8_t code = type_codes[i];
-      if (code > max_code) {
-        max_code = code;
-      }
-    }
-
-    // Store mapping in a vector for constant time lookups
-    std::vector<uint8_t> type_id_to_child_num(max_code + 1);
-    for (uint8_t i = 0; i < static_cast<uint8_t>(type_codes.size()); ++i) {
+    std::vector<uint8_t> type_id_to_child_num(left.union_type()->max_type_code() + 1, 0);
+    for (uint8_t i = 0; i < type_codes.size(); ++i) {
       type_id_to_child_num[type_codes[i]] = i;
     }
 
     const uint8_t* left_ids = left.raw_type_ids();
     const uint8_t* right_ids = right.raw_type_ids();
 
-    uint8_t id, child_num;
     for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
          ++i, ++o_i) {
       if (left.IsNull(i) != right.IsNull(o_i)) {
@@ -264,8 +253,7 @@ class RangeEqualsVisitor {
         return false;
       }
 
-      id = left_ids[i];
-      child_num = type_id_to_child_num[id];
+      auto child_num = type_id_to_child_num[left_ids[i]];
 
       // TODO(wesm): really we should be comparing stretches of non-null data
       // rather than looking at one value at a time.
@@ -773,30 +761,16 @@ class TypeEqualsVisitor {
   Status Visit(const UnionType& left) {
     const auto& right = checked_cast<const UnionType&>(right_);
 
-    if (left.mode() != right.mode() ||
-        left.type_codes().size() != right.type_codes().size()) {
+    if (left.mode() != right.mode() || left.type_codes() != right.type_codes()) {
       result_ = false;
       return Status::OK();
     }
 
-    const std::vector<uint8_t>& left_codes = left.type_codes();
-    const std::vector<uint8_t>& right_codes = right.type_codes();
-
-    for (size_t i = 0; i < left_codes.size(); ++i) {
-      if (left_codes[i] != right_codes[i]) {
-        result_ = false;
-        return Status::OK();
-      }
-    }
-
-    for (int i = 0; i < left.num_children(); ++i) {
-      if (!left.child(i)->Equals(right_.child(i), check_metadata_)) {
-        result_ = false;
-        return Status::OK();
-      }
-    }
-
-    result_ = true;
+    result_ = std::equal(
+        left.children().begin(), left.children().end(), right.children().begin(),
+        [this](const std::shared_ptr<Field>& l, const std::shared_ptr<Field>& r) {
+          return l->Equals(r, check_metadata_);
+        });
     return Status::OK();
   }
 
diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc
index 772557b..ce8b21a 100644
--- a/cpp/src/arrow/ipc/json-simple-test.cc
+++ b/cpp/src/arrow/ipc/json-simple-test.cc
@@ -905,6 +905,196 @@ TEST(TestStruct, Errors) {
   ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"c\": 0}]", &array));
 }
 
+TEST(TestDenseUnion, Basics) {
+  auto field_a = field("a", int8());
+  auto field_b = field("b", boolean());
+
+  auto type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE);
+  auto array = ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]");
+
+  auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]");
+  auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]");
+  auto expected_a = ArrayFromJSON(int8(), "[122, null]");
+  auto expected_b = ArrayFromJSON(boolean(), "[true, false]");
+
+  std::shared_ptr<Array> expected;
+  ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets,
+                                  {expected_a, expected_b}, {"a", "b"}, {4, 8},
+                                  &expected));
+
+  ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestSparseUnion, Basics) {
+  auto field_a = field("a", int8());
+  auto field_b = field("b", boolean());
+
+  auto type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE);
+  auto array = ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]");
+
+  auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]");
+  auto expected_a = ArrayFromJSON(int8(), "[122, null, null, null, null]");
+  auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]");
+
+  std::shared_ptr<Array> expected;
+  ASSERT_OK(UnionArray::MakeSparse(*expected_types, {expected_a, expected_b}, {"a", "b"},
+                                   {4, 8}, &expected));
+
+  ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestDenseUnion, ListOfUnion) {
+  auto field_a = field("a", int8());
+  auto field_b = field("b", boolean());
+  auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE);
+  auto list_type = list(union_type);
+  auto array = ArrayFromJSON(list_type,
+                             "["
+                             "[[4, 122], [8, true]],"
+                             "[[4, null], null, [8, false]]"
+                             "]");
+
+  auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]");
+  auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]");
+  auto expected_a = ArrayFromJSON(int8(), "[122, null]");
+  auto expected_b = ArrayFromJSON(boolean(), "[true, false]");
+
+  std::shared_ptr<Array> expected_values, expected;
+  ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets,
+                                  {expected_a, expected_b}, {"a", "b"}, {4, 8},
+                                  &expected_values));
+  auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]");
+  ASSERT_OK(ListArray::FromArrays(*expected_list_offsets, *expected_values,
+                                  default_memory_pool(), &expected));
+
+  ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestSparseUnion, ListOfUnion) {
+  auto field_a = field("a", int8());
+  auto field_b = field("b", boolean());
+  auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE);
+  auto list_type = list(union_type);
+  auto array = ArrayFromJSON(list_type,
+                             "["
+                             "[[4, 122], [8, true]],"
+                             "[[4, null], null, [8, false]]"
+                             "]");
+
+  auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]");
+  auto expected_a = ArrayFromJSON(int8(), "[122, null, null, null, null]");
+  auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]");
+
+  std::shared_ptr<Array> expected_values, expected;
+  ASSERT_OK(UnionArray::MakeSparse(*expected_types, {expected_a, expected_b}, {"a", "b"},
+                                   {4, 8}, &expected_values));
+  auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]");
+  ASSERT_OK(ListArray::FromArrays(*expected_list_offsets, *expected_values,
+                                  default_memory_pool(), &expected));
+
+  ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestDenseUnion, UnionOfStructs) {
+  std::vector<std::shared_ptr<Field>> fields = {
+      field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})),
+      field("wtf", struct_({field("whiskey", int8()), field("tango", float64()),
+                            field("foxtrot", list(int8()))})),
+      field("q", struct_({field("quebec", utf8())}))};
+  auto type = union_(fields, {0, 23, 47}, UnionMode::DENSE);
+  auto array = ArrayFromJSON(type, R"([
+    [0, {"alpha": 0.0, "bravo": "charlie"}],
+    [23, {"whiskey": 99}],
+    [0, {"bravo": "mike"}],
+    null,
+    [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}]
+  ])");
+
+  auto expected_types = ArrayFromJSON(int8(), "[0, 23, 0, null, 23]");
+  auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]");
+  ArrayVector expected_fields = {ArrayFromJSON(fields[0]->type(), R"([
+      {"alpha": 0.0, "bravo": "charlie"},
+      {"bravo": "mike"}
+    ])"),
+                                 ArrayFromJSON(fields[1]->type(), R"([
+      {"whiskey": 99},
+      {"tango": 8.25, "foxtrot": [0, 2, 3]}
+    ])"),
+                                 ArrayFromJSON(fields[2]->type(), "[]")};
+
+  std::shared_ptr<Array> expected;
+  ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, expected_fields,
+                                  {"ab", "wtf", "q"}, {0, 23, 47}, &expected));
+
+  ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestSparseUnion, UnionOfStructs) {
+  std::vector<std::shared_ptr<Field>> fields = {
+      field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})),
+      field("wtf", struct_({field("whiskey", int8()), field("tango", float64()),
+                            field("foxtrot", list(int8()))})),
+      field("q", struct_({field("quebec", utf8())}))};
+  auto type = union_(fields, {0, 23, 47}, UnionMode::SPARSE);
+  auto array = ArrayFromJSON(type, R"([
+    [0, {"alpha": 0.0, "bravo": "charlie"}],
+    [23, {"whiskey": 99}],
+    [0, {"bravo": "mike"}],
+    null,
+    [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}]
+  ])");
+
+  auto expected_types = ArrayFromJSON(int8(), "[0, 23, 0, null, 23]");
+  ArrayVector expected_fields = {
+      ArrayFromJSON(fields[0]->type(), R"([
+      {"alpha": 0.0, "bravo": "charlie"},
+      null,
+      {"bravo": "mike"},
+      null,
+      null
+    ])"),
+      ArrayFromJSON(fields[1]->type(), R"([
+      null,
+      {"whiskey": 99},
+      null,
+      null,
+      {"tango": 8.25, "foxtrot": [0, 2, 3]}
+    ])"),
+      ArrayFromJSON(fields[2]->type(), "[null, null, null, null, null]")};
+
+  std::shared_ptr<Array> expected;
+  ASSERT_OK(UnionArray::MakeSparse(*expected_types, expected_fields, {"ab", "wtf", "q"},
+                                   {0, 23, 47}, &expected));
+
+  ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestDenseUnion, Errors) {
+  auto field_a = field("a", int8());
+  auto field_b = field("b", boolean());
+  std::shared_ptr<DataType> type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE);
+  std::shared_ptr<Array> array;
+
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 8]]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]", &array));
+}
+
+TEST(TestSparseUnion, Errors) {
+  auto field_a = field("a", int8());
+  auto field_b = field("b", boolean());
+  std::shared_ptr<DataType> type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE);
+  std::shared_ptr<Array> array;
+
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 8]]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]", &array));
+}
+
 }  // namespace json
 }  // namespace internal
 }  // namespace ipc
diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc
index f850f3d..ae01bcc 100644
--- a/cpp/src/arrow/ipc/json-simple.cc
+++ b/cpp/src/arrow/ipc/json-simple.cc
@@ -611,6 +611,94 @@ class StructConverter final : public ConcreteConverter<StructConverter> {
 };
 
 // ------------------------------------------------------------------------
+// Converter for struct arrays
+
+class UnionConverter final : public ConcreteConverter<UnionConverter> {
+ public:
+  explicit UnionConverter(const std::shared_ptr<DataType>& type) { type_ = type; }
+
+  Status Init() override {
+    auto union_type = checked_cast<const UnionType*>(type_.get());
+    mode_ = union_type->mode();
+    type_id_to_child_num_.clear();
+    type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1);
+    int child_i = 0;
+    for (auto type_id : union_type->type_codes()) {
+      type_id_to_child_num_[type_id] = child_i++;
+    }
+    std::vector<std::shared_ptr<ArrayBuilder>> child_builders;
+    for (const auto& field : type_->children()) {
+      std::shared_ptr<Converter> child_converter;
+      RETURN_NOT_OK(GetConverter(field->type(), &child_converter));
+      child_converters_.push_back(child_converter);
+      child_builders.push_back(child_converter->builder());
+    }
+    if (mode_ == UnionMode::DENSE) {
+      builder_ = std::make_shared<DenseUnionBuilder>(default_memory_pool(),
+                                                     std::move(child_builders), type_);
+    } else {
+      builder_ = std::make_shared<SparseUnionBuilder>(default_memory_pool(),
+                                                      std::move(child_builders), type_);
+    }
+    return Status::OK();
+  }
+
+  Status AppendNull() override {
+    for (auto& converter : child_converters_) {
+      RETURN_NOT_OK(converter->AppendNull());
+    }
+    return builder_->AppendNull();
+  }
+
+  // Append a JSON value that is either an array of N elements in order
+  // or an object mapping struct names to values (omitted struct members
+  // are mapped to null).
+  Status AppendValue(const rj::Value& json_obj) override {
+    if (json_obj.IsNull()) {
+      return AppendNull();
+    }
+    if (!json_obj.IsArray()) {
+      return JSONTypeError("array", json_obj.GetType());
+    }
+    if (json_obj.Size() != 2) {
+      return Status::Invalid("Expected [type_id, value] pair, got array of size ",
+                             json_obj.Size());
+    }
+    const auto& id_obj = json_obj[0];
+    if (!id_obj.IsInt()) {
+      return JSONTypeError("int", id_obj.GetType());
+    }
+
+    auto id = static_cast<int8_t>(id_obj.GetInt());
+    auto child_num = type_id_to_child_num_[id];
+    if (child_num == -1) {
+      return Status::Invalid("type_id ", id, " not found in ", *type_);
+    }
+
+    auto child_converter = child_converters_[child_num];
+    if (mode_ == UnionMode::DENSE) {
+      RETURN_NOT_OK(checked_cast<DenseUnionBuilder&>(*builder_).Append(id));
+    } else {
+      RETURN_NOT_OK(checked_cast<SparseUnionBuilder&>(*builder_).Append(id));
+      for (auto&& other_converter : child_converters_) {
+        if (other_converter != child_converter) {
+          RETURN_NOT_OK(other_converter->AppendNull());
+        }
+      }
+    }
+    return child_converter->AppendValue(json_obj[1]);
+  }
+
+  std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ private:
+  UnionMode::type mode_;
+  std::shared_ptr<ArrayBuilder> builder_;
+  std::vector<std::shared_ptr<Converter>> child_converters_;
+  std::vector<int8_t> type_id_to_child_num_;
+};
+
+// ------------------------------------------------------------------------
 // General conversion functions
 
 Status GetConverter(const std::shared_ptr<DataType>& type,
@@ -648,6 +736,7 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
     SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter)
     SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
     SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter)
+    SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter)
     default: {
       return Status::NotImplemented("JSON conversion to ", type->ToString(),
                                     " not implemented");
diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc
index 5784394..bc64fb7 100644
--- a/cpp/src/arrow/python/serialize.cc
+++ b/cpp/src/arrow/python/serialize.cc
@@ -77,14 +77,6 @@ class SequenceBuilder {
   // Appending a none to the sequence
   Status AppendNone() { return builder_->AppendNull(); }
 
-  template <typename BuilderType>
-  Status Update(BuilderType* child_builder, int8_t tag) {
-    int32_t offset32 = -1;
-    RETURN_NOT_OK(internal::CastSize(child_builder->length(), &offset32));
-    DCHECK_GE(offset32, 0);
-    return builder_->Append(tag, offset32);
-  }
-
   template <typename BuilderType, typename MakeBuilderFn>
   Status CreateAndUpdate(std::shared_ptr<BuilderType>* child_builder, int8_t tag,
                          MakeBuilderFn make_builder) {
@@ -95,7 +87,7 @@ class SequenceBuilder {
       convert << static_cast<int>(tag);
       type_map_[tag] = builder_->AppendChild(*child_builder, convert.str());
     }
-    return Update(child_builder->get(), type_map_[tag]);
+    return builder_->Append(type_map_[tag]);
   }
 
   template <typename BuilderType, typename T>
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 2dbd31a..55154ba 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -273,6 +273,9 @@ std::string DurationType::ToString() const {
 UnionType::UnionType(const std::vector<std::shared_ptr<Field>>& fields,
                      const std::vector<uint8_t>& type_codes, UnionMode::type mode)
     : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) {
+  DCHECK_LE(fields.size(), type_codes.size()) << "union field with unknown type id";
+  DCHECK_GE(fields.size(), type_codes.size())
+      << "type id provided without corresponding union field";
   children_ = fields;
 }
 
@@ -284,6 +287,12 @@ DataTypeLayout UnionType::layout() const {
   }
 }
 
+uint8_t UnionType::max_type_code() const {
+  return type_codes_.size() == 0
+             ? 0
+             : *std::max_element(type_codes_.begin(), type_codes_.end());
+}
+
 std::string UnionType::ToString() const {
   std::stringstream s;
 
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 16f486f..10c1f90 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -686,6 +686,8 @@ class ARROW_EXPORT UnionType : public NestedType {
 
   const std::vector<uint8_t>& type_codes() const { return type_codes_; }
 
+  uint8_t max_type_code() const;
+
   UnionMode::type mode() const { return mode_; }
 
  private:
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 22983e7..6c62699 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -302,6 +302,14 @@ def test_clone():
     assert deserialized == (0, 'a')
 
 
+def test_primitive_serialization_notbroken(large_buffer):
+    serialization_roundtrip({(1, 2): 2}, large_buffer)
+
+
+def test_primitive_serialization_broken(large_buffer):
+    serialization_roundtrip({(): 2}, large_buffer)
+
+
 def test_primitive_serialization(large_buffer):
     for obj in PRIMITIVE_OBJECTS:
         serialization_roundtrip(obj, large_buffer)