You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pc...@apache.org on 2018/07/09 07:46:55 UTC

[arrow] branch master updated: ARROW-2790: [C++] Buffers can contain uninitialized memory

This is an automated email from the ASF dual-hosted git repository.

pcmoritz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1a419fe  ARROW-2790: [C++] Buffers can contain uninitialized memory
1a419fe is described below

commit 1a419fe8c3deabc0f16bb1089ed523b9ce57ffa7
Author: Dimitri Vorona <vo...@in.tum.de>
AuthorDate: Mon Jul 9 00:46:45 2018 -0700

    ARROW-2790: [C++] Buffers can contain uninitialized memory
    
    Author: Dimitri Vorona <vo...@in.tum.de>
    Author: Wes McKinney <we...@apache.org>
    Author: Philipp Moritz <pc...@gmail.com>
    
    Closes #2216 from alendit/fix-uninitialized-reads and squashes the following commits:
    
    7280074f <Philipp Moritz> call Finish on builders in SequenceBuilder
    c2452f0b <Wes McKinney> Add comment about mysterious non-determinism
    168631c6 <Wes McKinney> Fix Python unit test, though not sure why yet
    aeac2284 <Wes McKinney> Do not zero null bitmaps twice
    e465ab02 <Wes McKinney> More aggressively trim and zero padding in buffers, reduce code duplication
    efa8e853 <Wes McKinney> Clean up code duplication in array-test a little bit
    00e03488 <Philipp Moritz> test serialization for determinism
    af87b732 <Dimitri Vorona> Fixes for revision
    437580f6 <Dimitri Vorona> Format tests
    0a1a82ef <Dimitri Vorona> Fix uninitialized data warnings
    2f0b2820 <Dimitri Vorona> Add more uninitialized data and padding tests
    21be782d <Dimitri Vorona> Remove valgrind suppression since it isn't needed anymore
    4ff873f8 <Dimitri Vorona> Add checks for data_ before zeroing padding
    0d5d07fd <Dimitri Vorona> Stricter int to bool conversion
    7ed1b246 <Dimitri Vorona> Consume statuses
    9a9d5ddb <Dimitri Vorona> Formatting
    797e568b <Dimitri Vorona> Add memory initialization on null and zero padding to buffers
    b8f3b27e <Dimitri Vorona> Add zero-padding tests
    4fb8ace8 <Dimitri Vorona> Add tests for uninitialized values when adding nulls
---
 cpp/src/arrow/array-test.cc                | 249 +++++++++++++++++++++++++----
 cpp/src/arrow/buffer.h                     |   7 +-
 cpp/src/arrow/builder.cc                   |  63 +++++---
 cpp/src/arrow/builder.h                    |  19 +++
 cpp/src/arrow/python/python_to_arrow.cc    |  12 +-
 cpp/src/arrow/test-util.h                  |   2 +
 cpp/src/arrow/util/bit-util.cc             |   2 +-
 cpp/valgrind.supp                          |   8 +-
 python/pyarrow/tests/test_serialization.py |   7 +
 9 files changed, 298 insertions(+), 71 deletions(-)

diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index cd32976..1bef773 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -41,6 +41,42 @@ namespace arrow {
 using std::string;
 using std::vector;
 
+namespace {
+// used to prevent compiler optimizing away side-effect-less statements
+volatile int throw_away = 0;
+
+// checks if the padding of the buffers of the array is zero
+// also causes valgrind warnings if the padding bytes are uninitialized
+void AssertZeroPadded(const Array& array) {
+  for (const auto& buffer : array.data()->buffers) {
+    if (buffer) {
+      const int64_t padding = buffer->capacity() - buffer->size();
+      std::vector<uint8_t> zeros(padding);
+      ASSERT_EQ(0, memcmp(buffer->data() + buffer->size(), zeros.data(), padding));
+    }
+  }
+}
+
+// Check if the valid buffer bytes are initialized by
+// calling memcmp on them which will cause valgrind warnings otherwise
+void TestInitialized(const Array& array) {
+  for (const auto& buffer : array.data()->buffers) {
+    if (buffer) {
+      std::vector<uint8_t> zeros(buffer->capacity());
+      throw_away = memcmp(buffer->data(), zeros.data(), buffer->size());
+    }
+  }
+}
+
+template <typename BuilderType>
+void FinishAndCheckPadding(BuilderType* builder, std::shared_ptr<Array>* out) {
+  ASSERT_OK(builder->Finish(out));
+  AssertZeroPadded(**out);
+  TestInitialized(**out);
+}
+
+}  // namespace
+
 class TestArray : public ::testing::Test {
  public:
   void SetUp() { pool_ = default_memory_pool(); }
@@ -209,10 +245,13 @@ TEST_F(TestArray, BuildLargeInMemoryArray) {
 
   BooleanBuilder builder;
   ASSERT_OK(builder.Reserve(length));
+
+  // Advance does not write to data, see docstring
   ASSERT_OK(builder.Advance(length));
+  memset(builder.data()->mutable_data(), 0, BitUtil::BytesForBits(length));
 
   std::shared_ptr<Array> result;
-  ASSERT_OK(builder.Finish(&result));
+  FinishAndCheckPadding(&builder, &result);
 
   ASSERT_EQ(length, result->length());
 }
@@ -281,7 +320,7 @@ class TestPrimitiveBuilder : public TestBuilder {
         std::make_shared<ArrayType>(size, ex_data, ex_null_bitmap, ex_null_count);
 
     std::shared_ptr<Array> out;
-    ASSERT_OK(builder->Finish(&out));
+    FinishAndCheckPadding(builder.get(), &out);
 
     std::shared_ptr<ArrayType> result = std::dynamic_pointer_cast<ArrayType>(out);
 
@@ -405,7 +444,8 @@ void TestPrimitiveBuilder<PBoolean>::Check(const std::unique_ptr<BooleanBuilder>
 
   // Finish builder and check result array
   std::shared_ptr<Array> out;
-  ASSERT_OK(builder->Finish(&out));
+  FinishAndCheckPadding(builder.get(), &out);
+
   std::shared_ptr<BooleanArray> result = std::dynamic_pointer_cast<BooleanArray>(out);
 
   ASSERT_EQ(ex_null_count, result->null_count());
@@ -456,14 +496,29 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) {
     ASSERT_OK(this->builder_->AppendNull());
   }
 
-  std::shared_ptr<Array> result;
-  ASSERT_OK(this->builder_->Finish(&result));
+  std::shared_ptr<Array> out;
+  FinishAndCheckPadding(this->builder_.get(), &out);
+  auto result = std::dynamic_pointer_cast<typename TypeParam::ArrayType>(out);
 
   for (int64_t i = 0; i < size; ++i) {
     ASSERT_TRUE(result->IsNull(i)) << i;
   }
 }
 
+TYPED_TEST(TestPrimitiveBuilder, TestAppendNulls) {
+  const int64_t size = 10;
+  const uint8_t nullmap[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+
+  ASSERT_OK(this->builder_->AppendNulls(nullmap, size));
+
+  std::shared_ptr<Array> result;
+  FinishAndCheckPadding(this->builder_.get(), &result);
+
+  for (int64_t i = 0; i < size; ++i) {
+    ASSERT_EQ(result->IsValid(i), static_cast<bool>(nullmap[i]));
+  }
+}
+
 TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) {
   DECL_T();
 
@@ -488,7 +543,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) {
 
   do {
     std::shared_ptr<Array> result;
-    ASSERT_OK(this->builder_->Finish(&result));
+    FinishAndCheckPadding(this->builder_.get(), &result);
   } while (false);
 
   ASSERT_EQ(memory_before, this->pool_->bytes_allocated());
@@ -656,6 +711,24 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendValues) {
   this->Check(this->builder_nn_, false);
 }
 
+TYPED_TEST(TestPrimitiveBuilder, TestZeroPadded) {
+  DECL_T();
+
+  int64_t size = 10000;
+  this->RandomData(size);
+
+  vector<T>& draws = this->draws_;
+  vector<uint8_t>& valid_bytes = this->valid_bytes_;
+
+  // first slug
+  int64_t K = 1000;
+
+  ASSERT_OK(this->builder_->AppendValues(draws.data(), K, valid_bytes.data()));
+
+  std::shared_ptr<Array> out;
+  FinishAndCheckPadding(this->builder_.get(), &out);
+}
+
 TYPED_TEST(TestPrimitiveBuilder, TestAppendValuesStdBool) {
   // ARROW-1383
   DECL_T();
@@ -924,7 +997,7 @@ TEST_F(TestStringArray, CompareNullByteSlots) {
   ASSERT_OK(builder3.Append("baz"));
 
   std::shared_ptr<Array> array, array2, array3;
-  ASSERT_OK(builder.Finish(&array));
+  FinishAndCheckPadding(&builder, &array);
   ASSERT_OK(builder2.Finish(&array2));
   ASSERT_OK(builder3.Finish(&array3));
 
@@ -971,7 +1044,7 @@ class TestStringBuilder : public TestBuilder {
 
   void Done() {
     std::shared_ptr<Array> out;
-    EXPECT_OK(builder_->Finish(&out));
+    FinishAndCheckPadding(builder_.get(), &out);
 
     result_ = std::dynamic_pointer_cast<StringArray>(out);
     ASSERT_OK(ValidateArray(*result_));
@@ -1207,6 +1280,21 @@ TEST_F(TestBinaryArray, TestGetValue) {
   }
 }
 
+TEST_F(TestBinaryArray, TestNullValuesInitialized) {
+  for (size_t i = 0; i < expected_.size(); ++i) {
+    if (valid_bytes_[i] == 0) {
+      ASSERT_TRUE(strings_->IsNull(i));
+    } else {
+      int32_t len = -1;
+      const uint8_t* bytes = strings_->GetValue(i, &len);
+      ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len));
+    }
+  }
+  TestInitialized(*strings_);
+}
+
+TEST_F(TestBinaryArray, TestPaddingZeroed) { AssertZeroPadded(*strings_); }
+
 TEST_F(TestBinaryArray, TestGetString) {
   for (size_t i = 0; i < expected_.size(); ++i) {
     if (valid_bytes_[i] == 0) {
@@ -1227,7 +1315,7 @@ TEST_F(TestBinaryArray, TestEqualsEmptyStrings) {
   }
 
   std::shared_ptr<Array> left_arr;
-  ASSERT_OK(builder.Finish(&left_arr));
+  FinishAndCheckPadding(&builder, &left_arr);
 
   const BinaryArray& left = checked_cast<const BinaryArray&>(*left_arr);
   std::shared_ptr<Array> right =
@@ -1247,7 +1335,7 @@ class TestBinaryBuilder : public TestBuilder {
 
   void Done() {
     std::shared_ptr<Array> out;
-    EXPECT_OK(builder_->Finish(&out));
+    FinishAndCheckPadding(builder_.get(), &out);
 
     result_ = std::dynamic_pointer_cast<BinaryArray>(out);
     ASSERT_OK(ValidateArray(*result_));
@@ -1329,7 +1417,9 @@ TEST_F(TestBinaryBuilder, TestCapacityReserve) {
   ASSERT_EQ(reps * N, result_->length());
   ASSERT_EQ(0, result_->null_count());
   ASSERT_EQ(reps * 40, result_->value_data()->size());
-  ASSERT_EQ(expected_capacity, result_->value_data()->capacity());
+
+  // Capacity is shrunk after `Finish`
+  ASSERT_EQ(640, result_->value_data()->capacity());
 }
 
 TEST_F(TestBinaryBuilder, TestZeroLength) {
@@ -1364,7 +1454,7 @@ void CheckSliceEquality() {
   }
 
   std::shared_ptr<Array> array;
-  ASSERT_OK(builder.Finish(&array));
+  FinishAndCheckPadding(&builder, &array);
 
   std::shared_ptr<Array> slice, slice2;
 
@@ -1451,7 +1541,7 @@ TEST_F(TestFWBinaryArray, Builder) {
     }
   }
 
-  ASSERT_OK(builder_->Finish(&result));
+  FinishAndCheckPadding(builder_.get(), &result);
   CheckResult(*result);
 
   // Build using batch API
@@ -1462,7 +1552,8 @@ TEST_F(TestFWBinaryArray, Builder) {
   ASSERT_OK(builder_->AppendValues(raw_data, 50, raw_is_valid));
   ASSERT_OK(
       builder_->AppendValues(raw_data + 50 * byte_width, length - 50, raw_is_valid + 50));
-  ASSERT_OK(builder_->Finish(&result));
+  FinishAndCheckPadding(builder_.get(), &result);
+
   CheckResult(*result);
 
   // Build from std::string
@@ -1531,6 +1622,20 @@ TEST_F(TestFWBinaryArray, ZeroSize) {
   ASSERT_EQ(3, array->null_count());
 }
 
+TEST_F(TestFWBinaryArray, ZeroPadding) {
+  auto type = fixed_size_binary(4);
+  FixedSizeBinaryBuilder builder(type);
+
+  ASSERT_OK(builder.Append("foo1"));
+  ASSERT_OK(builder.AppendNull());
+  ASSERT_OK(builder.Append("foo2"));
+  ASSERT_OK(builder.AppendNull());
+  ASSERT_OK(builder.Append("foo3"));
+
+  std::shared_ptr<Array> array;
+  FinishAndCheckPadding(&builder, &array);
+}
+
 TEST_F(TestFWBinaryArray, Slice) {
   auto type = fixed_size_binary(4);
   FixedSizeBinaryBuilder builder(type);
@@ -1581,7 +1686,7 @@ class TestAdaptiveIntBuilder : public TestBuilder {
     builder_ = std::make_shared<AdaptiveIntBuilder>(pool_);
   }
 
-  void Done() { EXPECT_OK(builder_->Finish(&result_)); }
+  void Done() { FinishAndCheckPadding(builder_.get(), &result_); }
 
  protected:
   std::shared_ptr<AdaptiveIntBuilder> builder_;
@@ -1702,6 +1807,38 @@ TEST_F(TestAdaptiveIntBuilder, TestAppendValues) {
   ASSERT_TRUE(expected_->Equals(result_));
 }
 
+TEST_F(TestAdaptiveIntBuilder, TestAssertZeroPadded) {
+  std::vector<int64_t> values(
+      {0, static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1});
+  ASSERT_OK(builder_->AppendValues(values.data(), values.size()));
+  Done();
+}
+
+TEST_F(TestAdaptiveIntBuilder, TestAppendNull) {
+  int64_t size = 1000;
+  for (unsigned index = 0; index < size; ++index) {
+    ASSERT_OK(builder_->AppendNull());
+  }
+
+  Done();
+
+  for (unsigned index = 0; index < size; ++index) {
+    ASSERT_TRUE(result_->IsNull(index));
+  }
+}
+
+TEST_F(TestAdaptiveIntBuilder, TestAppendNulls) {
+  constexpr int64_t size = 10;
+  const uint8_t nullmap[size] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+  ASSERT_OK(builder_->AppendNulls(nullmap, size));
+
+  Done();
+
+  for (unsigned index = 0; index < size; ++index) {
+    ASSERT_EQ(result_->IsValid(index), static_cast<bool>(nullmap[index]));
+  }
+}
+
 class TestAdaptiveUIntBuilder : public TestBuilder {
  public:
   void SetUp() {
@@ -1709,7 +1846,7 @@ class TestAdaptiveUIntBuilder : public TestBuilder {
     builder_ = std::make_shared<AdaptiveUIntBuilder>(pool_);
   }
 
-  void Done() { EXPECT_OK(builder_->Finish(&result_)); }
+  void Done() { FinishAndCheckPadding(builder_.get(), &result_); }
 
  protected:
   std::shared_ptr<AdaptiveUIntBuilder> builder_;
@@ -1797,6 +1934,38 @@ TEST_F(TestAdaptiveUIntBuilder, TestAppendValues) {
   ASSERT_TRUE(expected_->Equals(result_));
 }
 
+TEST_F(TestAdaptiveUIntBuilder, TestAssertZeroPadded) {
+  std::vector<uint64_t> values(
+      {0, static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) + 1});
+  ASSERT_OK(builder_->AppendValues(values.data(), values.size()));
+  Done();
+}
+
+TEST_F(TestAdaptiveUIntBuilder, TestAppendNull) {
+  int64_t size = 1000;
+  for (unsigned index = 0; index < size; ++index) {
+    ASSERT_OK(builder_->AppendNull());
+  }
+
+  Done();
+
+  for (unsigned index = 0; index < size; ++index) {
+    ASSERT_TRUE(result_->IsNull(index));
+  }
+}
+
+TEST_F(TestAdaptiveUIntBuilder, TestAppendNulls) {
+  constexpr int64_t size = 10;
+  const uint8_t nullmap[size] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+  ASSERT_OK(builder_->AppendNulls(nullmap, size));
+
+  Done();
+
+  for (unsigned index = 0; index < size; ++index) {
+    ASSERT_EQ(result_->IsValid(index), static_cast<bool>(nullmap[index]));
+  }
+}
+
 // ----------------------------------------------------------------------
 // Dictionary tests
 
@@ -1894,7 +2063,7 @@ TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) {
 
     // Finalize result
     std::shared_ptr<Array> result;
-    ASSERT_OK(builder.Finish(&result));
+    FinishAndCheckPadding(&builder, &result);
 
     // Finalize expected data
     std::shared_ptr<Array> dict_array;
@@ -1916,7 +2085,7 @@ TYPED_TEST(TestDictionaryBuilder, DeltaDictionary) {
   ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
   ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
   std::shared_ptr<Array> result;
-  ASSERT_OK(builder.Finish(&result));
+  FinishAndCheckPadding(&builder, &result);
 
   // Build expected data for the initial dictionary
   NumericBuilder<TypeParam> dict_builder1;
@@ -1975,7 +2144,7 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) {
   ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
   ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
   std::shared_ptr<Array> result;
-  ASSERT_OK(builder.Finish(&result));
+  FinishAndCheckPadding(&builder, &result);
 
   // Build expected data for the initial dictionary
   NumericBuilder<TypeParam> dict_builder1;
@@ -2108,7 +2277,7 @@ TEST(TestStringDictionaryBuilder, DoubleTableSize) {
 
   // Finalize result
   std::shared_ptr<Array> result;
-  ASSERT_OK(builder.Finish(&result));
+  FinishAndCheckPadding(&builder, &result);
 
   // Finalize expected data
   std::shared_ptr<Array> str_array;
@@ -2155,7 +2324,7 @@ TEST(TestStringDictionaryBuilder, DeltaDictionary) {
   ASSERT_OK(builder.Append("test2"));
 
   std::shared_ptr<Array> result_delta;
-  ASSERT_OK(builder.Finish(&result_delta));
+  FinishAndCheckPadding(&builder, &result_delta);
 
   // Build expected data
   StringBuilder str_builder2;
@@ -2192,7 +2361,7 @@ TEST(TestStringDictionaryBuilder, BigDeltaDictionary) {
   }
 
   std::shared_ptr<Array> result;
-  ASSERT_OK(builder.Finish(&result));
+  FinishAndCheckPadding(&builder, &result);
 
   std::shared_ptr<Array> str_array1;
   ASSERT_OK(str_builder1.Finish(&str_array1));
@@ -2272,7 +2441,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
   ASSERT_OK(builder.Append(test.data()));
 
   std::shared_ptr<Array> result;
-  ASSERT_OK(builder.Finish(&result));
+  FinishAndCheckPadding(&builder, &result);
 
   // Build expected data
   FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
@@ -2306,7 +2475,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) {
   ASSERT_OK(builder.Append(test.data()));
 
   std::shared_ptr<Array> result1;
-  ASSERT_OK(builder.Finish(&result1));
+  FinishAndCheckPadding(&builder, &result1);
 
   // Build expected data
   FixedSizeBinaryBuilder fsb_builder1(arrow::fixed_size_binary(4));
@@ -2332,7 +2501,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) {
   ASSERT_OK(builder.Append(test3.data()));
 
   std::shared_ptr<Array> result2;
-  ASSERT_OK(builder.Finish(&result2));
+  FinishAndCheckPadding(&builder, &result2);
 
   // Build expected data
   FixedSizeBinaryBuilder fsb_builder2(arrow::fixed_size_binary(4));
@@ -2509,7 +2678,7 @@ class TestListArray : public TestBuilder {
 
   void Done() {
     std::shared_ptr<Array> out;
-    EXPECT_OK(builder_->Finish(&out));
+    FinishAndCheckPadding(builder_.get(), &out);
     result_ = std::dynamic_pointer_cast<ListArray>(out);
   }
 
@@ -2965,7 +3134,7 @@ class TestStructBuilder : public TestBuilder {
 
   void Done() {
     std::shared_ptr<Array> out;
-    ASSERT_OK(builder_->Finish(&out));
+    FinishAndCheckPadding(builder_.get(), &out);
     result_ = std::dynamic_pointer_cast<StructArray>(out);
   }
 
@@ -3146,7 +3315,7 @@ TEST_F(TestStructBuilder, TestEquality) {
     int_vb->UnsafeAppend(value);
   }
 
-  ASSERT_OK(builder_->Finish(&array));
+  FinishAndCheckPadding(builder_.get(), &array);
 
   ASSERT_OK(builder_->Resize(list_lengths.size()));
   ASSERT_OK(char_vb->Resize(list_values.size()));
@@ -3273,7 +3442,7 @@ TEST_F(TestStructBuilder, TestSlice) {
   for (int32_t value : int_values) {
     int_vb->UnsafeAppend(value);
   }
-  ASSERT_OK(builder_->Finish(&array));
+  FinishAndCheckPadding(builder_.get(), &array);
 
   std::shared_ptr<StructArray> slice, slice2;
   std::shared_ptr<Int32Array> int_field;
@@ -3348,6 +3517,9 @@ TEST(TestUnionArrayAdHoc, TestSliceEquals) {
 
     ASSERT_TRUE(slice->Equals(slice2));
     ASSERT_TRUE(array->RangeEquals(1, 6, 0, slice));
+
+    AssertZeroPadded(*array);
+    TestInitialized(*array);
   };
 
   CheckUnion(batch->column(1));
@@ -3392,7 +3564,7 @@ class DecimalTest : public ::testing::TestWithParam<int> {
     }
 
     std::shared_ptr<Array> out;
-    ASSERT_OK(builder->Finish(&out));
+    FinishAndCheckPadding(builder.get(), &out);
 
     std::vector<uint8_t> raw_bytes;
 
@@ -3410,8 +3582,7 @@ class DecimalTest : public ::testing::TestWithParam<int> {
 
     std::shared_ptr<Array> lhs = out->Slice(offset);
     std::shared_ptr<Array> rhs = expected->Slice(offset);
-    bool result = lhs->Equals(rhs);
-    ASSERT_TRUE(result);
+    ASSERT_TRUE(lhs->Equals(rhs));
   }
 };
 
@@ -3461,6 +3632,13 @@ TEST(TestRechunkArraysConsistently, Trivial) {
   groups = {{a1, a2}, {}, {b1}};
   rechunked = internal::RechunkArraysConsistently(groups);
   ASSERT_EQ(rechunked.size(), 3);
+
+  for (auto& arrvec : rechunked) {
+    for (auto& arr : arrvec) {
+      AssertZeroPadded(*arr);
+      TestInitialized(*arr);
+    }
+  }
 }
 
 TEST(TestRechunkArraysConsistently, Plain) {
@@ -3507,6 +3685,13 @@ TEST(TestRechunkArraysConsistently, Plain) {
   ASSERT_ARRAYS_EQUAL(*rb[3], *expected);
   ArrayFromVector<Int32Type, int32_t>({48, 49}, &expected);
   ASSERT_ARRAYS_EQUAL(*rb[4], *expected);
+
+  for (auto& arrvec : rechunked) {
+    for (auto& arr : arrvec) {
+      AssertZeroPadded(*arr);
+      TestInitialized(*arr);
+    }
+  }
 }
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index e79fc88..73db8b1 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -304,11 +304,8 @@ class ARROW_EXPORT BufferBuilder {
     size_ += length;
   }
 
-  Status Finish(std::shared_ptr<Buffer>* out) {
-    // Do not shrink to fit to avoid unneeded realloc
-    if (size_ > 0) {
-      RETURN_NOT_OK(buffer_->Resize(size_, false));
-    }
+  Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+    RETURN_NOT_OK(Resize(size_, shrink_to_fit));
     *out = buffer_;
     Reset();
     return Status::OK();
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 9a8fc7d..b40453c 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -238,6 +238,7 @@ Status PrimitiveBuilder<T>::Init(int64_t capacity) {
   RETURN_NOT_OK(data_->Resize(nbytes));
 
   raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
+
   return Status::OK();
 }
 
@@ -326,13 +327,29 @@ Status PrimitiveBuilder<T>::Append(const std::vector<value_type>& values) {
   return AppendValues(values);
 }
 
+namespace {
+
+Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) {
+  if (buffer) {
+    if (bytes_filled < buffer->size()) {
+      // Trim buffer
+      RETURN_NOT_OK(buffer->Resize(bytes_filled));
+    }
+    // zero the padding
+    memset(buffer->mutable_data() + bytes_filled, 0, buffer->capacity() - bytes_filled);
+  } else {
+    DCHECK_EQ(bytes_filled, 0);
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
 template <typename T>
 Status PrimitiveBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  const int64_t bytes_required = TypeTraits<T>::bytes_required(length_);
-  if (bytes_required > 0 && bytes_required < data_->size()) {
-    // Trim buffers
-    RETURN_NOT_OK(data_->Resize(bytes_required));
-  }
+  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
+  RETURN_NOT_OK(TrimBuffer(TypeTraits<T>::bytes_required(length_), data_.get()));
+
   *out = ArrayData::Make(type_, length_, {null_bitmap_, data_}, null_count_);
 
   data_ = null_bitmap_ = nullptr;
@@ -391,12 +408,6 @@ Status AdaptiveIntBuilderBase::Resize(int64_t capacity) {
 AdaptiveIntBuilder::AdaptiveIntBuilder(MemoryPool* pool) : AdaptiveIntBuilderBase(pool) {}
 
 Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  const int64_t bytes_required = length_ * int_size_;
-  if (bytes_required > 0 && bytes_required < data_->size()) {
-    // Trim buffers
-    RETURN_NOT_OK(data_->Resize(bytes_required));
-  }
-
   std::shared_ptr<DataType> output_type;
   switch (int_size_) {
     case 1:
@@ -416,6 +427,9 @@ Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
       return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
   }
 
+  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
+  RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));
+
   *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_);
 
   data_ = null_bitmap_ = nullptr;
@@ -553,11 +567,6 @@ AdaptiveUIntBuilder::AdaptiveUIntBuilder(MemoryPool* pool)
     : AdaptiveIntBuilderBase(pool) {}
 
 Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  const int64_t bytes_required = length_ * int_size_;
-  if (bytes_required > 0 && bytes_required < data_->size()) {
-    // Trim buffers
-    RETURN_NOT_OK(data_->Resize(bytes_required));
-  }
   std::shared_ptr<DataType> output_type;
   switch (int_size_) {
     case 1:
@@ -577,6 +586,9 @@ Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
       return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
   }
 
+  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
+  RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));
+
   *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_);
 
   data_ = null_bitmap_ = nullptr;
@@ -726,6 +738,8 @@ Status BooleanBuilder::Init(int64_t capacity) {
   RETURN_NOT_OK(data_->Resize(nbytes));
 
   raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
+  memset(raw_data_, 0, static_cast<size_t>(nbytes));
+
   return Status::OK();
 }
 
@@ -743,27 +757,25 @@ Status BooleanBuilder::Resize(int64_t capacity) {
     const int64_t old_bytes = data_->size();
     const int64_t new_bytes = BitUtil::BytesForBits(capacity);
 
-    if (new_bytes != old_bytes) {
+    if (new_bytes > old_bytes) {
       RETURN_NOT_OK(data_->Resize(new_bytes));
       raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
+      memset(raw_data_ + old_bytes, 0, static_cast<size_t>(new_bytes - old_bytes));
     }
   }
   return Status::OK();
 }
 
 Status BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  const int64_t bytes_required = BitUtil::BytesForBits(length_);
-  if (bytes_required > 0 && bytes_required < data_->size()) {
-    // Trim buffers
-    RETURN_NOT_OK(data_->Resize(bytes_required));
-  }
-
   int64_t bit_offset = length_ % 8;
   if (bit_offset > 0) {
     // Adjust last byte
     data_->mutable_data()[length_ / 8] &= BitUtil::kPrecedingBitmask[bit_offset];
   }
 
+  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
+  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get()));
+
   *out = ArrayData::Make(boolean(), length_, {null_bitmap_, data_}, null_count_);
 
   data_ = null_bitmap_ = nullptr;
@@ -1334,6 +1346,7 @@ Status ListBuilder::Resize(int64_t capacity) {
 Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
   RETURN_NOT_OK(AppendNextOffset());
 
+  // Offset padding zeroed by BufferBuilder
   std::shared_ptr<Buffer> offsets;
   RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
 
@@ -1422,8 +1435,9 @@ Status BinaryBuilder::AppendNull() {
 Status BinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
   // Write final offset (values length)
   RETURN_NOT_OK(AppendNextOffset());
-  std::shared_ptr<Buffer> offsets, value_data;
 
+  // These buffers' padding zeroed by BufferBuilder
+  std::shared_ptr<Buffer> offsets, value_data;
   RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
   RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
 
@@ -1615,6 +1629,7 @@ StructBuilder::StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool*
 }
 
 Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
   *out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_);
 
   (*out)->child_data.resize(field_builders_.size());
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index bab5764..b22f126 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -197,14 +197,20 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
   using ArrayBuilder::Advance;
 
   /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
+  /// The memory at the corresponding data slot is set to 0 to prevent uninitialized
+  /// memory access
   Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
     RETURN_NOT_OK(Reserve(length));
+    memset(raw_data_ + length_, 0,
+           static_cast<size_t>(TypeTraits<Type>::bytes_required(length)));
     UnsafeAppendToBitmap(valid_bytes, length);
     return Status::OK();
   }
 
   Status AppendNull() {
     RETURN_NOT_OK(Reserve(1));
+    memset(raw_data_ + length_, 0,
+           static_cast<size_t>(TypeTraits<Type>::bytes_required(1)));
     UnsafeAppendToBitmap(false);
     return Status::OK();
   }
@@ -340,12 +346,14 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
   /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
   Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
     RETURN_NOT_OK(Reserve(length));
+    memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
     UnsafeAppendToBitmap(valid_bytes, length);
     return Status::OK();
   }
 
   Status AppendNull() {
     RETURN_NOT_OK(Reserve(1));
+    memset(data_->mutable_data() + length_ * int_size_, 0, int_size_);
     UnsafeAppendToBitmap(false);
     return Status::OK();
   }
@@ -551,12 +559,23 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
   /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
   Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
     RETURN_NOT_OK(Reserve(length));
+    // zero bits starting with the next whole bytes, since the bytes before
+    // should be already initialized
+
+    const int64_t old_bytes = BitUtil::BytesForBits(length_);
+    const int64_t new_bytes = BitUtil::BytesForBits(length_ + length);
+    memset(raw_data_ + old_bytes, 0, new_bytes - old_bytes);
     UnsafeAppendToBitmap(valid_bytes, length);
+
     return Status::OK();
   }
 
   Status AppendNull() {
     RETURN_NOT_OK(Reserve(1));
+    if (BitUtil::IsMultipleOf8(length_)) {
+      // zero next byte
+      memset(raw_data_ + (length_ / 8), 0, 1);
+    }
     UnsafeAppendToBitmap(false);
     return Status::OK();
   }
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index f48d728..e63f83a 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -271,9 +271,17 @@ class SequenceBuilder {
     RETURN_NOT_OK(AddSubsequence(dict_tag_, dict_data, dict_offsets_, "dict"));
     RETURN_NOT_OK(AddSubsequence(set_tag_, set_data, set_offsets_, "set"));
 
+    std::shared_ptr<Array> types_array;
+    RETURN_NOT_OK(types_.Finish(&types_array));
+    const auto& types = checked_cast<const Int8Array&>(*types_array);
+
+    std::shared_ptr<Array> offsets_array;
+    RETURN_NOT_OK(offsets_.Finish(&offsets_array));
+    const auto& offsets = checked_cast<const Int32Array&>(*offsets_array);
+
     auto type = ::arrow::union_(fields_, type_ids_, UnionMode::DENSE);
-    out->reset(new UnionArray(type, types_.length(), children_, types_.data(),
-                              offsets_.data(), nones_.null_bitmap(),
+    out->reset(new UnionArray(type, types.length(), children_, types.values(),
+                              offsets.values(), nones_.null_bitmap(),
                               nones_.null_count()));
     return Status::OK();
   }
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index a046228..cb5c8b7 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -128,6 +128,8 @@ inline Status CopyBufferFromVector(const std::vector<T>& values, MemoryPool* poo
   RETURN_NOT_OK(buffer->Resize(nbytes));
   auto immutable_data = reinterpret_cast<const uint8_t*>(values.data());
   std::copy(immutable_data, immutable_data + nbytes, buffer->mutable_data());
+  memset(buffer->mutable_data() + nbytes, 0,
+         static_cast<size_t>(buffer->capacity() - nbytes));
 
   *result = buffer;
   return Status::OK();
diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc
index d3b6bb3..e8f47bb 100644
--- a/cpp/src/arrow/util/bit-util.cc
+++ b/cpp/src/arrow/util/bit-util.cc
@@ -50,7 +50,7 @@ Status BitUtil::BytesToBits(const std::vector<uint8_t>& bytes, MemoryPool* pool,
   std::shared_ptr<Buffer> buffer;
   RETURN_NOT_OK(AllocateBuffer(pool, bit_length, &buffer));
   uint8_t* out_buf = buffer->mutable_data();
-  memset(out_buf, 0, static_cast<size_t>(bit_length));
+  memset(out_buf, 0, static_cast<size_t>(buffer->capacity()));
   FillBitsFromBytes(bytes, out_buf);
 
   *out = buffer;
diff --git a/cpp/valgrind.supp b/cpp/valgrind.supp
index dc78eea..8e707e3 100644
--- a/cpp/valgrind.supp
+++ b/cpp/valgrind.supp
@@ -21,10 +21,4 @@
     Memcheck:Cond
     fun:*CastFunctor*BooleanType*
 }
-{
-    # Data can be uninitialized when its null bit is set, but we write raw buffers
-    <write_unitialized_data>
-    Memcheck:Param
-    write(buf)
-    fun:__write_nocancel
-}
+
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 94986a7..e484ebb 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -717,3 +717,10 @@ def test_tensor_alignment():
     ys = pa.deserialize(pa.serialize(xs).to_buffer())
     for y in ys:
         assert y.ctypes.data % 64 == 0
+
+
+def test_serialization_determinism():
+    for obj in COMPLEX_OBJECTS:
+        buf1 = pa.serialize(obj).to_buffer()
+        buf2 = pa.serialize(obj).to_buffer()
+        assert buf1.to_pybytes() == buf2.to_pybytes()