You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pc...@apache.org on 2018/07/09 07:46:55 UTC
[arrow] branch master updated: ARROW-2790: [C++] Buffers can
contain uninitialized memory
This is an automated email from the ASF dual-hosted git repository.
pcmoritz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1a419fe ARROW-2790: [C++] Buffers can contain uninitialized memory
1a419fe is described below
commit 1a419fe8c3deabc0f16bb1089ed523b9ce57ffa7
Author: Dimitri Vorona <vo...@in.tum.de>
AuthorDate: Mon Jul 9 00:46:45 2018 -0700
ARROW-2790: [C++] Buffers can contain uninitialized memory
Author: Dimitri Vorona <vo...@in.tum.de>
Author: Wes McKinney <we...@apache.org>
Author: Philipp Moritz <pc...@gmail.com>
Closes #2216 from alendit/fix-uninitialized-reads and squashes the following commits:
7280074f <Philipp Moritz> call Finish on builders in SequenceBuilder
c2452f0b <Wes McKinney> Add comment about mysterious non-determinism
168631c6 <Wes McKinney> Fix Python unit test, though not sure why yet
aeac2284 <Wes McKinney> Do not zero null bitmaps twice
e465ab02 <Wes McKinney> More aggressively trim and zero padding in buffers, reduce code duplication
efa8e853 <Wes McKinney> Clean up code duplication in array-test a little bit
00e03488 <Philipp Moritz> test serialization for determinism
af87b732 <Dimitri Vorona> Fixes for revision
437580f6 <Dimitri Vorona> Format tests
0a1a82ef <Dimitri Vorona> Fix uninitialized data warnings
2f0b2820 <Dimitri Vorona> Add more uninitialized data and padding tests
21be782d <Dimitri Vorona> Remove valgrind suppression since it isn't needed anymore
4ff873f8 <Dimitri Vorona> Add checks for data_ before zeroing padding
0d5d07fd <Dimitri Vorona> Stricter int to bool conversion
7ed1b246 <Dimitri Vorona> Consume statuses
9a9d5ddb <Dimitri Vorona> Formatting
797e568b <Dimitri Vorona> Add memory initialization on null and zero padding to buffers
b8f3b27e <Dimitri Vorona> Add zero-padding tests
4fb8ace8 <Dimitri Vorona> Add tests for uninitialized values when adding nulls
---
cpp/src/arrow/array-test.cc | 249 +++++++++++++++++++++++++----
cpp/src/arrow/buffer.h | 7 +-
cpp/src/arrow/builder.cc | 63 +++++---
cpp/src/arrow/builder.h | 19 +++
cpp/src/arrow/python/python_to_arrow.cc | 12 +-
cpp/src/arrow/test-util.h | 2 +
cpp/src/arrow/util/bit-util.cc | 2 +-
cpp/valgrind.supp | 8 +-
python/pyarrow/tests/test_serialization.py | 7 +
9 files changed, 298 insertions(+), 71 deletions(-)
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index cd32976..1bef773 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -41,6 +41,42 @@ namespace arrow {
using std::string;
using std::vector;
+namespace {
+// used to prevent compiler optimizing away side-effect-less statements
+volatile int throw_away = 0;
+
+// checks if the padding of the buffers of the array is zero
+// also causes valgrind warnings if the padding bytes are uninitialized
+void AssertZeroPadded(const Array& array) {
+ for (const auto& buffer : array.data()->buffers) {
+ if (buffer) {
+ const int64_t padding = buffer->capacity() - buffer->size();
+ std::vector<uint8_t> zeros(padding);
+ ASSERT_EQ(0, memcmp(buffer->data() + buffer->size(), zeros.data(), padding));
+ }
+ }
+}
+
+// Check if the valid buffer bytes are initialized by
+// calling memcmp on them which will cause valgrind warnings otherwise
+void TestInitialized(const Array& array) {
+ for (const auto& buffer : array.data()->buffers) {
+ if (buffer) {
+ std::vector<uint8_t> zeros(buffer->capacity());
+ throw_away = memcmp(buffer->data(), zeros.data(), buffer->size());
+ }
+ }
+}
+
+template <typename BuilderType>
+void FinishAndCheckPadding(BuilderType* builder, std::shared_ptr<Array>* out) {
+ ASSERT_OK(builder->Finish(out));
+ AssertZeroPadded(**out);
+ TestInitialized(**out);
+}
+
+} // namespace
+
class TestArray : public ::testing::Test {
public:
void SetUp() { pool_ = default_memory_pool(); }
@@ -209,10 +245,13 @@ TEST_F(TestArray, BuildLargeInMemoryArray) {
BooleanBuilder builder;
ASSERT_OK(builder.Reserve(length));
+
+ // Advance does not write to data, see docstring
ASSERT_OK(builder.Advance(length));
+ memset(builder.data()->mutable_data(), 0, BitUtil::BytesForBits(length));
std::shared_ptr<Array> result;
- ASSERT_OK(builder.Finish(&result));
+ FinishAndCheckPadding(&builder, &result);
ASSERT_EQ(length, result->length());
}
@@ -281,7 +320,7 @@ class TestPrimitiveBuilder : public TestBuilder {
std::make_shared<ArrayType>(size, ex_data, ex_null_bitmap, ex_null_count);
std::shared_ptr<Array> out;
- ASSERT_OK(builder->Finish(&out));
+ FinishAndCheckPadding(builder.get(), &out);
std::shared_ptr<ArrayType> result = std::dynamic_pointer_cast<ArrayType>(out);
@@ -405,7 +444,8 @@ void TestPrimitiveBuilder<PBoolean>::Check(const std::unique_ptr<BooleanBuilder>
// Finish builder and check result array
std::shared_ptr<Array> out;
- ASSERT_OK(builder->Finish(&out));
+ FinishAndCheckPadding(builder.get(), &out);
+
std::shared_ptr<BooleanArray> result = std::dynamic_pointer_cast<BooleanArray>(out);
ASSERT_EQ(ex_null_count, result->null_count());
@@ -456,14 +496,29 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) {
ASSERT_OK(this->builder_->AppendNull());
}
- std::shared_ptr<Array> result;
- ASSERT_OK(this->builder_->Finish(&result));
+ std::shared_ptr<Array> out;
+ FinishAndCheckPadding(this->builder_.get(), &out);
+ auto result = std::dynamic_pointer_cast<typename TypeParam::ArrayType>(out);
for (int64_t i = 0; i < size; ++i) {
ASSERT_TRUE(result->IsNull(i)) << i;
}
}
+TYPED_TEST(TestPrimitiveBuilder, TestAppendNulls) {
+ const int64_t size = 10;
+ const uint8_t nullmap[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+
+ ASSERT_OK(this->builder_->AppendNulls(nullmap, size));
+
+ std::shared_ptr<Array> result;
+ FinishAndCheckPadding(this->builder_.get(), &result);
+
+ for (int64_t i = 0; i < size; ++i) {
+ ASSERT_EQ(result->IsValid(i), static_cast<bool>(nullmap[i]));
+ }
+}
+
TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) {
DECL_T();
@@ -488,7 +543,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) {
do {
std::shared_ptr<Array> result;
- ASSERT_OK(this->builder_->Finish(&result));
+ FinishAndCheckPadding(this->builder_.get(), &result);
} while (false);
ASSERT_EQ(memory_before, this->pool_->bytes_allocated());
@@ -656,6 +711,24 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendValues) {
this->Check(this->builder_nn_, false);
}
+TYPED_TEST(TestPrimitiveBuilder, TestZeroPadded) {
+ DECL_T();
+
+ int64_t size = 10000;
+ this->RandomData(size);
+
+ vector<T>& draws = this->draws_;
+ vector<uint8_t>& valid_bytes = this->valid_bytes_;
+
+ // first slug
+ int64_t K = 1000;
+
+ ASSERT_OK(this->builder_->AppendValues(draws.data(), K, valid_bytes.data()));
+
+ std::shared_ptr<Array> out;
+ FinishAndCheckPadding(this->builder_.get(), &out);
+}
+
TYPED_TEST(TestPrimitiveBuilder, TestAppendValuesStdBool) {
// ARROW-1383
DECL_T();
@@ -924,7 +997,7 @@ TEST_F(TestStringArray, CompareNullByteSlots) {
ASSERT_OK(builder3.Append("baz"));
std::shared_ptr<Array> array, array2, array3;
- ASSERT_OK(builder.Finish(&array));
+ FinishAndCheckPadding(&builder, &array);
ASSERT_OK(builder2.Finish(&array2));
ASSERT_OK(builder3.Finish(&array3));
@@ -971,7 +1044,7 @@ class TestStringBuilder : public TestBuilder {
void Done() {
std::shared_ptr<Array> out;
- EXPECT_OK(builder_->Finish(&out));
+ FinishAndCheckPadding(builder_.get(), &out);
result_ = std::dynamic_pointer_cast<StringArray>(out);
ASSERT_OK(ValidateArray(*result_));
@@ -1207,6 +1280,21 @@ TEST_F(TestBinaryArray, TestGetValue) {
}
}
+TEST_F(TestBinaryArray, TestNullValuesInitialized) {
+ for (size_t i = 0; i < expected_.size(); ++i) {
+ if (valid_bytes_[i] == 0) {
+ ASSERT_TRUE(strings_->IsNull(i));
+ } else {
+ int32_t len = -1;
+ const uint8_t* bytes = strings_->GetValue(i, &len);
+ ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len));
+ }
+ }
+ TestInitialized(*strings_);
+}
+
+TEST_F(TestBinaryArray, TestPaddingZeroed) { AssertZeroPadded(*strings_); }
+
TEST_F(TestBinaryArray, TestGetString) {
for (size_t i = 0; i < expected_.size(); ++i) {
if (valid_bytes_[i] == 0) {
@@ -1227,7 +1315,7 @@ TEST_F(TestBinaryArray, TestEqualsEmptyStrings) {
}
std::shared_ptr<Array> left_arr;
- ASSERT_OK(builder.Finish(&left_arr));
+ FinishAndCheckPadding(&builder, &left_arr);
const BinaryArray& left = checked_cast<const BinaryArray&>(*left_arr);
std::shared_ptr<Array> right =
@@ -1247,7 +1335,7 @@ class TestBinaryBuilder : public TestBuilder {
void Done() {
std::shared_ptr<Array> out;
- EXPECT_OK(builder_->Finish(&out));
+ FinishAndCheckPadding(builder_.get(), &out);
result_ = std::dynamic_pointer_cast<BinaryArray>(out);
ASSERT_OK(ValidateArray(*result_));
@@ -1329,7 +1417,9 @@ TEST_F(TestBinaryBuilder, TestCapacityReserve) {
ASSERT_EQ(reps * N, result_->length());
ASSERT_EQ(0, result_->null_count());
ASSERT_EQ(reps * 40, result_->value_data()->size());
- ASSERT_EQ(expected_capacity, result_->value_data()->capacity());
+
+ // Capacity is shrunk after `Finish`
+ ASSERT_EQ(640, result_->value_data()->capacity());
}
TEST_F(TestBinaryBuilder, TestZeroLength) {
@@ -1364,7 +1454,7 @@ void CheckSliceEquality() {
}
std::shared_ptr<Array> array;
- ASSERT_OK(builder.Finish(&array));
+ FinishAndCheckPadding(&builder, &array);
std::shared_ptr<Array> slice, slice2;
@@ -1451,7 +1541,7 @@ TEST_F(TestFWBinaryArray, Builder) {
}
}
- ASSERT_OK(builder_->Finish(&result));
+ FinishAndCheckPadding(builder_.get(), &result);
CheckResult(*result);
// Build using batch API
@@ -1462,7 +1552,8 @@ TEST_F(TestFWBinaryArray, Builder) {
ASSERT_OK(builder_->AppendValues(raw_data, 50, raw_is_valid));
ASSERT_OK(
builder_->AppendValues(raw_data + 50 * byte_width, length - 50, raw_is_valid + 50));
- ASSERT_OK(builder_->Finish(&result));
+ FinishAndCheckPadding(builder_.get(), &result);
+
CheckResult(*result);
// Build from std::string
@@ -1531,6 +1622,20 @@ TEST_F(TestFWBinaryArray, ZeroSize) {
ASSERT_EQ(3, array->null_count());
}
+TEST_F(TestFWBinaryArray, ZeroPadding) {
+ auto type = fixed_size_binary(4);
+ FixedSizeBinaryBuilder builder(type);
+
+ ASSERT_OK(builder.Append("foo1"));
+ ASSERT_OK(builder.AppendNull());
+ ASSERT_OK(builder.Append("foo2"));
+ ASSERT_OK(builder.AppendNull());
+ ASSERT_OK(builder.Append("foo3"));
+
+ std::shared_ptr<Array> array;
+ FinishAndCheckPadding(&builder, &array);
+}
+
TEST_F(TestFWBinaryArray, Slice) {
auto type = fixed_size_binary(4);
FixedSizeBinaryBuilder builder(type);
@@ -1581,7 +1686,7 @@ class TestAdaptiveIntBuilder : public TestBuilder {
builder_ = std::make_shared<AdaptiveIntBuilder>(pool_);
}
- void Done() { EXPECT_OK(builder_->Finish(&result_)); }
+ void Done() { FinishAndCheckPadding(builder_.get(), &result_); }
protected:
std::shared_ptr<AdaptiveIntBuilder> builder_;
@@ -1702,6 +1807,38 @@ TEST_F(TestAdaptiveIntBuilder, TestAppendValues) {
ASSERT_TRUE(expected_->Equals(result_));
}
+TEST_F(TestAdaptiveIntBuilder, TestAssertZeroPadded) {
+ std::vector<int64_t> values(
+ {0, static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1});
+ ASSERT_OK(builder_->AppendValues(values.data(), values.size()));
+ Done();
+}
+
+TEST_F(TestAdaptiveIntBuilder, TestAppendNull) {
+ int64_t size = 1000;
+ for (unsigned index = 0; index < size; ++index) {
+ ASSERT_OK(builder_->AppendNull());
+ }
+
+ Done();
+
+ for (unsigned index = 0; index < size; ++index) {
+ ASSERT_TRUE(result_->IsNull(index));
+ }
+}
+
+TEST_F(TestAdaptiveIntBuilder, TestAppendNulls) {
+ constexpr int64_t size = 10;
+ const uint8_t nullmap[size] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+ ASSERT_OK(builder_->AppendNulls(nullmap, size));
+
+ Done();
+
+ for (unsigned index = 0; index < size; ++index) {
+ ASSERT_EQ(result_->IsValid(index), static_cast<bool>(nullmap[index]));
+ }
+}
+
class TestAdaptiveUIntBuilder : public TestBuilder {
public:
void SetUp() {
@@ -1709,7 +1846,7 @@ class TestAdaptiveUIntBuilder : public TestBuilder {
builder_ = std::make_shared<AdaptiveUIntBuilder>(pool_);
}
- void Done() { EXPECT_OK(builder_->Finish(&result_)); }
+ void Done() { FinishAndCheckPadding(builder_.get(), &result_); }
protected:
std::shared_ptr<AdaptiveUIntBuilder> builder_;
@@ -1797,6 +1934,38 @@ TEST_F(TestAdaptiveUIntBuilder, TestAppendValues) {
ASSERT_TRUE(expected_->Equals(result_));
}
+TEST_F(TestAdaptiveUIntBuilder, TestAssertZeroPadded) {
+ std::vector<uint64_t> values(
+ {0, static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) + 1});
+ ASSERT_OK(builder_->AppendValues(values.data(), values.size()));
+ Done();
+}
+
+TEST_F(TestAdaptiveUIntBuilder, TestAppendNull) {
+ int64_t size = 1000;
+ for (unsigned index = 0; index < size; ++index) {
+ ASSERT_OK(builder_->AppendNull());
+ }
+
+ Done();
+
+ for (unsigned index = 0; index < size; ++index) {
+ ASSERT_TRUE(result_->IsNull(index));
+ }
+}
+
+TEST_F(TestAdaptiveUIntBuilder, TestAppendNulls) {
+ constexpr int64_t size = 10;
+ const uint8_t nullmap[size] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+ ASSERT_OK(builder_->AppendNulls(nullmap, size));
+
+ Done();
+
+ for (unsigned index = 0; index < size; ++index) {
+ ASSERT_EQ(result_->IsValid(index), static_cast<bool>(nullmap[index]));
+ }
+}
+
// ----------------------------------------------------------------------
// Dictionary tests
@@ -1894,7 +2063,7 @@ TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) {
// Finalize result
std::shared_ptr<Array> result;
- ASSERT_OK(builder.Finish(&result));
+ FinishAndCheckPadding(&builder, &result);
// Finalize expected data
std::shared_ptr<Array> dict_array;
@@ -1916,7 +2085,7 @@ TYPED_TEST(TestDictionaryBuilder, DeltaDictionary) {
ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
std::shared_ptr<Array> result;
- ASSERT_OK(builder.Finish(&result));
+ FinishAndCheckPadding(&builder, &result);
// Build expected data for the initial dictionary
NumericBuilder<TypeParam> dict_builder1;
@@ -1975,7 +2144,7 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) {
ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
std::shared_ptr<Array> result;
- ASSERT_OK(builder.Finish(&result));
+ FinishAndCheckPadding(&builder, &result);
// Build expected data for the initial dictionary
NumericBuilder<TypeParam> dict_builder1;
@@ -2108,7 +2277,7 @@ TEST(TestStringDictionaryBuilder, DoubleTableSize) {
// Finalize result
std::shared_ptr<Array> result;
- ASSERT_OK(builder.Finish(&result));
+ FinishAndCheckPadding(&builder, &result);
// Finalize expected data
std::shared_ptr<Array> str_array;
@@ -2155,7 +2324,7 @@ TEST(TestStringDictionaryBuilder, DeltaDictionary) {
ASSERT_OK(builder.Append("test2"));
std::shared_ptr<Array> result_delta;
- ASSERT_OK(builder.Finish(&result_delta));
+ FinishAndCheckPadding(&builder, &result_delta);
// Build expected data
StringBuilder str_builder2;
@@ -2192,7 +2361,7 @@ TEST(TestStringDictionaryBuilder, BigDeltaDictionary) {
}
std::shared_ptr<Array> result;
- ASSERT_OK(builder.Finish(&result));
+ FinishAndCheckPadding(&builder, &result);
std::shared_ptr<Array> str_array1;
ASSERT_OK(str_builder1.Finish(&str_array1));
@@ -2272,7 +2441,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
ASSERT_OK(builder.Append(test.data()));
std::shared_ptr<Array> result;
- ASSERT_OK(builder.Finish(&result));
+ FinishAndCheckPadding(&builder, &result);
// Build expected data
FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
@@ -2306,7 +2475,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) {
ASSERT_OK(builder.Append(test.data()));
std::shared_ptr<Array> result1;
- ASSERT_OK(builder.Finish(&result1));
+ FinishAndCheckPadding(&builder, &result1);
// Build expected data
FixedSizeBinaryBuilder fsb_builder1(arrow::fixed_size_binary(4));
@@ -2332,7 +2501,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) {
ASSERT_OK(builder.Append(test3.data()));
std::shared_ptr<Array> result2;
- ASSERT_OK(builder.Finish(&result2));
+ FinishAndCheckPadding(&builder, &result2);
// Build expected data
FixedSizeBinaryBuilder fsb_builder2(arrow::fixed_size_binary(4));
@@ -2509,7 +2678,7 @@ class TestListArray : public TestBuilder {
void Done() {
std::shared_ptr<Array> out;
- EXPECT_OK(builder_->Finish(&out));
+ FinishAndCheckPadding(builder_.get(), &out);
result_ = std::dynamic_pointer_cast<ListArray>(out);
}
@@ -2965,7 +3134,7 @@ class TestStructBuilder : public TestBuilder {
void Done() {
std::shared_ptr<Array> out;
- ASSERT_OK(builder_->Finish(&out));
+ FinishAndCheckPadding(builder_.get(), &out);
result_ = std::dynamic_pointer_cast<StructArray>(out);
}
@@ -3146,7 +3315,7 @@ TEST_F(TestStructBuilder, TestEquality) {
int_vb->UnsafeAppend(value);
}
- ASSERT_OK(builder_->Finish(&array));
+ FinishAndCheckPadding(builder_.get(), &array);
ASSERT_OK(builder_->Resize(list_lengths.size()));
ASSERT_OK(char_vb->Resize(list_values.size()));
@@ -3273,7 +3442,7 @@ TEST_F(TestStructBuilder, TestSlice) {
for (int32_t value : int_values) {
int_vb->UnsafeAppend(value);
}
- ASSERT_OK(builder_->Finish(&array));
+ FinishAndCheckPadding(builder_.get(), &array);
std::shared_ptr<StructArray> slice, slice2;
std::shared_ptr<Int32Array> int_field;
@@ -3348,6 +3517,9 @@ TEST(TestUnionArrayAdHoc, TestSliceEquals) {
ASSERT_TRUE(slice->Equals(slice2));
ASSERT_TRUE(array->RangeEquals(1, 6, 0, slice));
+
+ AssertZeroPadded(*array);
+ TestInitialized(*array);
};
CheckUnion(batch->column(1));
@@ -3392,7 +3564,7 @@ class DecimalTest : public ::testing::TestWithParam<int> {
}
std::shared_ptr<Array> out;
- ASSERT_OK(builder->Finish(&out));
+ FinishAndCheckPadding(builder.get(), &out);
std::vector<uint8_t> raw_bytes;
@@ -3410,8 +3582,7 @@ class DecimalTest : public ::testing::TestWithParam<int> {
std::shared_ptr<Array> lhs = out->Slice(offset);
std::shared_ptr<Array> rhs = expected->Slice(offset);
- bool result = lhs->Equals(rhs);
- ASSERT_TRUE(result);
+ ASSERT_TRUE(lhs->Equals(rhs));
}
};
@@ -3461,6 +3632,13 @@ TEST(TestRechunkArraysConsistently, Trivial) {
groups = {{a1, a2}, {}, {b1}};
rechunked = internal::RechunkArraysConsistently(groups);
ASSERT_EQ(rechunked.size(), 3);
+
+ for (auto& arrvec : rechunked) {
+ for (auto& arr : arrvec) {
+ AssertZeroPadded(*arr);
+ TestInitialized(*arr);
+ }
+ }
}
TEST(TestRechunkArraysConsistently, Plain) {
@@ -3507,6 +3685,13 @@ TEST(TestRechunkArraysConsistently, Plain) {
ASSERT_ARRAYS_EQUAL(*rb[3], *expected);
ArrayFromVector<Int32Type, int32_t>({48, 49}, &expected);
ASSERT_ARRAYS_EQUAL(*rb[4], *expected);
+
+ for (auto& arrvec : rechunked) {
+ for (auto& arr : arrvec) {
+ AssertZeroPadded(*arr);
+ TestInitialized(*arr);
+ }
+ }
}
} // namespace arrow
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index e79fc88..73db8b1 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -304,11 +304,8 @@ class ARROW_EXPORT BufferBuilder {
size_ += length;
}
- Status Finish(std::shared_ptr<Buffer>* out) {
- // Do not shrink to fit to avoid unneeded realloc
- if (size_ > 0) {
- RETURN_NOT_OK(buffer_->Resize(size_, false));
- }
+ Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+ RETURN_NOT_OK(Resize(size_, shrink_to_fit));
*out = buffer_;
Reset();
return Status::OK();
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 9a8fc7d..b40453c 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -238,6 +238,7 @@ Status PrimitiveBuilder<T>::Init(int64_t capacity) {
RETURN_NOT_OK(data_->Resize(nbytes));
raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
+
return Status::OK();
}
@@ -326,13 +327,29 @@ Status PrimitiveBuilder<T>::Append(const std::vector<value_type>& values) {
return AppendValues(values);
}
+namespace {
+
+Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) {
+ if (buffer) {
+ if (bytes_filled < buffer->size()) {
+ // Trim buffer
+ RETURN_NOT_OK(buffer->Resize(bytes_filled));
+ }
+ // zero the padding
+ memset(buffer->mutable_data() + bytes_filled, 0, buffer->capacity() - bytes_filled);
+ } else {
+ DCHECK_EQ(bytes_filled, 0);
+ }
+ return Status::OK();
+}
+
+} // namespace
+
template <typename T>
Status PrimitiveBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
- const int64_t bytes_required = TypeTraits<T>::bytes_required(length_);
- if (bytes_required > 0 && bytes_required < data_->size()) {
- // Trim buffers
- RETURN_NOT_OK(data_->Resize(bytes_required));
- }
+ RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
+ RETURN_NOT_OK(TrimBuffer(TypeTraits<T>::bytes_required(length_), data_.get()));
+
*out = ArrayData::Make(type_, length_, {null_bitmap_, data_}, null_count_);
data_ = null_bitmap_ = nullptr;
@@ -391,12 +408,6 @@ Status AdaptiveIntBuilderBase::Resize(int64_t capacity) {
AdaptiveIntBuilder::AdaptiveIntBuilder(MemoryPool* pool) : AdaptiveIntBuilderBase(pool) {}
Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- const int64_t bytes_required = length_ * int_size_;
- if (bytes_required > 0 && bytes_required < data_->size()) {
- // Trim buffers
- RETURN_NOT_OK(data_->Resize(bytes_required));
- }
-
std::shared_ptr<DataType> output_type;
switch (int_size_) {
case 1:
@@ -416,6 +427,9 @@ Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
}
+ RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
+ RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));
+
*out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_);
data_ = null_bitmap_ = nullptr;
@@ -553,11 +567,6 @@ AdaptiveUIntBuilder::AdaptiveUIntBuilder(MemoryPool* pool)
: AdaptiveIntBuilderBase(pool) {}
Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- const int64_t bytes_required = length_ * int_size_;
- if (bytes_required > 0 && bytes_required < data_->size()) {
- // Trim buffers
- RETURN_NOT_OK(data_->Resize(bytes_required));
- }
std::shared_ptr<DataType> output_type;
switch (int_size_) {
case 1:
@@ -577,6 +586,9 @@ Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
}
+ RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
+ RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));
+
*out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_);
data_ = null_bitmap_ = nullptr;
@@ -726,6 +738,8 @@ Status BooleanBuilder::Init(int64_t capacity) {
RETURN_NOT_OK(data_->Resize(nbytes));
raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
+ memset(raw_data_, 0, static_cast<size_t>(nbytes));
+
return Status::OK();
}
@@ -743,27 +757,25 @@ Status BooleanBuilder::Resize(int64_t capacity) {
const int64_t old_bytes = data_->size();
const int64_t new_bytes = BitUtil::BytesForBits(capacity);
- if (new_bytes != old_bytes) {
+ if (new_bytes > old_bytes) {
RETURN_NOT_OK(data_->Resize(new_bytes));
raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
+ memset(raw_data_ + old_bytes, 0, static_cast<size_t>(new_bytes - old_bytes));
}
}
return Status::OK();
}
Status BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- const int64_t bytes_required = BitUtil::BytesForBits(length_);
- if (bytes_required > 0 && bytes_required < data_->size()) {
- // Trim buffers
- RETURN_NOT_OK(data_->Resize(bytes_required));
- }
-
int64_t bit_offset = length_ % 8;
if (bit_offset > 0) {
// Adjust last byte
data_->mutable_data()[length_ / 8] &= BitUtil::kPrecedingBitmask[bit_offset];
}
+ RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
+ RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get()));
+
*out = ArrayData::Make(boolean(), length_, {null_bitmap_, data_}, null_count_);
data_ = null_bitmap_ = nullptr;
@@ -1334,6 +1346,7 @@ Status ListBuilder::Resize(int64_t capacity) {
Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
RETURN_NOT_OK(AppendNextOffset());
+ // Offset padding zeroed by BufferBuilder
std::shared_ptr<Buffer> offsets;
RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
@@ -1422,8 +1435,9 @@ Status BinaryBuilder::AppendNull() {
Status BinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
// Write final offset (values length)
RETURN_NOT_OK(AppendNextOffset());
- std::shared_ptr<Buffer> offsets, value_data;
+ // These buffers' padding zeroed by BufferBuilder
+ std::shared_ptr<Buffer> offsets, value_data;
RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
@@ -1615,6 +1629,7 @@ StructBuilder::StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool*
}
Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
*out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_);
(*out)->child_data.resize(field_builders_.size());
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index bab5764..b22f126 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -197,14 +197,20 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
using ArrayBuilder::Advance;
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
+ /// The memory at the corresponding data slot is set to 0 to prevent uninitialized
+ /// memory access
Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
RETURN_NOT_OK(Reserve(length));
+ memset(raw_data_ + length_, 0,
+ static_cast<size_t>(TypeTraits<Type>::bytes_required(length)));
UnsafeAppendToBitmap(valid_bytes, length);
return Status::OK();
}
Status AppendNull() {
RETURN_NOT_OK(Reserve(1));
+ memset(raw_data_ + length_, 0,
+ static_cast<size_t>(TypeTraits<Type>::bytes_required(1)));
UnsafeAppendToBitmap(false);
return Status::OK();
}
@@ -340,12 +346,14 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
RETURN_NOT_OK(Reserve(length));
+ memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
UnsafeAppendToBitmap(valid_bytes, length);
return Status::OK();
}
Status AppendNull() {
RETURN_NOT_OK(Reserve(1));
+ memset(data_->mutable_data() + length_ * int_size_, 0, int_size_);
UnsafeAppendToBitmap(false);
return Status::OK();
}
@@ -551,12 +559,23 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
RETURN_NOT_OK(Reserve(length));
+ // zero bits starting with the next whole bytes, since the bytes before
+ // should be already initialized
+
+ const int64_t old_bytes = BitUtil::BytesForBits(length_);
+ const int64_t new_bytes = BitUtil::BytesForBits(length_ + length);
+ memset(raw_data_ + old_bytes, 0, new_bytes - old_bytes);
UnsafeAppendToBitmap(valid_bytes, length);
+
return Status::OK();
}
Status AppendNull() {
RETURN_NOT_OK(Reserve(1));
+ if (BitUtil::IsMultipleOf8(length_)) {
+ // zero next byte
+ memset(raw_data_ + (length_ / 8), 0, 1);
+ }
UnsafeAppendToBitmap(false);
return Status::OK();
}
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index f48d728..e63f83a 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -271,9 +271,17 @@ class SequenceBuilder {
RETURN_NOT_OK(AddSubsequence(dict_tag_, dict_data, dict_offsets_, "dict"));
RETURN_NOT_OK(AddSubsequence(set_tag_, set_data, set_offsets_, "set"));
+ std::shared_ptr<Array> types_array;
+ RETURN_NOT_OK(types_.Finish(&types_array));
+ const auto& types = checked_cast<const Int8Array&>(*types_array);
+
+ std::shared_ptr<Array> offsets_array;
+ RETURN_NOT_OK(offsets_.Finish(&offsets_array));
+ const auto& offsets = checked_cast<const Int32Array&>(*offsets_array);
+
auto type = ::arrow::union_(fields_, type_ids_, UnionMode::DENSE);
- out->reset(new UnionArray(type, types_.length(), children_, types_.data(),
- offsets_.data(), nones_.null_bitmap(),
+ out->reset(new UnionArray(type, types.length(), children_, types.values(),
+ offsets.values(), nones_.null_bitmap(),
nones_.null_count()));
return Status::OK();
}
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index a046228..cb5c8b7 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -128,6 +128,8 @@ inline Status CopyBufferFromVector(const std::vector<T>& values, MemoryPool* poo
RETURN_NOT_OK(buffer->Resize(nbytes));
auto immutable_data = reinterpret_cast<const uint8_t*>(values.data());
std::copy(immutable_data, immutable_data + nbytes, buffer->mutable_data());
+ memset(buffer->mutable_data() + nbytes, 0,
+ static_cast<size_t>(buffer->capacity() - nbytes));
*result = buffer;
return Status::OK();
diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc
index d3b6bb3..e8f47bb 100644
--- a/cpp/src/arrow/util/bit-util.cc
+++ b/cpp/src/arrow/util/bit-util.cc
@@ -50,7 +50,7 @@ Status BitUtil::BytesToBits(const std::vector<uint8_t>& bytes, MemoryPool* pool,
std::shared_ptr<Buffer> buffer;
RETURN_NOT_OK(AllocateBuffer(pool, bit_length, &buffer));
uint8_t* out_buf = buffer->mutable_data();
- memset(out_buf, 0, static_cast<size_t>(bit_length));
+ memset(out_buf, 0, static_cast<size_t>(buffer->capacity()));
FillBitsFromBytes(bytes, out_buf);
*out = buffer;
diff --git a/cpp/valgrind.supp b/cpp/valgrind.supp
index dc78eea..8e707e3 100644
--- a/cpp/valgrind.supp
+++ b/cpp/valgrind.supp
@@ -21,10 +21,4 @@
Memcheck:Cond
fun:*CastFunctor*BooleanType*
}
-{
- # Data can be uninitialized when its null bit is set, but we write raw buffers
- <write_unitialized_data>
- Memcheck:Param
- write(buf)
- fun:__write_nocancel
-}
+
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 94986a7..e484ebb 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -717,3 +717,10 @@ def test_tensor_alignment():
ys = pa.deserialize(pa.serialize(xs).to_buffer())
for y in ys:
assert y.ctypes.data % 64 == 0
+
+
+def test_serialization_determinism():
+ for obj in COMPLEX_OBJECTS:
+ buf1 = pa.serialize(obj).to_buffer()
+ buf2 = pa.serialize(obj).to_buffer()
+ assert buf1.to_pybytes() == buf2.to_pybytes()