You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/07/23 21:08:33 UTC
[arrow] branch master updated: ARROW-2744: [C++] Avoid creating
list arrays with a null values buffer
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 491114b ARROW-2744: [C++] Avoid creating list arrays with a null values buffer
491114b is described below
commit 491114bd284cecd542c4ae341178e13a48a815de
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Mon Jul 23 17:08:28 2018 -0400
ARROW-2744: [C++] Avoid creating list arrays with a null values buffer
Author: Antoine Pitrou <an...@python.org>
Closes #2243 from pitrou/ARROW-2744 and squashes the following commits:
715189a0 <Antoine Pitrou> ARROW-2744: Avoid creating list arrays with a null values buffer
---
cpp/src/arrow/array-test.cc | 2 ++
cpp/src/arrow/array.cc | 58 +++++++++++++++++++++++-------------
cpp/src/arrow/builder.cc | 9 ++++++
cpp/src/arrow/ipc/test-common.h | 2 ++
python/pyarrow/tests/test_parquet.py | 12 ++++++--
5 files changed, 61 insertions(+), 22 deletions(-)
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 44270d1..df6051e 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -2919,6 +2919,8 @@ TEST_F(TestListArray, TestAppendNull) {
auto values = result_->values();
ASSERT_EQ(0, values->length());
+ // Values buffer should be non-null
+ ASSERT_NE(nullptr, values->data()->buffers[1]);
}
void ValidateBasicListArray(const ListArray* result, const vector<int32_t>& values,
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index fb6ccaf..6222e37 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -664,12 +664,30 @@ namespace internal {
struct ValidateVisitor {
Status Visit(const NullArray&) { return Status::OK(); }
- Status Visit(const PrimitiveArray&) { return Status::OK(); }
+ Status Visit(const PrimitiveArray& array) {
+ if (array.data()->buffers.size() != 2) {
+ return Status::Invalid("number of buffers was != 2");
+ }
+ if (array.values() == nullptr) {
+ return Status::Invalid("values was null");
+ }
+ return Status::OK();
+ }
- Status Visit(const Decimal128Array&) { return Status::OK(); }
+ Status Visit(const Decimal128Array& array) {
+ if (array.data()->buffers.size() != 2) {
+ return Status::Invalid("number of buffers was != 2");
+ }
+ if (array.values() == nullptr) {
+ return Status::Invalid("values was null");
+ }
+ return Status::OK();
+ }
- Status Visit(const BinaryArray&) {
- // TODO(wesm): what to do here?
+ Status Visit(const BinaryArray& array) {
+ if (array.data()->buffers.size() != 3) {
+ return Status::Invalid("number of buffers was != 3");
+ }
return Status::OK();
}
@@ -688,24 +706,24 @@ struct ValidateVisitor {
<< " isn't large enough for length: " << array.length();
return Status::Invalid(ss.str());
}
+
+ if (!array.values()) {
+ return Status::Invalid("values was null");
+ }
+
const int32_t last_offset = array.value_offset(array.length());
- if (last_offset > 0) {
- if (!array.values()) {
- return Status::Invalid("last offset was non-zero and values was null");
- }
- if (array.values()->length() != last_offset) {
- std::stringstream ss;
- ss << "Final offset invariant not equal to values length: " << last_offset
- << "!=" << array.values()->length();
- return Status::Invalid(ss.str());
- }
+ if (array.values()->length() != last_offset) {
+ std::stringstream ss;
+ ss << "Final offset invariant not equal to values length: " << last_offset
+ << "!=" << array.values()->length();
+ return Status::Invalid(ss.str());
+ }
- const Status child_valid = ValidateArray(*array.values());
- if (!child_valid.ok()) {
- std::stringstream ss;
- ss << "Child array invalid: " << child_valid.ToString();
- return Status::Invalid(ss.str());
- }
+ const Status child_valid = ValidateArray(*array.values());
+ if (!child_valid.ok()) {
+ std::stringstream ss;
+ ss << "Child array invalid: " << child_valid.ToString();
+ return Status::Invalid(ss.str());
}
int32_t prev_offset = array.value_offset(0);
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 1b77945..152f1fc 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -54,6 +54,7 @@ Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) {
// zero the padding
buffer->ZeroPadding();
} else {
+ // Null buffers are allowed in place of 0-byte buffers
DCHECK_EQ(bytes_filled, 0);
}
return Status::OK();
@@ -1336,6 +1337,10 @@ Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
if (values_) {
items = values_->data();
} else {
+ if (value_builder_->length() == 0) {
+ // Try to make sure we get a non-null values buffer (ARROW-2744)
+ RETURN_NOT_OK(value_builder_->Resize(0));
+ }
RETURN_NOT_OK(value_builder_->FinishInternal(&items));
}
@@ -1632,6 +1637,10 @@ Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
(*out)->child_data.resize(field_builders_.size());
for (size_t i = 0; i < field_builders_.size(); ++i) {
+ if (length_ == 0) {
+ // Try to make sure the child buffers are initialized
+ RETURN_NOT_OK(field_builders_[i]->Resize(0));
+ }
RETURN_NOT_OK(field_builders_[i]->FinishInternal(&(*out)->child_data[i]));
}
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index 2f0237e..4e9fab1 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -100,6 +100,7 @@ Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool
std::shared_ptr<PoolBuffer> data;
RETURN_NOT_OK(test::MakeRandomInt32PoolBuffer(length, pool, &data));
Int32Builder builder(int32(), pool);
+ RETURN_NOT_OK(builder.Init(length));
if (include_nulls) {
std::shared_ptr<PoolBuffer> valid_bytes;
RETURN_NOT_OK(test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes));
@@ -127,6 +128,7 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
std::vector<int32_t> offsets(
num_lists + 1, 0); // +1 so we can shift for nulls. See partial sum below.
const uint32_t seed = static_cast<uint32_t>(child_array->length());
+
if (num_lists > 0) {
test::rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data());
// make sure sizes are consistent with null
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index d6ca7dd..21610d9 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -158,7 +158,7 @@ def test_pandas_parquet_2_0_rountrip(tmpdir, chunk_size):
@parquet
-def test_chunked_table_write(tmpdir):
+def test_chunked_table_write():
# ARROW-232
df = alltypes_sample(size=10)
@@ -176,7 +176,7 @@ def test_chunked_table_write(tmpdir):
@parquet
-def test_empty_table_roundtrip(tmpdir):
+def test_empty_table_roundtrip():
df = alltypes_sample(size=10)
# The nanosecond->us conversion is a nuisance, so we just avoid it here
del df['datetime']
@@ -193,6 +193,14 @@ def test_empty_table_roundtrip(tmpdir):
@parquet
+def test_empty_lists_table_roundtrip():
+ # ARROW-2744: Shouldn't crash when writing an array of empty lists
+ arr = pa.array([[], []], type=pa.list_(pa.int32()))
+ table = pa.Table.from_arrays([arr], ["A"])
+ _check_roundtrip(table)
+
+
+@parquet
def test_pandas_parquet_datetime_tz():
import pyarrow.parquet as pq