You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/07/23 21:08:33 UTC

[arrow] branch master updated: ARROW-2744: [C++] Avoid creating list arrays with a null values buffer

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 491114b  ARROW-2744: [C++] Avoid creating list arrays with a null values buffer
491114b is described below

commit 491114bd284cecd542c4ae341178e13a48a815de
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Mon Jul 23 17:08:28 2018 -0400

    ARROW-2744: [C++] Avoid creating list arrays with a null values buffer
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #2243 from pitrou/ARROW-2744 and squashes the following commits:
    
    715189a0 <Antoine Pitrou> ARROW-2744:  Avoid creating list arrays with a null values buffer
---
 cpp/src/arrow/array-test.cc          |  2 ++
 cpp/src/arrow/array.cc               | 58 +++++++++++++++++++++++-------------
 cpp/src/arrow/builder.cc             |  9 ++++++
 cpp/src/arrow/ipc/test-common.h      |  2 ++
 python/pyarrow/tests/test_parquet.py | 12 ++++++--
 5 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 44270d1..df6051e 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -2919,6 +2919,8 @@ TEST_F(TestListArray, TestAppendNull) {
 
   auto values = result_->values();
   ASSERT_EQ(0, values->length());
+  // Values buffer should be non-null
+  ASSERT_NE(nullptr, values->data()->buffers[1]);
 }
 
 void ValidateBasicListArray(const ListArray* result, const vector<int32_t>& values,
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index fb6ccaf..6222e37 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -664,12 +664,30 @@ namespace internal {
 struct ValidateVisitor {
   Status Visit(const NullArray&) { return Status::OK(); }
 
-  Status Visit(const PrimitiveArray&) { return Status::OK(); }
+  Status Visit(const PrimitiveArray& array) {
+    if (array.data()->buffers.size() != 2) {
+      return Status::Invalid("number of buffers was != 2");
+    }
+    if (array.values() == nullptr) {
+      return Status::Invalid("values was null");
+    }
+    return Status::OK();
+  }
 
-  Status Visit(const Decimal128Array&) { return Status::OK(); }
+  Status Visit(const Decimal128Array& array) {
+    if (array.data()->buffers.size() != 2) {
+      return Status::Invalid("number of buffers was != 2");
+    }
+    if (array.values() == nullptr) {
+      return Status::Invalid("values was null");
+    }
+    return Status::OK();
+  }
 
-  Status Visit(const BinaryArray&) {
-    // TODO(wesm): what to do here?
+  Status Visit(const BinaryArray& array) {
+    if (array.data()->buffers.size() != 3) {
+      return Status::Invalid("number of buffers was != 3");
+    }
     return Status::OK();
   }
 
@@ -688,24 +706,24 @@ struct ValidateVisitor {
          << " isn't large enough for length: " << array.length();
       return Status::Invalid(ss.str());
     }
+
+    if (!array.values()) {
+      return Status::Invalid("values was null");
+    }
+
     const int32_t last_offset = array.value_offset(array.length());
-    if (last_offset > 0) {
-      if (!array.values()) {
-        return Status::Invalid("last offset was non-zero and values was null");
-      }
-      if (array.values()->length() != last_offset) {
-        std::stringstream ss;
-        ss << "Final offset invariant not equal to values length: " << last_offset
-           << "!=" << array.values()->length();
-        return Status::Invalid(ss.str());
-      }
+    if (array.values()->length() != last_offset) {
+      std::stringstream ss;
+      ss << "Final offset invariant not equal to values length: " << last_offset
+         << "!=" << array.values()->length();
+      return Status::Invalid(ss.str());
+    }
 
-      const Status child_valid = ValidateArray(*array.values());
-      if (!child_valid.ok()) {
-        std::stringstream ss;
-        ss << "Child array invalid: " << child_valid.ToString();
-        return Status::Invalid(ss.str());
-      }
+    const Status child_valid = ValidateArray(*array.values());
+    if (!child_valid.ok()) {
+      std::stringstream ss;
+      ss << "Child array invalid: " << child_valid.ToString();
+      return Status::Invalid(ss.str());
     }
 
     int32_t prev_offset = array.value_offset(0);
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 1b77945..152f1fc 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -54,6 +54,7 @@ Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) {
     // zero the padding
     buffer->ZeroPadding();
   } else {
+    // Null buffers are allowed in place of 0-byte buffers
     DCHECK_EQ(bytes_filled, 0);
   }
   return Status::OK();
@@ -1336,6 +1337,10 @@ Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
   if (values_) {
     items = values_->data();
   } else {
+    if (value_builder_->length() == 0) {
+      // Try to make sure we get a non-null values buffer (ARROW-2744)
+      RETURN_NOT_OK(value_builder_->Resize(0));
+    }
     RETURN_NOT_OK(value_builder_->FinishInternal(&items));
   }
 
@@ -1632,6 +1637,10 @@ Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
 
   (*out)->child_data.resize(field_builders_.size());
   for (size_t i = 0; i < field_builders_.size(); ++i) {
+    if (length_ == 0) {
+      // Try to make sure the child buffers are initialized
+      RETURN_NOT_OK(field_builders_[i]->Resize(0));
+    }
     RETURN_NOT_OK(field_builders_[i]->FinishInternal(&(*out)->child_data[i]));
   }
 
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index 2f0237e..4e9fab1 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -100,6 +100,7 @@ Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool
   std::shared_ptr<PoolBuffer> data;
   RETURN_NOT_OK(test::MakeRandomInt32PoolBuffer(length, pool, &data));
   Int32Builder builder(int32(), pool);
+  RETURN_NOT_OK(builder.Init(length));
   if (include_nulls) {
     std::shared_ptr<PoolBuffer> valid_bytes;
     RETURN_NOT_OK(test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes));
@@ -127,6 +128,7 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
   std::vector<int32_t> offsets(
       num_lists + 1, 0);  // +1 so we can shift for nulls. See partial sum below.
   const uint32_t seed = static_cast<uint32_t>(child_array->length());
+
   if (num_lists > 0) {
     test::rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data());
     // make sure sizes are consistent with null
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index d6ca7dd..21610d9 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -158,7 +158,7 @@ def test_pandas_parquet_2_0_rountrip(tmpdir, chunk_size):
 
 
 @parquet
-def test_chunked_table_write(tmpdir):
+def test_chunked_table_write():
     # ARROW-232
     df = alltypes_sample(size=10)
 
@@ -176,7 +176,7 @@ def test_chunked_table_write(tmpdir):
 
 
 @parquet
-def test_empty_table_roundtrip(tmpdir):
+def test_empty_table_roundtrip():
     df = alltypes_sample(size=10)
     # The nanosecond->us conversion is a nuisance, so we just avoid it here
     del df['datetime']
@@ -193,6 +193,14 @@ def test_empty_table_roundtrip(tmpdir):
 
 
 @parquet
+def test_empty_lists_table_roundtrip():
+    # ARROW-2744: Shouldn't crash when writing an array of empty lists
+    arr = pa.array([[], []], type=pa.list_(pa.int32()))
+    table = pa.Table.from_arrays([arr], ["A"])
+    _check_roundtrip(table)
+
+
+@parquet
 def test_pandas_parquet_datetime_tz():
     import pyarrow.parquet as pq