Posted to commits@arrow.apache.org by ks...@apache.org on 2018/12/10 13:05:25 UTC

[arrow] branch master updated: ARROW-3792: [C++] Writing a list-type chunked column to Parquet fails if any chunk is 0-length

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d6284cf  ARROW-3792: [C++] Writing a list-type chunked column to Parquet fails if any chunk is 0-length
d6284cf is described below

commit d6284cf89c75f4767996abe087a8eb203401fb6d
Author: Wes McKinney <we...@apache.org>
AuthorDate: Mon Dec 10 14:05:14 2018 +0100

    ARROW-3792: [C++] Writing a list-type chunked column to Parquet fails if any chunk is 0-length
    
    Thanks to @tanyaschlusser for providing a minimal reproduction that helped find the underlying problem
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #3141 from wesm/ARROW-3792 and squashes the following commits:
    
    1ed82a57 <Wes McKinney> Add test case and fix
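    
    A reproduction along the lines of the report (a sketch, not the
    exact script from the report; the file and column names are
    illustrative):
    
        import pyarrow as pa
        import pyarrow.parquet as pq
    
        typ = pa.list_(pa.string())
        # One empty record batch plus one non-empty batch gives each
        # table column a ChunkedArray containing a 0-length chunk.
        batches = [
            pa.RecordBatch.from_arrays([pa.array([], type=typ)], ['col']),
            pa.RecordBatch.from_arrays([pa.array([['G']], type=typ)], ['col']),
        ]
        tbl = pa.Table.from_batches(batches)
        pq.write_table(tbl, 'repro.parquet')  # failed before this fix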
---
 cpp/src/parquet/arrow/writer.cc      |  5 +++++
 python/pyarrow/tests/test_parquet.py | 33 +++++++++++++++++++++++++++++++--
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
index ef5de07..402cbf0 100644
--- a/cpp/src/parquet/arrow/writer.cc
+++ b/cpp/src/parquet/arrow/writer.cc
@@ -861,6 +861,11 @@ Status ArrowColumnWriter::TypedWriteBatch<FLBAType, ::arrow::Decimal128Type>(
 }
 
 Status ArrowColumnWriter::Write(const Array& data) {
+  if (data.length() == 0) {
+    // Write nothing when length is 0
+    return Status::OK();
+  }
+
   ::arrow::Type::type values_type;
   RETURN_NOT_OK(GetLeafType(*data.type(), &values_type));
 
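
The guard above short-circuits ArrowColumnWriter::Write before leaf-type
resolution, so a 0-length chunk simply writes no batches. With the fix
applied, the failing pattern round-trips from Python; a minimal check
(a sketch; the output path is hypothetical):

    import pyarrow as pa
    import pyarrow.parquet as pq

    typ = pa.list_(pa.int32())
    chunks = [pa.array([], type=typ), pa.array([[1, 2]], type=typ)]
    batches = [pa.RecordBatch.from_arrays([c], ['c0']) for c in chunks]
    tbl = pa.Table.from_batches(batches)
    pq.write_table(tbl, 'out.parquet')  # raised here before the fix
    # Table equality is value-based, so the differing chunk layout
    # after the read does not affect the comparison.
    assert pq.read_table('out.parquet').equals(tbl)
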
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index c14056e..89d3224 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from collections import OrderedDict
 import datetime
 import decimal
 import io
@@ -2224,6 +2225,34 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
 
 def test_writing_empty_lists():
     # ARROW-2591: [Python] Segmentation fault issue in pq.write_table
-    arr = pa.array([[], []], pa.list_(pa.int32()))
-    table = pa.Table.from_arrays([arr], ['test'])
+    arr1 = pa.array([[], []], pa.list_(pa.int32()))
+    table = pa.Table.from_arrays([arr1], ['list(int32)'])
     _check_roundtrip(table)
+
+
+def test_write_nested_zero_length_array_chunk_failure():
+    # Bug report in ARROW-3792
+    cols = OrderedDict(
+        int32=pa.int32(),
+        list_string=pa.list_(pa.string())
+    )
+    data = [[], [OrderedDict(int32=1, list_string=('G',)), ]]
+
+    # This produces a table with a column like
+    # <Column name='list_string' type=ListType(list<item: string>)>
+    # [
+    #   [],
+    #   [
+    #     [
+    #       "G"
+    #     ]
+    #   ]
+    # ]
+    #
+    # Each column is a ChunkedArray with 2 elements
+    my_arrays = [pa.array(batch, type=pa.struct(cols)).flatten()
+                 for batch in data]
+    my_batches = [pa.RecordBatch.from_arrays(batch, pa.schema(cols))
+                  for batch in my_arrays]
+    tbl = pa.Table.from_batches(my_batches, pa.schema(cols))
+    _check_roundtrip(tbl)
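
For reference, the reason each column in the test above ends up as a
ChunkedArray with a 0-length chunk: flattening an empty StructArray
yields one empty child array per field. A quick illustration (a sketch
reusing the test's cols definition):

    import pyarrow as pa
    from collections import OrderedDict

    cols = OrderedDict(int32=pa.int32(), list_string=pa.list_(pa.string()))
    empty_children = pa.array([], type=pa.struct(cols)).flatten()
    print([len(a) for a in empty_children])  # [0, 0]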