You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2018/12/10 13:05:25 UTC
[arrow] branch master updated: ARROW-3792: [C++] Writing a
list-type chunked column to Parquet fails if any chunk is 0-length
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d6284cf ARROW-3792: [C++] Writing a list-type chunked column to Parquet fails if any chunk is 0-length
d6284cf is described below
commit d6284cf89c75f4767996abe087a8eb203401fb6d
Author: Wes McKinney <we...@apache.org>
AuthorDate: Mon Dec 10 14:05:14 2018 +0100
ARROW-3792: [C++] Writing a list-type chunked column to Parquet fails if any chunk is 0-length
Thanks to @tanyaschlusser for providing a minimal reproduction to help find the underlying problem
Author: Wes McKinney <we...@apache.org>
Closes #3141 from wesm/ARROW-3792 and squashes the following commits:
1ed82a57 <Wes McKinney> Add test case and fix
---
cpp/src/parquet/arrow/writer.cc | 5 +++++
python/pyarrow/tests/test_parquet.py | 33 +++++++++++++++++++++++++++++++--
2 files changed, 36 insertions(+), 2 deletions(-)
diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
index ef5de07..402cbf0 100644
--- a/cpp/src/parquet/arrow/writer.cc
+++ b/cpp/src/parquet/arrow/writer.cc
@@ -861,6 +861,11 @@ Status ArrowColumnWriter::TypedWriteBatch<FLBAType, ::arrow::Decimal128Type>(
}
Status ArrowColumnWriter::Write(const Array& data) {
+ if (data.length() == 0) {
+ // Write nothing when length is 0
+ return Status::OK();
+ }
+
::arrow::Type::type values_type;
RETURN_NOT_OK(GetLeafType(*data.type(), &values_type));
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index c14056e..89d3224 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+from collections import OrderedDict
import datetime
import decimal
import io
@@ -2224,6 +2225,34 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
def test_writing_empty_lists():
# ARROW-2591: [Python] Segmentation fault issue in pq.write_table
- arr = pa.array([[], []], pa.list_(pa.int32()))
- table = pa.Table.from_arrays([arr], ['test'])
+ arr1 = pa.array([[], []], pa.list_(pa.int32()))
+ table = pa.Table.from_arrays([arr1], ['list(int32)'])
_check_roundtrip(table)
+
+
+def test_write_nested_zero_length_array_chunk_failure():
+ # Bug report in ARROW-3792
+ cols = OrderedDict(
+ int32=pa.int32(),
+ list_string=pa.list_(pa.string())
+ )
+ data = [[], [OrderedDict(int32=1, list_string=('G',)), ]]
+
+ # This produces a table with a column like
+ # <Column name='list_string' type=ListType(list<item: string>)>
+ # [
+ # [],
+ # [
+ # [
+ # "G"
+ # ]
+ # ]
+ # ]
+ #
+ # Each column is a ChunkedArray with 2 elements
+ my_arrays = [pa.array(batch, type=pa.struct(cols)).flatten()
+ for batch in data]
+ my_batches = [pa.RecordBatch.from_arrays(batch, pa.schema(cols))
+ for batch in my_arrays]
+ tbl = pa.Table.from_batches(my_batches, pa.schema(cols))
+ _check_roundtrip(tbl)