Posted to commits@arrow.apache.org by we...@apache.org on 2019/03/25 01:24:38 UTC

[arrow] branch master updated: ARROW-4688: [C++][Parquet] Chunk binary column reads at 2^31 - 1 byte boundaries to avoid splitting chunk inside nested string cell

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new fc7d07b  ARROW-4688: [C++][Parquet] Chunk binary column reads at 2^31 - 1 byte boundaries to avoid splitting chunk inside nested string cell
fc7d07b is described below

commit fc7d07b2bfb04615be095606ac1a5f54ceb04cf5
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sun Mar 24 20:24:25 2019 -0500

    ARROW-4688: [C++][Parquet] Chunk binary column reads at 2^31 - 1 byte boundaries to avoid splitting chunk inside nested string cell
    
    Chunked outputs with nested types still do not work, though; that will have to be addressed later
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #4023 from wesm/ARROW-4688 and squashes the following commits:
    
    c7c846bc <Wes McKinney> Set binary chunksize to 2^31 - 1, fix edge case in ChunkedBinaryBuilder
    17bf55d4 <Wes McKinney> Start of unit test
---
 cpp/src/arrow/array/builder_binary.h   |  4 ++--
 cpp/src/parquet/arrow/record_reader.cc |  4 ++--
 python/pyarrow/tests/test_parquet.py   | 37 ++++++++++++++++++++++++++--------
 3 files changed, 33 insertions(+), 12 deletions(-)
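
As a rough illustration of what this change enables (not part of the commit itself): the new test below writes a list<binary> column whose cells hold about 10MB each, roughly 2.14GB in total, and expects it to roundtrip through Parquet intact. A standalone pyarrow sketch of that roundtrip, using an in-memory buffer and assuming enough RAM for the large allocation:

    import pyarrow as pa
    import pyarrow.parquet as pq

    # ~2.14 GB of binary data spread over 214 list cells of ~10 MB each,
    # mirroring the new test_list_of_binary_large_cell added in this commit.
    data = [[b'x' * 1000000] * 10] * 214

    arr = pa.array(data)
    table = pa.Table.from_arrays([arr], ['chunky_cells'])

    # Write to Parquet and read back through an in-memory buffer.
    stream = pa.BufferOutputStream()
    pq.write_table(table, stream)
    read_back = pq.read_table(pa.BufferReader(stream.getvalue()))

    assert table.equals(read_back)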

diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 3bc930c..c3a459b 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -292,8 +292,8 @@ class ARROW_EXPORT ChunkedBinaryBuilder {
  protected:
   Status NextChunk();
 
-  int32_t max_chunk_size_;
-  int32_t chunk_data_size_;
+  int64_t max_chunk_size_;
+  int64_t chunk_data_size_;
 
   std::unique_ptr<BinaryBuilder> builder_;
   std::vector<std::shared_ptr<Array>> chunks_;
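
With the per-chunk cap raised to 2^31 - 1 bytes, the intermediate sum of the running chunk size and an incoming value's length in the rollover check can exceed the int32 range, which is presumably why the two counters above are widened to int64_t. A simplified Python model of the chunking idea (a sketch with illustrative names, not the actual C++ class):

    INT32_MAX = 2**31 - 1  # 2147483647

    class ChunkedBinaryBuilderModel:
        # A chunk's data buffer is addressed with 32-bit offsets, so one
        # chunk can hold at most 2**31 - 1 bytes.  Python ints play the
        # role of the widened int64 counters: chunk_data_size + len(value)
        # never wraps around during the rollover check.
        def __init__(self, max_chunk_size=INT32_MAX):
            self.max_chunk_size = max_chunk_size
            self.chunk_data_size = 0
            self.current = []   # values in the chunk under construction
            self.chunks = []    # finished chunks

        def append(self, value):
            # Roll to a new chunk *before* appending if the value would not
            # fit, so a single cell is never split across two chunks.
            if self.chunk_data_size + len(value) > self.max_chunk_size:
                self.next_chunk()
            self.current.append(value)
            self.chunk_data_size += len(value)

        def next_chunk(self):
            if self.current:
                self.chunks.append(self.current)
                self.current = []
            self.chunk_data_size = 0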
diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc
index 42334b9..57e50a0 100644
--- a/cpp/src/parquet/arrow/record_reader.cc
+++ b/cpp/src/parquet/arrow/record_reader.cc
@@ -629,8 +629,8 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType> {
  public:
   ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool)
       : TypedRecordReader<ByteArrayType>(descr, pool), builder_(nullptr) {
-    // Maximum of 16MB chunks
-    constexpr int32_t kBinaryChunksize = 1 << 24;
+    // ARROW-4688(wesm): Using 2^31 - 1 byte chunks for now
+    constexpr int32_t kBinaryChunksize = 2147483647;
     DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
     if (descr_->logical_type() == LogicalType::UTF8) {
       builder_.reset(
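
For context (not stated in the diff itself): 2147483647 is INT32_MAX. Arrow binary arrays address their value data with signed 32-bit offsets, so a single chunk's data buffer cannot exceed 2^31 - 1 bytes; columns larger than that surface in Python as a pa.ChunkedArray with multiple chunks, which is what the updated test below asserts. A quick sanity check of the constants involved:

    # The new per-chunk limit equals the largest signed 32-bit integer,
    # i.e. the largest offset an Arrow binary array can store.
    kBinaryChunksize = 2147483647
    assert kBinaryChunksize == 2**31 - 1 == (1 << 31) - 1

    # The previous limit was much smaller: 1 << 24 bytes, i.e. 16 MiB.
    assert 1 << 24 == 16 * 2**20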
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 39479e3..1abfc70 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2091,6 +2091,13 @@ def test_large_table_int32_overflow():
     _write_table(table, f)
 
 
+def _simple_table_roundtrip(table):
+    stream = pa.BufferOutputStream()
+    _write_table(table, stream)
+    buf = stream.getvalue()
+    return _read_table(buf)
+
+
 @pytest.mark.pandas
 @pytest.mark.large_memory
 def test_binary_array_overflow_to_chunked():
@@ -2103,23 +2110,37 @@ def test_binary_array_overflow_to_chunked():
     df = pd.DataFrame({'byte_col': values})
 
     tbl = pa.Table.from_pandas(df, preserve_index=False)
-
-    buf = io.BytesIO()
-    _write_table(tbl, buf)
-    buf.seek(0)
-    read_tbl = _read_table(buf)
-    buf = None
+    read_tbl = _simple_table_roundtrip(tbl)
 
     col0_data = read_tbl[0].data
     assert isinstance(col0_data, pa.ChunkedArray)
 
-    # Split up into 16MB chunks. 128 * 16 = 2048, so 129
-    assert col0_data.num_chunks == 129
+    # Split up into 2GB chunks
+    assert col0_data.num_chunks == 2
 
     assert tbl.equals(read_tbl)
 
 
 @pytest.mark.pandas
+@pytest.mark.large_memory
+def test_list_of_binary_large_cell():
+    # ARROW-4688
+    data = []
+
+    # TODO(wesm): handle chunked children
+    # 2^31 - 1 bytes in a single cell
+    # data.append([b'x' * (1 << 20)] * 2047 + [b'x' * ((1 << 20) - 1)])
+
+    # A little under 2GB total, in cells of approximately 10MB each
+    data.extend([[b'x' * 1000000] * 10] * 214)
+
+    arr = pa.array(data)
+    table = pa.Table.from_arrays([arr], ['chunky_cells'])
+    read_table = _simple_table_roundtrip(table)
+    assert table.equals(read_table)
+
+
+@pytest.mark.pandas
 def test_index_column_name_duplicate(tempdir):
     data = {
         'close': {