You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/03/25 01:24:38 UTC
[arrow] branch master updated: ARROW-4688: [C++][Parquet] Chunk
binary column reads at 2^31 - 1 byte boundaries to avoid splitting chunk
inside nested string cell
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new fc7d07b ARROW-4688: [C++][Parquet] Chunk binary column reads at 2^31 - 1 byte boundaries to avoid splitting chunk inside nested string cell
fc7d07b is described below
commit fc7d07b2bfb04615be095606ac1a5f54ceb04cf5
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sun Mar 24 20:24:25 2019 -0500
ARROW-4688: [C++][Parquet] Chunk binary column reads at 2^31 - 1 byte boundaries to avoid splitting chunk inside nested string cell
Chunked outputs with nested types still do not work, though; that will have to be addressed later.
Author: Wes McKinney <we...@apache.org>
Closes #4023 from wesm/ARROW-4688 and squashes the following commits:
c7c846bc <Wes McKinney> Set binary chunksize to 2^31 - 1, fix edge case in ChunkedBinaryBuilder
17bf55d4 <Wes McKinney> Start of unit test
---
cpp/src/arrow/array/builder_binary.h | 4 ++--
cpp/src/parquet/arrow/record_reader.cc | 4 ++--
python/pyarrow/tests/test_parquet.py | 37 ++++++++++++++++++++++++++--------
3 files changed, 33 insertions(+), 12 deletions(-)
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 3bc930c..c3a459b 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -292,8 +292,8 @@ class ARROW_EXPORT ChunkedBinaryBuilder {
protected:
Status NextChunk();
- int32_t max_chunk_size_;
- int32_t chunk_data_size_;
+ int64_t max_chunk_size_;
+ int64_t chunk_data_size_;
std::unique_ptr<BinaryBuilder> builder_;
std::vector<std::shared_ptr<Array>> chunks_;
diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc
index 42334b9..57e50a0 100644
--- a/cpp/src/parquet/arrow/record_reader.cc
+++ b/cpp/src/parquet/arrow/record_reader.cc
@@ -629,8 +629,8 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType> {
public:
ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool)
: TypedRecordReader<ByteArrayType>(descr, pool), builder_(nullptr) {
- // Maximum of 16MB chunks
- constexpr int32_t kBinaryChunksize = 1 << 24;
+ // ARROW-4688(wesm): Using 2^31 - 1 chunks for now
+ constexpr int32_t kBinaryChunksize = 2147483647;
DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
if (descr_->logical_type() == LogicalType::UTF8) {
builder_.reset(
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 39479e3..1abfc70 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2091,6 +2091,13 @@ def test_large_table_int32_overflow():
_write_table(table, f)
+def _simple_table_roundtrip(table):
+ stream = pa.BufferOutputStream()
+ _write_table(table, stream)
+ buf = stream.getvalue()
+ return _read_table(buf)
+
+
@pytest.mark.pandas
@pytest.mark.large_memory
def test_binary_array_overflow_to_chunked():
@@ -2103,23 +2110,37 @@ def test_binary_array_overflow_to_chunked():
df = pd.DataFrame({'byte_col': values})
tbl = pa.Table.from_pandas(df, preserve_index=False)
-
- buf = io.BytesIO()
- _write_table(tbl, buf)
- buf.seek(0)
- read_tbl = _read_table(buf)
- buf = None
+ read_tbl = _simple_table_roundtrip(tbl)
col0_data = read_tbl[0].data
assert isinstance(col0_data, pa.ChunkedArray)
- # Split up into 16MB chunks. 128 * 16 = 2048, so 129
- assert col0_data.num_chunks == 129
+ # Split up into 2GB chunks
+ assert col0_data.num_chunks == 2
assert tbl.equals(read_tbl)
@pytest.mark.pandas
+@pytest.mark.large_memory
+def test_list_of_binary_large_cell():
+ # ARROW-4688
+ data = []
+
+ # TODO(wesm): handle chunked children
+ # 2^31 - 1 bytes in a single cell
+ # data.append([b'x' * (1 << 20)] * 2047 + [b'x' * ((1 << 20) - 1)])
+
+ # A little under 2GB in total, split across cells containing approximately 10MB each
+ data.extend([[b'x' * 1000000] * 10] * 214)
+
+ arr = pa.array(data)
+ table = pa.Table.from_arrays([arr], ['chunky_cells'])
+ read_table = _simple_table_roundtrip(table)
+ assert table.equals(read_table)
+
+
+@pytest.mark.pandas
def test_index_column_name_duplicate(tempdir):
data = {
'close': {