You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/01 15:19:16 UTC
arrow git commit: ARROW-723: [Python] Ensure that passing chunk_size=0 when writing Parquet file does not enter infinite loop
Repository: arrow
Updated Branches:
refs/heads/master 9f5e17448 -> fd000964d
ARROW-723: [Python] Ensure that passing chunk_size=0 when writing Parquet file does not enter infinite loop
This should also be fixed in parquet-cpp; I will open a JIRA for that.
Author: Wes McKinney <we...@twosigma.com>
Closes #468 from wesm/ARROW-723 and squashes the following commits:
f938703 [Wes McKinney] Raise if row group size is 0, use default if -1
5f83850 [Wes McKinney] Ensure that passing chunk_size=0 when writing Parquet file does not enter infinite loop
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/fd000964
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/fd000964
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/fd000964
Branch: refs/heads/master
Commit: fd000964d218b355e725d8eced1d1301f36dc092
Parents: 9f5e174
Author: Wes McKinney <we...@twosigma.com>
Authored: Sat Apr 1 11:19:09 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sat Apr 1 11:19:09 2017 -0400
----------------------------------------------------------------------
python/pyarrow/_parquet.pyx | 5 ++++-
python/pyarrow/parquet.py | 2 +-
python/pyarrow/tests/test_parquet.py | 17 +++++++++++++++++
3 files changed, 22 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/fd000964/python/pyarrow/_parquet.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 8e67da9..c4cbd28 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -538,10 +538,13 @@ cdef class ParquetWriter:
def write_table(self, Table table, row_group_size=None):
cdef CTable* ctable = table.table
- if row_group_size is None:
+ if row_group_size is None or row_group_size == -1:
row_group_size = ctable.num_rows()
+ elif row_group_size == 0:
+ raise ValueError('Row group size cannot be 0')
cdef int c_row_group_size = row_group_size
+
with nogil:
check_status(WriteTable(deref(ctable), self.allocator,
self.sink, c_row_group_size,
http://git-wip-us.apache.org/repos/asf/arrow/blob/fd000964/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index fa96f95..2985316 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -187,7 +187,7 @@ def write_table(table, sink, chunk_size=None, version='1.0',
----------
table : pyarrow.Table
sink: string or pyarrow.io.NativeFile
- chunk_size : int
+ chunk_size : int, default None
The maximum number of rows in each Parquet RowGroup. As a default,
we will write a single RowGroup per file.
version : {"1.0", "2.0"}, default "1.0"
http://git-wip-us.apache.org/repos/asf/arrow/blob/fd000964/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index fc32b9f..b8b2800 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -366,6 +366,23 @@ def test_multithreaded_read():
@parquet
+def test_min_chunksize():
+ data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
+ table = pa.Table.from_pandas(data.reset_index())
+
+ buf = io.BytesIO()
+ pq.write_table(table, buf, chunk_size=-1)
+
+ buf.seek(0)
+ result = pq.read_table(buf)
+
+ assert result.equals(table)
+
+ with pytest.raises(ValueError):
+ pq.write_table(table, buf, chunk_size=0)
+
+
+@parquet
def test_pass_separate_metadata():
# ARROW-471
df = alltypes_sample(size=10000)