You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/01 15:19:16 UTC

arrow git commit: ARROW-723: [Python] Ensure that passing chunk_size=0 when writing Parquet file does not enter infinite loop

Repository: arrow
Updated Branches:
  refs/heads/master 9f5e17448 -> fd000964d


ARROW-723: [Python] Ensure that passing chunk_size=0 when writing Parquet file does not enter infinite loop

This should also be fixed in parquet-cpp; I will open a JIRA for that.

Author: Wes McKinney <we...@twosigma.com>

Closes #468 from wesm/ARROW-723 and squashes the following commits:

f938703 [Wes McKinney] Raise if row group size is 0, use default if -1
5f83850 [Wes McKinney] Ensure that passing chunk_size=0 when writing Parquet file does not enter infinite loop


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/fd000964
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/fd000964
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/fd000964

Branch: refs/heads/master
Commit: fd000964d218b355e725d8eced1d1301f36dc092
Parents: 9f5e174
Author: Wes McKinney <we...@twosigma.com>
Authored: Sat Apr 1 11:19:09 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sat Apr 1 11:19:09 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/_parquet.pyx          |  5 ++++-
 python/pyarrow/parquet.py            |  2 +-
 python/pyarrow/tests/test_parquet.py | 17 +++++++++++++++++
 3 files changed, 22 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/fd000964/python/pyarrow/_parquet.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 8e67da9..c4cbd28 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -538,10 +538,13 @@ cdef class ParquetWriter:
     def write_table(self, Table table, row_group_size=None):
         cdef CTable* ctable = table.table
 
-        if row_group_size is None:
+        if row_group_size is None or row_group_size == -1:
             row_group_size = ctable.num_rows()
+        elif row_group_size == 0:
+            raise ValueError('Row group size cannot be 0')
 
         cdef int c_row_group_size = row_group_size
+
         with nogil:
             check_status(WriteTable(deref(ctable), self.allocator,
                                     self.sink, c_row_group_size,

http://git-wip-us.apache.org/repos/asf/arrow/blob/fd000964/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index fa96f95..2985316 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -187,7 +187,7 @@ def write_table(table, sink, chunk_size=None, version='1.0',
     ----------
     table : pyarrow.Table
     sink: string or pyarrow.io.NativeFile
-    chunk_size : int
+    chunk_size : int, default None
         The maximum number of rows in each Parquet RowGroup. As a default,
         we will write a single RowGroup per file.
     version : {"1.0", "2.0"}, default "1.0"

http://git-wip-us.apache.org/repos/asf/arrow/blob/fd000964/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index fc32b9f..b8b2800 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -366,6 +366,23 @@ def test_multithreaded_read():
 
 
 @parquet
+def test_min_chunksize():
+    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
+    table = pa.Table.from_pandas(data.reset_index())
+
+    buf = io.BytesIO()
+    pq.write_table(table, buf, chunk_size=-1)
+
+    buf.seek(0)
+    result = pq.read_table(buf)
+
+    assert result.equals(table)
+
+    with pytest.raises(ValueError):
+        pq.write_table(table, buf, chunk_size=0)
+
+
+@parquet
 def test_pass_separate_metadata():
     # ARROW-471
     df = alltypes_sample(size=10000)