You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2018/12/18 14:47:33 UTC

[arrow] branch master updated: ARROW-3058: [Python] Raise a more helpful error message when writing a pandas.DataFrame to Feather format that requires a chunked layout

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 36ded49  ARROW-3058: [Python] Raise a more helpful error message when writing a pandas.DataFrame to Feather format that requires a chunked layout
36ded49 is described below

commit 36ded49568b8c3d664f0f14d06ec199ef5286857
Author: Wes McKinney <we...@apache.org>
AuthorDate: Tue Dec 18 15:47:09 2018 +0100

    ARROW-3058: [Python] Raise a more helpful error message when writing a pandas.DataFrame to Feather format that requires a chunked layout
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #3178 from wesm/ARROW-3058 and squashes the following commits:
    
    4a10687f <Wes McKinney> Raise a more helpful error message when a large binary/string column yields ChunkedArray on conversion to pyarrow.Table
---
 python/pyarrow/feather.py            | 26 +++++++++++++++++++++-----
 python/pyarrow/tests/test_feather.py | 18 ++++++++++++++++++
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index faa2f7d..3713c1f 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -23,7 +23,7 @@ import pandas as pd
 
 from pyarrow.compat import pdapi
 from pyarrow.lib import FeatherError  # noqa
-from pyarrow.lib import RecordBatch, concat_tables
+from pyarrow.lib import Table, concat_tables
 import pyarrow.lib as ext
 
 
@@ -62,6 +62,21 @@ class FeatherReader(ext.FeatherReader):
             use_threads=use_threads)
 
 
+def check_chunked_overflow(col):
+    if col.data.num_chunks == 1:
+        return
+
+    if col.type in (ext.binary(), ext.string()):
+        raise ValueError("Column '{0}' exceeds 2GB maximum capacity of "
+                         "a Feather binary column. This restriction may be "
+                         "lifted in the future".format(col.name))
+    else:
+        # TODO(wesm): Not sure when else this might be reached
+        raise ValueError("Column '{0}' of type {1} was chunked on conversion "
+                         "to Arrow and cannot be currently written to "
+                         "Feather format".format(col.name, str(col.type)))
+
+
 class FeatherWriter(object):
 
     def __init__(self, dest):
@@ -78,10 +93,11 @@ class FeatherWriter(object):
 
         # TODO(wesm): Remove this length check, see ARROW-1732
         if len(df.columns) > 0:
-            batch = RecordBatch.from_pandas(df, preserve_index=False)
-            for i, name in enumerate(batch.schema.names):
-                col = batch[i]
-                self.writer.write_array(name, col)
+            table = Table.from_pandas(df, preserve_index=False)
+            for i, name in enumerate(table.schema.names):
+                col = table[i]
+                check_chunked_overflow(col)
+                self.writer.write_array(name, col.data.chunk(0))
 
         self.writer.close()
 
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 01b5672..d144f98 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import io
 import os
 import sys
 import tempfile
@@ -535,3 +536,20 @@ class TestFeatherReader(unittest.TestCase):
     def test_large_dataframe(self):
         df = pd.DataFrame({'A': np.arange(400000000)})
         self._check_pandas_roundtrip(df)
+
+
+@pytest.mark.large_memory
+def test_chunked_binary_error_message():
+    # ARROW-3058: As Feather does not yet support chunked columns, we at least
+    # make sure it's clear to the user what is going on
+
+    # 2^31 + 1 bytes
+    values = [b'x'] + [
+        b'x' * (1 << 20)
+    ] * 2 * (1 << 10)
+    df = pd.DataFrame({'byte_col': values})
+
+    with pytest.raises(ValueError, match="'byte_col' exceeds 2GB maximum "
+                       "capacity of a Feather binary column. This restriction "
+                       "may be lifted in the future"):
+        write_feather(df, io.BytesIO())