You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2018/12/18 14:47:33 UTC
[arrow] branch master updated: ARROW-3058: [Python] Raise a more
helpful error message when writing a pandas.DataFrame to Feather
format that requires a chunked layout
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 36ded49 ARROW-3058: [Python] Raise a more helpful error message when writing a pandas.DataFrame to Feather format that requires a chunked layout
36ded49 is described below
commit 36ded49568b8c3d664f0f14d06ec199ef5286857
Author: Wes McKinney <we...@apache.org>
AuthorDate: Tue Dec 18 15:47:09 2018 +0100
ARROW-3058: [Python] Raise a more helpful error message when writing a pandas.DataFrame to Feather format that requires a chunked layout
Author: Wes McKinney <we...@apache.org>
Closes #3178 from wesm/ARROW-3058 and squashes the following commits:
4a10687f <Wes McKinney> Raise a more helpful error message when a large binary/string column yields ChunkedArray on conversion to pyarrow.Table
---
python/pyarrow/feather.py | 26 +++++++++++++++++++++-----
python/pyarrow/tests/test_feather.py | 18 ++++++++++++++++++
2 files changed, 39 insertions(+), 5 deletions(-)
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index faa2f7d..3713c1f 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -23,7 +23,7 @@ import pandas as pd
from pyarrow.compat import pdapi
from pyarrow.lib import FeatherError # noqa
-from pyarrow.lib import RecordBatch, concat_tables
+from pyarrow.lib import Table, concat_tables
import pyarrow.lib as ext
@@ -62,6 +62,21 @@ class FeatherReader(ext.FeatherReader):
use_threads=use_threads)
+def check_chunked_overflow(col):
+ if col.data.num_chunks == 1:
+ return
+
+ if col.type in (ext.binary(), ext.string()):
+ raise ValueError("Column '{0}' exceeds 2GB maximum capacity of "
+ "a Feather binary column. This restriction may be "
+ "lifted in the future".format(col.name))
+ else:
+ # TODO(wesm): Not sure when else this might be reached
+ raise ValueError("Column '{0}' of type {1} was chunked on conversion "
+ "to Arrow and cannot be currently written to "
+ "Feather format".format(col.name, str(col.type)))
+
+
class FeatherWriter(object):
def __init__(self, dest):
@@ -78,10 +93,11 @@ class FeatherWriter(object):
# TODO(wesm): Remove this length check, see ARROW-1732
if len(df.columns) > 0:
- batch = RecordBatch.from_pandas(df, preserve_index=False)
- for i, name in enumerate(batch.schema.names):
- col = batch[i]
- self.writer.write_array(name, col)
+ table = Table.from_pandas(df, preserve_index=False)
+ for i, name in enumerate(table.schema.names):
+ col = table[i]
+ check_chunked_overflow(col)
+ self.writer.write_array(name, col.data.chunk(0))
self.writer.close()
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 01b5672..d144f98 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+import io
import os
import sys
import tempfile
@@ -535,3 +536,20 @@ class TestFeatherReader(unittest.TestCase):
def test_large_dataframe(self):
df = pd.DataFrame({'A': np.arange(400000000)})
self._check_pandas_roundtrip(df)
+
+
+@pytest.mark.large_memory
+def test_chunked_binary_error_message():
+ # ARROW-3058: As Feather does not yet support chunked columns, we at least
+ # make sure it's clear to the user what is going on
+
+ # 2^31 + 1 bytes
+ values = [b'x'] + [
+ b'x' * (1 << 20)
+ ] * 2 * (1 << 10)
+ df = pd.DataFrame({'byte_col': values})
+
+ with pytest.raises(ValueError, match="'byte_col' exceeds 2GB maximum "
+ "capacity of a Feather binary column. This restriction "
+ "may be lifted in the future"):
+ write_feather(df, io.BytesIO())