You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/01/23 13:53:10 UTC
[arrow] branch master updated: ARROW-1997: [C++/Python] Ignore zero-copy-option in to_pandas when `strings_to_categorical` is True

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 72dea17  ARROW-1997: [C++/Python] Ignore zero-copy-option in to_pandas when `strings_to_categorical` is True
72dea17 is described below

commit 72dea17fefde50676489470189c5e0492fd01510
Author: Licht-T <li...@outlook.jp>
AuthorDate: Tue Jan 23 08:53:04 2018 -0500

    ARROW-1997: [C++/Python] Ignore zero-copy-option in to_pandas when `strings_to_categorical` is True
    
    This closes [ARROW-1997](https://issues.apache.org/jira/browse/ARROW-1997).
    
    The problem is
    ```python
    >>> import pandas as pd
    >>> import pyarrow as pa
    >>>
    >>> df = pd.DataFrame({
    ... 'Foo': ['A', 'A', 'B', 'B']
    ... })
    >>> table = pa.Table.from_pandas(df)
    >>> df = table.to_pandas(strings_to_categorical=True)
    <class 'pandas.core.categorical.Categorical'>
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "table.pxi", line 1043, in pyarrow.lib.Table.to_pandas
      File "pyarrow/pandas_compat.py", line 535, in table_to_blockmanager
        blocks = _table_to_blocks(options, block_table, nthreads, memory_pool)
      File "pyarrow/pandas_compat.py", line 629, in _table_to_blocks
        return [_reconstruct_block(item) for item in result]
      File "pyarrow/pandas_compat.py", line 436, in _reconstruct_block
        ordered=item['ordered'])
      File "/home/rito/miniconda3/envs/pyarrow-dev-27/lib/python2.7/site-packages/pandas/core/categorical.py", line 624, in from_codes
        raise ValueError("codes need to be between -1 and "
    ValueError: codes need to be between -1 and len(categories)-1
    ```
    
    When `strings_to_categorical=True`, the categorical index is newly created in `to_pandas` procedure.
    But, this passes data to Python by zero-copy, so the array is deallocated.
    https://github.com/Licht-T/arrow/blob/be58af6dd0333652abbe2333ee5968df3f2e371f/cpp/src/arrow/python/arrow_to_pandas.cc#L1040
    
    Author: Licht-T <li...@outlook.jp>
    Author: Wes McKinney <we...@twosigma.com>
    
    Closes #1480 from Licht-T/fix-to_pandas-with-strings_to_categorical and squashes the following commits:
    
    61eac9c1 [Wes McKinney] Adjust error message
    c1bc3539 [Licht-T] TST: Add test for to_pandas no-NA strings with strings_to_categorical
    cce3f50c [Licht-T] BUG: Avoid zero-copy-option in to_pandas when strings_to_categorical is True
---
 cpp/src/arrow/python/arrow_to_pandas.cc     | 16 ++++++++++++----
 python/pyarrow/tests/test_convert_pandas.py | 21 ++++++++++++++++++++-
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index e21bbda..5c8c970 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -963,7 +963,7 @@ class DatetimeTZBlock : public DatetimeBlock {
 class CategoricalBlock : public PandasBlock {
  public:
   explicit CategoricalBlock(PandasOptions options, MemoryPool* pool, int64_t num_rows)
-      : PandasBlock(options, num_rows, 1), pool_(pool) {}
+      : PandasBlock(options, num_rows, 1), pool_(pool), needs_copy_(false) {}
 
   Status Allocate() override {
     return Status::NotImplemented(
@@ -996,14 +996,20 @@ class CategoricalBlock : public PandasBlock {
       return Status::OK();
     };
 
-    if (data.num_chunks() == 1 && indices_first.null_count() == 0) {
+    if (!needs_copy_ && data.num_chunks() == 1 && indices_first.null_count() == 0) {
       RETURN_NOT_OK(CheckIndices(indices_first, dict_arr_first.dictionary()->length()));
       RETURN_NOT_OK(AllocateNDArrayFromIndices<T>(npy_type, indices_first));
     } else {
       if (options_.zero_copy_only) {
         std::stringstream ss;
-        ss << "Needed to copy " << data.num_chunks() << " chunks with "
-           << indices_first.null_count() << " indices nulls, but zero_copy_only was True";
+        if (needs_copy_) {
+          ss << "Need to allocate categorical memory, "
+             << "but only zero-copy conversions allowed.";
+        } else {
+          ss << "Needed to copy " << data.num_chunks() << " chunks with "
+             << indices_first.null_count()
+             << " indices nulls, but zero_copy_only was True";
+        }
         return Status::Invalid(ss.str());
       }
       RETURN_NOT_OK(AllocateNDArray(npy_type, 1));
@@ -1034,6 +1040,7 @@ class CategoricalBlock : public PandasBlock {
     std::shared_ptr<Column> converted_col;
     if (options_.strings_to_categorical &&
         (col->type()->id() == Type::STRING || col->type()->id() == Type::BINARY)) {
+      needs_copy_ = true;
       compute::FunctionContext ctx(pool_);
 
       Datum out;
@@ -1135,6 +1142,7 @@ class CategoricalBlock : public PandasBlock {
   MemoryPool* pool_;
   OwnedRef dictionary_;
   bool ordered_;
+  bool needs_copy_;
 };
 
 Status MakeBlock(PandasOptions options, PandasBlock::type type, int64_t num_rows,
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 83b1da1..5acb9c3 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1237,7 +1237,22 @@ class TestPandasConversion(object):
         assert data_column['numpy_type'] == 'object'
         assert data_column['metadata'] == {'precision': 26, 'scale': 11}
 
-    def test_table_str_to_categorical(self):
+    def test_table_str_to_categorical_without_na(self):
+        values = ['a', 'a', 'b', 'b', 'c']
+        df = pd.DataFrame({'strings': values})
+        field = pa.field('strings', pa.string())
+        schema = pa.schema([field])
+        table = pa.Table.from_pandas(df, schema=schema)
+
+        result = table.to_pandas(strings_to_categorical=True)
+        expected = pd.DataFrame({'strings': pd.Categorical(values)})
+        tm.assert_frame_equal(result, expected, check_dtype=True)
+
+        with pytest.raises(pa.ArrowInvalid):
+            table.to_pandas(strings_to_categorical=True,
+                            zero_copy_only=True)
+
+    def test_table_str_to_categorical_with_na(self):
         values = [None, 'a', 'b', np.nan]
         df = pd.DataFrame({'strings': values})
         field = pa.field('strings', pa.string())
@@ -1248,6 +1263,10 @@ class TestPandasConversion(object):
         expected = pd.DataFrame({'strings': pd.Categorical(values)})
         tm.assert_frame_equal(result, expected, check_dtype=True)
 
+        with pytest.raises(pa.ArrowInvalid):
+            table.to_pandas(strings_to_categorical=True,
+                            zero_copy_only=True)
+
     def test_table_batch_empty_dataframe(self):
         df = pd.DataFrame({})
         _check_pandas_roundtrip(df)

-- 
To stop receiving notification emails like this one, please contact
wesm@apache.org.