You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/01/23 13:53:10 UTC
[arrow] branch master updated: ARROW-1997: [C++/Python] Ignore
zero-copy-option in to_pandas when `strings_to_categorical` is True
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 72dea17 ARROW-1997: [C++/Python] Ignore zero-copy-option in to_pandas when `strings_to_categorical` is True
72dea17 is described below
commit 72dea17fefde50676489470189c5e0492fd01510
Author: Licht-T <li...@outlook.jp>
AuthorDate: Tue Jan 23 08:53:04 2018 -0500
ARROW-1997: [C++/Python] Ignore zero-copy-option in to_pandas when `strings_to_categorical` is True
This closes [ARROW-1997](https://issues.apache.org/jira/browse/ARROW-1997).
The problem is
```python
>>> import pandas as pd
>>> import pyarrow as pa
>>>
>>> df = pd.DataFrame({
... 'Foo': ['A', 'A', 'B', 'B']
... })
>>> table = pa.Table.from_pandas(df)
>>> df = table.to_pandas(strings_to_categorical=True)
<class 'pandas.core.categorical.Categorical'>
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "table.pxi", line 1043, in pyarrow.lib.Table.to_pandas
File "pyarrow/pandas_compat.py", line 535, in table_to_blockmanager
blocks = _table_to_blocks(options, block_table, nthreads, memory_pool)
File "pyarrow/pandas_compat.py", line 629, in _table_to_blocks
return [_reconstruct_block(item) for item in result]
File "pyarrow/pandas_compat.py", line 436, in _reconstruct_block
ordered=item['ordered'])
File "/home/rito/miniconda3/envs/pyarrow-dev-27/lib/python2.7/site-packages/pandas/core/categorical.py", line 624, in from_codes
raise ValueError("codes need to be between -1 and "
ValueError: codes need to be between -1 and len(categories)-1
```
When `strings_to_categorical=True`, the categorical index is newly created in `to_pandas` procedure.
But, this passes data to Python by zero-copy, so the array is deallocated.
https://github.com/Licht-T/arrow/blob/be58af6dd0333652abbe2333ee5968df3f2e371f/cpp/src/arrow/python/arrow_to_pandas.cc#L1040
Author: Licht-T <li...@outlook.jp>
Author: Wes McKinney <we...@twosigma.com>
Closes #1480 from Licht-T/fix-to_pandas-with-strings_to_categorical and squashes the following commits:
61eac9c1 [Wes McKinney] Adjust error message
c1bc3539 [Licht-T] TST: Add test for to_pandas no-NA strings with strings_to_categorical
cce3f50c [Licht-T] BUG: Avoid zero-copy-option in to_pandas when strings_to_categorical is True
---
cpp/src/arrow/python/arrow_to_pandas.cc | 16 ++++++++++++----
python/pyarrow/tests/test_convert_pandas.py | 21 ++++++++++++++++++++-
2 files changed, 32 insertions(+), 5 deletions(-)
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index e21bbda..5c8c970 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -963,7 +963,7 @@ class DatetimeTZBlock : public DatetimeBlock {
class CategoricalBlock : public PandasBlock {
public:
explicit CategoricalBlock(PandasOptions options, MemoryPool* pool, int64_t num_rows)
- : PandasBlock(options, num_rows, 1), pool_(pool) {}
+ : PandasBlock(options, num_rows, 1), pool_(pool), needs_copy_(false) {}
Status Allocate() override {
return Status::NotImplemented(
@@ -996,14 +996,20 @@ class CategoricalBlock : public PandasBlock {
return Status::OK();
};
- if (data.num_chunks() == 1 && indices_first.null_count() == 0) {
+ if (!needs_copy_ && data.num_chunks() == 1 && indices_first.null_count() == 0) {
RETURN_NOT_OK(CheckIndices(indices_first, dict_arr_first.dictionary()->length()));
RETURN_NOT_OK(AllocateNDArrayFromIndices<T>(npy_type, indices_first));
} else {
if (options_.zero_copy_only) {
std::stringstream ss;
- ss << "Needed to copy " << data.num_chunks() << " chunks with "
- << indices_first.null_count() << " indices nulls, but zero_copy_only was True";
+ if (needs_copy_) {
+ ss << "Need to allocate categorical memory, "
+ << "but only zero-copy conversions allowed.";
+ } else {
+ ss << "Needed to copy " << data.num_chunks() << " chunks with "
+ << indices_first.null_count()
+ << " indices nulls, but zero_copy_only was True";
+ }
return Status::Invalid(ss.str());
}
RETURN_NOT_OK(AllocateNDArray(npy_type, 1));
@@ -1034,6 +1040,7 @@ class CategoricalBlock : public PandasBlock {
std::shared_ptr<Column> converted_col;
if (options_.strings_to_categorical &&
(col->type()->id() == Type::STRING || col->type()->id() == Type::BINARY)) {
+ needs_copy_ = true;
compute::FunctionContext ctx(pool_);
Datum out;
@@ -1135,6 +1142,7 @@ class CategoricalBlock : public PandasBlock {
MemoryPool* pool_;
OwnedRef dictionary_;
bool ordered_;
+ bool needs_copy_;
};
Status MakeBlock(PandasOptions options, PandasBlock::type type, int64_t num_rows,
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 83b1da1..5acb9c3 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1237,7 +1237,22 @@ class TestPandasConversion(object):
assert data_column['numpy_type'] == 'object'
assert data_column['metadata'] == {'precision': 26, 'scale': 11}
- def test_table_str_to_categorical(self):
+ def test_table_str_to_categorical_without_na(self):
+ values = ['a', 'a', 'b', 'b', 'c']
+ df = pd.DataFrame({'strings': values})
+ field = pa.field('strings', pa.string())
+ schema = pa.schema([field])
+ table = pa.Table.from_pandas(df, schema=schema)
+
+ result = table.to_pandas(strings_to_categorical=True)
+ expected = pd.DataFrame({'strings': pd.Categorical(values)})
+ tm.assert_frame_equal(result, expected, check_dtype=True)
+
+ with pytest.raises(pa.ArrowInvalid):
+ table.to_pandas(strings_to_categorical=True,
+ zero_copy_only=True)
+
+ def test_table_str_to_categorical_with_na(self):
values = [None, 'a', 'b', np.nan]
df = pd.DataFrame({'strings': values})
field = pa.field('strings', pa.string())
@@ -1248,6 +1263,10 @@ class TestPandasConversion(object):
expected = pd.DataFrame({'strings': pd.Categorical(values)})
tm.assert_frame_equal(result, expected, check_dtype=True)
+ with pytest.raises(pa.ArrowInvalid):
+ table.to_pandas(strings_to_categorical=True,
+ zero_copy_only=True)
+
def test_table_batch_empty_dataframe(self):
df = pd.DataFrame({})
_check_pandas_roundtrip(df)
--
To stop receiving notification emails like this one, please contact
wesm@apache.org.