You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/08/16 02:47:47 UTC
[arrow] branch master updated: ARROW-5952: [Python] fix conversion of chunked dictionary array with 0 chunks
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5479d30 ARROW-5952: [Python] fix conversion of chunked dictionary array with 0 chunks
5479d30 is described below
commit 5479d3047a23410de00f50687764a4f4300baba5
Author: Joris Van den Bossche <jo...@gmail.com>
AuthorDate: Thu Aug 15 21:47:38 2019 -0500
ARROW-5952: [Python] fix conversion of chunked dictionary array with 0 chunks
https://issues.apache.org/jira/browse/ARROW-5952
Closes #5081 from jorisvandenbossche/ARROW-5952-dictionary-zero-chunks and squashes the following commits:
2f11fb94d <Wes McKinney> Nits
742db0e34 <Joris Van den Bossche> create empty dictionary array of correct type
feb06d310 <Joris Van den Bossche> ARROW-5952: fix conversion of chunked dictionary array with 0 chunks
Lead-authored-by: Joris Van den Bossche <jo...@gmail.com>
Co-authored-by: Wes McKinney <we...@apache.org>
Signed-off-by: Wes McKinney <we...@apache.org>
---
cpp/src/arrow/python/arrow_to_pandas.cc | 47 ++++++++++++++++++++++++---------
python/pyarrow/tests/test_pandas.py | 13 +++++++++
2 files changed, 47 insertions(+), 13 deletions(-)
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index f97782d..39857d7 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -487,7 +487,7 @@ inline Status ConvertNulls(const PandasOptions& options, const ChunkedArray& dat
inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& data,
PyObject** out_values) {
PyAcquireGIL lock;
- if (data.num_chunks() <= 0) {
+ if (data.num_chunks() == 0) {
return Status::OK();
}
// ChunkedArray has at least one chunk
@@ -1042,6 +1042,14 @@ class DatetimeTZBlock : public DatetimeBlock {
std::string timezone_;
};
+Status MakeZeroLengthArray(const std::shared_ptr<DataType>& type,
+ std::shared_ptr<Array>* out) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(default_memory_pool(), type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ return builder->Finish(out);
+}
+
class CategoricalBlock : public PandasBlock {
public:
explicit CategoricalBlock(const PandasOptions& options, MemoryPool* pool,
@@ -1063,6 +1071,10 @@ class CategoricalBlock : public PandasBlock {
using T = typename TRAITS::T;
constexpr int npy_type = TRAITS::npy_type;
+ if (data->num_chunks() == 0) {
+ RETURN_NOT_OK(AllocateNDArray(npy_type, 1));
+ return Status::OK();
+ }
// Sniff the first chunk
const std::shared_ptr<Array> arr_first = data->chunk(0);
const auto& dict_arr_first = checked_cast<const DictionaryArray&>(*arr_first);
@@ -1132,15 +1144,17 @@ class CategoricalBlock : public PandasBlock {
converted_data = out.chunked_array();
} else {
// check if all dictionaries are equal
- const std::shared_ptr<Array> arr_first = data->chunk(0);
- const auto& dict_arr_first = checked_cast<const DictionaryArray&>(*arr_first);
+ if (data->num_chunks() > 1) {
+ const std::shared_ptr<Array> arr_first = data->chunk(0);
+ const auto& dict_arr_first = checked_cast<const DictionaryArray&>(*arr_first);
- for (int c = 1; c < data->num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data->chunk(c);
- const auto& dict_arr = checked_cast<const DictionaryArray&>(*arr);
+ for (int c = 1; c < data->num_chunks(); c++) {
+ const std::shared_ptr<Array> arr = data->chunk(c);
+ const auto& dict_arr = checked_cast<const DictionaryArray&>(*arr);
- if (!(dict_arr_first.dictionary()->Equals(dict_arr.dictionary()))) {
- return Status::NotImplemented("Variable dictionary type not supported");
+ if (!(dict_arr_first.dictionary()->Equals(dict_arr.dictionary()))) {
+ return Status::NotImplemented("Variable dictionary type not supported");
+ }
}
}
converted_data = data;
@@ -1168,13 +1182,20 @@ class CategoricalBlock : public PandasBlock {
}
// TODO(wesm): variable dictionaries
- auto arr = converted_data->chunk(0);
- const auto& dict_arr = checked_cast<const DictionaryArray&>(*arr);
+ std::shared_ptr<Array> dict;
+ if (data->num_chunks() == 0) {
+ // no dictionary values => create empty array
+ RETURN_NOT_OK(MakeZeroLengthArray(dict_type.value_type(), &dict));
+ } else {
+ auto arr = converted_data->chunk(0);
+ const auto& dict_arr = checked_cast<const DictionaryArray&>(*arr);
+ dict = dict_arr.dictionary();
+ }
placement_data_[rel_placement] = abs_placement;
- PyObject* dict;
- RETURN_NOT_OK(ConvertArrayToPandas(options_, dict_arr.dictionary(), nullptr, &dict));
- dictionary_.reset(dict);
+ PyObject* pydict;
+ RETURN_NOT_OK(ConvertArrayToPandas(options_, dict, nullptr, &pydict));
+ dictionary_.reset(pydict);
ordered_ = dict_type.ordered();
return Status::OK();
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 12a6bc3..437fdad 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2233,6 +2233,19 @@ class TestConvertMisc(object):
df = pd.DataFrame({'cat': pd.Categorical([])})
_check_pandas_roundtrip(df)
+ def test_category_zero_chunks(self):
+ # ARROW-5952
+ for pa_type, dtype in [(pa.string(), 'object'), (pa.int64(), 'int64')]:
+ a = pa.chunked_array([], pa.dictionary(pa.int8(), pa_type))
+ result = a.to_pandas()
+ expected = pd.Categorical([], categories=np.array([], dtype=dtype))
+ tm.assert_series_equal(pd.Series(result), pd.Series(expected))
+
+ table = pa.table({'a': a})
+ result = table.to_pandas()
+ expected = pd.DataFrame({'a': expected})
+ tm.assert_frame_equal(result, expected)
+
def test_mixed_types_fails(self):
data = pd.DataFrame({'a': ['a', 1, 2.0]})
with pytest.raises(pa.ArrowTypeError):