You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/08/16 02:47:47 UTC
[arrow] branch master updated: ARROW-5952: [Python] fix conversion of chunked dictionary array with 0 chunks

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5479d30  ARROW-5952: [Python] fix conversion of chunked dictionary array with 0 chunks
5479d30 is described below

commit 5479d3047a23410de00f50687764a4f4300baba5
Author: Joris Van den Bossche <jo...@gmail.com>
AuthorDate: Thu Aug 15 21:47:38 2019 -0500

    ARROW-5952: [Python] fix conversion of chunked dictionary array with 0 chunks
    
    https://issues.apache.org/jira/browse/ARROW-5952
    
    Closes #5081 from jorisvandenbossche/ARROW-5952-dictionary-zero-chunks and squashes the following commits:
    
    2f11fb94d <Wes McKinney> Nits
    742db0e34 <Joris Van den Bossche> create empty dictionary array of correct type
    feb06d310 <Joris Van den Bossche> ARROW-5952:  fix conversion of chunked dictionary array with 0 chunks
    
    Lead-authored-by: Joris Van den Bossche <jo...@gmail.com>
    Co-authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/arrow/python/arrow_to_pandas.cc | 47 ++++++++++++++++++++++++---------
 python/pyarrow/tests/test_pandas.py     | 13 +++++++++
 2 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index f97782d..39857d7 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -487,7 +487,7 @@ inline Status ConvertNulls(const PandasOptions& options, const ChunkedArray& dat
 inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& data,
                             PyObject** out_values) {
   PyAcquireGIL lock;
-  if (data.num_chunks() <= 0) {
+  if (data.num_chunks() == 0) {
     return Status::OK();
   }
   // ChunkedArray has at least one chunk
@@ -1042,6 +1042,14 @@ class DatetimeTZBlock : public DatetimeBlock {
   std::string timezone_;
 };
 
+Status MakeZeroLengthArray(const std::shared_ptr<DataType>& type,
+                           std::shared_ptr<Array>* out) {
+  std::unique_ptr<ArrayBuilder> builder;
+  RETURN_NOT_OK(MakeBuilder(default_memory_pool(), type, &builder));
+  RETURN_NOT_OK(builder->Resize(0));
+  return builder->Finish(out);
+}
+
 class CategoricalBlock : public PandasBlock {
  public:
   explicit CategoricalBlock(const PandasOptions& options, MemoryPool* pool,
@@ -1063,6 +1071,10 @@ class CategoricalBlock : public PandasBlock {
     using T = typename TRAITS::T;
     constexpr int npy_type = TRAITS::npy_type;
 
+    if (data->num_chunks() == 0) {
+      RETURN_NOT_OK(AllocateNDArray(npy_type, 1));
+      return Status::OK();
+    }
     // Sniff the first chunk
     const std::shared_ptr<Array> arr_first = data->chunk(0);
     const auto& dict_arr_first = checked_cast<const DictionaryArray&>(*arr_first);
@@ -1132,15 +1144,17 @@ class CategoricalBlock : public PandasBlock {
       converted_data = out.chunked_array();
     } else {
       // check if all dictionaries are equal
-      const std::shared_ptr<Array> arr_first = data->chunk(0);
-      const auto& dict_arr_first = checked_cast<const DictionaryArray&>(*arr_first);
+      if (data->num_chunks() > 1) {
+        const std::shared_ptr<Array> arr_first = data->chunk(0);
+        const auto& dict_arr_first = checked_cast<const DictionaryArray&>(*arr_first);
 
-      for (int c = 1; c < data->num_chunks(); c++) {
-        const std::shared_ptr<Array> arr = data->chunk(c);
-        const auto& dict_arr = checked_cast<const DictionaryArray&>(*arr);
+        for (int c = 1; c < data->num_chunks(); c++) {
+          const std::shared_ptr<Array> arr = data->chunk(c);
+          const auto& dict_arr = checked_cast<const DictionaryArray&>(*arr);
 
-        if (!(dict_arr_first.dictionary()->Equals(dict_arr.dictionary()))) {
-          return Status::NotImplemented("Variable dictionary type not supported");
+          if (!(dict_arr_first.dictionary()->Equals(dict_arr.dictionary()))) {
+            return Status::NotImplemented("Variable dictionary type not supported");
+          }
         }
       }
       converted_data = data;
@@ -1168,13 +1182,20 @@ class CategoricalBlock : public PandasBlock {
     }
 
     // TODO(wesm): variable dictionaries
-    auto arr = converted_data->chunk(0);
-    const auto& dict_arr = checked_cast<const DictionaryArray&>(*arr);
+    std::shared_ptr<Array> dict;
+    if (data->num_chunks() == 0) {
+      // no dictionary values => create empty array
+      RETURN_NOT_OK(MakeZeroLengthArray(dict_type.value_type(), &dict));
+    } else {
+      auto arr = converted_data->chunk(0);
+      const auto& dict_arr = checked_cast<const DictionaryArray&>(*arr);
+      dict = dict_arr.dictionary();
+    }
 
     placement_data_[rel_placement] = abs_placement;
-    PyObject* dict;
-    RETURN_NOT_OK(ConvertArrayToPandas(options_, dict_arr.dictionary(), nullptr, &dict));
-    dictionary_.reset(dict);
+    PyObject* pydict;
+    RETURN_NOT_OK(ConvertArrayToPandas(options_, dict, nullptr, &pydict));
+    dictionary_.reset(pydict);
     ordered_ = dict_type.ordered();
 
     return Status::OK();
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 12a6bc3..437fdad 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2233,6 +2233,19 @@ class TestConvertMisc(object):
         df = pd.DataFrame({'cat': pd.Categorical([])})
         _check_pandas_roundtrip(df)
 
+    def test_category_zero_chunks(self):
+        # ARROW-5952
+        for pa_type, dtype in [(pa.string(), 'object'), (pa.int64(), 'int64')]:
+            a = pa.chunked_array([], pa.dictionary(pa.int8(), pa_type))
+            result = a.to_pandas()
+            expected = pd.Categorical([], categories=np.array([], dtype=dtype))
+            tm.assert_series_equal(pd.Series(result), pd.Series(expected))
+
+            table = pa.table({'a': a})
+            result = table.to_pandas()
+            expected = pd.DataFrame({'a': expected})
+            tm.assert_frame_equal(result, expected)
+
     def test_mixed_types_fails(self):
         data = pd.DataFrame({'a': ['a', 1, 2.0]})
         with pytest.raises(pa.ArrowTypeError):