You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@arrow.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/03/01 04:35:00 UTC

[jira] [Commented] (ARROW-1632) [Python] Permit categorical conversions in Table.to_pandas on a per-column basis

    [ https://issues.apache.org/jira/browse/ARROW-1632?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16381501#comment-16381501 ] 

ASF GitHub Bot commented on ARROW-1632:
---------------------------------------

wesm closed pull request #1620: ARROW-1632: [Python] Permit categorical conversions in Table.to_pandas on a per-column basis
URL: https://github.com/apache/arrow/pull/1620
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 125892afe..aefd4d76d 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -60,6 +60,7 @@ using internal::kNanosecondsInDay;
 using internal::kPandasTimestampNull;
 
 using compute::Datum;
+using compute::FunctionContext;
 
 // ----------------------------------------------------------------------
 // Utility code
@@ -986,7 +987,8 @@ class CategoricalBlock : public PandasBlock {
     // Sniff the first chunk
     const std::shared_ptr<Array> arr_first = data.chunk(0);
     const auto& dict_arr_first = static_cast<const DictionaryArray&>(*arr_first);
-    const auto& indices_first = static_cast<const ArrayType&>(*dict_arr_first.indices());
+    const auto indices_first =
+        std::static_pointer_cast<ArrayType>(dict_arr_first.indices());
 
     auto CheckIndices = [](const ArrayType& arr, int64_t dict_length) {
       const T* values = arr.raw_values();
@@ -1000,8 +1002,8 @@ class CategoricalBlock : public PandasBlock {
       return Status::OK();
     };
 
-    if (!needs_copy_ && data.num_chunks() == 1 && indices_first.null_count() == 0) {
-      RETURN_NOT_OK(CheckIndices(indices_first, dict_arr_first.dictionary()->length()));
+    if (!needs_copy_ && data.num_chunks() == 1 && indices_first->null_count() == 0) {
+      RETURN_NOT_OK(CheckIndices(*indices_first, dict_arr_first.dictionary()->length()));
       RETURN_NOT_OK(AllocateNDArrayFromIndices<T>(npy_type, indices_first));
     } else {
       if (options_.zero_copy_only) {
@@ -1011,7 +1013,7 @@ class CategoricalBlock : public PandasBlock {
              << "but only zero-copy conversions allowed.";
         } else {
           ss << "Needed to copy " << data.num_chunks() << " chunks with "
-             << indices_first.null_count()
+             << indices_first->null_count()
              << " indices nulls, but zero_copy_only was True";
         }
         return Status::Invalid(ss.str());
@@ -1109,10 +1111,11 @@ class CategoricalBlock : public PandasBlock {
 
  protected:
   template <typename T>
-  Status AllocateNDArrayFromIndices(int npy_type, const PrimitiveArray& indices) {
+  Status AllocateNDArrayFromIndices(int npy_type,
+                                    const std::shared_ptr<PrimitiveArray>& indices) {
     npy_intp block_dims[1] = {num_rows_};
 
-    const T* in_values = GetPrimitiveValues<T>(indices);
+    const T* in_values = GetPrimitiveValues<T>(*indices);
     void* data = const_cast<T*>(in_values);
 
     PyAcquireGIL lock;
@@ -1127,6 +1130,22 @@ class CategoricalBlock : public PandasBlock {
                                                nullptr, data, NPY_ARRAY_CARRAY, nullptr);
     RETURN_IF_PYERROR();
 
+    // Add a reference to the underlying Array. Otherwise the array may be
+    // deleted once we leave the block conversion.
+    auto capsule = new ArrowCapsule{{indices}};
+    PyObject* base = PyCapsule_New(reinterpret_cast<void*>(capsule), "arrow",
+                                   &ArrowCapsule_Destructor);
+    if (base == nullptr) {
+      delete capsule;
+      RETURN_IF_PYERROR();
+    }
+
+    if (PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(block_arr), base) == -1) {
+      // Error occurred, trust that SetBaseObject set the error state
+      Py_XDECREF(base);
+      return Status::OK();
+    }
+
     npy_intp placement_dims[1] = {num_columns_};
     PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
     RETURN_IF_PYERROR();
@@ -1771,7 +1790,33 @@ Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr<Column
 
 Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>& table,
                             int nthreads, MemoryPool* pool, PyObject** out) {
-  DataFrameBlockCreator helper(options, table, pool);
+  return ConvertTableToPandas(options, std::unordered_set<std::string>(), table, nthreads,
+                              pool, out);
+}
+
+Status ConvertTableToPandas(PandasOptions options,
+                            const std::unordered_set<std::string>& categorical_columns,
+                            const std::shared_ptr<Table>& table, int nthreads,
+                            MemoryPool* pool, PyObject** out) {
+  std::shared_ptr<Table> current_table = table;
+  if (!categorical_columns.empty()) {
+    FunctionContext ctx;
+    for (int i = 0; i < table->num_columns(); i++) {
+      const Column& col = *table->column(i);
+      if (categorical_columns.count(col.name())) {
+        Datum out;
+        RETURN_NOT_OK(DictionaryEncode(&ctx, Datum(col.data()), &out));
+        std::shared_ptr<ChunkedArray> array = out.chunked_array();
+        auto field = std::make_shared<Field>(
+            col.name(), array->type(), col.field()->nullable(), col.field()->metadata());
+        auto column = std::make_shared<Column>(field, array);
+        RETURN_NOT_OK(current_table->RemoveColumn(i, &current_table));
+        RETURN_NOT_OK(current_table->AddColumn(i, column, &current_table));
+      }
+    }
+  }
+
+  DataFrameBlockCreator helper(options, current_table, pool);
   return helper.Convert(nthreads, out);
 }
 
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index 1e4864637..0541b0f9a 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -25,6 +25,7 @@
 
 #include <memory>
 #include <string>
+#include <unordered_set>
 
 #include "arrow/util/visibility.h"
 
@@ -40,6 +41,7 @@ class Table;
 namespace py {
 
 struct PandasOptions {
+  /// If true, we will convert all string columns to categoricals
   bool strings_to_categorical;
   bool zero_copy_only;
 
@@ -64,6 +66,16 @@ ARROW_EXPORT
 Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>& table,
                             int nthreads, MemoryPool* pool, PyObject** out);
 
+/// Convert a whole table as efficiently as possible to a pandas.DataFrame.
+///
+/// Explicitly name columns that should be a categorical
+/// This option is only used on conversions that are applied to a table.
+ARROW_EXPORT
+Status ConvertTableToPandas(PandasOptions options,
+                            const std::unordered_set<std::string>& categorical_columns,
+                            const std::shared_ptr<Table>& table, int nthreads,
+                            MemoryPool* pool, PyObject** out);
+
 }  // namespace py
 }  // namespace arrow
 
diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd
index f323feaff..256327734 100644
--- a/python/pyarrow/includes/common.pxd
+++ b/python/pyarrow/includes/common.pxd
@@ -23,6 +23,7 @@ from libcpp.memory cimport shared_ptr, unique_ptr, make_shared
 from libcpp.string cimport string as c_string
 from libcpp.vector cimport vector
 from libcpp.unordered_map cimport unordered_map
+from libcpp.unordered_set cimport unordered_set
 
 from cpython cimport PyObject
 cimport cpython
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8da126aaf..f7361f055 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -882,10 +882,12 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
                                   const shared_ptr[CColumn]& arr,
                                   object py_ref, PyObject** out)
 
-    CStatus ConvertTableToPandas(PandasOptions options,
-                                 const shared_ptr[CTable]& table,
-                                 int nthreads, CMemoryPool* pool,
-                                 PyObject** out)
+    CStatus ConvertTableToPandas(
+        PandasOptions options,
+        const unordered_set[c_string]& categorical_columns,
+        const shared_ptr[CTable]& table,
+        int nthreads, CMemoryPool* pool,
+        PyObject** out)
 
     void c_set_default_memory_pool \
         " arrow::py::set_default_memory_pool"(CMemoryPool* pool)\
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 6d4bf5e78..0bc47fc0d 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -490,7 +490,7 @@ def _make_datetimetz(tz):
 
 
 def table_to_blockmanager(options, table, memory_pool, nthreads=1,
-                          categoricals=None):
+                          categories=None):
     from pyarrow.compat import DatetimeTZDtype
 
     index_columns = []
@@ -564,7 +564,8 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1,
                 block_table.schema.get_field_index(raw_name)
             )
 
-    blocks = _table_to_blocks(options, block_table, nthreads, memory_pool)
+    blocks = _table_to_blocks(options, block_table, nthreads, memory_pool,
+                              categories)
 
     # Construct the row index
     if len(index_arrays) > 1:
@@ -651,12 +652,12 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
     )
 
 
-def _table_to_blocks(options, block_table, nthreads, memory_pool):
+def _table_to_blocks(options, block_table, nthreads, memory_pool, categories):
     # Part of table_to_blockmanager
 
     # Convert an arrow table to Block from the internal pandas API
     result = pa.lib.table_to_blocks(options, block_table, nthreads,
-                                    memory_pool)
+                                    memory_pool, categories)
 
     # Defined above
     return [_reconstruct_block(item) for item in result]
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index e14d4739f..178df5767 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -746,17 +746,22 @@ cdef class RecordBatch:
 
 
 def table_to_blocks(PandasOptions options, Table table, int nthreads,
-                    MemoryPool memory_pool):
+                    MemoryPool memory_pool, categories):
     cdef:
         PyObject* result_obj
         shared_ptr[CTable] c_table = table.sp_table
         CMemoryPool* pool
+        unordered_set[c_string] categorical_columns
+
+    if categories is not None:
+        categorical_columns = {tobytes(cat) for cat in categories}
 
     pool = maybe_unbox_memory_pool(memory_pool)
     with nogil:
         check_status(
             libarrow.ConvertTableToPandas(
-                options, c_table, nthreads, pool, &result_obj
+                options, categorical_columns, c_table, nthreads, pool,
+                &result_obj
             )
         )
 
@@ -1012,7 +1017,7 @@ cdef class Table:
         return result
 
     def to_pandas(self, nthreads=None, strings_to_categorical=False,
-                  memory_pool=None, zero_copy_only=False):
+                  memory_pool=None, zero_copy_only=False, categories=None):
         """
         Convert the arrow::Table to a pandas DataFrame
 
@@ -1029,6 +1034,8 @@ cdef class Table:
         zero_copy_only : boolean, default False
             Raise an ArrowException if this function call would require copying
             the underlying data
+        categories: list, default empty
+            List of columns that should be returned as pandas.Categorical
 
         Returns
         -------
@@ -1036,6 +1043,7 @@ cdef class Table:
         """
         cdef:
             PandasOptions options
+
         options = PandasOptions(
             strings_to_categorical=strings_to_categorical,
             zero_copy_only=zero_copy_only)
@@ -1043,7 +1051,7 @@ cdef class Table:
         if nthreads is None:
             nthreads = cpu_count()
         mgr = pdcompat.table_to_blockmanager(options, self, memory_pool,
-                                             nthreads)
+                                             nthreads, categories)
         return pd.DataFrame(mgr)
 
     def to_pydict(self):
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 6e68dd961..986aeffca 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1027,6 +1027,24 @@ def test_table_empty_str(self):
         expected2 = pd.DataFrame({'strings': pd.Categorical(values)})
         tm.assert_frame_equal(result2, expected2, check_dtype=True)
 
+    def test_selective_categoricals(self):
+        values = ['', '', '', '', '']
+        df = pd.DataFrame({'strings': values})
+        field = pa.field('strings', pa.string())
+        schema = pa.schema([field])
+        table = pa.Table.from_pandas(df, schema=schema)
+        expected_str = pd.DataFrame({'strings': values})
+        expected_cat = pd.DataFrame({'strings': pd.Categorical(values)})
+
+        result1 = table.to_pandas(categories=['strings'])
+        tm.assert_frame_equal(result1, expected_cat, check_dtype=True)
+        result2 = table.to_pandas(categories=[])
+        tm.assert_frame_equal(result2, expected_str, check_dtype=True)
+        result3 = table.to_pandas(categories=('strings',))
+        tm.assert_frame_equal(result3, expected_cat, check_dtype=True)
+        result4 = table.to_pandas(categories=tuple())
+        tm.assert_frame_equal(result4, expected_str, check_dtype=True)
+
     def test_table_str_to_categorical_without_na(self):
         values = ['a', 'a', 'b', 'b', 'c']
         df = pd.DataFrame({'strings': values})


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [Python] Permit categorical conversions in Table.to_pandas on a per-column basis
> --------------------------------------------------------------------------------
>
>                 Key: ARROW-1632
>                 URL: https://issues.apache.org/jira/browse/ARROW-1632
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: Python
>            Reporter: Wes McKinney
>            Assignee: Uwe L. Korn
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> Currently this is all or nothing



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)