You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/10/06 09:33:05 UTC
[GitHub] [arrow] pitrou commented on a change in pull request #8337: ARROW-10151: [Python] Add support for MapArray conversion to Pandas

pitrou commented on a change in pull request #8337:
URL: https://github.com/apache/arrow/pull/8337#discussion_r500077813



##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -791,6 +791,117 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
   return Status::OK();
 }
 
+inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data,
+                         PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
+  //    Storing a reference to the whole Array would be too expensive.
+
+  // ARROW-3789(wesm): During refactoring I found that unit tests assumed that
+  // timestamp units would be preserved on list<timestamp UNIT> conversions in
+  // Table.to_pandas. So we set the option here to not coerce things to
+  // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy
+  // the existing unit tests
+  PandasOptions modified_options = options;
+  modified_options.coerce_temporal_nanoseconds = false;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRef list_item;
+  OwnedRef key_value;
+  OwnedRef item_value;
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_keys, nullptr,
+                                            owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_items, nullptr,
+                                            owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    auto arr = std::static_pointer_cast<MapArray>(data.chunk(c));

Review comment:
       Use `checked_cast`

##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -791,6 +791,117 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
   return Status::OK();
 }
 
+inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data,
+                         PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
+  //    Storing a reference to the whole Array would be too expensive.
+
+  // ARROW-3789(wesm): During refactoring I found that unit tests assumed that
+  // timestamp units would be preserved on list<timestamp UNIT> conversions in
+  // Table.to_pandas. So we set the option here to not coerce things to
+  // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy
+  // the existing unit tests
+  PandasOptions modified_options = options;
+  modified_options.coerce_temporal_nanoseconds = false;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRef list_item;
+  OwnedRef key_value;
+  OwnedRef item_value;
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_keys, nullptr,
+                                            owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_items, nullptr,
+                                            owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    auto arr = std::static_pointer_cast<MapArray>(data.chunk(c));
+    const bool has_nulls = data.null_count() > 0;
+
+    // Make a list of key/item pairs for each row in array
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      if (has_nulls && arr->IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        int64_t entry_offset = arr->value_offset(i);
+        int64_t num_maps = arr->value_offset(i + 1) - entry_offset;
+
+        // Build the new list object for the row of maps
+        list_item.reset(PyList_New(0));
+        RETURN_IF_PYERROR();
+
+        // Add each key/item pair in the row
+        for (int64_t j = 0; j < num_maps; ++j) {
+          // Get key value, key is non-nullable for a valid row
+          auto ptr_key = reinterpret_cast<const char*>(
+              PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
+          key_value.reset(PyArray_GETITEM(py_keys, ptr_key));
+          RETURN_IF_PYERROR();
+
+          if (item_arrays[c]->IsNull(entry_offset + j)) {
+            // Translate the Null to a None
+            Py_INCREF(Py_None);
+            item_value.reset(Py_None);
+          } else {
+            // Get valid value from item array
+            auto ptr_item = reinterpret_cast<const char*>(
+                PyArray_GETPTR1(py_items, chunk_offset + entry_offset + j));
+            item_value.reset(PyArray_GETITEM(py_items, ptr_item));
+            RETURN_IF_PYERROR();
+          }
+
+          // Add the key/item pair to the list for the row
+          PyList_Append(list_item.obj(),

Review comment:
       With a presized list, use `PyList_SET_ITEM` instead. Be careful with reference counts.

##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -791,6 +791,117 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
   return Status::OK();
 }
 
+inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data,

Review comment:
       No need to make this `inline`.

##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -791,6 +791,117 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
   return Status::OK();
 }
 
+inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data,
+                         PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
+  //    Storing a reference to the whole Array would be too expensive.

Review comment:
       I don't know what that means. Can you reformulate?

##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -791,6 +791,117 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
   return Status::OK();
 }
 
+inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data,
+                         PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
+  //    Storing a reference to the whole Array would be too expensive.
+
+  // ARROW-3789(wesm): During refactoring I found that unit tests assumed that
+  // timestamp units would be preserved on list<timestamp UNIT> conversions in
+  // Table.to_pandas. So we set the option here to not coerce things to
+  // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy
+  // the existing unit tests
+  PandasOptions modified_options = options;
+  modified_options.coerce_temporal_nanoseconds = false;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRef list_item;
+  OwnedRef key_value;
+  OwnedRef item_value;
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_keys, nullptr,
+                                            owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_items, nullptr,
+                                            owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    auto arr = std::static_pointer_cast<MapArray>(data.chunk(c));
+    const bool has_nulls = data.null_count() > 0;
+
+    // Make a list of key/item pairs for each row in array
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      if (has_nulls && arr->IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        int64_t entry_offset = arr->value_offset(i);
+        int64_t num_maps = arr->value_offset(i + 1) - entry_offset;
+
+        // Build the new list object for the row of maps
+        list_item.reset(PyList_New(0));
+        RETURN_IF_PYERROR();
+
+        // Add each key/item pair in the row
+        for (int64_t j = 0; j < num_maps; ++j) {
+          // Get key value, key is non-nullable for a valid row
+          auto ptr_key = reinterpret_cast<const char*>(
+              PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
+          key_value.reset(PyArray_GETITEM(py_keys, ptr_key));
+          RETURN_IF_PYERROR();
+
+          if (item_arrays[c]->IsNull(entry_offset + j)) {
+            // Translate the Null to a None
+            Py_INCREF(Py_None);

Review comment:
       Why incref here?

##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -791,6 +791,117 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
   return Status::OK();
 }
 
+inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data,
+                         PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
+  //    Storing a reference to the whole Array would be too expensive.
+
+  // ARROW-3789(wesm): During refactoring I found that unit tests assumed that

Review comment:
       Is this still relevant? Blindly copying old comments like this doesn't give me good feelings.

##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -791,6 +791,117 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
   return Status::OK();
 }
 
+inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data,
+                         PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
+  //    Storing a reference to the whole Array would be too expensive.
+
+  // ARROW-3789(wesm): During refactoring I found that unit tests assumed that
+  // timestamp units would be preserved on list<timestamp UNIT> conversions in
+  // Table.to_pandas. So we set the option here to not coerce things to
+  // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy
+  // the existing unit tests
+  PandasOptions modified_options = options;
+  modified_options.coerce_temporal_nanoseconds = false;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRef list_item;
+  OwnedRef key_value;
+  OwnedRef item_value;
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_keys, nullptr,
+                                            owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_items, nullptr,
+                                            owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    auto arr = std::static_pointer_cast<MapArray>(data.chunk(c));
+    const bool has_nulls = data.null_count() > 0;
+
+    // Make a list of key/item pairs for each row in array
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      if (has_nulls && arr->IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        int64_t entry_offset = arr->value_offset(i);
+        int64_t num_maps = arr->value_offset(i + 1) - entry_offset;
+
+        // Build the new list object for the row of maps
+        list_item.reset(PyList_New(0));
+        RETURN_IF_PYERROR();
+
+        // Add each key/item pair in the row
+        for (int64_t j = 0; j < num_maps; ++j) {
+          // Get key value, key is non-nullable for a valid row
+          auto ptr_key = reinterpret_cast<const char*>(
+              PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
+          key_value.reset(PyArray_GETITEM(py_keys, ptr_key));
+          RETURN_IF_PYERROR();
+
+          if (item_arrays[c]->IsNull(entry_offset + j)) {
+            // Translate the Null to a None
+            Py_INCREF(Py_None);
+            item_value.reset(Py_None);
+          } else {
+            // Get valid value from item array
+            auto ptr_item = reinterpret_cast<const char*>(
+                PyArray_GETPTR1(py_items, chunk_offset + entry_offset + j));
+            item_value.reset(PyArray_GETITEM(py_items, ptr_item));
+            RETURN_IF_PYERROR();

Review comment:
       Is this required? I would expect the above to be always successful?

##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -791,6 +791,117 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
   return Status::OK();
 }
 
+inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data,
+                         PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
+  //    Storing a reference to the whole Array would be too expensive.
+
+  // ARROW-3789(wesm): During refactoring I found that unit tests assumed that
+  // timestamp units would be preserved on list<timestamp UNIT> conversions in
+  // Table.to_pandas. So we set the option here to not coerce things to
+  // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy
+  // the existing unit tests
+  PandasOptions modified_options = options;
+  modified_options.coerce_temporal_nanoseconds = false;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRef list_item;
+  OwnedRef key_value;
+  OwnedRef item_value;
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_keys, nullptr,
+                                            owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_items, nullptr,
+                                            owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    auto arr = std::static_pointer_cast<MapArray>(data.chunk(c));
+    const bool has_nulls = data.null_count() > 0;
+
+    // Make a list of key/item pairs for each row in array
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      if (has_nulls && arr->IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        int64_t entry_offset = arr->value_offset(i);
+        int64_t num_maps = arr->value_offset(i + 1) - entry_offset;
+
+        // Build the new list object for the row of maps
+        list_item.reset(PyList_New(0));
+        RETURN_IF_PYERROR();
+
+        // Add each key/item pair in the row
+        for (int64_t j = 0; j < num_maps; ++j) {
+          // Get key value, key is non-nullable for a valid row
+          auto ptr_key = reinterpret_cast<const char*>(
+              PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
+          key_value.reset(PyArray_GETITEM(py_keys, ptr_key));
+          RETURN_IF_PYERROR();
+
+          if (item_arrays[c]->IsNull(entry_offset + j)) {
+            // Translate the Null to a None
+            Py_INCREF(Py_None);
+            item_value.reset(Py_None);
+          } else {
+            // Get valid value from item array
+            auto ptr_item = reinterpret_cast<const char*>(
+                PyArray_GETPTR1(py_items, chunk_offset + entry_offset + j));
+            item_value.reset(PyArray_GETITEM(py_items, ptr_item));
+            RETURN_IF_PYERROR();
+          }
+
+          // Add the key/item pair to the list for the row
+          PyList_Append(list_item.obj(),
+                        PyTuple_Pack(2, key_value.obj(), item_value.obj()));
+          RETURN_IF_PYERROR();
+        }
+
+        *out_values = list_item.obj();
+        // Grant ownership to the resulting array
+        Py_INCREF(*out_values);

Review comment:
       Use `OwnedRef.detach` instead of issueing duplicate incref/decrefs.

##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -791,6 +791,117 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
   return Status::OK();
 }
 
+inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data,
+                         PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
+  //    Storing a reference to the whole Array would be too expensive.
+
+  // ARROW-3789(wesm): During refactoring I found that unit tests assumed that
+  // timestamp units would be preserved on list<timestamp UNIT> conversions in
+  // Table.to_pandas. So we set the option here to not coerce things to
+  // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy
+  // the existing unit tests
+  PandasOptions modified_options = options;
+  modified_options.coerce_temporal_nanoseconds = false;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRef list_item;
+  OwnedRef key_value;
+  OwnedRef item_value;
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_keys, nullptr,
+                                            owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_items, nullptr,
+                                            owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    auto arr = std::static_pointer_cast<MapArray>(data.chunk(c));
+    const bool has_nulls = data.null_count() > 0;
+
+    // Make a list of key/item pairs for each row in array
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      if (has_nulls && arr->IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        int64_t entry_offset = arr->value_offset(i);
+        int64_t num_maps = arr->value_offset(i + 1) - entry_offset;
+
+        // Build the new list object for the row of maps
+        list_item.reset(PyList_New(0));

Review comment:
       You should presize to `num_maps`.

##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -791,6 +791,117 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
   return Status::OK();
 }
 
+inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data,
+                         PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
+  //    Storing a reference to the whole Array would be too expensive.
+
+  // ARROW-3789(wesm): During refactoring I found that unit tests assumed that
+  // timestamp units would be preserved on list<timestamp UNIT> conversions in
+  // Table.to_pandas. So we set the option here to not coerce things to
+  // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy
+  // the existing unit tests
+  PandasOptions modified_options = options;
+  modified_options.coerce_temporal_nanoseconds = false;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRef list_item;
+  OwnedRef key_value;
+  OwnedRef item_value;
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_keys, nullptr,
+                                            owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_items, nullptr,
+                                            owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    auto arr = std::static_pointer_cast<MapArray>(data.chunk(c));
+    const bool has_nulls = data.null_count() > 0;
+
+    // Make a list of key/item pairs for each row in array
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      if (has_nulls && arr->IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        int64_t entry_offset = arr->value_offset(i);
+        int64_t num_maps = arr->value_offset(i + 1) - entry_offset;
+
+        // Build the new list object for the row of maps
+        list_item.reset(PyList_New(0));
+        RETURN_IF_PYERROR();
+
+        // Add each key/item pair in the row
+        for (int64_t j = 0; j < num_maps; ++j) {
+          // Get key value, key is non-nullable for a valid row
+          auto ptr_key = reinterpret_cast<const char*>(
+              PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
+          key_value.reset(PyArray_GETITEM(py_keys, ptr_key));
+          RETURN_IF_PYERROR();
+
+          if (item_arrays[c]->IsNull(entry_offset + j)) {

Review comment:
       Why do you need this? I would expect `py_items` to contain a `None` entry already...?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org