You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2021/09/16 08:01:10 UTC

[arrow] branch master updated: ARROW-13877: [C++] Support FixedSizeList in generic list kernels

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2fb8201  ARROW-13877: [C++] Support FixedSizeList in generic list kernels
2fb8201 is described below

commit 2fb820145691630d0378d2f1c41061570970fc78
Author: David Li <li...@gmail.com>
AuthorDate: Thu Sep 16 09:58:56 2021 +0200

    ARROW-13877: [C++] Support FixedSizeList in generic list kernels
    
    Closes #11127 from lidavidm/arrow-13877
    
    Authored-by: David Li <li...@gmail.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/array/array_list_test.cc             | 34 ++++++++++++++++
 cpp/src/arrow/array/array_nested.cc                |  5 +++
 cpp/src/arrow/array/array_nested.h                 |  7 ++++
 cpp/src/arrow/compute/kernels/scalar_nested.cc     | 20 ++++++++++
 .../arrow/compute/kernels/scalar_nested_test.cc    |  4 ++
 cpp/src/arrow/compute/kernels/vector_nested.cc     | 23 +++++++++++
 .../arrow/compute/kernels/vector_nested_test.cc    | 46 ++++++++++++++++++++++
 docs/source/cpp/compute.rst                        |  6 ++-
 8 files changed, 143 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc
index 8cb7d91..a503cbd 100644
--- a/cpp/src/arrow/array/array_list_test.cc
+++ b/cpp/src/arrow/array/array_list_test.cc
@@ -1145,4 +1145,38 @@ TEST_F(TestFixedSizeListArray, NotEnoughValues) {
   ASSERT_OK(result_->ValidateFull());
 }
 
+TEST_F(TestFixedSizeListArray, FlattenZeroLength) {
+  Done();
+  ASSERT_OK_AND_ASSIGN(auto flattened, result_->Flatten());
+  ASSERT_OK(flattened->ValidateFull());
+  ASSERT_EQ(0, flattened->length());
+  AssertTypeEqual(*flattened->type(), *value_type_);
+}
+
+TEST_F(TestFixedSizeListArray, Flatten) {
+  std::vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6, 7};
+  std::vector<uint8_t> is_valid = {1, 0, 1, 1};
+  ASSERT_OK(builder_->AppendValues(4, is_valid.data()));
+  auto* vb = checked_cast<Int32Builder*>(builder_->value_builder());
+  ASSERT_OK(vb->AppendValues(values.data(), static_cast<int64_t>(values.size())));
+  Done();
+
+  {
+    ASSERT_OK_AND_ASSIGN(auto flattened, result_->Flatten());
+    ASSERT_OK(flattened->ValidateFull());
+    ASSERT_EQ(6, flattened->length());
+    AssertArraysEqual(*flattened, *ArrayFromJSON(value_type_, "[0, 1, 4, 5, 6, 7]"),
+                      /*verbose=*/true);
+  }
+
+  {
+    auto sliced = std::dynamic_pointer_cast<FixedSizeListArray>(result_->Slice(1, 2));
+    ASSERT_OK_AND_ASSIGN(auto flattened, sliced->Flatten());
+    ASSERT_OK(flattened->ValidateFull());
+    ASSERT_EQ(2, flattened->length());
+    AssertArraysEqual(*flattened, *ArrayFromJSON(value_type_, "[4, 5]"),
+                      /*verbose=*/true);
+  }
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc
index 102a825..db96a7b 100644
--- a/cpp/src/arrow/array/array_nested.cc
+++ b/cpp/src/arrow/array/array_nested.cc
@@ -438,6 +438,11 @@ Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
                                               /*null_count=*/0, /*offset=*/0);
 }
 
+Result<std::shared_ptr<Array>> FixedSizeListArray::Flatten(
+    MemoryPool* memory_pool) const {
+  return FlattenListArray(*this, memory_pool);
+}
+
 // ----------------------------------------------------------------------
 // Struct
 
diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h
index bd5abaa..c312acd 100644
--- a/cpp/src/arrow/array/array_nested.h
+++ b/cpp/src/arrow/array/array_nested.h
@@ -292,6 +292,13 @@ class ARROW_EXPORT FixedSizeListArray : public Array {
     return values_->Slice(value_offset(i), value_length(i));
   }
 
+  /// \brief Return an Array that is a concatenation of the lists in this array.
+  ///
+  /// Note that it's different from `values()` in that it takes into
+  /// consideration null elements (they are skipped, thus copying may be needed).
+  Result<std::shared_ptr<Array>> Flatten(
+      MemoryPool* memory_pool = default_memory_pool()) const;
+
   /// \brief Construct FixedSizeListArray from child value array and value_length
   ///
   /// \param[in] values Array containing list values
diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc
index e9f0696..9ffe8bf 100644
--- a/cpp/src/arrow/compute/kernels/scalar_nested.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc
@@ -55,6 +55,24 @@ Status ListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
   return Status::OK();
 }
 
+Status FixedSizeListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+  using offset_type = typename FixedSizeListType::offset_type;
+  auto width = checked_cast<const FixedSizeListType&>(*batch[0].type()).list_size();
+  if (batch[0].kind() == Datum::ARRAY) {
+    const auto& arr = *batch[0].array();
+    ArrayData* out_arr = out->mutable_array();
+    auto* out_values = out_arr->GetMutableValues<offset_type>(1);
+    std::fill(out_values, out_values + arr.length, width);
+  } else {
+    const auto& arg0 = batch[0].scalar_as<FixedSizeListScalar>();
+    if (arg0.is_valid) {
+      checked_cast<Int32Scalar*>(out->scalar().get())->value = width;
+    }
+  }
+
+  return Status::OK();
+}
+
 const FunctionDoc list_value_length_doc{
     "Compute list lengths",
     ("`lists` must have a list-like type.\n"
@@ -161,6 +179,8 @@ void RegisterScalarNested(FunctionRegistry* registry) {
       "list_value_length", Arity::Unary(), &list_value_length_doc);
   DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(),
                                          ListValueLength<ListType>));
+  DCHECK_OK(list_value_length->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(),
+                                         FixedSizeListValueLength));
   DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(),
                                          ListValueLength<LargeListType>));
   DCHECK_OK(registry->AddFunction(std::move(list_value_length)));
diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
index ef48995..5cabed3 100644
--- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
@@ -37,6 +37,10 @@ TEST(TestScalarNested, ListValueLength) {
     CheckScalarUnary("list_value_length", ty, "[[0, null, 1], null, [2, 3], []]",
                      GetOffsetType(*ty), "[3, null, 2, 0]");
   }
+
+  CheckScalarUnary("list_value_length", fixed_size_list(int32(), 3),
+                   "[[0, null, 1], null, [2, 3, 4], [1, 2, null]]", int32(),
+                   "[3, null, 3, 3]");
 }
 
 struct {
diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc
index 8050475..974d0f0 100644
--- a/cpp/src/arrow/compute/kernels/vector_nested.cc
+++ b/cpp/src/arrow/compute/kernels/vector_nested.cc
@@ -76,6 +76,25 @@ struct ListParentIndicesArray {
 
   Status Visit(const LargeListType& type) { return VisitList(type); }
 
+  Status Visit(const FixedSizeListType& type) {
+    using offset_type = typename FixedSizeListType::offset_type;
+    const offset_type slot_length = type.list_size();
+    const int64_t values_length = slot_length * (input->length - input->GetNullCount());
+    ARROW_ASSIGN_OR_RAISE(auto indices, ctx->Allocate(values_length * sizeof(int32_t)));
+    auto* out_indices = reinterpret_cast<offset_type*>(indices->mutable_data());
+    const auto* bitmap = input->GetValues<uint8_t>(0, 0);
+    for (int32_t i = 0; i < input->length; i++) {
+      if (!bitmap || BitUtil::GetBit(bitmap, input->offset + i)) {
+        std::fill(out_indices, out_indices + slot_length,
+                  static_cast<int32_t>(base_output_offset + i));
+        out_indices += slot_length;
+      }
+    }
+    out = ArrayData::Make(int32(), values_length, {nullptr, std::move(indices)},
+                          /*null_count=*/0);
+    return Status::OK();
+  }
+
   Status Visit(const DataType& type) {
     return Status::TypeError("Function 'list_parent_indices' expects list input, got ",
                              type.ToString());
@@ -99,6 +118,7 @@ Result<ValueDescr> ListValuesType(KernelContext*, const std::vector<ValueDescr>&
 Result<std::shared_ptr<DataType>> ListParentIndicesType(const DataType& input_type) {
   switch (input_type.id()) {
     case Type::LIST:
+    case Type::FIXED_SIZE_LIST:
       return int32();
     case Type::LARGE_LIST:
       return int64();
@@ -164,6 +184,9 @@ void RegisterVectorNested(FunctionRegistry* registry) {
       std::make_shared<VectorFunction>("list_flatten", Arity::Unary(), &list_flatten_doc);
   DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LIST)}, OutputType(ListValuesType),
                                ListFlatten<ListType>));
+  DCHECK_OK(flatten->AddKernel({InputType::Array(Type::FIXED_SIZE_LIST)},
+                               OutputType(ListValuesType),
+                               ListFlatten<FixedSizeListType>));
   DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LARGE_LIST)},
                                OutputType(ListValuesType), ListFlatten<LargeListType>));
   DCHECK_OK(registry->AddFunction(std::move(flatten)));
diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc
index c8ea6e7..28bb4bd 100644
--- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc
@@ -22,10 +22,13 @@
 #include "arrow/compute/kernels/test_util.h"
 #include "arrow/result.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/checked_cast.h"
 
 namespace arrow {
 namespace compute {
 
+using arrow::internal::checked_cast;
+
 TEST(TestVectorNested, ListFlatten) {
   for (auto ty : {list(int16()), large_list(int16())}) {
     auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]");
@@ -51,6 +54,28 @@ TEST(TestVectorNested, ListFlattenChunkedArray) {
   }
 }
 
+TEST(TestVectorNested, ListFlattenFixedSizeList) {
+  for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) {
+    const auto& out_ty = checked_cast<const FixedSizeListType&>(*ty).value_type();
+    {
+      auto input = ArrayFromJSON(ty, "[[0, null], null, [2, 3], [0, 42]]");
+      auto expected = ArrayFromJSON(out_ty, "[0, null, 2, 3, 0, 42]");
+      CheckVectorUnary("list_flatten", input, expected);
+    }
+
+    {
+      // Test a chunked array
+      auto input = ChunkedArrayFromJSON(ty, {"[[0, null], null]", "[[2, 3], [0, 42]]"});
+      auto expected = ChunkedArrayFromJSON(out_ty, {"[0, null]", "[2, 3, 0, 42]"});
+      CheckVectorUnary("list_flatten", input, expected);
+
+      input = ChunkedArrayFromJSON(ty, {});
+      expected = ChunkedArrayFromJSON(out_ty, {});
+      CheckVectorUnary("list_flatten", input, expected);
+    }
+  }
+}
+
 TEST(TestVectorNested, ListParentIndices) {
   for (auto ty : {list(int16()), large_list(int16())}) {
     auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]");
@@ -82,5 +107,26 @@ TEST(TestVectorNested, ListParentIndicesChunkedArray) {
   }
 }
 
+TEST(TestVectorNested, ListParentIndicesFixedSizeList) {
+  for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) {
+    {
+      auto input = ArrayFromJSON(ty, "[[0, null], null, [1, 2], [3, 4], [null, 5]]");
+      auto expected = ArrayFromJSON(int32(), "[0, 0, 2, 2, 3, 3, 4, 4]");
+      CheckVectorUnary("list_parent_indices", input, expected);
+    }
+    {
+      // Test a chunked array
+      auto input =
+          ChunkedArrayFromJSON(ty, {"[[0, null], null, [1, 2]]", "[[3, 4], [null, 5]]"});
+      auto expected = ChunkedArrayFromJSON(int32(), {"[0, 0, 2, 2]", "[3, 3, 4, 4]"});
+      CheckVectorUnary("list_parent_indices", input, expected);
+
+      input = ChunkedArrayFromJSON(ty, {});
+      expected = ChunkedArrayFromJSON(int32(), {});
+      CheckVectorUnary("list_parent_indices", input, expected);
+    }
+  }
+}
+
 }  // namespace compute
 }  // namespace arrow
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 26781a2..9e9556d 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1173,7 +1173,8 @@ Structural transforms
 +--------------------------+------------+----------------+-------------------+------------------------------+---------+
 
 * \(1) Each output element is the length of the corresponding input element
-  (null if input is null).  Output type is Int32 for List, Int64 for LargeList.
+  (null if input is null).  Output type is Int32 for List and FixedSizeList,
+  Int64 for LargeList.
 
 * \(2) The output struct's field types are the types of its arguments. The
   field names are specified using an instance of :struct:`MakeStructOptions`.
@@ -1488,7 +1489,8 @@ Structural transforms
 
 * \(2) For each value in the list child array, the index at which it is found
   in the list array is appended to the output.  Nulls in the parent list array
-  are discarded.
+  are discarded.  Output type is Int32 for List and FixedSizeList, Int64 for
+  LargeList.
 
 These functions create a copy of the first input with some elements
 replaced, based on the remaining inputs.