You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2021/09/16 08:01:10 UTC
[arrow] branch master updated: ARROW-13877: [C++] Support
FixedSizeList in generic list kernels
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2fb8201 ARROW-13877: [C++] Support FixedSizeList in generic list kernels
2fb8201 is described below
commit 2fb820145691630d0378d2f1c41061570970fc78
Author: David Li <li...@gmail.com>
AuthorDate: Thu Sep 16 09:58:56 2021 +0200
ARROW-13877: [C++] Support FixedSizeList in generic list kernels
Closes #11127 from lidavidm/arrow-13877
Authored-by: David Li <li...@gmail.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/arrow/array/array_list_test.cc | 34 ++++++++++++++++
cpp/src/arrow/array/array_nested.cc | 5 +++
cpp/src/arrow/array/array_nested.h | 7 ++++
cpp/src/arrow/compute/kernels/scalar_nested.cc | 20 ++++++++++
.../arrow/compute/kernels/scalar_nested_test.cc | 4 ++
cpp/src/arrow/compute/kernels/vector_nested.cc | 23 +++++++++++
.../arrow/compute/kernels/vector_nested_test.cc | 46 ++++++++++++++++++++++
docs/source/cpp/compute.rst | 6 ++-
8 files changed, 143 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc
index 8cb7d91..a503cbd 100644
--- a/cpp/src/arrow/array/array_list_test.cc
+++ b/cpp/src/arrow/array/array_list_test.cc
@@ -1145,4 +1145,38 @@ TEST_F(TestFixedSizeListArray, NotEnoughValues) {
ASSERT_OK(result_->ValidateFull());
}
+TEST_F(TestFixedSizeListArray, FlattenZeroLength) {
+ Done();
+ ASSERT_OK_AND_ASSIGN(auto flattened, result_->Flatten());
+ ASSERT_OK(flattened->ValidateFull());
+ ASSERT_EQ(0, flattened->length());
+ AssertTypeEqual(*flattened->type(), *value_type_);
+}
+
+TEST_F(TestFixedSizeListArray, Flatten) {
+ std::vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6, 7};
+ std::vector<uint8_t> is_valid = {1, 0, 1, 1};
+ ASSERT_OK(builder_->AppendValues(4, is_valid.data()));
+ auto* vb = checked_cast<Int32Builder*>(builder_->value_builder());
+ ASSERT_OK(vb->AppendValues(values.data(), static_cast<int64_t>(values.size())));
+ Done();
+
+ {
+ ASSERT_OK_AND_ASSIGN(auto flattened, result_->Flatten());
+ ASSERT_OK(flattened->ValidateFull());
+ ASSERT_EQ(6, flattened->length());
+ AssertArraysEqual(*flattened, *ArrayFromJSON(value_type_, "[0, 1, 4, 5, 6, 7]"),
+ /*verbose=*/true);
+ }
+
+ {
+ auto sliced = std::dynamic_pointer_cast<FixedSizeListArray>(result_->Slice(1, 2));
+ ASSERT_OK_AND_ASSIGN(auto flattened, sliced->Flatten());
+ ASSERT_OK(flattened->ValidateFull());
+ ASSERT_EQ(2, flattened->length());
+ AssertArraysEqual(*flattened, *ArrayFromJSON(value_type_, "[4, 5]"),
+ /*verbose=*/true);
+ }
+}
+
} // namespace arrow
diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc
index 102a825..db96a7b 100644
--- a/cpp/src/arrow/array/array_nested.cc
+++ b/cpp/src/arrow/array/array_nested.cc
@@ -438,6 +438,11 @@ Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
/*null_count=*/0, /*offset=*/0);
}
+Result<std::shared_ptr<Array>> FixedSizeListArray::Flatten(
+ MemoryPool* memory_pool) const {
+ return FlattenListArray(*this, memory_pool);
+}
+
// ----------------------------------------------------------------------
// Struct
diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h
index bd5abaa..c312acd 100644
--- a/cpp/src/arrow/array/array_nested.h
+++ b/cpp/src/arrow/array/array_nested.h
@@ -292,6 +292,13 @@ class ARROW_EXPORT FixedSizeListArray : public Array {
return values_->Slice(value_offset(i), value_length(i));
}
+ /// \brief Return an Array that is a concatenation of the lists in this array.
+ ///
+ /// Note that it's different from `values()` in that it takes into
+ /// consideration null elements (they are skipped, thus copying may be needed).
+ Result<std::shared_ptr<Array>> Flatten(
+ MemoryPool* memory_pool = default_memory_pool()) const;
+
/// \brief Construct FixedSizeListArray from child value array and value_length
///
/// \param[in] values Array containing list values
diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc
index e9f0696..9ffe8bf 100644
--- a/cpp/src/arrow/compute/kernels/scalar_nested.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc
@@ -55,6 +55,24 @@ Status ListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
return Status::OK();
}
+Status FixedSizeListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ using offset_type = typename FixedSizeListType::offset_type;
+ auto width = checked_cast<const FixedSizeListType&>(*batch[0].type()).list_size();
+ if (batch[0].kind() == Datum::ARRAY) {
+ const auto& arr = *batch[0].array();
+ ArrayData* out_arr = out->mutable_array();
+ auto* out_values = out_arr->GetMutableValues<offset_type>(1);
+ std::fill(out_values, out_values + arr.length, width);
+ } else {
+ const auto& arg0 = batch[0].scalar_as<FixedSizeListScalar>();
+ if (arg0.is_valid) {
+ checked_cast<Int32Scalar*>(out->scalar().get())->value = width;
+ }
+ }
+
+ return Status::OK();
+}
+
const FunctionDoc list_value_length_doc{
"Compute list lengths",
("`lists` must have a list-like type.\n"
@@ -161,6 +179,8 @@ void RegisterScalarNested(FunctionRegistry* registry) {
"list_value_length", Arity::Unary(), &list_value_length_doc);
DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(),
ListValueLength<ListType>));
+ DCHECK_OK(list_value_length->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(),
+ FixedSizeListValueLength));
DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(),
ListValueLength<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(list_value_length)));
diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
index ef48995..5cabed3 100644
--- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
@@ -37,6 +37,10 @@ TEST(TestScalarNested, ListValueLength) {
CheckScalarUnary("list_value_length", ty, "[[0, null, 1], null, [2, 3], []]",
GetOffsetType(*ty), "[3, null, 2, 0]");
}
+
+ CheckScalarUnary("list_value_length", fixed_size_list(int32(), 3),
+ "[[0, null, 1], null, [2, 3, 4], [1, 2, null]]", int32(),
+ "[3, null, 3, 3]");
}
struct {
diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc
index 8050475..974d0f0 100644
--- a/cpp/src/arrow/compute/kernels/vector_nested.cc
+++ b/cpp/src/arrow/compute/kernels/vector_nested.cc
@@ -76,6 +76,25 @@ struct ListParentIndicesArray {
Status Visit(const LargeListType& type) { return VisitList(type); }
+ Status Visit(const FixedSizeListType& type) {
+ using offset_type = typename FixedSizeListType::offset_type;
+ const offset_type slot_length = type.list_size();
+ const int64_t values_length = slot_length * (input->length - input->GetNullCount());
+ ARROW_ASSIGN_OR_RAISE(auto indices, ctx->Allocate(values_length * sizeof(int32_t)));
+ auto* out_indices = reinterpret_cast<offset_type*>(indices->mutable_data());
+ const auto* bitmap = input->GetValues<uint8_t>(0, 0);
+ for (int32_t i = 0; i < input->length; i++) {
+ if (!bitmap || BitUtil::GetBit(bitmap, input->offset + i)) {
+ std::fill(out_indices, out_indices + slot_length,
+ static_cast<int32_t>(base_output_offset + i));
+ out_indices += slot_length;
+ }
+ }
+ out = ArrayData::Make(int32(), values_length, {nullptr, std::move(indices)},
+ /*null_count=*/0);
+ return Status::OK();
+ }
+
Status Visit(const DataType& type) {
return Status::TypeError("Function 'list_parent_indices' expects list input, got ",
type.ToString());
@@ -99,6 +118,7 @@ Result<ValueDescr> ListValuesType(KernelContext*, const std::vector<ValueDescr>&
Result<std::shared_ptr<DataType>> ListParentIndicesType(const DataType& input_type) {
switch (input_type.id()) {
case Type::LIST:
+ case Type::FIXED_SIZE_LIST:
return int32();
case Type::LARGE_LIST:
return int64();
@@ -164,6 +184,9 @@ void RegisterVectorNested(FunctionRegistry* registry) {
std::make_shared<VectorFunction>("list_flatten", Arity::Unary(), &list_flatten_doc);
DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LIST)}, OutputType(ListValuesType),
ListFlatten<ListType>));
+ DCHECK_OK(flatten->AddKernel({InputType::Array(Type::FIXED_SIZE_LIST)},
+ OutputType(ListValuesType),
+ ListFlatten<FixedSizeListType>));
DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LARGE_LIST)},
OutputType(ListValuesType), ListFlatten<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(flatten)));
diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc
index c8ea6e7..28bb4bd 100644
--- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc
@@ -22,10 +22,13 @@
#include "arrow/compute/kernels/test_util.h"
#include "arrow/result.h"
#include "arrow/testing/gtest_util.h"
+#include "arrow/util/checked_cast.h"
namespace arrow {
namespace compute {
+using arrow::internal::checked_cast;
+
TEST(TestVectorNested, ListFlatten) {
for (auto ty : {list(int16()), large_list(int16())}) {
auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]");
@@ -51,6 +54,28 @@ TEST(TestVectorNested, ListFlattenChunkedArray) {
}
}
+TEST(TestVectorNested, ListFlattenFixedSizeList) {
+ for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) {
+ const auto& out_ty = checked_cast<const FixedSizeListType&>(*ty).value_type();
+ {
+ auto input = ArrayFromJSON(ty, "[[0, null], null, [2, 3], [0, 42]]");
+ auto expected = ArrayFromJSON(out_ty, "[0, null, 2, 3, 0, 42]");
+ CheckVectorUnary("list_flatten", input, expected);
+ }
+
+ {
+ // Test a chunked array
+ auto input = ChunkedArrayFromJSON(ty, {"[[0, null], null]", "[[2, 3], [0, 42]]"});
+ auto expected = ChunkedArrayFromJSON(out_ty, {"[0, null]", "[2, 3, 0, 42]"});
+ CheckVectorUnary("list_flatten", input, expected);
+
+ input = ChunkedArrayFromJSON(ty, {});
+ expected = ChunkedArrayFromJSON(out_ty, {});
+ CheckVectorUnary("list_flatten", input, expected);
+ }
+ }
+}
+
TEST(TestVectorNested, ListParentIndices) {
for (auto ty : {list(int16()), large_list(int16())}) {
auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]");
@@ -82,5 +107,26 @@ TEST(TestVectorNested, ListParentIndicesChunkedArray) {
}
}
+TEST(TestVectorNested, ListParentIndicesFixedSizeList) {
+ for (auto ty : {fixed_size_list(int16(), 2), fixed_size_list(uint32(), 2)}) {
+ {
+ auto input = ArrayFromJSON(ty, "[[0, null], null, [1, 2], [3, 4], [null, 5]]");
+ auto expected = ArrayFromJSON(int32(), "[0, 0, 2, 2, 3, 3, 4, 4]");
+ CheckVectorUnary("list_parent_indices", input, expected);
+ }
+ {
+ // Test a chunked array
+ auto input =
+ ChunkedArrayFromJSON(ty, {"[[0, null], null, [1, 2]]", "[[3, 4], [null, 5]]"});
+ auto expected = ChunkedArrayFromJSON(int32(), {"[0, 0, 2, 2]", "[3, 3, 4, 4]"});
+ CheckVectorUnary("list_parent_indices", input, expected);
+
+ input = ChunkedArrayFromJSON(ty, {});
+ expected = ChunkedArrayFromJSON(int32(), {});
+ CheckVectorUnary("list_parent_indices", input, expected);
+ }
+ }
+}
+
} // namespace compute
} // namespace arrow
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 26781a2..9e9556d 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1173,7 +1173,8 @@ Structural transforms
+--------------------------+------------+----------------+-------------------+------------------------------+---------+
* \(1) Each output element is the length of the corresponding input element
- (null if input is null). Output type is Int32 for List, Int64 for LargeList.
+ (null if input is null). Output type is Int32 for List and FixedSizeList,
+ Int64 for LargeList.
* \(2) The output struct's field types are the types of its arguments. The
field names are specified using an instance of :struct:`MakeStructOptions`.
@@ -1488,7 +1489,8 @@ Structural transforms
* \(2) For each value in the list child array, the index at which it is found
in the list array is appended to the output. Nulls in the parent list array
- are discarded.
+ are discarded. Output type is Int32 for List and FixedSizeList, Int64 for
+ LargeList.
These functions create a copy of the first input with some elements
replaced, based on the remaining inputs.