Posted to github@arrow.apache.org by "pitrou (via GitHub)" <gi...@apache.org> on 2023/06/15 14:52:32 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #35727: GH-33206: [C++] Add support for StructArray sorting and nested sort keys

pitrou commented on code in PR #35727:
URL: https://github.com/apache/arrow/pull/35727#discussion_r1231029320


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1017,6 +1102,35 @@ Result<NullPartitionResult> SortChunkedArray(
   return output;
 }
 
+Result<NullPartitionResult> SortStructArray(ExecContext* ctx, uint64_t* indices_begin,
+                                            uint64_t* indices_end,
+                                            const StructArray& array,
+                                            SortOrder sort_order,
+                                            NullPlacement null_placement) {
+  ARROW_ASSIGN_OR_RAISE(auto columns, array.Flatten());
+  auto batch = RecordBatch::Make(schema(array.type()->fields()), array.length(),
+                                 std::move(columns));
+
+  auto options = SortOptions::Defaults();
+  options.null_placement = null_placement;
+  options.sort_keys.reserve(array.num_fields());
+  for (int i = 0; i < array.num_fields(); ++i) {
+    options.sort_keys.push_back(SortKey(FieldRef(i), sort_order));
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto sort_keys,
+                        ResolveRecordBatchSortKeys(*batch, options.sort_keys));
+  if (sort_keys.size() <= 8) {

Review Comment:
   Can we add an internal constant for this magic threshold?
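
   Something like this, for example (the constant name and comment are only placeholders):
   
   ```cpp
   // TODO: give this a descriptive name and a comment saying which sorter the
   // branch selects below vs. above the threshold.
   constexpr size_t kDefaultSortKeyCountThreshold = 8;
   
   // ...
   if (sort_keys.size() <= kDefaultSortKeyCountThreshold) {
   ```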



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -974,22 +1030,51 @@ class SortIndicesMetaFunction : public MetaFunction {
   }
 };
 
+void AddLeafFields(const FieldVector& fields, SortOrder order,
+                   std::vector<SortField>* out, std::vector<int>* tmp_indices) {
+  if (fields.empty()) {
+    return;
+  }
+
+  tmp_indices->push_back(0);
+  for (const auto& f : fields) {
+    const auto& type = *f->type();
+    if (type.id() == Type::STRUCT) {
+      AddLeafFields(type.fields(), order, out, tmp_indices);
+    } else {
+      out->emplace_back(FieldPath(*tmp_indices), order);
+    }
+    ++tmp_indices->back();
+  }
+  tmp_indices->pop_back();
+}
+
+void AddField(const DataType& type, const FieldPath& path, SortOrder order,
+              std::vector<SortField>* out, std::vector<int>* tmp_indices) {
+  if (type.id() == Type::STRUCT) {
+    *tmp_indices = path.indices();

Review Comment:
   Does this work if we have a struct-inside-a-struct?
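
   For concreteness, the case I have in mind (a sketch; names and field types are arbitrary):
   
   ```cpp
   // "a.b" is itself a struct nested inside a struct.
   auto inner = struct_({field("c", int32()), field("d", utf8())});
   auto ty = struct_({field("a", struct_({field("b", inner)}))});
   // A sort key targeting "a.b" should then expand to the leaf paths
   // FieldPath({0, 0, 0}) ("c") and FieldPath({0, 0, 1}) ("d").
   ```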



##########
cpp/src/arrow/compute/kernels/vector_sort_internal.h:
##########
@@ -737,17 +757,32 @@ struct ResolvedTableSortKey {
   static Result<std::vector<ResolvedTableSortKey>> Make(
       const Table& table, const RecordBatchVector& batches,
       const std::vector<SortKey>& sort_keys) {
-    auto factory = [&](const SortField& f) {
-      const auto& type = table.schema()->field(f.field_index)->type();
+    auto factory = [&](const SortField& f) -> Result<ResolvedTableSortKey> {
+      std::shared_ptr<DataType> type;
+      int64_t null_count = 0;
+      ArrayVector chunks;
+      chunks.reserve(batches.size());
+
       // We must expose a homogenous chunking for all ResolvedSortKey,
-      // so we can't simply pass `table.column(f.field_index)`
-      ArrayVector chunks(batches.size());
-      std::transform(batches.begin(), batches.end(), chunks.begin(),
-                     [&](const std::shared_ptr<RecordBatch>& batch) {
-                       return batch->column(f.field_index);
-                     });
-      return ResolvedTableSortKey(type, std::move(chunks), f.order,
-                                  table.column(f.field_index)->null_count());
+      // so we can't simply access the column from the table directly.
+      if (f.is_nested()) {
+        ARROW_ASSIGN_OR_RAISE(auto schema_field, f.path.Get(*table.schema()));
+        type = schema_field->type();
+        for (const auto& batch : batches) {
+          ARROW_ASSIGN_OR_RAISE(auto child, f.path.GetFlattened(*batch));
+          null_count += child->null_count();
+          chunks.push_back(std::move(child));
+        }
+      } else {
+        null_count = table.column(f.path[0])->null_count();

Review Comment:
   Is it useful to keep this code path for non-nested field paths? I suppose it just has a bit less overhead than the more general version above?
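
   For reference, a sketch of what dropping the special case could look like (assuming `FieldPath::GetFlattened` is cheap for depth-1 paths, since there are no parent validity bitmaps to merge):
   
   ```cpp
   auto factory = [&](const SortField& f) -> Result<ResolvedTableSortKey> {
     ARROW_ASSIGN_OR_RAISE(auto schema_field, f.path.Get(*table.schema()));
     std::shared_ptr<DataType> type = schema_field->type();
     int64_t null_count = 0;
     ArrayVector chunks;
     chunks.reserve(batches.size());
     // Expose the same chunking for every sort key by slicing per batch.
     for (const auto& batch : batches) {
       ARROW_ASSIGN_OR_RAISE(auto child, f.path.GetFlattened(*batch));
       null_count += child->null_count();
       chunks.push_back(std::move(child));
     }
     return ResolvedTableSortKey(std::move(type), std::move(chunks), f.order,
                                 null_count);
   };
   ```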



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -974,22 +1030,51 @@ class SortIndicesMetaFunction : public MetaFunction {
   }
 };
 
+void AddLeafFields(const FieldVector& fields, SortOrder order,
+                   std::vector<SortField>* out, std::vector<int>* tmp_indices) {
+  if (fields.empty()) {
+    return;
+  }
+
+  tmp_indices->push_back(0);
+  for (const auto& f : fields) {
+    const auto& type = *f->type();
+    if (type.id() == Type::STRUCT) {
+      AddLeafFields(type.fields(), order, out, tmp_indices);
+    } else {
+      out->emplace_back(FieldPath(*tmp_indices), order);
+    }
+    ++tmp_indices->back();
+  }
+  tmp_indices->pop_back();
+}
+
+void AddField(const DataType& type, const FieldPath& path, SortOrder order,
+              std::vector<SortField>* out, std::vector<int>* tmp_indices) {
+  if (type.id() == Type::STRUCT) {
+    *tmp_indices = path.indices();
+    AddLeafFields(type.fields(), order, out, tmp_indices);
+  } else {
+    out->emplace_back(path, order);
+  }
+}
+
 }  // namespace
 
 Result<std::vector<SortField>> FindSortKeys(const Schema& schema,
                                             const std::vector<SortKey>& sort_keys) {
   std::vector<SortField> fields;
-  std::unordered_set<int> seen;
+  std::unordered_set<FieldPath> seen;
   fields.reserve(sort_keys.size());
   seen.reserve(sort_keys.size());
 
+  std::vector<int> tmp_indices;
   for (const auto& sort_key : sort_keys) {
-    RETURN_NOT_OK(CheckNonNested(sort_key.target));
-
     ARROW_ASSIGN_OR_RAISE(auto match,
                           PrependInvalidColumn(sort_key.target.FindOne(schema)));
-    if (seen.insert(match[0]).second) {
-      fields.push_back({match[0], sort_key.order});
+    if (seen.insert(match).second) {
+      ARROW_ASSIGN_OR_RAISE(auto schema_field, match.Get(schema));
+      AddField(*schema_field->type(), match, sort_key.order, &fields, &tmp_indices);

Review Comment:
   Instead of leaking `tmp_indices` into the caller like this, can we have a toplevel `AddField` that doesn't take this parameter?
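
   Something like this overload would keep the scratch vector local to the helpers (sketch only):
   
   ```cpp
   void AddField(const DataType& type, const FieldPath& path, SortOrder order,
                 std::vector<SortField>* out) {
     std::vector<int> tmp_indices;
     AddField(type, path, order, out, &tmp_indices);
   }
   ```
   
   The caller in `FindSortKeys` could then drop its `tmp_indices` local entirely.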



##########
cpp/src/arrow/compute/kernels/vector_sort_test.cc:
##########
@@ -2115,6 +2113,100 @@ INSTANTIATE_TEST_SUITE_P(AllNull, TestTableSortIndicesRandom,
                          testing::Combine(first_sort_keys, num_sort_keys,
                                           testing::Values(1.0)));
 
+class TestNestedSortIndices : public ::testing::Test {
+ protected:
+  static std::shared_ptr<Array> GetArray() {
+    auto struct_type =
+        struct_({field("a", struct_({field("a", uint8()), field("b", uint32())})),
+                 field("b", int32())});
+    auto struct_array = checked_pointer_cast<StructArray>(
+        ArrayFromJSON(struct_type,
+                      R"([{"a": {"a": 5,    "b": null}, "b": 8   },
+                          {"a": {"a": null, "b": 7   }, "b": 3   },
+                          {"a": {"a": null, "b": 9   }, "b": 3   },
+                          {"a": {"a": 2,    "b": 4   }, "b": 6   },
+                          {"a": {"a": 5,    "b": 1   }, "b": null},
+                          {"a": {"a": 3,    "b": null}, "b": 2   },
+                          {"a": {"a": 2,    "b": 3   }, "b": 0   },
+                          {"a": {"a": 2,    "b": 4   }, "b": 1   },
+                          {"a": {"a": null, "b": 7   }, "b": null}])"));

Review Comment:
   Hmm... perhaps we can instead have a struct-of-struct-of-struct? This would help exercise the case where a sort key points to a struct-inside-struct.
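
   For example, with one more level of nesting (a sketch; names and field types are arbitrary):
   
   ```cpp
   auto struct_type = struct_(
       {field("a", struct_({field("a", uint8()),
                            field("b", struct_({field("c", uint32()),
                                                field("d", int64())}))})),
        field("b", int32())});
   ```
   
   A sort key targeting "a.b" would then point at a struct-inside-a-struct and should expand to the leaf fields "a.b.c" and "a.b.d".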



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -419,19 +442,27 @@ class RadixRecordBatchSorter {
     return ::arrow::compute::internal::ResolveSortKeys<ResolvedSortKey>(batch, sort_keys);
   }
 
-  const RecordBatch& batch_;
+  std::vector<ResolvedSortKey> sort_keys_;

Review Comment:
   Should this be `const`?
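
   That is, assuming it is only assigned in the constructor's initializer list:
   
   ```cpp
   const std::vector<ResolvedSortKey> sort_keys_;
   ```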



##########
cpp/src/arrow/compute/kernels/vector_sort_internal.h:
##########
@@ -496,16 +508,24 @@ Result<std::vector<ResolvedSortKey>> ResolveSortKeys(
   ARROW_ASSIGN_OR_RAISE(const auto fields, FindSortKeys(schema, sort_keys));
   std::vector<ResolvedSortKey> resolved;
   resolved.reserve(fields.size());
-  std::transform(fields.begin(), fields.end(), std::back_inserter(resolved), factory);
+  for (const auto& f : fields) {
+    ARROW_ASSIGN_OR_RAISE(auto resolved_key, factory(f));
+    resolved.push_back(std::move(resolved_key));
+  }
   return resolved;
 }
 
 template <typename ResolvedSortKey, typename TableOrBatch>
 Result<std::vector<ResolvedSortKey>> ResolveSortKeys(
     const TableOrBatch& table_or_batch, const std::vector<SortKey>& sort_keys) {
   return ResolveSortKeys<ResolvedSortKey>(
-      *table_or_batch.schema(), sort_keys, [&](const SortField& f) {
-        return ResolvedSortKey{table_or_batch.column(f.field_index), f.order};
+      *table_or_batch.schema(), sort_keys,
+      [&](const SortField& f) -> Result<ResolvedSortKey> {
+        if (f.is_nested()) {
+          ARROW_ASSIGN_OR_RAISE(auto child, f.path.GetFlattened(table_or_batch));

Review Comment:
   This will potentially duplicate the work of combining some null bitmaps (for example if asking for field paths `0.0.0.0` and `0.0.0.1`). Probably not very important, as this is a fast operation and deeply-nested sort keys are uncommon, but perhaps worth a comment.
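   
   For example, a short note above the call would do (wording is just a suggestion):
   
   ```cpp
   // NOTE: for sibling nested sort keys (e.g. field paths 0.0.0.0 and 0.0.0.1),
   // GetFlattened re-merges the same parent validity bitmaps once per key.
   // This duplicated work is accepted since bitmap merging is cheap and
   // deeply-nested sort keys are uncommon.
   ARROW_ASSIGN_OR_RAISE(auto child, f.path.GetFlattened(table_or_batch));
   ```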
   


