You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/10/25 14:10:25 UTC

[GitHub] [arrow] jorisvandenbossche commented on a diff in pull request #14495: ARROW-17989: [C++][Python] Enable struct_field kernel to accept string field names

jorisvandenbossche commented on code in PR #14495:
URL: https://github.com/apache/arrow/pull/14495#discussion_r1004539770


##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -269,14 +312,34 @@ struct StructFieldFunctor {
   }
 };
 
+Result<const DataType*> RecursiveResolveStructFieldType(const FieldRef& field_ref,

Review Comment:
   This could maybe also be replaced or simplified with one of the existing FieldRef / FieldPath utilities?



##########
python/pyarrow/tests/test_compute.py:
##########
@@ -2691,16 +2691,28 @@ def test_struct_fields_options():
     b = pa.array(["bar", None, ""])
     c = pa.StructArray.from_arrays([a, b], ["a", "b"])
     arr = pa.StructArray.from_arrays([a, c], ["a", "c"])
-
-    assert pc.struct_field(arr,
-                           indices=[1, 1]) == pa.array(["bar", None, ""])
-    assert pc.struct_field(arr, [1, 1]) == pa.array(["bar", None, ""])
-    assert pc.struct_field(arr, [0]) == pa.array([4, 5, 6], type=pa.int64())
+    assert pc.struct_field(arr, '.c.b', is_dot_path=True) == b
+    assert pc.struct_field(arr, '.a', is_dot_path=True) == a
+    assert pc.struct_field(arr, 'a') == a
+    assert pc.struct_field(arr, indices=[1, 1]) == b
+    assert pc.struct_field(arr, [1, 1]) == b

Review Comment:
   I think it should also be possible to pass the names as a list? (`['b', 'c']`)



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -252,6 +266,35 @@ struct StructFieldFunctor {
     return Status::OK();
   }
 
+  static Result<std::shared_ptr<Array>> ApplyFieldRef(KernelContext* ctx,
+                                                      const FieldRef& field_ref,
+                                                      std::shared_ptr<Array> current) {
+    if (current->type_id() != Type::STRUCT) {
+      return Status::Invalid("Not a StructArray: ", current->ToString(),
+                             "\nMaybe a bad FieldRef? ", field_ref.ToString());
+    }
+
+    if (field_ref.IsName()) {
+      const auto& array = checked_cast<const StructArray&>(*current);
+      current = array.GetFieldByName(*field_ref.name());
+      if (current == nullptr) {
+        return Status::Invalid("Field not found in struct: '", *field_ref.name(), "'");
+      }
+    } else if (field_ref.IsFieldPath()) {
+      for (const auto& idx : field_ref.field_path()->indices()) {
+        ARROW_RETURN_NOT_OK(CheckIndex(idx, *current->type()));
+        const auto& array = checked_cast<const StructArray&>(*current);
+        ARROW_ASSIGN_OR_RAISE(current, array.GetFlattenedField(idx, ctx->memory_pool()));
+      }

Review Comment:
   There is a `FieldPath::Get()` for arrays, could that be used here instead of this loop?



##########
cpp/src/arrow/compute/api_scalar.cc:
##########
@@ -560,8 +560,12 @@ StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit,
 StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::MICRO, false) {}
 constexpr char StrptimeOptions::kTypeName[];
 
+StructFieldOptions::StructFieldOptions(FieldRef field_ref)
+    : FunctionOptions(internal::kStructFieldOptionsType), field_ref(field_ref) {}
 StructFieldOptions::StructFieldOptions(std::vector<int> indices)
-    : FunctionOptions(internal::kStructFieldOptionsType), indices(std::move(indices)) {}
+    : StructFieldOptions(FieldRef(FieldPath(std::move(indices)))) {}

Review Comment:
   Should we set both FieldRef and indices here? (for backwards compatibility of the indices field in the options class)



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -252,6 +266,35 @@ struct StructFieldFunctor {
     return Status::OK();
   }
 
+  static Result<std::shared_ptr<Array>> ApplyFieldRef(KernelContext* ctx,
+                                                      const FieldRef& field_ref,
+                                                      std::shared_ptr<Array> current) {
+    if (current->type_id() != Type::STRUCT) {
+      return Status::Invalid("Not a StructArray: ", current->ToString(),
+                             "\nMaybe a bad FieldRef? ", field_ref.ToString());
+    }
+
+    if (field_ref.IsName()) {
+      const auto& array = checked_cast<const StructArray&>(*current);
+      current = array.GetFieldByName(*field_ref.name());
+      if (current == nullptr) {
+        return Status::Invalid("Field not found in struct: '", *field_ref.name(), "'");
+      }
+    } else if (field_ref.IsFieldPath()) {
+      for (const auto& idx : field_ref.field_path()->indices()) {
+        ARROW_RETURN_NOT_OK(CheckIndex(idx, *current->type()));
+        const auto& array = checked_cast<const StructArray&>(*current);
+        ARROW_ASSIGN_OR_RAISE(current, array.GetFlattenedField(idx, ctx->memory_pool()));
+      }
+    } else {
+      DCHECK(field_ref.IsNested());
+      for (const auto& ref : *field_ref.nested_refs()) {
+        ARROW_ASSIGN_OR_RAISE(current, ApplyFieldRef(ctx, ref, current));
+      }

Review Comment:
   Possible alternative: convert the nested field ref to a FieldPath (eg with FieldRef::FindOne?) before the code to handle a FieldPath, and then it can use the same code.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org