You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/01/25 14:15:27 UTC

[GitHub] [arrow] lidavidm commented on a change in pull request #12248: ARROW-1888: [C++] Implement Struct Casts

lidavidm commented on a change in pull request #12248:
URL: https://github.com/apache/arrow/pull/12248#discussion_r791750845



##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
##########
@@ -150,6 +150,73 @@ void AddListCast(CastFunction* func) {
   DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel)));
 }
 
+struct CastStruct {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const CastOptions& options = CastState::Get(ctx);
+    const auto in_field_count =
+        checked_cast<const StructType&>(*batch[0].type()).num_fields();
+    const auto out_field_count =
+        checked_cast<const StructType&>(*out->type()).num_fields();
+
+    if (in_field_count != out_field_count) {
+      ARROW_RETURN_NOT_OK(
+          Status(StatusCode::TypeError, "struct field sizes do not match"));
+    }
+
+    for (int64_t i = 0; i < in_field_count; ++i) {
+      const auto in_field_name =
+          checked_cast<const StructType&>(*batch[0].type()).field(i)->name();
+      const auto out_field_name =
+          checked_cast<const StructType&>(*out->type()).field(i)->name();
+      if (in_field_name != out_field_name) {
+        ARROW_RETURN_NOT_OK(
+            Status(StatusCode::TypeError, "struct field names do not match"));
+      }
+    }
+
+    if (out->kind() == Datum::SCALAR) {
+      const auto& in_scalar = checked_cast<const StructScalar&>(*batch[0].scalar());
+      auto out_scalar = checked_cast<StructScalar*>(out->scalar().get());
+
+      for (int64_t i = 0; i < in_field_count; i++) {
+        auto values = in_scalar.value[i];
+        auto target_type = out->type()->field(i)->type();
+        ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+                              Cast(values, target_type, options, ctx->exec_context()));
+        DCHECK_EQ(Datum::SCALAR, cast_values.kind());
+        out_scalar->value.push_back(cast_values.scalar());
+      }
+
+      out_scalar->is_valid = true;

Review comment:
       Shouldn't `is_valid` be copied from the input scalar rather than always being set to `true`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
##########
@@ -150,6 +150,73 @@ void AddListCast(CastFunction* func) {
   DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel)));
 }
 
+struct CastStruct {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const CastOptions& options = CastState::Get(ctx);
+    const auto in_field_count =
+        checked_cast<const StructType&>(*batch[0].type()).num_fields();
+    const auto out_field_count =
+        checked_cast<const StructType&>(*out->type()).num_fields();
+
+    if (in_field_count != out_field_count) {
+      ARROW_RETURN_NOT_OK(
+          Status(StatusCode::TypeError, "struct field sizes do not match"));
+    }

Review comment:
       You could imagine the cast dropping columns or adding columns of nulls when the field counts differ, too (ARROW-7051 would make that more efficient, and I believe this is needed to fully complete ARROW-14658).

##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
##########
@@ -150,6 +150,73 @@ void AddListCast(CastFunction* func) {
   DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel)));
 }
 
+struct CastStruct {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const CastOptions& options = CastState::Get(ctx);
+    const auto in_field_count =
+        checked_cast<const StructType&>(*batch[0].type()).num_fields();
+    const auto out_field_count =
+        checked_cast<const StructType&>(*out->type()).num_fields();
+
+    if (in_field_count != out_field_count) {
+      ARROW_RETURN_NOT_OK(
+          Status(StatusCode::TypeError, "struct field sizes do not match"));
+    }
+
+    for (int64_t i = 0; i < in_field_count; ++i) {
+      const auto in_field_name =
+          checked_cast<const StructType&>(*batch[0].type()).field(i)->name();
+      const auto out_field_name =
+          checked_cast<const StructType&>(*out->type()).field(i)->name();
+      if (in_field_name != out_field_name) {
+        ARROW_RETURN_NOT_OK(
+            Status(StatusCode::TypeError, "struct field names do not match"));
+      }
+    }
+
+    if (out->kind() == Datum::SCALAR) {
+      const auto& in_scalar = checked_cast<const StructScalar&>(*batch[0].scalar());
+      auto out_scalar = checked_cast<StructScalar*>(out->scalar().get());
+
+      for (int64_t i = 0; i < in_field_count; i++) {
+        auto values = in_scalar.value[i];
+        auto target_type = out->type()->field(i)->type();
+        ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+                              Cast(values, target_type, options, ctx->exec_context()));
+        DCHECK_EQ(Datum::SCALAR, cast_values.kind());
+        out_scalar->value.push_back(cast_values.scalar());
+      }
+
+      out_scalar->is_valid = true;
+      return Status::OK();
+    }
+
+    const ArrayData& in_array = *batch[0].array();
+    ArrayData* out_array = out->mutable_array();
+
+    for (int64_t i = 0; i < in_field_count; ++i) {
+      auto values = in_array.child_data[0];

Review comment:
       I don't think this will work if the StructArray is sliced - we need to add the StructArray's offset to the child's offset.

##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
##########
@@ -150,6 +150,73 @@ void AddListCast(CastFunction* func) {
   DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel)));
 }
 
+struct CastStruct {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const CastOptions& options = CastState::Get(ctx);
+    const auto in_field_count =
+        checked_cast<const StructType&>(*batch[0].type()).num_fields();
+    const auto out_field_count =
+        checked_cast<const StructType&>(*out->type()).num_fields();
+
+    if (in_field_count != out_field_count) {
+      ARROW_RETURN_NOT_OK(
+          Status(StatusCode::TypeError, "struct field sizes do not match"));
+    }

Review comment:
       Also, you can just `return Status::TypeError(...)` - no need to use the macro or construct the status manually.

##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_test.cc
##########
@@ -2217,6 +2217,65 @@ TEST(Cast, ListToListOptionsPassthru) {
   }
 }
 
+static void CheckStructToStruct(
+    const std::vector<std::shared_ptr<DataType>>& value_types) {
+  for (const auto& src_value_type : value_types) {
+    for (const auto& dest_value_type : value_types) {
+      std::vector<std::string> field_names = {"a"};
+      std::shared_ptr<Array> a1, b1, a2, b2;
+      a1 = ArrayFromJSON(src_value_type, "[1, 2]");
+      a2 = ArrayFromJSON(dest_value_type, "[1, 2]");
+      auto src = StructArray::Make({a1}, field_names).ValueOrDie();
+      auto dest = StructArray::Make({a2}, field_names).ValueOrDie();
+
+      CheckCast(src, dest);
+    }
+  }
+}
+
+TEST(Cast, StructToSameSizedAndNamedStruct) {
+  CheckStructToStruct({int32(), float32(), int64()});

Review comment:
       It might be good to also test a StructArray that has been manually constructed such that the children have offsets (i.e., a StructArray consisting of slices of other arrays), since this is a corner case that has tripped up some other code before (ARROW-14156).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org