You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "pitrou (via GitHub)" <gi...@apache.org> on 2023/06/27 14:16:02 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #36204: GH-36203: [C++] Support casting in both ways for is_in and index_in

pitrou commented on code in PR #36204:
URL: https://github.com/apache/arrow/pull/36204#discussion_r1243799859


##########
cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc:
##########
@@ -126,9 +126,9 @@ TEST_F(TestIsInKernel, ImplicitlyCastValueSet) {
                                             "true, false, true, false]"));
   AssertArraysEqual(*expected, *out.make_array());
 
-  // fails; value_set cannot be cast to int8
-  opts = SetLookupOptions{ArrayFromJSON(float32(), "[2.5, 3.1, 5.0]")};
-  ASSERT_RAISES(Invalid, CallFunction("is_in", {input}, &opts));
+  // Although value_set cannot be cast to int8, but int8 is castable to float

Review Comment:
   ```suggestion
     // value_set cannot be cast to int8, but int8 is castable to float
   ```



##########
cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc:
##########
@@ -822,6 +822,37 @@ TEST_F(TestIndexInKernel, Boolean) {
   CheckIndexIn(boolean(), "[null, null, null, null]", "[null]", "[0, 0, 0, 0]");
 }
 
+TEST_F(TestIndexInKernel, ImplicitlyCastValueSet) {
+  auto input = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6, 7, 8]");
+
+  SetLookupOptions opts{ArrayFromJSON(int32(), "[2, 3, 5, 7]")};
+  ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("index_in", {input}, &opts));
+
+  auto expected = ArrayFromJSON(int32(), ("[null, null, 0, 1, null,"
+                                          "2, null, 3, null]"));
+  AssertArraysEqual(*expected, *out.make_array());
+
+  // Although value_set cannot be cast to int8, but int8 is castable to float
+  CheckIndexIn(input, ArrayFromJSON(float32(), "[1.0, 2.5, 3.1, 5.0]"),
+               "[null, 0, null, null, null, 3, null, null, null]");
+
+  // Allow implicit casts between binary types...
+  CheckIndexIn(ArrayFromJSON(binary(), R"(["aaa", "bbb", "ccc", null, "bbb"])"),
+               ArrayFromJSON(fixed_size_binary(3), R"(["aaa", "bbb"])"),
+               "[0, 1, null, null, 1]");
+  CheckIndexIn(ArrayFromJSON(utf8(), R"(["aaa", "bbb", "ccc", null, "bbb"])"),
+               ArrayFromJSON(large_utf8(), R"(["aaa", "bbb"])"), "[0, 1, null, null, 1]");

Review Comment:
   Can you check the other way round as well for both checks above?



##########
cpp/src/arrow/compute/kernels/scalar_set_lookup.cc:
##########
@@ -359,13 +386,11 @@ struct IsInVisitor {
   }
 
   template <typename Type>
-  Status ProcessIsIn() {
+  Status ProcessIsIn(const SetLookupState<Type>& state, const ArraySpan& data) {

Review Comment:
   Same remark.



##########
cpp/src/arrow/compute/kernels/scalar_set_lookup.cc:
##########
@@ -263,11 +277,8 @@ struct IndexInVisitor {
   }
 
   template <typename Type>
-  Status ProcessIndexIn() {
+  Status ProcessIndexIn(const SetLookupState<Type>& state, const ArraySpan& data) {

Review Comment:
   The `data` argument here shadows the `data` member variable, which is confusing and error-prone. Do you need it?



##########
cpp/src/arrow/compute/kernels/scalar_set_lookup.cc:
##########
@@ -215,16 +221,24 @@ struct InitStateVisitor {
       return Status::Invalid("Array type didn't match type of values set: ", *arg_type,
                              " vs ", *options.value_set.type());
     }
+
     if (!options.value_set.is_arraylike()) {
       return Status::Invalid("Set lookup value set must be Array or ChunkedArray");
     } else if (!options.value_set.type()->Equals(*arg_type)) {
-      ARROW_ASSIGN_OR_RAISE(
-          options.value_set,
+      auto cast_result =
           Cast(options.value_set, CastOptions::Safe(arg_type.GetSharedPtr()),
-               ctx->exec_context()));
+               ctx->exec_context());
+      if (cast_result.ok()) {
+        options.value_set = *cast_result;
+      } else if (CanCast(*arg_type.type, *options.value_set.type())) {
+        // Will try to cast input array to value set type during kernel exec
+      } else {
+        return Status::Invalid("Input type doesn't match type of values set: ", *arg_type,

Review Comment:
   Ideally, this should have been `TypeError` instead of `Invalid` here, but let's keep consistent for now.
   Perhaps open an issue to change this later?



##########
cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc:
##########
@@ -253,11 +253,11 @@ TEST_F(TestIsInKernel, TimeDuration) {
               "[true, false, false, true, true]", /*skip_nulls=*/true);
   }
 
-  // Different units, invalid cast
-  ASSERT_RAISES(Invalid, IsIn(ArrayFromJSON(duration(TimeUnit::SECOND), "[0, 1, 2]"),
-                              ArrayFromJSON(duration(TimeUnit::MILLI), "[0, 2]")));
+  // Different units, cast value_set to values
+  CheckIsIn(ArrayFromJSON(duration(TimeUnit::SECOND), "[0, 2]"),
+            ArrayFromJSON(duration(TimeUnit::MILLI), "[0, 1, 2000]"), "[true, true]");

Review Comment:
   Ok, but you're choosing an easy case here.
   What if values = `[0s, 2s]` and value_set = `[1ms, 2ms, 2000ms]`? Will it fail because of truncation during casting? Will it return an incorrect value? Will it try casting the other way round?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org