You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/01/28 16:04:33 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #9294: ARROW-8919: [C++][Compute][Dataset] Add Function::DispatchBest to accomodate implicit casts

pitrou commented on a change in pull request #9294:
URL: https://github.com/apache/arrow/pull/9294#discussion_r566200190



##########
File path: docs/source/cpp/compute.rst
##########
@@ -744,3 +749,41 @@ Structural transforms
 * \(2) For each value in the list child array, the index at which it is found
   in the list array is appended to the output.  Nulls in the parent list array
   are discarded.
+
+.. _common-numeric-type:
+
+Common numeric type
+~~~~~~~~~~~~~~~~~~~

Review comment:
       This is presented like another category of functions (same heading level as "structural transforms" above).

##########
File path: cpp/src/arrow/compute/kernels/test_util.cc
##########
@@ -173,5 +175,35 @@ void CheckScalarBinary(std::string func_name, std::shared_ptr<Array> left_input,
   CheckScalar(std::move(func_name), {left_input, right_input}, expected, options);
 }
 
+void CheckDispatchBest(std::string func_name, std::vector<ValueDescr> original_values,
+                       std::vector<ValueDescr> expected_equivalent_values) {
+  ASSERT_OK_AND_ASSIGN(auto function, GetFunctionRegistry()->GetFunction(func_name));
+
+  auto values = original_values;
+  ASSERT_OK_AND_ASSIGN(auto actual_kernel, function->DispatchBest(&values));
+
+  ASSERT_OK_AND_ASSIGN(auto expected_kernel,
+                       function->DispatchExact(expected_equivalent_values));
+
+  auto Format = [](const std::vector<ValueDescr>& descrs) {
+    std::stringstream ss;
+    ss << "(";
+    for (size_t i = 0; i < descrs.size(); ++i) {
+      if (i > 0) {
+        ss << ", ";
+      }
+      ss << descrs[i].ToString();
+    }
+    ss << ")";
+    return ss.str();
+  };
+
+  EXPECT_EQ(actual_kernel, expected_kernel)
+      << "DispatchBest" << Format(original_values) << " => "
+      << actual_kernel->signature->ToString() << "\n"
+      << "DispatchExact" << Format(expected_equivalent_values) << " => "
+      << expected_kernel->signature->ToString();

Review comment:
       If either of `actual_kernel` or `expected_kernel` is null, this is going to crash?

##########
File path: cpp/src/arrow/compute/kernels/test_util.cc
##########
@@ -173,5 +175,35 @@ void CheckScalarBinary(std::string func_name, std::shared_ptr<Array> left_input,
   CheckScalar(std::move(func_name), {left_input, right_input}, expected, options);
 }
 
+void CheckDispatchBest(std::string func_name, std::vector<ValueDescr> original_values,
+                       std::vector<ValueDescr> expected_equivalent_values) {
+  ASSERT_OK_AND_ASSIGN(auto function, GetFunctionRegistry()->GetFunction(func_name));
+
+  auto values = original_values;
+  ASSERT_OK_AND_ASSIGN(auto actual_kernel, function->DispatchBest(&values));
+
+  ASSERT_OK_AND_ASSIGN(auto expected_kernel,
+                       function->DispatchExact(expected_equivalent_values));
+
+  auto Format = [](const std::vector<ValueDescr>& descrs) {
+    std::stringstream ss;
+    ss << "(";
+    for (size_t i = 0; i < descrs.size(); ++i) {
+      if (i > 0) {
+        ss << ", ";
+      }
+      ss << descrs[i].ToString();
+    }
+    ss << ")";
+    return ss.str();
+  };

Review comment:
       Didn't you define a `ValueDescr::ToString` that does this?

##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_test.cc
##########
@@ -403,6 +406,77 @@ class TestCast : public TestBase {
   }
 };
 
+TEST_F(TestCast, CanCast) {
+  auto ExpectCanCast = [](std::shared_ptr<DataType> from,
+                          std::vector<std::shared_ptr<DataType>> to_set,
+                          bool expected = true) {
+    for (auto to : to_set) {
+      EXPECT_EQ(CanCast(*from, *to), expected) << "  from: " << from->ToString() << "\n"
+                                               << "    to: " << to->ToString();
+    }
+  };
+
+  auto ExpectCannotCast = [ExpectCanCast](std::shared_ptr<DataType> from,
+                                          std::vector<std::shared_ptr<DataType>> to_set) {
+    ExpectCanCast(from, to_set, /*expected=*/false);
+  };
+
+  ExpectCanCast(null(), {boolean()});
+  ExpectCanCast(null(), kNumericTypes);
+  ExpectCanCast(null(), kBaseBinaryTypes);
+  ExpectCanCast(
+      null(), {date32(), date64(), time32(TimeUnit::MILLI), timestamp(TimeUnit::SECOND)});
+  ExpectCanCast(dictionary(uint16(), null()), {null()});
+
+  ExpectCanCast(boolean(), {boolean()});
+  ExpectCanCast(boolean(), kNumericTypes);
+  ExpectCanCast(boolean(), {utf8(), large_utf8()});
+  ExpectCanCast(dictionary(int32(), boolean()), {boolean()});
+
+  ExpectCannotCast(boolean(), {null()});
+  ExpectCannotCast(boolean(), {binary(), large_binary()});
+  ExpectCannotCast(boolean(), {date32(), date64(), time32(TimeUnit::MILLI),
+                               timestamp(TimeUnit::SECOND)});
+
+  for (auto from_numeric : kNumericTypes) {
+    ExpectCanCast(from_numeric, {boolean()});
+    ExpectCanCast(from_numeric, kNumericTypes);
+    ExpectCanCast(from_numeric, {utf8(), large_utf8()});
+    ExpectCanCast(dictionary(int32(), from_numeric), {from_numeric});
+
+    ExpectCannotCast(from_numeric, {null()});
+  }
+
+  for (auto from_base_binary : kBaseBinaryTypes) {
+    ExpectCanCast(from_base_binary, {boolean()});
+    ExpectCanCast(from_base_binary, kNumericTypes);
+    ExpectCanCast(from_base_binary, kBaseBinaryTypes);
+    ExpectCanCast(dictionary(int64(), from_base_binary), {from_base_binary});
+
+    // any cast which is valid for the dictionary is valid for the DictionaryArray
+    ExpectCanCast(dictionary(uint32(), from_base_binary), kBaseBinaryTypes);
+    ExpectCanCast(dictionary(int16(), from_base_binary), kNumericTypes);
+
+    ExpectCannotCast(from_base_binary, {null()});
+  }
+
+  ExpectCanCast(utf8(), {timestamp(TimeUnit::MILLI)});
+  ExpectCanCast(large_utf8(), {timestamp(TimeUnit::NANO)});
+  ExpectCannotCast(timestamp(TimeUnit::MICRO),
+                   kBaseBinaryTypes);  // no formatting supported
+
+  ExpectCannotCast(fixed_size_binary(3),
+                   {fixed_size_binary(3)});  // FIXME missing identity cast
+
+  auto smallint = std::make_shared<SmallintType>();
+  ASSERT_OK(RegisterExtensionType(smallint));
+  ExpectCanCast(smallint, {int16()});  // cast storage
+  ExpectCanCast(smallint,
+                kNumericTypes);  // any cast which is valid for storage is supported
+  ExpectCannotCast(null(), {smallint});  // FIXME missing common cast from null
+  ASSERT_OK(UnregisterExtensionType("smallint"));

Review comment:
       Use `ExtensionTypeGuard` to avoid side-effects in case of error or exception.

##########
File path: cpp/src/arrow/compute/cast.cc
##########
@@ -225,13 +208,21 @@ Result<std::shared_ptr<CastFunction>> GetCastFunction(
 }
 
 bool CanCast(const DataType& from_type, const DataType& to_type) {
-  // TODO
   internal::EnsureInitCastTable();
-  auto it = internal::g_cast_table.find(static_cast<int>(from_type.id()));
+  auto it = internal::g_cast_table.find(static_cast<int>(to_type.id()));

Review comment:
       Ha!

##########
File path: cpp/src/arrow/compute/function.cc
##########
@@ -124,7 +179,18 @@ Result<Datum> Function::Execute(const std::vector<Datum>& args,
     inputs[i] = args[i].descr();
   }
 
-  ARROW_ASSIGN_OR_RAISE(auto kernel, DispatchExact(inputs));
+  ARROW_ASSIGN_OR_RAISE(auto kernel, DispatchBest(&inputs));
+  for (size_t i = 0; i != args.size(); ++i) {
+    if (inputs[i] != args[i].descr()) {
+      if (inputs[i].shape != args[i].shape()) {
+        return Status::NotImplemented(
+            "Automatic broadcasting of scalars to arrays for function ", name());
+      }
+
+      ARROW_ASSIGN_OR_RAISE(args[i], Cast(args[i], inputs[i].type));
+    }
+  }

Review comment:
       This logic (take a `vector<ValueDescr>` and accomodate a `vector<Datum>` to fit them) seems general enough to warrant exposing a helper for it? Perhaps in `cast.h`?

##########
File path: cpp/src/arrow/compute/kernels/codegen_internal.cc
##########
@@ -179,6 +179,121 @@ Result<ValueDescr> FirstType(KernelContext*, const std::vector<ValueDescr>& desc
   return descrs[0];
 }
 
+void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs) {
+  for (ValueDescr& descr : *descrs) {
+    if (descr.type->id() == Type::DICTIONARY) {
+      descr.type = checked_cast<const DictionaryType&>(*descr.type).value_type();
+    }
+  }
+}
+
+void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs) {
+  DCHECK_EQ(descrs->size(), 2);
+
+  if (descrs->at(0).type->id() == Type::NA) {
+    descrs->at(0).type = descrs->at(1).type;
+    return;
+  }
+
+  if (descrs->at(1).type->id() == Type::NA) {
+    descrs->at(1).type = descrs->at(0).type;
+    return;
+  }
+}
+
+void ReplaceTypes(const std::shared_ptr<DataType>& type,
+                  std::vector<ValueDescr>* descrs) {
+  for (auto& descr : *descrs) {
+    descr.type = type;
+  }
+}
+
+std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs) {
+  for (const auto& descr : descrs) {
+    auto id = descr.type->id();
+    if (!is_floating(id) && !is_integer(id)) {
+      // a common numeric type is only possible if all types are numeric
+      return nullptr;
+    }
+  }
+  for (const auto& descr : descrs) {
+    if (descr.type->id() == Type::DOUBLE) return float64();
+  }
+  for (const auto& descr : descrs) {
+    if (descr.type->id() == Type::FLOAT) return float32();
+  }
+
+  bool at_least_one_signed = false;
+  int max_width = 0;
+
+  for (const auto& descr : descrs) {
+    auto id = descr.type->id();
+    at_least_one_signed |= is_signed_integer(id);
+    max_width = std::max(bit_width(id), max_width);
+  }
+
+  if (max_width == 64) return at_least_one_signed ? int64() : uint64();
+  if (max_width == 32) return at_least_one_signed ? int32() : uint32();

Review comment:
       But let's say you are given `{int32(), uint32()}`. Do you want to return `int64()` then?
   (I guess it depends on the intended semantics and use cases)

##########
File path: docs/source/cpp/compute.rst
##########
@@ -744,3 +749,41 @@ Structural transforms
 * \(2) For each value in the list child array, the index at which it is found
   in the list array is appended to the output.  Nulls in the parent list array
   are discarded.
+
+.. _common-numeric-type:
+
+Common numeric type
+~~~~~~~~~~~~~~~~~~~
+
+The common numeric type of a set of input numeric types is the smallest numeric
+type which can accommodate any value of any input. If any input is a floating
+point type the common numeric type is the widest floating point type among the
+inputs. Otherwise the common numeric type is integral, is signed if any input
+is signed, and its width is the maximum width of any input. For example:
+
++-------------------+----------------------+-------------------------------------------+
+| Input types       | Common numeric type  | Notes                                     |
++===================+======================+===========================================+
+| int32, int32      | int32                |                                           |
++-------------------+----------------------+-------------------------------------------+
+| uint32, int32     | int32                | One input signed, override unsigned       |
++-------------------+----------------------+-------------------------------------------+
+| int16, int32      | int32                | Max width is 32, promote LHS to int32     |
++-------------------+----------------------+-------------------------------------------+
+| uint16, uint32    | uint32               | All inputs unsigned, maintain unsigned    |
++-------------------+----------------------+-------------------------------------------+
+| uint16, int32     | int32                | One input signed, override unsigned       |
++-------------------+----------------------+-------------------------------------------+
+| int16, uint32     | int32                |                                           |
++-------------------+----------------------+-------------------------------------------+
+| float32, int32    | float32              | Promote RHS to float32                    |
++-------------------+----------------------+-------------------------------------------+
+| float32, float64  | float64              |                                           |
++-------------------+----------------------+-------------------------------------------+
+| float32, int64    | float32              | int64 is wider, still promotes to float32 |

Review comment:
       Ok, so this means `int32 is_in float32` may produce false positives because it will do `float32` comparisons even though the `int32 -> float32` conversion may lose precision?

##########
File path: cpp/src/arrow/compute/kernels/common.h
##########
@@ -51,4 +51,16 @@ namespace arrow {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
+namespace compute {
+namespace detail {
+
+/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned.
+ARROW_EXPORT
+const Kernel* DispatchExactImpl(const Function* func, const std::vector<ValueDescr>&);
+
+ARROW_EXPORT
+Status NoMatchingKernel(const Function* func, const std::vector<ValueDescr>&);

Review comment:
       Where are these two functions defined? Did I miss something?

##########
File path: docs/source/cpp/compute.rst
##########
@@ -744,3 +749,41 @@ Structural transforms
 * \(2) For each value in the list child array, the index at which it is found
   in the list array is appended to the output.  Nulls in the parent list array
   are discarded.
+
+.. _common-numeric-type:
+
+Common numeric type
+~~~~~~~~~~~~~~~~~~~

Review comment:
       Instead, perhaps create a section about implicit casts?

##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
##########
@@ -149,17 +149,13 @@ void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, const Dat
 // ----------------------------------------------------------------------
 
 void UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-  if (out->is_scalar()) {
-    KERNEL_ASSIGN_OR_RAISE(*out, ctx,
-                           batch[0].scalar_as<DictionaryScalar>().GetEncodedValue());
-    return;
-  }
+  DCHECK(out->is_array());
 
   DictionaryArray dict_arr(batch[0].array());
   const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
 
   const auto& dict_type = *dict_arr.dictionary()->type();
-  if (!dict_type.Equals(options.to_type)) {
+  if (!dict_type.Equals(options.to_type) && !CanCast(dict_type, *options.to_type)) {

Review comment:
       Hmm... do we need a version of `CanCast` that only returns true if safe casting is always possible?

##########
File path: cpp/src/arrow/type_traits.h
##########
@@ -929,6 +929,46 @@ static inline bool is_fixed_width(Type::type type_id) {
   return is_primitive(type_id) || is_dictionary(type_id) || is_fixed_size_binary(type_id);
 }
 
+static inline int bit_width(Type::type type_id) {
+  switch (type_id) {
+    case Type::BOOL:
+      return 1;
+    case Type::UINT8:
+    case Type::INT8:
+      return 8;
+    case Type::UINT16:
+    case Type::INT16:
+      return 16;
+    case Type::UINT32:
+    case Type::INT32:
+    case Type::DATE32:
+    case Type::TIME32:
+      return 32;
+    case Type::UINT64:
+    case Type::INT64:
+    case Type::DATE64:
+    case Type::TIME64:
+    case Type::TIMESTAMP:
+    case Type::DURATION:
+      return 64;
+
+    case Type::HALF_FLOAT:
+      return 16;
+    case Type::FLOAT:
+      return 32;
+    case Type::DOUBLE:
+      return 64;
+
+    case Type::INTERVAL_MONTHS:
+      return 32;
+    case Type::INTERVAL_DAY_TIME:
+      return 64;
+    default:

Review comment:
       Do you want to add `DECIMAL128` and `DECIMAL256`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
##########
@@ -264,10 +264,31 @@ ArrayKernelExec NumericEqualTypesBinary(detail::GetTypeId get_id) {
   }
 }
 
+struct ArithmeticFunction : ScalarFunction {
+  using ScalarFunction::ScalarFunction;
+
+  Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {

Review comment:
       Why is there both this and `Function::DispatchBest`?

##########
File path: cpp/src/arrow/dataset/expression_test.cc
##########
@@ -57,40 +57,91 @@ void ExpectResultsEqual(Actual&& actual, Expected&& expected) {
   MaybeExpected maybe_expected(std::forward<Expected>(expected));
 
   if (maybe_expected.ok()) {
-    ASSERT_OK_AND_ASSIGN(auto actual, maybe_actual);
-    EXPECT_EQ(actual, *maybe_expected);
+    EXPECT_EQ(maybe_actual, maybe_expected);
   } else {
-    EXPECT_EQ(maybe_actual.status().code(), expected.status().code());
-    EXPECT_NE(maybe_actual.status().message().find(expected.status().message()),
-              std::string::npos)
-        << "  actual:   " << maybe_actual.status() << "\n"
-        << "  expected: " << maybe_expected.status();
+    EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(
+        expected.status().code(), HasSubstr(expected.status().message()), maybe_actual);
   }
 }
 
+const auto no_change = util::nullopt;
+
 TEST(ExpressionUtils, Comparison) {
   auto Expect = [](Result<std::string> expected, Datum l, Datum r) {
     ExpectResultsEqual(Comparison::Execute(l, r).Map(Comparison::GetName), expected);
   };
 
-  Datum zero(0), one(1), two(2), null(std::make_shared<Int32Scalar>()), str("hello");
+  Datum zero(0), one(1), two(2), null(std::make_shared<Int32Scalar>());
+  Datum str("hello"), bin(std::make_shared<BinaryScalar>(Buffer::FromString("hello")));
+  Datum dict_str(DictionaryScalar::Make(std::make_shared<Int32Scalar>(0),
+                                        ArrayFromJSON(utf8(), R"(["a", "b", "c"])")));
 
-  Status parse_failure = Status::Invalid("Failed to parse");
+  Status not_impl = Status::NotImplemented("no kernel matching input types");
 
   Expect("equal", one, one);
   Expect("less", one, two);
   Expect("greater", one, zero);
 
-  // cast RHS to LHS type; "hello" > "1"
-  Expect("greater", str, one);
-  // cast RHS to LHS type; "hello" is not convertible to int
-  Expect(parse_failure, one, str);
-
   Expect("na", one, null);
-  Expect("na", str, null);
   Expect("na", null, one);
-  // cast RHS to LHS type; "hello" is not convertible to int
-  Expect(parse_failure, null, str);
+
+  // strings and ints are not comparable without explicit casts
+  Expect(not_impl, str, one);
+  Expect(not_impl, one, str);
+  Expect(not_impl, str, null);  // not even null ints
+
+  // string -> binary implicit cast allowed
+  Expect("equal", str, bin);
+  Expect("equal", bin, str);
+
+  // dict_str -> string, implicit casts allowed
+  Expect("less", dict_str, str);
+  Expect("less", dict_str, bin);
+}
+
+TEST(ExpressionUtils, StripOrderPreservingCasts) {
+  auto Expect = [](Expression expr, util::optional<Expression> expected_stripped) {
+    ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema));
+    if (!expected_stripped) {
+      expected_stripped = expr;
+    } else {
+      ASSERT_OK_AND_ASSIGN(expected_stripped, expected_stripped->Bind(*kBoringSchema));
+    }
+    EXPECT_EQ(Comparison::StripOrderPreservingCasts(expr), *expected_stripped);
+  };
+
+  // Casting int to float preserves ordering.
+  // For example, let
+  //   a = 3, b = 2, assert(a > b)
+  // After injecting a cast to float, this ordering still holds
+  //   float(a) == 3.0, float(b) == 2.0, assert(float(a) > float(b))
+  Expect(cast(field_ref("i32"), float32()), field_ref("i32"));
+
+  // Casting an integral type to a wider integral type preserves ordering.
+  Expect(cast(field_ref("i32"), int64()), field_ref("i32"));
+  Expect(cast(field_ref("i32"), int32()), field_ref("i32"));
+  Expect(cast(field_ref("i32"), int16()), no_change);
+  Expect(cast(field_ref("i32"), int8()), no_change);
+
+  Expect(cast(field_ref("u32"), uint64()), field_ref("u32"));
+  Expect(cast(field_ref("u32"), uint32()), field_ref("u32"));
+  Expect(cast(field_ref("u32"), uint16()), no_change);
+  Expect(cast(field_ref("u32"), uint8()), no_change);
+
+  // Casting float to int can affect ordering.
+  // For example, let
+  //   a = 3.5, b = 3.0, assert(a > b)
+  // After injecting a cast to integer, this ordering may no longer hold
+  //   int(a) == 3, int(b) == 3, assert(!(int(a) > int(b)))
+  Expect(cast(field_ref("f32"), int32()), no_change);
+
+  // casting any float type to another preserves ordering
+  Expect(cast(field_ref("f32"), float64()), field_ref("f32"));
+  Expect(cast(field_ref("f64"), float32()), field_ref("f64"));

Review comment:
       ```python
   >>> a = np.float64(2**47)
   >>> b = a + 1
   >>> b > a
   True
   >>> np.float32(b) > np.float32(a)    # both finite and equal
   False
   
   >>> a = np.float64(1e40)
   >>> b = a + 1e25
   >>> b > a
   True
   >>> np.float32(b) > np.float32(a)    # both infinite
   False
   ```
   

##########
File path: cpp/src/arrow/compute/cast.cc
##########
@@ -225,13 +208,21 @@ Result<std::shared_ptr<CastFunction>> GetCastFunction(
 }
 
 bool CanCast(const DataType& from_type, const DataType& to_type) {
-  // TODO
   internal::EnsureInitCastTable();
-  auto it = internal::g_cast_table.find(static_cast<int>(from_type.id()));
+  auto it = internal::g_cast_table.find(static_cast<int>(to_type.id()));
   if (it == internal::g_cast_table.end()) {
     return false;
   }
-  return it->second->CanCastTo(to_type);
+
+  const CastFunction* function = it->second.get();
+  DCHECK_EQ(function->out_type_id(), to_type.id());
+
+  for (auto from_id : function->in_type_ids()) {
+    // XXX should probably check the output type as well

Review comment:
       What needs to be checked besides the type id?

##########
File path: cpp/src/arrow/compute/kernels/codegen_internal.cc
##########
@@ -179,6 +179,121 @@ Result<ValueDescr> FirstType(KernelContext*, const std::vector<ValueDescr>& desc
   return descrs[0];
 }
 
+void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs) {
+  for (ValueDescr& descr : *descrs) {
+    if (descr.type->id() == Type::DICTIONARY) {
+      descr.type = checked_cast<const DictionaryType&>(*descr.type).value_type();
+    }
+  }
+}
+
+void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs) {
+  DCHECK_EQ(descrs->size(), 2);
+
+  if (descrs->at(0).type->id() == Type::NA) {
+    descrs->at(0).type = descrs->at(1).type;
+    return;
+  }
+
+  if (descrs->at(1).type->id() == Type::NA) {
+    descrs->at(1).type = descrs->at(0).type;
+    return;
+  }
+}
+
+void ReplaceTypes(const std::shared_ptr<DataType>& type,
+                  std::vector<ValueDescr>* descrs) {
+  for (auto& descr : *descrs) {
+    descr.type = type;
+  }
+}
+
+std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs) {
+  for (const auto& descr : descrs) {
+    auto id = descr.type->id();
+    if (!is_floating(id) && !is_integer(id)) {
+      // a common numeric type is only possible if all types are numeric

Review comment:
       Should you bail out for float16 too?

##########
File path: cpp/src/arrow/compute/function.cc
##########
@@ -21,62 +21,66 @@
 #include <memory>
 #include <sstream>
 
+#include "arrow/compute/cast.h"
 #include "arrow/compute/exec.h"
 #include "arrow/compute/exec_internal.h"
+#include "arrow/compute/kernels/common.h"
 #include "arrow/datum.h"
 #include "arrow/util/cpu_info.h"
 
 namespace arrow {
+
+using internal::checked_cast;
+
 namespace compute {
 
 static const FunctionDoc kEmptyFunctionDoc{};
 
 const FunctionDoc& FunctionDoc::Empty() { return kEmptyFunctionDoc; }
 
-Status Function::CheckArity(int passed_num_args) const {
-  if (arity_.is_varargs && passed_num_args < arity_.num_args) {
-    return Status::Invalid("VarArgs function needs at least ", arity_.num_args,
-                           " arguments but kernel accepts only ", passed_num_args);
-  } else if (!arity_.is_varargs && passed_num_args != arity_.num_args) {
-    return Status::Invalid("Function accepts ", arity_.num_args,
-                           " arguments but kernel accepts ", passed_num_args);
+Status CheckArityImpl(const Function* function, int passed_num_args,
+                      const char* passed_num_args_label) {
+  if (function->arity().is_varargs && passed_num_args < function->arity().num_args) {
+    return Status::Invalid("VarArgs function needs at least ", function->arity().num_args,

Review comment:
       Add the function name here too?

##########
File path: cpp/src/arrow/compute/function.cc
##########
@@ -102,20 +106,71 @@ Result<const KernelType*> DispatchExactImpl(const Function& func,
     return kernel_matches[SimdLevel::NONE];
   }
 
-  return Status::NotImplemented("Function ", func.name(),
-                                " has no kernel matching input types ",
-                                FormatArgTypes(values));
+  return nullptr;
+}
+
+const Kernel* DispatchExactImpl(const Function* func,
+                                const std::vector<ValueDescr>& values) {
+  if (func->kind() == Function::SCALAR) {
+    return DispatchExactImpl(checked_cast<const ScalarFunction*>(func)->kernels(),
+                             values);
+  }
+
+  if (func->kind() == Function::VECTOR) {
+    return DispatchExactImpl(checked_cast<const VectorFunction*>(func)->kernels(),
+                             values);
+  }
+
+  if (func->kind() == Function::SCALAR_AGGREGATE) {
+    return DispatchExactImpl(
+        checked_cast<const ScalarAggregateFunction*>(func)->kernels(), values);
+  }
+
+  return nullptr;
+}
+
+}  // namespace detail
+
+Result<const Kernel*> Function::DispatchExact(
+    const std::vector<ValueDescr>& values) const {
+  if (kind_ == Function::META) {
+    return Status::NotImplemented("Dispatch for a MetaFunction's Kernels");
+  }
+  RETURN_NOT_OK(CheckArity(values));
+
+  if (auto kernel = detail::DispatchExactImpl(this, values)) {
+    return kernel;
+  }
+  return detail::NoMatchingKernel(this, values);
+}
+
+Result<const Kernel*> Function::DispatchBest(std::vector<ValueDescr>* values) const {
+  if (kind_ == Function::META) {
+    return Status::NotImplemented("Dispatch for a MetaFunction's Kernels");
+  }
+  RETURN_NOT_OK(CheckArity(*values));
+
+  // first try for an exact match
+  if (auto kernel = detail::DispatchExactImpl(this, *values)) {
+    return kernel;
+  }
+
+  // XXX permit generic conversions here, for example dict -> decoded, null -> any?
+  return DispatchExact(*values);

Review comment:
       `DispatchExact` will re-run exactly the same steps as above, right? What is the point of calling it?

##########
File path: cpp/src/arrow/testing/gtest_util.h
##########
@@ -79,15 +79,18 @@
     EXPECT_THAT(_st.ToString(), (matcher));                                           \
   } while (false)
 
-#define ASSERT_OK(expr)                                                       \
-  do {                                                                        \
-    auto _res = (expr);                                                       \
-    ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res);           \
-    if (!_st.ok()) {                                                          \
-      FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _st.ToString(); \
-    }                                                                         \
+#define EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(code, matcher, expr) \
+  do {                                                                \
+    auto _res = (expr);                                               \
+    ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res);   \
+    EXPECT_EQ(_st.CodeAsString(), Status::CodeAsString(code));        \
+    EXPECT_THAT(_st.ToString(), (matcher));                           \
   } while (false)
 
+#define ASSERT_OK(expr)                                                              \
+  for (::arrow::Status _st = ::arrow::internal::GenericToStatus((expr)); !_st.ok();) \
+  FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _st.ToString()

Review comment:
       Apart from being more cryptic, what does the `for` loop variant bring?

##########
File path: cpp/src/arrow/compute/function.h
##########
@@ -162,7 +162,15 @@ class ARROW_EXPORT Function {
   ///
   /// NB: This function is overridden in CastFunction.
   virtual Result<const Kernel*> DispatchExact(
-      const std::vector<ValueDescr>& values) const = 0;
+      const std::vector<ValueDescr>& values) const;
+
+  /// \brief Return a best-match kernel that can execute the function given the argument
+  /// types, after implicit casts are applied.
+  ///
+  /// \param[in,out] values Argument types. An element may be modified to indicate that
+  /// the returned kernel only approximately matches the input value descriptors; callers
+  /// are responsible for casting inputs to the type and shape required by the kernel.

Review comment:
       Does it mean the caller must check again, for each `ValueDescr`, whether it changed or not? Perhaps there's a way to return that info, since `DispatchBest` obviously has to compute it.

##########
File path: cpp/src/arrow/compute/function.cc
##########
@@ -102,20 +106,71 @@ Result<const KernelType*> DispatchExactImpl(const Function& func,
     return kernel_matches[SimdLevel::NONE];
   }
 
-  return Status::NotImplemented("Function ", func.name(),
-                                " has no kernel matching input types ",
-                                FormatArgTypes(values));
+  return nullptr;
+}
+
+const Kernel* DispatchExactImpl(const Function* func,
+                                const std::vector<ValueDescr>& values) {
+  if (func->kind() == Function::SCALAR) {
+    return DispatchExactImpl(checked_cast<const ScalarFunction*>(func)->kernels(),
+                             values);
+  }
+
+  if (func->kind() == Function::VECTOR) {
+    return DispatchExactImpl(checked_cast<const VectorFunction*>(func)->kernels(),
+                             values);
+  }
+
+  if (func->kind() == Function::SCALAR_AGGREGATE) {
+    return DispatchExactImpl(
+        checked_cast<const ScalarAggregateFunction*>(func)->kernels(), values);
+  }
+
+  return nullptr;
+}
+
+}  // namespace detail
+
+Result<const Kernel*> Function::DispatchExact(
+    const std::vector<ValueDescr>& values) const {
+  if (kind_ == Function::META) {
+    return Status::NotImplemented("Dispatch for a MetaFunction's Kernels");
+  }
+  RETURN_NOT_OK(CheckArity(values));
+
+  if (auto kernel = detail::DispatchExactImpl(this, values)) {
+    return kernel;
+  }
+  return detail::NoMatchingKernel(this, values);
+}
+
+Result<const Kernel*> Function::DispatchBest(std::vector<ValueDescr>* values) const {
+  if (kind_ == Function::META) {
+    return Status::NotImplemented("Dispatch for a MetaFunction's Kernels");
+  }
+  RETURN_NOT_OK(CheckArity(*values));
+
+  // first try for an exact match
+  if (auto kernel = detail::DispatchExactImpl(this, *values)) {
+    return kernel;
+  }
+
+  // XXX permit generic conversions here, for example dict -> decoded, null -> any?

Review comment:
       Do you intend to add the implicit casting logic to this PR?

##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_string.cc
##########
@@ -215,41 +215,41 @@ void AddBinaryToBinaryCast(CastFunction* func) {
   auto out_ty = TypeTraits<OutType>::type_singleton();
 
   DCHECK_OK(func->AddKernel(
-      OutType::type_id, {in_ty}, out_ty,
+      InType::type_id, {in_ty}, out_ty,

Review comment:
       Ouch.

##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
##########
@@ -149,17 +149,13 @@ void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, const Dat
 // ----------------------------------------------------------------------
 
 void UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-  if (out->is_scalar()) {
-    KERNEL_ASSIGN_OR_RAISE(*out, ctx,
-                           batch[0].scalar_as<DictionaryScalar>().GetEncodedValue());
-    return;
-  }
+  DCHECK(out->is_array());
 
   DictionaryArray dict_arr(batch[0].array());
   const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
 
   const auto& dict_type = *dict_arr.dictionary()->type();
-  if (!dict_type.Equals(options.to_type)) {
+  if (!dict_type.Equals(options.to_type) && !CanCast(dict_type, *options.to_type)) {

Review comment:
       I definitely don't think we want implicit utf8->int32 casting, for example. Let's not reinvent PHP.

##########
File path: cpp/src/arrow/dataset/expression_test.cc
##########
@@ -57,40 +57,91 @@ void ExpectResultsEqual(Actual&& actual, Expected&& expected) {
   MaybeExpected maybe_expected(std::forward<Expected>(expected));
 
   if (maybe_expected.ok()) {
-    ASSERT_OK_AND_ASSIGN(auto actual, maybe_actual);
-    EXPECT_EQ(actual, *maybe_expected);
+    EXPECT_EQ(maybe_actual, maybe_expected);
   } else {
-    EXPECT_EQ(maybe_actual.status().code(), expected.status().code());
-    EXPECT_NE(maybe_actual.status().message().find(expected.status().message()),
-              std::string::npos)
-        << "  actual:   " << maybe_actual.status() << "\n"
-        << "  expected: " << maybe_expected.status();
+    EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(
+        expected.status().code(), HasSubstr(expected.status().message()), maybe_actual);
   }
 }
 
+const auto no_change = util::nullopt;
+
 TEST(ExpressionUtils, Comparison) {
   auto Expect = [](Result<std::string> expected, Datum l, Datum r) {
     ExpectResultsEqual(Comparison::Execute(l, r).Map(Comparison::GetName), expected);
   };
 
-  Datum zero(0), one(1), two(2), null(std::make_shared<Int32Scalar>()), str("hello");
+  Datum zero(0), one(1), two(2), null(std::make_shared<Int32Scalar>());
+  Datum str("hello"), bin(std::make_shared<BinaryScalar>(Buffer::FromString("hello")));
+  Datum dict_str(DictionaryScalar::Make(std::make_shared<Int32Scalar>(0),
+                                        ArrayFromJSON(utf8(), R"(["a", "b", "c"])")));
 
-  Status parse_failure = Status::Invalid("Failed to parse");
+  Status not_impl = Status::NotImplemented("no kernel matching input types");
 
   Expect("equal", one, one);
   Expect("less", one, two);
   Expect("greater", one, zero);
 
-  // cast RHS to LHS type; "hello" > "1"
-  Expect("greater", str, one);
-  // cast RHS to LHS type; "hello" is not convertible to int
-  Expect(parse_failure, one, str);
-
   Expect("na", one, null);
-  Expect("na", str, null);
   Expect("na", null, one);
-  // cast RHS to LHS type; "hello" is not convertible to int
-  Expect(parse_failure, null, str);
+
+  // strings and ints are not comparable without explicit casts
+  Expect(not_impl, str, one);
+  Expect(not_impl, one, str);
+  Expect(not_impl, str, null);  // not even null ints
+
+  // string -> binary implicit cast allowed
+  Expect("equal", str, bin);
+  Expect("equal", bin, str);
+
+  // dict_str -> string, implicit casts allowed
+  Expect("less", dict_str, str);
+  Expect("less", dict_str, bin);
+}
+
+TEST(ExpressionUtils, StripOrderPreservingCasts) {
+  auto Expect = [](Expression expr, util::optional<Expression> expected_stripped) {
+    ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema));
+    if (!expected_stripped) {
+      expected_stripped = expr;
+    } else {
+      ASSERT_OK_AND_ASSIGN(expected_stripped, expected_stripped->Bind(*kBoringSchema));
+    }
+    EXPECT_EQ(Comparison::StripOrderPreservingCasts(expr), *expected_stripped);
+  };
+
+  // Casting int to float preserves ordering.
+  // For example, let
+  //   a = 3, b = 2, assert(a > b)
+  // After injecting a cast to float, this ordering still holds
+  //   float(a) == 3.0, float(b) == 2.0, assert(float(a) > float(b))

Review comment:
       Are you sure?
   ```python
   >>> a = 1<<28
   >>> b = a + 1
   >>> b > a
   True
   >>> np.float32(b) > np.float32(a)
   False
   ```

##########
File path: cpp/src/arrow/dataset/expression_test.cc
##########
@@ -57,40 +57,91 @@ void ExpectResultsEqual(Actual&& actual, Expected&& expected) {
   MaybeExpected maybe_expected(std::forward<Expected>(expected));
 
   if (maybe_expected.ok()) {
-    ASSERT_OK_AND_ASSIGN(auto actual, maybe_actual);
-    EXPECT_EQ(actual, *maybe_expected);
+    EXPECT_EQ(maybe_actual, maybe_expected);
   } else {
-    EXPECT_EQ(maybe_actual.status().code(), expected.status().code());
-    EXPECT_NE(maybe_actual.status().message().find(expected.status().message()),
-              std::string::npos)
-        << "  actual:   " << maybe_actual.status() << "\n"
-        << "  expected: " << maybe_expected.status();
+    EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(
+        expected.status().code(), HasSubstr(expected.status().message()), maybe_actual);
   }
 }
 
+const auto no_change = util::nullopt;
+
 TEST(ExpressionUtils, Comparison) {
   auto Expect = [](Result<std::string> expected, Datum l, Datum r) {
     ExpectResultsEqual(Comparison::Execute(l, r).Map(Comparison::GetName), expected);
   };
 
-  Datum zero(0), one(1), two(2), null(std::make_shared<Int32Scalar>()), str("hello");
+  Datum zero(0), one(1), two(2), null(std::make_shared<Int32Scalar>());
+  Datum str("hello"), bin(std::make_shared<BinaryScalar>(Buffer::FromString("hello")));
+  Datum dict_str(DictionaryScalar::Make(std::make_shared<Int32Scalar>(0),
+                                        ArrayFromJSON(utf8(), R"(["a", "b", "c"])")));
 
-  Status parse_failure = Status::Invalid("Failed to parse");
+  Status not_impl = Status::NotImplemented("no kernel matching input types");
 
   Expect("equal", one, one);
   Expect("less", one, two);
   Expect("greater", one, zero);
 
-  // cast RHS to LHS type; "hello" > "1"
-  Expect("greater", str, one);
-  // cast RHS to LHS type; "hello" is not convertible to int
-  Expect(parse_failure, one, str);
-
   Expect("na", one, null);
-  Expect("na", str, null);
   Expect("na", null, one);
-  // cast RHS to LHS type; "hello" is not convertible to int
-  Expect(parse_failure, null, str);
+
+  // strings and ints are not comparable without explicit casts
+  Expect(not_impl, str, one);
+  Expect(not_impl, one, str);
+  Expect(not_impl, str, null);  // not even null ints
+
+  // string -> binary implicit cast allowed
+  Expect("equal", str, bin);
+  Expect("equal", bin, str);
+
+  // dict_str -> string, implicit casts allowed
+  Expect("less", dict_str, str);
+  Expect("less", dict_str, bin);
+}
+
+TEST(ExpressionUtils, StripOrderPreservingCasts) {
+  auto Expect = [](Expression expr, util::optional<Expression> expected_stripped) {
+    ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema));
+    if (!expected_stripped) {
+      expected_stripped = expr;
+    } else {
+      ASSERT_OK_AND_ASSIGN(expected_stripped, expected_stripped->Bind(*kBoringSchema));
+    }
+    EXPECT_EQ(Comparison::StripOrderPreservingCasts(expr), *expected_stripped);
+  };
+
+  // Casting int to float preserves ordering.
+  // For example, let
+  //   a = 3, b = 2, assert(a > b)
+  // After injecting a cast to float, this ordering still holds
+  //   float(a) == 3.0, float(b) == 2.0, assert(float(a) > float(b))
+  Expect(cast(field_ref("i32"), float32()), field_ref("i32"));
+
+  // Casting an integral type to a wider integral type preserves ordering.
+  Expect(cast(field_ref("i32"), int64()), field_ref("i32"));
+  Expect(cast(field_ref("i32"), int32()), field_ref("i32"));
+  Expect(cast(field_ref("i32"), int16()), no_change);
+  Expect(cast(field_ref("i32"), int8()), no_change);
+
+  Expect(cast(field_ref("u32"), uint64()), field_ref("u32"));
+  Expect(cast(field_ref("u32"), uint32()), field_ref("u32"));
+  Expect(cast(field_ref("u32"), uint16()), no_change);
+  Expect(cast(field_ref("u32"), uint8()), no_change);

Review comment:
       Also what about uint32 to {int32, int64}?

##########
File path: cpp/src/arrow/dataset/expression_internal.h
##########
@@ -109,6 +105,40 @@ struct Comparison {
     return less.scalar_as<BooleanScalar>().value ? LESS : GREATER;
   }
 
+  static const Expression& StripOrderPreservingCasts(const Expression& expr) {

Review comment:
       Add a comment explaining what this does?

##########
File path: cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
##########
@@ -50,6 +50,7 @@ struct ParseBooleanString {
 std::vector<std::shared_ptr<CastFunction>> GetBooleanCasts() {
   auto func = std::make_shared<CastFunction>("cast_boolean", Type::BOOL);
   AddCommonCasts(Type::BOOL, boolean(), func.get());
+  AddZeroCopyCast(Type::BOOL, boolean(), boolean(), func.get());

Review comment:
       Hmm, why do we need to add an explicit identity cast?

##########
File path: cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc
##########
@@ -637,5 +637,58 @@ TYPED_TEST(TestBinaryArithmeticFloating, Mul) {
   this->AssertBinop(Multiply, "[null, 2.0]", this->MakeNullScalar(), "[null, null]");
 }
 
+TEST(TestBinaryArithmetic, DispatchBest) {
+  for (std::string name : {"add", "subtract", "multiply", "divide"}) {
+    for (std::string suffix : {"", "_checked"}) {
+      name += suffix;
+
+      CheckDispatchBest(name, {int32(), int32()}, {int32(), int32()});
+
+      CheckDispatchBest(name, {int32(), null()}, {int32(), int32()});
+
+      CheckDispatchBest(name, {null(), int32()}, {int32(), int32()});
+
+      CheckDispatchBest(name, {int32(), int16()}, {int32(), int32()});
+
+      CheckDispatchBest(name, {int32(), float32()}, {float32(), float32()});
+
+      CheckDispatchBest(name, {float32(), int64()}, {float32(), float32()});
+
+      CheckDispatchBest(name, {float64(), int32()}, {float64(), float64()});
+
+      CheckDispatchBest(name, {dictionary(int8(), float64()), float64()},
+                        {float64(), float64()});
+
+      CheckDispatchBest(name, {dictionary(int8(), float64()), int16()},
+                        {float64(), float64()});
+    }
+  }
+}
+
+TEST(TestBinaryArithmetic, AddWithImplicitCasts) {
+  CheckScalarBinary("add", ArrayFromJSON(int32(), "[0, 1, 2, null]"),
+                    ArrayFromJSON(float64(), "[0.25, 0.5, 0.75, 1.0]"),
+                    ArrayFromJSON(float64(), "[0.25, 1.5, 2.75, null]"));
+
+  CheckScalarBinary("add", ArrayFromJSON(int8(), "[-16, 0, 16, null]"),
+                    ArrayFromJSON(uint32(), "[3, 4, 5, 7]"),
+                    ArrayFromJSON(int32(), "[-13, 4, 21, null]"));
+
+  CheckScalarBinary("add", ArrayFromJSON(dictionary(int32(), int32()), "[0, 1, 2, null]"),

Review comment:
       Can you test with a dictionary that's not trivially equal to its indices?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org