You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/14 17:03:29 UTC

[GitHub] [arrow] lidavidm opened a new pull request, #12891: ARROW-12659: [C++] Support is_valid as a guarantee

lidavidm opened a new pull request, #12891:
URL: https://github.com/apache/arrow/pull/12891

   This rebases #10253 and fixes it up to also address ARROW-15312, including a regression test. 
   
   This refactors how inequalities, is_valid, and is_null are treated in expression simplification, and updates the guarantees that the Parquet/Datasets emits for row groups to properly reflect nullability.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r853529547


##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+        return simplified_to(lhs, false);
+      }
+
+      return expr;
+    }
+
+    if (guarantee.cmp & cmp_rhs_bound) {

Review Comment:
   Alright, added comments here and for the other feedback to try to clarify things. This conditional is surprisingly subtle…



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1103243358

   Ah - yes, it would be good to make explicit (at least, it would be good to document how we handle nullability in general here)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1103913103

   Actually, right: I think we already do the right thing. 
   
   `(i32 < 10)` with the guarantee `((i32 >= 12) or is_null(i32, {nan_is_null=false}))` simplifies to `invert(true_unless_null(i32))` which is not satisfiable, so the row group will be pruned. 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
pitrou commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1103951820

   Ah, great!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r855372594


##########
cpp/src/arrow/util/vector.h:
##########
@@ -78,8 +78,8 @@ std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
 
 template <typename T, typename Predicate>
 std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
-  auto new_end =
-      std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
+  auto new_end = std::stable_partition(values.begin(), values.end(),

Review Comment:
   I'm curious, is there any reason not to use `remove_if`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r855376851


##########
cpp/src/arrow/util/vector.h:
##########
@@ -78,8 +78,8 @@ std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
 
 template <typename T, typename Predicate>
 std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
-  auto new_end =
-      std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
+  auto new_end = std::stable_partition(values.begin(), values.end(),

Review Comment:
   Yeah, inverting it would probably be slightly more efficient than calling a stable partition (which can allocate temporary memory AFAIU).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] github-actions[bot] commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1099414572

   :warning: Ticket **has not been started in JIRA**, please click 'Start Progress'.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1099606380

   Hmm, it's crashing because `A == null[string]` is getting converted to `null[string]` by `FoldConstants`. But FilterNode assumes the mask has to be boolean.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] wjones127 commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
wjones127 commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r851338828


##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -682,46 +715,49 @@ std::vector<Expression> GuaranteeConjunctionMembers(
   return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
 }
 
-// Conjunction members which are represented in known_values are erased from
-// conjunction_members
-Status ExtractKnownFieldValuesImpl(
-    std::vector<Expression>* conjunction_members,
-    std::unordered_map<FieldRef, Datum, FieldRef::Hash>* known_values) {
-  auto unconsumed_end =
-      std::partition(conjunction_members->begin(), conjunction_members->end(),
-                     [](const Expression& expr) {
-                       // search for an equality conditions between a field and a literal
-                       auto call = expr.call();
-                       if (!call) return true;
-
-                       if (call->function_name == "equal") {
-                         auto ref = call->arguments[0].field_ref();
-                         auto lit = call->arguments[1].literal();
-                         return !(ref && lit);
-                       }
-
-                       if (call->function_name == "is_null") {
-                         auto ref = call->arguments[0].field_ref();
-                         return !ref;
-                       }
-
-                       return true;
-                     });
-
-  for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) {
-    auto call = CallNotNull(*it);
-
-    if (call->function_name == "equal") {
-      auto ref = call->arguments[0].field_ref();
-      auto lit = call->arguments[1].literal();
-      known_values->emplace(*ref, *lit);
-    } else if (call->function_name == "is_null") {
-      auto ref = call->arguments[0].field_ref();
-      known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
-    }
+// equal(a, 2)

Review Comment:
   Do we want to provide more context here?



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -128,17 +128,30 @@ util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
   auto maybe_min = min->CastTo(field->type());
   auto maybe_max = max->CastTo(field->type());
   if (maybe_min.ok() && maybe_max.ok()) {
-    auto col_min = maybe_min.MoveValueUnsafe();
-    auto col_max = maybe_max.MoveValueUnsafe();
-    if (col_min->Equals(col_max)) {
-      return compute::equal(std::move(field_expr), compute::literal(std::move(col_min)));
+    min = maybe_min.MoveValueUnsafe();
+    max = maybe_max.MoveValueUnsafe();
+
+    compute::Expression range;
+    if (min->Equals(max)) {
+      auto single_value = compute::equal(field_expr, compute::literal(std::move(min)));
+
+      if (statistics->null_count() == 0) {
+        return compute::and_(single_value, compute::is_valid(field_expr));
+      }
+      return compute::or_(std::move(single_value), is_null(std::move(field_expr)));
     }
 
     auto lower_bound =
-        compute::greater_equal(field_expr, compute::literal(std::move(col_min)));
-    auto upper_bound =
-        compute::less_equal(std::move(field_expr), compute::literal(std::move(col_max)));
-    return compute::and_(std::move(lower_bound), std::move(upper_bound));
+        compute::greater_equal(field_expr, compute::literal(std::move(min)));
+    auto upper_bound = compute::less_equal(field_expr, compute::literal(std::move(max)));
+
+    if (statistics->null_count() != 0) {
+      lower_bound = compute::or_(std::move(lower_bound), is_null(field_expr));
+      upper_bound = compute::or_(std::move(upper_bound), is_null(std::move(field_expr)));
+      return compute::and_(std::move(lower_bound), std::move(upper_bound));
+    }
+    return compute::and_(compute::and_(std::move(lower_bound), std::move(upper_bound)),
+                         compute::is_valid(field_expr));
   }

Review Comment:
   Do we want an arm for `statitics->null_count() == row_count`? In which case, we would just return `is_null(field_expr)`?



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -682,46 +715,49 @@ std::vector<Expression> GuaranteeConjunctionMembers(
   return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
 }
 
-// Conjunction members which are represented in known_values are erased from
-// conjunction_members
-Status ExtractKnownFieldValuesImpl(
-    std::vector<Expression>* conjunction_members,
-    std::unordered_map<FieldRef, Datum, FieldRef::Hash>* known_values) {
-  auto unconsumed_end =
-      std::partition(conjunction_members->begin(), conjunction_members->end(),
-                     [](const Expression& expr) {
-                       // search for an equality conditions between a field and a literal
-                       auto call = expr.call();
-                       if (!call) return true;
-
-                       if (call->function_name == "equal") {
-                         auto ref = call->arguments[0].field_ref();
-                         auto lit = call->arguments[1].literal();
-                         return !(ref && lit);
-                       }
-
-                       if (call->function_name == "is_null") {
-                         auto ref = call->arguments[0].field_ref();
-                         return !ref;
-                       }
-
-                       return true;
-                     });
-
-  for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) {
-    auto call = CallNotNull(*it);
-
-    if (call->function_name == "equal") {
-      auto ref = call->arguments[0].field_ref();
-      auto lit = call->arguments[1].literal();
-      known_values->emplace(*ref, *lit);
-    } else if (call->function_name == "is_null") {
-      auto ref = call->arguments[0].field_ref();
-      known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
-    }
+// equal(a, 2)
+// is_null(a)
+util::optional<std::pair<FieldRef, Datum>> ExtractKnownFieldValue(

Review Comment:
   nit: I don't love that this is different from below function names by only a single character. Maybe something like `ExtractSingleFieldvalue` or something similar?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] jonkeane closed pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
jonkeane closed pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee
URL: https://github.com/apache/arrow/pull/12891


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r853430402


##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -305,19 +308,47 @@ bool Expression::IsNullLiteral() const {
   return false;
 }
 
-bool Expression::IsSatisfiable() const {
-  if (type() && type()->id() == Type::NA) {
-    return false;
+namespace {
+util::optional<compute::NullHandling::type> GetNullHandling(
+    const Expression::Call& call) {
+  DCHECK_NE(call.function, nullptr);
+  if (call.function->kind() == compute::Function::SCALAR) {
+    return static_cast<const compute::ScalarKernel*>(call.kernel)->null_handling;
   }
+  return util::nullopt;
+}
+}  // namespace
+
+bool Expression::IsSatisfiable() const {
+  if (!type()) return true;
+  if (type()->id() != Type::BOOL) return true;
 
   if (auto lit = literal()) {
     if (lit->null_count() == lit->length()) {
       return false;
     }
 
-    if (lit->is_scalar() && lit->type()->id() == Type::BOOL) {
+    if (lit->is_scalar()) {
       return lit->scalar_as<BooleanScalar>().value;
     }
+
+    return true;
+  }
+
+  if (field_ref()) return true;
+
+  auto call = CallNotNull(*this);
+
+  if (call->function_name == "invert") {
+    if (auto nested_call = call->arguments[0].call()) {
+      if (nested_call->function_name == "true_unless_null") return false;
+    }
+  }
+
+  if (call->function_name == "and_kleene") {

Review Comment:
   Ah, actually, this is different. Yes, this should also accept "and".



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r853115172


##########
cpp/src/arrow/util/vector.h:
##########
@@ -77,9 +77,14 @@ std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
 }
 
 template <typename T, typename Predicate>
-std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
-  auto new_end =
-      std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
+std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate,
+                            std::vector<T>* filtered_out = NULLPTR) {

Review Comment:
   I may be missing something, but it does not seem this third argument is used anywhere?



##########
cpp/src/arrow/compute/kernels/scalar_validity_test.cc:
##########
@@ -48,6 +48,23 @@ TEST_F(TestBooleanValidityKernels, ArrayIsValid) {
                    "[false, true, true, false]");
 }
 
+TEST_F(TestBooleanValidityKernels, TrueUnlessNull) {
+  CheckScalarUnary("true_unless_null", type_singleton(), "[]", type_singleton(), "[]");
+  CheckScalarUnary("true_unless_null", type_singleton(), "[null]", type_singleton(),
+                   "[null]");
+  CheckScalarUnary("true_unless_null", type_singleton(), "[0, 1]", type_singleton(),
+                   "[true, true]");
+  CheckScalarUnary("true_unless_null", type_singleton(), "[null, 1, 0, null]",
+                   type_singleton(), "[null, true, true, null]");
+}
+
+TEST_F(TestBooleanValidityKernels, IsValidIsNullNullType) {
+  CheckScalarUnary("is_null", std::make_shared<NullArray>(5),
+                   ArrayFromJSON(boolean(), "[true, true, true, true, true]"));
+  CheckScalarUnary("is_valid", std::make_shared<NullArray>(5),
+                   ArrayFromJSON(boolean(), "[false, false, false, false, false]"));

Review Comment:
   Can we also test "true_unless_null" here?



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -305,19 +308,47 @@ bool Expression::IsNullLiteral() const {
   return false;
 }
 
-bool Expression::IsSatisfiable() const {
-  if (type() && type()->id() == Type::NA) {
-    return false;
+namespace {
+util::optional<compute::NullHandling::type> GetNullHandling(
+    const Expression::Call& call) {
+  DCHECK_NE(call.function, nullptr);
+  if (call.function->kind() == compute::Function::SCALAR) {
+    return static_cast<const compute::ScalarKernel*>(call.kernel)->null_handling;
   }
+  return util::nullopt;
+}
+}  // namespace
+
+bool Expression::IsSatisfiable() const {
+  if (!type()) return true;
+  if (type()->id() != Type::BOOL) return true;
 
   if (auto lit = literal()) {
     if (lit->null_count() == lit->length()) {
       return false;
     }
 
-    if (lit->is_scalar() && lit->type()->id() == Type::BOOL) {
+    if (lit->is_scalar()) {
       return lit->scalar_as<BooleanScalar>().value;
     }
+
+    return true;
+  }
+
+  if (field_ref()) return true;
+
+  auto call = CallNotNull(*this);
+
+  if (call->function_name == "invert") {

Review Comment:
   Can you add a comment explaining why this is useful?



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;

Review Comment:
   Would you like to add a comment explaining what the terms are? Is it `target <cmp> bound` or `bound <cmp> target`?
   



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+        return simplified_to(lhs, false);
+      }
+
+      return expr;
+    }
+
+    if (guarantee.cmp & cmp_rhs_bound) {
+      // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3

Review Comment:
   Perhaps (with `rhs` being 1 and `bound` being 0):
   ```c++
   // x > 1, x >= 1, x != 1 cannot use guarantee x >= 0
   // (where `guarantee.cmp` is GREATER_EQUAL, `cmp_rhs_bound` is GREATER)
   ```
   



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+        return simplified_to(lhs, false);
+      }
+
+      return expr;
+    }
+
+    if (guarantee.cmp & cmp_rhs_bound) {
+      // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
+      return expr;
+    }
+
+    if (*cmp & Comparison::GetFlipped(cmp_rhs_bound)) {
+      // x > 1, x >= 1, x != 1 guaranteed by x >= 3

Review Comment:
   Perhaps
   ```suggestion
         // x > 1, x >= 1, x != 1 guaranteed by x >= 3
         // (where `guarantee.cmp` is GREATER_EQUAL, `cmp_rhs_bound` is LESS)
   ```



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -305,19 +308,47 @@ bool Expression::IsNullLiteral() const {
   return false;
 }
 
-bool Expression::IsSatisfiable() const {
-  if (type() && type()->id() == Type::NA) {
-    return false;
+namespace {
+util::optional<compute::NullHandling::type> GetNullHandling(
+    const Expression::Call& call) {
+  DCHECK_NE(call.function, nullptr);
+  if (call.function->kind() == compute::Function::SCALAR) {
+    return static_cast<const compute::ScalarKernel*>(call.kernel)->null_handling;
   }
+  return util::nullopt;
+}
+}  // namespace
+
+bool Expression::IsSatisfiable() const {
+  if (!type()) return true;
+  if (type()->id() != Type::BOOL) return true;
 
   if (auto lit = literal()) {
     if (lit->null_count() == lit->length()) {
       return false;
     }
 
-    if (lit->is_scalar() && lit->type()->id() == Type::BOOL) {
+    if (lit->is_scalar()) {
       return lit->scalar_as<BooleanScalar>().value;
     }
+
+    return true;
+  }
+
+  if (field_ref()) return true;
+
+  auto call = CallNotNull(*this);
+
+  if (call->function_name == "invert") {
+    if (auto nested_call = call->arguments[0].call()) {
+      if (nested_call->function_name == "true_unless_null") return false;
+    }
+  }
+
+  if (call->function_name == "and_kleene") {

Review Comment:
   Why specifically "and_kleene" but not "and"? Can you add a comment?



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -128,17 +128,30 @@ util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
   auto maybe_min = min->CastTo(field->type());
   auto maybe_max = max->CastTo(field->type());
   if (maybe_min.ok() && maybe_max.ok()) {
-    auto col_min = maybe_min.MoveValueUnsafe();
-    auto col_max = maybe_max.MoveValueUnsafe();
-    if (col_min->Equals(col_max)) {
-      return compute::equal(std::move(field_expr), compute::literal(std::move(col_min)));
+    min = maybe_min.MoveValueUnsafe();
+    max = maybe_max.MoveValueUnsafe();
+
+    compute::Expression range;
+    if (min->Equals(max)) {
+      auto single_value = compute::equal(field_expr, compute::literal(std::move(min)));
+
+      if (statistics->null_count() == 0) {
+        return compute::and_(single_value, compute::is_valid(field_expr));

Review Comment:
   Is it useful to add `is_valid` here? If a value is equal to `min` it implies it is valid.



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -128,17 +128,30 @@ util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
   auto maybe_min = min->CastTo(field->type());
   auto maybe_max = max->CastTo(field->type());
   if (maybe_min.ok() && maybe_max.ok()) {
-    auto col_min = maybe_min.MoveValueUnsafe();
-    auto col_max = maybe_max.MoveValueUnsafe();
-    if (col_min->Equals(col_max)) {
-      return compute::equal(std::move(field_expr), compute::literal(std::move(col_min)));
+    min = maybe_min.MoveValueUnsafe();
+    max = maybe_max.MoveValueUnsafe();
+
+    compute::Expression range;
+    if (min->Equals(max)) {
+      auto single_value = compute::equal(field_expr, compute::literal(std::move(min)));
+
+      if (statistics->null_count() == 0) {
+        return compute::and_(single_value, compute::is_valid(field_expr));
+      }
+      return compute::or_(std::move(single_value), is_null(std::move(field_expr)));
     }
 
     auto lower_bound =
-        compute::greater_equal(field_expr, compute::literal(std::move(col_min)));
-    auto upper_bound =
-        compute::less_equal(std::move(field_expr), compute::literal(std::move(col_max)));
-    return compute::and_(std::move(lower_bound), std::move(upper_bound));
+        compute::greater_equal(field_expr, compute::literal(std::move(min)));
+    auto upper_bound = compute::less_equal(field_expr, compute::literal(std::move(max)));
+
+    if (statistics->null_count() != 0) {
+      lower_bound = compute::or_(std::move(lower_bound), is_null(field_expr));
+      upper_bound = compute::or_(std::move(upper_bound), is_null(std::move(field_expr)));
+      return compute::and_(std::move(lower_bound), std::move(upper_bound));
+    }
+    return compute::and_(compute::and_(std::move(lower_bound), std::move(upper_bound)),
+                         compute::is_valid(field_expr));

Review Comment:
   This seems a bit pointlessly complicated, or I'm missing something? Why not:
   ```suggestion
       auto in_range = compute::and_(std::move(lower_bound), std::move(upper_bound));
       if (statistics->null_count() != 0) {
         return compute::or_(std::move(in_range), compute::is_null(field_expr));
       }
       return in_range;
   ```
   



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -128,17 +128,30 @@ util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
   auto maybe_min = min->CastTo(field->type());
   auto maybe_max = max->CastTo(field->type());
   if (maybe_min.ok() && maybe_max.ok()) {
-    auto col_min = maybe_min.MoveValueUnsafe();
-    auto col_max = maybe_max.MoveValueUnsafe();
-    if (col_min->Equals(col_max)) {
-      return compute::equal(std::move(field_expr), compute::literal(std::move(col_min)));
+    min = maybe_min.MoveValueUnsafe();
+    max = maybe_max.MoveValueUnsafe();
+
+    compute::Expression range;

Review Comment:
   This variable doesn't seem used?



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+        return simplified_to(lhs, false);
+      }
+
+      return expr;
+    }
+
+    if (guarantee.cmp & cmp_rhs_bound) {

Review Comment:
   This is cryptic, what is this condition supposed to imply?



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+        return simplified_to(lhs, false);
+      }
+
+      return expr;
+    }
+
+    if (guarantee.cmp & cmp_rhs_bound) {
+      // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3

Review Comment:
   This is contradicted by the next comment below, did you make a mistake?



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+        return simplified_to(lhs, false);
+      }
+
+      return expr;
+    }
+
+    if (guarantee.cmp & cmp_rhs_bound) {
+      // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
+      return expr;
+    }
+
+    if (*cmp & Comparison::GetFlipped(cmp_rhs_bound)) {
+      // x > 1, x >= 1, x != 1 guaranteed by x >= 3
+      return simplified_to(lhs, true);
+    } else {
+      // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
+      return simplified_to(lhs, false);
+    }
+  }
+};
+
+/// \brief Simplify an expression given a guarantee, if the guarantee
+///   is is_valid().
+Result<Expression> IsValidSimplification(Expression expr,

Review Comment:
   Make this a verb, for example call it `SimplifyIsValid`.



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;

Review Comment:
   Is this "the target can be null"?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1103237112

   (Also note @bkietz should get author credit here, I'm just fixing up his old PR and adding some comments/tests)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] jonkeane commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
jonkeane commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1105637705

   I'm going to merge + create a test in R to (double) confirm that the intended behavior there is fixed — we could use that PR if we need any cleanups


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1102645188

   CC @pitrou or @westonpace, any comments here?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] github-actions[bot] commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1099414550

   https://issues.apache.org/jira/browse/ARROW-12659


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1099607810

   This needs to return a datum of the right type: https://github.com/apache/arrow/blob/63d2a9c856969a2c05e12ae8857a135bceaf45c1/cpp/src/arrow/compute/exec/expression.cc#L632-L640


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r853543688


##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1

Review Comment:
   Indeed, fixed.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] westonpace commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
westonpace commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1103239308

   > Hmm. I guess we are treating guarantees and filters differently. x < 10 as a guarantee implies is_valid(x), but not as a filter. We may want to fix that, but that would also be a drastic change.
   
   Sorry, I wasn't clear.  I don't think we should automatically add `is_valid(x)`.  I'm just wondering if this is something we ought to document since it may not be intuitive to users.  I can probably add something when we talk about pushdown filtering in the datasets docs.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r853496805


##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+        return simplified_to(lhs, false);
+      }
+
+      return expr;
+    }
+
+    if (guarantee.cmp & cmp_rhs_bound) {

Review Comment:
   Or actually, I straight up just don't understand this…will try to clear this up…



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1103235600

   > This looks great, thanks for figuring this out. It seems there would be some advantage whenever I filter parquet files with an equality to add is_valid if that column might contain nulls. For example:
   > 
   > `(ds.field(x) < 10) & is_valid(ds.field(x))` will eliminate a row group with min 12 and null_count > 0 where `ds.field(x) < 10` will not (although the filtering will be very fast we will still have to decode the row group).
   > 
   > I don't know if this is worth documenting somewhere or if it is too obscure to include.
   
   Hmm. I guess we are treating guarantees and filters differently. `x < 10` as a guarantee implies `is_valid(x)`, but not as a filter. We may want to fix that, but that would also be a drastic change.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1105431483

   I think I've addressed everything now, any other comments here?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r852174446


##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -128,17 +128,30 @@ util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
   auto maybe_min = min->CastTo(field->type());
   auto maybe_max = max->CastTo(field->type());
   if (maybe_min.ok() && maybe_max.ok()) {
-    auto col_min = maybe_min.MoveValueUnsafe();
-    auto col_max = maybe_max.MoveValueUnsafe();
-    if (col_min->Equals(col_max)) {
-      return compute::equal(std::move(field_expr), compute::literal(std::move(col_min)));
+    min = maybe_min.MoveValueUnsafe();
+    max = maybe_max.MoveValueUnsafe();
+
+    compute::Expression range;
+    if (min->Equals(max)) {
+      auto single_value = compute::equal(field_expr, compute::literal(std::move(min)));
+
+      if (statistics->null_count() == 0) {
+        return compute::and_(single_value, compute::is_valid(field_expr));
+      }
+      return compute::or_(std::move(single_value), is_null(std::move(field_expr)));
     }
 
     auto lower_bound =
-        compute::greater_equal(field_expr, compute::literal(std::move(col_min)));
-    auto upper_bound =
-        compute::less_equal(std::move(field_expr), compute::literal(std::move(col_max)));
-    return compute::and_(std::move(lower_bound), std::move(upper_bound));
+        compute::greater_equal(field_expr, compute::literal(std::move(min)));
+    auto upper_bound = compute::less_equal(field_expr, compute::literal(std::move(max)));
+
+    if (statistics->null_count() != 0) {
+      lower_bound = compute::or_(std::move(lower_bound), is_null(field_expr));
+      upper_bound = compute::or_(std::move(upper_bound), is_null(std::move(field_expr)));
+      return compute::and_(std::move(lower_bound), std::move(upper_bound));
+    }
+    return compute::and_(compute::and_(std::move(lower_bound), std::move(upper_bound)),
+                         compute::is_valid(field_expr));
   }

Review Comment:
   I believe this is handled at https://github.com/apache/arrow/blob/fae66cba04aba6528ba7d6a8c225cff24c469ef2/cpp/src/arrow/dataset/file_parquet.cc#L118-L121
   
   Confusingly enough `num_values` does not include nulls. See this test which covers this already: https://github.com/apache/arrow/pull/12891/files#diff-d88654840d0432223c1617e8fd9289db0f4e6fff6b34e9f062861ef8eec724fcR256
   
   This writes each record batch to its own row group, so the test would fail if we didn't generate the proper guarantee for the all-null row group.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
pitrou commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1103552544

   > Hmm. I guess we are treating guarantees and filters differently. `x < 10` as a guarantee implies `is_valid(x)`, but not as a filter. We may want to fix that, but that would also be a drastic change.
   
   I think it would definitely be worthwhile to fix it. It's delicate enough to think about simplifications without this oddity.
   
   Also, isn't it surprising as a user for the `x < 10` filter to accept nulls? It wouldn't in SQL for example.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r853483076


##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+        return simplified_to(lhs, false);
+      }
+
+      return expr;
+    }
+
+    if (guarantee.cmp & cmp_rhs_bound) {

Review Comment:
   It's unclear to me after writing out some examples (I don't think it handles all cases right either as you note with the incorrect comment)



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+        return simplified_to(lhs, false);
+      }
+
+      return expr;
+    }
+
+    if (guarantee.cmp & cmp_rhs_bound) {

Review Comment:
   I'll try to replace this



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r855375741


##########
cpp/src/arrow/util/vector.h:
##########
@@ -78,8 +78,8 @@ std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
 
 template <typename T, typename Predicate>
 std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
-  auto new_end =
-      std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
+  auto new_end = std::stable_partition(values.begin(), values.end(),

Review Comment:
   We want `keep_if` not `remove_if` though I suppose we could wrap `predicate` and invert it.



##########
cpp/src/arrow/util/vector.h:
##########
@@ -78,8 +78,8 @@ std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
 
 template <typename T, typename Predicate>
 std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
-  auto new_end =
-      std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
+  auto new_end = std::stable_partition(values.begin(), values.end(),

Review Comment:
   We want a (hypothetical) `keep_if` not `remove_if` though I suppose we could wrap `predicate` and invert it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r855521687


##########
cpp/src/arrow/util/vector.h:
##########
@@ -78,8 +78,8 @@ std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
 
 template <typename T, typename Predicate>
 std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
-  auto new_end =
-      std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
+  auto new_end = std::stable_partition(values.begin(), values.end(),

Review Comment:
   Follow up at #12949



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r852176503


##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -682,46 +715,49 @@ std::vector<Expression> GuaranteeConjunctionMembers(
   return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
 }
 
-// Conjunction members which are represented in known_values are erased from
-// conjunction_members
-Status ExtractKnownFieldValuesImpl(
-    std::vector<Expression>* conjunction_members,
-    std::unordered_map<FieldRef, Datum, FieldRef::Hash>* known_values) {
-  auto unconsumed_end =
-      std::partition(conjunction_members->begin(), conjunction_members->end(),
-                     [](const Expression& expr) {
-                       // search for an equality conditions between a field and a literal
-                       auto call = expr.call();
-                       if (!call) return true;
-
-                       if (call->function_name == "equal") {
-                         auto ref = call->arguments[0].field_ref();
-                         auto lit = call->arguments[1].literal();
-                         return !(ref && lit);
-                       }
-
-                       if (call->function_name == "is_null") {
-                         auto ref = call->arguments[0].field_ref();
-                         return !ref;
-                       }
-
-                       return true;
-                     });
-
-  for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) {
-    auto call = CallNotNull(*it);
-
-    if (call->function_name == "equal") {
-      auto ref = call->arguments[0].field_ref();
-      auto lit = call->arguments[1].literal();
-      known_values->emplace(*ref, *lit);
-    } else if (call->function_name == "is_null") {
-      auto ref = call->arguments[0].field_ref();
-      known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
-    }
+// equal(a, 2)
+// is_null(a)
+util::optional<std::pair<FieldRef, Datum>> ExtractKnownFieldValue(

Review Comment:
   Done.



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -682,46 +715,49 @@ std::vector<Expression> GuaranteeConjunctionMembers(
   return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
 }
 
-// Conjunction members which are represented in known_values are erased from
-// conjunction_members
-Status ExtractKnownFieldValuesImpl(
-    std::vector<Expression>* conjunction_members,
-    std::unordered_map<FieldRef, Datum, FieldRef::Hash>* known_values) {
-  auto unconsumed_end =
-      std::partition(conjunction_members->begin(), conjunction_members->end(),
-                     [](const Expression& expr) {
-                       // search for an equality conditions between a field and a literal
-                       auto call = expr.call();
-                       if (!call) return true;
-
-                       if (call->function_name == "equal") {
-                         auto ref = call->arguments[0].field_ref();
-                         auto lit = call->arguments[1].literal();
-                         return !(ref && lit);
-                       }
-
-                       if (call->function_name == "is_null") {
-                         auto ref = call->arguments[0].field_ref();
-                         return !ref;
-                       }
-
-                       return true;
-                     });
-
-  for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) {
-    auto call = CallNotNull(*it);
-
-    if (call->function_name == "equal") {
-      auto ref = call->arguments[0].field_ref();
-      auto lit = call->arguments[1].literal();
-      known_values->emplace(*ref, *lit);
-    } else if (call->function_name == "is_null") {
-      auto ref = call->arguments[0].field_ref();
-      known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
-    }
+// equal(a, 2)

Review Comment:
   I turned this into a docstring.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r853427405


##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -305,19 +308,47 @@ bool Expression::IsNullLiteral() const {
   return false;
 }
 
-bool Expression::IsSatisfiable() const {
-  if (type() && type()->id() == Type::NA) {
-    return false;
+namespace {
+util::optional<compute::NullHandling::type> GetNullHandling(
+    const Expression::Call& call) {
+  DCHECK_NE(call.function, nullptr);
+  if (call.function->kind() == compute::Function::SCALAR) {
+    return static_cast<const compute::ScalarKernel*>(call.kernel)->null_handling;
   }
+  return util::nullopt;
+}
+}  // namespace
+
+bool Expression::IsSatisfiable() const {
+  if (!type()) return true;
+  if (type()->id() != Type::BOOL) return true;
 
   if (auto lit = literal()) {
     if (lit->null_count() == lit->length()) {
       return false;
     }
 
-    if (lit->is_scalar() && lit->type()->id() == Type::BOOL) {
+    if (lit->is_scalar()) {
       return lit->scalar_as<BooleanScalar>().value;
     }
+
+    return true;
+  }
+
+  if (field_ref()) return true;
+
+  auto call = CallNotNull(*this);
+
+  if (call->function_name == "invert") {
+    if (auto nested_call = call->arguments[0].call()) {
+      if (nested_call->function_name == "true_unless_null") return false;
+    }
+  }
+
+  if (call->function_name == "and_kleene") {

Review Comment:
   I'll move the explanation from https://issues.apache.org/jira/browse/ARROW-13848



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] westonpace commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
westonpace commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1104407060

   `invert(true_unless_null(i32))` being not satisfiable is probably more correct than simplifying to `literal(false)` so I wouldn't recommend that.  I agree, it sounds like we are doing the right thing, sorry for the wild goose chase.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] wjones127 commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
wjones127 commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r852193507


##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -128,17 +128,30 @@ util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
   auto maybe_min = min->CastTo(field->type());
   auto maybe_max = max->CastTo(field->type());
   if (maybe_min.ok() && maybe_max.ok()) {
-    auto col_min = maybe_min.MoveValueUnsafe();
-    auto col_max = maybe_max.MoveValueUnsafe();
-    if (col_min->Equals(col_max)) {
-      return compute::equal(std::move(field_expr), compute::literal(std::move(col_min)));
+    min = maybe_min.MoveValueUnsafe();
+    max = maybe_max.MoveValueUnsafe();
+
+    compute::Expression range;
+    if (min->Equals(max)) {
+      auto single_value = compute::equal(field_expr, compute::literal(std::move(min)));
+
+      if (statistics->null_count() == 0) {
+        return compute::and_(single_value, compute::is_valid(field_expr));
+      }
+      return compute::or_(std::move(single_value), is_null(std::move(field_expr)));
     }
 
     auto lower_bound =
-        compute::greater_equal(field_expr, compute::literal(std::move(col_min)));
-    auto upper_bound =
-        compute::less_equal(std::move(field_expr), compute::literal(std::move(col_max)));
-    return compute::and_(std::move(lower_bound), std::move(upper_bound));
+        compute::greater_equal(field_expr, compute::literal(std::move(min)));
+    auto upper_bound = compute::less_equal(field_expr, compute::literal(std::move(max)));
+
+    if (statistics->null_count() != 0) {
+      lower_bound = compute::or_(std::move(lower_bound), is_null(field_expr));
+      upper_bound = compute::or_(std::move(upper_bound), is_null(std::move(field_expr)));
+      return compute::and_(std::move(lower_bound), std::move(upper_bound));
+    }
+    return compute::and_(compute::and_(std::move(lower_bound), std::move(upper_bound)),
+                         compute::is_valid(field_expr));
   }

Review Comment:
   Ah thanks for the pointer on `num_values`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r853420864


##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -128,17 +128,30 @@ util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
   auto maybe_min = min->CastTo(field->type());
   auto maybe_max = max->CastTo(field->type());
   if (maybe_min.ok() && maybe_max.ok()) {
-    auto col_min = maybe_min.MoveValueUnsafe();
-    auto col_max = maybe_max.MoveValueUnsafe();
-    if (col_min->Equals(col_max)) {
-      return compute::equal(std::move(field_expr), compute::literal(std::move(col_min)));
+    min = maybe_min.MoveValueUnsafe();
+    max = maybe_max.MoveValueUnsafe();
+
+    compute::Expression range;
+    if (min->Equals(max)) {
+      auto single_value = compute::equal(field_expr, compute::literal(std::move(min)));
+
+      if (statistics->null_count() == 0) {
+        return compute::and_(single_value, compute::is_valid(field_expr));

Review Comment:
   Removing this does break a test, but it's because right now `i64 > 1` doesn't cause `is_null(i64)` to simplify - we need to be a little smarter here. Will fix that.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r853417249


##########
cpp/src/arrow/util/vector.h:
##########
@@ -77,9 +77,14 @@ std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
 }
 
 template <typename T, typename Predicate>
-std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
-  auto new_end =
-      std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
+std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate,
+                            std::vector<T>* filtered_out = NULLPTR) {

Review Comment:
   It's not used indeed. But the change is still needed since it actually inverts what FilterVector does! (The current FilterVector is backwards of what you would expect…)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] westonpace commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
westonpace commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r853522205


##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+        return simplified_to(lhs, false);
+      }
+
+      return expr;
+    }
+
+    if (guarantee.cmp & cmp_rhs_bound) {
+      // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
+      return expr;
+    }
+
+    if (*cmp & Comparison::GetFlipped(cmp_rhs_bound)) {
+      // x > 1, x >= 1, x != 1 guaranteed by x >= 3

Review Comment:
   This is hard to reason but I agree with the conclusions :)
   ```
   x > 3, x >= 3 always true if guaranteed x > 5
     * cmp_rhs_bound will be <
     * cmp will be > or >=
     * guarantee.cmp will be >
     * Your logic simplifies to true (correct)
   x < 5, x <= 5 always true if guaranteed x < 2
     * cmp_rhs_bound will be >
     * cmp will be < or <=
     * guarantee.cmp will be <
     * Your logic simplifies to true (correct)
   x != 5 always true if guaranteed x < 3 or x > 7
     * cmp_rhs_bound will be > or <
     * cmp will be <>
     * guarantee.cmp will be < or > (always disjoint with cmp_rhs_bound)
     * Your logic simplifies to true (correct)
   x > 5, x >= 5 always false if guaranteed x < 3
     * cmp_rhs_bound is >
     * cmp is > or >=
     * guarantee.cmp is <
     * Your logic simplifies to false (correct)
   x < 5, x <= 5 always false if guaranteed x > 7
     * cmp_rhs_bound is <
     * cmp is < or <=
     * guarantee.cmp is >
     * Your logic simplifies to false (correct)
   x == 5 always false if guaranteed x > 7 or x < 3
     * cmp_rhs_bound is < or >
     * cmp is ==
     * guarantee.cmp is > or < (always disjoint with cmp_rhs_bound)
     * Your logic simplifies to false (correct)
   ```
   



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -879,79 +918,183 @@ Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_cont
 
 namespace {
 
-Result<Expression> DirectComparisonSimplification(Expression expr,
-                                                  const Expression::Call& guarantee) {
-  return Modify(
-      std::move(expr), [](Expression expr) { return expr; },
-      [&guarantee](Expression expr, ...) -> Result<Expression> {
-        auto call = expr.call();
-        if (!call) return expr;
+// An inequality comparison which a target Expression is known to satisfy. If nullable,
+// the target may evaluate to null in addition to values satisfying the comparison.
+struct Inequality {
+  Comparison::type cmp;
+  const FieldRef& target;
+  const Datum& bound;
+  bool nullable;
+
+  // Extract an Inequality if possible, derived from "less",
+  // "greater", "less_equal", and "greater_equal" expressions,
+  // possibly disjuncted with an "is_null" Expression.
+  // cmp(a, 2)
+  // cmp(a, 2) or is_null(a)
+  static util::optional<Inequality> ExtractOne(const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
+
+    if (call->function_name == "or_kleene") {
+      // expect the LHS to be a usable field inequality
+      auto out = ExtractOneFromComparison(call->arguments[0]);
+      if (!out) return util::nullopt;
+
+      // expect the RHS to be an is_null expression
+      auto call_rhs = call->arguments[1].call();
+      if (!call_rhs) return util::nullopt;
+      if (call_rhs->function_name != "is_null") return util::nullopt;
+
+      // ... and that it references the same target
+      auto target = call_rhs->arguments[0].field_ref();
+      if (!target) return util::nullopt;
+      if (*target != out->target) return util::nullopt;
+
+      out->nullable = true;
+      return out;
+    }
 
-        // Ensure both calls are comparisons with equal LHS and scalar RHS
-        auto cmp = Comparison::Get(expr);
-        auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+    // fall back to a simple comparison with no "is_null"
+    return ExtractOneFromComparison(guarantee);
+  }
 
-        if (!cmp) return expr;
-        if (!cmp_guarantee) return expr;
+  static util::optional<Inequality> ExtractOneFromComparison(
+      const Expression& guarantee) {
+    auto call = guarantee.call();
+    if (!call) return util::nullopt;
 
-        const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
-        const auto& guarantee_lhs = guarantee.arguments[0];
-        if (lhs != guarantee_lhs) return expr;
+    if (auto cmp = Comparison::Get(call->function_name)) {
+      // not_equal comparisons are not very usable as guarantees
+      if (*cmp == Comparison::NOT_EQUAL) return util::nullopt;
 
-        auto rhs = call->arguments[1].literal();
-        auto guarantee_rhs = guarantee.arguments[1].literal();
+      auto target = call->arguments[0].field_ref();
+      if (!target) return util::nullopt;
 
-        if (!rhs) return expr;
-        if (!rhs->is_scalar()) return expr;
+      auto bound = call->arguments[1].literal();
+      if (!bound) return util::nullopt;
+      if (!bound->is_scalar()) return util::nullopt;
 
-        if (!guarantee_rhs) return expr;
-        if (!guarantee_rhs->is_scalar()) return expr;
+      return Inequality{*cmp, /*target=*/*target, *bound, /*nullable=*/false};
+    }
 
-        ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
-                              Comparison::Execute(*rhs, *guarantee_rhs));
-        DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+    return util::nullopt;
+  }
 
-        if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
-          // RHS of filter is equal to RHS of guarantee
+  /// The given expression simplifies to `value` if the inequality
+  /// target is not nullable. Otherwise, it simplifies to either a
+  /// call to true_unless_null or !true_unless_null.
+  Result<Expression> simplified_to(const Expression& bound_target, bool value) const {
+    if (!nullable) return literal(value);
+
+    ExecContext exec_context;
+
+    // Data may be null, so comparison will yield `value` - or null IFF the data was null
+    //
+    // true_unless_null is cheap; it purely reuses the validity bitmap for the values
+    // buffer. Inversion is less cheap but we expect that term never to be evaluated
+    // since invert(true_unless_null(x)) is not satisfiable.
+    Expression::Call call;
+    call.function_name = "true_unless_null";
+    call.arguments = {bound_target};
+    ARROW_ASSIGN_OR_RAISE(
+        auto true_unless_null,
+        BindNonRecursive(std::move(call),
+                         /*insert_implicit_casts=*/false, &exec_context));
+    if (value) return true_unless_null;
+
+    Expression::Call invert;
+    invert.function_name = "invert";
+    invert.arguments = {std::move(true_unless_null)};
+    return BindNonRecursive(std::move(invert),
+                            /*insert_implicit_casts=*/false, &exec_context);
+  }
 
-          if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
-            // guarantee is a subset of filter, so all data will be included
-            // x > 1, x >= 1, x != 1 guaranteed by x > 1
-            return literal(true);
-          }
+  /// \brief Simplify the given expression given this inequality as a guarantee.
+  Result<Expression> Simplify(Expression expr) {
+    const auto& guarantee = *this;
 
-          if ((*cmp & *cmp_guarantee) == 0) {
-            // guarantee disjoint with filter, so all data will be excluded
-            // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
-            return literal(false);
-          }
+    auto call = expr.call();
+    if (!call) return expr;
 
-          return expr;
-        }
+    auto cmp = Comparison::Get(expr);
+    if (!cmp) return expr;
 
-        if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
-          // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
-          return expr;
-        }
+    auto rhs = call->arguments[1].literal();
+    if (!rhs) return expr;
+    if (!rhs->is_scalar()) return expr;
 
-        if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
-          // x > 1, x >= 1, x != 1 guaranteed by x >= 3
-          return literal(true);
-        } else {
-          // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
-          return literal(false);
-        }
+    const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+    if (!lhs.field_ref()) return expr;
+    if (*lhs.field_ref() != guarantee.target) return expr;
+
+    ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_bound, Comparison::Execute(*rhs, guarantee.bound));
+    DCHECK_NE(cmp_rhs_bound, Comparison::NA);
+
+    if (cmp_rhs_bound == Comparison::EQUAL) {
+      // RHS of filter is equal to RHS of guarantee
+
+      if ((*cmp & guarantee.cmp) == guarantee.cmp) {
+        // guarantee is a subset of filter, so all data will be included
+        // x > 1, x >= 1, x != 1 guaranteed by x > 1
+        return simplified_to(lhs, true);
+      }
+
+      if ((*cmp & guarantee.cmp) == 0) {
+        // guarantee disjoint with filter, so all data will be excluded
+        // x > 1, x >= 1, x != 1 unsatisfiable if x == 1

Review Comment:
   `x >= 1` is satisfiable if `x == 1` (those two are not disjoint).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] ursabot commented on pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

Posted by GitBox <gi...@apache.org>.
ursabot commented on PR #12891:
URL: https://github.com/apache/arrow/pull/12891#issuecomment-1107853290

   Benchmark runs are scheduled for baseline = 20ec0fda708b72e4398e422f8bc3ee8ef0a76528 and contender = 0e03af446c328d0ef963510c3292cb14e092b917. 0e03af446c328d0ef963510c3292cb14e092b917 is a master commit associated with this PR. Results will be available as each benchmark for each run completes.
   Conbench compare runs links:
   [Finished :arrow_down:0.0% :arrow_up:0.0%] [ec2-t3-xlarge-us-east-2](https://conbench.ursa.dev/compare/runs/b1a92ea63a9e4603a6f0fe244a3cdf83...9cc1b961baa54b18841206d704cdd139/)
   [Failed] [test-mac-arm](https://conbench.ursa.dev/compare/runs/8cd3e83c596948bba25f69edbb1aa1a9...b6a32d1850fb458cae8f69d4b852e59d/)
   [Failed :arrow_down:0.0% :arrow_up:0.0%] [ursa-i9-9960x](https://conbench.ursa.dev/compare/runs/ef8f169fd3a745899121fa31ba8a7834...ddf86dc5a07446018930a71d758a32d1/)
   [Finished :arrow_down:0.42% :arrow_up:0.34%] [ursa-thinkcentre-m75q](https://conbench.ursa.dev/compare/runs/1d968edc9cf64c30a6f38cb474e8bd7d...860a9578bcfc4505bb5cc146d87ee0af/)
   Buildkite builds:
   [Finished] <https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ec2-t3-xlarge-us-east-2/builds/576| `0e03af44` ec2-t3-xlarge-us-east-2>
   [Failed] <https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-test-mac-arm/builds/564| `0e03af44` test-mac-arm>
   [Failed] <https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-i9-9960x/builds/562| `0e03af44` ursa-i9-9960x>
   [Finished] <https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-thinkcentre-m75q/builds/574| `0e03af44` ursa-thinkcentre-m75q>
   [Finished] <https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ec2-t3-xlarge-us-east-2/builds/575| `20ec0fda` ec2-t3-xlarge-us-east-2>
   [Failed] <https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-test-mac-arm/builds/563| `20ec0fda` test-mac-arm>
   [Failed] <https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-i9-9960x/builds/561| `20ec0fda` ursa-i9-9960x>
   [Finished] <https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-thinkcentre-m75q/builds/573| `20ec0fda` ursa-thinkcentre-m75q>
   Supported benchmarks:
   ec2-t3-xlarge-us-east-2: Supported benchmark langs: Python, R. Runs only benchmarks with cloud = True
   test-mac-arm: Supported benchmark langs: C++, Python, R
   ursa-i9-9960x: Supported benchmark langs: Python, R, JavaScript
   ursa-thinkcentre-m75q: Supported benchmark langs: C++, Java
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org