You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/15 16:46:47 UTC

[GitHub] [arrow] wjones127 commented on a diff in pull request #12891: ARROW-12659: [C++] Support is_valid as a guarantee

wjones127 commented on code in PR #12891:
URL: https://github.com/apache/arrow/pull/12891#discussion_r851338828


##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -682,46 +715,49 @@ std::vector<Expression> GuaranteeConjunctionMembers(
   return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
 }
 
-// Conjunction members which are represented in known_values are erased from
-// conjunction_members
-Status ExtractKnownFieldValuesImpl(
-    std::vector<Expression>* conjunction_members,
-    std::unordered_map<FieldRef, Datum, FieldRef::Hash>* known_values) {
-  auto unconsumed_end =
-      std::partition(conjunction_members->begin(), conjunction_members->end(),
-                     [](const Expression& expr) {
-                       // search for an equality conditions between a field and a literal
-                       auto call = expr.call();
-                       if (!call) return true;
-
-                       if (call->function_name == "equal") {
-                         auto ref = call->arguments[0].field_ref();
-                         auto lit = call->arguments[1].literal();
-                         return !(ref && lit);
-                       }
-
-                       if (call->function_name == "is_null") {
-                         auto ref = call->arguments[0].field_ref();
-                         return !ref;
-                       }
-
-                       return true;
-                     });
-
-  for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) {
-    auto call = CallNotNull(*it);
-
-    if (call->function_name == "equal") {
-      auto ref = call->arguments[0].field_ref();
-      auto lit = call->arguments[1].literal();
-      known_values->emplace(*ref, *lit);
-    } else if (call->function_name == "is_null") {
-      auto ref = call->arguments[0].field_ref();
-      known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
-    }
+// equal(a, 2)

Review Comment:
   Do we want to provide more context here?



##########
cpp/src/arrow/dataset/file_parquet.cc:
##########
@@ -128,17 +128,30 @@ util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
   auto maybe_min = min->CastTo(field->type());
   auto maybe_max = max->CastTo(field->type());
   if (maybe_min.ok() && maybe_max.ok()) {
-    auto col_min = maybe_min.MoveValueUnsafe();
-    auto col_max = maybe_max.MoveValueUnsafe();
-    if (col_min->Equals(col_max)) {
-      return compute::equal(std::move(field_expr), compute::literal(std::move(col_min)));
+    min = maybe_min.MoveValueUnsafe();
+    max = maybe_max.MoveValueUnsafe();
+
+    compute::Expression range;
+    if (min->Equals(max)) {
+      auto single_value = compute::equal(field_expr, compute::literal(std::move(min)));
+
+      if (statistics->null_count() == 0) {
+        return compute::and_(single_value, compute::is_valid(field_expr));
+      }
+      return compute::or_(std::move(single_value), is_null(std::move(field_expr)));
     }
 
     auto lower_bound =
-        compute::greater_equal(field_expr, compute::literal(std::move(col_min)));
-    auto upper_bound =
-        compute::less_equal(std::move(field_expr), compute::literal(std::move(col_max)));
-    return compute::and_(std::move(lower_bound), std::move(upper_bound));
+        compute::greater_equal(field_expr, compute::literal(std::move(min)));
+    auto upper_bound = compute::less_equal(field_expr, compute::literal(std::move(max)));
+
+    if (statistics->null_count() != 0) {
+      lower_bound = compute::or_(std::move(lower_bound), is_null(field_expr));
+      upper_bound = compute::or_(std::move(upper_bound), is_null(std::move(field_expr)));
+      return compute::and_(std::move(lower_bound), std::move(upper_bound));
+    }
+    return compute::and_(compute::and_(std::move(lower_bound), std::move(upper_bound)),
+                         compute::is_valid(field_expr));
   }

Review Comment:
   Do we want an arm for `statitics->null_count() == row_count`? In which case, we would just return `is_null(field_expr)`?



##########
cpp/src/arrow/compute/exec/expression.cc:
##########
@@ -682,46 +715,49 @@ std::vector<Expression> GuaranteeConjunctionMembers(
   return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
 }
 
-// Conjunction members which are represented in known_values are erased from
-// conjunction_members
-Status ExtractKnownFieldValuesImpl(
-    std::vector<Expression>* conjunction_members,
-    std::unordered_map<FieldRef, Datum, FieldRef::Hash>* known_values) {
-  auto unconsumed_end =
-      std::partition(conjunction_members->begin(), conjunction_members->end(),
-                     [](const Expression& expr) {
-                       // search for an equality conditions between a field and a literal
-                       auto call = expr.call();
-                       if (!call) return true;
-
-                       if (call->function_name == "equal") {
-                         auto ref = call->arguments[0].field_ref();
-                         auto lit = call->arguments[1].literal();
-                         return !(ref && lit);
-                       }
-
-                       if (call->function_name == "is_null") {
-                         auto ref = call->arguments[0].field_ref();
-                         return !ref;
-                       }
-
-                       return true;
-                     });
-
-  for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) {
-    auto call = CallNotNull(*it);
-
-    if (call->function_name == "equal") {
-      auto ref = call->arguments[0].field_ref();
-      auto lit = call->arguments[1].literal();
-      known_values->emplace(*ref, *lit);
-    } else if (call->function_name == "is_null") {
-      auto ref = call->arguments[0].field_ref();
-      known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
-    }
+// equal(a, 2)
+// is_null(a)
+util::optional<std::pair<FieldRef, Datum>> ExtractKnownFieldValue(

Review Comment:
   nit: I don't love that this is different from below function names by only a single character. Maybe something like `ExtractSingleFieldvalue` or something similar?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org