You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/12/22 09:18:59 UTC

(arrow-rs) branch master updated: Minor: Improve comments and errors for ArrowPredicate (#5230)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 41fda0bb98 Minor: Improve comments and errors for ArrowPredicate (#5230)
41fda0bb98 is described below

commit 41fda0bb9886b163ff93511343b4b2bc770e62ca
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Fri Dec 22 04:18:52 2023 -0500

    Minor: Improve comments and errors for ArrowPredicate (#5230)
    
    * Minor: Improve comments and errors for ArrowPredicate
    
    * Improve comments
---
 parquet/src/arrow/arrow_reader/filter.rs    |  6 ++++--
 parquet/src/arrow/arrow_reader/mod.rs       | 24 +++++++++++++++++++-----
 parquet/src/arrow/arrow_reader/selection.rs | 14 ++++++++++----
 3 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs
index ea529b74f6..a80255f413 100644
--- a/parquet/src/arrow/arrow_reader/filter.rs
+++ b/parquet/src/arrow/arrow_reader/filter.rs
@@ -29,8 +29,10 @@ pub trait ArrowPredicate: Send + 'static {
     /// Evaluate this predicate for the given [`RecordBatch`] containing the columns
     /// identified by [`Self::projection`]
     ///
-    /// Rows that are `true` in the returned [`BooleanArray`] will be returned by the
-    /// parquet reader, whereas rows that are `false` or `Null` will not be
+    /// Must return a  [`BooleanArray`] that has the same length as the input
+    /// `batch` where each row indicates whether the row should be returned:
+    /// * `true`:the row should be returned
+    /// * `false` or `null`: the row should not be returned
     fn evaluate(&mut self, batch: RecordBatch) -> Result<BooleanArray, ArrowError>;
 }
 
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 77de839940..52d7249a29 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -677,11 +677,16 @@ pub(crate) fn apply_range(
     selection
 }
 
-/// Evaluates an [`ArrowPredicate`] returning the [`RowSelection`]
+/// Evaluates an [`ArrowPredicate`], returning a [`RowSelection`] indicating
+/// which rows to return.
 ///
-/// If this [`ParquetRecordBatchReader`] has a [`RowSelection`], the
-/// returned [`RowSelection`] will be the conjunction of this and
-/// the rows selected by `predicate`
+/// `input_selection`: Optional pre-existing selection. If `Some`, then the
+/// final [`RowSelection`] will be the conjunction of it and the rows selected
+/// by `predicate`.
+///
+/// Note: A pre-existing selection may come from evaluating a previous predicate
+/// or if the [`ParquetRecordBatchReader`] specified an explicit
+/// [`RowSelection`] in addition to one or more predicates.
 pub(crate) fn evaluate_predicate(
     batch_size: usize,
     array_reader: Box<dyn ArrayReader>,
@@ -691,7 +696,16 @@ pub(crate) fn evaluate_predicate(
     let reader = ParquetRecordBatchReader::new(batch_size, array_reader, input_selection.clone());
     let mut filters = vec![];
     for maybe_batch in reader {
-        let filter = predicate.evaluate(maybe_batch?)?;
+        let maybe_batch = maybe_batch?;
+        let input_rows = maybe_batch.num_rows();
+        let filter = predicate.evaluate(maybe_batch)?;
+        // Since user supplied predicate, check error here to catch bugs quickly
+        if filter.len() != input_rows {
+            return Err(arrow_err!(
+                "ArrowPredicate predicate returned {} rows, expected {input_rows}",
+                filter.len()
+            ));
+        }
         match filter.null_count() {
             0 => filters.push(filter),
             _ => filters.push(prep_null_mask_filter(&filter)),
diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs
index cebf3f9d38..82f2146129 100644
--- a/parquet/src/arrow/arrow_reader/selection.rs
+++ b/parquet/src/arrow/arrow_reader/selection.rs
@@ -241,16 +241,22 @@ impl RowSelection {
             selectors: remaining,
         }
     }
-    /// Given a [`RowSelection`] computed under `self`, returns the [`RowSelection`]
-    /// representing their conjunction
+    /// returns a [`RowSelection`] representing rows that are selected in both
+    /// input [`RowSelection`]s.
     ///
-    /// For example:
+    /// This is equivalent to the logical `AND` / conjunction of the two
+    /// selections.
+    ///
+    /// # Example
+    /// If `N` means the row is not selected, and `Y` means it is
+    /// selected:
     ///
+    /// ```text
     /// self:     NNNNNNNNNNNNYYYYYYYYYYYYYYYYYYYYYYNNNYYYYY
     /// other:                YYYYYNNNNYYYYYYYYYYYYY   YYNNN
     ///
     /// returned: NNNNNNNNNNNNYYYYYNNNNYYYYYYYYYYYYYNNNYYNNN
-    ///
+    /// ```
     ///
     /// # Panics
     ///