You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/12/22 09:18:59 UTC
(arrow-rs) branch master updated: Minor: Improve comments and errors for ArrowPredicate (#5230)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 41fda0bb98 Minor: Improve comments and errors for ArrowPredicate (#5230)
41fda0bb98 is described below
commit 41fda0bb9886b163ff93511343b4b2bc770e62ca
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Fri Dec 22 04:18:52 2023 -0500
Minor: Improve comments and errors for ArrowPredicate (#5230)
* Minor: Improve comments and errors for ArrowPredicate
* Improve comments
---
parquet/src/arrow/arrow_reader/filter.rs | 6 ++++--
parquet/src/arrow/arrow_reader/mod.rs | 24 +++++++++++++++++++-----
parquet/src/arrow/arrow_reader/selection.rs | 14 ++++++++++----
3 files changed, 33 insertions(+), 11 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs
index ea529b74f6..a80255f413 100644
--- a/parquet/src/arrow/arrow_reader/filter.rs
+++ b/parquet/src/arrow/arrow_reader/filter.rs
@@ -29,8 +29,10 @@ pub trait ArrowPredicate: Send + 'static {
/// Evaluate this predicate for the given [`RecordBatch`] containing the columns
/// identified by [`Self::projection`]
///
- /// Rows that are `true` in the returned [`BooleanArray`] will be returned by the
- /// parquet reader, whereas rows that are `false` or `Null` will not be
+ /// Must return a [`BooleanArray`] that has the same length as the input
+ /// `batch` where each row indicates whether the row should be returned:
+ /// * `true`:the row should be returned
+ /// * `false` or `null`: the row should not be returned
fn evaluate(&mut self, batch: RecordBatch) -> Result<BooleanArray, ArrowError>;
}
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 77de839940..52d7249a29 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -677,11 +677,16 @@ pub(crate) fn apply_range(
selection
}
-/// Evaluates an [`ArrowPredicate`] returning the [`RowSelection`]
+/// Evaluates an [`ArrowPredicate`], returning a [`RowSelection`] indicating
+/// which rows to return.
///
-/// If this [`ParquetRecordBatchReader`] has a [`RowSelection`], the
-/// returned [`RowSelection`] will be the conjunction of this and
-/// the rows selected by `predicate`
+/// `input_selection`: Optional pre-existing selection. If `Some`, then the
+/// final [`RowSelection`] will be the conjunction of it and the rows selected
+/// by `predicate`.
+///
+/// Note: A pre-existing selection may come from evaluating a previous predicate
+/// or if the [`ParquetRecordBatchReader`] specified an explicit
+/// [`RowSelection`] in addition to one or more predicates.
pub(crate) fn evaluate_predicate(
batch_size: usize,
array_reader: Box<dyn ArrayReader>,
@@ -691,7 +696,16 @@ pub(crate) fn evaluate_predicate(
let reader = ParquetRecordBatchReader::new(batch_size, array_reader, input_selection.clone());
let mut filters = vec![];
for maybe_batch in reader {
- let filter = predicate.evaluate(maybe_batch?)?;
+ let maybe_batch = maybe_batch?;
+ let input_rows = maybe_batch.num_rows();
+ let filter = predicate.evaluate(maybe_batch)?;
+ // Since user supplied predicate, check error here to catch bugs quickly
+ if filter.len() != input_rows {
+ return Err(arrow_err!(
+ "ArrowPredicate predicate returned {} rows, expected {input_rows}",
+ filter.len()
+ ));
+ }
match filter.null_count() {
0 => filters.push(filter),
_ => filters.push(prep_null_mask_filter(&filter)),
diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs
index cebf3f9d38..82f2146129 100644
--- a/parquet/src/arrow/arrow_reader/selection.rs
+++ b/parquet/src/arrow/arrow_reader/selection.rs
@@ -241,16 +241,22 @@ impl RowSelection {
selectors: remaining,
}
}
- /// Given a [`RowSelection`] computed under `self`, returns the [`RowSelection`]
- /// representing their conjunction
+ /// returns a [`RowSelection`] representing rows that are selected in both
+ /// input [`RowSelection`]s.
///
- /// For example:
+ /// This is equivalent to the logical `AND` / conjunction of the two
+ /// selections.
+ ///
+ /// # Example
+ /// If `N` means the row is not selected, and `Y` means it is
+ /// selected:
///
+ /// ```text
/// self: NNNNNNNNNNNNYYYYYYYYYYYYYYYYYYYYYYNNNYYYYY
/// other: YYYYYNNNNYYYYYYYYYYYYY YYNNN
///
/// returned: NNNNNNNNNNNNYYYYYNNNNYYYYYYYYYYYYYNNNYYNNN
- ///
+ /// ```
///
/// # Panics
///