You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/11/02 16:09:19 UTC

[arrow-rs] branch master updated: Add `RowSelection::from_selectors_and_combine` to merge RowSelectors (#2994)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new f11372cb8 Add `RowSelection::from_selectors_and_combine` to  merge RowSelectors  (#2994)
f11372cb8 is described below

commit f11372cb8ff4d6fecbe1bd7b5ef3d66cba719c83
Author: Yang Jiang <ya...@ebay.com>
AuthorDate: Thu Nov 3 00:09:13 2022 +0800

    Add `RowSelection::from_selectors_and_combine` to  merge RowSelectors  (#2994)
    
    * Support merge RowSelectors when creating RowSelection
    
    * remove useless
    
    * change it to default from
    
    * Update parquet/src/arrow/arrow_reader/selection.rs
    
    Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
    
    * fix comment
    
    Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
---
 parquet/src/arrow/arrow_reader/selection.rs | 124 +++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 1 deletion(-)

diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs
index f1270926b..2328c4501 100644
--- a/parquet/src/arrow/arrow_reader/selection.rs
+++ b/parquet/src/arrow/arrow_reader/selection.rs
@@ -117,6 +117,38 @@ impl RowSelection {
         Self { selectors }
     }
 
+    /// Creates a [`RowSelection`] from a slice of possibly uncombined [`RowSelector`]s, merging
+    /// each run of adjacent selectors that share the same `skip` flag into a single selector,
+    /// e.g. `[skip(5), skip(5), select(10)]` becomes `[skip(10), select(10)]`.
+    /// # Note
+    ///  [`RowSelection`] must be combined prior to use within offset_index or else the code will panic.
+    fn from_selectors_and_combine(selectors: &[RowSelector]) -> Self {
+        if selectors.len() < 2 { // zero or one selector: nothing to merge, copy as-is
+            return Self {
+                selectors: Vec::from(selectors),
+            };
+        }
+        let first = selectors.first().unwrap(); // cannot fail: len >= 2 checked above
+        let mut sum_rows = first.row_count; // rows accumulated in the current run
+        let mut skip = first.skip; // kind (skip vs. select) of the current run
+        let mut combined_result = vec![];
+
+        for s in selectors.iter().skip(1) {
+            if s.skip == skip { // same kind as the current run: extend it
+                sum_rows += s.row_count
+            } else { // kind changed: emit the finished run and start a new one
+                add_selector(skip, sum_rows, &mut combined_result);
+                sum_rows = s.row_count; // NOTE(review): a zero-row selector still starts a new run
+                skip = s.skip;
+            }
+        }
+        add_selector(skip, sum_rows, &mut combined_result); // flush the final run
+
+        Self {
+            selectors: combined_result,
+        }
+    }
+
     /// Given an offset index, return the offset ranges for all data pages selected by `self`
     #[cfg(any(test, feature = "async"))]
     pub(crate) fn scan_ranges(
@@ -307,7 +339,7 @@ impl RowSelection {
 
 impl From<Vec<RowSelector>> for RowSelection {
     fn from(selectors: Vec<RowSelector>) -> Self {
-        Self { selectors }
+        Self::from_selectors_and_combine(selectors.as_slice()) // merge adjacent runs
     }
 }
 
@@ -317,6 +349,15 @@ impl From<RowSelection> for VecDeque<RowSelector> {
     }
 }
 
+fn add_selector(skip: bool, sum_row: usize, combined_result: &mut Vec<RowSelector>) {
+    let selector = if skip { // build one selector of the requested kind covering `sum_row` rows
+        RowSelector::skip(sum_row)
+    } else {
+        RowSelector::select(sum_row)
+    };
+    combined_result.push(selector); // append it to the caller's output list
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -470,6 +511,87 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_combine() {
+        let a = vec![ // two leading skips (3 + 3) should merge into skip(6)
+            RowSelector::skip(3),
+            RowSelector::skip(3),
+            RowSelector::select(10),
+            RowSelector::skip(4),
+        ];
+
+        let b = vec![ // same as `a` plus a trailing zero-row skip
+            RowSelector::skip(3),
+            RowSelector::skip(3),
+            RowSelector::select(10),
+            RowSelector::skip(4),
+            RowSelector::skip(0), // folds into the preceding skip(4)
+        ];
+
+        let c = vec![ // longer runs: skips 2 + 4, selects 3 + 3 + 4, skips 3 + 1 + 0
+            RowSelector::skip(2),
+            RowSelector::skip(4),
+            RowSelector::select(3),
+            RowSelector::select(3),
+            RowSelector::select(4),
+            RowSelector::skip(3),
+            RowSelector::skip(1),
+            RowSelector::skip(0),
+        ];
+
+        let expected = RowSelection::from(vec![ // all three inputs reduce to this
+            RowSelector::skip(6),
+            RowSelector::select(10),
+            RowSelector::skip(4),
+        ]);
+
+        assert_eq!(RowSelection::from_selectors_and_combine(&a), expected);
+        assert_eq!(RowSelection::from_selectors_and_combine(&b), expected);
+        assert_eq!(RowSelection::from_selectors_and_combine(&c), expected);
+    }
+
+    #[test]
+    fn test_combine_2elements() {
+        let a = vec![RowSelector::select(10), RowSelector::select(5)]; // select + select: merged
+        let a_expect = vec![RowSelector::select(15)];
+        assert_eq!(
+            RowSelection::from_selectors_and_combine(&a).selectors,
+            a_expect
+        );
+
+        let b = vec![RowSelector::select(10), RowSelector::skip(5)]; // select + skip: unchanged
+        let b_expect = vec![RowSelector::select(10), RowSelector::skip(5)];
+        assert_eq!(
+            RowSelection::from_selectors_and_combine(&b).selectors,
+            b_expect
+        );
+
+        let c = vec![RowSelector::skip(10), RowSelector::select(5)]; // skip + select: unchanged
+        let c_expect = vec![RowSelector::skip(10), RowSelector::select(5)];
+        assert_eq!(
+            RowSelection::from_selectors_and_combine(&c).selectors,
+            c_expect
+        );
+
+        let d = vec![RowSelector::skip(10), RowSelector::skip(5)]; // skip + skip: merged
+        let d_expect = vec![RowSelector::skip(15)];
+        assert_eq!(
+            RowSelection::from_selectors_and_combine(&d).selectors,
+            d_expect
+        );
+    }
+
+    #[test]
+    fn test_from_one_and_empty() {
+        let a = vec![RowSelector::select(10)]; // single selector: `From` must return it unchanged
+        let selection1 = RowSelection::from(a.clone());
+        assert_eq!(selection1.selectors, a);
+
+        let b = vec![]; // empty input: `From` must yield an empty selection
+        let selection1 = RowSelection::from(b.clone());
+        assert_eq!(selection1.selectors, b)
+    }
+
     #[test]
     #[should_panic(expected = "selection exceeds the number of selected rows")]
     fn test_and_longer() {