You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/11 01:42:53 UTC

[arrow-rs] branch master updated: Remove unused range module (#3085)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 01396822e Remove unused range module (#3085)
01396822e is described below

commit 01396822eb68a90565cf8b177aab4b0ce8af40e1
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Fri Nov 11 14:42:47 2022 +1300

    Remove unused range module (#3085)
---
 parquet/src/file/page_index/mod.rs   |   3 -
 parquet/src/file/page_index/range.rs | 475 -----------------------------------
 2 files changed, 478 deletions(-)

diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs
index bb7808f16..dcc1120fc 100644
--- a/parquet/src/file/page_index/mod.rs
+++ b/parquet/src/file/page_index/mod.rs
@@ -17,6 +17,3 @@
 
 pub mod index;
 pub mod index_reader;
-
-#[cfg(test)]
-pub(crate) mod range;
diff --git a/parquet/src/file/page_index/range.rs b/parquet/src/file/page_index/range.rs
deleted file mode 100644
index 816ea4025..000000000
--- a/parquet/src/file/page_index/range.rs
+++ /dev/null
@@ -1,475 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-use crate::errors::ParquetError;
-use crate::format::PageLocation;
-use std::cmp::Ordering;
-use std::collections::VecDeque;
-use std::ops::RangeInclusive;
-
-type Range = RangeInclusive<usize>;
-
-pub trait RangeOps {
-    fn is_before(&self, other: &Self) -> bool;
-
-    fn is_after(&self, other: &Self) -> bool;
-
-    fn count(&self) -> usize;
-
-    fn union(left: &Range, right: &Range) -> Option<Range>;
-
-    fn intersection(left: &Range, right: &Range) -> Option<Range>;
-}
-
-impl RangeOps for Range {
-    fn is_before(&self, other: &Range) -> bool {
-        self.end() < other.start()
-    }
-
-    fn is_after(&self, other: &Range) -> bool {
-        self.start() > other.end()
-    }
-
-    fn count(&self) -> usize {
-        self.end() + 1 - self.start()
-    }
-
-    /// Return the union of the two ranges,
-    /// Return `None` if there are hole between them.
-    fn union(left: &Range, right: &Range) -> Option<Range> {
-        if left.start() <= right.start() {
-            if left.end() + 1 >= *right.start() {
-                return Some(Range::new(
-                    *left.start(),
-                    std::cmp::max(*left.end(), *right.end()),
-                ));
-            }
-        } else if right.end() + 1 >= *left.start() {
-            return Some(Range::new(
-                *right.start(),
-                std::cmp::max(*left.end(), *right.end()),
-            ));
-        }
-        None
-    }
-
-    /// Returns the intersection of the two ranges,
-    /// return null if they are not overlapped.
-    fn intersection(left: &Range, right: &Range) -> Option<Range> {
-        if left.start() <= right.start() {
-            if left.end() >= right.start() {
-                return Some(Range::new(
-                    *right.start(),
-                    std::cmp::min(*left.end(), *right.end()),
-                ));
-            }
-        } else if right.end() >= left.start() {
-            return Some(Range::new(
-                *left.start(),
-                std::cmp::min(*left.end(), *right.end()),
-            ));
-        }
-        None
-    }
-}
-
-/// Struct representing row ranges in a row-group. These row ranges are calculated as a result of using
-/// the column index on the filtering.
-#[derive(Debug, Clone)]
-pub struct RowRanges {
-    pub ranges: VecDeque<Range>,
-}
-
-impl RowRanges {
-    //create an empty RowRanges
-    pub fn new_empty() -> Self {
-        RowRanges {
-            ranges: VecDeque::new(),
-        }
-    }
-
-    pub fn count(&self) -> usize {
-        self.ranges.len()
-    }
-
-    pub fn filter_with_mask(&self, mask: &[bool]) -> Result<RowRanges, ParquetError> {
-        if self.ranges.len() != mask.len() {
-            return Err(ParquetError::General(format!(
-                "Mask size {} is not equal to number of pages {}",
-                mask.len(),
-                self.count()
-            )));
-        }
-        let vec_range = mask
-            .iter()
-            .zip(self.ranges.clone())
-            .filter_map(|(&f, r)| if f { Some(r) } else { None })
-            .collect();
-        Ok(RowRanges { ranges: vec_range })
-    }
-
-    /// Add a range to the end of the list of ranges. It maintains the disjunctive ascending order of the ranges by
-    /// trying to union the specified range to the last ranges in the list. The specified range shall be larger than
-    /// the last one or might be overlapped with some of the last ones.
-    /// [a, b] < [c, d] if b < c
-    pub fn add(&mut self, mut range: Range) {
-        let count = self.count();
-        if count > 0 {
-            for i in 1..(count + 1) {
-                let index = count - i;
-                let last = self.ranges.get(index).unwrap();
-                assert!(!last.is_after(&range), "Must add range in ascending!");
-                // try to merge range
-                match Range::union(last, &range) {
-                    None => {
-                        break;
-                    }
-                    Some(r) => {
-                        range = r;
-                        self.ranges.remove(index);
-                    }
-                }
-            }
-        }
-        self.ranges.push_back(range);
-    }
-
-    /// Calculates the union of the two specified RowRanges object. The union of two range is calculated if there are no
-    /// elements between them. Otherwise, the two disjunctive ranges are stored separately.
-    /// For example:
-    /// [113, 241] ∪ [221, 340] = [113, 330]
-    /// [113, 230] ∪ [231, 340] = [113, 340]
-    /// while
-    /// [113, 230] ∪ [232, 340] = [113, 230], [232, 340]
-    ///
-    /// The result RowRanges object will contain all the row indexes that were contained in one of the specified objects.
-    pub fn union(mut left: RowRanges, mut right: RowRanges) -> RowRanges {
-        let v1 = &mut left.ranges;
-        let v2 = &mut right.ranges;
-        let mut result = RowRanges::new_empty();
-        if v2.is_empty() {
-            left.clone()
-        } else {
-            let mut range2 = v2.pop_front().unwrap();
-            while !v1.is_empty() {
-                let range1 = v1.pop_front().unwrap();
-                if range1.is_after(&range2) {
-                    result.add(range2);
-                    range2 = range1;
-                    std::mem::swap(v1, v2);
-                } else {
-                    result.add(range1);
-                }
-            }
-
-            result.add(range2);
-            while !v2.is_empty() {
-                result.add(v2.pop_front().unwrap())
-            }
-
-            result
-        }
-    }
-
-    /// Calculates the intersection of the two specified RowRanges object. Two ranges intersect if they have common
-    /// elements otherwise the result is empty.
-    /// For example:
-    /// [113, 241] ∩ [221, 340] = [221, 241]
-    /// while
-    /// [113, 230] ∩ [231, 340] = <EMPTY>
-    ///
-    /// The result RowRanges object will contain all the row indexes there were contained in both of the specified objects
-    #[allow(clippy::mut_range_bound)]
-    pub fn intersection(left: RowRanges, right: RowRanges) -> RowRanges {
-        let mut result = RowRanges::new_empty();
-        let mut right_index = 0;
-        for l in left.ranges.iter() {
-            for i in right_index..right.ranges.len() {
-                let r = right.ranges.get(i).unwrap();
-                if l.is_before(r) {
-                    break;
-                } else if l.is_after(r) {
-                    right_index = i + 1;
-                    continue;
-                }
-                if let Some(ra) = Range::intersection(l, r) {
-                    result.add(ra);
-                }
-            }
-        }
-        result
-    }
-
-    #[allow(unused)]
-    pub fn row_count(&self) -> usize {
-        self.ranges.iter().map(|x| x.count()).sum()
-    }
-
-    pub fn is_overlapping(&self, x: &Range) -> bool {
-        self.ranges
-            .binary_search_by(|y| -> Ordering {
-                if y.is_before(x) {
-                    Ordering::Less
-                } else if y.is_after(x) {
-                    Ordering::Greater
-                } else {
-                    Ordering::Equal
-                }
-            })
-            .is_ok()
-    }
-}
-
-/// Takes an array of [`PageLocation`], and a total number of rows, and based on the provided `page_mask`
-/// returns the corresponding [`RowRanges`] to scan
-pub fn compute_row_ranges(
-    page_mask: &[bool],
-    locations: &[PageLocation],
-    total_rows: usize,
-) -> Result<RowRanges, ParquetError> {
-    if page_mask.len() != locations.len() {
-        return Err(ParquetError::General(format!(
-            "Page_mask size {} is not equal to number of locations {}",
-            page_mask.len(),
-            locations.len(),
-        )));
-    }
-    let row_ranges = page_locations_to_row_ranges(locations, total_rows)?;
-    row_ranges.filter_with_mask(page_mask)
-}
-
-fn page_locations_to_row_ranges(
-    locations: &[PageLocation],
-    total_rows: usize,
-) -> Result<RowRanges, ParquetError> {
-    if locations.is_empty() || total_rows == 0 {
-        return Ok(RowRanges::new_empty());
-    }
-
-    // If we read directly from parquet pageIndex to construct locations,
-    // the location index should be continuous
-    let mut vec_range: VecDeque<Range> = locations
-        .windows(2)
-        .map(|x| {
-            let start = x[0].first_row_index as usize;
-            let end = (x[1].first_row_index - 1) as usize;
-            Range::new(start, end)
-        })
-        .collect();
-
-    let last = Range::new(
-        locations.last().unwrap().first_row_index as usize,
-        total_rows - 1,
-    );
-    vec_range.push_back(last);
-
-    Ok(RowRanges { ranges: vec_range })
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::basic::Type::INT32;
-    use crate::file::page_index::index::{NativeIndex, PageIndex};
-    use crate::file::page_index::range::{compute_row_ranges, Range, RowRanges};
-    use crate::format::{BoundaryOrder, PageLocation};
-
-    #[test]
-    fn test_binary_search_overlap() {
-        let mut ranges = RowRanges::new_empty();
-        ranges.add(Range::new(1, 3));
-        ranges.add(Range::new(6, 7));
-
-        assert!(ranges.is_overlapping(&Range::new(1, 2)));
-        // include both [start, end]
-        assert!(ranges.is_overlapping(&Range::new(0, 1)));
-        assert!(ranges.is_overlapping(&Range::new(0, 3)));
-
-        assert!(ranges.is_overlapping(&Range::new(0, 7)));
-        assert!(ranges.is_overlapping(&Range::new(2, 7)));
-
-        assert!(!ranges.is_overlapping(&Range::new(4, 5)));
-    }
-
-    #[test]
-    fn test_add_func_ascending_disjunctive() {
-        let mut ranges_1 = RowRanges::new_empty();
-        ranges_1.add(Range::new(1, 3));
-        ranges_1.add(Range::new(5, 6));
-        ranges_1.add(Range::new(8, 9));
-        assert_eq!(ranges_1.count(), 3);
-    }
-
-    #[test]
-    fn test_add_func_ascending_merge() {
-        let mut ranges_1 = RowRanges::new_empty();
-        ranges_1.add(Range::new(1, 3));
-        ranges_1.add(Range::new(4, 5));
-        ranges_1.add(Range::new(6, 7));
-        assert_eq!(ranges_1.count(), 1);
-    }
-
-    #[test]
-    #[should_panic(expected = "Must add range in ascending!")]
-    fn test_add_func_not_ascending() {
-        let mut ranges_1 = RowRanges::new_empty();
-        ranges_1.add(Range::new(6, 7));
-        ranges_1.add(Range::new(1, 3));
-        ranges_1.add(Range::new(4, 5));
-        assert_eq!(ranges_1.count(), 1);
-    }
-
-    #[test]
-    fn test_union_func() {
-        let mut ranges_1 = RowRanges::new_empty();
-        ranges_1.add(Range::new(1, 2));
-        ranges_1.add(Range::new(3, 4));
-        ranges_1.add(Range::new(5, 6));
-
-        let mut ranges_2 = RowRanges::new_empty();
-        ranges_2.add(Range::new(2, 3));
-        ranges_2.add(Range::new(4, 5));
-        ranges_2.add(Range::new(6, 7));
-
-        let ranges = RowRanges::union(ranges_1, ranges_2);
-        assert_eq!(ranges.count(), 1);
-        let range = ranges.ranges.get(0).unwrap();
-        assert_eq!(*range.start(), 1);
-        assert_eq!(*range.end(), 7);
-
-        let mut ranges_a = RowRanges::new_empty();
-        ranges_a.add(Range::new(1, 3));
-        ranges_a.add(Range::new(5, 8));
-        ranges_a.add(Range::new(11, 12));
-
-        let mut ranges_b = RowRanges::new_empty();
-        ranges_b.add(Range::new(0, 2));
-        ranges_b.add(Range::new(6, 7));
-        ranges_b.add(Range::new(10, 11));
-
-        let ranges = RowRanges::union(ranges_a, ranges_b);
-        assert_eq!(ranges.count(), 3);
-
-        let range_1 = ranges.ranges.get(0).unwrap();
-        assert_eq!(*range_1.start(), 0);
-        assert_eq!(*range_1.end(), 3);
-        let range_2 = ranges.ranges.get(1).unwrap();
-        assert_eq!(*range_2.start(), 5);
-        assert_eq!(*range_2.end(), 8);
-        let range_3 = ranges.ranges.get(2).unwrap();
-        assert_eq!(*range_3.start(), 10);
-        assert_eq!(*range_3.end(), 12);
-    }
-
-    #[test]
-    fn test_intersection_func() {
-        let mut ranges_1 = RowRanges::new_empty();
-        ranges_1.add(Range::new(1, 2));
-        ranges_1.add(Range::new(3, 4));
-        ranges_1.add(Range::new(5, 6));
-
-        let mut ranges_2 = RowRanges::new_empty();
-        ranges_2.add(Range::new(2, 3));
-        ranges_2.add(Range::new(4, 5));
-        ranges_2.add(Range::new(6, 7));
-
-        let ranges = RowRanges::intersection(ranges_1, ranges_2);
-        assert_eq!(ranges.count(), 1);
-        let range = ranges.ranges.get(0).unwrap();
-        assert_eq!(*range.start(), 2);
-        assert_eq!(*range.end(), 6);
-
-        let mut ranges_a = RowRanges::new_empty();
-        ranges_a.add(Range::new(1, 3));
-        ranges_a.add(Range::new(5, 8));
-        ranges_a.add(Range::new(11, 12));
-
-        let mut ranges_b = RowRanges::new_empty();
-        ranges_b.add(Range::new(0, 2));
-        ranges_b.add(Range::new(6, 7));
-        ranges_b.add(Range::new(10, 11));
-
-        let ranges = RowRanges::intersection(ranges_a, ranges_b);
-        assert_eq!(ranges.count(), 3);
-
-        let range_1 = ranges.ranges.get(0).unwrap();
-        assert_eq!(*range_1.start(), 1);
-        assert_eq!(*range_1.end(), 2);
-        let range_2 = ranges.ranges.get(1).unwrap();
-        assert_eq!(*range_2.start(), 6);
-        assert_eq!(*range_2.end(), 7);
-        let range_3 = ranges.ranges.get(2).unwrap();
-        assert_eq!(*range_3.start(), 11);
-        assert_eq!(*range_3.end(), 11);
-    }
-
-    #[test]
-    fn test_compute_one() {
-        let locations = &[PageLocation {
-            offset: 50,
-            compressed_page_size: 10,
-            first_row_index: 0,
-        }];
-        let total_rows = 10;
-
-        let row_ranges = compute_row_ranges(&[true], locations, total_rows).unwrap();
-        assert_eq!(row_ranges.count(), 1);
-        assert_eq!(row_ranges.ranges.get(0).unwrap(), &Range::new(0, 9));
-    }
-
-    #[test]
-    fn test_compute_multi() {
-        let index: NativeIndex<i32> = NativeIndex {
-            physical_type: INT32,
-            indexes: vec![
-                PageIndex {
-                    min: Some(0),
-                    max: Some(10),
-                    null_count: Some(0),
-                },
-                PageIndex {
-                    min: Some(15),
-                    max: Some(20),
-                    null_count: Some(0),
-                },
-            ],
-            boundary_order: BoundaryOrder::ASCENDING,
-        };
-        let locations = &[
-            PageLocation {
-                offset: 100,
-                compressed_page_size: 10,
-                first_row_index: 0,
-            },
-            PageLocation {
-                offset: 200,
-                compressed_page_size: 20,
-                first_row_index: 11,
-            },
-        ];
-        let total_rows = 20;
-
-        //filter `x < 11`
-        let filter =
-            |page: &PageIndex<i32>| page.max.as_ref().map(|&x| x < 11).unwrap_or(false);
-
-        let mask = index.indexes.iter().map(filter).collect::<Vec<_>>();
-
-        let row_ranges = compute_row_ranges(&mask, locations, total_rows).unwrap();
-
-        assert_eq!(row_ranges.count(), 1);
-        assert_eq!(row_ranges.ranges.get(0).unwrap(), &Range::new(0, 10));
-    }
-}