You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/08/05 08:36:56 UTC
[arrow-rs] branch master updated: fix: Fix skip error in calculate_row_count. (#2329)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new b6eaf2243 fix: Fix skip error in calculate_row_count. (#2329)
b6eaf2243 is described below
commit b6eaf2243dda9c09806972452c4261a6a9eb4741
Author: Yang Jiang <ya...@ebay.com>
AuthorDate: Fri Aug 5 16:36:51 2022 +0800
fix: Fix skip error in calculate_row_count. (#2329)
* Fix skip error calculate_row_count error
* fix ut
---
parquet/src/file/serialized_reader.rs | 4 +++-
parquet/src/util/page_util.rs | 44 ++++++++++++++++++++++++++++++++++-
2 files changed, 46 insertions(+), 2 deletions(-)
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index 766813f11..034d70e35 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -1512,7 +1512,9 @@ mod tests {
if i != 351 {
assert!((meta.num_rows == 21) || (meta.num_rows == 20));
} else {
- assert_eq!(meta.num_rows, 11);
+ // The last page's first row index is 7290 and the total row count is 7300.
+ // Because row indexes are zero-based, the last page holds 7300 - 7290 = 10 rows.
+ assert_eq!(meta.num_rows, 10);
}
assert!(!meta.is_dict);
vec.push(meta);
diff --git a/parquet/src/util/page_util.rs b/parquet/src/util/page_util.rs
index 5cdcf7535..7716b7116 100644
--- a/parquet/src/util/page_util.rs
+++ b/parquet/src/util/page_util.rs
@@ -25,7 +25,8 @@ use crate::file::reader::ChunkReader;
/// Use column chunk's offset index to get the `page_num` page row count.
pub(crate) fn calculate_row_count(indexes: &[PageLocation], page_num: usize, total_row_count: i64) -> Result<usize> {
if page_num == indexes.len() - 1 {
- Ok((total_row_count - indexes[page_num].first_row_index + 1) as usize)
+ // first_row_index is zero-based, so the difference is already the row count; no extra +1 is needed.
+ Ok((total_row_count - indexes[page_num].first_row_index) as usize)
} else {
Ok((indexes[page_num + 1].first_row_index - indexes[page_num].first_row_index) as usize)
}
@@ -52,3 +53,44 @@ pub(crate) fn get_pages_readable_slices<T: Read + Send, R: ChunkReader<T=T>>(col
}
Ok((page_readers, has_dictionary_page))
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ /**
+ parquet-tools meta ./test.parquet got:
+
+ file schema: test_schema
+ --------------------------------------------------------------------------------
+ leaf: REQUIRED INT64 R:0 D:
+
+ row group 1: RC:256 TS:2216 OFFSET:4
+ --------------------------------------------------------------------------------
+ leaf: INT64 UNCOMPRESSED DO:0 FPO:4 SZ:2216/2216/1.00 VC:256 ENC:PLAIN,RLE ST:[min: 0, max: 255, num_nulls not defined
+
+ parquet-tools column-index -c leaf ./test.parquet got:
+
+ offset index for column leaf:
+ offset compressed size first row index
+ page-0 4 554 0
+ page-1 558 554 64
+ page-2 1112 554 128
+ page-3 1666 554 192
+
+ **/
+ #[test]
+ fn test_calculate_row_count() {
+ let total_row_count = 256;
+ let mut indexes = vec![];
+ indexes.push(PageLocation::new(4, 554, 0));
+ indexes.push(PageLocation::new(558, 554, 64));
+ indexes.push(PageLocation::new(1112, 554, 128));
+ indexes.push(PageLocation::new(1666, 554, 192));
+ for i in 0..4 {
+ // Each page should have 64 rows.
+ assert_eq!(64, calculate_row_count(indexes.as_slice(), i, total_row_count).unwrap());
+ }
+
+ }
+}