You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/08/05 08:36:56 UTC

[arrow-rs] branch master updated: fix: Fix skip error in calculate_row_count. (#2329)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new b6eaf2243 fix: Fix skip error in calculate_row_count. (#2329)
b6eaf2243 is described below

commit b6eaf2243dda9c09806972452c4261a6a9eb4741
Author: Yang Jiang <ya...@ebay.com>
AuthorDate: Fri Aug 5 16:36:51 2022 +0800

    fix: Fix skip error in calculate_row_count. (#2329)
    
    * Fix skip error calculate_row_count error
    
    * fix ut
---
 parquet/src/file/serialized_reader.rs |  4 +++-
 parquet/src/util/page_util.rs         | 44 ++++++++++++++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index 766813f11..034d70e35 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -1512,7 +1512,9 @@ mod tests {
             if i != 351 {
                 assert!((meta.num_rows == 21) || (meta.num_rows == 20));
             } else {
-                assert_eq!(meta.num_rows, 11);
+                // The last page's first row index is 7290 and the total row count is 7300.
+                // Because row indexes are zero-based, the last page holds 7300 - 7290 = 10 rows.
+                assert_eq!(meta.num_rows, 10);
             }
             assert!(!meta.is_dict);
             vec.push(meta);
diff --git a/parquet/src/util/page_util.rs b/parquet/src/util/page_util.rs
index 5cdcf7535..7716b7116 100644
--- a/parquet/src/util/page_util.rs
+++ b/parquet/src/util/page_util.rs
@@ -25,7 +25,8 @@ use crate::file::reader::ChunkReader;
 /// Use column chunk's offset index to get the `page_num` page row count.
 pub(crate) fn calculate_row_count(indexes: &[PageLocation], page_num: usize, total_row_count: i64) -> Result<usize> {
     if page_num == indexes.len() - 1 {
-        Ok((total_row_count - indexes[page_num].first_row_index + 1) as usize)
+        // first_row_index is zero-based, so there is no need to add one.
+        Ok((total_row_count - indexes[page_num].first_row_index) as usize)
     } else {
         Ok((indexes[page_num + 1].first_row_index - indexes[page_num].first_row_index) as usize)
     }
@@ -52,3 +53,44 @@ pub(crate) fn get_pages_readable_slices<T: Read + Send, R: ChunkReader<T=T>>(col
     }
     Ok((page_readers, has_dictionary_page))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /**
+     parquet-tools meta  ./test.parquet got:
+
+                file schema: test_schema
+        --------------------------------------------------------------------------------
+        leaf:        REQUIRED INT64 R:0 D:
+
+            row group 1: RC:256 TS:2216 OFFSET:4
+        --------------------------------------------------------------------------------
+        leaf:         INT64 UNCOMPRESSED DO:0 FPO:4 SZ:2216/2216/1.00 VC:256 ENC:PLAIN,RLE ST:[min: 0, max: 255, num_nulls not defined
+
+    parquet-tools column-index -c leaf ./test.parquet got:
+
+            offset index for column leaf:
+                              offset   compressed size       first row index
+        page-0                         4               554                     0
+        page-1                       558               554                    64
+        page-2                      1112               554                   128
+        page-3                      1666               554                   192
+
+    **/
+    #[test]
+    fn test_calculate_row_count() {
+        let total_row_count = 256;
+        let mut  indexes = vec![];
+        indexes.push(PageLocation::new(4, 554, 0));
+        indexes.push(PageLocation::new(558, 554, 64));
+        indexes.push(PageLocation::new(1112, 554, 128));
+        indexes.push(PageLocation::new(1666, 554, 192));
+        for i in 0..4 {
+            // each page should have 64 rows.
+            assert_eq!(64, calculate_row_count(indexes.as_slice(), i, total_row_count).unwrap());
+        }
+
+    }
+}