You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/07/06 14:46:46 UTC

[GitHub] [arrow-rs] Ted-Jiang commented on a diff in pull request #2012: Add page index reader test for all types and support empty index.

Ted-Jiang commented on code in PR #2012:
URL: https://github.com/apache/arrow-rs/pull/2012#discussion_r914564868


##########
parquet/src/file/serialized_reader.rs:
##########
@@ -1098,11 +1105,292 @@ mod tests {
         let offset_indexes = metadata.offset_indexes().unwrap();
         // only one row group
         assert_eq!(offset_indexes.len(), 1);
-        let offset_index = offset_indexes.get(0).unwrap();
-        let page_offset = offset_index.get(0).unwrap();
+        let offset_index = &offset_indexes[0];
+        let page_offset = &offset_index[0][0];
 
         assert_eq!(4, page_offset.offset);
         assert_eq!(152, page_offset.compressed_page_size);
         assert_eq!(0, page_offset.first_row_index);
     }
+
+    #[test]
+    fn test_page_index_reader_all_type() {
+        let test_file = get_test_file("alltypes_tiny_pages_plain.parquet");
+        let builder = ReadOptionsBuilder::new();
+        //enable read page index
+        let options = builder.with_page_index().build();
+        let reader_result = SerializedFileReader::new_with_options(test_file, options);
+        let reader = reader_result.unwrap();
+
+        // Test contents in Parquet metadata
+        let metadata = reader.metadata();
+        assert_eq!(metadata.num_row_groups(), 1);
+
+        let page_indexes = metadata.page_indexes().unwrap();
+        let row_group_offset_indexes = &metadata.offset_indexes().unwrap()[0];
+
+        // only one row group
+        assert_eq!(page_indexes.len(), 1);
+        let row_group_metadata = metadata.row_group(0);
+
+        //col0->id: INT32 UNCOMPRESSED DO:0 FPO:4 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 7299, num_nulls: 0]
+        if let Index::INT32(index) = &page_indexes[0][0] {
+            check_native_page_index(
+                index,
+                325,
+                row_group_metadata
+                    .column(0)
+                    .statistics()
+                    .unwrap()
+                    .min_bytes(),
+                row_group_metadata
+                    .column(0)
+                    .statistics()
+                    .unwrap()
+                    .max_bytes(),
+                BoundaryOrder::Unordered,
+            );
+            assert_eq!(row_group_offset_indexes[0].len(), 325);
+        } else {
+            unreachable!()
+        };
+        //col1->bool_col:BOOLEAN UNCOMPRESSED DO:0 FPO:37329 SZ:3022/3022/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: false, max: true, num_nulls: 0]
+        if let Index::BOOLEAN(index) = &page_indexes[0][1] {
+            assert_eq!(index.indexes.len(), 82);
+            assert_eq!(row_group_offset_indexes[1].len(), 82);
+        } else {
+            unreachable!()
+        };
+        //col2->tinyint_col: INT32 UNCOMPRESSED DO:0 FPO:40351 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0]
+        if let Index::INT32(index) = &page_indexes[0][2] {
+            check_native_page_index(
+                index,
+                325,
+                row_group_metadata
+                    .column(0)
+                    .statistics()
+                    .unwrap()
+                    .min_bytes(),
+                row_group_metadata
+                    .column(0)
+                    .statistics()
+                    .unwrap()
+                    .max_bytes(),
+                BoundaryOrder::Ascending,
+            );
+            assert_eq!(row_group_offset_indexes[2].len(), 325);
+        } else {
+            unreachable!()
+        };
+        //col4->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0]
+        if let Index::INT32(index) = &page_indexes[0][3] {
+            check_native_page_index(
+                index,
+                325,
+                row_group_metadata
+                    .column(0)
+                    .statistics()
+                    .unwrap()
+                    .min_bytes(),
+                row_group_metadata
+                    .column(0)
+                    .statistics()
+                    .unwrap()
+                    .max_bytes(),
+                BoundaryOrder::Ascending,
+            );
+            assert_eq!(row_group_offset_indexes[3].len(), 325);
+        } else {
+            unreachable!()
+        };
+        //col5->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0]
+        if let Index::INT32(index) = &page_indexes[0][4] {
+            check_native_page_index(
+                index,
+                325,
+                row_group_metadata
+                    .column(0)
+                    .statistics()
+                    .unwrap()
+                    .min_bytes(),
+                row_group_metadata
+                    .column(0)
+                    .statistics()
+                    .unwrap()
+                    .max_bytes(),
+                BoundaryOrder::Ascending,
+            );
+            assert_eq!(row_group_offset_indexes[4].len(), 325);
+        } else {
+            unreachable!()
+        };
+        //col6->bigint_col: INT64 UNCOMPRESSED DO:0 FPO:152326 SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 90, num_nulls: 0]
+        if let Index::INT64(index) = &page_indexes[0][5] {
+            //Todo row_group_metadata.column(0).statistics().unwrap().min_bytes() only return 4 bytes
+            check_native_page_index(

Review Comment:
   I tried to use
   ```
   row_group_metadata
                       .column(0)
                       .statistics()
                       .unwrap()
                       .min_bytes(),
   ```
   to get the min value from a column chunk's metadata for a column of type Int64, but it returns only 4 bytes...
   I think this is a bug.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org