You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/11/07 19:53:22 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #4131: Add parquet integration tests for explicitly smaller page sizes, page pruning

alamb commented on code in PR #4131:
URL: https://github.com/apache/arrow-datafusion/pull/4131#discussion_r1015837545


##########
datafusion/core/tests/parquet_filter_pushdown.rs:
##########
@@ -225,6 +230,77 @@ async fn single_file() {
         .run()
         .await;
 }
+
+#[cfg(not(target_family = "windows"))]
+#[tokio::test]
+async fn single_file_small_data_pages() {

Review Comment:
   Here is the new test -- I manually verified that the layout was good (it typically produces 6 data pages for each column chunk)



##########
parquet-test-utils/src/lib.rs:
##########
@@ -51,42 +51,28 @@ pub struct TestParquetFile {
     object_meta: ObjectMeta,
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Copy)]
 pub struct ParquetScanOptions {
     pub pushdown_filters: bool,
     pub reorder_filters: bool,
     pub enable_page_index: bool,
 }
 
 impl TestParquetFile {
-    /// Creates a new parquet file at the specified location
+    /// Creates a new parquet file at the specified location with the
+    /// given properties
     pub fn try_new(
         path: PathBuf,
+        props: WriterProperties,

Review Comment:
   This change passes the writer props in directly, which removes a level of indirection in how the file is created and makes the code easier to read.



##########
datafusion/core/tests/parquet_filter_pushdown.rs:
##########
@@ -225,6 +230,77 @@ async fn single_file() {
         .run()
         .await;
 }
+
+#[cfg(not(target_family = "windows"))]
+#[tokio::test]
+async fn single_file_small_data_pages() {
+    let tempdir = TempDir::new().unwrap();
+
+    let generator = AccessLogGenerator::new().with_row_limit(Some(NUM_ROWS));
+
+    // set the max page rows with arbitrary sizes 8311 to increase
+    // effectiveness of page filtering
+    let props = WriterProperties::builder()
+        .set_data_page_row_count_limit(8311)
+        .build();
+    let file = tempdir.path().join("data_8311.parquet");
+
+    let start = Instant::now();
+    println!("Writing test data to {:?}", file);
+    let test_parquet_file = TestParquetFile::try_new(file, props, generator).unwrap();
+    println!(
+        "Completed generating test data in {:?}",
+        Instant::now() - start
+    );
+
+    // The statistics on the 'pod' column are as follows:
+    //
+    // parquet-tools dump -d ~/Downloads/data_8311.parquet
+    //
+    // ...
+    // pod TV=53819 RL=0 DL=0 DS:                 8 DE:PLAIN
+    // ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+    // page 0:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: aqcathnxqsphdhgjtgvxsfyiwbmhlmg, max: bvjjmytpfzdfsvlzfhbunasihjgxpesbmxv, num_nulls not defined] CRC:[none] SZ:7 VC:9216
+    // page 1:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: bvjjmytpfzdfsvlzfhbunasihjgxpesbmxv, max: bxyubzxbbmhroqhrdzttngxcpwwgkpaoizvgzd, num_nulls not defined] CRC:[none] SZ:7 VC:9216
+    // page 2:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: bxyubzxbbmhroqhrdzttngxcpwwgkpaoizvgzd, max: djzdyiecnumrsrcbizwlqzdhnpoiqdh, num_nulls not defined] CRC:[none] SZ:10 VC:9216
+    // page 3:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: djzdyiecnumrsrcbizwlqzdhnpoiqdh, max: fktdcgtmzvoedpwhfevcvvrtaurzgex, num_nulls not defined] CRC:[none] SZ:7 VC:9216
+    // page 4:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: fktdcgtmzvoedpwhfevcvvrtaurzgex, max: fwtdpgtxwqkkgtgvthhwycrvjiizdifyp, num_nulls not defined] CRC:[none] SZ:7 VC:9216
+    // page 5:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: fwtdpgtxwqkkgtgvthhwycrvjiizdifyp, max: iadnalqpdzthpifrvewossmpqibgtsuin, num_nulls not defined] CRC:[none] SZ:7 VC:7739
+    //
+    // This test currently fails due to https://github.com/apache/arrow-datafusion/issues/3833
+    // (page index pruning not implemented for byte array)
+
+    // TestCase::new(&test_parquet_file)
+    //     .with_name("selective")
+    //     // predicagte is chosen carefully to prune pages 0, 1, 2, 3, 4
+    //     // pod = 'iadnalqpdzthpifrvewossmpqibgtsuin'
+    //     .with_filter(col("pod").eq(lit("iadnalqpdzthpifrvewossmpqibgtsuin")))
+    //     .with_pushdown_expected(PushdownExpected::Some)
+    //     .with_page_index_filtering_expected(PageIndexFilteringExpected::Some)
+    //     .with_expected_rows(2574)
+    //     .run()
+    //     .await;
+
+    // time TV=53819 RL=0 DL=0 DS:                7092 DE:PLAIN
+    // --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+    // page 0:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.004133888, num_nulls not defined] CRC:[none] SZ:13844 VC:9216
+    // page 1:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.006397952, num_nulls not defined] CRC:[none] SZ:14996 VC:9216
+    // page 2:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.005650432, num_nulls not defined] CRC:[none] SZ:14996 VC:9216
+    // page 3:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.004269056, num_nulls not defined] CRC:[none] SZ:14996 VC:9216
+    // page 4:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.007261184, num_nulls not defined] CRC:[none] SZ:14996 VC:9216
+    // page 5:                                     DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[min: 1970-01-01T00:00:00.000000000, max: 1970-01-01T00:00:00.005330944, num_nulls not defined] CRC:[none] SZ:12601 VC:7739
+    TestCase::new(&test_parquet_file)
+        .with_name("selective")
+        // predicagte is chosen carefully to prune pages

Review Comment:
   ```suggestion
           // predicate is chosen carefully to prune pages
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org