You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by dh...@apache.org on 2022/10/19 20:08:05 UTC

[arrow-datafusion] branch master updated: Allow enabling collection of statistics during TPC-H benchmarks (#3889)

This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new c87964caa Allow enabling collection of statistics during TPC-H benchmarks (#3889)
c87964caa is described below

commit c87964caa8e07b438d20b082fd0d0cd337d77951
Author: Batuhan Taskaya <is...@gmail.com>
AuthorDate: Wed Oct 19 23:07:59 2022 +0300

    Allow enabling collection of statistics during TPC-H benchmarks (#3889)
    
    Co-authored-by: Heres, Daniel <da...@gmail.com>
---
 benchmarks/README.md       | 15 ++++++++++-----
 benchmarks/src/bin/tpch.rs | 10 ++++++++--
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 97a0bd4c6..524f5cb17 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -25,7 +25,7 @@ implementations as well as other query engines.
 
 ## Benchmark derived from TPC-H
 
-These benchmarks are derived from the [TPC-H][1] benchmark. And we use this repo as the source of tpch-gen and answers: 
+These benchmarks are derived from the [TPC-H][1] benchmark. And we use this repo as the source of tpch-gen and answers:
 https://github.com/databricks/tpch-dbgen.git, based on [2.17.1](https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf) version of TPC-H.
 
 ## Generating Test Data
@@ -55,6 +55,11 @@ You can enable the features `simd` (to use SIMD instructions, `cargo nightly` is
 cargo run --release --features "simd mimalloc" --bin tpch -- benchmark datafusion --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
 ```
 
+If you want to disable collection of statistics (and thus cost-based optimizers), you can pass the `--disable-statistics` flag.
+```bash
+cargo run --release --bin tpch -- benchmark datafusion --iterations 3 --path /mnt/tpch-parquet --format parquet --query 17 --disable-statistics
+```
+
 The benchmark program also supports CSV and Parquet input file formats and a utility is provided to convert from `tbl`
 (generated by the `dbgen` utility) to CSV and Parquet.
 
@@ -130,16 +135,16 @@ h2o groupby query 1 took 1669 ms
 ## Parquet filter pushdown benchmarks
 
 This is a set of benchmarks for testing and verifying performance of parquet filter pushdown. The queries are executed on
-a synthetic dataset generated during the benchmark execution and designed to simulate web server access logs. 
+a synthetic dataset generated during the benchmark execution and designed to simulate web server access logs.
 
 ```base
 cargo run --release --bin parquet_filter_pushdown --  --path ./data --scale-factor 1.0
 ```
 
 This will generate the synthetic dataset at `./data/logs.parquet`. The size of the dataset can be controlled through the `size_factor`
-(with the default value of `1.0` generating a ~1GB parquet file). 
+(with the default value of `1.0` generating a ~1GB parquet file).
 
-For each filter we will run the query using different `ParquetScanOption` settings. 
+For each filter we will run the query using different `ParquetScanOption` settings.
 
 Example run:
 ```
@@ -159,4 +164,4 @@ Iteration 0 returned 1781686 rows in 1940 ms
 Iteration 1 returned 1781686 rows in 1986 ms
 Iteration 2 returned 1781686 rows in 1947 ms
 ...
-```
\ No newline at end of file
+```
diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs
index d06487935..b2f2bf181 100644
--- a/benchmarks/src/bin/tpch.rs
+++ b/benchmarks/src/bin/tpch.rs
@@ -98,6 +98,10 @@ struct DataFusionBenchmarkOpt {
     /// Path to output directory where JSON summary file should be written to
     #[structopt(parse(from_os_str), short = "o", long = "output")]
     output_path: Option<PathBuf>,
+
+    /// Whether to disable collection of statistics (and cost based optimizations) or not.
+    #[structopt(short = "S", long = "disable-statistics")]
+    disable_statistics: bool,
 }
 
 #[derive(Debug, StructOpt)]
@@ -164,7 +168,8 @@ async fn benchmark_datafusion(opt: DataFusionBenchmarkOpt) -> Result<Vec<RecordB
     let mut benchmark_run = BenchmarkRun::new(opt.query);
     let config = SessionConfig::new()
         .with_target_partitions(opt.partitions)
-        .with_batch_size(opt.batch_size);
+        .with_batch_size(opt.batch_size)
+        .with_collect_statistics(!opt.disable_statistics);
     let ctx = SessionContext::with_config(config);
 
     // register tables
@@ -440,7 +445,7 @@ async fn get_table(
         format,
         file_extension: extension.to_owned(),
         target_partitions,
-        collect_stat: true,
+        collect_stat: ctx.config.collect_statistics,
         table_partition_cols: vec![],
     };
 
@@ -1374,6 +1379,7 @@ mod tests {
                 file_format: "tbl".to_string(),
                 mem_table: false,
                 output_path: None,
+                disable_statistics: false,
             };
             let actual = benchmark_datafusion(opt).await?;