Posted to github@arrow.apache.org by "alamb (via GitHub)" <gi...@apache.org> on 2023/03/21 19:23:55 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #5658: Add -o option to all e2e benches

alamb commented on code in PR #5658:
URL: https://github.com/apache/arrow-datafusion/pull/5658#discussion_r1143884095


##########
benchmarks/src/bin/parquet.rs:
##########
@@ -230,54 +243,58 @@ async fn run_filter_benchmarks(opt: Opt, test_file: &TestParquetFile) -> Result<
     ];
 
     let filter_matrix = vec![
-        // Selective-ish filter
-        col("request_method").eq(lit("GET")),
-        // Non-selective filter
-        col("request_method").not_eq(lit("GET")),
-        // Basic conjunction
-        col("request_method")
-            .eq(lit("POST"))
-            .and(col("response_status").eq(lit(503_u16))),
-        // Nested filters
-        col("request_method").eq(lit("POST")).and(or(
-            col("response_status").eq(lit(503_u16)),
-            col("response_status").eq(lit(403_u16)),
-        )),
-        // Many filters
-        disjunction([
+        ("Selective-ish filter", col("request_method").eq(lit("GET"))),
+        (
+            "Non-selective filter",

Review Comment:
   it is nice to add the filter details into the output file



##########
benchmarks/src/lib.rs:
##########
@@ -15,4 +15,116 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use datafusion::DATAFUSION_VERSION;
+use serde::Serialize;
+use serde_json::Value;
+use std::{collections::HashMap, time::SystemTime};
+
 pub mod tpch;
+
+#[derive(Debug, Serialize)]
+pub struct RunContext {
+    /// Benchmark crate version
+    pub benchmark_version: String,
+    /// DataFusion crate version
+    pub datafusion_version: String,
+    /// Number of CPU cores
+    pub num_cpus: usize,
+    /// Start time
+    pub start_time: u64,
+    /// CLI arguments
+    pub arguments: Vec<String>,
+}
+
+impl Default for RunContext {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RunContext {
+    pub fn new() -> Self {
+        Self {
+            benchmark_version: env!("CARGO_PKG_VERSION").to_owned(),
+            datafusion_version: DATAFUSION_VERSION.to_owned(),
+            num_cpus: num_cpus::get(),
+            start_time: SystemTime::now()
+                .duration_since(SystemTime::UNIX_EPOCH)
+                .expect("current time is later than UNIX_EPOCH")
+                .as_secs(),
+            arguments: std::env::args().skip(1).collect::<Vec<String>>(),
+        }
+    }
+}
+
+/// A single iteration of a benchmark query
+#[derive(Debug, Serialize)]
+struct QueryIter {
+    elapsed: f64,
+    row_count: usize,
+}
+/// A single benchmark case
+#[derive(Debug, Serialize)]
+pub struct BenchQuery {
+    query: String,
+    iterations: Vec<QueryIter>,
+    start_time: u64,

Review Comment:
   I see this was just carried over from the previous approach, but I wonder why it needs to be converted to `u64` (which also loses the fact that the value is now in seconds since the epoch)
   
   Perhaps we can either keep the field as `SystemTime` or document what unit (seconds since epoch) this is?
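
   Purely as an illustration of the `SystemTime` option (this is a sketch, not code from the PR), the field could stay a `SystemTime` and still serialize, since serde encodes `SystemTime` as seconds + nanos since the UNIX epoch:

   ```rust
   use serde::Serialize;
   use std::time::SystemTime;

   /// Sketch only: keep the wall-clock start as a `SystemTime` instead of a `u64`
   #[derive(Debug, Serialize)]
   pub struct RunContext {
       /// Start time of the benchmark run (wall clock)
       pub start_time: SystemTime,
       // ... other fields as in the diff above
   }

   fn main() {
       let ctx = RunContext { start_time: SystemTime::now() };
       // prints roughly: {"start_time":{"secs_since_epoch":...,"nanos_since_epoch":...}}
       println!("{}", serde_json::to_string(&ctx).unwrap());
   }
   ```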



##########
benchmarks/src/lib.rs:
##########
@@ -15,4 +15,116 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use datafusion::DATAFUSION_VERSION;
+use serde::Serialize;
+use serde_json::Value;
+use std::{collections::HashMap, time::SystemTime};
+
 pub mod tpch;
+
+#[derive(Debug, Serialize)]
+pub struct RunContext {
+    /// Benchmark crate version
+    pub benchmark_version: String,
+    /// DataFusion crate version
+    pub datafusion_version: String,
+    /// Number of CPU cores
+    pub num_cpus: usize,
+    /// Start time
+    pub start_time: u64,
+    /// CLI arguments
+    pub arguments: Vec<String>,
+}
+
+impl Default for RunContext {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RunContext {
+    pub fn new() -> Self {
+        Self {
+            benchmark_version: env!("CARGO_PKG_VERSION").to_owned(),
+            datafusion_version: DATAFUSION_VERSION.to_owned(),
+            num_cpus: num_cpus::get(),
+            start_time: SystemTime::now()
+                .duration_since(SystemTime::UNIX_EPOCH)
+                .expect("current time is later than UNIX_EPOCH")
+                .as_secs(),
+            arguments: std::env::args().skip(1).collect::<Vec<String>>(),
+        }
+    }
+}
+
+/// A single iteration of a benchmark query
+#[derive(Debug, Serialize)]
+struct QueryIter {
+    elapsed: f64,

Review Comment:
   Can you please add some documentation about what unit this is in (I think it is milliseconds?)
   
   Relatedly, I wonder if we could make this API easier to use by storing a `Duration` (https://doc.rust-lang.org/std/time/struct.Duration.html), e.g. the value returned by `start.elapsed()`
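
   As a rough sketch of that idea (the `duration_as_millis_f64` helper below is made up for illustration, not something in the PR), the iteration record could hold the `Duration` directly and still serialize as fractional milliseconds:

   ```rust
   use serde::{Serialize, Serializer};
   use std::time::{Duration, Instant};

   /// A single iteration of a benchmark query (sketch)
   #[derive(Debug, Serialize)]
   struct QueryIter {
       /// Wall-clock time taken by this iteration
       #[serde(serialize_with = "duration_as_millis_f64")]
       elapsed: Duration,
       row_count: usize,
   }

   /// Serialize a `Duration` as fractional milliseconds
   fn duration_as_millis_f64<S: Serializer>(d: &Duration, s: S) -> Result<S::Ok, S::Error> {
       s.serialize_f64(d.as_secs_f64() * 1000.0)
   }

   fn main() {
       let start = Instant::now();
       // ... run the query ...
       let iter = QueryIter { elapsed: start.elapsed(), row_count: 0 };
       // prints roughly: {"elapsed":0.012,"row_count":0}
       println!("{}", serde_json::to_string(&iter).unwrap());
   }
   ```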
   
   



##########
benchmarks/src/bin/h2o.rs:
##########
@@ -113,13 +118,16 @@ async fn group_by(opt: &GroupBy) -> Result<()> {
     let start = Instant::now();
     let df = ctx.sql(sql).await?;
     let batches = df.collect().await?;
-    let elapsed = start.elapsed().as_millis();
-
+    let elapsed = start.elapsed().as_secs_f64() * 1000.0;
+    let numrows = batches.iter().map(|b| b.num_rows()).sum::<usize>();
     if opt.debug {
         pretty::print_batches(&batches)?;
     }
-
+    rundata.write_iter(elapsed, numrows);
     println!("h2o groupby query {} took {} ms", opt.query, elapsed);
 
+    if let Some(path) = &opt.output_path {

Review Comment:
   minor: maybe we could encapsulate the write here inside `rundata`:
   
   ```rust
     rundata.maybe_write_json(opt.output_path.as_ref())?
   ```
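
   To make that concrete (the struct name and error type below are illustrative; only `maybe_write_json` comes from the suggestion above), such a helper could be a small method on the run-data struct:

   ```rust
   use serde::Serialize;
   use std::path::Path;

   #[derive(Debug, Serialize)]
   struct BenchmarkRun {
       // ... run context and per-query results, as added in this PR
   }

   impl BenchmarkRun {
       /// Write the run as pretty-printed JSON if an output path was given,
       /// otherwise do nothing
       fn maybe_write_json(&self, maybe_path: Option<impl AsRef<Path>>) -> std::io::Result<()> {
           if let Some(path) = maybe_path {
               let json = serde_json::to_string_pretty(self)
                   .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
               std::fs::write(path, json)?;
           }
           Ok(())
       }
   }
   ```

   The caller in `group_by` then collapses to the single line above, whether or not an output path was passed.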
   
   



##########
benchmarks/src/bin/h2o.rs:
##########
@@ -113,13 +118,16 @@ async fn group_by(opt: &GroupBy) -> Result<()> {
     let start = Instant::now();
     let df = ctx.sql(sql).await?;
     let batches = df.collect().await?;
-    let elapsed = start.elapsed().as_millis();
-
+    let elapsed = start.elapsed().as_secs_f64() * 1000.0;
+    let numrows = batches.iter().map(|b| b.num_rows()).sum::<usize>();

Review Comment:
   I think `num_rows` is pretty fast (it doesn't actually do any work, it just returns a field's value): https://docs.rs/arrow-array/35.0.0/src/arrow_array/record_batch.rs.html#278


