You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ar...@apache.org on 2019/12/05 22:59:40 UTC

[impala] 05/06: IMPALA-9215: report_benchmark_results.py fails with missing key

This is an automated email from the ASF dual-hosted git repository.

arodoni pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 92aa2c16f172cf15cf6b026ab7af9c72576772f5
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Wed Dec 4 18:49:48 2019 +0100

    IMPALA-9215: report_benchmark_results.py fails with missing key
    
    report_benchmark_results.py failed with a missing key because it tried
    to look up 'num_instances' in the perf result JSON file. The JSON
    file contained the exec summary generated by
    impala_beeswax.py::__build_summary_table(), which omitted the number of
    instances.
    
    This patch adds 'num_instances' to the summary table created by
    impala_beeswax.py.
    
    To keep report_benchmark_results.py simple, it assumes that both perf
    JSON files contain 'num_instances', i.e. if a user runs
    single_node_perf_run.py to compare two commits, both of them must
    contain this fix.
    
    I tested the patch set manually.
    
    Change-Id: I822c86f621f5a348b56d672c263a2cf9321767ee
    Reviewed-on: http://gerrit.cloudera.org:8080/14830
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/beeswax/impala_beeswax.py             | 3 ++-
 tests/benchmark/report_benchmark_results.py | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/beeswax/impala_beeswax.py b/tests/beeswax/impala_beeswax.py
index b788a38..ab10023 100644
--- a/tests/beeswax/impala_beeswax.py
+++ b/tests/beeswax/impala_beeswax.py
@@ -284,7 +284,8 @@ class ImpalaBeeswaxClient(object):
       else:
         avg_time = 0
 
-      row["num_hosts"] = len(node.exec_stats)
+      row["num_instances"] = len(node.exec_stats)
+      row["num_hosts"] = node.num_hosts
       row["avg_time"] = avg_time
 
     is_sink = node.node_id == -1
diff --git a/tests/benchmark/report_benchmark_results.py b/tests/benchmark/report_benchmark_results.py
index 0274e64..5a4cc07 100755
--- a/tests/benchmark/report_benchmark_results.py
+++ b/tests/benchmark/report_benchmark_results.py
@@ -338,7 +338,6 @@ class Report(object):
     the report).
     """
     def __init__(self, results, ref_results):
-
       self.workload_name = '{0}({1})'.format(
           results[RESULT_LIST][0][QUERY][WORKLOAD_NAME].upper(),
           results[RESULT_LIST][0][QUERY][SCALE_FACTOR])
@@ -407,7 +406,6 @@ class Report(object):
       For example:
       Regression: TPCDS-Q52 [parquet/none/none] (1.390s -> 1.982s [+42.59%])
       """
-
       perf_change_type = ("(R) Regression" if zval >= 0 and tval >= 0
                           else "(I) Improvement" if zval <= 0 and tval <= 0
                           else "(?) Anomoly")
@@ -613,6 +611,7 @@ class CombinedExecSummaries(object):
       prefix (str)
       operator (str)
       num_hosts (int)
+      num_instances (int)
       num_rows (int)
       est_num_rows (int)
       detail (str)
@@ -641,7 +640,8 @@ class CombinedExecSummaries(object):
     for row_num, row in enumerate(first_exec_summary):
       combined_row = {}
       # Copy fixed values from the first exec summary
-      for key in [PREFIX, OPERATOR, NUM_HOSTS, NUM_ROWS, EST_NUM_ROWS, DETAIL]:
+      for key in [PREFIX, OPERATOR, NUM_HOSTS, NUM_INSTANCES, NUM_ROWS, EST_NUM_ROWS,
+                  DETAIL]:
         combined_row[key] = row[key]
 
       avg_times = [exec_summary[row_num][AVG_TIME] for exec_summary in exec_summaries]
@@ -769,6 +769,7 @@ class ExecSummaryComparison(object):
       prefix (str)
       operator (str)
       num_hosts (int)
+      num_instances (int)
       avg_time (float)
       stddev_time (float)
       avg_time_change (float): % change in avg time compared to reference