You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/02/15 23:42:14 UTC

[impala] 01/02: IMPALA-11113 and IMPALA-11114: fixed single_node_perf_run.py for TPCDS

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 182617ee87aaf23abe46dd0cc5b82133e4d41803
Author: Gergely Fürnstáhl <gf...@cloudera.com>
AuthorDate: Wed Feb 9 09:53:24 2022 +0100

    IMPALA-11113 and IMPALA-11114: fixed single_node_perf_run.py for TPCDS
    
    Fixed the UTF-8 UnicodeDecodeError which was thrown while dumping and
    loading the json file. Now the script ignores non-decodable characters.
    
    Fixed the ZeroDivisionError coming from the t-test when the standard
    deviations were 0. The "(N/A) Invalid t-test" type is shown for
    significant changes, and a hint is printed at the end of the report if
    any invalid t-test was detected.
    
    Change-Id: I094763188a1f3ddf40b7140c65acf95918a6597f
    Reviewed-on: http://gerrit.cloudera.org:8080/18215
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Quanlong Huang <hu...@gmail.com>
---
 bin/run-workload.py                         |  2 +-
 bin/single_node_perf_run.py                 |  2 +-
 tests/benchmark/report_benchmark_results.py | 20 ++++++++++++++++----
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/bin/run-workload.py b/bin/run-workload.py
index d406627..8d2d2f7 100755
--- a/bin/run-workload.py
+++ b/bin/run-workload.py
@@ -277,6 +277,6 @@ if __name__ == "__main__":
 
   # Store the results
   with open(options.results_json_file, 'w') as f:
-    json.dump(result_map, f, cls=CustomJSONEncoder)
+    json.dump(result_map, f, cls=CustomJSONEncoder, ensure_ascii=False)
 
   exit(exit_code)
diff --git a/bin/single_node_perf_run.py b/bin/single_node_perf_run.py
index d215019..ae89abe 100755
--- a/bin/single_node_perf_run.py
+++ b/bin/single_node_perf_run.py
@@ -191,7 +191,7 @@ def generate_profile_file(name, hash, base_dir):
   Writes the runtime profiles back in a simple text file in the same directory.
   """
   with open(name) as fid:
-    data = json.load(fid)
+    data = json.loads(fid.read().decode("utf-8", "ignore"))
     with open(os.path.join(base_dir, hash + "_profile.txt"), "w+") as out:
       # For each query
       for key in data:
diff --git a/tests/benchmark/report_benchmark_results.py b/tests/benchmark/report_benchmark_results.py
index 26bf783..f8a2105 100755
--- a/tests/benchmark/report_benchmark_results.py
+++ b/tests/benchmark/report_benchmark_results.py
@@ -32,6 +32,7 @@ from __future__ import division
 import difflib
 import json
 import logging
+import math
 import os
 import prettytable
 import re
@@ -218,7 +219,7 @@ def get_dict_from_json(filename):
     cur[RESULT_LIST].append(query_result)
 
   with open(filename, "r") as f:
-    data = json.load(f)
+    data = json.loads(f.read().decode("utf-8", "ignore"))
     grouped = defaultdict( lambda: defaultdict(
         lambda: defaultdict(lambda: defaultdict(list))))
     for workload_name, workload in data.items():
@@ -296,6 +297,7 @@ def calculate_time_stats(grouped):
 class Report(object):
 
   significant_perf_change = False
+  invalid_t_tests = False
 
   class FileFormatComparisonRow(object):
     """Represents a row in the overview table, where queries are grouped together and
@@ -386,8 +388,13 @@ class Report(object):
 
     def __check_perf_change_significance(self, stat, ref_stat):
       zval = calculate_mwu(stat[SORTED], ref_stat[SORTED])
-      tval = calculate_tval(stat[AVG], stat[STDDEV], stat[ITERATIONS],
-                            ref_stat[AVG], ref_stat[STDDEV], ref_stat[ITERATIONS])
+      try:
+        tval = calculate_tval(stat[AVG], stat[STDDEV], stat[ITERATIONS],
+                              ref_stat[AVG], ref_stat[STDDEV], ref_stat[ITERATIONS])
+      except ZeroDivisionError:
+        # t-test cannot be performed if both standard deviations are 0
+        tval = float('nan')
+        Report.invalid_t_tests = True
       try:
         percent_difference = abs(ref_stat[AVG] - stat[AVG]) * 100 / ref_stat[AVG]
       except ZeroDivisionError:
@@ -408,6 +415,7 @@ class Report(object):
       """
       perf_change_type = ("(R) Regression" if zval >= 0 and tval >= 0
                           else "(I) Improvement" if zval <= 0 and tval <= 0
+                          else "(N/A) Invalid t-test" if math.isnan(tval)
                           else "(?) Anomoly")
       query = result[RESULT_LIST][0][QUERY]
 
@@ -588,7 +596,11 @@ class Report(object):
     output += variability_analysis_str
 
     if Report.significant_perf_change:
-      output += 'Significant perf change detected'
+      output += 'Significant perf change detected.\n'
+
+    if Report.invalid_t_tests:
+      output += 'Invalid t-tests detected. It is not possible to perform t-test with ' \
+                '0 standard deviation. Try increasing the number of iterations.\n'
 
     return output