Posted to commits@impala.apache.org by bo...@apache.org on 2024/01/20 16:56:14 UTC

(impala) branch master updated (5dfcdf1c9 -> 7e5bb385e)

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


    from 5dfcdf1c9 IMPALA-12716: Fix timeout thresholds in test_catalog_operations_with_rpc_retry
     new 3af3b2c8a IMPALA-12038: Switch report_benchmark_results.py to python 3
     new 7e5bb385e Addendum: IMPALA-12584: Enable strict data file access by default

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 be/src/common/global-flags.cc                      |   5 +-
 be/src/util/backend-gflag-util.cc                  |   6 +-
 common/thrift/BackendGflags.thrift                 |   2 +-
 .../org/apache/impala/catalog/FeIcebergTable.java  |   2 +-
 .../org/apache/impala/service/BackendConfig.java   |   8 +-
 .../impala/catalog/FileMetadataLoaderTest.java     |   2 +
 testdata/bin/create-load-data.sh                   |   3 +
 tests/benchmark/report_benchmark_results.py        | 122 +++++++++++----------
 tests/custom_cluster/test_iceberg_strict_data.py   |  24 ++--
 tests/query_test/test_iceberg.py                   |   4 -
 10 files changed, 98 insertions(+), 80 deletions(-)


(impala) 02/02: Addendum: IMPALA-12584: Enable strict data file access by default

Posted by bo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 7e5bb385e1d391c790c6645e5abc3d38d37f39f0
Author: Peter Rozsa <pr...@cloudera.com>
AuthorDate: Tue Jan 9 16:37:22 2024 +0100

    Addendum: IMPALA-12584: Enable strict data file access by default
    
    This change renames the startup flag
    'iceberg_restrict_data_file_location' to
    'iceberg_allow_datafiles_in_table_location_only' and sets its
    default value to 'true'. Tests related to multiple storage locations
    in Iceberg tables are moved out to custom cluster tests. During test
    data loading, the flag is set to 'false' so that the
    'iceberg_multiple_storage_locations' table can still be created.
    
    Change-Id: Ifec84c86132a8a44d7e161006dcf51be2e7c7e57
    Reviewed-on: http://gerrit.cloudera.org:8080/20874
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
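
For readers unfamiliar with the flag's effect: when
'iceberg_allow_datafiles_in_table_location_only' is true, Iceberg tables whose
data files live outside the table's root directory fail to load, and queries
against them report an IcebergTableLoadingException (see the custom cluster
test further down). The sketch below only illustrates that containment check;
it is not Impala's actual implementation, and the function name and paths are
hypothetical.

    # Illustration only: roughly what the restriction enforces.
    def datafile_in_table_location(table_location, datafile_path):
        """True if datafile_path lies under table_location. Plain string
        check; the real code resolves URIs and filesystem schemes."""
        root = table_location.rstrip("/") + "/"
        return datafile_path.startswith(root)

    # A data file kept in a second storage location fails the check.
    assert datafile_in_table_location(
        "s3://bucket/warehouse/db.db/tbl",
        "s3://bucket/warehouse/db.db/tbl/data/f.parquet")
    assert not datafile_in_table_location(
        "s3://bucket/warehouse/db.db/tbl",
        "s3://bucket/other_location/f.parquet")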
---
 be/src/common/global-flags.cc                      |  5 +++--
 be/src/util/backend-gflag-util.cc                  |  6 +++---
 common/thrift/BackendGflags.thrift                 |  2 +-
 .../org/apache/impala/catalog/FeIcebergTable.java  |  2 +-
 .../org/apache/impala/service/BackendConfig.java   |  8 ++++++--
 .../impala/catalog/FileMetadataLoaderTest.java     |  2 ++
 testdata/bin/create-load-data.sh                   |  3 +++
 tests/custom_cluster/test_iceberg_strict_data.py   | 24 ++++++++++++----------
 tests/query_test/test_iceberg.py                   |  4 ----
 9 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc
index 8f4d07dd9..20fdab4a3 100644
--- a/be/src/common/global-flags.cc
+++ b/be/src/common/global-flags.cc
@@ -421,8 +421,9 @@ DEFINE_int32(iceberg_reload_new_files_threshold, 100, "(Advanced) If during a ta
     "reload all file metadata. If number of new files are less or equal to this, "
     "catalogd will only load the metadata of the newly added files.");
 
-DEFINE_bool(iceberg_restrict_data_file_location, false, "If true, Impala does not allow "
-    "Iceberg data file locations outside of the table directory during reads");
+DEFINE_bool(iceberg_allow_datafiles_in_table_location_only, true, "If true, Impala "
+    "does not allow Iceberg data file locations outside of the table directory during "
+    "reads");
 
 // Host and port of Statestore Service
 DEFINE_string(state_store_host, "localhost",
diff --git a/be/src/util/backend-gflag-util.cc b/be/src/util/backend-gflag-util.cc
index 64f2fe0be..40414c871 100644
--- a/be/src/util/backend-gflag-util.cc
+++ b/be/src/util/backend-gflag-util.cc
@@ -111,7 +111,7 @@ DECLARE_string(java_weigher);
 DECLARE_int32(iceberg_reload_new_files_threshold);
 DECLARE_bool(enable_skipping_older_events);
 DECLARE_bool(enable_json_scanner);
-DECLARE_bool(iceberg_restrict_data_file_location);
+DECLARE_bool(iceberg_allow_datafiles_in_table_location_only);
 DECLARE_int32(catalog_operation_log_size);
 DECLARE_string(hostname);
 DECLARE_bool(allow_catalog_cache_op_from_masked_users);
@@ -443,8 +443,8 @@ Status PopulateThriftBackendGflags(TBackendGflags& cfg) {
   cfg.__set_iceberg_reload_new_files_threshold(FLAGS_iceberg_reload_new_files_threshold);
   cfg.__set_enable_skipping_older_events(FLAGS_enable_skipping_older_events);
   cfg.__set_enable_json_scanner(FLAGS_enable_json_scanner);
-  cfg.__set_iceberg_restrict_data_file_location(
-    FLAGS_iceberg_restrict_data_file_location);
+  cfg.__set_iceberg_allow_datafiles_in_table_location_only(
+      FLAGS_iceberg_allow_datafiles_in_table_location_only);
   cfg.__set_max_filter_error_rate_from_full_scan(
       FLAGS_max_filter_error_rate_from_full_scan);
   cfg.__set_catalog_operation_log_size(FLAGS_catalog_operation_log_size);
diff --git a/common/thrift/BackendGflags.thrift b/common/thrift/BackendGflags.thrift
index fb1ef7fc8..35aecb63d 100644
--- a/common/thrift/BackendGflags.thrift
+++ b/common/thrift/BackendGflags.thrift
@@ -275,5 +275,5 @@ struct TBackendGflags {
 
   122: required bool allow_catalog_cache_op_from_masked_users
 
-  123: required bool iceberg_restrict_data_file_location
+  123: required bool iceberg_allow_datafiles_in_table_location_only
 }
diff --git a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
index 16c68a382..d04cf2530 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
@@ -1027,7 +1027,7 @@ public interface FeIcebergTable extends FeFsTable {
       Table icebergApiTable = icebergTable.getIcebergApiTable();
       Preconditions.checkNotNull(icebergApiTable);
       Map<String, String> properties = icebergApiTable.properties();
-      if (BackendConfig.INSTANCE.icebergRestrictDataFileLocation()) {
+      if (BackendConfig.INSTANCE.icebergAllowDatafileInTableLocationOnly()) {
         return true;
       }
       return !(PropertyUtil.propertyAsBoolean(properties,
diff --git a/fe/src/main/java/org/apache/impala/service/BackendConfig.java b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
index 570669216..c7b4a0bef 100644
--- a/fe/src/main/java/org/apache/impala/service/BackendConfig.java
+++ b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
@@ -429,8 +429,12 @@ public class BackendConfig {
     return backendCfg_.iceberg_reload_new_files_threshold;
   }
 
-  public boolean icebergRestrictDataFileLocation() {
-    return backendCfg_.iceberg_restrict_data_file_location;
+  public boolean icebergAllowDatafileInTableLocationOnly() {
+    return backendCfg_.iceberg_allow_datafiles_in_table_location_only;
+  }
+
+  public void setIcebergAllowDatafileInTableLocationOnly(boolean flag) {
+    backendCfg_.iceberg_allow_datafiles_in_table_location_only = flag;
   }
 
   public boolean isJsonScannerEnabled() {
diff --git a/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java b/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
index d84e04ff2..a0f5129e2 100644
--- a/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.impala.catalog.HdfsPartition.FileDescriptor;
 import org.apache.impala.catalog.iceberg.GroupedContentFiles;
 import org.apache.impala.compat.MetastoreShim;
+import org.apache.impala.service.BackendConfig;
 import org.apache.impala.testutil.CatalogServiceTestCatalog;
 import org.apache.impala.thrift.TNetworkAddress;
 import org.apache.impala.util.IcebergUtil;
@@ -291,6 +292,7 @@ public class FileMetadataLoaderTest {
   @Test
   public void testIcebergMultipleStorageLocations() throws IOException, CatalogException {
     CatalogServiceCatalog catalog = CatalogServiceTestCatalog.create();
+    BackendConfig.INSTANCE.setIcebergAllowDatafileInTableLocationOnly(false);
     IcebergFileMetadataLoader fml1 = getLoaderForIcebergTable(catalog,
         "functional_parquet", "iceberg_multiple_storage_locations",
         /* oldFds = */ Collections.emptyList(),
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index dfb8e8d58..3571af516 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -165,6 +165,9 @@ function start-impala {
   : ${START_CLUSTER_ARGS=""}
   # Use a fast statestore update so that DDL operations run faster.
   START_CLUSTER_ARGS_INT="--state_store_args=--statestore_update_frequency_ms=50"
+  # Disable strict datafile location checks for Iceberg tables
+  DATAFILE_LOCATION_CHECK="-iceberg_allow_datafiles_in_table_location_only=false"
+  START_CLUSTER_ARGS_INT+=("--catalogd_args=$DATAFILE_LOCATION_CHECK")
   if [[ "${TARGET_FILESYSTEM}" == "local" ]]; then
     START_CLUSTER_ARGS_INT+=("--impalad_args=--abort_on_config_error=false -s 1")
   else
diff --git a/tests/custom_cluster/test_iceberg_strict_data.py b/tests/custom_cluster/test_iceberg_strict_data.py
index 52f8f5a76..2426dd602 100644
--- a/tests/custom_cluster/test_iceberg_strict_data.py
+++ b/tests/custom_cluster/test_iceberg_strict_data.py
@@ -20,33 +20,35 @@ import pytest
 
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 
-SELECT_STATEMENT = "SELECT COUNT(1) FROM " \
-  "functional_parquet.iceberg_multiple_storage_locations"
-EXCEPTION = "IcebergTableLoadingException: " \
-  "Error loading metadata for Iceberg table"
-
 
 class TestIcebergStrictDataFileLocation(CustomClusterTestSuite):
   """Tests for checking the behaviour of startup flag
-   'iceberg_restrict_data_file_location'."""
+   'iceberg_allow_datafiles_in_table_location_only'."""
+
+  SELECT_STATEMENT = "SELECT COUNT(1) FROM " \
+    "functional_parquet.iceberg_multiple_storage_locations"
+  EXCEPTION = "IcebergTableLoadingException: " \
+    "Error loading metadata for Iceberg table"
+
   @classmethod
   def get_workload(self):
     return 'functional-query'
 
   @CustomClusterTestSuite.with_args(
-      catalogd_args='--iceberg_restrict_data_file_location=true')
+      catalogd_args='--iceberg_allow_datafiles_in_table_location_only=true')
   @pytest.mark.execute_serially
   def test_restricted_location(self, vector):
     """If the flag is enabled, tables with multiple storage locations will fail
     to load their datafiles."""
-    result = self.execute_query_expect_failure(self.client, SELECT_STATEMENT)
-    assert EXCEPTION in str(result)
+    result = self.execute_query_expect_failure(self.client, self.SELECT_STATEMENT)
+    assert self.EXCEPTION in str(result)
 
   @CustomClusterTestSuite.with_args(
-      catalogd_args='--iceberg_restrict_data_file_location=false')
+      catalogd_args='--iceberg_allow_datafiles_in_table_location_only=false')
   @pytest.mark.execute_serially
   def test_disabled(self, vector):
     """If the flag is disabled, and tables with multiple storage locations
     are configured properly, the tables load successfully."""
-    result = self.execute_query_expect_success(self.client, SELECT_STATEMENT)
+    result = self.execute_query_expect_success(self.client, self.SELECT_STATEMENT)
     assert '9' in result.data
+    self.run_test_case('QueryTest/iceberg-multiple-storage-locations-table', vector)
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index 814f328c6..769850b17 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -1092,10 +1092,6 @@ class TestIcebergTable(IcebergTestSuite):
     args = ['-q', 'DROP TABLE {0}.{1}'.format(db_name, tbl_name)]
     results = run_impala_shell_cmd(vector, args)
 
-  def test_multiple_storage_locations(self, vector, unique_database):
-    self.run_test_case('QueryTest/iceberg-multiple-storage-locations-table',
-                       vector, unique_database)
-
   def test_mixed_file_format(self, vector, unique_database):
     self.run_test_case('QueryTest/iceberg-mixed-file-format', vector,
                       unique_database)


(impala) 01/02: IMPALA-12038: Switch report_benchmark_results.py to python 3

Posted by bo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3af3b2c8ab25ac132e78a69cd1e0a4bed1c99bb3
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Mon Apr 3 21:58:57 2023 -0700

    IMPALA-12038: Switch report_benchmark_results.py to python 3
    
    When using TPC-DS with a large number of iterations, the
    results JSON files are enormous. Using Python 2,
    report_benchmark_results.py runs out of memory and fails to
    produce the report. Python 3 is more efficient in how it
    processes Unicode inputs (see Python PEP-0393), so its
    memory usage is much lower. It is able to generate reports
    that Python 2 cannot.
    
    As a general cleanup, this fixes all the flake8 issues for this file.
    
    Testing:
     - Processed very large JSON results (4+GB each for both baseline
       result and new result). Python 3 completes successfully where
       Python 2 failed.
    
    Change-Id: Idbde17f720b18d38dc2c2104ecf3fec807c1839d
    Reviewed-on: http://gerrit.cloudera.org:8080/20918
    Reviewed-by: Riza Suminto <ri...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
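
The memory argument above comes from PEP 393's compact string representation
in CPython 3. A rough, hedged illustration (the snippet is not part of the
commit, and the byte counts are approximate and build-dependent):

    import sys

    # CPython 3 stores an all-ASCII str at about 1 byte per character plus
    # a small fixed header, so a million characters is roughly 1 MB:
    print(sys.getsizeof("x" * 1_000_000))   # ~1,000,049 bytes

    # Python 2's unicode type used a fixed 2 or 4 bytes per character
    # (narrow vs. wide build), so the same text cost roughly 2-4 MB.
    # Decoding multi-gigabyte result JSON therefore exhausted memory far
    # sooner under impala-python (Python 2) than under impala-python3.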
---
 tests/benchmark/report_benchmark_results.py | 122 +++++++++++++++-------------
 1 file changed, 66 insertions(+), 56 deletions(-)

diff --git a/tests/benchmark/report_benchmark_results.py b/tests/benchmark/report_benchmark_results.py
index 6a447d5ed..e7ddcc123 100755
--- a/tests/benchmark/report_benchmark_results.py
+++ b/tests/benchmark/report_benchmark_results.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env impala-python
+#!/usr/bin/env impala-python3
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
@@ -38,7 +38,7 @@ import os
 import prettytable
 import re
 from collections import defaultdict
-from datetime import date, datetime
+from datetime import date
 from optparse import OptionParser
 from tests.util.calculation_util import (
     calculate_tval, calculate_avg, calculate_stddev, calculate_geomean, calculate_mwu)
@@ -113,30 +113,30 @@ parser.add_option("--output_all_summary_nodes", dest="output_all_summary_nodes",
 parser.add_option("--build_version", dest="build_version", default='UNKNOWN',
                  help="Build/version info about the Impalad instance results are from.")
 parser.add_option("--lab_run_info", dest="lab_run_info", default='UNKNOWN',
-                 help="Information about the lab run (name/id) that published "\
+                 help="Information about the lab run (name/id) that published "
                  "the results.")
 parser.add_option("--run_user_name", dest="run_user_name", default='anonymous',
                  help="User name that this run is associated with in the perf database")
 parser.add_option("--tval_threshold", dest="tval_threshold", default=3.0,
-                 type="float", help="The ttest t-value at which a performance change "\
+                 type="float", help="The ttest t-value at which a performance change "
                  "will be flagged as sigificant.")
 parser.add_option("--zval_threshold", dest="zval_threshold", default=3.0, type="float",
                   help="The Mann-Whitney Z-value at which a performance change will be "
                   "flagged as sigificant.")
 parser.add_option("--min_percent_change_threshold",
                  dest="min_percent_change_threshold", default=5.0,
-                 type="float", help="Any performance changes below this threshold" \
-                 " will not be classified as significant. If the user specifies an" \
+                 type="float", help="Any performance changes below this threshold"
+                 " will not be classified as significant. If the user specifies an"
                  " empty value, the threshold will be set to 0")
 parser.add_option("--max_percent_change_threshold",
                  dest="max_percent_change_threshold", default=float("inf"),
-                 type="float", help="Any performance changes above this threshold"\
-                 " will be classified as significant. If the user specifies an" \
+                 type="float", help="Any performance changes above this threshold"
+                 " will be classified as significant. If the user specifies an"
                  " empty value, the threshold will be set to positive infinity")
 parser.add_option("--allowed_latency_diff_secs",
                  dest="allowed_latency_diff_secs", default=0.0, type="float",
-                 help="If specified, only a timing change that differs by more than\
-                 this value will be considered significant.")
+                 help="If specified, only a timing change that differs by more than"
+                 " this value will be considered significant.")
 options, args = parser.parse_args()
 
 
@@ -187,8 +187,8 @@ def get_dict_from_json(filename):
       level.append([('query', 'workload_name'), ('query', 'scale_factor')])
       # In the middle layer, we group by file format and compression type
       level.append([('query', 'test_vector', 'file_format'),
-      ('query', 'test_vector', 'compression_codec'),
-      ('query', 'test_vector', 'compression_type')])
+                    ('query', 'test_vector', 'compression_codec'),
+                    ('query', 'test_vector', 'compression_type')])
       # In the bottom layer, we group by query name
       level.append([('query', 'name')])
 
@@ -219,9 +219,9 @@ def get_dict_from_json(filename):
       cur = cur[get_key(level_num)]
     cur[RESULT_LIST].append(query_result)
 
-  with open(filename, "r") as f:
+  with open(filename, "rb") as f:
     data = json.loads(f.read().decode("utf-8", "ignore"))
-    grouped = defaultdict( lambda: defaultdict(
+    grouped = defaultdict(lambda: defaultdict(
         lambda: defaultdict(lambda: defaultdict(list))))
     for workload_name, workload in data.items():
       for query_result in workload:
@@ -230,6 +230,7 @@ def get_dict_from_json(filename):
     calculate_time_stats(grouped)
     return grouped
 
+
 def all_query_results(grouped):
   for workload_scale, workload in grouped.items():
     for file_format, queries in workload.items():
@@ -249,19 +250,21 @@ def get_commit_date(commit_sha):
     response = urlopen(request).read()
     data = json.loads(response.decode('utf8'))
     return data['commit']['committer']['date'][:10]
-  except:
+  except Exception:
     return ''
 
+
 def get_impala_version(grouped):
   """Figure out Impala version by looking at query profile."""
   first_result = next(all_query_results(grouped))
   profile = first_result['result_list'][0]['runtime_profile']
-  match = re.search('Impala Version:\s(.*)\s\(build\s(.*)\)', profile)
+  match = re.search(r'Impala Version:\s(.*)\s\(build\s(.*)\)', profile)
   version = match.group(1)
   commit_sha = match.group(2)
   commit_date = get_commit_date(commit_sha)
   return '{0} ({1})'.format(version, commit_date)
 
+
 def calculate_time_stats(grouped):
   """
   Add statistics to the nested dictionary.
@@ -298,6 +301,7 @@ def calculate_time_stats(grouped):
         results[SORTED] = [query_results[TIME_TAKEN] for query_results in result_list]
         results[SORTED].sort()
 
+
 class Report(object):
 
   significant_perf_change = False
@@ -322,7 +326,6 @@ class Report(object):
           for ref_query_results in ref_results[RESULT_LIST]:
             ref_time_list.append(ref_query_results[TIME_TAKEN])
 
-
       self.workload_name = '{0}({1})'.format(
           workload_scale[0][1].upper(), workload_scale[1][1])
 
@@ -437,15 +440,15 @@ class Report(object):
                  "({ref_avg:.2f}s -> {avg:.2f}s [{delta:+.2%}])\n")
 
       perf_change_str = template.format(
-          perf_change_type = perf_change_type,
-          workload_name = workload_name,
-          query_name = query_name,
-          file_format = file_format,
-          compression_codec = compression_codec,
-          compression_type = compression_type,
-          ref_avg = ref_result[AVG],
-          avg = result[AVG],
-          delta = calculate_change(result[AVG], ref_result[AVG]))
+          perf_change_type=perf_change_type,
+          workload_name=workload_name,
+          query_name=query_name,
+          file_format=file_format,
+          compression_codec=compression_codec,
+          compression_type=compression_type,
+          ref_avg=ref_result[AVG],
+          avg=result[AVG],
+          delta=calculate_change(result[AVG], ref_result[AVG]))
 
       perf_change_str += build_exec_summary_str(result, ref_result)
 
@@ -460,7 +463,7 @@ class Report(object):
         self.base_rel_stddev = float('inf')
       else:
         self.base_rel_stddev = ref_results[STDDEV] / ref_results[AVG]\
-            if ref_results > 0 else 0.0
+            if ref_results[AVG] > 0 else 0.0
 
       self.workload_name = '{0}({1})'.format(
           results[RESULT_LIST][0][QUERY][WORKLOAD_NAME].upper(),
@@ -479,17 +482,17 @@ class Report(object):
                  "({base_rel_stddev:.2%} -> {rel_stddev:.2%})\n")
 
       if self.significant_variability and ref_results:
-        #If ref_results do not exist, variability analysis will not be conducted
+        # If ref_results do not exist, variability analysis will not be conducted
         self.variability_str = variability_template.format(
-            workload_name = self.workload_name,
-            query_name = self.query_name,
-            file_format = self.file_format,
-            compression = self.compression,
-            base_rel_stddev = self.base_rel_stddev,
-            rel_stddev = self.rel_stddev)
+            workload_name=self.workload_name,
+            query_name=self.query_name,
+            file_format=self.file_format,
+            compression=self.compression,
+            base_rel_stddev=self.base_rel_stddev,
+            rel_stddev=self.rel_stddev)
 
         self.exec_summary_str = build_exec_summary_str(
-            results, ref_results, for_variability = True)
+            results, ref_results, for_variability=True)
       else:
         self.variability_str = str()
         self.exec_summary_str = str()
@@ -511,12 +514,12 @@ class Report(object):
     for workload_scale, workload in self.grouped.items():
       for file_format, queries in workload.items():
         if self.ref_grouped is not None and workload_scale in self.ref_grouped and\
-            file_format in self.ref_grouped[ workload_scale]:
+            file_format in self.ref_grouped[workload_scale]:
           ref_queries = self.ref_grouped[workload_scale][file_format]
           self.file_format_comparison_rows.append(Report.FileFormatComparisonRow(
             workload_scale, file_format, queries, ref_queries))
         else:
-          #If not present in reference results, set to None
+          # If not present in reference results, set to None
           ref_queries = None
         for query_name, results in queries.items():
           if self.ref_grouped is not None and workload_scale in self.ref_grouped and\
@@ -529,19 +532,19 @@ class Report(object):
             query_variability_row = Report.QueryVariabilityRow(results, ref_results)
             self.query_variability_rows.append(query_variability_row)
           else:
-            #If not present in reference results, set to None
+            # If not present in reference results, set to None
             ref_results = None
 
   def __str__(self):
     output = str()
 
-    #per file format analysis overview table
+    # per file format analysis overview table
     table = prettytable.PrettyTable(['Workload', 'File Format', 'Avg (s)', 'Delta(Avg)',
                                      'GeoMean(s)', 'Delta(GeoMean)'])
     table.float_format = '.2'
     table.align = 'l'
     self.file_format_comparison_rows.sort(
-        key = lambda row: row.delta_geomean, reverse = True)
+        key=lambda row: row.delta_geomean, reverse=True)
     for row in self.file_format_comparison_rows:
       table_row = [
           row.workload_name,
@@ -554,7 +557,7 @@ class Report(object):
 
     output += str(table) + '\n\n'
 
-    #main comparison table
+    # main comparison table
     detailed_performance_change_analysis_str = str()
     table = prettytable.PrettyTable(['Workload', 'Query', 'File Format', 'Avg(s)',
                                      'Base Avg(s)', 'Delta(Avg)', 'StdDev(%)',
@@ -562,7 +565,7 @@ class Report(object):
                                      'MW Zval', 'Tval'])
     table.float_format = '.2'
     table.align = 'l'
-    #Sort table from worst to best regression
+    # Sort table from worst to best regression
     self.query_comparison_rows.sort(key=lambda row: row.delta_avg + row.median_diff,
                                     reverse=True)
     for row in self.query_comparison_rows:
@@ -593,7 +596,7 @@ class Report(object):
     output += detailed_performance_change_analysis_str
 
     variability_analysis_str = str()
-    self.query_variability_rows.sort(key = lambda row: row.rel_stddev, reverse = True)
+    self.query_variability_rows.sort(key=lambda row: row.rel_stddev, reverse=True)
     for row in self.query_variability_rows:
       variability_analysis_str += str(row)
 
@@ -608,6 +611,7 @@ class Report(object):
 
     return output
 
+
 class CombinedExecSummaries(object):
   """All execution summaries for each query are combined into this object.
 
@@ -714,7 +718,7 @@ class CombinedExecSummaries(object):
     table.float_format = '.2'
 
     for row in self.rows:
-      table_row = [ row[PREFIX] + row[OPERATOR],
+      table_row = [row[PREFIX] + row[OPERATOR],
           prettyprint_values(row[NUM_HOSTS]),
           prettyprint_values(row[NUM_INSTANCES]),
           prettyprint_time(row[AVG_TIME]),
@@ -806,7 +810,7 @@ class ExecSummaryComparison(object):
   CombinedExecSummaries.compare(reference).
   """
 
-  def __init__(self, combined_summary, ref_combined_summary, for_variability = False):
+  def __init__(self, combined_summary, ref_combined_summary, for_variability=False):
 
     # Store the original summaries, in case we can't build a comparison
     self.combined_summary = combined_summary
@@ -927,7 +931,7 @@ class ExecSummaryComparison(object):
           prettyprint_values(row[NUM_HOSTS]),
           prettyprint_values(row[NUM_INSTANCES]),
           prettyprint_values(row[NUM_ROWS]),
-          prettyprint_values(row[EST_NUM_ROWS]) ]
+          prettyprint_values(row[EST_NUM_ROWS])]
 
       table_contains_at_least_one_row = True
       table.add_row(table_row)
@@ -942,8 +946,8 @@ class ExecSummaryComparison(object):
     def is_significant(row):
       """Check if the performance change in the row was significant"""
       return options.output_all_summary_nodes or (
-        row[MAX_TIME] > 100000000 and
-        row[PERCENT_OF_QUERY] > 0.02)
+        row[MAX_TIME] > 100000000
+        and row[PERCENT_OF_QUERY] > 0.02)
 
     self.__build_rows()
     if self.error_str:
@@ -993,10 +997,12 @@ class ExecSummaryComparison(object):
 
     return str(table)
 
+
 def calculate_change(val, ref_val):
   """Calculate how big the change in val compared to ref_val is compared to total"""
   return (val - ref_val) / ref_val if ref_val != 0 else 0.0
 
+
 def prettyprint(val, units, divisor):
   """ Print a value in human readable format along with it's unit.
 
@@ -1016,15 +1022,19 @@ def prettyprint(val, units, divisor):
         return "%3.2f%s" % (val, unit)
     val /= divisor
 
+
 def prettyprint_bytes(byte_val):
   return prettyprint(byte_val, ['B', 'KB', 'MB', 'GB', 'TB'], 1024.0)
 
+
 def prettyprint_values(unit_val):
   return prettyprint(unit_val, ["", "K", "M", "B"], 1000.0)
 
+
 def prettyprint_time(time_val):
   return prettyprint(time_val, ["ns", "us", "ms", "s"], 1000.0)
 
+
 def prettyprint_percent(percent_val):
   return '{0:+.2%}'.format(percent_val)
 
@@ -1056,12 +1066,12 @@ def save_runtime_diffs(results, ref_results, change_significant, zval, tval):
       prefix = 'imp'
 
   runtime_profile_file_name = template.format(
-      prefix = prefix,
-      query_name = query[NAME],
-      scale_factor = query[SCALE_FACTOR],
-      file_format = query[TEST_VECTOR][FILE_FORMAT],
-      compression_codec = query[TEST_VECTOR][COMPRESSION_CODEC],
-      compression_type = query[TEST_VECTOR][COMPRESSION_TYPE])
+      prefix=prefix,
+      query_name=query[NAME],
+      scale_factor=query[SCALE_FACTOR],
+      file_format=query[TEST_VECTOR][FILE_FORMAT],
+      compression_codec=query[TEST_VECTOR][COMPRESSION_CODEC],
+      compression_type=query[TEST_VECTOR][COMPRESSION_TYPE])
 
   # Go into results dir
   dir_path = os.path.join(os.environ["IMPALA_HOME"], 'results')
@@ -1076,8 +1086,8 @@ def save_runtime_diffs(results, ref_results, change_significant, zval, tval):
   runtime_profile_diff = diff.make_file(
       ref_runtime_profile.splitlines(),
       runtime_profile.splitlines(),
-      fromdesc = "Baseline Runtime Profile",
-      todesc = "Current Runtime Profile")
+      fromdesc="Baseline Runtime Profile",
+      todesc="Current Runtime Profile")
 
   with open(runtime_profile_file_path, 'w+') as f:
     f.write(runtime_profile_diff)