You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2022/09/19 21:11:44 UTC

[impala] branch master updated (5fd4e6a11 -> 19114c720)

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


    from 5fd4e6a11 IMPALA-11438: Add tests for CREATE TABLE LIKE PARQUET STORED AS ICEBERG
     new 17ee89f3d IMPALA-11573:  Certain methods used by the replanning feature can be improved
     new 190b5e41b IMPALA-11572: deflake test_mt_dop_skew_lpt
     new 19114c720 IMPALA-11578: Exclude locality test for remote FS

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../java/org/apache/impala/service/Frontend.java   | 19 ++++++-----
 tests/common/skip.py                               |  4 +++
 tests/custom_cluster/test_scheduler_locality.py    |  2 ++
 tests/query_test/test_scanners.py                  | 39 +++++++++++++---------
 4 files changed, 40 insertions(+), 24 deletions(-)


[impala] 02/03: IMPALA-11572: deflake test_mt_dop_skew_lpt

Posted by jo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 190b5e41b1e9d24a91432cc470c91e6fff84a041
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Mon Sep 12 15:49:46 2022 +0200

    IMPALA-11572: deflake test_mt_dop_skew_lpt
    
    test_mt_dop_skew_lpt was flaky. Also, it calculated the
    min(bytes_read) / max(bytes_read) globally across all fragment
    instances, not just among the intra-node fragment instances.
    
    To deflake the test, this patch:
     * calculates intra-node min(bytes_read) / max(bytes_read) ratios
       instead of global ones
     * prints out the ratios so we'll know the numbers when the test fails
     * eliminates the compression codec test dimension, which is not used anyway
    
    Change-Id: I823542c21fe8f10f43a501fe4175da883eaf2f99
    Reviewed-on: http://gerrit.cloudera.org:8080/18970
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/query_test/test_scanners.py | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 45865bee6..f49ebbd53 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -375,7 +375,8 @@ class TestHdfsScannerSkew(ImpalaTestSuite):
   def add_test_dimensions(cls):
     super(TestHdfsScannerSkew, cls).add_test_dimensions()
     cls.ImpalaTestMatrix.add_constraint(lambda v:
-        v.get_value('table_format').file_format in ('text'))
+        v.get_value('table_format').file_format in ('text') and
+        v.get_value('table_format').compression_codec == 'none')
 
   @SkipIfLocal.multiple_impalad
   def test_mt_dop_skew_lpt(self, vector, unique_database):
@@ -384,32 +385,41 @@ class TestHdfsScannerSkew(ImpalaTestSuite):
        load balancing with a shared queue between the instances. With IMPALA-11539
        the items in the queue are ordered by scan sizes from largest to smallest, i.e.
        we are doing Longest-Processing Time (LPT) scheduling."""
-    def bytes_read_statistics(profile):
+    def count_intra_node_skew(profile):
+      SKEW_THRESHOLD = 0.85
       lines = [line.strip() for line in profile.splitlines() if "- BytesRead: " in line]
       assert len(lines) == 7  # Averaged fragment + 6 fragment
-      min = None
-      max = None
+      bytes_read_array = []
       for i in range(1, len(lines)):
         # A line looks like:
         # - BytesRead: 202.77 MB (212617555)
         # we only need '212617555' from it
         bytes_read_str = re.findall(r'\((\d+)\)', lines[i])[0]
         bytes_read = int(bytes_read_str)
-        if min is None and max is None:
-          min = max = bytes_read
-          continue
-        if bytes_read < min: min = bytes_read
-        if bytes_read > max: max = bytes_read
-      return [min, max]
+        bytes_read_array.append(bytes_read)
+      count_skew = 0
+      # MT_DOP fragments are next to each other in the profile, so fragment instances
+      # belonging to a single executor starts at 0, 2, 4
+      for i in [0, 2, 4]:
+        a = bytes_read_array[i]
+        b = bytes_read_array[i + 1]
+        if a < b:
+          ratio = float(a) / float(b)
+        else:
+          ratio = float(b) / float(a)
+        print "Intra-node bytes read ratio:", ratio
+        if ratio < SKEW_THRESHOLD:
+          count_skew += 1
+      return count_skew
 
     tbl_name = unique_database + ".lineitem_skew"
     with self.create_impala_client() as imp_client:
       imp_client.set_configuration_option('mt_dop', '2')
       imp_client.execute("""create table {} like tpch.lineitem""".format(tbl_name))
       # Create a couple of small data files
-      for i in range(1, 5):
+      for i in range(1, 11):
         imp_client.execute("""insert into {} select * from tpch.lineitem
-                              where l_orderkey % 5 = 0""".format(tbl_name))
+                              where l_orderkey % 11 = 0""".format(tbl_name))
       # Create a couple of large files
       imp_client.execute("insert into {} select * from tpch.lineitem".format(tbl_name))
 
@@ -423,9 +433,8 @@ class TestHdfsScannerSkew(ImpalaTestSuite):
                       min(l_receiptdate),min(l_shipinstruct),min(l_shipmode),min(l_comment)
                from {}""".format(tbl_name))
         profile = results.runtime_profile
-        [min, max] = bytes_read_statistics(profile)
-        if float(min) / float(max) < 0.5: cnt_fail += 1
-      assert cnt_fail < 3
+        cnt_fail += count_intra_node_skew(profile)
+      assert cnt_fail <= 5
 
 
 class TestHudiParquet(ImpalaTestSuite):


[impala] 03/03: IMPALA-11578: Exclude locality test for remote FS

Posted by jo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 19114c7205318e8df158348a02695c378df1abe6
Author: Michael Smith <mi...@cloudera.com>
AuthorDate: Tue Sep 13 11:13:38 2022 -0700

    IMPALA-11578: Exclude locality test for remote FS
    
    Exclude test_scheduler_locality when the filesystem can only be remote.
    
    Change-Id: Ie6198421f21bc2520773ecbb34ffaf65969ebc43
    Reviewed-on: http://gerrit.cloudera.org:8080/18980
    Reviewed-by: Wenzhe Zhou <wz...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/common/skip.py                            | 4 ++++
 tests/custom_cluster/test_scheduler_locality.py | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/tests/common/skip.py b/tests/common/skip.py
index 8f380419f..055030a43 100644
--- a/tests/common/skip.py
+++ b/tests/common/skip.py
@@ -50,6 +50,10 @@ class SkipIfFS:
       reason="HDFS encryption is not supported")
   hdfs_block_size = pytest.mark.skipif(not IS_HDFS, reason="uses it's own block size")
   hdfs_acls = pytest.mark.skipif(not IS_HDFS, reason="HDFS acls are not supported")
+  # TODO: IMPALA-11584: see if this can be collapsed into SkipIfNotHdfsMinicluster
+  always_remote = pytest.mark.skipif(IS_EC or not (IS_HDFS or IS_OZONE)
+      or IMPALA_TEST_CLUSTER_PROPERTIES.is_remote_cluster(),
+      reason="Only HDFS and Ozone tests are run co-located")
 
   # Special case product limitations.
   empty_directory = pytest.mark.skipif(IS_S3,
diff --git a/tests/custom_cluster/test_scheduler_locality.py b/tests/custom_cluster/test_scheduler_locality.py
index bccbcfa82..0ab1dc301 100644
--- a/tests/custom_cluster/test_scheduler_locality.py
+++ b/tests/custom_cluster/test_scheduler_locality.py
@@ -19,12 +19,14 @@
 
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.common.network import get_external_ip
+from tests.common.skip import SkipIfFS
 
 
 LOCAL_ASSIGNMENTS_METRIC = "simple-scheduler.local-assignments.total"
 TOTAL_ASSIGNMENTS_METRIC = "simple-scheduler.assignments.total"
 
 
+@SkipIfFS.always_remote
 class TestSchedulerLocality(CustomClusterTestSuite):
   """Tests for local and remote disk scheduling."""
 


[impala] 01/03: IMPALA-11573: Certain methods used by the replanning feature can be improved

Posted by jo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 17ee89f3db74fce1045ea9bd8e9f9fe727bea70e
Author: Qifan Chen <qc...@cloudera.com>
AuthorDate: Fri Sep 9 18:29:20 2022 -0400

    IMPALA-11573:  Certain methods used by the replanning feature can be improved
    
    This patch improves certain methods used by the replan feature
    (IMPALA-10992) so that they can be called by the external frontend
    component in Hive. Specifically, the declaration of these methods
    becomes public static, and the initialization of a static data
    member checks whether the dependent object exists.
    
    Testing:
    1. Run unit tests;
    2. Run "core" tests.
    
    Change-Id: I334523f86e4292e9591306179eb1ab43be316c99
    Reviewed-on: http://gerrit.cloudera.org:8080/18968
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Quanlong Huang <hu...@gmail.com>
---
 .../main/java/org/apache/impala/service/Frontend.java | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/service/Frontend.java b/fe/src/main/java/org/apache/impala/service/Frontend.java
index f2044377e..99e2ae7f3 100644
--- a/fe/src/main/java/org/apache/impala/service/Frontend.java
+++ b/fe/src/main/java/org/apache/impala/service/Frontend.java
@@ -231,7 +231,8 @@ public class Frontend {
 
   // Maximum number of times to retry a query if it fails due to inconsistent metadata.
   private static final int INCONSISTENT_METADATA_NUM_RETRIES =
-      BackendConfig.INSTANCE.getLocalCatalogMaxFetchRetries();
+      (BackendConfig.INSTANCE != null) ?
+      BackendConfig.INSTANCE.getLocalCatalogMaxFetchRetries() : 0;
 
   // Maximum number of threads used to check authorization for the user when executing
   // show tables/databases.
@@ -1762,7 +1763,7 @@ public class Frontend {
    *
    * Also imposes the artificial two-executor groups for testing when needed.
    */
-  private List<TExecutorGroupSet> setupThresholdsForExecutorGroupSets(
+  public static List<TExecutorGroupSet> setupThresholdsForExecutorGroupSets(
       List<TExecutorGroupSet> executorGroupSets, String request_pool,
       boolean default_executor_group, boolean test_replan) throws ImpalaException {
     RequestPoolService poolService = RequestPoolService.getInstance();
@@ -1854,7 +1855,7 @@ public class Frontend {
   // Only the following types of statements are considered auto scalable since each
   // can be planned by the distributed planner utilizing the number of executors in
   // an executor group as input.
-  private boolean canStmtBeAutoScaled(TStmtType type) {
+  public static boolean canStmtBeAutoScaled(TStmtType type) {
     return type == TStmtType.EXPLAIN || type == TStmtType.QUERY || type == TStmtType.DML;
   }
 
@@ -1879,11 +1880,11 @@ public class Frontend {
       default_executor_group = e.getExec_group_name_prefix() == null
           || e.getExec_group_name_prefix().isEmpty();
     }
-
-    List<TExecutorGroupSet> executorGroupSetsToUse = setupThresholdsForExecutorGroupSets(
-        originalExecutorGroupSets, queryOptions.getRequest_pool(), default_executor_group,
-        enable_replan
-            && (RuntimeEnv.INSTANCE.isTestEnv() || queryOptions.isTest_replan()));
+    List<TExecutorGroupSet> executorGroupSetsToUse =
+        Frontend.setupThresholdsForExecutorGroupSets(originalExecutorGroupSets,
+            queryOptions.getRequest_pool(), default_executor_group,
+            enable_replan
+                && (RuntimeEnv.INSTANCE.isTestEnv() || queryOptions.isTest_replan()));
 
     int num_executor_group_sets = executorGroupSetsToUse.size();
     if (num_executor_group_sets == 0) {
@@ -1938,7 +1939,7 @@ public class Frontend {
       } else if (!enable_replan) {
         reason = "query option 'enable_replan' is false";
         break;
-      } else if (!canStmtBeAutoScaled(req.stmt_type)) {
+      } else if (!Frontend.canStmtBeAutoScaled(req.stmt_type)) {
         reason = "query is not auto-scalable";
         break;
       }