Posted to commits@impala.apache.org by kw...@apache.org on 2019/05/05 04:32:47 UTC

[impala] branch master updated (3fb3657 -> 25db9ea)

This is an automated email from the ASF dual-hosted git repository.

kwho pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from 3fb3657  IMPALA-5351: Support storing column comment of kudu table
     new 1423122  IMPALA-8472: Fix the refresh privilege workaround in Ranger
     new 748a3d5  Fix redundant downloads of hive source tarball
     new 23d7a6d  IMPALA-8492: reenable large string tests in docker
     new 25db9ea  IMPALA-8496: Fix flakiness of test_data_cache.py

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 bin/bootstrap_toolchain.py                         | 26 +++++++++------
 .../ranger/RangerAuthorizationChecker.java         |  5 ---
 .../ranger/RangerCatalogdAuthorizationManager.java |  4 ---
 .../queries/QueryTest/data-cache.test              |  4 ++-
 .../queries/QueryTest/large_strings.test           |  5 ++-
 tests/authorization/test_ranger.py                 |  3 ++
 tests/common/skip.py                               |  2 --
 tests/custom_cluster/test_data_cache.py            | 38 +++++++++++++++++++---
 tests/query_test/test_insert.py                    |  1 -
 tests/query_test/test_queries.py                   |  1 -
 10 files changed, 57 insertions(+), 32 deletions(-)


[impala] 02/04: Fix redundant downloads of hive source tarball

Posted by kw...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kwho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 748a3d57e5036934ea11723eea11e2535dd6fffa
Author: Vihang Karajgaonkar <vi...@cloudera.com>
AuthorDate: Thu May 2 13:14:46 2019 -0700

    Fix redundant downloads of hive source tarball
    
    Since CDP_BUILD_NUMBER was bumped to 1056671, the name of the hive
    source tarball changed. Not only did the tarball name change, but the
    directory it extracts to now also differs from the tarball name
    itself. Because of this, bootstrap_toolchain.py fails to detect that
    the downloaded hive source component already exists and downloads it
    again unnecessarily. This patch improves bootstrap_toolchain.py to
    handle non-standard tarballs that extract to a directory name
    different from the tar file's.
    
    Testing done:
    1. Removed the local toolchain and ran the script a couple of times
    to make sure that it downloads the hive tarball only once.
    
    Change-Id: Ifd04a1a367a0cc4aa0a2b490a45fbc93a862c83a
    Reviewed-on: http://gerrit.cloudera.org:8080/13219
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/bootstrap_toolchain.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index 6a6eeb7..f0b54f6 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -104,7 +104,7 @@ class Package(object):
 
 
 class CdpComponent(object):
-  def __init__(self, basename, makedir=False):
+  def __init__(self, basename, makedir=False, pkg_directory=None):
     """
     basename: the name of the file to be downloaded, without its .tar.gz suffix
     makedir: if false, it is assumed that the downloaded tarball will expand
@@ -115,6 +115,12 @@ class CdpComponent(object):
     """
     self.basename = basename
     self.makedir = makedir
+    cdp_components_home = os.environ.get("CDP_COMPONENTS_HOME")
+    if cdp_components_home is None:
+      raise Exception("CDP_COMPONENTS_HOME is not set. Cannot determine the "
+                      "component package directory")
+    self.pkg_directory = "{0}/{1}".format(cdp_components_home,
+                            pkg_directory if pkg_directory else basename)
 
 
 def try_get_platform_release_label():
@@ -449,8 +455,8 @@ def download_cdp_components(cdp_components, url_prefix):
     os.makedirs(cdp_components_home)
 
   def download(component):
-    pkg_directory = "{0}/{1}".format(cdp_components_home, component.basename)
-    if os.path.isdir(pkg_directory): return
+
+    if os.path.isdir(component.pkg_directory): return
     file_name = "{0}.tar.gz".format(component.basename)
     download_path = "{0}/{1}".format(url_prefix, file_name)
     dst = cdp_components_home
@@ -461,14 +467,14 @@ def download_cdp_components(cdp_components, url_prefix):
       wget_and_unpack_package(download_path, file_name, dst, False)
     except:  # noqa
       # Clean up any partially-unpacked result.
-      if os.path.isdir(pkg_directory):
-        shutil.rmtree(pkg_directory)
+      if os.path.isdir(component.pkg_directory):
+        shutil.rmtree(component.pkg_directory)
       # Clean up any temp directory if we made one
       if component.makedir:
         shutil.rmtree(dst)
       raise
     if component.makedir:
-      os.rename(dst, pkg_directory)
+      os.rename(dst, component.pkg_directory)
 
   execute_many(download, cdp_components)
 
@@ -570,10 +576,10 @@ if __name__ == "__main__":
   ]
   use_cdp_hive = os.getenv("USE_CDP_HIVE") == "true"
   if use_cdp_hive:
-    cdp_components.append(CdpComponent("hive-{0}-source"
-                          .format(os.environ.get("IMPALA_HIVE_VERSION")))),
-    cdp_components.append(CdpComponent("apache-hive-{0}-bin"
-                          .format(os.environ.get("IMPALA_HIVE_VERSION")))),
+    hive_version = os.environ.get("IMPALA_HIVE_VERSION")
+    cdp_components.append(CdpComponent("hive-{0}-source".format(hive_version),
+                          pkg_directory="hive-{0}".format(hive_version))),
+    cdp_components.append(CdpComponent("apache-hive-{0}-bin".format(hive_version))),
     cdp_components.append(CdpComponent(
         "tez-{0}-minimal".format(os.environ.get("IMPALA_TEZ_VERSION")),
         makedir=True))
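
The crux of the fix: the "already downloaded?" check now keys off the
directory the tarball actually extracts to, which may differ from the
tarball basename, instead of assuming the two match. A brief usage sketch
of the new pkg_directory parameter (the version string below is
hypothetical, purely for illustration; CDP_COMPONENTS_HOME must be set):

    # Hypothetical Hive version string, for illustration only.
    hive_version = "3.1.0.6.0.99.0-147"

    # The source tarball "hive-<version>-source.tar.gz" extracts to
    # "hive-<version>", so the extracted directory is given explicitly:
    src = CdpComponent("hive-{0}-source".format(hive_version),
                       pkg_directory="hive-{0}".format(hive_version))

    # The binary tarball extracts to a directory matching its basename,
    # so the default (pkg_directory = basename) suffices:
    binary = CdpComponent("apache-hive-{0}-bin".format(hive_version))

Note that CdpComponent now raises at construction time if
CDP_COMPONENTS_HOME is unset, since the package directory cannot be
resolved without it.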


[impala] 04/04: IMPALA-8496: Fix flakiness of test_data_cache.py

Posted by kw...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kwho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 25db9ea8f3f4de3086610ccb6040a101691e6340
Author: Michael Ho <kw...@cloudera.com>
AuthorDate: Sat May 4 01:09:55 2019 -0700

    IMPALA-8496: Fix flakiness of test_data_cache.py
    
    test_data_cache.py was added as part of IMPALA-8341 to verify
    that the DataCache hit / miss counts and the DataCache metrics
    are working as expected. The test seems to fail intermittently
    due to unexpected cache misses.
    
    Part of the test creates a temp table from tpch_parquet.lineitem
    and uses it to join against tpch_parquet.lineitem itself on the
    l_orderkey column. The test expects a complete cache hit for
    tpch_parquet.lineitem when joining against the temp table, as it
    should have been cached entirely as part of the CTAS statement.
    However, this doesn't always work as expected. In particular, the
    data cache internally divides up the key space into multiple shards,
    and a key is hashed to determine the shard it belongs to. By default,
    the number of shards equals the number of CPU cores (e.g. 16 for an
    AWS m5.4xlarge instance). Since the cache size is set to 500MB, each
    shard has a capacity of only ~31MB. In some cases, some rows of
    l_orderkey are evicted if the shard they belong to grows beyond 31MB.
    The problem is not deterministic because part of the cache key is the
    modification time of the file, which changes from run to run as it's
    essentially determined by the data loading time of the job. This
    leads to flakiness in the test.
    
    To fix this problem, this patch forces the data cache to use a single
    shard only, for determinism. In addition, the test is now skipped for
    non-HDFS and HDFS erasure coding builds, as it depends on the scan
    range assignment. To exercise the cache more extensively, the plan is
    to enable it by default for S3 builds instead of relying on BE and
    E2E tests only.
    
    Testing done:
    - Ran test_data_cache.py 10+ times, each with a different mtime
      for tpch_parquet.lineitem; it previously failed 2 out of 3 runs.
    
    Change-Id: I98d5b8fa1d3fb25682a64bffaf56d751a140e4c9
    Reviewed-on: http://gerrit.cloudera.org:8080/13242
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../queries/QueryTest/data-cache.test              |  4 ++-
 tests/custom_cluster/test_data_cache.py            | 38 +++++++++++++++++++---
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/testdata/workloads/functional-query/queries/QueryTest/data-cache.test b/testdata/workloads/functional-query/queries/QueryTest/data-cache.test
index 4cd1062..3f0564e 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/data-cache.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/data-cache.test
@@ -9,6 +9,8 @@ row_regex: .*DataCacheMissCount: 64 \(64\).*
 ====
 ---- QUERY
 select count(*) from tpch_parquet.lineitem t1, test_parquet t2 where t1.l_orderkey = t2.l_orderkey;
+---- RESULTS
+30012985
 ---- RUNTIME_PROFILE
 # Expect cache hits for t1 and cache misses for t2.
 row_regex: .*DataCacheHitCount: 6 \(6\).*
@@ -36,7 +38,7 @@ row_regex: .*DataCachePartialHitCount: 0 \(0\).*
 insert overwrite test_parquet select * from tpch_parquet.lineitem where l_shipmode = 'AIR';
 ====
 ---- QUERY
-# Verifies that stale data from the cache is not used due to change in mtime.
+# Verifies that stale data from the cache is not used.
 select count(distinct l_orderkey) from test_parquet;
 ---- RESULTS
 652393
diff --git a/tests/custom_cluster/test_data_cache.py b/tests/custom_cluster/test_data_cache.py
index f8db6b0..88464bb 100644
--- a/tests/custom_cluster/test_data_cache.py
+++ b/tests/custom_cluster/test_data_cache.py
@@ -18,11 +18,16 @@
 import pytest
 
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+from tests.common.skip import SkipIf, SkipIfEC
 
 
+@SkipIf.not_hdfs
+@SkipIfEC.scheduling
 class TestDataCache(CustomClusterTestSuite):
   """ This test enables the data cache and verfies that cache hit and miss counts
-  in the runtime profile and metrics are as expected.
+  in the runtime profile and metrics are as expected. Run on non-EC HDFS only as
+  this test checks the number of data cache hit counts, which implicitly relies
+  on the scheduler's behavior and number of HDFS blocks.
   """
   @classmethod
   def get_workload(self):
@@ -30,14 +35,37 @@ class TestDataCache(CustomClusterTestSuite):
 
   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(
-      impalad_args="--always_use_data_cache=true --data_cache_write_concurrency=64",
+      impalad_args="--always_use_data_cache=true --data_cache_write_concurrency=64"
+      " --cache_force_single_shard=true",
       start_args="--data_cache_dir=/tmp --data_cache_size=500MB", cluster_size=1)
-  def test_data_cache(self, vector, unique_database):
+  def test_data_cache_deterministic(self, vector, unique_database):
       """ This test creates a temporary table from another table, overwrites it with
-      some other data and verifies that no stale data is read from the cache. Runs
-      with a single node to make it easier to verify the runtime profile.  """
+      some other data and verifies that no stale data is read from the cache. Runs with
+      a single node to make it easier to verify the runtime profile. Also enables higher
+      write concurrency and uses a single shard to avoid non-determinism.
+      """
       self.run_test_case('QueryTest/data-cache', vector, unique_database)
       assert self.get_metric('impala-server.io-mgr.remote-data-cache-dropped-bytes') >= 0
       assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-bytes') > 0
       assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') > 0
       assert self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') > 0
+
+  @pytest.mark.execute_serially
+  @CustomClusterTestSuite.with_args(
+      impalad_args="--always_use_data_cache=true",
+      start_args="--data_cache_dir=/tmp --data_cache_size=500MB", cluster_size=1)
+  def test_data_cache(self, vector):
+      """ This test scans the same table twice and verifies the cache hit count metrics
+      are correct. The exact number of bytes hit is non-deterministic between runs due
+      to different mtime of files and multiple shards in the cache.
+      """
+      QUERY = "select * from tpch_parquet.lineitem"
+      # Do a first run to warm up the cache. Expect no hits.
+      self.execute_query(QUERY)
+      assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-bytes') == 0
+      assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') > 0
+      assert self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') > 0
+
+      # Do a second run. Expect some hits.
+      self.execute_query(QUERY)
+      assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-bytes') > 0


[impala] 03/04: IMPALA-8492: reenable large string tests in docker

Posted by kw...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kwho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 23d7a6dce6ce4287edf20e3f63b225822cb88598
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Fri May 3 12:12:41 2019 -0700

    IMPALA-8492: reenable large string tests in docker
    
    IMPALA-4865 is fixed so these now pass. I noticed
    that the IMPALA-4874 test occasionally hit
    "Memory Limit Exceeded" when looped, so I reduced
    the data size there slightly.
    
    Testing:
    Looped the tests locally against a dockerised minicluster
    for a while.
    
    Change-Id: I030f4eff2d3fb771fc92b760efb13170e68285dc
    Reviewed-on: http://gerrit.cloudera.org:8080/13233
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../workloads/functional-query/queries/QueryTest/large_strings.test  | 5 ++---
 tests/common/skip.py                                                 | 2 --
 tests/query_test/test_insert.py                                      | 1 -
 tests/query_test/test_queries.py                                     | 1 -
 4 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/testdata/workloads/functional-query/queries/QueryTest/large_strings.test b/testdata/workloads/functional-query/queries/QueryTest/large_strings.test
index 0cca71b..9df5a50 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/large_strings.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/large_strings.test
@@ -219,12 +219,11 @@ from (
 select cast(fnv_hash(concat(l_comment, 'a')) as string) as h from tpch_parquet.lineitem union all
 select cast(fnv_hash(concat(l_comment, 'b')) as string) as h from tpch_parquet.lineitem union all
 select cast(fnv_hash(concat(l_comment, 'c')) as string) as h from tpch_parquet.lineitem union all
-select cast(fnv_hash(concat(l_comment, 'd')) as string) as h from tpch_parquet.lineitem union all
-select cast(fnv_hash(concat(l_comment, 'e')) as string) as h from tpch_parquet.lineitem) a;
+select cast(fnv_hash(concat(l_comment, 'd')) as string) as h from tpch_parquet.lineitem) a;
 ---- TYPES
 INT,INT
 ---- RESULTS
-611468161,611468161
+489174530,489174530
 =====
 ---- QUERY
 select repeat('the quick brown fox', 1024 * 1024 * 100)
diff --git a/tests/common/skip.py b/tests/common/skip.py
index cad1caa..646695b 100644
--- a/tests/common/skip.py
+++ b/tests/common/skip.py
@@ -192,8 +192,6 @@ class SkipIfDockerizedCluster:
       IS_DOCKERIZED_TEST_CLUSTER, reason="Daemon logs not exposed in host.")
   accesses_host_filesystem = pytest.mark.skipif(
       IS_DOCKERIZED_TEST_CLUSTER, reason="Daemon would need to access host filesystem.")
-  jvm_oom_large_string = pytest.mark.skipif(IS_DOCKERIZED_TEST_CLUSTER,
-      reason="IMPALA-4865: JVM hits OOM for large string. Heap is smaller in docker.")
   insert_acls = pytest.mark.skipif(IS_DOCKERIZED_TEST_CLUSTER,
       reason="IMPALA-8384: insert ACL tests are broken on dockerised minicluster.")
 
diff --git a/tests/query_test/test_insert.py b/tests/query_test/test_insert.py
index a55fdbe..5014156 100644
--- a/tests/query_test/test_insert.py
+++ b/tests/query_test/test_insert.py
@@ -80,7 +80,6 @@ class TestInsertQueries(ImpalaTestSuite):
             v.get_value('compression_codec') == 'none'))
 
   @pytest.mark.execute_serially
-  @SkipIfDockerizedCluster.jvm_oom_large_string
   def test_insert_large_string(self, vector, unique_database):
     """Test handling of large strings in inserter and scanner."""
     if "-Xcheck:jni" in os.environ.get("LIBHDFS_OPTS", ""):
diff --git a/tests/query_test/test_queries.py b/tests/query_test/test_queries.py
index 2a44fce..d156212 100644
--- a/tests/query_test/test_queries.py
+++ b/tests/query_test/test_queries.py
@@ -204,7 +204,6 @@ class TestQueriesParquetTables(ImpalaTestSuite):
     return 'functional-query'
 
   @SkipIfEC.oom
-  @SkipIfDockerizedCluster.jvm_oom_large_string
   @pytest.mark.execute_serially
   def test_very_large_strings(self, vector):
     """Regression test for IMPALA-1619. Doesn't need to be run on all file formats.


[impala] 01/04: IMPALA-8472: Fix the refresh privilege workaround in Ranger

Posted by kw...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kwho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 1423122945a901792c205d1532593f36c5fd2f87
Author: Fredy Wijaya <fw...@cloudera.com>
AuthorDate: Fri May 3 08:06:31 2019 -0700

    IMPALA-8472: Fix the refresh privilege workaround in Ranger
    
    This patch replaces the refresh privilege workaround with the refresh
    access type properly defined in Ranger (see RANGER-2374).
    
    Testing:
    - Ran all FE tests
    - Ran all E2E authorization tests
    
    Change-Id: Ica8216353b10ad7366a2b5b7b7d86a4e0af844f8
    Reviewed-on: http://gerrit.cloudera.org:8080/13229
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../impala/authorization/ranger/RangerAuthorizationChecker.java      | 5 -----
 .../authorization/ranger/RangerCatalogdAuthorizationManager.java     | 4 ----
 tests/authorization/test_ranger.py                                   | 3 +++
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/authorization/ranger/RangerAuthorizationChecker.java b/fe/src/main/java/org/apache/impala/authorization/ranger/RangerAuthorizationChecker.java
index 1a0f2df..b938bee 100644
--- a/fe/src/main/java/org/apache/impala/authorization/ranger/RangerAuthorizationChecker.java
+++ b/fe/src/main/java/org/apache/impala/authorization/ranger/RangerAuthorizationChecker.java
@@ -57,7 +57,6 @@ public class RangerAuthorizationChecker extends AuthorizationChecker {
 
   // These are Ranger access types (privileges).
   public static final String UPDATE_ACCESS_TYPE = "update";
-  public static final String REFRESH_ACCESS_TYPE = "read";
   public static final String SELECT_ACCESS_TYPE = "select";
 
   private final RangerDefaultAuditHandler auditHandler_;
@@ -266,10 +265,6 @@ public class RangerAuthorizationChecker extends AuthorizationChecker {
     } else if (privilege == Privilege.INSERT) {
       // Ranger plugin for Hive considers INSERT to be UPDATE.
       accessType = UPDATE_ACCESS_TYPE;
-    } else if (privilege == Privilege.REFRESH) {
-      // TODO: this is a hack. It will need to be fixed once refresh is added into Hive
-      // service definition.
-      accessType = REFRESH_ACCESS_TYPE;
     } else {
       accessType = privilege.name().toLowerCase();
     }
diff --git a/fe/src/main/java/org/apache/impala/authorization/ranger/RangerCatalogdAuthorizationManager.java b/fe/src/main/java/org/apache/impala/authorization/ranger/RangerCatalogdAuthorizationManager.java
index 87ca927..ad7d02d 100644
--- a/fe/src/main/java/org/apache/impala/authorization/ranger/RangerCatalogdAuthorizationManager.java
+++ b/fe/src/main/java/org/apache/impala/authorization/ranger/RangerCatalogdAuthorizationManager.java
@@ -277,10 +277,6 @@ public class RangerCatalogdAuthorizationManager implements AuthorizationManager
 
     if (level == TPrivilegeLevel.INSERT) {
       request.getAccessTypes().add(RangerAuthorizationChecker.UPDATE_ACCESS_TYPE);
-    } else if (level == TPrivilegeLevel.REFRESH) {
-      // TODO: this is a hack. It will need to be fixed once refresh is added into Hive
-      // service definition.
-      request.getAccessTypes().add(RangerAuthorizationChecker.REFRESH_ACCESS_TYPE);
     } else {
       request.getAccessTypes().add(level.name().toLowerCase());
     }
diff --git a/tests/authorization/test_ranger.py b/tests/authorization/test_ranger.py
index b1e62c1..b66cc58 100644
--- a/tests/authorization/test_ranger.py
+++ b/tests/authorization/test_ranger.py
@@ -214,16 +214,19 @@ class TestRanger(CustomClusterTestSuite):
         ["USER", user, "", "", "", "*", "", "create", "false"],
         ["USER", user, "", "", "", "*", "", "drop", "false"],
         ["USER", user, "", "", "", "*", "", "insert", "false"],
+        ["USER", user, "", "", "", "*", "", "refresh", "false"],
         ["USER", user, "", "", "", "*", "", "select", "false"],
         ["USER", user, "*", "", "", "", "*", "alter", "false"],
         ["USER", user, "*", "", "", "", "*", "create", "false"],
         ["USER", user, "*", "", "", "", "*", "drop", "false"],
         ["USER", user, "*", "", "", "", "*", "insert", "false"],
+        ["USER", user, "*", "", "", "", "*", "refresh", "false"],
         ["USER", user, "*", "", "", "", "*", "select", "false"],
         ["USER", user, "*", "*", "*", "", "", "alter", "false"],
         ["USER", user, "*", "*", "*", "", "", "create", "false"],
         ["USER", user, "*", "*", "*", "", "", "drop", "false"],
         ["USER", user, "*", "*", "*", "", "", "insert", "false"],
+        ["USER", user, "*", "*", "*", "", "", "refresh", "false"],
         ["USER", user, "*", "*", "*", "", "", "select", "false"]])
 
       admin_client.execute("grant all on server to user {0}".format(user))
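
After this change, only INSERT still needs special-casing (the Ranger plugin
for Hive calls it "update"); REFRESH now falls through to the default mapping
like any other privilege, since RANGER-2374 defines a native "refresh" access
type in the service definition. A sketch of the resulting mapping, rendered
in Python for brevity (the authoritative logic is the Java shown above):

    UPDATE_ACCESS_TYPE = "update"

    def to_ranger_access_type(privilege_name):
        # The Ranger plugin for Hive considers INSERT to be UPDATE.
        if privilege_name == "INSERT":
            return UPDATE_ACCESS_TYPE
        # Everything else, including REFRESH, maps to its lowercase name.
        return privilege_name.lower()

    assert to_ranger_access_type("REFRESH") == "refresh"
    assert to_ranger_access_type("INSERT") == "update"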