You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2024/01/23 23:27:21 UTC

(impala) branch master updated (ad0dc6748 -> 1415a979e)

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


    from ad0dc6748 IMPALA-12740: Fix TestHdfsJsonScanNodeErrors fails in exhaustive mode
     new 3072a2110 IMPALA-12727: Reduce IO threads for non-TARGET_FILESYSTEM filesystems
     new 1415a979e IMPALA-12743: Fix incremental stats are filtered out by HMS due to HIVE-27114

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 bin/start-impala-cluster.py            | 33 +++++++++++++++++++++++++++++++++
 fe/src/test/resources/hive-site.xml.py |  5 ++++-
 2 files changed, 37 insertions(+), 1 deletion(-)


(impala) 02/02: IMPALA-12743: Fix incremental stats are filtered out by HMS due to HIVE-27114

Posted by st...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 1415a979eb6f6196990434277bfc28e742acdad2
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Tue Jan 23 12:43:45 2024 +0800

    IMPALA-12743: Fix incremental stats are filtered out by HMS due to HIVE-27114
    
    HIVE-27114 adds a new property in hive-site.xml for HMS clients to
    filter out unwanted partition parameters:
      hive.metastore.partitions.parameters.exclude.pattern
    It defaults to "impala_intermediate_stats_chunk%". This excludes the
    incremental stats of Impala. Impala should set this to an empty string
    to get rid of the impact.
    
    Tests:
     - Ran CatalogTest#testPullIncrementalStats which failed when running on
       higher Hive versions that have HIVE-27114.
    
    Change-Id: I033e811f4e55b3af04f7a68c69b5779c72e4b053
    Reviewed-on: http://gerrit.cloudera.org:8080/20937
    Reviewed-by: Laszlo Gaal <la...@cloudera.com>
    Reviewed-by: Csaba Ringhofer <cs...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 fe/src/test/resources/hive-site.xml.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fe/src/test/resources/hive-site.xml.py b/fe/src/test/resources/hive-site.xml.py
index 563d32fc8..c5f377925 100644
--- a/fe/src/test/resources/hive-site.xml.py
+++ b/fe/src/test/resources/hive-site.xml.py
@@ -65,7 +65,10 @@ CONFIG.update({
   'hive.repl.bootstrap.dump.open.txn.timeout': '120s',
 
   # allow both hs2 and hs2-http protocols
-  'hive.server2.transport.mode': 'all'
+  'hive.server2.transport.mode': 'all',
+
+  # Don't filter out incremental stats of Impala. The default is "impala_intermediate_stats_chunk%".
+  'hive.metastore.partitions.parameters.exclude.pattern': '""',
 })
 
 if variant == 'changed_external_dir':


(impala) 01/02: IMPALA-12727: Reduce IO threads for non-TARGET_FILESYSTEM filesystems

Posted by st...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3072a2110a558357b328d20ec6bf546ece0251f3
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Thu Jan 4 20:28:26 2024 -0800

    IMPALA-12727: Reduce IO threads for non-TARGET_FILESYSTEM filesystems
    
    The DiskIoMgr starts a large number of threads for each different
    type of object store, most of which are idle. For development,
    this slows down processing minidumps and debugging with gdb.
    
    This adds an option "reduce_disk_io_threads" to bin/start-impala-cluster.py
    that sets the thread count startup parameter to one for any filesystem
    that is not the TARGET_FILESYSTEM. On a typical development setup
    running against HDFS, this reduces the number of DiskIoMgr threads
    by 150 and the HDFS monitoring threads by 150 as well. This option is
    enabled by default. It can be disabled by setting --reduce_disk_io_threads=False
    for bin/start-impala-cluster.py.
    
    Separately, DiskIoMgr should be modified to reduce the number of
    threads it spawns in general.
    
    Testing:
     - Hand tested this on my local development system
    
    Change-Id: Ic8ee1fb1f9b9fe65d542d024573562b3bb120b76
    Reviewed-on: http://gerrit.cloudera.org:8080/20920
    Reviewed-by: Michael Smith <mi...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/start-impala-cluster.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/bin/start-impala-cluster.py b/bin/start-impala-cluster.py
index ea274dedd..7d2db963f 100755
--- a/bin/start-impala-cluster.py
+++ b/bin/start-impala-cluster.py
@@ -53,6 +53,7 @@ LOG.setLevel(level=logging.DEBUG)
 KUDU_MASTER_HOSTS = os.getenv("KUDU_MASTER_HOSTS", "127.0.0.1")
 DEFAULT_IMPALA_MAX_LOG_FILES = os.environ.get("IMPALA_MAX_LOG_FILES", 10)
 INTERNAL_LISTEN_HOST = os.getenv("INTERNAL_LISTEN_HOST", "localhost")
+TARGET_FILESYSTEM = os.getenv("TARGET_FILESYSTEM") or "hdfs"
 
 # Options
 parser = OptionParser()
@@ -168,6 +169,10 @@ parser.add_option("--enable_statestored_ha", dest="enable_statestored_ha",
                   action="store_true", default=False,
                   help="If true, enables StatestoreD HA - the cluster will be launched "
                   "with two statestored instances as Active-Passive HA pair.")
+parser.add_option("--reduce_disk_io_threads", default="True", type="choice",
+                  choices=["true", "True", "false", "False"],
+                  help="If true, reduce the number of disk io mgr threads for "
+                  "filesystems that are not the TARGET_FILESYSTEM.")
 
 # For testing: list of comma-separated delays, in milliseconds, that delay impalad catalog
 # replica initialization. The ith delay is applied to the ith impalad.
@@ -577,6 +582,34 @@ def build_impalad_arg_lists(cluster_size, num_coordinators, use_exclusive_coordi
               args=args, state_store_port=state_store_port,
               state_store_2_port=state_store_2_port)
 
+    if options.reduce_disk_io_threads.lower() == 'true':
+      # This leaves the default value for the TARGET_FILESYSTEM, but it reduces the thread
+      # count for every other filesystem that is not the TARGET_FILESYSTEM.
+      if TARGET_FILESYSTEM != 'abfs':
+        args = "{args} -num_abfs_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'adls':
+        args = "{args} -num_adls_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'cosn':
+        args = "{args} -num_cos_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'gs':
+        args = "{args} -num_gcs_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'hdfs':
+        args = "{args} -num_remote_hdfs_file_oper_io_threads=1".format(args=args)
+        args = "{args} -num_remote_hdfs_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'obs':
+        args = "{args} -num_obs_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'oss':
+        args = "{args} -num_oss_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'ozone':
+        args = "{args} -num_ozone_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 's3':
+        args = "{args} -num_s3_io_threads=1".format(args=args)
+        args = "{args} -num_s3_file_oper_io_threads=1".format(args=args)
+
+      # SFS (single-file system) doesn't have a corresponding TARGET_FILESYSTEM, and
+      # it can always be restricted.
+      args = "{args} -num_sfs_io_threads=1".format(args=args)
+
     if "geospatial_library" not in args:
       args = "{args} -geospatial_library={geospatial_library}".format(
           args=args, geospatial_library=options.geospatial_library)