Posted to commits@impala.apache.org by st...@apache.org on 2024/01/23 23:27:22 UTC

(impala) 01/02: IMPALA-12727: Reduce IO threads for non-TARGET_FILESYSTEM filesystems

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3072a2110a558357b328d20ec6bf546ece0251f3
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Thu Jan 4 20:28:26 2024 -0800

    IMPALA-12727: Reduce IO threads for non-TARGET_FILESYSTEM filesystems
    
    The DiskIoMgr starts a large number of threads for each different
    type of object store, most of which are idle. For development,
    this slows down processing minidumps and debugging with gdb.
    
    This adds an option "reduce_disk_io_threads" to bin/start-impala-cluster.py
    that sets the thread count startup parameter to one for any filesystem
    that is not the TARGET_FILESYSTEM. On a typical development setup
    running against HDFS, this reduces the number of DiskIoMgr threads
    by 150 and the HDFS monitoring threads by 150 as well. This option is
    enabled by default. It can be disabled by setting --reduce_disk_io_threads=False
    for bin/start-impala-cluster.py.
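    
    For example (a usage sketch based on the flag added in this change):
    
      # default behavior: threads reduced for non-TARGET_FILESYSTEM filesystems
      bin/start-impala-cluster.py
    
      # opt out and keep the full per-filesystem thread counts
      bin/start-impala-cluster.py --reduce_disk_io_threads=False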
    
    Separately, DiskIoMgr should be modified to reduce the number of
    threads it spawns in general.
    
    Testing:
     - Hand tested this on my local development system
    
    Change-Id: Ic8ee1fb1f9b9fe65d542d024573562b3bb120b76
    Reviewed-on: http://gerrit.cloudera.org:8080/20920
    Reviewed-by: Michael Smith <mi...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/start-impala-cluster.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/bin/start-impala-cluster.py b/bin/start-impala-cluster.py
index ea274dedd..7d2db963f 100755
--- a/bin/start-impala-cluster.py
+++ b/bin/start-impala-cluster.py
@@ -53,6 +53,7 @@ LOG.setLevel(level=logging.DEBUG)
 KUDU_MASTER_HOSTS = os.getenv("KUDU_MASTER_HOSTS", "127.0.0.1")
 DEFAULT_IMPALA_MAX_LOG_FILES = os.environ.get("IMPALA_MAX_LOG_FILES", 10)
 INTERNAL_LISTEN_HOST = os.getenv("INTERNAL_LISTEN_HOST", "localhost")
+TARGET_FILESYSTEM = os.getenv("TARGET_FILESYSTEM") or "hdfs"
 
 # Options
 parser = OptionParser()
@@ -168,6 +169,10 @@ parser.add_option("--enable_statestored_ha", dest="enable_statestored_ha",
                   action="store_true", default=False,
                   help="If true, enables StatestoreD HA - the cluster will be launched "
                   "with two statestored instances as Active-Passive HA pair.")
+parser.add_option("--reduce_disk_io_threads", default="True", type="choice",
+                  choices=["true", "True", "false", "False"],
+                  help="If true, reduce the number of disk io mgr threads for "
+                  "filesystems that are not the TARGET_FILESYSTEM.")
 
 # For testing: list of comma-separated delays, in milliseconds, that delay impalad catalog
 # replica initialization. The ith delay is applied to the ith impalad.
@@ -577,6 +582,34 @@ def build_impalad_arg_lists(cluster_size, num_coordinators, use_exclusive_coordi
               args=args, state_store_port=state_store_port,
               state_store_2_port=state_store_2_port)
 
+    if options.reduce_disk_io_threads.lower() == 'true':
+      # This leaves the default value for the TARGET_FILESYSTEM, but it reduces the thread
+      # count for every other filesystem that is not the TARGET_FILESYSTEM.
+      if TARGET_FILESYSTEM != 'abfs':
+        args = "{args} -num_abfs_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'adls':
+        args = "{args} -num_adls_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'cosn':
+        args = "{args} -num_cos_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'gs':
+        args = "{args} -num_gcs_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'hdfs':
+        args = "{args} -num_remote_hdfs_file_oper_io_threads=1".format(args=args)
+        args = "{args} -num_remote_hdfs_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'obs':
+        args = "{args} -num_obs_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'oss':
+        args = "{args} -num_oss_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 'ozone':
+        args = "{args} -num_ozone_io_threads=1".format(args=args)
+      if TARGET_FILESYSTEM != 's3':
+        args = "{args} -num_s3_io_threads=1".format(args=args)
+        args = "{args} -num_s3_file_oper_io_threads=1".format(args=args)
+
+      # SFS (single-file system) doesn't have a corresponding TARGET_FILESYSTEM, and
+      # it can always be restricted.
+      args = "{args} -num_sfs_io_threads=1".format(args=args)
+
     if "geospatial_library" not in args:
       args = "{args} -geospatial_library={geospatial_library}".format(
           args=args, geospatial_library=options.geospatial_library)
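
As a rough illustration (a sketch derived from the if-chain above): with
TARGET_FILESYSTEM=hdfs, the new block appends flags along the lines of

  -num_abfs_io_threads=1 -num_adls_io_threads=1 -num_cos_io_threads=1
  -num_gcs_io_threads=1 -num_obs_io_threads=1 -num_oss_io_threads=1
  -num_ozone_io_threads=1 -num_s3_io_threads=1 -num_s3_file_oper_io_threads=1
  -num_sfs_io_threads=1

to each impalad's argument list, while the remote HDFS thread-count flags are
left at their defaults.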