You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2024/01/23 23:27:22 UTC
(impala) 01/02: IMPALA-12727: Reduce IO threads for non-TARGET_FILESYSTEM filesystems
This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 3072a2110a558357b328d20ec6bf546ece0251f3
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Thu Jan 4 20:28:26 2024 -0800
IMPALA-12727: Reduce IO threads for non-TARGET_FILESYSTEM filesystems
The DiskIoMgr starts a large number of threads for each different
type of object store, most of which are idle. For development,
this slows down processing minidumps and debugging with gdb.
This adds an option "reduce_disk_io_threads" to bin/start-impala-cluster.py
that sets the thread count startup parameter to one for any filesystem
that is not the TARGET_FILESYSTEM. On a typical development setup
running against HDFS, this reduces the number of DiskIoMgr threads
by 150 and the HDFS monitoring threads by 150 as well. This option is
enabled by default. It can be disabled by setting --reduce_disk_io_threads=False
for bin/start-impala-cluster.py.
Separately, DiskIoMgr should be modified to reduce the number of
threads it spawns in general.
Testing:
- Hand tested this on my local development system
Change-Id: Ic8ee1fb1f9b9fe65d542d024573562b3bb120b76
Reviewed-on: http://gerrit.cloudera.org:8080/20920
Reviewed-by: Michael Smith <mi...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
bin/start-impala-cluster.py | 33 +++++++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
diff --git a/bin/start-impala-cluster.py b/bin/start-impala-cluster.py
index ea274dedd..7d2db963f 100755
--- a/bin/start-impala-cluster.py
+++ b/bin/start-impala-cluster.py
@@ -53,6 +53,7 @@ LOG.setLevel(level=logging.DEBUG)
KUDU_MASTER_HOSTS = os.getenv("KUDU_MASTER_HOSTS", "127.0.0.1")
DEFAULT_IMPALA_MAX_LOG_FILES = os.environ.get("IMPALA_MAX_LOG_FILES", 10)
INTERNAL_LISTEN_HOST = os.getenv("INTERNAL_LISTEN_HOST", "localhost")
+TARGET_FILESYSTEM = os.getenv("TARGET_FILESYSTEM") or "hdfs"
# Options
parser = OptionParser()
@@ -168,6 +169,10 @@ parser.add_option("--enable_statestored_ha", dest="enable_statestored_ha",
action="store_true", default=False,
help="If true, enables StatestoreD HA - the cluster will be launched "
"with two statestored instances as Active-Passive HA pair.")
+parser.add_option("--reduce_disk_io_threads", default="True", type="choice",
+ choices=["true", "True", "false", "False"],
+ help="If true, reduce the number of disk io mgr threads for "
+ "filesystems that are not the TARGET_FILESYSTEM.")
# For testing: list of comma-separated delays, in milliseconds, that delay impalad catalog
# replica initialization. The ith delay is applied to the ith impalad.
@@ -577,6 +582,34 @@ def build_impalad_arg_lists(cluster_size, num_coordinators, use_exclusive_coordi
args=args, state_store_port=state_store_port,
state_store_2_port=state_store_2_port)
+ if options.reduce_disk_io_threads.lower() == 'true':
+ # This leaves the default value for the TARGET_FILESYSTEM, but it reduces the thread
+ # count for every other filesystem that is not the TARGET_FILESYSTEM.
+ if TARGET_FILESYSTEM != 'abfs':
+ args = "{args} -num_abfs_io_threads=1".format(args=args)
+ if TARGET_FILESYSTEM != 'adls':
+ args = "{args} -num_adls_io_threads=1".format(args=args)
+ if TARGET_FILESYSTEM != 'cosn':
+ args = "{args} -num_cos_io_threads=1".format(args=args)
+ if TARGET_FILESYSTEM != 'gs':
+ args = "{args} -num_gcs_io_threads=1".format(args=args)
+ if TARGET_FILESYSTEM != 'hdfs':
+ args = "{args} -num_remote_hdfs_file_oper_io_threads=1".format(args=args)
+ args = "{args} -num_remote_hdfs_io_threads=1".format(args=args)
+ if TARGET_FILESYSTEM != 'obs':
+ args = "{args} -num_obs_io_threads=1".format(args=args)
+ if TARGET_FILESYSTEM != 'oss':
+ args = "{args} -num_oss_io_threads=1".format(args=args)
+ if TARGET_FILESYSTEM != 'ozone':
+ args = "{args} -num_ozone_io_threads=1".format(args=args)
+ if TARGET_FILESYSTEM != 's3':
+ args = "{args} -num_s3_io_threads=1".format(args=args)
+ args = "{args} -num_s3_file_oper_io_threads=1".format(args=args)
+
+ # SFS (single-file system) doesn't have a corresponding TARGET_FILESYSTEM, and
+ # it can always be restricted.
+ args = "{args} -num_sfs_io_threads=1".format(args=args)
+
if "geospatial_library" not in args:
args = "{args} -geospatial_library={geospatial_library}".format(
args=args, geospatial_library=options.geospatial_library)