You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2023/12/11 07:13:56 UTC

(impala) 02/03: IMPALA-12595: Allow automatic removal of old logs from previous PID

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3381fbf761989ce46da4f83a952bd247dc652ff8
Author: Riza Suminto <ri...@cloudera.com>
AuthorDate: Mon Dec 4 14:25:35 2023 -0800

    IMPALA-12595: Allow automatic removal of old logs from previous PID
    
    IMPALA-11184 add code to target specific PID for log rotation. This
    align with glog behavior and grant safety. That is, it is strictly limit
    log rotation to only consider log files made by the currently running
    Impalad and exclude logs made by previous PID or other living-colocated
    Impalads. The downside of this limit is that logs can start accumulate
    in a node when impalad is frequently restarted and is only resolvable by
    admin doing manual log removal.
    
    To help avoid this manual removal, this patch adds a backend flag
    'log_rotation_match_pid' that relax the limit by dropping the PID in
    glob pattern. Default value for this new flag is False. However, for
    testing purpose, start-impala-cluster.py will override it to True since
    test minicluster logs to a common log directory. Setting
    'log_rotation_match_pid' to True will prevent one impalad from
    interfering with log rotation of other impalad in minicluster.
    
    As a minimum exercise for this new log rotation behavior,
    test_breakpad.py::TestLogging is modified to invoke
    start-impala-cluster.py with 'log_rotation_match_pid' set to False.
    
    Testing:
    - Add test_excessive_cerr_ignore_pid and test_excessive_cerr_match_pid.
    - Split TestLogging into two. One run test_excessive_cerr_ignore_pid in
      core exploration, while the other run the rest of logging tests in
      exhaustive exploration.
    - Pass exhaustive tests.
    
    Change-Id: I599799e73f27f941a1d7f3dec0f40b4f05ea5ceb
    Reviewed-on: http://gerrit.cloudera.org:8080/20754
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/common/global-flags.cc             |  6 ++
 be/src/common/logging.cc                  | 14 ++++-
 bin/start-impala-cluster.py               |  9 +++
 tests/common/custom_cluster_test_suite.py |  8 ++-
 tests/custom_cluster/test_breakpad.py     | 98 +++++++++++++++++++++++++------
 5 files changed, 112 insertions(+), 23 deletions(-)

diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc
index b7190ff48..906a99ab3 100644
--- a/be/src/common/global-flags.cc
+++ b/be/src/common/global-flags.cc
@@ -232,6 +232,12 @@ DEFINE_bool(redirect_stdout_stderr, true,
 DEFINE_int32(max_log_files, 10, "Maximum number of log files to retain per severity "
     "level. The most recent log files are retained. If set to 0, all log files are "
     "retained.");
+DEFINE_bool(log_rotation_match_pid, false,
+    "If set to True, Impala log rotation will only consider log files that match with "
+    "PID of currently running service. Otherwise, log rotation will ignore the PID in "
+    "log file names and may remove older log files from previous PID run. "
+    "Set to True if log files from prior run must be retained or when running multiple "
+    "instances of same service with common log directory. Default to False.");
 
 static const string re2_mem_limit_help_msg =
     "Maximum bytes of memory to be used by re2's regex engine "
diff --git a/be/src/common/logging.cc b/be/src/common/logging.cc
index 825767ffa..f05a7815c 100644
--- a/be/src/common/logging.cc
+++ b/be/src/common/logging.cc
@@ -44,6 +44,7 @@ DECLARE_string(redaction_rules_file);
 DECLARE_string(log_filename);
 DECLARE_bool(redirect_stdout_stderr);
 DECLARE_int32(max_log_size);
+DECLARE_bool(log_rotation_match_pid);
 
 using boost::uuids::random_generator;
 using impala::TUniqueId;
@@ -98,10 +99,17 @@ impala::Status ResolveLogSymlink(const string& symlink_path, string& canonical_p
 // We specifically target the base file name created by glog.
 // Glog's default base file name follow this pattern:
 // "<program name>.<hostname>.<user name>.log.<severity level>.<date>-<time>.<pid>"
+// IMPALA-12595: FLAGS_log_rotation_match_pid is added to control whether to match against
+// PID or not.
 inline string GlobPatternForLog(google::LogSeverity severity) {
-  return strings::Substitute("$0/$1*.log.$2.*.$3", FLAGS_log_dir,
-      google::ProgramInvocationShortName(), google::GetLogSeverityName(severity),
-      getpid());
+  if (FLAGS_log_rotation_match_pid) {
+    return strings::Substitute("$0/$1*.log.$2.*.$3", FLAGS_log_dir,
+        google::ProgramInvocationShortName(), google::GetLogSeverityName(severity),
+        getpid());
+  } else {
+    return strings::Substitute("$0/$1*.log.$2.*", FLAGS_log_dir,
+        google::ProgramInvocationShortName(), google::GetLogSeverityName(severity));
+  }
 }
 
 impala::Status GetLatestCanonicalLogPath(
diff --git a/bin/start-impala-cluster.py b/bin/start-impala-cluster.py
index 4fb86f224..ea274dedd 100755
--- a/bin/start-impala-cluster.py
+++ b/bin/start-impala-cluster.py
@@ -105,6 +105,10 @@ parser.add_option("--max_log_files", default=DEFAULT_IMPALA_MAX_LOG_FILES,
                   help="Max number of log files before rotation occurs.")
 parser.add_option("--log_level", type="int", dest="log_level", default=1,
                    help="Set the impalad backend logging level")
+parser.add_option("--ignore_pid_on_log_rotation", dest="ignore_pid_on_log_rotation",
+                  action='store_true', default=False,
+                  help=("Determine if log rotation should ignore or match PID in "
+                        "log file name."))
 parser.add_option("--jvm_args", dest="jvm_args", default="",
                   help="Additional arguments to pass to the JVM(s) during startup.")
 parser.add_option("--env_vars", dest="env_vars", default="",
@@ -282,6 +286,11 @@ def build_logging_args(service_name):
   logging"""
   result = ["-logbufsecs=5", "-v={0}".format(options.log_level),
       "-max_log_files={0}".format(options.max_log_files)]
+  if not options.ignore_pid_on_log_rotation:
+    # IMPALA-12595: ignore_pid_on_log_rotation default to False in this script.
+    # This is because multiple impalads still logs to the same log_dir in minicluster
+    # and we want to keep all logs for debugging purpose.
+    result += ["-log_rotation_match_pid=true"]
   if options.docker_network is None:
     # Impala inside a docker container should always log to the same location.
     result += ["-log_filename={0}".format(service_name),
diff --git a/tests/common/custom_cluster_test_suite.py b/tests/common/custom_cluster_test_suite.py
index 17be12ad3..104be6c09 100644
--- a/tests/common/custom_cluster_test_suite.py
+++ b/tests/common/custom_cluster_test_suite.py
@@ -292,7 +292,8 @@ class CustomClusterTestSuite(ImpalaTestSuite):
                             expected_subscribers=0,
                             default_query_options=None,
                             statestored_timeout_s=60,
-                            impalad_timeout_s=60):
+                            impalad_timeout_s=60,
+                            ignore_pid_on_log_rotation=False):
     cls.impala_log_dir = impala_log_dir
     # We ignore TEST_START_CLUSTER_ARGS here. Custom cluster tests specifically test that
     # certain custom startup arguments work and we want to keep them independent of dev
@@ -304,6 +305,11 @@ class CustomClusterTestSuite(ImpalaTestSuite):
            '--log_dir=%s' % impala_log_dir,
            '--log_level=%s' % log_level]
 
+    if ignore_pid_on_log_rotation:
+      # IMPALA-12595: Ignore PID on log rotation for all custom cluster tests.
+      # Most of test in custom_cluster need to match PID, except some test for logging.
+      cmd.append('--ignore_pid_on_log_rotation')
+
     if use_exclusive_coordinators:
       cmd.append("--use_exclusive_coordinators")
 
diff --git a/tests/custom_cluster/test_breakpad.py b/tests/custom_cluster/test_breakpad.py
index 38c827588..81173e6a8 100644
--- a/tests/custom_cluster/test_breakpad.py
+++ b/tests/custom_cluster/test_breakpad.py
@@ -388,15 +388,12 @@ class TestBreakpadExhaustive(TestBreakpadBase):
     assert reduced_minidump_size < full_minidump_size
 
 
-class TestLogging(TestBreakpadBase):
-  """Exhaustive tests to check that impala log is rolled periodically, obeying
-  max_log_size and max_log_files, even in the presence of heavy stderr writing.
-  """
+class TestLoggingBase(TestBreakpadBase):
+  _default_max_log_files = 2
+
   @classmethod
   def setup_class(cls):
-    if cls.exploration_strategy() != 'exhaustive':
-      pytest.skip('These logging tests only run in exhaustive')
-    super(TestLogging, cls).setup_class()
+    super(TestLoggingBase, cls).setup_class()
 
   def start_cluster_with_args(self, cluster_size, log_dir, **kwargs):
     cluster_options = []
@@ -404,9 +401,11 @@ class TestLogging(TestBreakpadBase):
       daemon_options = " ".join("-{0}={1}".format(k, v) for k, v in kwargs.items())
       cluster_options.append("--{0}={1}".format(daemon_arg, daemon_options))
     self._start_impala_cluster(cluster_options, cluster_size=cluster_size,
-                               expected_num_impalads=cluster_size, impala_log_dir=log_dir)
+                               expected_num_impalads=cluster_size,
+                               impala_log_dir=log_dir,
+                               ignore_pid_on_log_rotation=True)
 
-  def assert_logs(self, daemon, max_count, max_bytes):
+  def assert_logs(self, daemon, max_count, max_bytes, match_pid=True):
     """Assert that there are at most 'max_count' of INFO + ERROR log files for the
     specified daemon and the individual file size does not exceed 'max_bytes'.
     Also assert that stdout/stderr are redirected to correct file on each rotation."""
@@ -415,11 +414,11 @@ class TestLogging(TestBreakpadBase):
                 + glob.glob("%s/%s*log.INFO.*" % (log_dir, daemon))
     assert len(log_paths) <= max_count
 
-    # group log_paths by pid and kind
+    # group log_paths by kind and pid (if match_pid).
     log_group = {}
     for path in sorted(log_paths):
       tok = path.split('.')
-      key = tok[-1] + '.' + tok[-3]  # pid + kind
+      key = tok[-1] + '.' + tok[-3] if match_pid else tok[-3]
       if key in log_group:
         log_group[key].append(path)
       else:
@@ -453,13 +452,15 @@ class TestLogging(TestBreakpadBase):
     except OSError:
       pass
 
-  def start_excessive_cerr_cluster(self, test_cluster_size=1, remove_symlink=False):
+  def start_excessive_cerr_cluster(self, test_cluster_size=1, remove_symlink=False,
+                                   match_pid=True, max_log_count_begin=0,
+                                   max_log_count_end=None):
     """Check that impalad log is kept being rotated when most writing activity is coming
     from stderr stream.
     Along with LogFaultInjectionThread in init.cc, this test will fill impalad error logs
     with approximately 128kb error messages per second."""
     test_logbufsecs = 3
-    test_max_log_files = 2
+    test_max_log_files = self._default_max_log_files
     test_max_log_size = 1  # 1 MB
     test_error_msg = ('123456789abcde_' * 64)  # 1 KB error message
     test_debug_actions = 'LOG_MAINTENANCE_STDERR:FAIL@1.0@' + test_error_msg
@@ -467,26 +468,69 @@ class TestLogging(TestBreakpadBase):
     os.chmod(self.tmp_dir, 0o744)
 
     expected_log_max_bytes = int(1.2 * 1024**2)  # 1.2 MB
-    self.assert_logs(daemon, 0, expected_log_max_bytes)
+    self.assert_logs(daemon, max_log_count_begin, expected_log_max_bytes)
     self.start_cluster_with_args(test_cluster_size, self.tmp_dir,
                                  logbufsecs=test_logbufsecs,
                                  max_log_files=test_max_log_files,
                                  max_log_size=test_max_log_size,
-                                 debug_actions=test_debug_actions)
+                                 debug_actions=test_debug_actions,
+                                 log_rotation_match_pid=('1' if match_pid else '0'))
     self.wait_for_num_processes(daemon, test_cluster_size, 30)
     # Count both INFO and ERROR logs
-    expected_log_max_count = test_max_log_files * test_cluster_size * 2
+    if max_log_count_end is None:
+      max_log_count_end = test_max_log_files * test_cluster_size * 2
     # Wait for log maintenance thread to flush and rotate the logs asynchronously.
+    duration = test_logbufsecs * 10
     start = time.time()
-    while (time.time() - start < 40):
+    while (time.time() - start < duration):
       time.sleep(1)
-      self.assert_logs(daemon, expected_log_max_count, expected_log_max_bytes)
+      self.assert_logs(daemon, max_log_count_end, expected_log_max_bytes)
       if (remove_symlink):
         pattern = self.tmp_dir + '/' + daemon + '*'
         symlinks = glob.glob(pattern + '.INFO') + glob.glob(pattern + '.ERROR')
         for symlink in symlinks:
           self.silent_remove(symlink)
 
+
+class TestLogging(TestLoggingBase):
+  """Core tests to check that impala log is rolled periodically, obeying
+  max_log_size, max_log_files, and log_rotation_match_pid even in the presence of heavy
+  stderr writing.
+  """
+
+  @classmethod
+  def setup_class(cls):
+    super(TestLogging, cls).setup_class()
+
+  @pytest.mark.execute_serially
+  def test_excessive_cerr_ignore_pid(self):
+    """Test excessive cerr activity twice with restart in between and no PID matching."""
+    self.start_excessive_cerr_cluster(test_cluster_size=1, remove_symlink=False,
+                                      match_pid=False)
+    self.kill_cluster(SIGTERM)
+    # There should be no impalad/catalogd/statestored running.
+    assert self.get_num_processes('impalad') == 0
+    assert self.get_num_processes('catalogd') == 0
+    assert self.get_num_processes('statestored') == 0
+    max_count = self._default_max_log_files * 2
+    self.start_excessive_cerr_cluster(test_cluster_size=1, remove_symlink=False,
+                                      match_pid=False,
+                                      max_log_count_begin=max_count,
+                                      max_log_count_end=max_count)
+
+
+class TestLoggingExhaustive(TestLoggingBase):
+  """Exhaustive tests to check that impala log is rolled periodically, obeying
+  max_log_size, max_log_files, and log_rotation_match_pid even in the presence of heavy
+  stderr writing.
+  """
+
+  @classmethod
+  def setup_class(cls):
+    if cls.exploration_strategy() != 'exhaustive':
+      pytest.skip('TestLoggingExhaustive only run in exhaustive')
+    super(TestLoggingExhaustive, cls).setup_class()
+
   @pytest.mark.execute_serially
   def test_excessive_cerr(self):
     """Test excessive cerr activity with single node cluster."""
@@ -495,4 +539,20 @@ class TestLogging(TestBreakpadBase):
   @pytest.mark.execute_serially
   def test_excessive_cerr_no_symlink(self):
     """Test excessive cerr activity with two node cluster and missing log symlinks."""
-    self.start_excessive_cerr_cluster(2, True)
+    self.start_excessive_cerr_cluster(test_cluster_size=2, remove_symlink=False)
+
+  @pytest.mark.execute_serially
+  def test_excessive_cerr_match_pid(self):
+    """Test excessive cerr activity twice with restart in between and PID matching."""
+    self.start_excessive_cerr_cluster(1, remove_symlink=False, match_pid=True)
+    self.kill_cluster(SIGTERM)
+    # There should be no impalad/catalogd/statestored running.
+    assert self.get_num_processes('impalad') == 0
+    assert self.get_num_processes('catalogd') == 0
+    assert self.get_num_processes('statestored') == 0
+    max_count_begin = self._default_max_log_files * 2
+    max_count_end = max_count_begin * 2
+    self.start_excessive_cerr_cluster(test_cluster_size=1, remove_symlink=False,
+                                      match_pid=True,
+                                      max_log_count_begin=max_count_begin,
+                                      max_log_count_end=max_count_end)