You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/08/25 22:18:12 UTC

impala git commit: IMPALA-7483: abort stuck impalad/catalogd on JVM deadlock

Repository: impala
Updated Branches:
  refs/heads/master a8f8c8d6f -> c285ab979


IMPALA-7483: abort stuck impalad/catalogd on JVM deadlock

The polling interval can be adjusted with
--jvm_deadlock_detector_interval_s. Setting the interval to <= 0
disables the hang check. I don't think this should ever be needed but I
wanted to keep it around in case we ever see false positive deadlock and
need a way to disable the hang detection.

After studying the OpenJDK code I concluded that the deadlock detection
cost O(# java threads) and had to execute at a VM safepoint so will
block progress of Java threads but not native threads. This seems cheap
enough to execute at one minute intervals without impacting perfomance
measurably.

Testing:
Modified code to replace findDeadlockedThreads() with
getAllThreadIds() to induce the failure in impalad and catalogd.
Confirmed that it generated a core and dumped stacks to impalad.ERROR.

Added manual testing in JvmPauseMonitor.main() that induces a deadlock:

  CLASSPATH=$CLASSPATH:fe/target/impala-frontend-0.1-SNAPSHOT.jar \
      java org.apache.impala.util.JvmPauseMonitor deadlock

Ran exhaustive tests, mostly to completion except for hitting
IMPALA-7488.

I'm open to ideas about how to write an automated test for this and/or
whether it's worth it - it might be possible to add an impalad debug
flag that deliberately induces a deadlock or write a backend death
test, but I don't know if it's worth the cost of adding and maintaining
complex test infra.

Change-Id: If46f879e7ab9eeead0c6e9a09844e1356012898d
Reviewed-on: http://gerrit.cloudera.org:8080/11317
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/c285ab97
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/c285ab97
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/c285ab97

Branch: refs/heads/master
Commit: c285ab9791214097eb377b39c3ddc530f7459894
Parents: a8f8c8d
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Thu Aug 23 18:04:16 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Sat Aug 25 21:57:07 2018 +0000

----------------------------------------------------------------------
 be/src/util/jni-util.cc                         |  14 ++-
 .../java/org/apache/impala/common/JniUtil.java  |   4 +-
 .../org/apache/impala/util/JvmPauseMonitor.java | 110 +++++++++++++++++--
 3 files changed, 114 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/c285ab97/be/src/util/jni-util.cc
----------------------------------------------------------------------
diff --git a/be/src/util/jni-util.cc b/be/src/util/jni-util.cc
index f47e25d..19e4e10 100644
--- a/be/src/util/jni-util.cc
+++ b/be/src/util/jni-util.cc
@@ -25,6 +25,10 @@
 
 #include "common/names.h"
 
+DEFINE_int64(jvm_deadlock_detector_interval_s, 60,
+    "(Advanced) Interval between JVM deadlock checks. If set to 0 or a negative value, "
+    "deadlock checks are disabled.");
+
 namespace impala {
 
 Status JniUtfCharGuard::create(JNIEnv* env, jstring jstr, JniUtfCharGuard* out) {
@@ -213,9 +217,15 @@ Status JniUtil::InitJvmPauseMonitor() {
   if (!env) return Status("Failed to get/create JVM.");
   if (!jni_util_cl_) return Status("JniUtil::Init() not called.");
   jmethodID init_jvm_pm_method;
-  JniMethodDescriptor init_jvm_pm_desc = {"initPauseMonitor", "()V", &init_jvm_pm_method};
+  JniMethodDescriptor init_jvm_pm_desc = {
+      "initPauseMonitor", "(J)V", &init_jvm_pm_method};
   RETURN_IF_ERROR(JniUtil::LoadStaticJniMethod(env, jni_util_cl_, &init_jvm_pm_desc));
-  RETURN_IF_ERROR(JniUtil::CallJniMethod(jni_util_cl_, init_jvm_pm_method));
+  JNIEnv* jni_env = getJNIEnv();
+  JniLocalFrame jni_frame;
+  RETURN_IF_ERROR(jni_frame.push(jni_env));
+  jni_env->CallObjectMethod(
+      jni_util_cl_, init_jvm_pm_method, FLAGS_jvm_deadlock_detector_interval_s);
+  RETURN_ERROR_IF_EXC(jni_env);
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/c285ab97/fe/src/main/java/org/apache/impala/common/JniUtil.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/common/JniUtil.java b/fe/src/main/java/org/apache/impala/common/JniUtil.java
index 348b76d..f9368fa 100644
--- a/fe/src/main/java/org/apache/impala/common/JniUtil.java
+++ b/fe/src/main/java/org/apache/impala/common/JniUtil.java
@@ -64,8 +64,8 @@ public class JniUtil {
   /**
    * Initializes the JvmPauseMonitor instance.
    */
-  public static void initPauseMonitor() {
-    JvmPauseMonitor.INSTANCE.initPauseMonitor();
+  public static void initPauseMonitor(long deadlockCheckIntervalS) {
+    JvmPauseMonitor.INSTANCE.initPauseMonitor(deadlockCheckIntervalS);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/impala/blob/c285ab97/fe/src/main/java/org/apache/impala/util/JvmPauseMonitor.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/util/JvmPauseMonitor.java b/fe/src/main/java/org/apache/impala/util/JvmPauseMonitor.java
index e4f28bd..8dbca6c 100644
--- a/fe/src/main/java/org/apache/impala/util/JvmPauseMonitor.java
+++ b/fe/src/main/java/org/apache/impala/util/JvmPauseMonitor.java
@@ -19,6 +19,8 @@ package org.apache.impala.util;
 
 import java.lang.management.GarbageCollectorMXBean;
 import java.lang.management.ManagementFactory;
+import java.lang.management.ThreadInfo;
+import java.lang.management.ThreadMXBean;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -45,6 +47,10 @@ public class JvmPauseMonitor {
   // The target sleep time.
   private static final long SLEEP_INTERVAL_MS = 500;
 
+  // Check for Java deadlocks at this interval. Set by init(). 0 or negative means that
+  // the deadlock checks are disabled.
+  private long deadlockCheckIntervalS_ = 0;
+
   // log WARN if we detect a pause longer than this threshold.
   private long warnThresholdMs_;
   private static final long WARN_THRESHOLD_MS = 10000;
@@ -61,9 +67,9 @@ public class JvmPauseMonitor {
   public static JvmPauseMonitor INSTANCE = new JvmPauseMonitor();
 
   // Initializes the pause monitor. No-op if called multiple times.
-  public static void initPauseMonitor() {
+  public static void initPauseMonitor(long deadlockCheckIntervalS) {
     if (INSTANCE.isStarted()) return;
-    INSTANCE.init();
+    INSTANCE.init(deadlockCheckIntervalS);
   }
 
   private JvmPauseMonitor() {
@@ -75,7 +81,8 @@ public class JvmPauseMonitor {
     this.warnThresholdMs_ = warnThresholdMs;
   }
 
-  protected void init() {
+  protected void init(long deadlockCheckIntervalS) {
+    deadlockCheckIntervalS_ = deadlockCheckIntervalS;
     monitorThread_ = new Thread(new Monitor(), "JVM pause monitor");
     monitorThread_.setDaemon(true);
     monitorThread_.start();
@@ -158,6 +165,7 @@ public class JvmPauseMonitor {
     @Override
     public void run() {
       Stopwatch sw = new Stopwatch();
+      Stopwatch timeSinceDeadlockCheck = new Stopwatch().start();
       Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
       LOG.info("Starting JVM pause monitor");
       while (shouldRun) {
@@ -165,6 +173,7 @@ public class JvmPauseMonitor {
         try {
           Thread.sleep(SLEEP_INTERVAL_MS);
         } catch (InterruptedException ie) {
+          LOG.error("JVM pause monitor interrupted", ie);
           return;
         }
         sw.stop();
@@ -179,27 +188,108 @@ public class JvmPauseMonitor {
               extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
         }
         gcTimesBeforeSleep = gcTimesAfterSleep;
+
+        if (deadlockCheckIntervalS_ > 0 &&
+            timeSinceDeadlockCheck.elapsed(TimeUnit.SECONDS) >= deadlockCheckIntervalS_) {
+          checkForDeadlocks();
+          timeSinceDeadlockCheck.reset().start();
+        }
+      }
+    }
+
+    /**
+     * Check for deadlocks between Java threads using the JVM's deadlock detector.
+     * If a deadlock is found, log info about the deadlocked threads and exit the
+     * process.
+     *
+     * We choose to exit the process this situation because the deadlock will likely
+     * cause hangs and other forms of service unavailability and there is no way to
+     * recover from the deadlock except by restarting the process.
+     */
+    private void checkForDeadlocks() {
+      ThreadMXBean threadMx = ManagementFactory.getThreadMXBean();
+      long deadlockedTids[] = threadMx.findDeadlockedThreads();
+      if (deadlockedTids != null) {
+        ThreadInfo deadlockedThreads[] =
+            threadMx.getThreadInfo(deadlockedTids, true, true);
+        // Log diagnostics with error before aborting the process with a FATAL log.
+        LOG.error("Found " + deadlockedThreads.length + " threads in deadlock: ");
+        for (ThreadInfo thread : deadlockedThreads) {
+          // Defensively check for null in case the thread somehow disappeared between
+          // findDeadlockedThreads() and getThreadInfo().
+          if (thread != null) LOG.error(thread.toString());
+        }
+        LOG.error("All threads:" );
+        for (ThreadInfo thread : threadMx.dumpAllThreads(true, true)) {
+          LOG.error(thread.toString());
+        }
+        // In the context of an Impala service, LOG.fatal calls glog's fatal, which
+        // aborts the process, which will produce a coredump if coredumps are enabled.
+        LOG.fatal("Aborting because of deadlocked threads in JVM.");
+        System.exit(1);
       }
     }
   }
 
   /**
-   * Simple 'main' to facilitate manual testing of the pause monitor.
-   *
-   * This main function just leaks memory into a list. Running this class
+   * Helper for manual testing that causes a deadlock between java threads.
+   */
+  private static void causeDeadlock() {
+    final Object obj1 = new Object();
+    final Object obj2 = new Object();
+
+    new Thread(new Runnable() {
+
+      @Override
+      public void run() {
+        while (true) {
+          synchronized(obj2) {
+            synchronized(obj1) {
+              System.err.println("Thread 1 got locks");
+            }
+          }
+        }
+      }
+    }).start();
+
+    while (true) {
+      synchronized(obj1) {
+        synchronized(obj2) {
+          System.err.println("Thread 2 got locks");
+        }
+      }
+    }
+  }
+
+  /**
+   * This function just leaks memory into a list. Running this function
    * with a 1GB heap will very quickly go into "GC hell" and result in
    * log messages about the GC pauses.
    */
-  @SuppressWarnings("resource")
-  public static void main(String []args) throws Exception {
-    JvmPauseMonitor monitor = new JvmPauseMonitor();
-    monitor.init();
+  private static void allocateMemory() {
     List<String> list = Lists.newArrayList();
     int i = 0;
     while (true) {
       list.add(String.valueOf(i++));
     }
   }
+
+  /**
+   * Simple 'main' to facilitate manual testing of the pause monitor.
+   */
+  @SuppressWarnings("resource")
+  public static void main(String []args) throws Exception {
+    JvmPauseMonitor monitor = new JvmPauseMonitor();
+    monitor.init(60);
+    if (args[0].equals("gc")) {
+      allocateMemory();
+    } else if (args[0].equals("deadlock")) {
+      causeDeadlock();
+    } else {
+      System.err.println("Unknown mode");
+    }
+  }
+
 }