You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/08/25 22:18:12 UTC
impala git commit: IMPALA-7483: abort stuck impalad/catalogd on JVM
deadlock
Repository: impala
Updated Branches:
refs/heads/master a8f8c8d6f -> c285ab979
IMPALA-7483: abort stuck impalad/catalogd on JVM deadlock
The polling interval can be adjusted with
--jvm_deadlock_detector_interval_s. Setting the interval to <= 0
disables the hang check. I don't think this should ever be needed but I
wanted to keep it around in case we ever see false positive deadlock and
need a way to disable the hang detection.
After studying the OpenJDK code I concluded that the deadlock detection
cost O(# java threads) and had to execute at a VM safepoint so will
block progress of Java threads but not native threads. This seems cheap
enough to execute at one minute intervals without impacting perfomance
measurably.
Testing:
Modified code to replace findDeadlockedThreads() with
getAllThreadIds() to induce the failure in impalad and catalogd.
Confirmed that it generated a core and dumped stacks to impalad.ERROR.
Added manual testing in JvmPauseMonitor.main() that induces a deadlock:
CLASSPATH=$CLASSPATH:fe/target/impala-frontend-0.1-SNAPSHOT.jar \
java org.apache.impala.util.JvmPauseMonitor deadlock
Ran exhaustive tests, mostly to completion except for hitting
IMPALA-7488.
I'm open to ideas about how to write an automated test for this and/or
whether it's worth it - it might be possible to add an impalad debug
flag that deliberately induces a deadlock or write a backend death
test, but I don't know if it's worth the cost of adding and maintaining
complex test infra.
Change-Id: If46f879e7ab9eeead0c6e9a09844e1356012898d
Reviewed-on: http://gerrit.cloudera.org:8080/11317
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/c285ab97
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/c285ab97
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/c285ab97
Branch: refs/heads/master
Commit: c285ab9791214097eb377b39c3ddc530f7459894
Parents: a8f8c8d
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Thu Aug 23 18:04:16 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Sat Aug 25 21:57:07 2018 +0000
----------------------------------------------------------------------
be/src/util/jni-util.cc | 14 ++-
.../java/org/apache/impala/common/JniUtil.java | 4 +-
.../org/apache/impala/util/JvmPauseMonitor.java | 110 +++++++++++++++++--
3 files changed, 114 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/c285ab97/be/src/util/jni-util.cc
----------------------------------------------------------------------
diff --git a/be/src/util/jni-util.cc b/be/src/util/jni-util.cc
index f47e25d..19e4e10 100644
--- a/be/src/util/jni-util.cc
+++ b/be/src/util/jni-util.cc
@@ -25,6 +25,10 @@
#include "common/names.h"
+DEFINE_int64(jvm_deadlock_detector_interval_s, 60,
+ "(Advanced) Interval between JVM deadlock checks. If set to 0 or a negative value, "
+ "deadlock checks are disabled.");
+
namespace impala {
Status JniUtfCharGuard::create(JNIEnv* env, jstring jstr, JniUtfCharGuard* out) {
@@ -213,9 +217,15 @@ Status JniUtil::InitJvmPauseMonitor() {
if (!env) return Status("Failed to get/create JVM.");
if (!jni_util_cl_) return Status("JniUtil::Init() not called.");
jmethodID init_jvm_pm_method;
- JniMethodDescriptor init_jvm_pm_desc = {"initPauseMonitor", "()V", &init_jvm_pm_method};
+ JniMethodDescriptor init_jvm_pm_desc = {
+ "initPauseMonitor", "(J)V", &init_jvm_pm_method};
RETURN_IF_ERROR(JniUtil::LoadStaticJniMethod(env, jni_util_cl_, &init_jvm_pm_desc));
- RETURN_IF_ERROR(JniUtil::CallJniMethod(jni_util_cl_, init_jvm_pm_method));
+ JNIEnv* jni_env = getJNIEnv();
+ JniLocalFrame jni_frame;
+ RETURN_IF_ERROR(jni_frame.push(jni_env));
+ jni_env->CallObjectMethod(
+ jni_util_cl_, init_jvm_pm_method, FLAGS_jvm_deadlock_detector_interval_s);
+ RETURN_ERROR_IF_EXC(jni_env);
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/impala/blob/c285ab97/fe/src/main/java/org/apache/impala/common/JniUtil.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/common/JniUtil.java b/fe/src/main/java/org/apache/impala/common/JniUtil.java
index 348b76d..f9368fa 100644
--- a/fe/src/main/java/org/apache/impala/common/JniUtil.java
+++ b/fe/src/main/java/org/apache/impala/common/JniUtil.java
@@ -64,8 +64,8 @@ public class JniUtil {
/**
* Initializes the JvmPauseMonitor instance.
*/
- public static void initPauseMonitor() {
- JvmPauseMonitor.INSTANCE.initPauseMonitor();
+ public static void initPauseMonitor(long deadlockCheckIntervalS) {
+ JvmPauseMonitor.INSTANCE.initPauseMonitor(deadlockCheckIntervalS);
}
/**
http://git-wip-us.apache.org/repos/asf/impala/blob/c285ab97/fe/src/main/java/org/apache/impala/util/JvmPauseMonitor.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/util/JvmPauseMonitor.java b/fe/src/main/java/org/apache/impala/util/JvmPauseMonitor.java
index e4f28bd..8dbca6c 100644
--- a/fe/src/main/java/org/apache/impala/util/JvmPauseMonitor.java
+++ b/fe/src/main/java/org/apache/impala/util/JvmPauseMonitor.java
@@ -19,6 +19,8 @@ package org.apache.impala.util;
import java.lang.management.GarbageCollectorMXBean;
import java.lang.management.ManagementFactory;
+import java.lang.management.ThreadInfo;
+import java.lang.management.ThreadMXBean;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -45,6 +47,10 @@ public class JvmPauseMonitor {
// The target sleep time.
private static final long SLEEP_INTERVAL_MS = 500;
+ // Check for Java deadlocks at this interval. Set by init(). 0 or negative means that
+ // the deadlock checks are disabled.
+ private long deadlockCheckIntervalS_ = 0;
+
// log WARN if we detect a pause longer than this threshold.
private long warnThresholdMs_;
private static final long WARN_THRESHOLD_MS = 10000;
@@ -61,9 +67,9 @@ public class JvmPauseMonitor {
public static JvmPauseMonitor INSTANCE = new JvmPauseMonitor();
// Initializes the pause monitor. No-op if called multiple times.
- public static void initPauseMonitor() {
+ public static void initPauseMonitor(long deadlockCheckIntervalS) {
if (INSTANCE.isStarted()) return;
- INSTANCE.init();
+ INSTANCE.init(deadlockCheckIntervalS);
}
private JvmPauseMonitor() {
@@ -75,7 +81,8 @@ public class JvmPauseMonitor {
this.warnThresholdMs_ = warnThresholdMs;
}
- protected void init() {
+ protected void init(long deadlockCheckIntervalS) {
+ deadlockCheckIntervalS_ = deadlockCheckIntervalS;
monitorThread_ = new Thread(new Monitor(), "JVM pause monitor");
monitorThread_.setDaemon(true);
monitorThread_.start();
@@ -158,6 +165,7 @@ public class JvmPauseMonitor {
@Override
public void run() {
Stopwatch sw = new Stopwatch();
+ Stopwatch timeSinceDeadlockCheck = new Stopwatch().start();
Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
LOG.info("Starting JVM pause monitor");
while (shouldRun) {
@@ -165,6 +173,7 @@ public class JvmPauseMonitor {
try {
Thread.sleep(SLEEP_INTERVAL_MS);
} catch (InterruptedException ie) {
+ LOG.error("JVM pause monitor interrupted", ie);
return;
}
sw.stop();
@@ -179,27 +188,108 @@ public class JvmPauseMonitor {
extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
}
gcTimesBeforeSleep = gcTimesAfterSleep;
+
+ if (deadlockCheckIntervalS_ > 0 &&
+ timeSinceDeadlockCheck.elapsed(TimeUnit.SECONDS) >= deadlockCheckIntervalS_) {
+ checkForDeadlocks();
+ timeSinceDeadlockCheck.reset().start();
+ }
+ }
+ }
+
+ /**
+ * Check for deadlocks between Java threads using the JVM's deadlock detector.
+ * If a deadlock is found, log info about the deadlocked threads and exit the
+ * process.
+ *
+ * We choose to exit the process this situation because the deadlock will likely
+ * cause hangs and other forms of service unavailability and there is no way to
+ * recover from the deadlock except by restarting the process.
+ */
+ private void checkForDeadlocks() {
+ ThreadMXBean threadMx = ManagementFactory.getThreadMXBean();
+ long deadlockedTids[] = threadMx.findDeadlockedThreads();
+ if (deadlockedTids != null) {
+ ThreadInfo deadlockedThreads[] =
+ threadMx.getThreadInfo(deadlockedTids, true, true);
+ // Log diagnostics with error before aborting the process with a FATAL log.
+ LOG.error("Found " + deadlockedThreads.length + " threads in deadlock: ");
+ for (ThreadInfo thread : deadlockedThreads) {
+ // Defensively check for null in case the thread somehow disappeared between
+ // findDeadlockedThreads() and getThreadInfo().
+ if (thread != null) LOG.error(thread.toString());
+ }
+ LOG.error("All threads:" );
+ for (ThreadInfo thread : threadMx.dumpAllThreads(true, true)) {
+ LOG.error(thread.toString());
+ }
+ // In the context of an Impala service, LOG.fatal calls glog's fatal, which
+ // aborts the process, which will produce a coredump if coredumps are enabled.
+ LOG.fatal("Aborting because of deadlocked threads in JVM.");
+ System.exit(1);
}
}
}
/**
- * Simple 'main' to facilitate manual testing of the pause monitor.
- *
- * This main function just leaks memory into a list. Running this class
+ * Helper for manual testing that causes a deadlock between java threads.
+ */
+ private static void causeDeadlock() {
+ final Object obj1 = new Object();
+ final Object obj2 = new Object();
+
+ new Thread(new Runnable() {
+
+ @Override
+ public void run() {
+ while (true) {
+ synchronized(obj2) {
+ synchronized(obj1) {
+ System.err.println("Thread 1 got locks");
+ }
+ }
+ }
+ }
+ }).start();
+
+ while (true) {
+ synchronized(obj1) {
+ synchronized(obj2) {
+ System.err.println("Thread 2 got locks");
+ }
+ }
+ }
+ }
+
+ /**
+ * This function just leaks memory into a list. Running this function
* with a 1GB heap will very quickly go into "GC hell" and result in
* log messages about the GC pauses.
*/
- @SuppressWarnings("resource")
- public static void main(String []args) throws Exception {
- JvmPauseMonitor monitor = new JvmPauseMonitor();
- monitor.init();
+ private static void allocateMemory() {
List<String> list = Lists.newArrayList();
int i = 0;
while (true) {
list.add(String.valueOf(i++));
}
}
+
+ /**
+ * Simple 'main' to facilitate manual testing of the pause monitor.
+ */
+ @SuppressWarnings("resource")
+ public static void main(String []args) throws Exception {
+ JvmPauseMonitor monitor = new JvmPauseMonitor();
+ monitor.init(60);
+ if (args[0].equals("gc")) {
+ allocateMemory();
+ } else if (args[0].equals("deadlock")) {
+ causeDeadlock();
+ } else {
+ System.err.println("Unknown mode");
+ }
+ }
+
}