You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@flink.apache.org by az...@apache.org on 2020/03/24 07:47:40 UTC
[flink] 01/04: [FLINK-16225] Improve metaspace out-of-memory error
handling thrown in user code
This is an automated email from the ASF dual-hosted git repository.
azagrebin pushed a commit to branch release-1.10
in repository https://gitbox.apache.org/repos/asf/flink.git
commit 2c934ff2a42fe256c2a1174788cbe55c05e8e323
Author: Andrey Zagrebin <az...@apache.org>
AuthorDate: Sun Mar 15 14:26:05 2020 +0300
[FLINK-16225] Improve metaspace out-of-memory error handling thrown in user code
Improve error message, explaining the possible reasons and ways to resolve.
In case of metaspace OOM error, try a graceful TM shutdown.
This closes #11408.
---
.../java/org/apache/flink/util/ExceptionUtils.java | 52 ++++++++++++++++++++++
.../runtime/taskexecutor/TaskManagerRunner.java | 8 +++-
.../org/apache/flink/runtime/taskmanager/Task.java | 2 +
3 files changed, 60 insertions(+), 2 deletions(-)
diff --git a/flink-core/src/main/java/org/apache/flink/util/ExceptionUtils.java b/flink-core/src/main/java/org/apache/flink/util/ExceptionUtils.java
index ddd0276..5fc1bfe 100644
--- a/flink-core/src/main/java/org/apache/flink/util/ExceptionUtils.java
+++ b/flink-core/src/main/java/org/apache/flink/util/ExceptionUtils.java
@@ -25,6 +25,7 @@
package org.apache.flink.util;
import org.apache.flink.annotation.Internal;
+import org.apache.flink.configuration.TaskManagerOptions;
import org.apache.flink.util.function.RunnableWithException;
import javax.annotation.Nullable;
@@ -48,6 +49,14 @@ public final class ExceptionUtils {
/** The stringified representation of a null exception reference. */
public static final String STRINGIFIED_NULL_EXCEPTION = "(null)";
+ private static final String TM_METASPACE_OOM_ERROR_MESSAGE = String.format(
+ "Metaspace. The metaspace out-of-memory error has occurred. This can mean two things: either the job requires " +
+ "a larger size of JVM metaspace to load classes or there is a class loading leak. In the first case " +
+ "'%s' configuration option should be increased. If the error persists (usually in cluster after " +
+ "several job (re-)submissions) then there is probably a class loading leak which has to be " +
+ "investigated and fixed. The task executor has to be shutdown...",
+ TaskManagerOptions.JVM_METASPACE.key());
+
/**
* Makes a string representation of the exception's stack trace, or "(null)", if the
* exception is null.
@@ -110,6 +119,49 @@ public final class ExceptionUtils {
}
/**
+ * Generates new {@link OutOfMemoryError} with more detailed message.
+ *
+ * <p>This method improves error message for metaspace {@link OutOfMemoryError}.
+ * It adds description of possible causes and ways of resolution.
+ *
+ * @param exception The exception to enrich.
+ * @return either enriched exception if needed or the original one.
+ */
+ public static Throwable enrichTaskManagerOutOfMemoryError(Throwable exception) {
+ if (isMetaspaceOutOfMemoryError(exception)) {
+ return changeOutOfMemoryErrorMessage(exception, TM_METASPACE_OOM_ERROR_MESSAGE);
+ }
+ return exception;
+ }
+
+ private static OutOfMemoryError changeOutOfMemoryErrorMessage(Throwable exception, String newMessage) {
+ Preconditions.checkArgument(exception instanceof OutOfMemoryError);
+ if (exception.getMessage().equals(newMessage)) {
+ return (OutOfMemoryError) exception;
+ }
+ OutOfMemoryError newError = new OutOfMemoryError(newMessage);
+ newError.initCause(exception.getCause());
+ newError.setStackTrace(exception.getStackTrace());
+ return newError;
+ }
+
+ /**
+ * Checks whether the given exception indicates a JVM metaspace out-of-memory error.
+ *
+ * @param t The exception to check.
+ * @return True, if the exception is the metaspace {@link OutOfMemoryError}, false otherwise.
+ */
+ public static boolean isMetaspaceOutOfMemoryError(Throwable t) {
+ return isOutOfMemoryErrorWithMessageStartingWith(t, "Metaspace");
+ }
+
+ private static boolean isOutOfMemoryErrorWithMessageStartingWith(Throwable t, String prefix) {
+ // the exact matching of the class is checked to avoid matching any custom subclasses of OutOfMemoryError
+ // as we are interested in the original exceptions, generated by JVM.
+ return t.getClass() == OutOfMemoryError.class && t.getMessage() != null && t.getMessage().startsWith(prefix);
+ }
+
+ /**
* Rethrows the given {@code Throwable}, if it represents an error that is fatal to the JVM.
* See {@link ExceptionUtils#isJvmFatalError(Throwable)} for a definition of fatal errors.
*
diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/taskexecutor/TaskManagerRunner.java b/flink-runtime/src/main/java/org/apache/flink/runtime/taskexecutor/TaskManagerRunner.java
index ca93e32..8ed4fe4 100644
--- a/flink-runtime/src/main/java/org/apache/flink/runtime/taskexecutor/TaskManagerRunner.java
+++ b/flink-runtime/src/main/java/org/apache/flink/runtime/taskexecutor/TaskManagerRunner.java
@@ -246,9 +246,13 @@ public class TaskManagerRunner implements FatalErrorHandler, AutoCloseableAsync
@Override
public void onFatalError(Throwable exception) {
- LOG.error("Fatal error occurred while executing the TaskManager. Shutting it down...", exception);
+ Throwable enrichedException = ExceptionUtils.enrichTaskManagerOutOfMemoryError(exception);
+ LOG.error("Fatal error occurred while executing the TaskManager. Shutting it down...", enrichedException);
- if (ExceptionUtils.isJvmFatalOrOutOfMemoryError(exception)) {
+ // In case of the Metaspace OutOfMemoryError, we expect that the graceful shutdown is possible,
+ // as it does not usually require more class loading to fail again with the Metaspace OutOfMemoryError.
+ if (ExceptionUtils.isJvmFatalOrOutOfMemoryError(enrichedException) &&
+ !ExceptionUtils.isMetaspaceOutOfMemoryError(enrichedException)) {
terminateJVM();
} else {
closeAsync();
diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/taskmanager/Task.java b/flink-runtime/src/main/java/org/apache/flink/runtime/taskmanager/Task.java
index 3723f1a..689ada8 100644
--- a/flink-runtime/src/main/java/org/apache/flink/runtime/taskmanager/Task.java
+++ b/flink-runtime/src/main/java/org/apache/flink/runtime/taskmanager/Task.java
@@ -741,6 +741,8 @@ public class Task implements Runnable, TaskSlotPayload, TaskActions, PartitionPr
// an exception was thrown as a side effect of cancelling
// ----------------------------------------------------------------
+ t = ExceptionUtils.enrichTaskManagerOutOfMemoryError(t);
+
try {
// check if the exception is unrecoverable
if (ExceptionUtils.isJvmFatalError(t) ||