You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@oozie.apache.org by rk...@apache.org on 2014/05/14 20:05:42 UTC
git commit: OOZIE-1826 Add thread which detects JVM pauses (rkanter)
Repository: oozie
Updated Branches:
refs/heads/master f82c12408 -> b45a314b2
OOZIE-1826 Add thread which detects JVM pauses (rkanter)
Project: http://git-wip-us.apache.org/repos/asf/oozie/repo
Commit: http://git-wip-us.apache.org/repos/asf/oozie/commit/b45a314b
Tree: http://git-wip-us.apache.org/repos/asf/oozie/tree/b45a314b
Diff: http://git-wip-us.apache.org/repos/asf/oozie/diff/b45a314b
Branch: refs/heads/master
Commit: b45a314b21a0f15e103a7ca8ce4e7f87985722bb
Parents: f82c124
Author: Robert Kanter <rk...@cloudera.com>
Authored: Wed May 14 11:04:06 2014 -0700
Committer: Robert Kanter <rk...@cloudera.com>
Committed: Wed May 14 11:05:15 2014 -0700
----------------------------------------------------------------------
.../oozie/service/JvmPauseMonitorService.java | 185 +++++++++++++++++++
core/src/main/resources/oozie-default.xml | 27 ++-
release-log.txt | 1 +
3 files changed, 212 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/oozie/blob/b45a314b/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java b/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java
new file mode 100644
index 0000000..0f1a5f6
--- /dev/null
+++ b/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java
@@ -0,0 +1,185 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.oozie.service;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.common.base.Stopwatch;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.Daemon;
+import org.apache.oozie.util.Instrumentation;
+import org.apache.oozie.util.XLog;
+
+/**
+ * This class sets up a simple thread that sleeps for a short period of time. If the sleep takes significantly longer than its
+ * target time, it implies that the JVM or host machine has paused processing, which may cause other problems. If such a pause is
+ * detected, the thread logs a message and updates the Instrumentation.
+ *
+ * Adapted from org.apache.hadoop.util.JvmPauseMonitor
+ */
+public class JvmPauseMonitorService implements Service {
+
+ private static XLog LOG = XLog.getLog(JvmPauseMonitorService.class);
+
+ public static final String CONF_PREFIX = Service.CONF_PREFIX + "JvmPauseMonitorService.";
+
+ /**
+ * The target sleep time
+ */
+ private static final long SLEEP_INTERVAL_MS = 500;
+
+ /**
+ * log WARN if we detect a pause longer than this threshold
+ */
+ private long warnThresholdMs;
+ private static final String WARN_THRESHOLD_KEY = CONF_PREFIX + "warn-threshold.ms";
+ private static final long WARN_THRESHOLD_DEFAULT = 10000;
+
+ /**
+ * log INFO if we detect a pause longer than this threshold
+ */
+ private long infoThresholdMs;
+ private static final String INFO_THRESHOLD_KEY = CONF_PREFIX + "info-threshold.ms";
+ private static final long INFO_THRESHOLD_DEFAULT = 1000;
+
+ private Thread monitorThread;
+ private volatile boolean shouldRun = true;
+ private Instrumentation instrumentation;
+
+ @Override
+ public void init(Services services) throws ServiceException {
+ Configuration conf = services.getConf();
+ warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT);
+ infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT);
+
+ instrumentation = services.get(InstrumentationService.class).get();
+
+ Preconditions.checkState(monitorThread == null,
+ "Already started");
+ monitorThread = new Daemon(new Monitor());
+ monitorThread.start();
+ }
+
+ @Override
+ public void destroy() {
+ shouldRun = false;
+ monitorThread.interrupt();
+ try {
+ monitorThread.join();
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ }
+ }
+
+ @Override
+ public Class<? extends Service> getInterface() {
+ return JvmPauseMonitorService.class;
+ }
+
+ private String formatMessage(long extraSleepTime, Map<String, GcTimes> gcTimesAfterSleep,
+ Map<String, GcTimes> gcTimesBeforeSleep) {
+ Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(), gcTimesBeforeSleep.keySet());
+ List<String> gcDiffs = Lists.newArrayList();
+ for (String name : gcBeanNames) {
+ GcTimes diff = gcTimesAfterSleep.get(name).subtract(gcTimesBeforeSleep.get(name));
+ if (diff.gcCount != 0) {
+ gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString());
+ }
+ }
+
+ String ret = "Detected pause in JVM or host machine (eg GC): pause of approximately " + extraSleepTime + "ms\n";
+ if (gcDiffs.isEmpty()) {
+ ret += "No GCs detected";
+ } else {
+ ret += Joiner.on("\n").join(gcDiffs);
+ }
+ return ret;
+ }
+
+ private Map<String, GcTimes> getGcTimes() {
+ Map<String, GcTimes> map = Maps.newHashMap();
+ List<GarbageCollectorMXBean> gcBeans = ManagementFactory.getGarbageCollectorMXBeans();
+ for (GarbageCollectorMXBean gcBean : gcBeans) {
+ map.put(gcBean.getName(), new GcTimes(gcBean));
+ }
+ return map;
+ }
+
+ private static class GcTimes {
+
+ private GcTimes(GarbageCollectorMXBean gcBean) {
+ gcCount = gcBean.getCollectionCount();
+ gcTimeMillis = gcBean.getCollectionTime();
+ }
+
+ private GcTimes(long count, long time) {
+ this.gcCount = count;
+ this.gcTimeMillis = time;
+ }
+
+ private GcTimes subtract(GcTimes other) {
+ return new GcTimes(this.gcCount - other.gcCount, this.gcTimeMillis - other.gcTimeMillis);
+ }
+
+ @Override
+ public String toString() {
+ return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+ }
+
+ private long gcCount;
+ private long gcTimeMillis;
+ }
+
+ private class Monitor implements Runnable {
+
+ @Override
+ public void run() {
+ Stopwatch sw = new Stopwatch();
+ Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+ while (shouldRun) {
+ sw.reset().start();
+ try {
+ Thread.sleep(SLEEP_INTERVAL_MS);
+ } catch (InterruptedException ie) {
+ return;
+ }
+ long extraSleepTime = sw.elapsedMillis() - SLEEP_INTERVAL_MS;
+ Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
+
+ if (extraSleepTime > warnThresholdMs) {
+ LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
+ instrumentation.incr("jvm", "pause.warn-threshold", 1);
+ } else if (extraSleepTime > infoThresholdMs) {
+ LOG.info(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
+ instrumentation.incr("jvm", "pause.info-threshold", 1);
+ }
+ instrumentation.incr("jvm", "pause.extraSleepTime", extraSleepTime);
+
+ gcTimesBeforeSleep = gcTimesAfterSleep;
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/oozie/blob/b45a314b/core/src/main/resources/oozie-default.xml
----------------------------------------------------------------------
diff --git a/core/src/main/resources/oozie-default.xml b/core/src/main/resources/oozie-default.xml
index c35d090..9788daf 100644
--- a/core/src/main/resources/oozie-default.xml
+++ b/core/src/main/resources/oozie-default.xml
@@ -129,7 +129,8 @@
org.apache.oozie.service.PauseTransitService,
org.apache.oozie.service.GroupsService,
org.apache.oozie.service.ProxyUserService,
- org.apache.oozie.service.XLogStreamingService
+ org.apache.oozie.service.XLogStreamingService,
+ org.apache.oozie.service.JvmPauseMonitorService
</value>
<description>
All services to be created and managed by Oozie Services singleton.
@@ -2081,4 +2082,28 @@
</description>
</property>
+ <!-- JvmPauseMonitorService Configuration -->
+ <property>
+ <name>oozie.service.JvmPauseMonitorService.warn-threshold.ms</name>
+ <value>10000</value>
+ <description>
+ The JvmPauseMonitorService runs a thread that repeatedly tries to detect when the JVM pauses, which could indicate
+ that the JVM or host machine is overloaded or other problems. This thread sleeps for 500ms; if it sleeps for
+ significantly longer, then there is likely a problem. This property specifies the threadshold for when Oozie should log
+ a WARN level message; there is also a counter named "jvm.pause.warn-threshold".
+ </description>
+ </property>
+
+ <property>
+ <name>oozie.service.JvmPauseMonitorService.info-threshold.ms</name>
+ <value>1000</value>
+ <description>
+ The JvmPauseMonitorService runs a thread that repeatedly tries to detect when the JVM pauses, which could indicate
+ that the JVM or host machine is overloaded or other problems. This thread sleeps for 500ms; if it sleeps for
+ significantly longer, then there is likely a problem. This property specifies the threadshold for when Oozie should log
+ an INFO level message; there is also a counter named "jvm.pause.info-threshold".
+ </description>
+ </property>
+
+
</configuration>
http://git-wip-us.apache.org/repos/asf/oozie/blob/b45a314b/release-log.txt
----------------------------------------------------------------------
diff --git a/release-log.txt b/release-log.txt
index da86677..9223025 100644
--- a/release-log.txt
+++ b/release-log.txt
@@ -1,5 +1,6 @@
-- Oozie 4.1.0 release (trunk - unreleased)
+OOZIE-1826 Add thread which detects JVM pauses (rkanter)
OOZIE-1791 add IGNORED status to Coordinator Job and Action (ryota)
OOZIE-1825 Optimize wf_jobs protoconf storage (puru via rohini)
OOZIE-1831 Oozie upgrade fails if workflow jobs are in running or suspended state (satish.mittal via rohini)