You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@oozie.apache.org by rk...@apache.org on 2014/05/14 20:05:42 UTC

git commit: OOZIE-1826 Add thread which detects JVM pauses (rkanter)

Repository: oozie
Updated Branches:
  refs/heads/master f82c12408 -> b45a314b2


OOZIE-1826 Add thread which detects JVM pauses (rkanter)


Project: http://git-wip-us.apache.org/repos/asf/oozie/repo
Commit: http://git-wip-us.apache.org/repos/asf/oozie/commit/b45a314b
Tree: http://git-wip-us.apache.org/repos/asf/oozie/tree/b45a314b
Diff: http://git-wip-us.apache.org/repos/asf/oozie/diff/b45a314b

Branch: refs/heads/master
Commit: b45a314b21a0f15e103a7ca8ce4e7f87985722bb
Parents: f82c124
Author: Robert Kanter <rk...@cloudera.com>
Authored: Wed May 14 11:04:06 2014 -0700
Committer: Robert Kanter <rk...@cloudera.com>
Committed: Wed May 14 11:05:15 2014 -0700

----------------------------------------------------------------------
 .../oozie/service/JvmPauseMonitorService.java   | 185 +++++++++++++++++++
 core/src/main/resources/oozie-default.xml       |  27 ++-
 release-log.txt                                 |   1 +
 3 files changed, 212 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/oozie/blob/b45a314b/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java b/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java
new file mode 100644
index 0000000..0f1a5f6
--- /dev/null
+++ b/core/src/main/java/org/apache/oozie/service/JvmPauseMonitorService.java
@@ -0,0 +1,185 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.oozie.service;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.common.base.Stopwatch;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.Daemon;
+import org.apache.oozie.util.Instrumentation;
+import org.apache.oozie.util.XLog;
+
+/**
+ * This class sets up a simple thread that sleeps for a short period of time. If the sleep takes significantly longer than its
+ * target time, it implies that the JVM or host machine has paused processing, which may cause other problems. If such a pause is
+ * detected, the thread logs a message and updates the Instrumentation.
+ *
+ * Adapted from org.apache.hadoop.util.JvmPauseMonitor
+ */
+public class JvmPauseMonitorService implements Service {
+
+    private static XLog LOG = XLog.getLog(JvmPauseMonitorService.class);
+
+    public static final String CONF_PREFIX = Service.CONF_PREFIX + "JvmPauseMonitorService.";
+
+    /**
+     * The target sleep time
+     */
+    private static final long SLEEP_INTERVAL_MS = 500;
+
+    /**
+     * log WARN if we detect a pause longer than this threshold
+     */
+    private long warnThresholdMs;
+    private static final String WARN_THRESHOLD_KEY = CONF_PREFIX + "warn-threshold.ms";
+    private static final long WARN_THRESHOLD_DEFAULT = 10000;
+
+    /**
+     * log INFO if we detect a pause longer than this threshold
+     */
+    private long infoThresholdMs;
+    private static final String INFO_THRESHOLD_KEY = CONF_PREFIX + "info-threshold.ms";
+    private static final long INFO_THRESHOLD_DEFAULT = 1000;
+
+    private Thread monitorThread;
+    private volatile boolean shouldRun = true;
+    private Instrumentation instrumentation;
+
+    @Override
+    public void init(Services services) throws ServiceException {
+        Configuration conf = services.getConf();
+        warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT);
+        infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT);
+
+        instrumentation = services.get(InstrumentationService.class).get();
+
+        Preconditions.checkState(monitorThread == null,
+                "Already started");
+        monitorThread = new Daemon(new Monitor());
+        monitorThread.start();
+    }
+
+    @Override
+    public void destroy() {
+        shouldRun = false;
+        monitorThread.interrupt();
+        try {
+            monitorThread.join();
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+        }
+    }
+
+    @Override
+    public Class<? extends Service> getInterface() {
+        return JvmPauseMonitorService.class;
+    }
+
+    private String formatMessage(long extraSleepTime, Map<String, GcTimes> gcTimesAfterSleep,
+            Map<String, GcTimes> gcTimesBeforeSleep) {
+        Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(), gcTimesBeforeSleep.keySet());
+        List<String> gcDiffs = Lists.newArrayList();
+        for (String name : gcBeanNames) {
+            GcTimes diff = gcTimesAfterSleep.get(name).subtract(gcTimesBeforeSleep.get(name));
+            if (diff.gcCount != 0) {
+                gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString());
+            }
+        }
+
+        String ret = "Detected pause in JVM or host machine (eg GC): pause of approximately " + extraSleepTime + "ms\n";
+        if (gcDiffs.isEmpty()) {
+            ret += "No GCs detected";
+        } else {
+            ret += Joiner.on("\n").join(gcDiffs);
+        }
+        return ret;
+    }
+
+    private Map<String, GcTimes> getGcTimes() {
+        Map<String, GcTimes> map = Maps.newHashMap();
+        List<GarbageCollectorMXBean> gcBeans = ManagementFactory.getGarbageCollectorMXBeans();
+        for (GarbageCollectorMXBean gcBean : gcBeans) {
+            map.put(gcBean.getName(), new GcTimes(gcBean));
+        }
+        return map;
+    }
+
+    private static class GcTimes {
+
+        private GcTimes(GarbageCollectorMXBean gcBean) {
+            gcCount = gcBean.getCollectionCount();
+            gcTimeMillis = gcBean.getCollectionTime();
+        }
+
+        private GcTimes(long count, long time) {
+            this.gcCount = count;
+            this.gcTimeMillis = time;
+        }
+
+        private GcTimes subtract(GcTimes other) {
+            return new GcTimes(this.gcCount - other.gcCount, this.gcTimeMillis - other.gcTimeMillis);
+        }
+
+        @Override
+        public String toString() {
+            return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+        }
+
+        private long gcCount;
+        private long gcTimeMillis;
+    }
+
+    private class Monitor implements Runnable {
+
+        @Override
+        public void run() {
+            Stopwatch sw = new Stopwatch();
+            Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+            while (shouldRun) {
+                sw.reset().start();
+                try {
+                    Thread.sleep(SLEEP_INTERVAL_MS);
+                } catch (InterruptedException ie) {
+                    return;
+                }
+                long extraSleepTime = sw.elapsedMillis() - SLEEP_INTERVAL_MS;
+                Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
+
+                if (extraSleepTime > warnThresholdMs) {
+                    LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
+                    instrumentation.incr("jvm", "pause.warn-threshold", 1);
+                } else if (extraSleepTime > infoThresholdMs) {
+                    LOG.info(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
+                    instrumentation.incr("jvm", "pause.info-threshold", 1);
+                }
+                instrumentation.incr("jvm", "pause.extraSleepTime", extraSleepTime);
+
+                gcTimesBeforeSleep = gcTimesAfterSleep;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/oozie/blob/b45a314b/core/src/main/resources/oozie-default.xml
----------------------------------------------------------------------
diff --git a/core/src/main/resources/oozie-default.xml b/core/src/main/resources/oozie-default.xml
index c35d090..9788daf 100644
--- a/core/src/main/resources/oozie-default.xml
+++ b/core/src/main/resources/oozie-default.xml
@@ -129,7 +129,8 @@
             org.apache.oozie.service.PauseTransitService,
             org.apache.oozie.service.GroupsService,
             org.apache.oozie.service.ProxyUserService,
-            org.apache.oozie.service.XLogStreamingService
+            org.apache.oozie.service.XLogStreamingService,
+            org.apache.oozie.service.JvmPauseMonitorService
         </value>
         <description>
             All services to be created and managed by Oozie Services singleton.
@@ -2081,4 +2082,28 @@
         </description>
     </property>
 
+    <!-- JvmPauseMonitorService Configuration -->
+    <property>
+        <name>oozie.service.JvmPauseMonitorService.warn-threshold.ms</name>
+        <value>10000</value>
+        <description>
+            The JvmPauseMonitorService runs a thread that repeatedly tries to detect when the JVM pauses, which could indicate
+            that the JVM or host machine is overloaded or other problems.  This thread sleeps for 500ms; if it sleeps for
+            significantly longer, then there is likely a problem.  This property specifies the threadshold for when Oozie should log
+            a WARN level message; there is also a counter named "jvm.pause.warn-threshold".
+        </description>
+    </property>
+
+    <property>
+        <name>oozie.service.JvmPauseMonitorService.info-threshold.ms</name>
+        <value>1000</value>
+        <description>
+            The JvmPauseMonitorService runs a thread that repeatedly tries to detect when the JVM pauses, which could indicate
+            that the JVM or host machine is overloaded or other problems.  This thread sleeps for 500ms; if it sleeps for
+            significantly longer, then there is likely a problem.  This property specifies the threadshold for when Oozie should log
+            an INFO level message; there is also a counter named "jvm.pause.info-threshold".
+        </description>
+    </property>
+
+
 </configuration>

http://git-wip-us.apache.org/repos/asf/oozie/blob/b45a314b/release-log.txt
----------------------------------------------------------------------
diff --git a/release-log.txt b/release-log.txt
index da86677..9223025 100644
--- a/release-log.txt
+++ b/release-log.txt
@@ -1,5 +1,6 @@
 -- Oozie 4.1.0 release (trunk - unreleased)
 
+OOZIE-1826 Add thread which detects JVM pauses (rkanter)
 OOZIE-1791 add IGNORED status to Coordinator Job and Action (ryota)
 OOZIE-1825 Optimize wf_jobs protoconf storage (puru via rohini)
 OOZIE-1831 Oozie upgrade fails if workflow jobs are in running or suspended state (satish.mittal via rohini)