You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sling.apache.org by ro...@apache.org on 2017/11/07 09:28:13 UTC

[sling-org-apache-sling-discovery-oak] 05/12: SLING-5326 : adding time-difference health-check 'ClocksInSyncHealthCheck'

This is an automated email from the ASF dual-hosted git repository.

rombert pushed a commit to annotated tag org.apache.sling.discovery.oak-1.2.0
in repository https://gitbox.apache.org/repos/asf/sling-org-apache-sling-discovery-oak.git

commit 5808792743806cd9fa182b84727da2ac4dcaeb7d
Author: Stefan Egli <st...@apache.org>
AuthorDate: Tue Nov 24 15:46:37 2015 +0000

    SLING-5326 : adding time-difference health-check 'ClocksInSyncHealthCheck'
    
    git-svn-id: https://svn.apache.org/repos/asf/sling/trunk/bundles/extensions/discovery/oak@1716181 13f79535-47bb-0310-9956-ffa450edef68
---
 pom.xml                                            |   8 +-
 .../discovery/oak/ClocksInSyncHealthCheck.java     | 186 +++++++++++++++++++++
 2 files changed, 193 insertions(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 8fcb059..085c5be 100644
--- a/pom.xml
+++ b/pom.xml
@@ -29,7 +29,7 @@
 
     <artifactId>org.apache.sling.discovery.oak</artifactId>
     <packaging>bundle</packaging>
-    <version>1.1.1-SNAPSHOT</version>
+    <version>1.2.0-SNAPSHOT</version>
 
     <name>Apache Sling Oak-Based Discovery Service</name>
     <description>Implementation of Apache Sling Discovery based on Jackrabbit Oak using its discovery-lite descriptor for in-cluster view detection and a TopologyView through HTTP POST heartbeats announcing sub-topologies to each other.</description>
@@ -319,5 +319,11 @@
             <version>1.3.7</version>
             <scope>test</scope>
         </dependency>
+        <dependency>
+        	<groupId>org.apache.sling</groupId>
+        	<artifactId>org.apache.sling.hc.core</artifactId>
+        	<version>1.0.6</version>
+        	<type>bundle</type>
+        </dependency>
     </dependencies>
 </project>
diff --git a/src/main/java/org/apache/sling/discovery/oak/ClocksInSyncHealthCheck.java b/src/main/java/org/apache/sling/discovery/oak/ClocksInSyncHealthCheck.java
new file mode 100644
index 0000000..900428f
--- /dev/null
+++ b/src/main/java/org/apache/sling/discovery/oak/ClocksInSyncHealthCheck.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sling.discovery.oak;
+
+import java.lang.management.ManagementFactory;
+import java.util.Collection;
+import java.util.Set;
+
+import javax.management.MBeanServer;
+import javax.management.ObjectName;
+
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.PropertyUnbounded;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.sling.discovery.base.connectors.announcement.Announcement;
+import org.apache.sling.discovery.base.connectors.announcement.AnnouncementRegistry;
+import org.apache.sling.discovery.base.connectors.announcement.CachedAnnouncement;
+import org.apache.sling.hc.api.HealthCheck;
+import org.apache.sling.hc.api.Result;
+import org.apache.sling.hc.util.FormattingResultLog;
+import org.apache.sling.settings.SlingSettingsService;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * HealthCheck that reports clock differences in two steps. First it invokes
+ * determineServerTimeDifferenceMillis on the first DocumentNodeStore MBean
+ * found via JMX and compares the absolute result with the intra-cluster
+ * water marks (1sec/5sec). Then, for each local announcement of the
+ * AnnouncementRegistry, it compares |originallyCreatedAt - receivedAt| with
+ * the inter-cluster water marks (5sec/10sec). In both steps:
+ * <ul>
+ * <li>a difference equal or higher than the high-water mark issues a critical</li>
+ * <li>a difference lower than the high-water but equal or higher than the
+ * low-water mark issues only a warn</li>
+ * <li>a difference lower than the low-water mark issues only an info</li>
+ * </ul>
+ */
+@Component(immediate = true, metatype = true, label = "Apache Sling Discovery Oak Clocks-In-Sync Health Check")
+@Properties({
+        @Property(name = HealthCheck.NAME, value = "SlingDiscoveryOakClocksInSyncHC", description = "Health Check name", label = "Name"),
+        @Property(name = HealthCheck.TAGS, unbounded = PropertyUnbounded.ARRAY, description = "Health Check tags", label = "Tags"),
+        @Property(name = HealthCheck.MBEAN_NAME, value = "slingDiscoveryOakClocksInSync", description = "Health Check MBean name", label = "MBean name") })
+@Service(value = HealthCheck.class)
+public class ClocksInSyncHealthCheck implements HealthCheck {
+
+    protected final Logger logger = LoggerFactory.getLogger(getClass());
+
+    private static final String DOCUMENT_NODE_STORE_MBEAN = "org.apache.jackrabbit.oak:name=*,type=\"DocumentNodeStore\",id=*";
+    private static final String TIME_DIFF_METHOD_NAME = "determineServerTimeDifferenceMillis";
+
+    private static final long INTRA_CLUSTER_HIGH_WATER_MARK = 5000;
+    private static final long INTRA_CLUSTER_LOW_WATER_MARK = 1000;
+
+    private static final long INTER_CLUSTER_HIGH_WATER_MARK = 10000;
+    private static final long INTER_CLUSTER_LOW_WATER_MARK = 5000;
+
+    @Reference
+    private AnnouncementRegistry announcementRegistry;
+
+    @Reference
+    private SlingSettingsService settingsService;
+
+    @Override
+    public Result execute() {
+        final FormattingResultLog resultLog = new FormattingResultLog();
+        resultLog.debug("Checking cluster internal clocks");
+        try {
+            final MBeanServer jmxServer = ManagementFactory.getPlatformMBeanServer();
+            ObjectName n = new ObjectName(DOCUMENT_NODE_STORE_MBEAN);
+            Set<ObjectName> names = jmxServer.queryNames(n, null);
+
+            if (names.isEmpty()) {
+                resultLog.info("Intra-cluster test n/a (No DocumentNodeStore MBean found)");
+            } else {
+                // only the first matching MBean is queried, any further matches are ignored
+                ObjectName firstName = names.iterator().next();
+                final Object value = jmxServer.invoke(firstName, TIME_DIFF_METHOD_NAME, new Object[0], new String[0]);
+                logger.debug("{}.{} returns {}", new Object[] { firstName, TIME_DIFF_METHOD_NAME, value });
+                resultLog.debug("{}.{} returns {}", firstName, TIME_DIFF_METHOD_NAME, value);
+                if (value instanceof Long) {
+                    final long diffMillis = (Long) value;
+                    if (Math.abs(diffMillis) >= INTRA_CLUSTER_HIGH_WATER_MARK) {
+                        logger.warn(
+                                "execute: clocks in local cluster out of sync by {}ms "
+                                        + "which is equal or higher than the high-water mark of {}ms.",
+                                diffMillis, INTRA_CLUSTER_HIGH_WATER_MARK);
+                        resultLog.critical(
+                                "Clocks heavily out of sync in local cluster: "
+                                        + "time difference of this VM with DocumentStore server: "
+                                        + "{}ms is equal or larger than high-water mark of {}ms",
+                                diffMillis, INTRA_CLUSTER_HIGH_WATER_MARK);
+                    } else if (Math.abs(diffMillis) >= INTRA_CLUSTER_LOW_WATER_MARK) {
+                        logger.warn(
+                                "execute: clocks in local cluster out of sync by {}ms "
+                                        + "which is equal or higher than the low-water mark of {}ms.",
+                                diffMillis, INTRA_CLUSTER_LOW_WATER_MARK);
+                        resultLog.warn(
+                                "Clocks noticeably out of sync in local cluster: "
+                                        + "time difference of this VM with DocumentStore server: "
+                                        + "{}ms is equal or larger than low-water mark of {}ms",
+                                diffMillis, INTRA_CLUSTER_LOW_WATER_MARK);
+                    } else {
+                        logger.debug("execute: clocks in local cluster in sync. diff is {}ms "
+                                + "which is within low-water mark of {}ms.", diffMillis, INTRA_CLUSTER_LOW_WATER_MARK);
+                        resultLog.info("Clocks in sync in local cluster: time difference of this VM with DocumentStore server: "
+                                + "{}ms is within low-water mark of {}ms", diffMillis, INTRA_CLUSTER_LOW_WATER_MARK);
+                    }
+                }
+            }
+        } catch (final Exception e) {
+            logger.warn("execute: {}, JMX method {} invocation failed: {}",
+                    new Object[] { DOCUMENT_NODE_STORE_MBEAN, TIME_DIFF_METHOD_NAME, e });
+            resultLog.healthCheckError("{}, JMX method {} invocation failed: {}", DOCUMENT_NODE_STORE_MBEAN, TIME_DIFF_METHOD_NAME, e);
+        }
+
+        final String slingId = settingsService == null ? "n/a" : settingsService.getSlingId();
+
+        if (announcementRegistry == null) {
+            logger.warn("execute: no announcementRegistry ({}) set", announcementRegistry);
+            resultLog.warn("Cannot determine topology clocks since no announcementRegistry ({}) set", announcementRegistry);
+        } else {
+            final Collection<Announcement> localAnnouncements = announcementRegistry.listLocalAnnouncements();
+            if (localAnnouncements.isEmpty()) {
+                logger.info("execute: no topology connectors connected to local instance.");
+                resultLog.info("No topology connectors connected to local instance.");
+            }
+            for (Announcement ann : localAnnouncements) {
+                final String peerSlingId = ann.isInherited() ? ann.getServerInfo() : ann.getOwnerId();
+                // NOTE(review): presumably originallyCreatedAt is stamped by the peer and receivedAt
+                // locally, so the difference would also contain the transport delay - confirm
+                final long diffMillis = Math.abs(ann.getOriginallyCreatedAt() - ann.getReceivedAt());
+                if (diffMillis >= INTER_CLUSTER_HIGH_WATER_MARK) {
+                    logger.warn(
+                            "execute: clocks between local instance (slingId: {}) and remote instance (slingId: {}) out of sync by {}ms "
+                                    + "which is equal or higher than the high-water mark of {}ms.",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_HIGH_WATER_MARK });
+                    resultLog.critical(
+                            "Clocks heavily out of sync between local instance (slingId: {}) and remote instance (slingId: {}): "
+                                    + "by {}ms which is equal or larger than high-water mark of {}ms",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_HIGH_WATER_MARK });
+                } else if (diffMillis >= INTER_CLUSTER_LOW_WATER_MARK) {
+                    logger.warn(
+                            "execute: clocks out of sync between local instance (slingId: {}) and remote instance (slingId: {}) by {}ms "
+                                    + "which is equal or higher than the low-water mark of {}ms.",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_LOW_WATER_MARK });
+                    resultLog.warn(
+                            "Clocks noticeably out of sync between local instance (slingId: {}) and remote instance (slingId: {}): "
+                            + "by {}ms which is equal or larger than low-water mark of {}ms",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_LOW_WATER_MARK });
+                } else {
+                    logger.debug(
+                            "execute: clocks in sync between local instance (slingId: {}) and remote instance (slingId: {}). "
+                            + "diff is {}ms which is within low-water mark of {}ms.",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_LOW_WATER_MARK });
+                    resultLog.info(
+                            "Clocks in sync between local instance (slingId: {}) and remote instance (slingId: {}): "
+                            + "diff is {}ms which is within low-water mark of {}ms",
+                            new Object[] { slingId, peerSlingId, diffMillis, INTER_CLUSTER_LOW_WATER_MARK });
+                }
+            }
+        }
+
+        return new Result(resultLog);
+    }
+
+}

-- 
To stop receiving notification emails like this one, please contact
"commits@sling.apache.org" <co...@sling.apache.org>.