You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by st...@apache.org on 2015/09/14 14:48:45 UTC

svn commit: r1702934 - in /jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document: ClusterNodeInfo.java DocumentNodeStore.java DocumentNodeStoreService.java util/LeaseCheckDocumentStoreWrapper.java

Author: stefanegli
Date: Mon Sep 14 12:48:44 2015
New Revision: 1702934

URL: http://svn.apache.org/r1702934
Log:
OAK-3397 : stop oak-core bundle instead of doing a System.exit on lease failure

Modified:
    jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
    jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStore.java
    jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStoreService.java
    jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/LeaseCheckDocumentStoreWrapper.java

Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java?rev=1702934&r1=1702933&r2=1702934&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java Mon Sep 14 12:48:44 2015
@@ -30,6 +30,9 @@ import java.util.UUID;
 import org.apache.jackrabbit.oak.commons.StringUtils;
 import org.apache.jackrabbit.oak.stats.Clock;
 import org.apache.jackrabbit.oak.util.OakVersion;
+import org.osgi.framework.Bundle;
+import org.osgi.framework.BundleException;
+import org.osgi.service.component.ComponentContext;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -38,6 +41,9 @@ import org.slf4j.LoggerFactory;
  */
 public class ClusterNodeInfo {
 
+    private static final String LEASE_CHECK_FAILED_MSG = "performLeaseCheck: this oak instance failed to update "
+            + "the lease in time and can therefore no longer access this DocumentNodeStore.";
+
     private static final Logger LOG = LoggerFactory.getLogger(ClusterNodeInfo.class);
 
     /**
@@ -204,10 +210,12 @@ public class ClusterNodeInfo {
     private ClusterNodeState state;
     
     /**
-     * Whether or not the OAK-2739/leaseCheck failed and thus a System.exit was already triggered
-     * (is used to avoid calling System.exit a hundred times when it then happens)
+     * OAK-2739 / OAK-3397 : once a lease check turns out negative, this flag
+     * is set to prevent any further checks to succeed. Also, only the first
+     * one to change this flag will take the appropriate action that results
+     * from a failed leaseCheck (which is currently to stop oak-core bundle)
      */
-    private volatile boolean systemExitTriggered;
+    private boolean leaseCheckFailed = false;
 
     /**
      * OAK-2739: for development it would be useful to be able to disable the
@@ -236,6 +244,9 @@ public class ClusterNodeInfo {
      */
     private boolean newEntry;
 
+    /** OAK-3397 : this context is used to stop the oak-core bundle in case of lease failure **/
+    private ComponentContext cc;
+
     private ClusterNodeInfo(int id, DocumentStore store, String machineId, String instanceId, ClusterNodeState state,
             RecoverLockState revRecoveryLock, Long leaseEnd, boolean newEntry) {
         this.id = id;
@@ -393,47 +404,96 @@ public class ClusterNodeInfo {
     }
 
     public void performLeaseCheck() {
-        if (leaseCheckDisabled || !renewed) {
+        if (leaseCheckDisabled || !renewed || (cc==null)) {
             // if leaseCheckDisabled is set we never do the check, so return fast
 
             // the 'renewed' flag indicates if this instance *ever* renewed the lease after startup
             // until that is not set, we cannot do the lease check (otherwise startup wouldn't work)
             return;
         }
-        final long now = getCurrentTime();
+        if (leaseCheckFailed) {
+            // unsynchronized access to leaseCheckFailed is fine
+            // since it only ever changes from false to true once
+            // and should the current thread read it erroneously
+            // as false here, it would further down find out that
+            // the lease has indeed still expired and then
+            // go into the synchronized.
+            // (note that once a lease check failed it would not
+            // be updated again, ever, as guaranteed by checking
+            // for leaseCheckFailed in renewLease() )
+            LOG.error(LEASE_CHECK_FAILED_MSG);
+            throw new AssertionError(LEASE_CHECK_FAILED_MSG);
+        }
+        long now = getCurrentTime();
         if (now < (leaseEndTime - leaseTime / 3)) { // OAK-3238 : put the barrier 1/3 before lease end
             // then all is good
             return;
         }
-
-        // OAK-2739 : when the lease is not current, we must stop
-        // the instance immediately to avoid any cluster inconsistency
-        final String errorMsg = "performLeaseCheck: this instance failed to update the lease in time "
-                + "(leaseEndTime: "+leaseEndTime+", now: "+now+", leaseTime: "+leaseTime+") "
-                + "and is thus no longer eligible for taking part in the cluster. Shutting down NOW!";
-        LOG.error(errorMsg);
-
-        // now here comes the thing: we should a) call System.exit in a separate thread
-        // to avoid any deadlock when calling from eg within the shutdown hook
-        // AND b) we should not call system.exit hundred times.
-        // so for b) we use 'systemExitTriggered' to avoid calling it over and over
-        // BUT it doesn't have to be 100% ensured that system.exit is called only once.
-        // it is fine if it gets called once, twice - but just not hundred times.
-        // which is a long way of saying: volatile is fine here - and the 'if' too
-        if (!systemExitTriggered) {
-            systemExitTriggered = true;
-            final Runnable r = new Runnable() {
-
-                @Override
-                public void run() {
-                    System.exit(-1);
-                }
-            };
-            final Thread th = new Thread(r, "FailedLeaseCheckShutdown-Thread");
-            th.setDaemon(true);
-            th.start();
+        synchronized(this) {
+            if (leaseCheckFailed) {
+                LOG.error(LEASE_CHECK_FAILED_MSG);
+                throw new AssertionError(LEASE_CHECK_FAILED_MSG);
+            }
+            // synchronized could have delayed the 'now', so
+            // set it again..
+            now = getCurrentTime();
+            if (now < (leaseEndTime - leaseTime / 3)) { // OAK-3238 : put the barrier 1/3 before lease end
+                // if lease is OK here, then there was a race
+                // between performLeaseCheck and renewLease()
+                // where the winner was: renewLease().
+                // so: luckily we can continue here
+                return;
+            }
+            leaseCheckFailed = true; // make sure only one thread 'wins', ie goes any further
+        }
+        
+        // OAK-3397 : unlike previously, when the lease check fails we should not
+        // do a hard System exit here but rather stop the oak-core bundle 
+        // (or if that fails just deactivate DocumentNodeStore) - with the
+        // goals to prevent this instance to continue to operate
+        // give that a lease failure is a strong indicator of a faulty
+        // instance - and to stop the background threads of DocumentNodeStore,
+        // specifically the BackgroundLeaseUpdate and the BackgroundOperation.
+        
+        final String restarterErrorMsg = LEASE_CHECK_FAILED_MSG+" (leaseEndTime: "+leaseEndTime+
+                ", leaseTime: "+leaseTime+
+                ", lease check end time (1/3 before lease end): "+(leaseEndTime - leaseTime / 3)+
+                ", now: "+now+
+                ", remaining: "+((leaseEndTime - leaseTime / 3) - now)+
+                ") Need to stop oak-core/DocumentNodeStoreService.";
+        LOG.error(restarterErrorMsg);
+        
+        // actual stopping should be done in a separate thread, so:
+        final Runnable r = new Runnable() {
+
+            @Override
+            public void run() {
+                handleLeaseCheckFailed();
+            }
+        };
+        final Thread th = new Thread(r, "FailedLeaseCheck-Thread");
+        th.setDaemon(true);
+        th.start();
+
+        throw new AssertionError(restarterErrorMsg);
+    }
+
+    private void handleLeaseCheckFailed() {
+        try {
+            // plan A: try stopping oak-core
+            LOG.error("handleLeaseCheckFailed: stopping oak-core...");
+            Bundle bundle = cc.getBundleContext().getBundle();
+            bundle.stop();
+            LOG.error("handleLeaseCheckFailed: stopped oak-core.");
+            // plan A worked, perfect!
+        } catch (BundleException e) {
+            LOG.error("handleLeaseCheckFailed: exception while stopping oak-core: "+e, e);
+            // plan B: stop only DocumentNodeStoreService (to stop the background threads)
+            LOG.error("handleLeaseCheckFailed: stopping DocumentNodeStoreService...");
+            cc.disableComponent(DocumentNodeStoreService.class.getName());
+            LOG.error("handleLeaseCheckFailed: stopped DocumentNodeStoreService");
+            // plan B succeeded.
         }
-        throw new AssertionError(errorMsg);
     }
 
     /**
@@ -449,8 +509,18 @@ public class ClusterNodeInfo {
         if (now + 2 * leaseTime / 3 < leaseEndTime) {
             return false;
         }
+        synchronized(this) {
+            if (leaseCheckFailed) {
+                // prevent lease renewal after it failed
+                LOG.error(LEASE_CHECK_FAILED_MSG);
+                throw new AssertionError(LEASE_CHECK_FAILED_MSG);
+            }
+            // synchronized could have delayed the 'now', so
+            // set it again..
+            now = getCurrentTime();
+            leaseEndTime = now + leaseTime;
+        }
         UpdateOp update = new UpdateOp("" + id, true);
-        leaseEndTime = now + leaseTime;
         update.set(LEASE_END_KEY, leaseEndTime);
         update.set(STATE, ClusterNodeState.ACTIVE.name());
         ClusterNodeInfoDocument doc = store.createOrUpdate(Collection.CLUSTER_NODES, update);
@@ -463,6 +533,7 @@ public class ClusterNodeInfo {
         return true;
     }
 
+    /** for testing purpose only, not to be changed at runtime! */
     public void setLeaseTime(long leaseTime) {
         this.leaseTime = leaseTime;
     }
@@ -471,7 +542,17 @@ public class ClusterNodeInfo {
         return leaseTime;
     }
 
+    public void setComponentContext(ComponentContext cc) {
+        this.cc = cc;
+    }
+
     public void dispose() {
+        synchronized(this) {
+            if (leaseCheckFailed) {
+                LOG.warn("dispose: lease check failed, thus not marking instance as cleanly shut down.");
+                return;
+            }
+        }
         UpdateOp update = new UpdateOp("" + id, true);
         update.set(LEASE_END_KEY, null);
         update.set(STATE, null);

Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStore.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStore.java?rev=1702934&r1=1702933&r2=1702934&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStore.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStore.java Mon Sep 14 12:48:44 2015
@@ -566,7 +566,17 @@ public final class DocumentNodeStore
 
         // do a final round of background operations after
         // the background thread stopped
-        internalRunBackgroundUpdateOperations();
+        try{
+            internalRunBackgroundUpdateOperations();
+        } catch(AssertionError ae) {
+            // OAK-3250 : when a lease check fails, subsequent modifying requests
+            // to the DocumentStore will throw an AssertionError. Since as a result
+            // of a failing lease check a bundle.stop is done and thus a dispose of the
+            // DocumentNodeStore happens, it is very likely that in that case 
+            // you run into an AssertionError. We should still continue with disposing
+            // though - thus catching and logging..
+            LOG.error("dispose: an AssertionError happened during dispose's last background ops: "+ae, ae);
+        }
 
         if (leaseUpdateThread != null) {
             try {

Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStoreService.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStoreService.java?rev=1702934&r1=1702933&r2=1702934&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStoreService.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStoreService.java Mon Sep 14 12:48:44 2015
@@ -433,6 +433,13 @@ public class DocumentNodeStoreService {
 
         observerTracker.start(context.getBundleContext());
 
+        ClusterNodeInfo clusterInfo = mk.getNodeStore().getClusterInfo();
+        if (clusterInfo!=null) {
+            clusterInfo.setComponentContext(context);
+        } else {
+            log.warn("registerNodeStore: no ClusterNodeInfo available, lease-check will be disabled!");
+        }
+        
         DocumentStore ds = mk.getDocumentStore();
 
         // OAK-2682: time difference detection applied at startup with a default

Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/LeaseCheckDocumentStoreWrapper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/LeaseCheckDocumentStoreWrapper.java?rev=1702934&r1=1702933&r2=1702934&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/LeaseCheckDocumentStoreWrapper.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/LeaseCheckDocumentStoreWrapper.java Mon Sep 14 12:48:44 2015
@@ -157,8 +157,11 @@ public final class LeaseCheckDocumentSto
         // this is debatable whether or not a lease check should be done on dispose.
         // I'd say the lease must still be valid as on dispose there could be
         // stuff written to the document store which should only be done
-        // when the lease is valid
-        performLeaseCheck();
+        // when the lease is valid.
+        // however.. dispose() is also called as a result of the 'failed lease check stopping'
+        // mechanism - and in that case this would just throw an exception and the
+        // DocumentNodeStore.dispose() would not correctly finish.
+        // so: let's let the dispose ignore the lease state
         delegate.dispose();
     }