You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by st...@apache.org on 2015/09/14 14:48:45 UTC
svn commit: r1702934 - in
/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document:
ClusterNodeInfo.java DocumentNodeStore.java DocumentNodeStoreService.java
util/LeaseCheckDocumentStoreWrapper.java
Author: stefanegli
Date: Mon Sep 14 12:48:44 2015
New Revision: 1702934
URL: http://svn.apache.org/r1702934
Log:
OAK-3397 : stop oak-core bundle instead of doing a System.exit on lease failure
Modified:
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStore.java
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStoreService.java
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/LeaseCheckDocumentStoreWrapper.java
Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java?rev=1702934&r1=1702933&r2=1702934&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java Mon Sep 14 12:48:44 2015
@@ -30,6 +30,9 @@ import java.util.UUID;
import org.apache.jackrabbit.oak.commons.StringUtils;
import org.apache.jackrabbit.oak.stats.Clock;
import org.apache.jackrabbit.oak.util.OakVersion;
+import org.osgi.framework.Bundle;
+import org.osgi.framework.BundleException;
+import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -38,6 +41,9 @@ import org.slf4j.LoggerFactory;
*/
public class ClusterNodeInfo {
+ private static final String LEASE_CHECK_FAILED_MSG = "performLeaseCheck: this oak instance failed to update "
+ + "the lease in time and can therefore no longer access this DocumentNodeStore.";
+
private static final Logger LOG = LoggerFactory.getLogger(ClusterNodeInfo.class);
/**
@@ -204,10 +210,12 @@ public class ClusterNodeInfo {
private ClusterNodeState state;
/**
- * Whether or not the OAK-2739/leaseCheck failed and thus a System.exit was already triggered
- * (is used to avoid calling System.exit a hundred times when it then happens)
+ * OAK-2739 / OAK-3397 : once a lease check turns out negative, this flag
+ * is set to prevent any further checks to succeed. Also, only the first
+ * one to change this flag will take the appropriate action that results
+ * from a failed leaseCheck (which is currently to stop oak-core bundle)
*/
- private volatile boolean systemExitTriggered;
+ private boolean leaseCheckFailed = false;
/**
* OAK-2739: for development it would be useful to be able to disable the
@@ -236,6 +244,9 @@ public class ClusterNodeInfo {
*/
private boolean newEntry;
+ /** OAK-3397 : this context is used to stop the oak-core bundle in case of lease failure **/
+ private ComponentContext cc;
+
private ClusterNodeInfo(int id, DocumentStore store, String machineId, String instanceId, ClusterNodeState state,
RecoverLockState revRecoveryLock, Long leaseEnd, boolean newEntry) {
this.id = id;
@@ -393,47 +404,96 @@ public class ClusterNodeInfo {
}
public void performLeaseCheck() {
- if (leaseCheckDisabled || !renewed) {
+ if (leaseCheckDisabled || !renewed || (cc==null)) {
// if leaseCheckDisabled is set we never do the check, so return fast
// the 'renewed' flag indicates if this instance *ever* renewed the lease after startup
// until that is not set, we cannot do the lease check (otherwise startup wouldn't work)
return;
}
- final long now = getCurrentTime();
+ if (leaseCheckFailed) {
+ // unsynchronized access to leaseCheckFailed is fine
+ // since it only ever changes from false to true once
+ // and should the current thread read it erroneously
+ // as false here, it would further down find out that
+ // the lease has indeed still expired and then
+ // go into the synchronized.
+ // (note that once a lease check failed it would not
+ // be updated again, ever, as guaranteed by checking
+ // for leaseCheckFailed in renewLease() )
+ LOG.error(LEASE_CHECK_FAILED_MSG);
+ throw new AssertionError(LEASE_CHECK_FAILED_MSG);
+ }
+ long now = getCurrentTime();
if (now < (leaseEndTime - leaseTime / 3)) { // OAK-3238 : put the barrier 1/3 before lease end
// then all is good
return;
}
-
- // OAK-2739 : when the lease is not current, we must stop
- // the instance immediately to avoid any cluster inconsistency
- final String errorMsg = "performLeaseCheck: this instance failed to update the lease in time "
- + "(leaseEndTime: "+leaseEndTime+", now: "+now+", leaseTime: "+leaseTime+") "
- + "and is thus no longer eligible for taking part in the cluster. Shutting down NOW!";
- LOG.error(errorMsg);
-
- // now here comes the thing: we should a) call System.exit in a separate thread
- // to avoid any deadlock when calling from eg within the shutdown hook
- // AND b) we should not call system.exit hundred times.
- // so for b) we use 'systemExitTriggered' to avoid calling it over and over
- // BUT it doesn't have to be 100% ensured that system.exit is called only once.
- // it is fine if it gets called once, twice - but just not hundred times.
- // which is a long way of saying: volatile is fine here - and the 'if' too
- if (!systemExitTriggered) {
- systemExitTriggered = true;
- final Runnable r = new Runnable() {
-
- @Override
- public void run() {
- System.exit(-1);
- }
- };
- final Thread th = new Thread(r, "FailedLeaseCheckShutdown-Thread");
- th.setDaemon(true);
- th.start();
+ synchronized(this) {
+ if (leaseCheckFailed) {
+ LOG.error(LEASE_CHECK_FAILED_MSG);
+ throw new AssertionError(LEASE_CHECK_FAILED_MSG);
+ }
+ // synchronized could have delayed the 'now', so
+ // set it again..
+ now = getCurrentTime();
+ if (now < (leaseEndTime - leaseTime / 3)) { // OAK-3238 : put the barrier 1/3 before lease end
+ // if lease is OK here, then there was a race
+ // between performLeaseCheck and renewLease()
+ // where the winner was: renewLease().
+ // so: luckily we can continue here
+ return;
+ }
+ leaseCheckFailed = true; // make sure only one thread 'wins', ie goes any further
+ }
+
+ // OAK-3397 : unlike previously, when the lease check fails we should not
+ // do a hard System exit here but rather stop the oak-core bundle
+ // (or if that fails just deactivate DocumentNodeStore) - with the
+ // goals to prevent this instance to continue to operate
+ // give that a lease failure is a strong indicator of a faulty
+ // instance - and to stop the background threads of DocumentNodeStore,
+ // specifically the BackgroundLeaseUpdate and the BackgroundOperation.
+
+ final String restarterErrorMsg = LEASE_CHECK_FAILED_MSG+" (leaseEndTime: "+leaseEndTime+
+ ", leaseTime: "+leaseTime+
+ ", lease check end time (1/3 before lease end): "+(leaseEndTime - leaseTime / 3)+
+ ", now: "+now+
+ ", remaining: "+((leaseEndTime - leaseTime / 3) - now)+
+ ") Need to stop oak-core/DocumentNodeStoreService.";
+ LOG.error(restarterErrorMsg);
+
+ // actual stopping should be done in a separate thread, so:
+ final Runnable r = new Runnable() {
+
+ @Override
+ public void run() {
+ handleLeaseCheckFailed();
+ }
+ };
+ final Thread th = new Thread(r, "FailedLeaseCheck-Thread");
+ th.setDaemon(true);
+ th.start();
+
+ throw new AssertionError(restarterErrorMsg);
+ }
+
+ private void handleLeaseCheckFailed() {
+ try {
+ // plan A: try stopping oak-core
+ LOG.error("handleLeaseCheckFailed: stopping oak-core...");
+ Bundle bundle = cc.getBundleContext().getBundle();
+ bundle.stop();
+ LOG.error("handleLeaseCheckFailed: stopped oak-core.");
+ // plan A worked, perfect!
+ } catch (BundleException e) {
+ LOG.error("handleLeaseCheckFailed: exception while stopping oak-core: "+e, e);
+ // plan B: stop only DocumentNodeStoreService (to stop the background threads)
+ LOG.error("handleLeaseCheckFailed: stopping DocumentNodeStoreService...");
+ cc.disableComponent(DocumentNodeStoreService.class.getName());
+ LOG.error("handleLeaseCheckFailed: stopped DocumentNodeStoreService");
+ // plan B succeeded.
}
- throw new AssertionError(errorMsg);
}
/**
@@ -449,8 +509,18 @@ public class ClusterNodeInfo {
if (now + 2 * leaseTime / 3 < leaseEndTime) {
return false;
}
+ synchronized(this) {
+ if (leaseCheckFailed) {
+ // prevent lease renewal after it failed
+ LOG.error(LEASE_CHECK_FAILED_MSG);
+ throw new AssertionError(LEASE_CHECK_FAILED_MSG);
+ }
+ // synchronized could have delayed the 'now', so
+ // set it again..
+ now = getCurrentTime();
+ leaseEndTime = now + leaseTime;
+ }
UpdateOp update = new UpdateOp("" + id, true);
- leaseEndTime = now + leaseTime;
update.set(LEASE_END_KEY, leaseEndTime);
update.set(STATE, ClusterNodeState.ACTIVE.name());
ClusterNodeInfoDocument doc = store.createOrUpdate(Collection.CLUSTER_NODES, update);
@@ -463,6 +533,7 @@ public class ClusterNodeInfo {
return true;
}
+ /** for testing purpose only, not to be changed at runtime! */
public void setLeaseTime(long leaseTime) {
this.leaseTime = leaseTime;
}
@@ -471,7 +542,17 @@ public class ClusterNodeInfo {
return leaseTime;
}
+ public void setComponentContext(ComponentContext cc) {
+ this.cc = cc;
+ }
+
public void dispose() {
+ synchronized(this) {
+ if (leaseCheckFailed) {
+ LOG.warn("dispose: lease check failed, thus not marking instance as cleanly shut down.");
+ return;
+ }
+ }
UpdateOp update = new UpdateOp("" + id, true);
update.set(LEASE_END_KEY, null);
update.set(STATE, null);
Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStore.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStore.java?rev=1702934&r1=1702933&r2=1702934&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStore.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStore.java Mon Sep 14 12:48:44 2015
@@ -566,7 +566,17 @@ public final class DocumentNodeStore
// do a final round of background operations after
// the background thread stopped
- internalRunBackgroundUpdateOperations();
+ try{
+ internalRunBackgroundUpdateOperations();
+ } catch(AssertionError ae) {
+ // OAK-3250 : when a lease check fails, subsequent modifying requests
+ // to the DocumentStore will throw an AssertionError. Since as a result
+ // of a failing lease check a bundle.stop is done and thus a dispose of the
+ // DocumentNodeStore happens, it is very likely that in that case
+ // you run into an AssertionError. We should still continue with disposing
+ // though - thus catching and logging..
+ LOG.error("dispose: an AssertionError happened during dispose's last background ops: "+ae, ae);
+ }
if (leaseUpdateThread != null) {
try {
Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStoreService.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStoreService.java?rev=1702934&r1=1702933&r2=1702934&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStoreService.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/DocumentNodeStoreService.java Mon Sep 14 12:48:44 2015
@@ -433,6 +433,13 @@ public class DocumentNodeStoreService {
observerTracker.start(context.getBundleContext());
+ ClusterNodeInfo clusterInfo = mk.getNodeStore().getClusterInfo();
+ if (clusterInfo!=null) {
+ clusterInfo.setComponentContext(context);
+ } else {
+ log.warn("registerNodeStore: no ClusterNodeInfo available, lease-check will be disabled!");
+ }
+
DocumentStore ds = mk.getDocumentStore();
// OAK-2682: time difference detection applied at startup with a default
Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/LeaseCheckDocumentStoreWrapper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/LeaseCheckDocumentStoreWrapper.java?rev=1702934&r1=1702933&r2=1702934&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/LeaseCheckDocumentStoreWrapper.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/util/LeaseCheckDocumentStoreWrapper.java Mon Sep 14 12:48:44 2015
@@ -157,8 +157,11 @@ public final class LeaseCheckDocumentSto
// this is debatable whether or not a lease check should be done on dispose.
// I'd say the lease must still be valid as on dispose there could be
// stuff written to the document store which should only be done
- // when the lease is valid
- performLeaseCheck();
+ // when the lease is valid.
+ // however.. dispose() is also called as a result of the 'failed lease check stopping'
+ // mechanism - and in that case this would just throw an exception and the
+ // DocumentNodeStore.dispose() would not correctly finish.
+ // so: let's let the dispose ignore the lease state
delegate.dispose();
}