You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by st...@apache.org on 2015/09/15 11:31:32 UTC
svn commit: r1703129 -
/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
Author: stefanegli
Date: Tue Sep 15 09:31:31 2015
New Revision: 1703129
URL: http://svn.apache.org/r1703129
Log:
OAK-3398 : make lease update more robust by checking state and leaseEnd properties are unchanged when updating the lease - if they are have changed then that can only be done by another instance in the cluster which was declaring this instance as timed out - and in that case the local instance should gracefully step back, aka handleLeaseFailure, oak-core stop
Modified:
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java?rev=1703129&r1=1703128&r2=1703129&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java Tue Sep 15 09:31:31 2015
@@ -38,7 +38,7 @@ import org.slf4j.LoggerFactory;
*/
public class ClusterNodeInfo {
- private static final String LEASE_CHECK_FAILED_MSG = "performLeaseCheck: this oak instance failed to update "
+ private static final String LEASE_CHECK_FAILED_MSG = "This oak instance failed to update "
+ "the lease in time and can therefore no longer access this DocumentNodeStore.";
private static final Logger LOG = LoggerFactory.getLogger(ClusterNodeInfo.class);
@@ -222,6 +222,20 @@ public class ClusterNodeInfo {
* The time (in milliseconds UTC) where the lease of this instance ends.
*/
private volatile long leaseEndTime;
+
+ /**
+ * The value of leaseEnd last updated towards DocumentStore -
+ * this one is used to compare against (for OAK-3398) when checking
+ * if any other instance updated the lease or if the lease is unchanged.
+ * (This is kind of a duplication of the leaseEndTime field, yes - but the semantics
+ * are that previousLeaseEndTime exactly only serves the purpose of
+ * keeping the value of what was stored in the previous lease update.
+ * leaseEndTime on the other hand serves the purpose of *defining the lease end*,
+ * these are two different concerns, thus justify two different fields.
+ * the leaseEndTime for example can be manipulated during tests therefore,
+ * without interfering with previousLeaseEndTime)
+ */
+ private long previousLeaseEndTime;
/**
* The read/write mode.
@@ -455,8 +469,12 @@ public class ClusterNodeInfo {
// then all is good
return;
}
+ // synchronized: we need to guard leaseCheckFailed in order to ensure
+ // that it is only set by 1 thread - thus handleLeaseFailure
+ // is guaranteed to be only called once
synchronized(this) {
if (leaseCheckFailed) {
+ // someone else won and marked leaseCheckFailed - so we only log/throw
LOG.error(LEASE_CHECK_FAILED_MSG);
throw new AssertionError(LEASE_CHECK_FAILED_MSG);
}
@@ -473,6 +491,19 @@ public class ClusterNodeInfo {
leaseCheckFailed = true; // make sure only one thread 'wins', ie goes any further
}
+ final String errorMsg = LEASE_CHECK_FAILED_MSG+" (leaseEndTime: "+leaseEndTime+
+ ", leaseTime: "+leaseTime+
+ ", leaseFailureMargin: "+leaseFailureMargin+
+ ", lease check end time (leaseEndTime-leaseFailureMargin): "+(leaseEndTime - leaseFailureMargin)+
+ ", now: "+now+
+ ", remaining: "+((leaseEndTime - leaseFailureMargin) - now)+
+ ") Need to stop oak-core/DocumentNodeStoreService.";
+ LOG.error(errorMsg);
+
+ handleLeaseFailure(errorMsg);
+ }
+
+ private void handleLeaseFailure(final String errorMsg) {
// OAK-3397 : unlike previously, when the lease check fails we should not
// do a hard System exit here but rather stop the oak-core bundle
// (or if that fails just deactivate DocumentNodeStore) - with the
@@ -481,15 +512,6 @@ public class ClusterNodeInfo {
// instance - and to stop the background threads of DocumentNodeStore,
// specifically the BackgroundLeaseUpdate and the BackgroundOperation.
- final String restarterErrorMsg = LEASE_CHECK_FAILED_MSG+" (leaseEndTime: "+leaseEndTime+
- ", leaseTime: "+leaseTime+
- ", leaseFailureMargin: "+leaseFailureMargin+
- ", lease check end time (leaseEndTime-leaseFailureMargin): "+(leaseEndTime - leaseFailureMargin)+
- ", now: "+now+
- ", remaining: "+((leaseEndTime - leaseFailureMargin) - now)+
- ") Need to stop oak-core/DocumentNodeStoreService.";
- LOG.error(restarterErrorMsg);
-
// actual stopping should be done in a separate thread, so:
if (leaseFailureHandler!=null) {
final Runnable r = new Runnable() {
@@ -506,23 +528,32 @@ public class ClusterNodeInfo {
th.start();
}
- throw new AssertionError(restarterErrorMsg);
+ throw new AssertionError(errorMsg);
}
/**
* Renew the cluster id lease. This method needs to be called once in a while,
* to ensure the same cluster id is not re-used by a different instance.
- * The lease is only renewed when after leaseUpdateInterval millis
- * since last lease update - default being every 10 sec.
+ * The lease is only renewed after 'leaseUpdateInterval' millis
+ * since last lease update - default being every 10 sec (this used to be 30sec).
*
* @return {@code true} if the lease was renewed; {@code false} otherwise.
*/
public boolean renewLease() {
long now = getCurrentTime();
if (now < leaseEndTime - leaseTime + leaseUpdateInterval) {
+ // no need to renew the lease - it is still within 'leaseUpdateInterval'
return false;
}
+ // lease requires renewal
+
synchronized(this) {
+ // this is synchronized since access to leaseCheckFailed and leaseEndTime
+ // are both normally synchronzied to propagate values between renewLease()
+ // and performLeaseCheck().
+ // (there are unsychronized accesses to both of these as well - however
+ // they are both double-checked - and with both reading a stale value is thus OK)
+
if (leaseCheckFailed) {
// prevent lease renewal after it failed
LOG.error(LEASE_CHECK_FAILED_MSG);
@@ -536,7 +567,55 @@ public class ClusterNodeInfo {
UpdateOp update = new UpdateOp("" + id, true);
update.set(LEASE_END_KEY, leaseEndTime);
update.set(STATE, ClusterNodeState.ACTIVE.name());
- ClusterNodeInfoDocument doc = store.createOrUpdate(Collection.CLUSTER_NODES, update);
+ ClusterNodeInfoDocument doc = null;
+ if (renewed && !leaseCheckDisabled) { // if leaseCheckDisabled, then we just update the lease without checking
+ // OAK-3398:
+ // if we renewed the lease ever with this instance/ClusterNodeInfo
+ // (which is the normal case.. except for startup),
+ // then we can now make an assertion that the lease is unchanged
+ // and the incremental update must only succeed if no-one else
+ // did a recover/inactivation in the meantime
+ update.setNew(false); // in this case it is *not* a new document
+ // make two assertions: the leaseEnd must match ..
+ update.equals(LEASE_END_KEY, null, previousLeaseEndTime);
+ // plus it must still be active ..
+ update.equals(STATE, null, ClusterNodeState.ACTIVE.name());
+ // @TODO: to make it 100% failure proof we could introduce
+ // yet another field to clusterNodes: a runtimeId that we
+ // create (UUID) at startup each time - and against that
+ // we could also check here - but that goes a bit far IMO
+ doc = store.findAndUpdate(Collection.CLUSTER_NODES, update);
+ } else {
+ // this is only for startup - then we 'just' overwrite
+ // the lease - or create it - and don't care a lot about what the
+ // status of the lease was
+ doc = store.createOrUpdate(Collection.CLUSTER_NODES, update);
+ }
+ if (doc==null) { // should not occur when leaseCheckDisabled
+ // OAK-3398 : someone else either started recovering or is already through with that.
+ // in both cases the local instance lost the lease-update-game - and hence
+ // should behave and must consider itself as 'lease failed'
+
+ synchronized(this) {
+ if (leaseCheckFailed) {
+ // somehow the instance figured out otherwise that the
+ // lease check failed - so we don't have to too - so we just log/throw
+ LOG.error(LEASE_CHECK_FAILED_MSG);
+ throw new AssertionError(LEASE_CHECK_FAILED_MSG);
+ }
+ leaseCheckFailed = true; // make sure only one thread 'wins', ie goes any further
+ }
+ final String errorMsg = LEASE_CHECK_FAILED_MSG+
+ " (Could not update lease anymore, someone else in the cluster "
+ + "must have noticed this instance' slowness already. "
+ + "Going to invoke leaseFailureHandler!)";
+ LOG.error(errorMsg);
+
+ handleLeaseFailure(errorMsg);
+ // should never be reached: handleLeaseFailure throws an AssertionError
+ return false;
+ }
+ previousLeaseEndTime = leaseEndTime; // store previousLeaseEndTime for reference for next time
String mode = (String) doc.get(READ_WRITE_MODE_KEY);
if (mode != null && !mode.equals(readWriteMode)) {
readWriteMode = mode;