You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by st...@apache.org on 2015/09/14 15:51:18 UTC
svn commit: r1702953 - in /jackrabbit/oak/trunk/oak-core/src:
main/java/org/apache/jackrabbit/oak/plugins/document/
test/java/org/apache/jackrabbit/oak/plugins/document/
Author: stefanegli
Date: Mon Sep 14 13:51:18 2015
New Revision: 1702953
URL: http://svn.apache.org/r1702953
Log:
OAK-3398 : increase lease timeout to 120sec from 60sec - plus update it every 10 sec not only every 20sec - plus make the lease failure margin explicit and set it to 20sec as was previously implicitly the case
Modified:
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterInfoTest.java
jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterTest.java
jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java
Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java?rev=1702953&r1=1702952&r2=1702953&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java Mon Sep 14 13:51:18 2015
@@ -155,15 +155,42 @@ public class ClusterNodeInfo {
*/
private static Clock clock = Clock.SIMPLE;
-
- public static final int DEFAULT_LEASE_DURATION_MILLIS = 1000 * 60;
+ /** OAK-3398 : default lease duration 120sec **/
+ public static final int DEFAULT_LEASE_DURATION_MILLIS = 1000 * 120;
+
+ /** OAK-3398 : default update interval 10sec **/
+ public static final int DEFAULT_LEASE_UPDATE_INTERVAL_MILLIS = 1000 * 10;
+
+ /** OAK-3398 : default failure margin 20sec before actual lease timeout **/
+ public static final int DEFAULT_LEASE_FAILURE_MARGIN_MILLIS = 1000 * 20;
/**
- * The number of milliseconds for a lease (1 minute by default, and
+ * The number of milliseconds for a lease (2 minutes by default, and
* initially).
*/
private long leaseTime = DEFAULT_LEASE_DURATION_MILLIS;
-
+
+ /**
+ * The number of milliseconds after which a lease will be updated
+ * (should not be every second as that would increase the number of
+ * writes towards DocumentStore considerably - but it should also
+ * not be too long as that would eat into the lease duration on average).
+ */
+ private long leaseUpdateInterval = DEFAULT_LEASE_UPDATE_INTERVAL_MILLIS;
+
+ /**
+ * The number of milliseconds before the actual lease end at which
+ * the lease is prematurely declared as failed. The default is 20sec.
+ * The idea of declaring a lease as failed before it actually failed
+ * is to avoid a race condition where the local instance assumes
+ * things are all fine but another instance in the cluster will
+ * 'in the same moment' declare it as failed. The lease should be
+ * checked every second and updated after 10sec, so it should always
+ * have a validity of at least 110sec - if that's down to this margin
+ * of 20sec then things are not good and we have to give up.
+ */
+ private long leaseFailureMargin = DEFAULT_LEASE_FAILURE_MARGIN_MILLIS;
+
/**
* The assigned cluster id.
*/
@@ -425,7 +452,9 @@ public class ClusterNodeInfo {
throw new AssertionError(LEASE_CHECK_FAILED_MSG);
}
long now = getCurrentTime();
- if (now < (leaseEndTime - leaseTime / 3)) { // OAK-3238 : put the barrier 1/3 before lease end
+ // OAK-3238 put the barrier 1/3 of 60sec=20sec before the end
+ // OAK-3398 keeps this the same but uses an explicit leaseFailureMargin for this
+ if (now < (leaseEndTime - leaseFailureMargin)) {
// then all is good
return;
}
@@ -437,7 +466,7 @@ public class ClusterNodeInfo {
// synchronized could have delayed the 'now', so
// set it again..
now = getCurrentTime();
- if (now < (leaseEndTime - leaseTime / 3)) { // OAK-3238 : put the barrier 1/3 before lease end
+ if (now < (leaseEndTime - leaseFailureMargin)) {
// if lease is OK here, then there was a race
// between performLeaseCheck and renewLease()
// where the winner was: renewLease().
@@ -457,9 +486,10 @@ public class ClusterNodeInfo {
final String restarterErrorMsg = LEASE_CHECK_FAILED_MSG+" (leaseEndTime: "+leaseEndTime+
", leaseTime: "+leaseTime+
- ", lease check end time (1/3 before lease end): "+(leaseEndTime - leaseTime / 3)+
+ ", leaseFailureMargin: "+leaseFailureMargin+
+ ", lease check end time (leaseEndTime-leaseFailureMargin): "+(leaseEndTime - leaseFailureMargin)+
", now: "+now+
- ", remaining: "+((leaseEndTime - leaseTime / 3) - now)+
+ ", remaining: "+((leaseEndTime - leaseFailureMargin) - now)+
") Need to stop oak-core/DocumentNodeStoreService.";
LOG.error(restarterErrorMsg);
@@ -499,14 +529,14 @@ public class ClusterNodeInfo {
/**
* Renew the cluster id lease. This method needs to be called once in a while,
* to ensure the same cluster id is not re-used by a different instance.
- * The lease is only renewed when a third of the lease time passed. That is,
- * with a lease time of 60 seconds, the lease is renewed every 20 seconds.
+ * The lease is only renewed once leaseUpdateInterval millis have passed
+ * since the last lease update - the default being every 10 sec.
*
* @return {@code true} if the lease was renewed; {@code false} otherwise.
*/
public boolean renewLease() {
long now = getCurrentTime();
- if (now + 2 * leaseTime / 3 < leaseEndTime) {
+ if (now < leaseEndTime - leaseTime + leaseUpdateInterval) {
return false;
}
synchronized(this) {
@@ -534,9 +564,14 @@ public class ClusterNodeInfo {
}
/** for testing purpose only, not to be changed at runtime! */
- public void setLeaseTime(long leaseTime) {
+ void setLeaseTime(long leaseTime) {
this.leaseTime = leaseTime;
}
+
+ /** for testing purpose only, not to be changed at runtime! */
+ void setLeaseUpdateInterval(long leaseUpdateInterval) {
+ this.leaseUpdateInterval = leaseUpdateInterval;
+ }
public long getLeaseTime() {
return leaseTime;
Modified: jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterInfoTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterInfoTest.java?rev=1702953&r1=1702952&r2=1702953&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterInfoTest.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterInfoTest.java Mon Sep 14 13:51:18 2015
@@ -61,7 +61,9 @@ public class ClusterInfoTest {
clock.waitUntil(clock.getTime() + ns1.getClusterInfo().getLeaseTime());
ns1.getClusterInfo().setLeaseTime(0);
+ ns1.getClusterInfo().setLeaseUpdateInterval(0);
ns2.getClusterInfo().setLeaseTime(0);
+ ns2.getClusterInfo().setLeaseUpdateInterval(0);
List<ClusterNodeInfoDocument> list = mem.query(
Collection.CLUSTER_NODES, "0", "a", Integer.MAX_VALUE);
@@ -113,8 +115,8 @@ public class ClusterInfoTest {
// current lease end
long leaseEnd = getLeaseEndTime(ns);
- // wait a bit, but not more than a third of the lease time
- clock.waitUntil(clock.getTime() + (ns.getClusterInfo().getLeaseTime() / 3) - 1000);
+ // wait a bit, 1sec less than leaseUpdateInterval (10sec-1sec by default)
+ clock.waitUntil(clock.getTime() + ClusterNodeInfo.DEFAULT_LEASE_UPDATE_INTERVAL_MILLIS - 1000);
// must not renew lease right now
ns.renewClusterIdLease();
Modified: jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterTest.java?rev=1702953&r1=1702952&r2=1702953&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterTest.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterTest.java Mon Sep 14 13:51:18 2015
@@ -127,6 +127,7 @@ public class ClusterTest {
c1 = ClusterNodeInfo.getInstance(store, "m1", null);
assertEquals(1, c1.getId());
c1.setLeaseTime(1);
+ c1.setLeaseUpdateInterval(0);
// this will quickly expire
c1.renewLease();
Thread.sleep(10);
Modified: jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java?rev=1702953&r1=1702952&r2=1702953&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java Mon Sep 14 13:51:18 2015
@@ -286,7 +286,7 @@ public class DocumentDiscoveryLiteServic
stopLastRevThread();
logger.info("crash: stopped lastrev thread, now setting least to end within 1 sec");
- boolean renewed = setLeaseTime(1000 /* 1 sec */);
+ boolean renewed = setLeaseTime(1000 /* 1 sec */, 10 /*10ms*/);
if (!renewed) {
logger.info("halt");
fail("did not renew clusterid lease");
@@ -311,8 +311,9 @@ public class DocumentDiscoveryLiteServic
* time so that the crash detection doesn't take a minute (as it would
* by default)
*/
- private boolean setLeaseTime(final int leaseTime) throws NoSuchFieldException {
+ private boolean setLeaseTime(final int leaseTime, final int leaseUpdateInterval) throws NoSuchFieldException {
ns.getClusterInfo().setLeaseTime(leaseTime);
+ ns.getClusterInfo().setLeaseUpdateInterval(leaseUpdateInterval);
PrivateAccessor.setField(ns.getClusterInfo(), "leaseEndTime", System.currentTimeMillis() + (leaseTime / 3) - 10 /* 10ms safety margin */);
boolean renewed = ns.renewClusterIdLease();
return renewed;
@@ -405,8 +406,9 @@ public class DocumentDiscoveryLiteServic
ns.merge(root, EmptyHook.INSTANCE, CommitInfo.EMPTY);
}
- public void setLeastTimeout(long timeoutInMs) throws NoSuchFieldException {
+ public void setLeastTimeout(long timeoutInMs, long updateIntervalInMs) throws NoSuchFieldException {
ns.getClusterInfo().setLeaseTime(timeoutInMs);
+ ns.getClusterInfo().setLeaseUpdateInterval(updateIntervalInMs);
PrivateAccessor.setField(ns.getClusterInfo(), "leaseEndTime", System.currentTimeMillis() - 1000);
}
@@ -948,7 +950,7 @@ public class DocumentDiscoveryLiteServic
// so: stop testing at this point:
return;
}
- newInstance.setLeastTimeout(5000);
+ newInstance.setLeastTimeout(5000, 1000);
newInstance.startSimulatingWrites(500);
logger.info("Case 0: created instance: " + newInstance.ns.getClusterId());
if (newInstance.ns.getClusterId() != cid) {
@@ -982,7 +984,7 @@ public class DocumentDiscoveryLiteServic
// so: stop testing at this point:
return;
}
- newInstance.setLeastTimeout(5000);
+ newInstance.setLeastTimeout(5000, 1000);
newInstance.startSimulatingWrites(500);
logger.info("Case 1: created instance: " + newInstance.ns.getClusterId());
instances.add(newInstance);