You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by st...@apache.org on 2015/09/14 15:51:18 UTC

svn commit: r1702953 - in /jackrabbit/oak/trunk/oak-core/src: main/java/org/apache/jackrabbit/oak/plugins/document/ test/java/org/apache/jackrabbit/oak/plugins/document/

Author: stefanegli
Date: Mon Sep 14 13:51:18 2015
New Revision: 1702953

URL: http://svn.apache.org/r1702953
Log:
OAK-3398 : increase lease timeout to 120sec from 60sec - plus update it every 10 sec not only every 20sec - plus make the lease failure margin explicit and set it to 20sec as was previously implicitly the case

Modified:
    jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
    jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterInfoTest.java
    jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterTest.java
    jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java

Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java?rev=1702953&r1=1702952&r2=1702953&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java Mon Sep 14 13:51:18 2015
@@ -155,15 +155,42 @@ public class ClusterNodeInfo {
      */
     private static Clock clock = Clock.SIMPLE;
 
-
-    public static final int DEFAULT_LEASE_DURATION_MILLIS = 1000 * 60;
+    /** OAK-3398 : default lease duration 120sec **/
+    public static final int DEFAULT_LEASE_DURATION_MILLIS = 1000 * 120;
+    
+    /** OAK-3398 : default update interval 10sec **/
+    public static final int DEFAULT_LEASE_UPDATE_INTERVAL_MILLIS = 1000 * 10;
+    
+    /** OAK-3398 : default failure margin 20sec before actual lease timeout **/
+    public static final int DEFAULT_LEASE_FAILURE_MARGIN_MILLIS = 1000 * 20;
 
     /**
-     * The number of milliseconds for a lease (1 minute by default, and
+     * The number of milliseconds for a lease (2 minutes by default, and
      * initially).
      */
     private long leaseTime = DEFAULT_LEASE_DURATION_MILLIS;
-
+    
+    /**
+     * The number of milliseconds after which a lease will be updated
+     * (should not be every second as that would increase the number of
+     * writes towards the DocumentStore considerably - but it should also
+     * not be too long as that would eat into the lease duration on average).
+     */
+    private long leaseUpdateInterval = DEFAULT_LEASE_UPDATE_INTERVAL_MILLIS;
+
+    /**
+     * The number of milliseconds that a lease must still be valid
+     * before prematurely declaring it as failed. The default is 20sec.
+     * The idea of declaring a lease as failed before it actually failed
+     * is to avoid a race condition where the local instance assumes
+     * things are all fine but another instance in the cluster will
+     * 'in the same moment' declare it as failed. The lease should be 
+     * checked every second and updated after 10sec, so it should always
+     * have a validity of at least 110sec - if that's down to this margin
+     * of 20sec then things are not good and we have to give up.
+     */
+    private long leaseFailureMargin = DEFAULT_LEASE_FAILURE_MARGIN_MILLIS;
+    
     /**
      * The assigned cluster id.
      */
@@ -425,7 +452,9 @@ public class ClusterNodeInfo {
             throw new AssertionError(LEASE_CHECK_FAILED_MSG);
         }
         long now = getCurrentTime();
-        if (now < (leaseEndTime - leaseTime / 3)) { // OAK-3238 : put the barrier 1/3 before lease end
+        // OAK-3238 put the barrier 1/3 of 60sec=20sec before the end
+        // OAK-3398 keeps this the same but uses an explicit leaseFailureMargin for this
+        if (now < (leaseEndTime - leaseFailureMargin)) {
             // then all is good
             return;
         }
@@ -437,7 +466,7 @@ public class ClusterNodeInfo {
             // synchronized could have delayed the 'now', so
             // set it again..
             now = getCurrentTime();
-            if (now < (leaseEndTime - leaseTime / 3)) { // OAK-3238 : put the barrier 1/3 before lease end
+            if (now < (leaseEndTime - leaseFailureMargin)) {
                 // if lease is OK here, then there was a race
                 // between performLeaseCheck and renewLease()
                 // where the winner was: renewLease().
@@ -457,9 +486,10 @@ public class ClusterNodeInfo {
         
         final String restarterErrorMsg = LEASE_CHECK_FAILED_MSG+" (leaseEndTime: "+leaseEndTime+
                 ", leaseTime: "+leaseTime+
-                ", lease check end time (1/3 before lease end): "+(leaseEndTime - leaseTime / 3)+
+                ", leaseFailureMargin: "+leaseFailureMargin+
+                ", lease check end time (leaseEndTime-leaseFailureMargin): "+(leaseEndTime - leaseFailureMargin)+
                 ", now: "+now+
-                ", remaining: "+((leaseEndTime - leaseTime / 3) - now)+
+                ", remaining: "+((leaseEndTime - leaseFailureMargin) - now)+
                 ") Need to stop oak-core/DocumentNodeStoreService.";
         LOG.error(restarterErrorMsg);
         
@@ -499,14 +529,14 @@ public class ClusterNodeInfo {
     /**
      * Renew the cluster id lease. This method needs to be called once in a while,
      * to ensure the same cluster id is not re-used by a different instance.
-     * The lease is only renewed when a third of the lease time passed. That is,
-     * with a lease time of 60 seconds, the lease is renewed every 20 seconds.
+     * The lease is only renewed once leaseUpdateInterval millis have passed
+     * since the last lease update - the default being every 10 sec.
      *
      * @return {@code true} if the lease was renewed; {@code false} otherwise.
      */
     public boolean renewLease() {
         long now = getCurrentTime();
-        if (now + 2 * leaseTime / 3 < leaseEndTime) {
+        if (now < leaseEndTime - leaseTime + leaseUpdateInterval) {
             return false;
         }
         synchronized(this) {
@@ -534,9 +564,14 @@ public class ClusterNodeInfo {
     }
 
     /** for testing purpose only, not to be changed at runtime! */
-    public void setLeaseTime(long leaseTime) {
+    void setLeaseTime(long leaseTime) {
         this.leaseTime = leaseTime;
     }
+    
+    /** for testing purpose only, not to be changed at runtime! */
+    void setLeaseUpdateInterval(long leaseUpdateInterval) {
+        this.leaseUpdateInterval = leaseUpdateInterval;
+    }
 
     public long getLeaseTime() {
         return leaseTime;

Modified: jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterInfoTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterInfoTest.java?rev=1702953&r1=1702952&r2=1702953&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterInfoTest.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterInfoTest.java Mon Sep 14 13:51:18 2015
@@ -61,7 +61,9 @@ public class ClusterInfoTest {
         clock.waitUntil(clock.getTime() + ns1.getClusterInfo().getLeaseTime());
 
         ns1.getClusterInfo().setLeaseTime(0);
+        ns1.getClusterInfo().setLeaseUpdateInterval(0);
         ns2.getClusterInfo().setLeaseTime(0);
+        ns2.getClusterInfo().setLeaseUpdateInterval(0);
 
         List<ClusterNodeInfoDocument> list = mem.query(
                 Collection.CLUSTER_NODES, "0", "a", Integer.MAX_VALUE);
@@ -113,8 +115,8 @@ public class ClusterInfoTest {
         // current lease end
         long leaseEnd = getLeaseEndTime(ns);
 
-        // wait a bit, but not more than a third of the lease time
-        clock.waitUntil(clock.getTime() + (ns.getClusterInfo().getLeaseTime() / 3) - 1000);
+        // wait a bit, 1sec less than the lease update interval (10sec-1sec by default)
+        clock.waitUntil(clock.getTime() + ClusterNodeInfo.DEFAULT_LEASE_UPDATE_INTERVAL_MILLIS - 1000);
 
         // must not renew lease right now
         ns.renewClusterIdLease();

Modified: jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterTest.java?rev=1702953&r1=1702952&r2=1702953&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterTest.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/ClusterTest.java Mon Sep 14 13:51:18 2015
@@ -127,6 +127,7 @@ public class ClusterTest {
         c1 = ClusterNodeInfo.getInstance(store, "m1", null);
         assertEquals(1, c1.getId());
         c1.setLeaseTime(1);
+        c1.setLeaseUpdateInterval(0);
         // this will quickly expire
         c1.renewLease();
         Thread.sleep(10);

Modified: jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java?rev=1702953&r1=1702952&r2=1702953&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java Mon Sep 14 13:51:18 2015
@@ -286,7 +286,7 @@ public class DocumentDiscoveryLiteServic
             stopLastRevThread();
             logger.info("crash: stopped lastrev thread, now setting least to end within 1 sec");
 
-            boolean renewed = setLeaseTime(1000 /* 1 sec */);
+            boolean renewed = setLeaseTime(1000 /* 1 sec */, 10 /*10ms*/);
             if (!renewed) {
                 logger.info("halt");
                 fail("did not renew clusterid lease");
@@ -311,8 +311,9 @@ public class DocumentDiscoveryLiteServic
          * time so that the crash detection doesn't take a minute (as it would
          * by default)
          */
-        private boolean setLeaseTime(final int leaseTime) throws NoSuchFieldException {
+        private boolean setLeaseTime(final int leaseTime, final int leaseUpdateInterval) throws NoSuchFieldException {
             ns.getClusterInfo().setLeaseTime(leaseTime);
+            ns.getClusterInfo().setLeaseUpdateInterval(leaseUpdateInterval);
             PrivateAccessor.setField(ns.getClusterInfo(), "leaseEndTime", System.currentTimeMillis() + (leaseTime / 3) - 10 /* 10ms safety margin */);
             boolean renewed = ns.renewClusterIdLease();
             return renewed;
@@ -405,8 +406,9 @@ public class DocumentDiscoveryLiteServic
             ns.merge(root, EmptyHook.INSTANCE, CommitInfo.EMPTY);
         }
 
-        public void setLeastTimeout(long timeoutInMs) throws NoSuchFieldException {
+        public void setLeastTimeout(long timeoutInMs, long updateIntervalInMs) throws NoSuchFieldException {
             ns.getClusterInfo().setLeaseTime(timeoutInMs);
+            ns.getClusterInfo().setLeaseUpdateInterval(updateIntervalInMs);
             PrivateAccessor.setField(ns.getClusterInfo(), "leaseEndTime", System.currentTimeMillis() - 1000);
         }
 
@@ -948,7 +950,7 @@ public class DocumentDiscoveryLiteServic
                             // so: stop testing at this point:
                             return;
                         }
-                        newInstance.setLeastTimeout(5000);
+                        newInstance.setLeastTimeout(5000, 1000);
                         newInstance.startSimulatingWrites(500);
                         logger.info("Case 0: created instance: " + newInstance.ns.getClusterId());
                         if (newInstance.ns.getClusterId() != cid) {
@@ -982,7 +984,7 @@ public class DocumentDiscoveryLiteServic
                             // so: stop testing at this point:
                             return;
                         }
-                        newInstance.setLeastTimeout(5000);
+                        newInstance.setLeastTimeout(5000, 1000);
                         newInstance.startSimulatingWrites(500);
                         logger.info("Case 1: created instance: " + newInstance.ns.getClusterId());
                         instances.add(newInstance);