You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by st...@apache.org on 2015/09/15 12:04:48 UTC
svn commit: r1703137 -
/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
Author: stefanegli
Date: Tue Sep 15 10:04:47 2015
New Revision: 1703137
URL: http://svn.apache.org/r1703137
Log:
OAK-3399 : 'one last chance before lease failure': do a 5 x 1sec retry loop when entering the leaseFailureMargin of 20sec, to help in situations where e.g. the process was paused and is just waking up - including the lease update thread, which would then do a lease update right away. In all other situations, the only effect of this 5sec retry loop is to reduce the effective leaseFailureMargin to 15sec - which is still fine, as the clock accuracy is 4sec - so an 11sec hard margin is left (that is, after 90sec of missing lease updates).
Modified:
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java?rev=1703137&r1=1703136&r2=1703137&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java Tue Sep 15 10:04:47 2015
@@ -158,9 +158,16 @@ public class ClusterNodeInfo {
/** OAK-3398 : default update interval 10sec **/
public static final int DEFAULT_LEASE_UPDATE_INTERVAL_MILLIS = 1000 * 10;
- /** OAK-3398 : default failure margin 20sec before actual lease timeout **/
+ /** OAK-3398 : default failure margin 20sec before actual lease timeout
+ * (note that OAK-3399 / MAX_RETRY_SLEEPS_BEFORE_LEASE_FAILURE eats
+ * off another few seconds from this margin, by default 5sec,
+ * so the actual default failure-margin is down to 15sec - and that is high-noon!)
+ */
public static final int DEFAULT_LEASE_FAILURE_MARGIN_MILLIS = 1000 * 20;
+ /** OAK-3399 : max number of times we're doing a 1sec retry loop just before declaring lease failure **/
+ private static final int MAX_RETRY_SLEEPS_BEFORE_LEASE_FAILURE = 5;
+
/**
* The number of milliseconds for a lease (2 minute by default, and
* initially).
@@ -478,15 +485,42 @@ public class ClusterNodeInfo {
LOG.error(LEASE_CHECK_FAILED_MSG);
throw new AssertionError(LEASE_CHECK_FAILED_MSG);
}
- // synchronized could have delayed the 'now', so
- // set it again..
- now = getCurrentTime();
- if (now < (leaseEndTime - leaseFailureMargin)) {
- // if lease is OK here, then there was a race
- // between performLeaseCheck and renewLease()
- // where the winner was: renewLease().
- // so: luckily we can continue here
- return;
+ for(int i=0; i<MAX_RETRY_SLEEPS_BEFORE_LEASE_FAILURE; i++) {
+ now = getCurrentTime();
+ if (now < (leaseEndTime - leaseFailureMargin)) {
+ // if lease is OK here, then there was a race
+ // between performLeaseCheck and renewLease()
+ // where the winner was: renewLease().
+ // so: luckily we can continue here
+ return;
+ }
+ // OAK-3399 : in case of running into the leaseFailureMargin
+ // (shortly, 20sec, before the lease times out), we're now doing
+ // a short retry loop of 1sec sleeps (default 5x1sec=5sec),
+ // to give this instance 'one last chance' before we have to
+ // declare the lease as failed.
+ // This sort of retry loop would allow situations such as
+ // when running a single-node cluster and interrupting/pausing
+ // the process temporarily: in this case when waking up, the
+ // lease might momentarily be timed out, but the lease would
+ // still be 'updateable' and that would happen pretty soon
+ // after waking up. So in that case, doing these retry-sleeps
+ // would help.
+ // in most other cases where the local instance is not doing
+ // lease updates due to 'GC-death' or 'lease-thread-crashed'
+ // or the like, it would not help. But it would also not hurt
+ // as the margin is 20sec and we're just reducing it by 5sec
+ // (in the un-paused case)
+ try {
+ LOG.info("performLeaseCheck: lease within "+leaseFailureMargin+
+ "ms of failing ("+(leaseEndTime-now)+" ms precisely) - "
+ + "waiting 1sec to retry (up to another "+
+ (MAX_RETRY_SLEEPS_BEFORE_LEASE_FAILURE-1-i)+" times)...");
+ Thread.sleep(1000);
+ } catch (InterruptedException e) {
+ LOG.warn("performLeaseCheck: got interrupted - giving up: "+e, e);
+ break;
+ }
}
leaseCheckFailed = true; // make sure only one thread 'wins', ie goes any further
}