You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by re...@apache.org on 2015/09/22 19:17:40 UTC

svn commit: r1704670 - in /jackrabbit/oak/trunk/oak-core/src: main/java/org/apache/jackrabbit/oak/plugins/document/ main/java/org/apache/jackrabbit/oak/plugins/document/mongo/ test/java/org/apache/jackrabbit/oak/plugins/document/

Author: reschke
Date: Tue Sep 22 17:17:37 2015
New Revision: 1704670

URL: http://svn.apache.org/viewvc?rev=1704670&view=rev
Log:
OAK-3435: LastRevRecovery: log failures to update ClusterInfo, keep track which node is running the recovery, improve logging

Modified:
    jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
    jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/LastRevRecoveryAgent.java
    jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/MissingLastRevSeeker.java
    jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoMissingLastRevSeeker.java
    jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java

Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java?rev=1704670&r1=1704669&r2=1704670&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java Tue Sep 22 17:17:37 2015
@@ -123,6 +123,11 @@ public class ClusterNodeInfo {
     }
 
     /**
+     * Flag indicating which cluster node is running the recovery.
+     */
+    public static final String REV_RECOVERY_BY = "recoveryBy";
+
+    /**
      * Additional info, such as the process id, for support.
      */
     private static final String INFO_KEY = "info";

Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/LastRevRecoveryAgent.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/LastRevRecoveryAgent.java?rev=1704670&r1=1704669&r2=1704670&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/LastRevRecoveryAgent.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/LastRevRecoveryAgent.java Tue Sep 22 17:17:37 2015
@@ -31,6 +31,7 @@ import java.util.Map;
 import java.util.concurrent.locks.ReentrantLock;
 
 import javax.annotation.CheckForNull;
+import javax.annotation.Nonnull;
 
 import com.google.common.base.Predicate;
 import com.google.common.collect.Lists;
@@ -279,7 +280,9 @@ public class LastRevRecoveryAgent {
      * @return the number of restored nodes
      */
     private int recoverCandidates(final int clusterId, final long startTime) {
-        boolean lockAcquired = missingLastRevUtil.acquireRecoveryLock(clusterId);
+
+        int myClusterId = nodeStore.getClusterId();
+        boolean lockAcquired = missingLastRevUtil.acquireRecoveryLock(clusterId, myClusterId);
 
         //TODO What if recovery is being performed for current clusterNode by some other node
         //should we halt the startup
@@ -290,7 +293,8 @@ public class LastRevRecoveryAgent {
         }
 
         Iterable<NodeDocument> suspects = missingLastRevUtil.getCandidates(startTime);
-        log.debug("Performing Last Revision recovery for cluster {}", clusterId);
+
+        log.info("Performing Last Revision Recovery for clusterNodeId {}", clusterId);
 
         try {
             return recover(suspects.iterator(), clusterId);
@@ -350,11 +354,12 @@ public class LastRevRecoveryAgent {
         return missingLastRevUtil.isRecoveryNeeded(nodeStore.getClock().getTime());
     }
 
-    public void performRecoveryIfNeeded(){
-        if(isRecoveryNeeded()){
+    public void performRecoveryIfNeeded() {
+        if (isRecoveryNeeded()) {
             List<Integer> clusterIds = getRecoveryCandidateNodes();
-            log.info("Starting last revision recovery for following clusterId {}", clusterIds);
-            for(int clusterId : clusterIds){
+            log.info("ClusterNodeId [{}] starting Last Revision Recovery for clusterNodeId(s) {}", nodeStore.getClusterId(),
+                    clusterIds);
+            for (int clusterId : clusterIds) {
                 recover(clusterId);
             }
         }
@@ -366,29 +371,34 @@ public class LastRevRecoveryAgent {
      * @return the recovery candidate nodes
      */
     public List<Integer> getRecoveryCandidateNodes() {
+
         Iterable<ClusterNodeInfoDocument> clusters = missingLastRevUtil.getAllClusters();
         List<Integer> candidateClusterNodes = Lists.newArrayList();
+        List<String> beingRecoveredRightNow = Lists.newArrayList();
 
         for (ClusterNodeInfoDocument nodeInfo : clusters) {
-            if (isRecoveryNeeded(nodeInfo)) {
-                candidateClusterNodes.add(Integer.valueOf(nodeInfo.getId()));
+            String id = nodeInfo.getId();
+            if (nodeInfo.isBeingRecovered()) {
+                Long recoveredBy = (Long) nodeInfo.get(ClusterNodeInfo.REV_RECOVERY_BY);
+                beingRecoveredRightNow.add(nodeInfo == null ? id : String.format("%s (by %d)", id, recoveredBy));
+            } else if (isRecoveryNeeded(nodeInfo)) {
+                candidateClusterNodes.add(Integer.valueOf(id));
             }
         }
 
+        if (!beingRecoveredRightNow.isEmpty()) {
+            log.info("Active cluster nodes already in the process of being recovered: " + beingRecoveredRightNow);
+        }
+
         return candidateClusterNodes;
     }
 
-    private boolean isRecoveryNeeded(ClusterNodeInfoDocument nodeInfo) {
-        if (nodeInfo != null) {
-            // Check if _lastRev recovery needed for this cluster node
-            // state is Active && currentTime past the leaseEnd time && recoveryLock not held by someone
-            if (nodeInfo.isActive()
-                    && nodeStore.getClock().getTime() > nodeInfo.getLeaseEndTime()
-                    && !nodeInfo.isBeingRecovered()) {
-                return true;
-            }
-        }
-        return false;
+    /**
+     * Check if _lastRev recovery needed for this cluster node state is Active
+     * && currentTime past the leaseEnd time && recoveryLock not held by someone
+     */
+    private boolean isRecoveryNeeded(@Nonnull ClusterNodeInfoDocument nodeInfo) {
+        return nodeInfo.isActive() && nodeStore.getClock().getTime() > nodeInfo.getLeaseEndTime() && !nodeInfo.isBeingRecovered();
     }
 
     private static class ClusterPredicate implements Predicate<Revision> {

Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/MissingLastRevSeeker.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/MissingLastRevSeeker.java?rev=1704670&r1=1704669&r2=1704670&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/MissingLastRevSeeker.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/MissingLastRevSeeker.java Tue Sep 22 17:17:37 2015
@@ -21,6 +21,8 @@ package org.apache.jackrabbit.oak.plugin
 
 import org.apache.jackrabbit.oak.plugins.document.ClusterNodeInfo.RecoverLockState;
 import org.apache.jackrabbit.oak.plugins.document.util.Utils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import com.google.common.base.Predicate;
 import com.google.common.collect.Iterables;
@@ -30,10 +32,14 @@ import static org.apache.jackrabbit.oak.
 import static org.apache.jackrabbit.oak.plugins.document.util.Utils.getSelectedDocuments;
 
 /**
- * Utils to retrieve _lastRev missing update candidates.
+ * Utilities to retrieve _lastRev missing update candidates.
  */
 public class MissingLastRevSeeker {
+
+    private static final Logger LOG = LoggerFactory.getLogger(MissingLastRevSeeker.class);
+
     protected final String ROOT_PATH = "/";
+
     private final DocumentStore store;
 
     public MissingLastRevSeeker(DocumentStore store) {
@@ -80,20 +86,35 @@ public class MissingLastRevSeeker {
         });
     }
 
-    public boolean acquireRecoveryLock(int clusterId){
-        //This approach has a race condition where two different cluster nodes
-        //can acquire the lock simultaneously.
-        UpdateOp update = new UpdateOp(Integer.toString(clusterId), true);
-        update.set(ClusterNodeInfo.REV_RECOVERY_LOCK, RecoverLockState.ACQUIRED.name());
-        store.createOrUpdate(Collection.CLUSTER_NODES, update);
-        return true;
-    }
-
-    public void releaseRecoveryLock(int clusterId){
-        UpdateOp update = new UpdateOp(Integer.toString(clusterId), true);
-        update.set(ClusterNodeInfo.REV_RECOVERY_LOCK, null);
-        update.set(ClusterNodeInfo.STATE, null);
-        store.createOrUpdate(Collection.CLUSTER_NODES, update);
+    public boolean acquireRecoveryLock(int clusterId, int recoveredBy) {
+        try {
+            // This approach has a race condition where two different cluster
+            // nodes
+            // can acquire the lock simultaneously.
+            UpdateOp update = new UpdateOp(Integer.toString(clusterId), true);
+            update.set(ClusterNodeInfo.REV_RECOVERY_LOCK, RecoverLockState.ACQUIRED.name());
+            if (recoveredBy != 0) {
+                update.set(ClusterNodeInfo.REV_RECOVERY_BY, recoveredBy);
+            }
+            store.createOrUpdate(Collection.CLUSTER_NODES, update);
+            return true;
+        } catch (RuntimeException ex) {
+            LOG.error("Failed to acquire the recovery lock for clusterNodeId " + clusterId, ex);
+            throw (ex);
+        }
+    }
+
+    public void releaseRecoveryLock(int clusterId) {
+        try {
+            UpdateOp update = new UpdateOp(Integer.toString(clusterId), true);
+            update.set(ClusterNodeInfo.REV_RECOVERY_LOCK, null);
+            update.set(ClusterNodeInfo.REV_RECOVERY_BY, null);
+            update.set(ClusterNodeInfo.STATE, null);
+            store.createOrUpdate(Collection.CLUSTER_NODES, update);
+        } catch (RuntimeException ex) {
+            LOG.error("Failed to release the recovery lock for clusterNodeId " + clusterId, ex);
+            throw (ex);
+        }
     }
 
     public NodeDocument getRoot() {

Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoMissingLastRevSeeker.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoMissingLastRevSeeker.java?rev=1704670&r1=1704669&r2=1704670&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoMissingLastRevSeeker.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoMissingLastRevSeeker.java Tue Sep 22 17:17:37 2015
@@ -73,7 +73,7 @@ public class MongoMissingLastRevSeeker e
     }
 
     @Override
-    public boolean acquireRecoveryLock(int clusterId) {
+    public boolean acquireRecoveryLock(int clusterId, int recoveredBy) {
         QueryBuilder query =
                 start().and(
                         start(Document.ID).is(Integer.toString(clusterId)).get(),
@@ -85,6 +85,9 @@ public class MongoMissingLastRevSeeker e
 
         BasicDBObject setUpdates = new BasicDBObject();
         setUpdates.append(ClusterNodeInfo.REV_RECOVERY_LOCK, RecoverLockState.ACQUIRED.name());
+        if (recoveredBy != 0) {
+            setUpdates.append(ClusterNodeInfo.REV_RECOVERY_BY, recoveredBy);
+        }
 
         BasicDBObject update = new BasicDBObject();
         update.append("$set", setUpdates);

Modified: jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java?rev=1704670&r1=1704669&r2=1704670&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java Tue Sep 22 17:17:37 2015
@@ -780,14 +780,14 @@ public class DocumentDiscoveryLiteServic
         assertNotNull(missingLastRevUtil);
         MissingLastRevSeeker mockedLongduringMissingLastRevUtil = mock(MissingLastRevSeeker.class, delegatesTo(missingLastRevUtil));
         final Semaphore waitBeforeLocking = new Semaphore(0);
-        when(mockedLongduringMissingLastRevUtil.acquireRecoveryLock(anyInt())).then(new Answer<Boolean>() {
+        when(mockedLongduringMissingLastRevUtil.acquireRecoveryLock(anyInt(), anyInt())).then(new Answer<Boolean>() {
 
             @Override
             public Boolean answer(InvocationOnMock invocation) throws Throwable {
                 logger.info("going to waitBeforeLocking");
                 waitBeforeLocking.acquire();
                 logger.info("done with waitBeforeLocking");
-                return missingLastRevUtil.acquireRecoveryLock((Integer) invocation.getArguments()[0]);
+                return missingLastRevUtil.acquireRecoveryLock((Integer) invocation.getArguments()[0], (Integer) invocation.getArguments()[1]);
             }
         });
         PrivateAccessor.setField(s1.ns.getLastRevRecoveryAgent(), "missingLastRevUtil", mockedLongduringMissingLastRevUtil);