You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by re...@apache.org on 2015/09/22 19:17:40 UTC
svn commit: r1704670 - in /jackrabbit/oak/trunk/oak-core/src:
main/java/org/apache/jackrabbit/oak/plugins/document/
main/java/org/apache/jackrabbit/oak/plugins/document/mongo/
test/java/org/apache/jackrabbit/oak/plugins/document/
Author: reschke
Date: Tue Sep 22 17:17:37 2015
New Revision: 1704670
URL: http://svn.apache.org/viewvc?rev=1704670&view=rev
Log:
OAK-3435: LastRevRecovery: log failures to update ClusterInfo, keep track which node is running the recovery, improve logging
Modified:
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/LastRevRecoveryAgent.java
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/MissingLastRevSeeker.java
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoMissingLastRevSeeker.java
jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java
Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java?rev=1704670&r1=1704669&r2=1704670&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/ClusterNodeInfo.java Tue Sep 22 17:17:37 2015
@@ -123,6 +123,11 @@ public class ClusterNodeInfo {
}
/**
+ * Flag indicating which cluster node is running the recovery.
+ */
+ public static final String REV_RECOVERY_BY = "recoveryBy";
+
+ /**
* Additional info, such as the process id, for support.
*/
private static final String INFO_KEY = "info";
Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/LastRevRecoveryAgent.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/LastRevRecoveryAgent.java?rev=1704670&r1=1704669&r2=1704670&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/LastRevRecoveryAgent.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/LastRevRecoveryAgent.java Tue Sep 22 17:17:37 2015
@@ -31,6 +31,7 @@ import java.util.Map;
import java.util.concurrent.locks.ReentrantLock;
import javax.annotation.CheckForNull;
+import javax.annotation.Nonnull;
import com.google.common.base.Predicate;
import com.google.common.collect.Lists;
@@ -279,7 +280,9 @@ public class LastRevRecoveryAgent {
* @return the number of restored nodes
*/
private int recoverCandidates(final int clusterId, final long startTime) {
- boolean lockAcquired = missingLastRevUtil.acquireRecoveryLock(clusterId);
+
+ int myClusterId = nodeStore.getClusterId();
+ boolean lockAcquired = missingLastRevUtil.acquireRecoveryLock(clusterId, myClusterId);
//TODO What if recovery is being performed for current clusterNode by some other node
//should we halt the startup
@@ -290,7 +293,8 @@ public class LastRevRecoveryAgent {
}
Iterable<NodeDocument> suspects = missingLastRevUtil.getCandidates(startTime);
- log.debug("Performing Last Revision recovery for cluster {}", clusterId);
+
+ log.info("Performing Last Revision Recovery for clusterNodeId {}", clusterId);
try {
return recover(suspects.iterator(), clusterId);
@@ -350,11 +354,12 @@ public class LastRevRecoveryAgent {
return missingLastRevUtil.isRecoveryNeeded(nodeStore.getClock().getTime());
}
- public void performRecoveryIfNeeded(){
- if(isRecoveryNeeded()){
+ public void performRecoveryIfNeeded() {
+ if (isRecoveryNeeded()) {
List<Integer> clusterIds = getRecoveryCandidateNodes();
- log.info("Starting last revision recovery for following clusterId {}", clusterIds);
- for(int clusterId : clusterIds){
+ log.info("ClusterNodeId [{}] starting Last Revision Recovery for clusterNodeId(s) {}", nodeStore.getClusterId(),
+ clusterIds);
+ for (int clusterId : clusterIds) {
recover(clusterId);
}
}
@@ -366,29 +371,34 @@ public class LastRevRecoveryAgent {
* @return the recovery candidate nodes
*/
public List<Integer> getRecoveryCandidateNodes() {
+
Iterable<ClusterNodeInfoDocument> clusters = missingLastRevUtil.getAllClusters();
List<Integer> candidateClusterNodes = Lists.newArrayList();
+ List<String> beingRecoveredRightNow = Lists.newArrayList();
for (ClusterNodeInfoDocument nodeInfo : clusters) {
- if (isRecoveryNeeded(nodeInfo)) {
- candidateClusterNodes.add(Integer.valueOf(nodeInfo.getId()));
+ String id = nodeInfo.getId();
+ if (nodeInfo.isBeingRecovered()) {
+ Long recoveredBy = (Long) nodeInfo.get(ClusterNodeInfo.REV_RECOVERY_BY);
+ beingRecoveredRightNow.add(nodeInfo == null ? id : String.format("%s (by %d)", id, recoveredBy));
+ } else if (isRecoveryNeeded(nodeInfo)) {
+ candidateClusterNodes.add(Integer.valueOf(id));
}
}
+ if (!beingRecoveredRightNow.isEmpty()) {
+ log.info("Active cluster nodes already in the process of being recovered: " + beingRecoveredRightNow);
+ }
+
return candidateClusterNodes;
}
- private boolean isRecoveryNeeded(ClusterNodeInfoDocument nodeInfo) {
- if (nodeInfo != null) {
- // Check if _lastRev recovery needed for this cluster node
- // state is Active && currentTime past the leaseEnd time && recoveryLock not held by someone
- if (nodeInfo.isActive()
- && nodeStore.getClock().getTime() > nodeInfo.getLeaseEndTime()
- && !nodeInfo.isBeingRecovered()) {
- return true;
- }
- }
- return false;
+ /**
+ * Check if _lastRev recovery needed for this cluster node state is Active
+ * && currentTime past the leaseEnd time && recoveryLock not held by someone
+ */
+ private boolean isRecoveryNeeded(@Nonnull ClusterNodeInfoDocument nodeInfo) {
+ return nodeInfo.isActive() && nodeStore.getClock().getTime() > nodeInfo.getLeaseEndTime() && !nodeInfo.isBeingRecovered();
}
private static class ClusterPredicate implements Predicate<Revision> {
Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/MissingLastRevSeeker.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/MissingLastRevSeeker.java?rev=1704670&r1=1704669&r2=1704670&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/MissingLastRevSeeker.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/MissingLastRevSeeker.java Tue Sep 22 17:17:37 2015
@@ -21,6 +21,8 @@ package org.apache.jackrabbit.oak.plugin
import org.apache.jackrabbit.oak.plugins.document.ClusterNodeInfo.RecoverLockState;
import org.apache.jackrabbit.oak.plugins.document.util.Utils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
@@ -30,10 +32,14 @@ import static org.apache.jackrabbit.oak.
import static org.apache.jackrabbit.oak.plugins.document.util.Utils.getSelectedDocuments;
/**
- * Utils to retrieve _lastRev missing update candidates.
+ * Utilities to retrieve _lastRev missing update candidates.
*/
public class MissingLastRevSeeker {
+
+ private static final Logger LOG = LoggerFactory.getLogger(MissingLastRevSeeker.class);
+
protected final String ROOT_PATH = "/";
+
private final DocumentStore store;
public MissingLastRevSeeker(DocumentStore store) {
@@ -80,20 +86,35 @@ public class MissingLastRevSeeker {
});
}
- public boolean acquireRecoveryLock(int clusterId){
- //This approach has a race condition where two different cluster nodes
- //can acquire the lock simultaneously.
- UpdateOp update = new UpdateOp(Integer.toString(clusterId), true);
- update.set(ClusterNodeInfo.REV_RECOVERY_LOCK, RecoverLockState.ACQUIRED.name());
- store.createOrUpdate(Collection.CLUSTER_NODES, update);
- return true;
- }
-
- public void releaseRecoveryLock(int clusterId){
- UpdateOp update = new UpdateOp(Integer.toString(clusterId), true);
- update.set(ClusterNodeInfo.REV_RECOVERY_LOCK, null);
- update.set(ClusterNodeInfo.STATE, null);
- store.createOrUpdate(Collection.CLUSTER_NODES, update);
+ public boolean acquireRecoveryLock(int clusterId, int recoveredBy) {
+ try {
+ // This approach has a race condition where two different cluster
+ // nodes
+ // can acquire the lock simultaneously.
+ UpdateOp update = new UpdateOp(Integer.toString(clusterId), true);
+ update.set(ClusterNodeInfo.REV_RECOVERY_LOCK, RecoverLockState.ACQUIRED.name());
+ if (recoveredBy != 0) {
+ update.set(ClusterNodeInfo.REV_RECOVERY_BY, recoveredBy);
+ }
+ store.createOrUpdate(Collection.CLUSTER_NODES, update);
+ return true;
+ } catch (RuntimeException ex) {
+ LOG.error("Failed to acquire the recovery lock for clusterNodeId " + clusterId, ex);
+ throw (ex);
+ }
+ }
+
+ public void releaseRecoveryLock(int clusterId) {
+ try {
+ UpdateOp update = new UpdateOp(Integer.toString(clusterId), true);
+ update.set(ClusterNodeInfo.REV_RECOVERY_LOCK, null);
+ update.set(ClusterNodeInfo.REV_RECOVERY_BY, null);
+ update.set(ClusterNodeInfo.STATE, null);
+ store.createOrUpdate(Collection.CLUSTER_NODES, update);
+ } catch (RuntimeException ex) {
+ LOG.error("Failed to release the recovery lock for clusterNodeId " + clusterId, ex);
+ throw (ex);
+ }
}
public NodeDocument getRoot() {
Modified: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoMissingLastRevSeeker.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoMissingLastRevSeeker.java?rev=1704670&r1=1704669&r2=1704670&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoMissingLastRevSeeker.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoMissingLastRevSeeker.java Tue Sep 22 17:17:37 2015
@@ -73,7 +73,7 @@ public class MongoMissingLastRevSeeker e
}
@Override
- public boolean acquireRecoveryLock(int clusterId) {
+ public boolean acquireRecoveryLock(int clusterId, int recoveredBy) {
QueryBuilder query =
start().and(
start(Document.ID).is(Integer.toString(clusterId)).get(),
@@ -85,6 +85,9 @@ public class MongoMissingLastRevSeeker e
BasicDBObject setUpdates = new BasicDBObject();
setUpdates.append(ClusterNodeInfo.REV_RECOVERY_LOCK, RecoverLockState.ACQUIRED.name());
+ if (recoveredBy != 0) {
+ setUpdates.append(ClusterNodeInfo.REV_RECOVERY_BY, recoveredBy);
+ }
BasicDBObject update = new BasicDBObject();
update.append("$set", setUpdates);
Modified: jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java?rev=1704670&r1=1704669&r2=1704670&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java (original)
+++ jackrabbit/oak/trunk/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/document/DocumentDiscoveryLiteServiceTest.java Tue Sep 22 17:17:37 2015
@@ -780,14 +780,14 @@ public class DocumentDiscoveryLiteServic
assertNotNull(missingLastRevUtil);
MissingLastRevSeeker mockedLongduringMissingLastRevUtil = mock(MissingLastRevSeeker.class, delegatesTo(missingLastRevUtil));
final Semaphore waitBeforeLocking = new Semaphore(0);
- when(mockedLongduringMissingLastRevUtil.acquireRecoveryLock(anyInt())).then(new Answer<Boolean>() {
+ when(mockedLongduringMissingLastRevUtil.acquireRecoveryLock(anyInt(), anyInt())).then(new Answer<Boolean>() {
@Override
public Boolean answer(InvocationOnMock invocation) throws Throwable {
logger.info("going to waitBeforeLocking");
waitBeforeLocking.acquire();
logger.info("done with waitBeforeLocking");
- return missingLastRevUtil.acquireRecoveryLock((Integer) invocation.getArguments()[0]);
+ return missingLastRevUtil.acquireRecoveryLock((Integer) invocation.getArguments()[0], (Integer) invocation.getArguments()[1]);
}
});
PrivateAccessor.setField(s1.ns.getLastRevRecoveryAgent(), "missingLastRevUtil", mockedLongduringMissingLastRevUtil);