You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2017/06/13 11:06:28 UTC

lucene-solr:jira/solr-10704: SOLR-10704 Fix issues from review.

Repository: lucene-solr
Updated Branches:
  refs/heads/jira/solr-10704 1f04fd59e -> 13e6f6f4e


SOLR-10704 Fix issues from review.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/13e6f6f4
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/13e6f6f4
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/13e6f6f4

Branch: refs/heads/jira/solr-10704
Commit: 13e6f6f4e768e89bb1235b34c8ff81dd289a2192
Parents: 1f04fd5
Author: Andrzej Bialecki <ab...@apache.org>
Authored: Tue Jun 13 13:06:13 2017 +0200
Committer: Andrzej Bialecki <ab...@apache.org>
Committed: Tue Jun 13 13:06:13 2017 +0200

----------------------------------------------------------------------
 .../org/apache/solr/cloud/ReplaceNodeCmd.java   | 36 +++++++++++++-------
 1 file changed, 24 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/13e6f6f4/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java b/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java
index c046d75..317799e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java
@@ -63,6 +63,7 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
     String source = message.getStr("source");
     String target = message.getStr("target");
     String async = message.getStr("async");
+    int timeout = message.getInt("timeout", 10 * 60); // 10 minutes
     boolean parallel = message.getBool("parallel", false);
     ClusterState clusterState = zkStateReader.getClusterState();
 
@@ -73,14 +74,16 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Target Node: " + target + " is not live");
     }
     List<ZkNodeProps> sourceReplicas = getReplicasOfNode(source, clusterState);
-    // how many leaders are we moving?
+    // how many leaders are we moving? for these replicas we have to make sure that either:
+    // * another existing replica can become a leader, or
+    // * we wait until the newly created replica completes recovery (and can become the new leader)
     int numLeaders = 0;
     for (ZkNodeProps props : sourceReplicas) {
       if (props.getBool(ZkStateReader.LEADER_PROP, false)) {
         numLeaders++;
       }
     }
-    // map of shardId_replicaId to watchers
+    // map of collectionName_coreNodeName to watchers
     Map<String, RecoveryWatcher> watchers = new HashMap<>();
     List<ZkNodeProps> createdReplicas = new ArrayList<>();
 
@@ -91,13 +94,14 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
 
     for (ZkNodeProps sourceReplica : sourceReplicas) {
       if (sourceReplica.getBool(ZkStateReader.LEADER_PROP, false)) {
-        String shardId = sourceReplica.getStr(SHARD_ID_PROP);
-        String replicaId = sourceReplica.getStr(ZkStateReader.REPLICA_PROP);
-        String collectionId = sourceReplica.getStr(COLLECTION_PROP);
-        String key = shardId + "_" + replicaId;
-        RecoveryWatcher watcher = new RecoveryWatcher(collectionId, shardId, replicaId, replicasToRecover);
+        String shardName = sourceReplica.getStr(SHARD_ID_PROP);
+        String coreNodeName = sourceReplica.getStr(ZkStateReader.CORE_NODE_NAME_PROP);
+        String replicaName = sourceReplica.getStr(ZkStateReader.REPLICA_PROP);
+        String collectionName = sourceReplica.getStr(COLLECTION_PROP);
+        String key = collectionName + "_" + coreNodeName;
+        RecoveryWatcher watcher = new RecoveryWatcher(collectionName, shardName, replicaName, replicasToRecover);
         watchers.put(key, watcher);
-        zkStateReader.registerCollectionStateWatcher(collectionId, watcher);
+        zkStateReader.registerCollectionStateWatcher(collectionName, watcher);
       }
       NamedList nl = new NamedList();
       log.info("Going to create replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
@@ -129,7 +133,7 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
     }
 
     log.debug("Waiting for replicas to be added");
-    if (!countDownLatch.await(5, TimeUnit.MINUTES)) {
+    if (!countDownLatch.await(timeout, TimeUnit.SECONDS)) {
       log.info("Timed out waiting for replicas to be added");
       anyOneFailed.set(true);
     } else {
@@ -138,7 +142,7 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
 
     // now wait for leader replicas to recover
     log.debug("Waiting for " + numLeaders + " leader replicas to recover");
-    if (!replicasToRecover.await(5, TimeUnit.MINUTES)) {
+    if (!replicasToRecover.await(timeout, TimeUnit.SECONDS)) {
       log.info("Timed out waiting for " + replicasToRecover.getCount() + " leader replicas to recover");
       anyOneFailed.set(true);
     } else {
@@ -172,6 +176,7 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
         }
       }
       cleanupLatch.await(5, TimeUnit.MINUTES);
+      return;
     }
 
 
@@ -191,6 +196,7 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
                 COLLECTION_PROP, e.getKey(),
                 SHARD_ID_PROP, slice.getName(),
                 ZkStateReader.CORE_NAME_PROP, replica.getCoreName(),
+                ZkStateReader.CORE_NODE_NAME_PROP, replica.getStr(ZkStateReader.CORE_NODE_NAME_PROP),
                 ZkStateReader.REPLICA_PROP, replica.getName(),
                 ZkStateReader.REPLICA_TYPE, replica.getType().name(),
                 ZkStateReader.LEADER_PROP, String.valueOf(replica.equals(slice.getLeader())),
@@ -229,9 +235,15 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
         return true;
       }
       for (Replica replica : slice.getReplicas()) {
-        if (!replica.getName().equals(replicaId)) { // another replica exists - check that it's active
+        // check if another replica exists - doesn't have to be the one we're moving
+        // as long as it's active and can become a leader, in which case we don't have to wait
+        // for recovery of specifically the one that we've just added
+        if (!replica.getName().equals(replicaId)) {
+          if (replica.getType().equals(Replica.Type.PULL)) { // not eligible for leader election
+            continue;
+          }
           // check its state
-          if (replica.getState().equals(Replica.State.ACTIVE)) { // recovered - stop waiting
+          if (replica.isActive(liveNodes)) { // recovered - stop waiting
             countDownLatch.countDown();
             return true;
           }