You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2016/06/30 09:52:48 UTC

lucene-solr:master: SOLR-9264: Optimize ZkController.publishAndWaitForDownStates to not read all collection states and watch relevant collections instead

Repository: lucene-solr
Updated Branches:
  refs/heads/master 1dc7480bc -> 015e0fc1c


SOLR-9264: Optimize ZkController.publishAndWaitForDownStates to not read all collection states and watch relevant collections instead


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/015e0fc1
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/015e0fc1
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/015e0fc1

Branch: refs/heads/master
Commit: 015e0fc1cf1d581c9657cd8f5588062c02588793
Parents: 1dc7480
Author: Shalin Shekhar Mangar <sh...@apache.org>
Authored: Thu Jun 30 15:22:57 2016 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Thu Jun 30 15:22:57 2016 +0530

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  3 +
 .../org/apache/solr/cloud/ZkController.java     | 68 ++++++++------------
 2 files changed, 30 insertions(+), 41 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/015e0fc1/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 94b07a3..59f411f 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -106,6 +106,9 @@ Optimizations
 
 * SOLR-9219: Make hdfs blockcache read buffer sizes configurable and improve cache concurrency. (Mark Miller)
 
+* SOLR-9264: Optimize ZkController.publishAndWaitForDownStates to not read all collection states and
+  watch relevant collections instead. (Hrishikesh Gadre, shalin)
+
 Other Changes
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/015e0fc1/solr/core/src/java/org/apache/solr/cloud/ZkController.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 102774f..c4cd36c 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -37,11 +37,13 @@ import java.util.Map;
 import java.util.Properties;
 import java.util.Set;
 import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 import com.google.common.base.Strings;
 import org.apache.commons.lang.StringUtils;
@@ -52,25 +54,7 @@ import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.cloud.overseer.SliceMutator;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
-import org.apache.solr.common.cloud.BeforeReconnect;
-import org.apache.solr.common.cloud.ClusterState;
-import org.apache.solr.common.cloud.ClusterStateUtil;
-import org.apache.solr.common.cloud.DefaultConnectionStrategy;
-import org.apache.solr.common.cloud.DefaultZkACLProvider;
-import org.apache.solr.common.cloud.DefaultZkCredentialsProvider;
-import org.apache.solr.common.cloud.DocCollection;
-import org.apache.solr.common.cloud.OnReconnect;
-import org.apache.solr.common.cloud.Replica;
-import org.apache.solr.common.cloud.Slice;
-import org.apache.solr.common.cloud.SolrZkClient;
-import org.apache.solr.common.cloud.ZkACLProvider;
-import org.apache.solr.common.cloud.ZkCmdExecutor;
-import org.apache.solr.common.cloud.ZkConfigManager;
-import org.apache.solr.common.cloud.ZkCoreNodeProps;
-import org.apache.solr.common.cloud.ZkCredentialsProvider;
-import org.apache.solr.common.cloud.ZkNodeProps;
-import org.apache.solr.common.cloud.ZkStateReader;
-import org.apache.solr.common.cloud.ZooKeeperException;
+import org.apache.solr.common.cloud.*;
 import org.apache.solr.common.params.CollectionParams;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.SolrParams;
@@ -93,6 +77,7 @@ import org.apache.zookeeper.Op;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.data.Stat;
+import org.eclipse.jetty.util.ConcurrentHashSet;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.MDC;
@@ -746,35 +731,36 @@ public final class ZkController {
       InterruptedException {
 
     publishNodeAsDown(getNodeName());
-    
-    // now wait till the updates are in our state
-    long now = System.nanoTime();
-    long timeout = now + TimeUnit.NANOSECONDS.convert(WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);
-
-    while (System.nanoTime() < timeout) {
-      boolean foundStates = true;
-      ClusterState clusterState = zkStateReader.getClusterState();
-      Map<String, DocCollection> collections = clusterState.getCollectionsMap();
-      for (Map.Entry<String, DocCollection> entry : collections.entrySet()) {
-        DocCollection collection = entry.getValue();
-        Collection<Slice> slices = collection.getSlices();
-        for (Slice slice : slices) {
-          Collection<Replica> replicas = slice.getReplicas();
-          for (Replica replica : replicas) {
-            if (getNodeName().equals(replica.getNodeName()) && replica.getState() != Replica.State.DOWN) {
+
+    Set<String> collectionsWithLocalReplica = ConcurrentHashMap.newKeySet();
+    for (SolrCore core : cc.getCores()) {
+      collectionsWithLocalReplica.add(core.getCoreDescriptor().getCloudDescriptor().getCollectionName());
+    }
+
+    CountDownLatch latch = new CountDownLatch(collectionsWithLocalReplica.size());
+    for (String collectionWithLocalReplica : collectionsWithLocalReplica) {
+      zkStateReader.registerCollectionStateWatcher(collectionWithLocalReplica, (liveNodes, collectionState) -> {
+        boolean foundStates = true;
+        for (SolrCore core : cc.getCores()) {
+          if (core.getCoreDescriptor().getCloudDescriptor().getCollectionName().equals(collectionWithLocalReplica))  {
+            Replica replica = collectionState.getReplica(core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());
+            if (replica.getState() != Replica.State.DOWN) {
               foundStates = false;
             }
           }
         }
-      }
 
-      Thread.sleep(1000);
-      if (foundStates) {
-        return;
-      }
+        if (foundStates && collectionsWithLocalReplica.remove(collectionWithLocalReplica))  {
+          latch.countDown();
+        }
+        return foundStates;
+      });
     }
 
-    log.warn("Timed out waiting to see all nodes published as DOWN in our cluster state.");
+    boolean allPublishedDown = latch.await(WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);
+    if (!allPublishedDown) {
+      log.warn("Timed out waiting to see all nodes published as DOWN in our cluster state.");
+    }
   }
 
   /**