You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2016/06/29 11:52:19 UTC

lucene-solr:branch_6x: SOLR-8777: Duplicate Solr process can cripple a running process (cherry picked from commit 4ea95bf)

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x 976501f6f -> 812fd346f


SOLR-8777: Duplicate Solr process can cripple a running process
(cherry picked from commit 4ea95bf)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/812fd346
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/812fd346
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/812fd346

Branch: refs/heads/branch_6x
Commit: 812fd346f7a136ccfe550a6ba0d7b0e634d68769
Parents: 976501f
Author: Shalin Shekhar Mangar <sh...@apache.org>
Authored: Wed Jun 29 14:49:59 2016 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Wed Jun 29 16:59:56 2016 +0530

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  4 +-
 .../org/apache/solr/cloud/ZkController.java     | 96 ++++++++++++++------
 2 files changed, 70 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/812fd346/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 817d85e..10f490a 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -54,10 +54,12 @@ Bug Fixes
 
 * SOLR-8626: 404 error when clicking nodes in cloud graph view in angular UI. (janhoy, Trey Grainger via shalin)
 
+* SOLR-8777: Duplicate Solr process can cripple a running process. (Jessica Cheng Mallet, Scott Blum, shalin)
+
 * SOLR-9254: GraphTermsQueryQParserPlugin throws NPE when field being searched is not present in segment
   (Joel Bernstein)
 
-* SOLR-8657: Fix SolrRequestInfo error logs if QuerySenderListener is being used (Pascal Chollet, 
+* SOLR-8657: Fix SolrRequestInfo error logs if QuerySenderListener is being used (Pascal Chollet,
   Tom�s Fern�ndez L�bbe)
 
 Optimizations

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/812fd346/solr/core/src/java/org/apache/solr/cloud/ZkController.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 1388ee5..102774f 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -25,8 +25,19 @@ import java.net.URLEncoder;
 import java.net.UnknownHostException;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
 import java.util.concurrent.Callable;
+import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
@@ -41,7 +52,25 @@ import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.cloud.overseer.SliceMutator;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
-import org.apache.solr.common.cloud.*;
+import org.apache.solr.common.cloud.BeforeReconnect;
+import org.apache.solr.common.cloud.ClusterState;
+import org.apache.solr.common.cloud.ClusterStateUtil;
+import org.apache.solr.common.cloud.DefaultConnectionStrategy;
+import org.apache.solr.common.cloud.DefaultZkACLProvider;
+import org.apache.solr.common.cloud.DefaultZkCredentialsProvider;
+import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.OnReconnect;
+import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.SolrZkClient;
+import org.apache.solr.common.cloud.ZkACLProvider;
+import org.apache.solr.common.cloud.ZkCmdExecutor;
+import org.apache.solr.common.cloud.ZkConfigManager;
+import org.apache.solr.common.cloud.ZkCoreNodeProps;
+import org.apache.solr.common.cloud.ZkCredentialsProvider;
+import org.apache.solr.common.cloud.ZkNodeProps;
+import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.cloud.ZooKeeperException;
 import org.apache.solr.common.params.CollectionParams;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.SolrParams;
@@ -642,6 +671,8 @@ public final class ZkController {
       zkStateReader.createClusterStateWatchersAndUpdate();
       this.baseURL = zkStateReader.getBaseUrlForNodeName(this.nodeName);
 
+      checkForExistingEphemeralNode();
+
       // start the overseer first as following code may need it's processing
       if (!zkRunOnly) {
         overseerElector = new LeaderElector(zkClient);
@@ -678,6 +709,39 @@ public final class ZkController {
 
   }
 
+  private void checkForExistingEphemeralNode() throws KeeperException, InterruptedException {
+    if (zkRunOnly) {
+      return;
+    }
+    String nodeName = getNodeName();
+    String nodePath = ZkStateReader.LIVE_NODES_ZKNODE + "/" + nodeName;
+
+    if (!zkClient.exists(nodePath, true)) {
+      return;
+    }
+
+    final CountDownLatch deletedLatch = new CountDownLatch(1);
+    Stat stat = zkClient.exists(nodePath, event -> {
+      if (Watcher.Event.EventType.None.equals(event.getType())) {
+        return;
+      }
+      if (Watcher.Event.EventType.NodeDeleted.equals(event.getType())) {
+        deletedLatch.countDown();
+      }
+    }, true);
+
+    if (stat == null) {
+      // znode suddenly disappeared but that's okay
+      return;
+    }
+
+    boolean deleted = deletedLatch.await(zkClient.getSolrZooKeeper().getSessionTimeout() * 2, TimeUnit.MILLISECONDS);
+    if (!deleted) {
+      throw new SolrException(ErrorCode.SERVER_ERROR, "A previous ephemeral live node still exists. " +
+          "Solr cannot continue. Please ensure that no other Solr process using the same port is running already.");
+    }
+  }
+
   public void publishAndWaitForDownStates() throws KeeperException,
       InterruptedException {
 
@@ -752,33 +816,7 @@ public final class ZkController {
     String nodeName = getNodeName();
     String nodePath = ZkStateReader.LIVE_NODES_ZKNODE + "/" + nodeName;
     log.info("Register node as live in ZooKeeper:" + nodePath);
-
-    try {
-      boolean nodeDeleted = true;
-      try {
-        // we attempt a delete in the case of a quick server bounce -
-        // if there was not a graceful close, the node may exist
-        // until expiration timeout - so a node won't be created here because
-        // it exists, but eventually the node will be removed. So delete
-        // in case it exists and create a new node.
-        zkClient.delete(nodePath, -1, true);
-      } catch (KeeperException.NoNodeException e) {
-        // fine if there is nothing to delete
-        // TODO: annoying that ZK logs a warning on us
-        nodeDeleted = false;
-      }
-      if (nodeDeleted) {
-        log
-            .info("Found a previous node that still exists while trying to register a new live node "
-                + nodePath + " - removing existing node to create another.");
-      }
-      zkClient.makePath(nodePath, CreateMode.EPHEMERAL, true);
-    } catch (KeeperException e) {
-      // it's okay if the node already exists
-      if (e.code() != KeeperException.Code.NODEEXISTS) {
-        throw e;
-      }
-    }
+    zkClient.makePath(nodePath, CreateMode.EPHEMERAL, true);
   }
 
   public String getNodeName() {