You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2020/07/22 16:47:35 UTC

[lucene-solr] branch reference_impl updated: @300 A little overseer leader election hardening attempt.

This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/reference_impl by this push:
     new 0458859  @300 A little overseer leader election hardening attempt.
0458859 is described below

commit 045885939317e1de6b42fa29f30159eca592eab5
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Wed Jul 22 11:47:02 2020 -0500

    @300 A little overseer leader election hardening attempt.
---
 .../java/org/apache/solr/cloud/LeaderElector.java  |  7 +++--
 .../apache/solr/cloud/OverseerElectionContext.java |  5 ----
 .../solr/cloud/ShardLeaderElectionContextBase.java |  2 +-
 .../java/org/apache/solr/cloud/ZkController.java   | 30 ++++++++--------------
 solr/reference_branch/prod/Dockerfile              |  2 +-
 .../org/apache/solr/common/cloud/SolrZkClient.java |  4 ++-
 6 files changed, 19 insertions(+), 31 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
index 7e357fc..38396bc 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
@@ -368,17 +368,16 @@ public  class LeaderElector {
 
   void retryElection(ElectionContext context, boolean joinAtHead) throws KeeperException, InterruptedException, IOException {
     ElectionWatcher watcher = this.watcher;
-    ElectionContext ctx = context.copy();
     if (electionContexts != null) {
-      ElectionContext prevContext = electionContexts.put(contextKey, ctx);
+      ElectionContext prevContext = electionContexts.put(contextKey, context);
       if (prevContext != null) {
         prevContext.close();
       }
     }
     if (watcher != null) watcher.cancel();
     this.context.close();
-    this.context = ctx;
-    joinElection(ctx, true, joinAtHead);
+    this.context = context;
+    joinElection(context, true, joinAtHead);
   }
 
 }
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
index 1ca5b6f..2b07920 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
@@ -124,11 +124,6 @@ final class OverseerElectionContext extends ShardLeaderElectionContextBase {
   }
 
   @Override
-  public ElectionContext copy() {
-    return new OverseerElectionContext(id, zkClient, overseer);
-  }
-
-  @Override
   public void joinedElectionFired() {
 
   }
diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
index 96284d3..5a8c98a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
@@ -125,7 +125,7 @@ class ShardLeaderElectionContextBase extends ElectionContext {
   }
 
   @Override
-  void runLeaderProcess(ElectionContext context, boolean weAreReplacement, int pauseBeforeStartMs)
+  synchronized void runLeaderProcess(ElectionContext context, boolean weAreReplacement, int pauseBeforeStartMs)
           throws KeeperException, InterruptedException, IOException {
     // register as leader - if an ephemeral is already there, wait to see if it goes away
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 02c30b8..94fd668 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -230,7 +230,7 @@ public class ZkController implements Closeable {
 
   private static byte[] emptyJson = "{}".getBytes(StandardCharsets.UTF_8);
 
-  private final Map<ContextKey, ElectionContext> electionContexts = new ConcurrentHashMap<>(132, 0.75f, 50) {
+  private final Map<ContextKey, ElectionContext> electionContexts = new ConcurrentHashMap<>(64, 0.75f, 16) {
     @Override
     public ElectionContext put(ContextKey key, ElectionContext value) {
       if (ZkController.this.isClosed || cc.isShutDown()) {
@@ -240,7 +240,7 @@ public class ZkController implements Closeable {
     }
   };
 
-  private final Map<ContextKey, ElectionContext> overseerContexts = new ConcurrentHashMap<>(132, 0.75f, 50) {
+  private final Map<ContextKey, ElectionContext> overseerContexts = new ConcurrentHashMap<>(3, 0.75f, 1) {
     @Override
     public ElectionContext put(ContextKey key, ElectionContext value) {
       if (ZkController.this.isClosed || cc.isShutDown()) {
@@ -464,7 +464,7 @@ public class ZkController implements Closeable {
 
             // start the overseer first as following code may need it's processing
 
-            ElectionContext context = new OverseerElectionContext(getNodeName(), zkClient, overseer);
+            ElectionContext context = new OverseerElectionContext(getNodeName(), zkClient ,overseer);
             ElectionContext prevContext = overseerContexts.put(new ContextKey("overseer", "overseer"), context);
             if (prevContext != null) {
               prevContext.close();
@@ -1393,6 +1393,7 @@ public class ZkController implements Closeable {
    ///     log.info("get lock for creating ephem live node");
  //       lock.lock();
         log.info("do create ephem live node");
+
         createLiveNodeImpl(nodePath, nodeAddedPath);
 //      } finally {
 //        log.info("unlock");
@@ -1426,19 +1427,9 @@ public class ZkController implements Closeable {
       try {
         zkClient.getSolrZooKeeper().create(nodePath, null, zkClient.getZkACLProvider().getACLsToAdd(nodePath), CreateMode.EPHEMERAL);
       } catch (KeeperException.NodeExistsException e) {
-        log.warn("Found our ephemeral live node already exists. This must be a quick restart after a hard shutdown, removing existing live node {}", nodePath);
-        zkClient.delete(nodePath, -1);
-
-        List<String> collections = zkClient.getChildren(COLLECTIONS_ZKNODE, null, false);
-        for (String collection : collections) {
-          log.warn("Cleaning up ephemerals for leadership for  {}", collection);
-          List<String> shards = zkClient.getChildren(COLLECTIONS_ZKNODE + "/" +collection + "/leaders", null, false);
-          for (String shard : shards) {
-            zkClient.clean(COLLECTIONS_ZKNODE + "/leaders/" + shard);
-            zkClient.cleanChildren(COLLECTIONS_ZKNODE + "/leader_elect/" + shard);
-          }
-        }
-
+        log.warn("Found our ephemeral live node already exists. This must be a quick restart after a hard shutdown, waiting for it to expire {}", nodePath);
+        // TODO nocommit wait for expiration properly and try again
+        Thread.sleep(15000);
         zkClient.getSolrZooKeeper().create(nodePath, null, zkClient.getZkACLProvider().getACLsToAdd(nodePath), CreateMode.EPHEMERAL);
       }
     } catch (Exception e) {
@@ -2416,12 +2407,13 @@ public class ZkController implements Closeable {
             }
           }
         } else { // We're in the right place, now attempt to rejoin
-          overseerElector.retryElection(new OverseerElectionContext(getNodeName(), zkClient,
-              overseer), joinAtHead);
+          overseerElector.retryElection(new OverseerElectionContext(getNodeName(),
+              zkClient, overseer), joinAtHead);
           return;
         }
       } else {
-        overseerElector.retryElection(overseerElector.getContext(), joinAtHead);
+        overseerElector.retryElection(new OverseerElectionContext(getNodeName(),
+               zkClient, overseer), joinAtHead);
       }
     } catch (Exception e) {
       ParWork.propegateInterrupt(e);
diff --git a/solr/reference_branch/prod/Dockerfile b/solr/reference_branch/prod/Dockerfile
index 5abe8c6..0f59c75 100644
--- a/solr/reference_branch/prod/Dockerfile
+++ b/solr/reference_branch/prod/Dockerfile
@@ -10,7 +10,7 @@ ADD start-solr.sh /start-solr.sh
 
 RUN chmod +x /start-solr.sh
 
-RUN apt-get -y update; apt-get -y upgrade; apt-get -y install openjdk-11-jdk; apt-get -y install ant; apt-get -y install git
+RUN apt-get -y update; apt-get -y upgrade; apt-get -y install openjdk-11-jdk-headless; apt-get -y install ant; apt-get -y install git
 
 RUN cd "${BUILD_DIR}"; git clone https://github.com/apache/lucene-solr.git --branch reference_impl --single-branch reference_impl; \
 cd "${BUILD_DIR}/reference_impl"; ant ivy-bootstrap;cd solr; ant package -Dversion=9.0.0-miller_ref_impl; \
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
index fa2db3e..ec18d97 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
@@ -159,6 +159,7 @@ public class SolrZkClient implements Closeable {
   public SolrZkClient(String zkServerAddress, int zkClientTimeout, int clientConnectTimeout,
       ZkClientConnectionStrategy strat, final OnReconnect onReconnect, BeforeReconnect beforeReconnect, ZkACLProvider zkACLProvider, IsClosed higherLevelIsClosed) {
     ObjectReleaseTracker.track(this);
+    log.info("Creating new {} instance {}", SolrZkClient.class.getSimpleName(), this);
     closeTracker = new CloseTracker();
     this.zkServerAddress = zkServerAddress;
     this.higherLevelIsClosed = higherLevelIsClosed;
@@ -188,7 +189,7 @@ public class SolrZkClient implements Closeable {
   }
 
   public SolrZkClient start() {
-
+    log.info("Starting {} instance {}", SolrZkClient.class.getSimpleName(), this);
     try {
       zkClientConnectionStrategy.connect(zkServerAddress, zkClientTimeout, wrapWatcher(connManager),
               zooKeeper -> {
@@ -857,6 +858,7 @@ public class SolrZkClient implements Closeable {
   }
 
   public void close() {
+    log.info("Closing {} instance {}", SolrZkClient.class.getSimpleName(), this);
     closeTracker.close();
     if (isClosed) return; // it's okay if we over close - same as solrcore
     isClosed = true;