You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2020/07/22 16:47:35 UTC
[lucene-solr] branch reference_impl updated: @300 A little overseer leader election hardening attempt.
This is an automated email from the ASF dual-hosted git repository.
markrmiller pushed a commit to branch reference_impl
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/reference_impl by this push:
new 0458859 @300 A little overseer leader election hardening attempt.
0458859 is described below
commit 045885939317e1de6b42fa29f30159eca592eab5
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Wed Jul 22 11:47:02 2020 -0500
@300 A little overseer leader election hardening attempt.
---
.../java/org/apache/solr/cloud/LeaderElector.java | 7 +++--
.../apache/solr/cloud/OverseerElectionContext.java | 5 ----
.../solr/cloud/ShardLeaderElectionContextBase.java | 2 +-
.../java/org/apache/solr/cloud/ZkController.java | 30 ++++++++--------------
solr/reference_branch/prod/Dockerfile | 2 +-
.../org/apache/solr/common/cloud/SolrZkClient.java | 4 ++-
6 files changed, 19 insertions(+), 31 deletions(-)
diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
index 7e357fc..38396bc 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
@@ -368,17 +368,16 @@ public class LeaderElector {
void retryElection(ElectionContext context, boolean joinAtHead) throws KeeperException, InterruptedException, IOException {
ElectionWatcher watcher = this.watcher;
- ElectionContext ctx = context.copy();
if (electionContexts != null) {
- ElectionContext prevContext = electionContexts.put(contextKey, ctx);
+ ElectionContext prevContext = electionContexts.put(contextKey, context);
if (prevContext != null) {
prevContext.close();
}
}
if (watcher != null) watcher.cancel();
this.context.close();
- this.context = ctx;
- joinElection(ctx, true, joinAtHead);
+ this.context = context;
+ joinElection(context, true, joinAtHead);
}
}
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
index 1ca5b6f..2b07920 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
@@ -124,11 +124,6 @@ final class OverseerElectionContext extends ShardLeaderElectionContextBase {
}
@Override
- public ElectionContext copy() {
- return new OverseerElectionContext(id, zkClient, overseer);
- }
-
- @Override
public void joinedElectionFired() {
}
diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
index 96284d3..5a8c98a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
@@ -125,7 +125,7 @@ class ShardLeaderElectionContextBase extends ElectionContext {
}
@Override
- void runLeaderProcess(ElectionContext context, boolean weAreReplacement, int pauseBeforeStartMs)
+ synchronized void runLeaderProcess(ElectionContext context, boolean weAreReplacement, int pauseBeforeStartMs)
throws KeeperException, InterruptedException, IOException {
// register as leader - if an ephemeral is already there, wait to see if it goes away
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 02c30b8..94fd668 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -230,7 +230,7 @@ public class ZkController implements Closeable {
private static byte[] emptyJson = "{}".getBytes(StandardCharsets.UTF_8);
- private final Map<ContextKey, ElectionContext> electionContexts = new ConcurrentHashMap<>(132, 0.75f, 50) {
+ private final Map<ContextKey, ElectionContext> electionContexts = new ConcurrentHashMap<>(64, 0.75f, 16) {
@Override
public ElectionContext put(ContextKey key, ElectionContext value) {
if (ZkController.this.isClosed || cc.isShutDown()) {
@@ -240,7 +240,7 @@ public class ZkController implements Closeable {
}
};
- private final Map<ContextKey, ElectionContext> overseerContexts = new ConcurrentHashMap<>(132, 0.75f, 50) {
+ private final Map<ContextKey, ElectionContext> overseerContexts = new ConcurrentHashMap<>(3, 0.75f, 1) {
@Override
public ElectionContext put(ContextKey key, ElectionContext value) {
if (ZkController.this.isClosed || cc.isShutDown()) {
@@ -464,7 +464,7 @@ public class ZkController implements Closeable {
// start the overseer first as following code may need it's processing
- ElectionContext context = new OverseerElectionContext(getNodeName(), zkClient, overseer);
+ ElectionContext context = new OverseerElectionContext(getNodeName(), zkClient ,overseer);
ElectionContext prevContext = overseerContexts.put(new ContextKey("overseer", "overseer"), context);
if (prevContext != null) {
prevContext.close();
@@ -1393,6 +1393,7 @@ public class ZkController implements Closeable {
/// log.info("get lock for creating ephem live node");
// lock.lock();
log.info("do create ephem live node");
+
createLiveNodeImpl(nodePath, nodeAddedPath);
// } finally {
// log.info("unlock");
@@ -1426,19 +1427,9 @@ public class ZkController implements Closeable {
try {
zkClient.getSolrZooKeeper().create(nodePath, null, zkClient.getZkACLProvider().getACLsToAdd(nodePath), CreateMode.EPHEMERAL);
} catch (KeeperException.NodeExistsException e) {
- log.warn("Found our ephemeral live node already exists. This must be a quick restart after a hard shutdown, removing existing live node {}", nodePath);
- zkClient.delete(nodePath, -1);
-
- List<String> collections = zkClient.getChildren(COLLECTIONS_ZKNODE, null, false);
- for (String collection : collections) {
- log.warn("Cleaning up ephemerals for leadership for {}", collection);
- List<String> shards = zkClient.getChildren(COLLECTIONS_ZKNODE + "/" +collection + "/leaders", null, false);
- for (String shard : shards) {
- zkClient.clean(COLLECTIONS_ZKNODE + "/leaders/" + shard);
- zkClient.cleanChildren(COLLECTIONS_ZKNODE + "/leader_elect/" + shard);
- }
- }
-
+ log.warn("Found our ephemeral live node already exists. This must be a quick restart after a hard shutdown, waiting for it to expire {}", nodePath);
+ // TODO nocommit wait for expiration properly and try again
+ Thread.sleep(15000);
zkClient.getSolrZooKeeper().create(nodePath, null, zkClient.getZkACLProvider().getACLsToAdd(nodePath), CreateMode.EPHEMERAL);
}
} catch (Exception e) {
@@ -2416,12 +2407,13 @@ public class ZkController implements Closeable {
}
}
} else { // We're in the right place, now attempt to rejoin
- overseerElector.retryElection(new OverseerElectionContext(getNodeName(), zkClient,
- overseer), joinAtHead);
+ overseerElector.retryElection(new OverseerElectionContext(getNodeName(),
+ zkClient, overseer), joinAtHead);
return;
}
} else {
- overseerElector.retryElection(overseerElector.getContext(), joinAtHead);
+ overseerElector.retryElection(new OverseerElectionContext(getNodeName(),
+ zkClient, overseer), joinAtHead);
}
} catch (Exception e) {
ParWork.propegateInterrupt(e);
diff --git a/solr/reference_branch/prod/Dockerfile b/solr/reference_branch/prod/Dockerfile
index 5abe8c6..0f59c75 100644
--- a/solr/reference_branch/prod/Dockerfile
+++ b/solr/reference_branch/prod/Dockerfile
@@ -10,7 +10,7 @@ ADD start-solr.sh /start-solr.sh
RUN chmod +x /start-solr.sh
-RUN apt-get -y update; apt-get -y upgrade; apt-get -y install openjdk-11-jdk; apt-get -y install ant; apt-get -y install git
+RUN apt-get -y update; apt-get -y upgrade; apt-get -y install openjdk-11-jdk-headless; apt-get -y install ant; apt-get -y install git
RUN cd "${BUILD_DIR}"; git clone https://github.com/apache/lucene-solr.git --branch reference_impl --single-branch reference_impl; \
cd "${BUILD_DIR}/reference_impl"; ant ivy-bootstrap;cd solr; ant package -Dversion=9.0.0-miller_ref_impl; \
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
index fa2db3e..ec18d97 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java
@@ -159,6 +159,7 @@ public class SolrZkClient implements Closeable {
public SolrZkClient(String zkServerAddress, int zkClientTimeout, int clientConnectTimeout,
ZkClientConnectionStrategy strat, final OnReconnect onReconnect, BeforeReconnect beforeReconnect, ZkACLProvider zkACLProvider, IsClosed higherLevelIsClosed) {
ObjectReleaseTracker.track(this);
+ log.info("Creating new {} instance {}", SolrZkClient.class.getSimpleName(), this);
closeTracker = new CloseTracker();
this.zkServerAddress = zkServerAddress;
this.higherLevelIsClosed = higherLevelIsClosed;
@@ -188,7 +189,7 @@ public class SolrZkClient implements Closeable {
}
public SolrZkClient start() {
-
+ log.info("Starting {} instance {}", SolrZkClient.class.getSimpleName(), this);
try {
zkClientConnectionStrategy.connect(zkServerAddress, zkClientTimeout, wrapWatcher(connManager),
zooKeeper -> {
@@ -857,6 +858,7 @@ public class SolrZkClient implements Closeable {
}
public void close() {
+ log.info("Closing {} instance {}", SolrZkClient.class.getSimpleName(), this);
closeTracker.close();
if (isClosed) return; // it's okay if we over close - same as solrcore
isClosed = true;