You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/04/19 08:00:41 UTC
lucene-solr:branch_7_3: SOLR-12087: Deleting replicas sometimes fails
and causes the replicas to exist in the down state
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7_3 8523f384a -> ec9ccb5cd
SOLR-12087: Deleting replicas sometimes fails and causes the replicas to exist in the down state
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ec9ccb5c
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ec9ccb5c
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ec9ccb5c
Branch: refs/heads/branch_7_3
Commit: ec9ccb5cd07f54eacba2cf071281cc9c37b766c1
Parents: 8523f38
Author: Cao Manh Dat <da...@apache.org>
Authored: Thu Mar 22 16:11:47 2018 +0700
Committer: Cao Manh Dat <da...@apache.org>
Committed: Thu Apr 19 15:00:27 2018 +0700
----------------------------------------------------------------------
solr/CHANGES.txt | 2 +
.../java/org/apache/solr/cloud/CloudUtil.java | 12 +++++
.../cloud/LeaderInitiatedRecoveryThread.java | 30 ++++++++---
.../solr/cloud/overseer/ReplicaMutator.java | 13 ++++-
.../src/java/org/apache/solr/util/TimeOut.java | 17 ++++++
.../apache/solr/cloud/DeleteReplicaTest.java | 57 ++++++++++++++++++++
.../apache/solr/common/cloud/ZkStateReader.java | 2 +
7 files changed, 126 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec9ccb5c/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index c0b84a6..c7e86cc 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -34,6 +34,8 @@ Bug Fixes
* SOLR-12204: Upgrade commons-fileupload dependency to 1.3.3 to address CVE-2016-1000031. (Steve Rowe)
+* SOLR-12087: Deleting replicas sometimes fails and causes the replicas to exist in the down state (Cao Manh Dat)
+
================== 7.3.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec9ccb5c/solr/core/src/java/org/apache/solr/cloud/CloudUtil.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/CloudUtil.java b/solr/core/src/java/org/apache/solr/cloud/CloudUtil.java
index 0d45129..e0bd786 100644
--- a/solr/core/src/java/org/apache/solr/cloud/CloudUtil.java
+++ b/solr/core/src/java/org/apache/solr/cloud/CloudUtil.java
@@ -29,6 +29,7 @@ import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig;
import org.apache.solr.client.solrj.cloud.autoscaling.SolrCloudManager;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
@@ -94,6 +95,17 @@ public class CloudUtil {
}
}
+ public static boolean replicaExists(ClusterState clusterState, String collection, String shard, String coreNodeName) {
+ DocCollection docCollection = clusterState.getCollectionOrNull(collection);
+ if (docCollection != null) {
+ Slice slice = docCollection.getSlice(shard);
+ if (slice != null) {
+ return slice.getReplica(coreNodeName) != null;
+ }
+ }
+ return false;
+ }
+
/**
* Returns a displayable unified path to the given resource. For non-solrCloud that will be the
* same as getConfigDir, but for Cloud it will be getConfigSetZkPath ending in a /
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec9ccb5c/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java b/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
index 9c46236..071bfbf 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
@@ -38,6 +38,7 @@ import org.slf4j.LoggerFactory;
import java.lang.invoke.MethodHandles;
import java.net.ConnectException;
import java.net.SocketException;
+import java.net.UnknownHostException;
import java.util.List;
/**
@@ -88,7 +89,9 @@ public class LeaderInitiatedRecoveryThread extends Thread {
if (!zkController.isReplicaInRecoveryHandling(replicaUrl)) {
throw new SolrException(ErrorCode.INVALID_STATE, "Replica: " + replicaUrl + " should have been marked under leader initiated recovery in ZkController but wasn't.");
}
-
+ if (!CloudUtil.replicaExists(zkController.getClusterState(), collection, shardId, replicaCoreNodeName)) {
+ log.info("Replica does not exist, skip doing LIR");
+ }
boolean sendRecoveryCommand = publishDownState(replicaCoreName, replicaCoreNodeName, replicaNodeName, replicaUrl, false);
if (sendRecoveryCommand) {
@@ -152,9 +155,11 @@ public class LeaderInitiatedRecoveryThread extends Thread {
ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(),
ZkStateReader.BASE_URL_PROP, nodeProps.getBaseUrl(),
ZkStateReader.CORE_NAME_PROP, nodeProps.getCoreName(),
+ ZkStateReader.CORE_NODE_NAME_PROP, replicaCoreNodeName,
ZkStateReader.NODE_NAME_PROP, nodeProps.getNodeName(),
ZkStateReader.SHARD_ID_PROP, shardId,
- ZkStateReader.COLLECTION_PROP, collection);
+ ZkStateReader.COLLECTION_PROP, collection,
+ ZkStateReader.FORCE_SET_STATE_PROP, "false");
log.warn("Leader is publishing core={} coreNodeName ={} state={} on behalf of un-reachable replica {}",
replicaCoreName, replicaCoreNodeName, Replica.State.DOWN.toString(), replicaUrl);
zkController.getOverseerJobQueue().offer(Utils.toJSON(m));
@@ -166,6 +171,12 @@ public class LeaderInitiatedRecoveryThread extends Thread {
return sendRecoveryCommand;
}
+ private void removeLIRState(String replicaCoreNodeName) {
+ zkController.updateLeaderInitiatedRecoveryState(collection,
+ shardId,
+ replicaCoreNodeName, Replica.State.ACTIVE, leaderCd, true);
+ }
+
/*
protected scope for testing purposes
*/
@@ -219,13 +230,20 @@ public class LeaderInitiatedRecoveryThread extends Thread {
(rootCause instanceof ConnectException ||
rootCause instanceof ConnectTimeoutException ||
rootCause instanceof NoHttpResponseException ||
- rootCause instanceof SocketException);
+ rootCause instanceof SocketException ||
+ rootCause instanceof UnknownHostException);
- SolrException.log(log, recoveryUrl + ": Could not tell a replica to recover", t);
-
if (!wasCommError) {
continueTrying = false;
- }
+ }
+
+ if (rootCause.getMessage().contains("Unable to locate core")) {
+ log.info("Replica {} is removed, hence remove its lir state", replicaCoreNodeName);
+ removeLIRState(replicaCoreNodeName);
+ break;
+ } else {
+ SolrException.log(log, recoveryUrl + ": Could not tell a replica to recover, wasCommError:"+wasCommError, t);
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec9ccb5c/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
index f2c9a2f..07f0eb3 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
@@ -30,6 +30,7 @@ import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.cloud.autoscaling.DistribStateManager;
import org.apache.solr.client.solrj.cloud.autoscaling.SolrCloudManager;
import org.apache.solr.client.solrj.cloud.autoscaling.VersionedData;
+import org.apache.solr.cloud.CloudUtil;
import org.apache.solr.cloud.Overseer;
import org.apache.solr.cloud.api.collections.Assign;
import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler;
@@ -233,15 +234,25 @@ public class ReplicaMutator {
private ZkWriteCommand updateState(final ClusterState prevState, ZkNodeProps message, String collectionName, Integer numShards, boolean collectionExists) {
String sliceName = message.getStr(ZkStateReader.SHARD_ID_PROP);
-
String coreNodeName = message.getStr(ZkStateReader.CORE_NODE_NAME_PROP);
+ boolean forceSetState = message.getBool(ZkStateReader.FORCE_SET_STATE_PROP, true);
+
DocCollection collection = prevState.getCollectionOrNull(collectionName);
+ if (!forceSetState && !CloudUtil.replicaExists(prevState, collectionName, sliceName, coreNodeName)) {
+ log.info("Failed to update state because the replica does not exist, {}", message);
+ return ZkStateWriter.NO_OP;
+ }
+
if (coreNodeName == null) {
coreNodeName = ClusterStateMutator.getAssignedCoreNodeName(collection,
message.getStr(ZkStateReader.NODE_NAME_PROP), message.getStr(ZkStateReader.CORE_NAME_PROP));
if (coreNodeName != null) {
log.debug("node=" + coreNodeName + " is already registered");
} else {
+ if (!forceSetState) {
+ log.info("Failed to update state because the replica does not exist, {}", message);
+ return ZkStateWriter.NO_OP;
+ }
// if coreNodeName is null, auto assign one
coreNodeName = Assign.assignCoreNodeName(stateManager, collection);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec9ccb5c/solr/core/src/java/org/apache/solr/util/TimeOut.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/util/TimeOut.java b/solr/core/src/java/org/apache/solr/util/TimeOut.java
index bcc29961..87a8c94 100644
--- a/solr/core/src/java/org/apache/solr/util/TimeOut.java
+++ b/solr/core/src/java/org/apache/solr/util/TimeOut.java
@@ -17,6 +17,8 @@
package org.apache.solr.util;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.function.Supplier;
import org.apache.solr.common.util.TimeSource;
@@ -48,4 +50,19 @@ public class TimeOut {
public long timeElapsed(TimeUnit unit) {
return unit.convert(timeSource.getTime() - startTime, NANOSECONDS);
}
+
+ /**
+ * Wait until the given {@link Supplier} returns true or the time out expires which ever happens first
+ * @param messageOnTimeOut the exception message to be used in case a TimeoutException is thrown
+ * @param supplier a {@link Supplier} that returns a {@link Boolean} value
+ * @throws InterruptedException if any thread has interrupted the current thread
+ * @throws TimeoutException if the timeout expires
+ */
+ public void waitFor(String messageOnTimeOut, Supplier<Boolean> supplier)
+ throws InterruptedException, TimeoutException {
+ while (!supplier.get() && !hasTimedOut()) {
+ Thread.sleep(500);
+ }
+ if (hasTimedOut()) throw new TimeoutException(messageOnTimeOut);
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec9ccb5c/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java
index 7429c1c..3208ebd 100644
--- a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java
@@ -21,8 +21,11 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.EnumSet;
+import java.util.List;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
@@ -30,6 +33,7 @@ import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.CoreStatus;
import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
@@ -39,6 +43,7 @@ import org.apache.solr.common.util.TimeSource;
import org.apache.solr.common.util.Utils;
import org.apache.solr.core.ZkContainer;
import org.apache.solr.util.TimeOut;
+import org.apache.zookeeper.KeeperException;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
@@ -283,5 +288,57 @@ public class DeleteReplicaTest extends SolrCloudTestCase {
if (timeOut.hasTimedOut()) fail("Wait for " + lostNodeName + " to leave failed!");
}
}
+
+ @Test
+ public void deleteReplicaOnIndexing() throws Exception {
+ final String collectionName = "deleteReplicaOnIndexing";
+ CollectionAdminRequest.createCollection(collectionName, "conf", 1, 2)
+ .process(cluster.getSolrClient());
+ waitForState("", collectionName, clusterShape(1, 2));
+ AtomicBoolean closed = new AtomicBoolean(false);
+ Thread[] threads = new Thread[100];
+ for (int i = 0; i < threads.length; i++) {
+ int finalI = i;
+ threads[i] = new Thread(() -> {
+ int doc = finalI * 10000;
+ while (!closed.get()) {
+ try {
+ cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", String.valueOf(doc++)));
+ } catch (Exception e) {
+ LOG.error("Failed on adding document to {}", collectionName, e);
+ }
+ }
+ });
+ threads[i].start();
+ }
+
+ Slice shard1 = getCollectionState(collectionName).getSlice("shard1");
+ Replica nonLeader = shard1.getReplicas(rep -> !rep.getName().equals(shard1.getLeader().getName())).get(0);
+ CollectionAdminRequest.deleteReplica(collectionName, "shard1", nonLeader.getName()).process(cluster.getSolrClient());
+ closed.set(true);
+ for (int i = 0; i < threads.length; i++) {
+ threads[i].join();
+ }
+
+ try {
+ cluster.getSolrClient().waitForState(collectionName, 20, TimeUnit.SECONDS, (liveNodes, collectionState) -> collectionState.getReplicas().size() == 1);
+ } catch (TimeoutException e) {
+ LOG.info("Timeout wait for state {}", getCollectionState(collectionName));
+ throw e;
+ }
+
+ TimeOut timeOut = new TimeOut(20, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+ timeOut.waitFor("Time out waiting for LIR state get removed", () -> {
+ String lirPath = ZkController.getLeaderInitiatedRecoveryZnodePath(collectionName, "shard1");
+ try {
+ List<String> children = zkClient().getChildren(lirPath, null, true);
+ return children.size() == 0;
+ } catch (KeeperException.NoNodeException e) {
+ return true;
+ } catch (Exception e) {
+ throw new AssertionError(e);
+ }
+ });
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec9ccb5c/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
index f598da8..8f436d2 100644
--- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
+++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
@@ -78,6 +78,8 @@ public class ZkStateReader implements Closeable {
public static final String CORE_NODE_NAME_PROP = "core_node_name";
public static final String ROLES_PROP = "roles";
public static final String STATE_PROP = "state";
+ // if this flag equals to false and the replica does not exist in cluster state, set state op become no op (default is true)
+ public static final String FORCE_SET_STATE_PROP = "force_set_state";
/** SolrCore name. */
public static final String CORE_NAME_PROP = "core";
public static final String COLLECTION_PROP = "collection";