You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2019/10/02 17:13:40 UTC
[lucene-solr] branch master updated: SOLR-13811: Refactor
AutoAddReplicasIntegrationTest to isolate problematic situation into an
AwaitsFix test method
This is an automated email from the ASF dual-hosted git repository.
hossman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new a57ec14 SOLR-13811: Refactor AutoAddReplicasIntegrationTest to isolate problematic situation into an AwaitsFix test method
a57ec14 is described below
commit a57ec148e52507104fdf0f99381d2b485fa846fc
Author: Chris Hostetter <ho...@apache.org>
AuthorDate: Wed Oct 2 10:13:33 2019 -0700
SOLR-13811: Refactor AutoAddReplicasIntegrationTest to isolate problematic situation into an AwaitsFix test method
---
.../AutoAddReplicasIntegrationTest.java | 332 ++++++++++++++++-----
1 file changed, 265 insertions(+), 67 deletions(-)
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java
index a5dedc3..68898fb 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java
@@ -24,6 +24,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException;
@@ -31,7 +32,9 @@ import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.request.V2Request;
+import org.apache.solr.cloud.MiniSolrCloudCluster;
import org.apache.solr.cloud.SolrCloudTestCase;
+import org.apache.solr.common.cloud.CollectionStatePredicate;
import org.apache.solr.common.cloud.ClusterStateUtil;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
@@ -49,16 +52,15 @@ import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+@org.apache.solr.util.LogLevel("org.apache.solr.cloud.autoscaling=DEBUG;org.apache.solr.cloud.autoscaling.NodeLostTrigger=TRACE;org.apache.solr.cloud.Overseer=DEBUG;org.apache.solr.cloud.overseer=DEBUG")
public class AutoAddReplicasIntegrationTest extends SolrCloudTestCase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-
- private static final String COLLECTION1 = "testSimple1";
- private static final String COLLECTION2 = "testSimple2";
+
protected String getConfigSet() {
return "cloud-minimal";
}
-
+
@Before
public void setupCluster() throws Exception {
configureCluster(3)
@@ -82,102 +84,267 @@ public class AutoAddReplicasIntegrationTest extends SolrCloudTestCase {
}
}
+ /**
+ * Test that basic autoAddReplicaLogic kicks in when a node is lost
+ */
@Test
public void testSimple() throws Exception {
- JettySolrRunner jetty1 = cluster.getJettySolrRunner(0);
- JettySolrRunner jetty2 = cluster.getJettySolrRunner(1);
- JettySolrRunner jetty3 = cluster.getJettySolrRunner(2);
- CollectionAdminRequest.createCollection(COLLECTION1, "conf", 2, 2)
- .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
- .setAutoAddReplicas(true)
- .setMaxShardsPerNode(2)
- .process(cluster.getSolrClient());
+ final String COLLECTION = "test_simple";
+ final ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+ final JettySolrRunner jetty1 = cluster.getJettySolrRunner(1);
+ final JettySolrRunner jetty2 = cluster.getJettySolrRunner(2);
+ log.info("Creating {} using jetty1:{}/{} and jetty2:{}/{}", COLLECTION,
+ jetty1.getNodeName(), jetty1.getLocalPort(),
+ jetty2.getNodeName(), jetty2.getLocalPort());
+
+ CollectionAdminRequest.createCollection(COLLECTION, "conf", 2, 2)
+ .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
+ .setAutoAddReplicas(true)
+ .setMaxShardsPerNode(2)
+ .process(cluster.getSolrClient());
- cluster.waitForActiveCollection(COLLECTION1, 2, 4);
+ cluster.waitForActiveCollection(COLLECTION, 2, 4);
- CollectionAdminRequest.createCollection(COLLECTION2, "conf", 2, 2)
- .setCreateNodeSet(jetty2.getNodeName()+","+jetty3.getNodeName())
- .setAutoAddReplicas(false)
- .setMaxShardsPerNode(2)
- .process(cluster.getSolrClient());
+ // start the tests
+ JettySolrRunner lostJetty = random().nextBoolean() ? jetty1 : jetty2;
+ String lostNodeName = lostJetty.getNodeName();
+ List<Replica> replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION, zkStateReader, lostNodeName);
+ log.info("Stopping random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+ lostJetty.stop();
- cluster.waitForActiveCollection(COLLECTION2, 2, 4);
+ cluster.waitForJettyToStop(lostJetty);
+ waitForNodeLeave(lostNodeName);
- // the number of cores in jetty1 (5) will be larger than jetty3 (1)
- CollectionAdminRequest.createCollection("testSimple3", "conf", 3, 1)
- .setCreateNodeSet(jetty1.getNodeName())
- .setAutoAddReplicas(false)
- .setMaxShardsPerNode(3)
- .process(cluster.getSolrClient());
+ waitForState(COLLECTION + "=(2,4) w/o down replicas",
+ COLLECTION, clusterShapeNoDownReplicas(2,4), 90, TimeUnit.SECONDS);
+
+ checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION);
+
+ log.info("Re-starting (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+ lostJetty.start();
+
+ waitForNodeLive(lostJetty);
+
+ assertTrue("Timeout waiting for all live and active",
+ ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 90000));
+
+ }
- cluster.waitForActiveCollection("testSimple3", 3, 3);
+ /**
+ * Test that basic autoAddReplicas logic is <b>not</b> used if the cluster prop for it is disabled
+ * (even if sys prop is set after collection is created)
+ */
+ @Test
+ public void testClusterPropOverridesCollecitonProp() throws Exception {
+ final String COLLECTION = "test_clusterprop";
+ final ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+ final JettySolrRunner jetty1 = cluster.getJettySolrRunner(1);
+ final JettySolrRunner jetty2 = cluster.getJettySolrRunner(2);
+
+ log.info("Creating {} using jetty1:{}/{} and jetty2:{}/{}", COLLECTION,
+ jetty1.getNodeName(), jetty1.getLocalPort(),
+ jetty2.getNodeName(), jetty2.getLocalPort());
+
+ CollectionAdminRequest.createCollection(COLLECTION, "conf", 2, 2)
+ .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
+ .setAutoAddReplicas(true)
+ .setMaxShardsPerNode(2)
+ .process(cluster.getSolrClient());
- ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+ cluster.waitForActiveCollection(COLLECTION, 2, 4);
- // start the tests
- JettySolrRunner lostJetty = random().nextBoolean() ? cluster.getJettySolrRunner(0) : cluster.getJettySolrRunner(1);
+ // check cluster property is considered
+ disableAutoAddReplicasInCluster();
+
+ JettySolrRunner lostJetty = random().nextBoolean() ? jetty1 : jetty2;
String lostNodeName = lostJetty.getNodeName();
- List<Replica> replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION1, zkStateReader, lostNodeName);
+ List<Replica> replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION, zkStateReader, lostNodeName);
+
+ log.info("Stopping random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
lostJetty.stop();
cluster.waitForJettyToStop(lostJetty);
waitForNodeLeave(lostNodeName);
- // ensure that 2 shards have 2 active replicas and only 4 replicas in total
- // i.e. old replicas have been deleted.
- // todo remove the condition for total replicas == 4 after SOLR-11591 is fixed
- waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, (liveNodes, collectionState) -> clusterShape(2, 4).matches(liveNodes, collectionState)
- && collectionState.getReplicas().size() == 4, 90, TimeUnit.SECONDS);
- checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION1);
+ waitForState(COLLECTION + "=(2,2)", COLLECTION,
+ clusterShape(2, 2), 90, TimeUnit.SECONDS);
+
+
+ log.info("Re-starting (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
lostJetty.start();
- cluster.waitForAllNodes(30);
+ waitForNodeLive(lostJetty);
+
+ assertTrue("Timeout waiting for all live and active",
+ ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 90000));
- assertTrue("Timeout waiting for all live and active", ClusterStateUtil.waitForAllActiveAndLiveReplicas(cluster.getSolrClient().getZkStateReader(), 90000));
+ waitForState(COLLECTION + "=(2,4) w/o down replicas",
+ COLLECTION, clusterShapeNoDownReplicas(2,4), 90, TimeUnit.SECONDS);
- // check cluster property is considered
- disableAutoAddReplicasInCluster();
- lostNodeName = jetty3.getNodeName();
- jetty3.stop();
+ }
+
+ /**
+ * Test that we can modify a collection after creation to add autoAddReplicas.
+ */
+ @Test
+ public void testAddCollectionPropAfterCreation() throws Exception {
+ final String COLLECTION = "test_addprop";
+ final ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+ final JettySolrRunner jetty1 = cluster.getJettySolrRunner(1);
+ final JettySolrRunner jetty2 = cluster.getJettySolrRunner(2);
+
+ log.info("Creating {} using jetty1:{}/{} and jetty2:{}/{}", COLLECTION,
+ jetty1.getNodeName(), jetty1.getLocalPort(),
+ jetty2.getNodeName(), jetty2.getLocalPort());
+
+ CollectionAdminRequest.createCollection(COLLECTION, "conf", 2, 2)
+ .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
+ .setAutoAddReplicas(false) // NOTE: false
+ .setMaxShardsPerNode(2)
+ .process(cluster.getSolrClient());
- cluster.waitForJettyToStop(jetty3);
+ cluster.waitForActiveCollection(COLLECTION, 2, 4);
+
+ log.info("Modifying {} to use autoAddReplicas", COLLECTION);
+ new CollectionAdminRequest.AsyncCollectionAdminRequest(CollectionParams.CollectionAction.MODIFYCOLLECTION) {
+ @Override
+ public SolrParams getParams() {
+ ModifiableSolrParams params = (ModifiableSolrParams) super.getParams();
+ params.set("collection", COLLECTION);
+ params.set("autoAddReplicas", true);
+ return params;
+ }
+ }.process(cluster.getSolrClient());
+
+ JettySolrRunner lostJetty = random().nextBoolean() ? jetty1 : jetty2;
+ String lostNodeName = lostJetty.getNodeName();
+ List<Replica> replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION, zkStateReader, lostNodeName);
+
+ log.info("Stopping random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+ lostJetty.stop();
+
+ cluster.waitForJettyToStop(lostJetty);
waitForNodeLeave(lostNodeName);
+
+ waitForState(COLLECTION + "=(2,4) w/o down replicas",
+ COLLECTION, clusterShapeNoDownReplicas(2,4), 90, TimeUnit.SECONDS);
+ checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION);
+
+ log.info("Re-starting (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+ lostJetty.start();
+
+ waitForNodeLive(lostJetty);
+
+ assertTrue("Timeout waiting for all live and active",
+ ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 90000));
+ }
+
+ /**
+ * Test a specific sequence of problematic events:
+ * <ul>
+ * <li>create a collection with autoAddReplicas=<b>false</b></li>
+ * <li>stop a nodeX in use by the collection</li>
+ * <li>re-start nodeX</li>
+ * <li>set autoAddReplicas=<b>true</b></li>
+ * <li>re-stop nodeX</li>
+ * </ul>
+ */
+ @Test
+ @AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13811")
+ public void testRapidStopStartStopWithPropChange() throws Exception {
+
+ // This is the collection we'll be focused on in our testing...
+ final String COLLECTION = "test_stoptwice";
+ // This is a collection we'll use as a "marker" to ensure we "wait" for the
+ // autoAddReplicas logic (via NodeLostTrigger) to kick in at least once before proceeding...
+ final String ALT_COLLECTION = "test_dummy";
- waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, clusterShape(2, 2));
- jetty3.start();
- waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, clusterShape(2, 4));
- waitForState("Waiting for collection " + COLLECTION2, COLLECTION2, clusterShape(2, 4));
- enableAutoAddReplicasInCluster();
+ final ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+ final JettySolrRunner jetty1 = cluster.getJettySolrRunner(1);
+ final JettySolrRunner jetty2 = cluster.getJettySolrRunner(2);
+ log.info("Creating {} using jetty1:{}/{} and jetty2:{}/{}", COLLECTION,
+ jetty1.getNodeName(), jetty1.getLocalPort(),
+ jetty2.getNodeName(), jetty2.getLocalPort());
+
+ CollectionAdminRequest.createCollection(COLLECTION, "conf", 2, 2)
+ .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
+ .setAutoAddReplicas(false) // NOTE: false
+ .setMaxShardsPerNode(2)
+ .process(cluster.getSolrClient());
+
+ log.info("Creating {} using jetty1:{}/{} and jetty2:{}/{}", ALT_COLLECTION,
+ jetty1.getNodeName(), jetty1.getLocalPort(),
+ jetty2.getNodeName(), jetty2.getLocalPort());
+
+ CollectionAdminRequest.createCollection(ALT_COLLECTION, "conf", 2, 2)
+ .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
+ .setAutoAddReplicas(true) // NOTE: true
+ .setMaxShardsPerNode(2)
+ .process(cluster.getSolrClient());
+
+ cluster.waitForActiveCollection(COLLECTION, 2, 4);
+ cluster.waitForActiveCollection(ALT_COLLECTION, 2, 4);
- // test for multiple collections
+ JettySolrRunner lostJetty = random().nextBoolean() ? jetty1 : jetty2;
+ String lostNodeName = lostJetty.getNodeName();
+ List<Replica> replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION, zkStateReader, lostNodeName);
+
+ log.info("Stopping random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+ lostJetty.stop();
+
+ cluster.waitForJettyToStop(lostJetty);
+ waitForNodeLeave(lostNodeName);
+
+ // ensure that our marker collection indicates that the autoAddReplicas logic
+ // has detected the down node and done some processing
+ waitForState(ALT_COLLECTION + "=(2,4) w/o down replicas",
+ ALT_COLLECTION, clusterShapeNoDownReplicas(2,4), 90, TimeUnit.SECONDS);
+
+ waitForState(COLLECTION + "=(2,2)", COLLECTION, clusterShape(2, 2));
+
+ log.info("Re-starting (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+ lostJetty.start();
+ // save time, don't bother waiting for lostJetty to start until after updating collection prop...
+
+ log.info("Modifying {} to use autoAddReplicas", COLLECTION);
new CollectionAdminRequest.AsyncCollectionAdminRequest(CollectionParams.CollectionAction.MODIFYCOLLECTION) {
@Override
public SolrParams getParams() {
ModifiableSolrParams params = (ModifiableSolrParams) super.getParams();
- params.set("collection", COLLECTION2);
+ params.set("collection", COLLECTION);
params.set("autoAddReplicas", true);
return params;
}
}.process(cluster.getSolrClient());
- lostNodeName = jetty2.getNodeName();
- replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION2, zkStateReader, lostNodeName);
-
- jetty2.stop();
-
- cluster.waitForJettyToStop(jetty2);
+ // make sure lostJetty is fully up before stopping again...
+ waitForNodeLive(lostJetty);
+
+ log.info("Re-Stopping (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+ lostJetty.stop();
+ cluster.waitForJettyToStop(lostJetty);
waitForNodeLeave(lostNodeName);
- waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, clusterShape(2, 4), 45, TimeUnit.SECONDS);
- waitForState("Waiting for collection " + COLLECTION2, COLLECTION2, clusterShape(2, 4), 45, TimeUnit.SECONDS);
- checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION2);
- // overseer failover test..
+ // TODO: this is the problematic situation...
+ // whether or not NodeLostTrigger noticed that lostJetty was re-started and shutdown *again*
+ // and that the new autoAddReplicas=true since the last time lostJetty was shutdown is respected
+ waitForState(COLLECTION + "=(2,4) w/o down replicas",
+ COLLECTION, clusterShapeNoDownReplicas(2,4), 90, TimeUnit.SECONDS);
+ checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION);
+
+ log.info("Re-Re-starting (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+ lostJetty.start();
+
+ waitForNodeLive(lostJetty);
+
+ assertTrue("Timeout waiting for all live and active",
+ ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 90000));
}
-
+
private void disableAutoAddReplicasInCluster() throws SolrServerException, IOException {
Map m = makeMap(
"action", CollectionParams.CollectionAction.CLUSTERPROP.toLower(),
@@ -225,13 +392,44 @@ public class AutoAddReplicasIntegrationTest extends SolrCloudTestCase {
return replacedHdfsReplicas;
}
- private void waitForNodeLeave(String lostNodeName) throws InterruptedException {
+ /**
+ * {@link MiniSolrCloudCluster#waitForNode} doesn't check isRunning first, and we don't want to
+ * use {@link MiniSolrCloudCluster#waitForAllNodes} because we don't want to waste cycles checking
+ * nodes we aren't messing with
+ */
+ private void waitForNodeLive(final JettySolrRunner jetty)
+ throws InterruptedException, TimeoutException, IOException {
+ log.info("waitForNodeLive: {}/{}", jetty.getNodeName(), jetty.getLocalPort());
+
+ TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+ while(!timeout.hasTimedOut()) {
+ if (jetty.isRunning()) {
+ break;
+ }
+ try {
+ Thread.sleep(100);
+ } catch (InterruptedException e) {
+ // ignore
+ }
+ }
+ if (timeout.hasTimedOut()) {
+ throw new TimeoutException("Waiting for Jetty to stop timed out");
+ }
+ cluster.waitForNode(jetty, 30);
+ }
+
+ private void waitForNodeLeave(String lostNodeName) throws InterruptedException, TimeoutException {
log.info("waitForNodeLeave: {}", lostNodeName);
ZkStateReader reader = cluster.getSolrClient().getZkStateReader();
- TimeOut timeOut = new TimeOut(20, TimeUnit.SECONDS, TimeSource.NANO_TIME);
- while (reader.getClusterState().getLiveNodes().contains(lostNodeName)) {
- Thread.sleep(100);
- if (timeOut.hasTimedOut()) fail("Wait for " + lostNodeName + " to leave failed!");
- }
+ reader.waitForLiveNodes(30, TimeUnit.SECONDS, (o, n) -> !n.contains(lostNodeName));
}
+
+
+ private static CollectionStatePredicate clusterShapeNoDownReplicas(final int expectedShards,
+ final int expectedReplicas) {
+ return (liveNodes, collectionState)
+ -> (clusterShape(expectedShards, expectedReplicas).matches(liveNodes, collectionState)
+ && collectionState.getReplicas().size() == expectedReplicas);
+ }
+
}