Posted to commits@lucene.apache.org by ho...@apache.org on 2019/10/02 17:13:40 UTC

[lucene-solr] branch master updated: SOLR-13811: Refactor AutoAddReplicasIntegrationTest to isolate problematic situation into an AwaitsFix test method

This is an automated email from the ASF dual-hosted git repository.

hossman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new a57ec14  SOLR-13811: Refactor AutoAddReplicasIntegrationTest to isolate problematic situation into an AwaitsFix test method
a57ec14 is described below

commit a57ec148e52507104fdf0f99381d2b485fa846fc
Author: Chris Hostetter <ho...@apache.org>
AuthorDate: Wed Oct 2 10:13:33 2019 -0700

    SOLR-13811: Refactor AutoAddReplicasIntegrationTest to isolate problematic situation into an AwaitsFix test method
---
 .../AutoAddReplicasIntegrationTest.java            | 332 ++++++++++++++++-----
 1 file changed, 265 insertions(+), 67 deletions(-)
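
For orientation before reading the diff: the refactor replaces the old monolithic testSimple() with several focused test methods that all share the same node-lost skeleton and differ mainly in when and how autoAddReplicas is toggled. The condensed sketch below is illustrative only -- it is not code from the commit, and the method name runBasicNodeLostScenario is hypothetical -- and it assumes it sits inside AutoAddReplicasIntegrationTest so the cluster fixture and the helpers introduced by this commit (clusterShapeNoDownReplicas, waitForNodeLeave, waitForNodeLive) are in scope.

    // Illustrative sketch, not commit code: the common create/stop/wait/restart shape
    // shared by the new test methods in the diff below.
    private void runBasicNodeLostScenario(final String collection) throws Exception {
      final ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
      final JettySolrRunner jetty1 = cluster.getJettySolrRunner(1);
      final JettySolrRunner jetty2 = cluster.getJettySolrRunner(2);

      // create a 2 shard x 2 replica collection pinned to two known nodes
      CollectionAdminRequest.createCollection(collection, "conf", 2, 2)
          .setCreateNodeSet(jetty1.getNodeName() + "," + jetty2.getNodeName())
          .setAutoAddReplicas(true)
          .setMaxShardsPerNode(2)
          .process(cluster.getSolrClient());
      cluster.waitForActiveCollection(collection, 2, 4);

      // stop one of the two nodes hosting the collection and wait for ZK to notice
      final JettySolrRunner lostJetty = random().nextBoolean() ? jetty1 : jetty2;
      final String lostNodeName = lostJetty.getNodeName();
      lostJetty.stop();
      cluster.waitForJettyToStop(lostJetty);
      waitForNodeLeave(lostNodeName);

      // autoAddReplicas (via NodeLostTrigger) should restore 2x2 active replicas on the
      // surviving nodes, with no lingering "down" replicas left behind
      waitForState(collection + "=(2,4) w/o down replicas",
                   collection, clusterShapeNoDownReplicas(2, 4), 90, TimeUnit.SECONDS);

      // bring the node back and confirm the cluster settles into a fully live+active state
      lostJetty.start();
      waitForNodeLive(lostJetty);
      assertTrue("Timeout waiting for all live and active",
                 ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 90000));
    }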

diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java
index a5dedc3..68898fb 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/AutoAddReplicasIntegrationTest.java
@@ -24,6 +24,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 
 import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.SolrServerException;
@@ -31,7 +32,9 @@ import org.apache.solr.client.solrj.embedded.JettySolrRunner;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.client.solrj.request.V2Request;
+import org.apache.solr.cloud.MiniSolrCloudCluster;
 import org.apache.solr.cloud.SolrCloudTestCase;
+import org.apache.solr.common.cloud.CollectionStatePredicate;
 import org.apache.solr.common.cloud.ClusterStateUtil;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
@@ -49,16 +52,15 @@ import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+@org.apache.solr.util.LogLevel("org.apache.solr.cloud.autoscaling=DEBUG;org.apache.solr.cloud.autoscaling.NodeLostTrigger=TRACE;org.apache.solr.cloud.Overseer=DEBUG;org.apache.solr.cloud.overseer=DEBUG")
 public class AutoAddReplicasIntegrationTest extends SolrCloudTestCase {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-  
-  private static final String COLLECTION1 =  "testSimple1";
-  private static final String COLLECTION2 =  "testSimple2";
+
 
   protected String getConfigSet() {
     return "cloud-minimal";
   }
-
+  
   @Before
   public void setupCluster() throws Exception {
     configureCluster(3)
@@ -82,102 +84,267 @@ public class AutoAddReplicasIntegrationTest extends SolrCloudTestCase {
     }
   }
 
+  /**
+   * Test that basic autoAddReplicas logic kicks in when a node is lost
+   */
   @Test
   public void testSimple() throws Exception {
-    JettySolrRunner jetty1 = cluster.getJettySolrRunner(0);
-    JettySolrRunner jetty2 = cluster.getJettySolrRunner(1);
-    JettySolrRunner jetty3 = cluster.getJettySolrRunner(2);
-    CollectionAdminRequest.createCollection(COLLECTION1, "conf", 2, 2)
-        .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
-        .setAutoAddReplicas(true)
-        .setMaxShardsPerNode(2)
-        .process(cluster.getSolrClient());
+    final String COLLECTION = "test_simple";
+    final ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+    final JettySolrRunner jetty1 = cluster.getJettySolrRunner(1);
+    final JettySolrRunner jetty2 = cluster.getJettySolrRunner(2);
+    log.info("Creating {} using jetty1:{}/{} and jetty2:{}/{}", COLLECTION,
+             jetty1.getNodeName(), jetty1.getLocalPort(),
+             jetty2.getNodeName(), jetty2.getLocalPort());
+             
+    CollectionAdminRequest.createCollection(COLLECTION, "conf", 2, 2)
+      .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
+      .setAutoAddReplicas(true)
+      .setMaxShardsPerNode(2)
+      .process(cluster.getSolrClient());
     
-    cluster.waitForActiveCollection(COLLECTION1, 2, 4);
+    cluster.waitForActiveCollection(COLLECTION, 2, 4);
     
-    CollectionAdminRequest.createCollection(COLLECTION2, "conf", 2, 2)
-        .setCreateNodeSet(jetty2.getNodeName()+","+jetty3.getNodeName())
-        .setAutoAddReplicas(false)
-        .setMaxShardsPerNode(2)
-        .process(cluster.getSolrClient());
+    // start the tests
+    JettySolrRunner lostJetty = random().nextBoolean() ? jetty1 : jetty2;
+    String lostNodeName = lostJetty.getNodeName();
+    List<Replica> replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION, zkStateReader, lostNodeName);
+    log.info("Stopping random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+    lostJetty.stop();
     
-    cluster.waitForActiveCollection(COLLECTION2, 2, 4);
+    cluster.waitForJettyToStop(lostJetty);
+    waitForNodeLeave(lostNodeName);
     
-    // the number of cores in jetty1 (5) will be larger than jetty3 (1)
-    CollectionAdminRequest.createCollection("testSimple3", "conf", 3, 1)
-        .setCreateNodeSet(jetty1.getNodeName())
-        .setAutoAddReplicas(false)
-        .setMaxShardsPerNode(3)
-        .process(cluster.getSolrClient());
+    waitForState(COLLECTION + "=(2,4) w/o down replicas",
+                 COLLECTION, clusterShapeNoDownReplicas(2,4), 90, TimeUnit.SECONDS);
+                 
+    checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION);
+    
+    log.info("Re-starting (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+    lostJetty.start();
+    
+    waitForNodeLive(lostJetty);
+    
+    assertTrue("Timeout waiting for all live and active",
+               ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 90000));
+
+  }
 
-    cluster.waitForActiveCollection("testSimple3", 3, 3);
+  /**
+   * Test that basic autoAddReplicas logic is <b>not</b> used if the cluster prop for it is disabled
+   * (even if the cluster prop is set after the collection is created)
+   */
+  @Test
+  public void testClusterPropOverridesCollectionProp() throws Exception {
+    final String COLLECTION = "test_clusterprop";
+    final ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+    final JettySolrRunner jetty1 = cluster.getJettySolrRunner(1);
+    final JettySolrRunner jetty2 = cluster.getJettySolrRunner(2);
+
+    log.info("Creating {} using jetty1:{}/{} and jetty2:{}/{}", COLLECTION,
+             jetty1.getNodeName(), jetty1.getLocalPort(),
+             jetty2.getNodeName(), jetty2.getLocalPort());
+             
+    CollectionAdminRequest.createCollection(COLLECTION, "conf", 2, 2)
+      .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
+      .setAutoAddReplicas(true)
+      .setMaxShardsPerNode(2)
+      .process(cluster.getSolrClient());
     
-    ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+    cluster.waitForActiveCollection(COLLECTION, 2, 4);
 
-    // start the tests
-    JettySolrRunner lostJetty = random().nextBoolean() ? cluster.getJettySolrRunner(0) : cluster.getJettySolrRunner(1);
+    // check cluster property is considered
+    disableAutoAddReplicasInCluster();
+
+    JettySolrRunner lostJetty = random().nextBoolean() ? jetty1 : jetty2;
     String lostNodeName = lostJetty.getNodeName();
-    List<Replica> replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION1, zkStateReader, lostNodeName);
+    List<Replica> replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION, zkStateReader, lostNodeName);
+    
+    log.info("Stopping random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
     lostJetty.stop();
     
     cluster.waitForJettyToStop(lostJetty);
     
     waitForNodeLeave(lostNodeName);
     
-    // ensure that 2 shards have 2 active replicas and only 4 replicas in total
-    // i.e. old replicas have been deleted.
-    // todo remove the condition for total replicas == 4 after SOLR-11591 is fixed
-    waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, (liveNodes, collectionState) -> clusterShape(2, 4).matches(liveNodes, collectionState)
-        && collectionState.getReplicas().size() == 4, 90, TimeUnit.SECONDS);
-    checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION1);
+    waitForState(COLLECTION + "=(2,2)", COLLECTION,
+                 clusterShape(2, 2), 90, TimeUnit.SECONDS);
+                 
+    
+    log.info("Re-starting (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
     lostJetty.start();
     
-    cluster.waitForAllNodes(30);
+    waitForNodeLive(lostJetty);
+    
+    assertTrue("Timeout waiting for all live and active",
+               ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 90000));
     
-    assertTrue("Timeout waiting for all live and active", ClusterStateUtil.waitForAllActiveAndLiveReplicas(cluster.getSolrClient().getZkStateReader(), 90000));
+    waitForState(COLLECTION + "=(2,4) w/o down replicas",
+                 COLLECTION, clusterShapeNoDownReplicas(2,4), 90, TimeUnit.SECONDS);
 
-    // check cluster property is considered
-    disableAutoAddReplicasInCluster();
-    lostNodeName = jetty3.getNodeName();
-    jetty3.stop();
+  }
+
+  /**
+   * Test that we can modify a collection after creation to add autoAddReplicas.
+   */
+  @Test
+  public void testAddCollectionPropAfterCreation() throws Exception {
+    final String COLLECTION = "test_addprop";
+    final ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+    final JettySolrRunner jetty1 = cluster.getJettySolrRunner(1);
+    final JettySolrRunner jetty2 = cluster.getJettySolrRunner(2);
+
+    log.info("Creating {} using jetty1:{}/{} and jetty2:{}/{}", COLLECTION,
+             jetty1.getNodeName(), jetty1.getLocalPort(),
+             jetty2.getNodeName(), jetty2.getLocalPort());
+             
+    CollectionAdminRequest.createCollection(COLLECTION, "conf", 2, 2)
+      .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
+      .setAutoAddReplicas(false) // NOTE: false
+      .setMaxShardsPerNode(2)
+      .process(cluster.getSolrClient());
     
-    cluster.waitForJettyToStop(jetty3);
+    cluster.waitForActiveCollection(COLLECTION, 2, 4);
+    
+    log.info("Modifying {} to use autoAddReplicas", COLLECTION);
+    new CollectionAdminRequest.AsyncCollectionAdminRequest(CollectionParams.CollectionAction.MODIFYCOLLECTION) {
+      @Override
+      public SolrParams getParams() {
+        ModifiableSolrParams params = (ModifiableSolrParams) super.getParams();
+        params.set("collection", COLLECTION);
+        params.set("autoAddReplicas", true);
+        return params;
+      }
+    }.process(cluster.getSolrClient());
+
+    JettySolrRunner lostJetty = random().nextBoolean() ? jetty1 : jetty2;
+    String lostNodeName = lostJetty.getNodeName();
+    List<Replica> replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION, zkStateReader, lostNodeName);
+
+    log.info("Stopping random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+    lostJetty.stop();
+    
+    cluster.waitForJettyToStop(lostJetty);
     
     waitForNodeLeave(lostNodeName);
+
+    waitForState(COLLECTION + "=(2,4) w/o down replicas",
+                 COLLECTION, clusterShapeNoDownReplicas(2,4), 90, TimeUnit.SECONDS);
+    checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION);
+    
+    log.info("Re-starting (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+    lostJetty.start();
+    
+    waitForNodeLive(lostJetty);
+    
+    assertTrue("Timeout waiting for all live and active",
+               ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 90000));
+  }
+
+  /**
+   * Test a specific sequence of problematic events:
+   * <ul>
+   *  <li>create a collection with autoAddReplicas=<b>false</b></li>
+   *  <li>stop a nodeX in use by the collection</li>
+   *  <li>re-start nodeX</li>
+   *  <li>set autoAddReplicas=<b>true</b></li>
+   *  <li>re-stop nodeX</li>
+   * </ul>
+   */
+  @Test
+  @AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13811")
+  public void testRapidStopStartStopWithPropChange() throws Exception {
+
+    // This is the collection we'll be focused on in our testing...
+    final String COLLECTION = "test_stoptwice";
+    // This is a collection we'll use as a "marker" to ensure we "wait" for the
+    // autoAddReplicas logic (via NodeLostTrigger) to kick in at least once before proceeding...
+    final String ALT_COLLECTION = "test_dummy";
     
-    waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, clusterShape(2, 2));
-    jetty3.start();
-    waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, clusterShape(2, 4));
-    waitForState("Waiting for collection " + COLLECTION2, COLLECTION2, clusterShape(2, 4));
-    enableAutoAddReplicasInCluster();
+    final ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+    final JettySolrRunner jetty1 = cluster.getJettySolrRunner(1);
+    final JettySolrRunner jetty2 = cluster.getJettySolrRunner(2);
 
+    log.info("Creating {} using jetty1:{}/{} and jetty2:{}/{}", COLLECTION,
+             jetty1.getNodeName(), jetty1.getLocalPort(),
+             jetty2.getNodeName(), jetty2.getLocalPort());
+             
+    CollectionAdminRequest.createCollection(COLLECTION, "conf", 2, 2)
+      .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
+      .setAutoAddReplicas(false) // NOTE: false
+      .setMaxShardsPerNode(2)
+      .process(cluster.getSolrClient());
+    
+    log.info("Creating {} using jetty1:{}/{} and jetty2:{}/{}", ALT_COLLECTION,
+             jetty1.getNodeName(), jetty1.getLocalPort(),
+             jetty2.getNodeName(), jetty2.getLocalPort());
+             
+    CollectionAdminRequest.createCollection(ALT_COLLECTION, "conf", 2, 2)
+      .setCreateNodeSet(jetty1.getNodeName()+","+jetty2.getNodeName())
+      .setAutoAddReplicas(true) // NOTE: true
+      .setMaxShardsPerNode(2)
+      .process(cluster.getSolrClient());
+    
+    cluster.waitForActiveCollection(COLLECTION, 2, 4);
+    cluster.waitForActiveCollection(ALT_COLLECTION, 2, 4);
 
-    // test for multiple collections
+    JettySolrRunner lostJetty = random().nextBoolean() ? jetty1 : jetty2;
+    String lostNodeName = lostJetty.getNodeName();
+    List<Replica> replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION, zkStateReader, lostNodeName);
+
+    log.info("Stopping random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+    lostJetty.stop();
+    
+    cluster.waitForJettyToStop(lostJetty);
+    waitForNodeLeave(lostNodeName);
+    
+    // ensure that our marker collection indicates that the autoAddReplicas logic
+    // has detected the down node and done some processing
+    waitForState(ALT_COLLECTION + "=(2,4) w/o down replicas",
+                 ALT_COLLECTION, clusterShapeNoDownReplicas(2,4), 90, TimeUnit.SECONDS);
+
+    waitForState(COLLECTION + "=(2,2)", COLLECTION, clusterShape(2, 2));
+    
+    log.info("Re-starting (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+    lostJetty.start();
+    // save time, don't bother waiting for lostJetty to start until after updating collection prop...
+    
+    log.info("Modifying {} to use autoAddReplicas", COLLECTION);
     new CollectionAdminRequest.AsyncCollectionAdminRequest(CollectionParams.CollectionAction.MODIFYCOLLECTION) {
       @Override
       public SolrParams getParams() {
         ModifiableSolrParams params = (ModifiableSolrParams) super.getParams();
-        params.set("collection", COLLECTION2);
+        params.set("collection", COLLECTION);
         params.set("autoAddReplicas", true);
         return params;
       }
     }.process(cluster.getSolrClient());
 
-    lostNodeName = jetty2.getNodeName();
-    replacedHdfsReplicas = getReplacedSharedFsReplicas(COLLECTION2, zkStateReader, lostNodeName);
-    
-    jetty2.stop();
-    
-    cluster.waitForJettyToStop(jetty2);
+    // make sure lostJetty is fully up before stopping again...
+    waitForNodeLive(lostJetty);
+
+    log.info("Re-Stopping (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+    lostJetty.stop();
     
+    cluster.waitForJettyToStop(lostJetty);
     waitForNodeLeave(lostNodeName);
-    waitForState("Waiting for collection " + COLLECTION1, COLLECTION1, clusterShape(2, 4), 45, TimeUnit.SECONDS);
-    waitForState("Waiting for collection " + COLLECTION2, COLLECTION2, clusterShape(2, 4), 45, TimeUnit.SECONDS);
-    checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION2);
 
-    // overseer failover test..
+    // TODO: this is the problematic situation...
+    // whether or not NodeLostTrigger noticed that lostJetty was re-started and shut down *again*,
+    // and that the new autoAddReplicas=true (set since the last time lostJetty was shut down) is respected
+    waitForState(COLLECTION + "=(2,4) w/o down replicas",
+                 COLLECTION, clusterShapeNoDownReplicas(2,4), 90, TimeUnit.SECONDS);
+    checkSharedFsReplicasMovedCorrectly(replacedHdfsReplicas, zkStateReader, COLLECTION);
+    
+    log.info("Re-Re-starting (same) random node: {} / {}", lostNodeName, lostJetty.getLocalPort());
+    lostJetty.start();
+    
+    waitForNodeLive(lostJetty);
+    
+    assertTrue("Timeout waiting for all live and active",
+               ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 90000));
   }
-
+  
   private void disableAutoAddReplicasInCluster() throws SolrServerException, IOException {
     Map m = makeMap(
         "action", CollectionParams.CollectionAction.CLUSTERPROP.toLower(),
@@ -225,13 +392,44 @@ public class AutoAddReplicasIntegrationTest extends SolrCloudTestCase {
     return replacedHdfsReplicas;
   }
 
-  private void waitForNodeLeave(String lostNodeName) throws InterruptedException {
+  /** 
+   * {@link MiniSolrCloudCluster#waitForNode} doesn't check isRunning first, and we don't want to
+   * use {@link MiniSolrCloudCluster#waitForAllNodes} because we don't want to waste cycles checking
+   * nodes we aren't messing with.
+   */
+  private void waitForNodeLive(final JettySolrRunner jetty)
+    throws InterruptedException, TimeoutException, IOException {
+    log.info("waitForNodeLive: {}/{}", jetty.getNodeName(), jetty.getLocalPort());
+    
+    TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+    while(!timeout.hasTimedOut()) {
+      if (jetty.isRunning()) {
+        break;
+      }
+      try {
+        Thread.sleep(100);
+      } catch (InterruptedException e) {
+        // ignore
+      }
+    }
+    if (timeout.hasTimedOut()) {
+      throw new TimeoutException("Waiting for Jetty to be running timed out");
+    }
+    cluster.waitForNode(jetty, 30);
+  }
+    
+  private void waitForNodeLeave(String lostNodeName) throws InterruptedException, TimeoutException {
     log.info("waitForNodeLeave: {}", lostNodeName);
     ZkStateReader reader = cluster.getSolrClient().getZkStateReader();
-    TimeOut timeOut = new TimeOut(20, TimeUnit.SECONDS, TimeSource.NANO_TIME);
-    while (reader.getClusterState().getLiveNodes().contains(lostNodeName)) {
-      Thread.sleep(100);
-      if (timeOut.hasTimedOut()) fail("Wait for " + lostNodeName + " to leave failed!");
-    }
+    reader.waitForLiveNodes(30, TimeUnit.SECONDS, (o, n) -> !n.contains(lostNodeName));
   }
+
+  
+  private static CollectionStatePredicate clusterShapeNoDownReplicas(final int expectedShards,
+                                                                     final int expectedReplicas) {
+    return (liveNodes, collectionState)
+      -> (clusterShape(expectedShards, expectedReplicas).matches(liveNodes, collectionState)
+          && collectionState.getReplicas().size() == expectedReplicas);
+  }
+  
 }
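
One technique in the diff worth calling out is the "marker collection" used by testRapidStopStartStopWithPropChange(): because NodeLostTrigger runs asynchronously, the test needs evidence that the trigger has processed the lost node at least once before it can meaningfully assert that the collection under test (which had autoAddReplicas=false at the time of the node loss) was left untouched. A second collection created with autoAddReplicas=true on the same two nodes provides that signal. The snippet below is a hedged restatement of the relevant waits from the test above, not additional commit code:

    // ALT_COLLECTION was created with autoAddReplicas=true, so once it is back to a full
    // (2,4) shape with no down replicas, the NodeLostTrigger must have fired for the lost node.
    waitForState(ALT_COLLECTION + "=(2,4) w/o down replicas",
                 ALT_COLLECTION, clusterShapeNoDownReplicas(2, 4), 90, TimeUnit.SECONDS);
    // Only now is it meaningful to assert that the autoAddReplicas=false collection kept
    // its reduced (2,2) shape rather than being repaired by the trigger.
    waitForState(COLLECTION + "=(2,2)", COLLECTION, clusterShape(2, 2));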