You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/01/23 08:19:46 UTC

[2/2] lucene-solr:jira/solr-11702: SOLR-11702: Adding testing for doing recovery on restart

SOLR-11702: Adding testing for doing recovery on restart


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/bea9f650
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/bea9f650
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/bea9f650

Branch: refs/heads/jira/solr-11702
Commit: bea9f6501366d604328f22de375011af4a45d3d9
Parents: 7b055f4
Author: Cao Manh Dat <da...@apache.org>
Authored: Tue Jan 23 15:19:16 2018 +0700
Committer: Cao Manh Dat <da...@apache.org>
Committed: Tue Jan 23 15:19:16 2018 +0700

----------------------------------------------------------------------
 .../apache/solr/cloud/HttpPartitionTest.java    | 73 +++++++++++++++++---
 1 file changed, 65 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bea9f650/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java
index 401cb9b..8837ed4 100644
--- a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java
@@ -62,6 +62,9 @@ import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static org.apache.solr.common.cloud.Replica.State.DOWN;
+import static org.apache.solr.common.cloud.Replica.State.RECOVERING;
+
 /**
  * Simulates HTTP partitions between a leader and replica but the replica does
  * not lose its ZooKeeper connection.
@@ -125,6 +128,8 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
 
     testLeaderInitiatedRecoveryCRUD();
 
+    testDoRecoveryOnRestart();
+
     // Tests that if we set a minRf that's not satisfied, no recovery is requested, but if minRf is satisfied,
     // recovery is requested
     testMinRf();
@@ -188,10 +193,10 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
       }
     };
 
-    zkController.updateLeaderInitiatedRecoveryState(testCollectionName, shardId, notLeader.getName(), Replica.State.DOWN, cd, true);
+    zkController.updateLeaderInitiatedRecoveryState(testCollectionName, shardId, notLeader.getName(), DOWN, cd, true);
     Map<String,Object> lirStateMap = zkController.getLeaderInitiatedRecoveryStateObject(testCollectionName, shardId, notLeader.getName());
     assertNotNull(lirStateMap);
-    assertSame(Replica.State.DOWN, Replica.State.getState((String) lirStateMap.get(ZkStateReader.STATE_PROP)));
+    assertSame(DOWN, Replica.State.getState((String) lirStateMap.get(ZkStateReader.STATE_PROP)));
 
     // test old non-json format handling
     SolrZkClient zkClient = zkController.getZkClient();
@@ -199,13 +204,65 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
     zkClient.setData(znodePath, "down".getBytes(StandardCharsets.UTF_8), true);
     lirStateMap = zkController.getLeaderInitiatedRecoveryStateObject(testCollectionName, shardId, notLeader.getName());
     assertNotNull(lirStateMap);
-    assertSame(Replica.State.DOWN, Replica.State.getState((String) lirStateMap.get(ZkStateReader.STATE_PROP)));
+    assertSame(DOWN, Replica.State.getState((String) lirStateMap.get(ZkStateReader.STATE_PROP)));
     zkClient.delete(znodePath, -1, false);
 
     // try to clean up
     attemptCollectionDelete(cloudClient, testCollectionName);
   }
 
+  private void testDoRecoveryOnRestart() throws Exception { // checks a replica held in RECOVERING goes DOWN when its Jetty stops and is active again after restart
+    String testCollectionName = "collDoRecoveryOnRestart";
+    try {
+      // Inject a pause (Integer.MAX_VALUE) into the recovery op, so the replica won't be able to finish recovery
+      System.setProperty("solr.cloud.wait-for-updates-with-stale-state-pause", String.valueOf(Integer.MAX_VALUE));
+
+      createCollection(testCollectionName, "conf1", 1, 2, 1);
+      cloudClient.setDefaultCollection(testCollectionName);
+
+      sendDoc(1, 2);
+
+      JettySolrRunner leaderJetty = getJettyOnPort(getReplicaPort(getShardLeader(testCollectionName, "shard1", 1000)));
+      List<Replica> notLeaders =
+          ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 2, maxWaitSecsToSeeAllActive);
+      assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, 1);
+
+      SocketProxy proxy0 = getProxyForReplica(notLeaders.get(0));
+      SocketProxy leaderProxy = getProxyForReplica(getShardLeader(testCollectionName, "shard1", 1000));
+
+      proxy0.close();
+      leaderProxy.close();
+
+      // index doc 2 via the leader while both proxies are closed; the achieved replication factor must be 1
+      int achievedRf = sendDoc(2, 1, leaderJetty);
+      assertEquals("Unexpected achieved replication factor", 1, achievedRf);
+      try (ZkShardTerms zkShardTerms = new ZkShardTerms(testCollectionName, "shard1", cloudClient.getZkStateReader().getZkClient())) {
+        assertFalse(zkShardTerms.canBecomeLeader(notLeaders.get(0).getName())); // the replica that missed the update must not be eligible for leadership
+      }
+      waitForState(testCollectionName, notLeaders.get(0).getName(), DOWN, 10000);
+
+      // heal the partition by reopening both proxies
+      proxy0.reopen();
+      leaderProxy.reopen();
+
+      waitForState(testCollectionName, notLeaders.get(0).getName(), RECOVERING, 10000);
+
+      System.clearProperty("solr.cloud.wait-for-updates-with-stale-state-pause"); // remove the injected pause before restarting the replica's node
+      JettySolrRunner notLeaderJetty = getJettyOnPort(getReplicaPort(notLeaders.get(0)));
+      notLeaderJetty.stop();
+      // DOWNNODE will bring the replica into DOWN state
+      waitForState(testCollectionName, notLeaders.get(0).getName(), DOWN, 10000);
+      notLeaderJetty.start();
+      ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 2, 100);
+      assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, 2);
+    } finally {
+      System.clearProperty("solr.cloud.wait-for-updates-with-stale-state-pause"); // also clear the property when the test fails before the clear above
+    }
+
+    // try to clean up the test collection
+    attemptCollectionDelete(cloudClient, testCollectionName);
+  }
+
   protected void testMinRf() throws Exception {
     // create a collection that has 1 shard and 3 replicas
     String testCollectionName = "collMinRf_1x3";
@@ -314,7 +371,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
     // indexing during a partition
     sendDoc(2, null, leaderJetty);
     // replica should publish itself as DOWN if the network is not healed after some amount time
-    waitForDownState(testCollectionName, notLeader.getName(), 10000);
+    waitForState(testCollectionName, notLeader.getName(), DOWN, 10000);
     
     proxy.reopen();
     leaderProxy.reopen();
@@ -394,7 +451,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
     attemptCollectionDelete(cloudClient, testCollectionName);
   }
 
-  private void waitForDownState(String collection, String replicaName, long ms) throws KeeperException, InterruptedException {
+  private void waitForState(String collection, String replicaName, Replica.State state, long ms) throws KeeperException, InterruptedException {
     TimeOut timeOut = new TimeOut(ms, TimeUnit.MILLISECONDS, TimeSource.CURRENT_TIME);
     Replica.State replicaState = Replica.State.ACTIVE;
     while (!timeOut.hasTimedOut()) {
@@ -405,10 +462,10 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
       Slice slice = slices.iterator().next();
       Replica partitionedReplica = slice.getReplica(replicaName);
       replicaState = partitionedReplica.getState();
-      if (replicaState == Replica.State.DOWN) return;
+      if (replicaState == state) return;
     }
-    assertEquals("The partitioned replica did not published it self as down",
-        Replica.State.DOWN, replicaState);
+    assertEquals("Timeout waiting for state "+ state +" of replica " + replicaName + ", current state " + replicaState,
+        state, replicaState);
   }
 
   protected void testRf3() throws Exception {