You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by kr...@apache.org on 2017/01/03 18:48:49 UTC

[38/50] lucene-solr:jira/solr-8593: SOLR-9906-Use better check to validate if node recovered via PeerSync or Replication

SOLR-9906-Use better check to validate if node recovered via PeerSync or Replication


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/3988532d
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/3988532d
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/3988532d

Branch: refs/heads/jira/solr-8593
Commit: 3988532d26a50b1f3cf51e1d0009a0754cfd6b57
Parents: 832d02b
Author: Noble Paul <no...@apache.org>
Authored: Tue Jan 3 17:52:08 2017 +1030
Committer: Noble Paul <no...@apache.org>
Committed: Tue Jan 3 17:52:08 2017 +1030

----------------------------------------------------------------------
 .../src/java/org/apache/solr/util/TimeOut.java  | 13 +++-
 .../cloud/LeaderFailureAfterFreshStartTest.java | 68 +++++++++++---------
 .../solr/cloud/PeerSyncReplicationTest.java     | 55 +++++++---------
 .../solr/cloud/AbstractDistribZkTestBase.java   | 32 ++++-----
 4 files changed, 89 insertions(+), 79 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3988532d/solr/core/src/java/org/apache/solr/util/TimeOut.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/util/TimeOut.java b/solr/core/src/java/org/apache/solr/util/TimeOut.java
index f823b7e..fd91045 100644
--- a/solr/core/src/java/org/apache/solr/util/TimeOut.java
+++ b/solr/core/src/java/org/apache/solr/util/TimeOut.java
@@ -18,12 +18,15 @@ package org.apache.solr.util;
 
 import java.util.concurrent.TimeUnit;
 
+import static java.util.concurrent.TimeUnit.NANOSECONDS;
+
 public class TimeOut {
 
-  private final long timeoutAt;
+  private final long timeoutAt, startTime;
 
   public TimeOut(long interval, TimeUnit unit) {
-    this.timeoutAt = System.nanoTime() + TimeUnit.NANOSECONDS.convert(interval, unit);
+    startTime = System.nanoTime();
+    this.timeoutAt = startTime + NANOSECONDS.convert(interval, unit);
   }
 
   public boolean hasTimedOut() {
@@ -31,6 +34,10 @@ public class TimeOut {
   }
 
   public long timeLeft(TimeUnit unit) {
-    return unit.convert(timeoutAt - System.nanoTime(), TimeUnit.NANOSECONDS);
+    return unit.convert(timeoutAt - System.nanoTime(), NANOSECONDS);
+  }
+
+  public long timeElapsed(TimeUnit unit) {
+    return unit.convert(System.nanoTime() - startTime, NANOSECONDS);
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3988532d/solr/core/src/test/org/apache/solr/cloud/LeaderFailureAfterFreshStartTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderFailureAfterFreshStartTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderFailureAfterFreshStartTest.java
index ef21386..77dd6b6 100644
--- a/solr/core/src/test/org/apache/solr/cloud/LeaderFailureAfterFreshStartTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/LeaderFailureAfterFreshStartTest.java
@@ -19,6 +19,8 @@ package org.apache.solr.cloud;
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
@@ -26,6 +28,7 @@ import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
 
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.lang.RandomStringUtils;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.apache.solr.client.solrj.SolrServerException;
@@ -38,12 +41,13 @@ import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.ModifiableSolrParams;
-import org.apache.solr.handler.ReplicationHandler;
+import org.apache.solr.util.TimeOut;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import static java.util.Collections.singletonList;
+import static java.util.concurrent.TimeUnit.SECONDS;
 
 /**
  * 
@@ -127,52 +131,53 @@ public class LeaderFailureAfterFreshStartTest extends AbstractFullDistribZkTestB
       waitForThingsToLevelOut(30);
 
       checkShardConsistency(false, true);
+      
+      // bring down the other node and index a few docs; so the leader and other node segments diverge
+      forceNodeFailures(singletonList(secondNode));
+      for (int i = 0; i < 10; i++) {
+        indexDoc(id, docId, i1, 50, tlong, 50, t1,
+            "document number " + docId++);
+        if(i % 2 == 0) {
+          commit();
+        }
+      }
+      commit();
+      restartNodes(singletonList(secondNode));
 
       // start the freshNode 
-      ChaosMonkey.start(freshNode.jetty);
-      nodesDown.remove(freshNode);
-
-      waitTillNodesActive();
-      waitForThingsToLevelOut(30);
-      
-      //TODO check how to see if fresh node went into recovery (may be check count for replication handler on new leader) 
-      
-      long numRequestsBefore = (Long) secondNode.jetty
-          .getCoreContainer()
-          .getCores()
-          .iterator()
-          .next()
-          .getRequestHandler(ReplicationHandler.PATH)
-          .getStatistics().get("requests");
+      restartNodes(singletonList(freshNode));
       
+      String replicationProperties = (String) freshNode.jetty.getSolrHome() + "/cores/" +  DEFAULT_TEST_COLLECTION_NAME + "/data/replication.properties";
+      String md5 = DigestUtils.md5Hex(Files.readAllBytes(Paths.get(replicationProperties)));
+        
       // shutdown the original leader
       log.info("Now shutting down initial leader");
       forceNodeFailures(singletonList(initialLeaderJetty));
-      waitForNewLeader(cloudClient, "shard1", (Replica)initialLeaderJetty.client.info  , 15);
+      waitForNewLeader(cloudClient, "shard1", (Replica)initialLeaderJetty.client.info  , new TimeOut(15, SECONDS));
+      waitTillNodesActive();
       log.info("Updating mappings from zk");
       updateMappingsFromZk(jettys, clients, true);
-      
-      long numRequestsAfter = (Long) secondNode.jetty
-          .getCoreContainer()
-          .getCores()
-          .iterator()
-          .next()
-          .getRequestHandler(ReplicationHandler.PATH)
-          .getStatistics().get("requests");
-
-      assertEquals("Node went into replication", numRequestsBefore, numRequestsAfter);
+      assertEquals("Node went into replication", md5, DigestUtils.md5Hex(Files.readAllBytes(Paths.get(replicationProperties))));
       
       success = true;
     } finally {
       System.clearProperty("solr.disableFingerprint");
     }
   }
+  
+  private void restartNodes(List<CloudJettyRunner> nodesToRestart) throws Exception {
+    for (CloudJettyRunner node : nodesToRestart) {
+      chaosMonkey.start(node.jetty);
+      nodesDown.remove(node);
+    }
+    waitTillNodesActive();
+    checkShardConsistency(false, true);
+  }
 
 
   private void forceNodeFailures(List<CloudJettyRunner> replicasToShutDown) throws Exception {
     for (CloudJettyRunner replicaToShutDown : replicasToShutDown) {
       chaosMonkey.killJetty(replicaToShutDown);
-      waitForNoShardInconsistency();
     }
 
     int totalDown = 0;
@@ -205,8 +210,13 @@ public class LeaderFailureAfterFreshStartTest extends AbstractFullDistribZkTestB
       Collection<Replica> replicas = slice.getReplicas();
       boolean allActive = true;
 
+      Collection<String> nodesDownNames = nodesDown.stream()
+          .map(n -> n.coreNodeName)
+          .collect(Collectors.toList());
+      
       Collection<Replica> replicasToCheck = null;
-      replicasToCheck = replicas.stream().filter(r -> nodesDown.contains(r.getName()))
+      replicasToCheck = replicas.stream()
+          .filter(r -> !nodesDownNames.contains(r.getName()))
           .collect(Collectors.toList());
 
       for (Replica replica : replicasToCheck) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3988532d/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java b/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
index e00ea3c..4084ad7 100644
--- a/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
@@ -20,6 +20,8 @@ package org.apache.solr.cloud;
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -41,15 +43,16 @@ import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.ModifiableSolrParams;
-import org.apache.solr.handler.ReplicationHandler;
+import org.apache.solr.util.TimeOut;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import static java.util.Collections.singletonList;
+import static java.util.concurrent.TimeUnit.SECONDS;
 
 /**
- * Test sync peer sync when a node restarts and documents are indexed when node was down.
+ * Test PeerSync when a node restarts and documents are indexed when node was down.
  *
  * This test is modeled after SyncSliceTest
  */
@@ -149,12 +152,12 @@ public class PeerSyncReplicationTest extends AbstractFullDistribZkTestBase {
       log.info("Now shutting down initial leader");
       forceNodeFailures(singletonList(initialLeaderJetty));
       log.info("Updating mappings from zk");
-      waitForNewLeader(cloudClient, "shard1", (Replica) initialLeaderJetty.client.info, 15);
+      waitForNewLeader(cloudClient, "shard1", (Replica) initialLeaderJetty.client.info, new TimeOut(15, SECONDS));
       updateMappingsFromZk(jettys, clients, true);
       assertEquals("PeerSynced node did not become leader", nodePeerSynced, shardToLeaderJetty.get("shard1"));
 
       // bring up node that was down all along, and let it PeerSync from the node that was forced to PeerSynce  
-      bringUpDeadNodeAndEnsureNoReplication(shardToLeaderJetty.get("shard1"), neverLeader, false);
+      bringUpDeadNodeAndEnsureNoReplication(neverLeader, false);
       waitTillNodesActive();
 
       checkShardConsistency(false, true);
@@ -199,7 +202,6 @@ public class PeerSyncReplicationTest extends AbstractFullDistribZkTestBase {
   private void forceNodeFailures(List<CloudJettyRunner> replicasToShutDown) throws Exception {
     for (CloudJettyRunner replicaToShutDown : replicasToShutDown) {
       chaosMonkey.killJetty(replicaToShutDown);
-      waitForNoShardInconsistency();
     }
 
     int totalDown = 0;
@@ -218,8 +220,6 @@ public class PeerSyncReplicationTest extends AbstractFullDistribZkTestBase {
     assertEquals(getShardCount() - totalDown, jetties.size());
 
     nodesDown.addAll(replicasToShutDown);
-
-    Thread.sleep(3000);
   }
   
   
@@ -241,26 +241,17 @@ public class PeerSyncReplicationTest extends AbstractFullDistribZkTestBase {
         "document number " + docId++);
     commit();
 
-    bringUpDeadNodeAndEnsureNoReplication(leaderJetty, replicaToShutDown, disableFingerprint);
+    bringUpDeadNodeAndEnsureNoReplication(replicaToShutDown, disableFingerprint);
 
     return replicaToShutDown;
   }
-  
-  
 
-  private void bringUpDeadNodeAndEnsureNoReplication(CloudJettyRunner leaderJetty, CloudJettyRunner nodeToBringUp,
-      boolean disableFingerprint) throws Exception {
+
+  private void bringUpDeadNodeAndEnsureNoReplication(CloudJettyRunner nodeToBringUp, boolean disableFingerprint)
+      throws Exception {
     // disable fingerprint check if needed
     System.setProperty("solr.disableFingerprint", String.valueOf(disableFingerprint));
 
-    long numRequestsBefore = (Long) leaderJetty.jetty
-        .getCoreContainer()
-        .getCores()
-        .iterator()
-        .next()
-        .getRequestHandler(ReplicationHandler.PATH)
-        .getStatistics().get("requests");
-
     indexInBackground(50);
     
     // bring back dead node and ensure it recovers
@@ -279,15 +270,9 @@ public class PeerSyncReplicationTest extends AbstractFullDistribZkTestBase {
     long cloudClientDocs = cloudClient.query(new SolrQuery("*:*")).getResults().getNumFound();
     assertEquals(docId, cloudClientDocs);
 
-    long numRequestsAfter = (Long) leaderJetty.jetty
-        .getCoreContainer()
-        .getCores()
-        .iterator()
-        .next()
-        .getRequestHandler(ReplicationHandler.PATH)
-        .getStatistics().get("requests");
-
-    assertEquals("PeerSync failed. Had to fail back to replication", numRequestsBefore, numRequestsAfter);
+    // if there was no replication, we should not have replication.properties file
+    String replicationProperties = nodeToBringUp.jetty.getSolrHome() + "/cores/" + DEFAULT_TEST_COLLECTION_NAME + "/data/replication.properties";
+    assertTrue("PeerSync failed. Had to fail back to replication", Files.notExists(Paths.get(replicationProperties)));
   }
 
   
@@ -302,9 +287,15 @@ public class PeerSyncReplicationTest extends AbstractFullDistribZkTestBase {
       Collection<Replica> replicas = slice.getReplicas();
       boolean allActive = true;
 
-      Collection<Replica> replicasToCheck = null;
-      replicasToCheck = replicas.stream().filter(r -> nodesDown.contains(r.getName()))
-            .collect(Collectors.toList());
+      Collection<String> nodesDownNames =
+          nodesDown.stream()
+              .map(n -> n.coreNodeName)
+              .collect(Collectors.toList());
+
+      Collection<Replica> replicasToCheck =
+          replicas.stream()
+              .filter(r -> !nodesDownNames.contains(r.getName()))
+              .collect(Collectors.toList());
 
       for (Replica replica : replicasToCheck) {
         if (!clusterState.liveNodesContain(replica.getNodeName()) || replica.getState() != Replica.State.ACTIVE) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3988532d/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java
----------------------------------------------------------------------
diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java
index d04d996..7f991a4 100644
--- a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java
+++ b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java
@@ -19,6 +19,7 @@ package org.apache.solr.cloud;
 import java.io.File;
 import java.lang.invoke.MethodHandles;
 import java.util.Map;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.commons.io.FileUtils;
@@ -29,16 +30,20 @@ import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.Slice.State;
 import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkStateReader;
-import org.apache.solr.common.cloud.Slice.State;
 import org.apache.solr.core.Diagnostics;
 import org.apache.solr.core.MockDirectoryFactory;
+import org.apache.solr.util.TimeOut;
 import org.apache.zookeeper.KeeperException;
 import org.junit.BeforeClass;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static java.util.concurrent.TimeUnit.MILLISECONDS;
+import static java.util.concurrent.TimeUnit.SECONDS;
+
 public abstract class AbstractDistribZkTestBase extends BaseDistributedSearchTestCase {
   
   private static final String REMOVE_VERSION_FIELD = "remove.version.field";
@@ -226,31 +231,28 @@ public abstract class AbstractDistribZkTestBase extends BaseDistributedSearchTes
     log.info("Collection has disappeared - collection: " + collection);
   }
   
-  static void waitForNewLeader(CloudSolrClient cloudClient, String shardName, Replica oldLeader, int maxWaitInSecs)
+  static void waitForNewLeader(CloudSolrClient cloudClient, String shardName, Replica oldLeader, TimeOut timeOut)
       throws Exception {
-    log.info("Will wait for a node to become leader for {} secs", maxWaitInSecs);
-    boolean waitForLeader = true;
-    int i = 0;
+    log.info("Will wait for a node to become leader for {} secs", timeOut.timeLeft(SECONDS));
     ZkStateReader zkStateReader = cloudClient.getZkStateReader();
     zkStateReader.forceUpdateCollection(DEFAULT_COLLECTION);
-    
-    while(waitForLeader) {
+
+    for (; ; ) {
       ClusterState clusterState = zkStateReader.getClusterState();
       DocCollection coll = clusterState.getCollection("collection1");
       Slice slice = coll.getSlice(shardName);
-      if(slice.getLeader() != oldLeader && slice.getState() == State.ACTIVE) {
-        log.info("New leader got elected in {} secs", i);
+      if (slice.getLeader() != null && !slice.getLeader().equals(oldLeader) && slice.getState() == State.ACTIVE) {
+        log.info("Old leader {}, new leader. New leader got elected in {} ms", oldLeader, slice.getLeader(),timeOut.timeElapsed(MILLISECONDS) );
         break;
       }
-      
-      if(i == maxWaitInSecs) {
+
+      if (timeOut.hasTimedOut()) {
         Diagnostics.logThreadDumps("Could not find new leader in specified timeout");
         zkStateReader.getZkClient().printLayoutToStdOut();
-        fail("Could not find new leader even after waiting for " + maxWaitInSecs + "secs");
+        fail("Could not find new leader even after waiting for " + timeOut.timeElapsed(MILLISECONDS) + "ms");
       }
-      
-      i++;
-      Thread.sleep(1000);
+
+      Thread.sleep(100);
     }
   }