You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/02/06 17:24:06 UTC

[12/15] lucene-solr git commit: add more non-stress test cases

add more non-stress test cases


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/b1b2c799
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/b1b2c799
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/b1b2c799

Branch: refs/heads/jira/lucene-5438-nrt-replication
Commit: b1b2c799aa1d6598daa4ea8c63a5fa5484b5052d
Parents: 82ecccf
Author: Mike McCandless <mi...@apache.org>
Authored: Sat Feb 6 09:04:15 2016 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Sat Feb 6 09:04:15 2016 -0500

----------------------------------------------------------------------
 .../org/apache/lucene/replicator/nrt/Node.java  |   3 +-
 .../lucene/replicator/nrt/ReplicaNode.java      |   2 +
 .../lucene/replicator/nrt/NodeProcess.java      |  17 +-
 .../replicator/nrt/SimplePrimaryNode.java       |  20 +-
 .../replicator/nrt/SimpleReplicaNode.java       |  12 +-
 .../lucene/replicator/nrt/SimpleServer.java     |   5 +-
 .../replicator/nrt/TestNRTReplication.java      | 717 +++++++++++++++----
 .../nrt/TestStressNRTReplication.java           |  13 +-
 8 files changed, 640 insertions(+), 149 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b1b2c799/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/Node.java
----------------------------------------------------------------------
diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/Node.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/Node.java
index 742b19f..e54c01e 100644
--- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/Node.java
+++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/Node.java
@@ -49,7 +49,8 @@ import org.apache.lucene.util.StringHelper;
 abstract class Node implements Closeable {
 
   static boolean VERBOSE_FILES = true;
-  static boolean VERBOSE_CONNECTIONS = false;
+  // nocommit
+  static boolean VERBOSE_CONNECTIONS = true;
 
   // Keys we store into IndexWriter's commit user data:
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b1b2c799/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java
----------------------------------------------------------------------
diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java
index c7af429..133992f 100644
--- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java
+++ b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/ReplicaNode.java
@@ -86,6 +86,8 @@ abstract class ReplicaNode extends Node {
 
       // Obtain a write lock on this index since we "act like" an IndexWriter, to prevent any other IndexWriter or ReplicaNode from using it:
       writeFileLock = dir.obtainLock(IndexWriter.WRITE_LOCK_NAME);
+      
+      // nocommit must check for no pending deletes here, like IW does
 
       state = "init";
       deleter = new ReplicaFileDeleter(this, dir);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b1b2c799/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/NodeProcess.java
----------------------------------------------------------------------
diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/NodeProcess.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/NodeProcess.java
index be0b3df3..9d8b764 100644
--- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/NodeProcess.java
+++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/NodeProcess.java
@@ -77,6 +77,7 @@ class NodeProcess implements Closeable {
       isOpen = false;
       p.destroy();
       try {
+        p.waitFor();
         pumper.join();
       } catch (InterruptedException ie) {
         Thread.currentThread().interrupt();
@@ -95,6 +96,7 @@ class NodeProcess implements Closeable {
       }
       return true;
     } catch (Throwable t) {
+      // nocommit throw this
       // Something wrong with this replica; skip it:
       System.out.println("PARENT: top: hit SocketException during commit with R" + id + ": " + t + "; skipping");
       return false;
@@ -106,6 +108,7 @@ class NodeProcess implements Closeable {
       c.out.writeByte(SimplePrimaryNode.CMD_COMMIT);
       c.flush();
     } catch (Throwable t) {
+      // nocommit throw this
       // Something wrong with this replica; skip it:
       System.out.println("PARENT: top: hit SocketException during commit with R" + id + ": " + t + "; skipping");
     }
@@ -118,6 +121,7 @@ class NodeProcess implements Closeable {
       c.s.shutdownOutput();
       return c.in.readVLong();
     } catch (Throwable t) {
+      // nocommit throw this
       // Something wrong with this replica; skip it:
       System.out.println("PARENT: top: hit SocketException during getSearchingVersion with R" + id + "; skipping");
       return -1L;
@@ -145,11 +149,11 @@ class NodeProcess implements Closeable {
   public synchronized boolean shutdown() {
     lock.lock();
     try {
-      System.out.println("PARENT: now shutdown node=" + id + " isOpen=" + isOpen);
+      //System.out.println("PARENT: now shutdown node=" + id + " isOpen=" + isOpen);
       if (isOpen) {
         // Ask the child process to shutdown gracefully:
         isOpen = false;
-        System.out.println("PARENT: send CMD_CLOSE to node=" + id);
+        //System.out.println("PARENT: send CMD_CLOSE to node=" + id);
         try (Connection c = new Connection(tcpPort)) {
           c.out.writeByte(SimplePrimaryNode.CMD_CLOSE);
           c.flush();
@@ -174,6 +178,15 @@ class NodeProcess implements Closeable {
     }
   }
 
+  public void newNRTPoint(long version, int primaryTCPPort) throws IOException {
+    try (Connection c = new Connection(tcpPort)) {
+      c.out.writeByte(SimpleReplicaNode.CMD_NEW_NRT_POINT);
+      c.out.writeVLong(version);
+      c.out.writeInt(primaryTCPPort);
+      c.flush();
+    }
+  }
+
   public void addOrUpdateDocument(Connection c, Document doc, boolean isUpdate) throws IOException {
     if (isPrimary == false) {
       throw new IllegalStateException("only primary can index");

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b1b2c799/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java
----------------------------------------------------------------------
diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java
index b9ecced..7f5634c 100644
--- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java
+++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java
@@ -76,8 +76,8 @@ class SimplePrimaryNode extends PrimaryNode {
   final Random random;
 
   // These are updated by parent test process whenever replicas change:
-  int[] replicaTCPPorts;
-  int[] replicaIDs;
+  int[] replicaTCPPorts = new int[0];
+  int[] replicaIDs = new int[0];
 
   // So we only flip a bit once per file name:
   final Set<String> bitFlipped = Collections.synchronizedSet(new HashSet<>());
@@ -115,8 +115,8 @@ class SimplePrimaryNode extends PrimaryNode {
   }
 
   public SimplePrimaryNode(Random random, Path indexPath, int id, int tcpPort, long primaryGen, long forcePrimaryVersion, SearcherFactory searcherFactory,
-                           boolean doFlipBitsDuringCopy) throws IOException {
-    super(initWriter(id, random, indexPath), id, primaryGen, forcePrimaryVersion, searcherFactory);
+                           boolean doFlipBitsDuringCopy, boolean doCheckIndexOnClose) throws IOException {
+    super(initWriter(id, random, indexPath, doCheckIndexOnClose), id, primaryGen, forcePrimaryVersion, searcherFactory);
     this.tcpPort = tcpPort;
     this.random = new Random(random.nextLong());
     this.doFlipBitsDuringCopy = doFlipBitsDuringCopy;
@@ -129,8 +129,8 @@ class SimplePrimaryNode extends PrimaryNode {
     this.replicaTCPPorts = replicaTCPPorts;
   }
 
-  private static IndexWriter initWriter(int id, Random random, Path indexPath) throws IOException {
-    Directory dir = SimpleReplicaNode.getDirectory(random, id, indexPath);
+  private static IndexWriter initWriter(int id, Random random, Path indexPath, boolean doCheckIndexOnClose) throws IOException {
+    Directory dir = SimpleReplicaNode.getDirectory(random, id, indexPath, doCheckIndexOnClose);
 
     MockAnalyzer analyzer = new MockAnalyzer(random);
     analyzer.setMaxTokenLength(TestUtil.nextInt(random, 1, IndexWriter.MAX_TERM_LENGTH));
@@ -599,13 +599,15 @@ class SimplePrimaryNode extends PrimaryNode {
           IndexSearcher searcher = mgr.acquire();
           try {
             long version = ((DirectoryReader) searcher.getIndexReader()).getVersion();
-            int hitCount = searcher.search(new TermQuery(new Term("body", "the")), 1).totalHits;
+            int hitCount = searcher.count(new TermQuery(new Term("body", "the")));
             //message("version=" + version + " searcher=" + searcher);
             out.writeVLong(version);
             out.writeVInt(hitCount);
+            bos.flush();
           } finally {
             mgr.release(searcher);
           }
+          bos.flush();
         }
         continue outer;
 
@@ -615,10 +617,11 @@ class SimplePrimaryNode extends PrimaryNode {
           IndexSearcher searcher = mgr.acquire();
           try {
             long version = ((DirectoryReader) searcher.getIndexReader()).getVersion();
-            int hitCount = searcher.search(new MatchAllDocsQuery(), 1).totalHits;
+            int hitCount = searcher.count(new MatchAllDocsQuery());
             //message("version=" + version + " searcher=" + searcher);
             out.writeVLong(version);
             out.writeVInt(hitCount);
+            bos.flush();
           } finally {
             mgr.release(searcher);
           }
@@ -630,6 +633,7 @@ class SimplePrimaryNode extends PrimaryNode {
           Thread.currentThread().setName("msearch");
           int expectedAtLeastCount = in.readVInt();
           verifyAtLeastMarkerCount(expectedAtLeastCount, out);
+          bos.flush();
         }
         continue outer;
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b1b2c799/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java
----------------------------------------------------------------------
diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java
index bc8bb03..2510c40 100644
--- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java
+++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleReplicaNode.java
@@ -64,8 +64,8 @@ class SimpleReplicaNode extends ReplicaNode {
   /** Changes over time, as primary node crashes and moves around */
   int curPrimaryTCPPort;
 
-  public SimpleReplicaNode(Random random, int id, int tcpPort, Path indexPath, long curPrimaryGen, int primaryTCPPort, SearcherFactory searcherFactory) throws IOException {
-    super(id, getDirectory(random, id, indexPath), searcherFactory);
+  public SimpleReplicaNode(Random random, int id, int tcpPort, Path indexPath, long curPrimaryGen, int primaryTCPPort, SearcherFactory searcherFactory, boolean doCheckIndexOnClose) throws IOException {
+    super(id, getDirectory(random, id, indexPath, doCheckIndexOnClose), searcherFactory);
     this.tcpPort = tcpPort;
     this.random = new Random(random.nextLong());
 
@@ -131,13 +131,13 @@ class SimpleReplicaNode extends ReplicaNode {
     return new SimpleCopyJob(reason, c, copyState, this, files, highPriority, onceDone);
   }
 
-  static Directory getDirectory(Random random, int id, Path path) throws IOException {
+  static Directory getDirectory(Random random, int id, Path path, boolean doCheckIndexOnClose) throws IOException {
     MockDirectoryWrapper dir = LuceneTestCase.newMockFSDirectory(path);
     
     dir.setAssertNoUnrefencedFilesOnClose(true);
-    // This is very costly (takes more time to check than it did to index); we do this ourselves in the end instead of each time a replica
-    // is restarted:
-    dir.setCheckIndexOnClose(false);
+    if (doCheckIndexOnClose) {
+      dir.setCheckIndexOnClose(false);
+    }
 
     // Corrupt any index files not referenced by current commit point; this is important (increases test evilness) because we may have done
     // a hard crash of the previous JVM writing to this directory and so MDW's corrupt-unknown-files-on-close never ran:

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b1b2c799/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleServer.java
----------------------------------------------------------------------
diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleServer.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleServer.java
index 3fdc45f..72e33d7 100644
--- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleServer.java
+++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimpleServer.java
@@ -238,6 +238,7 @@ public class SimpleServer extends LuceneTestCase {
     boolean doRandomCrash = "true".equals(System.getProperty("tests.nrtreplication.doRandomCrash"));
     boolean doRandomClose = "true".equals(System.getProperty("tests.nrtreplication.doRandomClose"));
     boolean doFlipBitsDuringCopy = "true".equals(System.getProperty("tests.nrtreplication.doFlipBitsDuringCopy"));
+    boolean doCheckIndexOnClose = "true".equals(System.getProperty("tests.nrtreplication.checkonclose"));
 
     // Create server socket that we listen for incoming requests on:
     try (final ServerSocket ss = new ServerSocket(0, 0, InetAddress.getLoopbackAddress())) {
@@ -246,11 +247,11 @@ public class SimpleServer extends LuceneTestCase {
       System.out.println("\nPORT: " + tcpPort);
       final Node node;
       if (isPrimary) {
-        node = new SimplePrimaryNode(random(), indexPath, id, tcpPort, primaryGen, forcePrimaryVersion, null, doFlipBitsDuringCopy);
+        node = new SimplePrimaryNode(random(), indexPath, id, tcpPort, primaryGen, forcePrimaryVersion, null, doFlipBitsDuringCopy, doCheckIndexOnClose);
         System.out.println("\nCOMMIT VERSION: " + ((PrimaryNode) node).getLastCommitVersion());
       } else {
         try {
-          node = new SimpleReplicaNode(random(), id, tcpPort, indexPath, primaryGen, primaryTCPPort, null);
+          node = new SimpleReplicaNode(random(), id, tcpPort, indexPath, primaryGen, primaryTCPPort, null, doCheckIndexOnClose);
         } catch (RuntimeException re) {
           if (re.getMessage().startsWith("replica cannot start")) {
             // this is "OK": it means MDW's refusal to delete a segments_N commit point means we cannot start:

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b1b2c799/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestNRTReplication.java
----------------------------------------------------------------------
diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestNRTReplication.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestNRTReplication.java
index 7ba3bc2..15e9c8c 100644
--- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestNRTReplication.java
+++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestNRTReplication.java
@@ -17,15 +17,6 @@ package org.apache.lucene.replicator.nrt;
  * limitations under the License.
  */
 
-import org.apache.lucene.document.Document;
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.LineFileDocs;
-import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
-import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
-import org.apache.lucene.util.LuceneTestCase;
-
-import com.carrotsearch.randomizedtesting.SeedUtils;
-
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
@@ -38,6 +29,16 @@ import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.regex.Pattern;
 
+import org.apache.lucene.document.Document;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LineFileDocs;
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
+import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+import com.carrotsearch.randomizedtesting.SeedUtils;
+
 // nocommit make some explicit failure tests
 
 // MockRandom's .sd file has no index header/footer:
@@ -49,9 +50,12 @@ public class TestNRTReplication extends LuceneTestCase {
   private Path childTempDir;
 
   final AtomicLong nodeStartCounter = new AtomicLong();
+  private long nextPrimaryGen;
+  private long lastPrimaryGen;
+  LineFileDocs docs;
 
   /** Launches a child "server" (separate JVM), which is either primary or replica node */
-  NodeProcess startNode(int primaryTCPPort, final int id, Path indexPath, boolean isPrimary, long forcePrimaryVersion) throws IOException {
+  private NodeProcess startNode(int primaryTCPPort, final int id, Path indexPath, long forcePrimaryVersion, boolean willCrash) throws IOException {
     List<String> cmd = new ArrayList<>();
 
     cmd.add(System.getProperty("java.home") 
@@ -61,26 +65,30 @@ public class TestNRTReplication extends LuceneTestCase {
         + "java");
     cmd.add("-Xmx512m");
 
+    long myPrimaryGen;
     if (primaryTCPPort != -1) {
+      // I am a replica
       cmd.add("-Dtests.nrtreplication.primaryTCPPort=" + primaryTCPPort);
-    } else if (isPrimary == false) {
-      // We cannot start a replica when there is no primary:
-      return null;
+      myPrimaryGen = lastPrimaryGen;
+    } else {
+      myPrimaryGen = nextPrimaryGen++;
+      lastPrimaryGen = myPrimaryGen;
     }
+    cmd.add("-Dtests.nrtreplication.primaryGen=" + myPrimaryGen);
     cmd.add("-Dtests.nrtreplication.closeorcrash=false");
 
     cmd.add("-Dtests.nrtreplication.node=true");
     cmd.add("-Dtests.nrtreplication.nodeid=" + id);
     cmd.add("-Dtests.nrtreplication.startNS=" + Node.globalStartNS);
     cmd.add("-Dtests.nrtreplication.indexpath=" + indexPath);
-    if (isPrimary) {
+    cmd.add("-Dtests.nrtreplication.checkonclose=true");
+
+    if (primaryTCPPort == -1) {
+      // We are the primary node
       cmd.add("-Dtests.nrtreplication.isPrimary=true");
       cmd.add("-Dtests.nrtreplication.forcePrimaryVersion=" + forcePrimaryVersion);
     }
 
-    long myPrimaryGen = 0;
-    cmd.add("-Dtests.nrtreplication.primaryGen=" + myPrimaryGen);
-
     // Mixin our own counter because this is called from a fresh thread which means the seed otherwise isn't changing each time we spawn a
     // new node:
     long seed = random().nextLong() * nodeStartCounter.incrementAndGet();
@@ -112,7 +120,6 @@ public class TestNRTReplication extends LuceneTestCase {
     long initCommitVersion = -1;
     long initInfosVersion = -1;
     Pattern logTimeStart = Pattern.compile("^[0-9\\.]+s .*");
-    boolean willCrash = false;
     boolean sawExistingSegmentsFile = false;
 
     while (true) {
@@ -169,7 +176,7 @@ public class TestNRTReplication extends LuceneTestCase {
                                            message("done wait for process " + p);
                                            int exitValue = p.exitValue();
                                            message("exit value=" + exitValue + " willCrash=" + finalWillCrash);
-                                           if (exitValue != 0) {
+                                           if (exitValue != 0 && finalWillCrash == false) {
                                              // should fail test
                                              throw new RuntimeException("node " + id + " process had unexpected non-zero exit status=" + exitValue);
                                            }
@@ -178,32 +185,33 @@ public class TestNRTReplication extends LuceneTestCase {
     pumper.setName("pump" + id);
 
     message("top: node=" + id + " started at tcpPort=" + tcpPort + " initCommitVersion=" + initCommitVersion + " initInfosVersion=" + initInfosVersion);
-    return new NodeProcess(p, id, tcpPort, pumper, isPrimary, initCommitVersion, initInfosVersion, nodeClosing);
+    return new NodeProcess(p, id, tcpPort, pumper, primaryTCPPort == -1, initCommitVersion, initInfosVersion, nodeClosing);
   }
 
-  public void testReplicateDeleteAllDocuments() throws Exception {
-
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
     Node.globalStartNS = System.nanoTime();
     childTempDir = createTempDir("child");
+    docs = new LineFileDocs(random());
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    super.tearDown();
+    docs.close();
+  }
+
+  public void testReplicateDeleteAllDocuments() throws Exception {
 
-    message("change thread name from " + Thread.currentThread().getName());
-    Thread.currentThread().setName("main");
-    
     Path primaryPath = createTempDir("primary");
-    NodeProcess primary = startNode(-1, 0, primaryPath, true, -1);
+    NodeProcess primary = startNode(-1, 0, primaryPath, -1, false);
 
     Path replicaPath = createTempDir("replica");
-    NodeProcess replica = startNode(primary.tcpPort, 1, replicaPath, false, -1);
+    NodeProcess replica = startNode(primary.tcpPort, 1, replicaPath, -1, false);
 
     // Tell primary current replicas:
-    try (Connection c = new Connection(primary.tcpPort)) {
-      c.out.writeByte(SimplePrimaryNode.CMD_SET_REPLICAS);
-      c.out.writeVInt(1);
-      c.out.writeVInt(replica.id);
-      c.out.writeVInt(replica.tcpPort);
-      c.flush();
-      c.in.readByte();
-    }
+    sendReplicasToPrimary(primary, replica);
 
     // Index 10 docs into primary:
     LineFileDocs docs = new LineFileDocs(random());
@@ -215,33 +223,14 @@ public class TestNRTReplication extends LuceneTestCase {
     }
 
     // Nothing in replica index yet
-    Connection replicaC = new Connection(replica.tcpPort);
-    replicaC.out.writeByte(SimplePrimaryNode.CMD_SEARCH_ALL);
-    replicaC.flush();
-    long version1 = replicaC.in.readVLong();
-    assertEquals(0L, version1);
-    int hitCount = replicaC.in.readVInt();
-    assertEquals(0, hitCount);
+    assertVersionAndHits(replica, 0, 0);
 
     // Refresh primary, which also pushes to replica:
     long primaryVersion1 = primary.flush(0);
     assertTrue(primaryVersion1 > 0);
 
-    long version2;
-
     // Wait for replica to show the change
-    while (true) {
-      replicaC.out.writeByte(SimplePrimaryNode.CMD_SEARCH_ALL);
-      replicaC.flush();
-      version2 = replicaC.in.readVLong();
-      hitCount = replicaC.in.readVInt();
-      if (version2 == primaryVersion1) {
-        assertEquals(10, hitCount);
-        // good!
-        break;
-      }
-      Thread.sleep(10);
-    }
+    waitForVersionAndHits(replica, primaryVersion1, 10);
 
     // Delete all docs from primary
     if (random().nextBoolean()) {
@@ -255,32 +244,14 @@ public class TestNRTReplication extends LuceneTestCase {
     }
 
     // Replica still shows 10 docs:
-    replicaC.out.writeByte(SimplePrimaryNode.CMD_SEARCH_ALL);
-    replicaC.flush();
-    long version3 = replicaC.in.readVLong();
-    assertEquals(version2, version3);
-    hitCount = replicaC.in.readVInt();
-    assertEquals(10, hitCount);
+    assertVersionAndHits(replica, primaryVersion1, 10);
     
     // Refresh primary, which also pushes to replica:
     long primaryVersion2 = primary.flush(0);
     assertTrue(primaryVersion2 > primaryVersion1);
 
     // Wait for replica to show the change
-    long version4;
-    while (true) {
-      replicaC.out.writeByte(SimplePrimaryNode.CMD_SEARCH_ALL);
-      replicaC.flush();
-      version4 = replicaC.in.readVLong();
-      hitCount = replicaC.in.readVInt();
-      if (version4 == primaryVersion2) {
-        assertTrue(version4 > version3);
-        assertEquals(0, hitCount);
-        // good!
-        break;
-      }
-      Thread.sleep(10);
-    }
+    waitForVersionAndHits(replica, primaryVersion2, 0);
 
     // Index 10 docs again:
     for(int i=0;i<10;i++) {
@@ -293,21 +264,8 @@ public class TestNRTReplication extends LuceneTestCase {
     assertTrue(primaryVersion3 > primaryVersion2);
 
     // Wait for replica to show the change
-    while (true) {
-      replicaC.out.writeByte(SimplePrimaryNode.CMD_SEARCH_ALL);
-      replicaC.flush();
-      long version5 = replicaC.in.readVLong();
-      hitCount = replicaC.in.readVInt();
-      if (version5 == primaryVersion3) {
-        assertEquals(10, hitCount);
-        assertTrue(version5 > version4);
-        // good!
-        break;
-      }
-      Thread.sleep(10);
-    }
+    waitForVersionAndHits(replica, primaryVersion3, 10);
 
-    replicaC.close();
     primaryC.close();
 
     replica.close();
@@ -316,27 +274,13 @@ public class TestNRTReplication extends LuceneTestCase {
 
   public void testReplicateForceMerge() throws Exception {
 
-    Node.globalStartNS = System.nanoTime();
-    childTempDir = createTempDir("child");
-
-    message("change thread name from " + Thread.currentThread().getName());
-    Thread.currentThread().setName("main");
-    
     Path primaryPath = createTempDir("primary");
-    NodeProcess primary = startNode(-1, 0, primaryPath, true, -1);
+    NodeProcess primary = startNode(-1, 0, primaryPath, -1, false);
 
     Path replicaPath = createTempDir("replica");
-    NodeProcess replica = startNode(primary.tcpPort, 1, replicaPath, false, -1);
+    NodeProcess replica = startNode(primary.tcpPort, 1, replicaPath, -1, false);
 
-    // Tell primary current replicas:
-    try (Connection c = new Connection(primary.tcpPort)) {
-      c.out.writeByte(SimplePrimaryNode.CMD_SET_REPLICAS);
-      c.out.writeVInt(1);
-      c.out.writeVInt(replica.id);
-      c.out.writeVInt(replica.tcpPort);
-      c.flush();
-      c.in.readByte();
-    }
+    sendReplicasToPrimary(primary, replica);
 
     // Index 10 docs into primary:
     LineFileDocs docs = new LineFileDocs(random());
@@ -367,29 +311,560 @@ public class TestNRTReplication extends LuceneTestCase {
     long primaryVersion3 = primary.flush(0);
     assertTrue(primaryVersion3 > primaryVersion2);
 
-    Connection replicaC = new Connection(replica.tcpPort);
-
     // Wait for replica to show the change
-    while (true) {
-      replicaC.out.writeByte(SimplePrimaryNode.CMD_SEARCH_ALL);
-      replicaC.flush();
-      long version = replicaC.in.readVLong();
-      int hitCount = replicaC.in.readVInt();
-      if (version == primaryVersion3) {
-        assertEquals(20, hitCount);
-        // good!
-        break;
+    waitForVersionAndHits(replica, primaryVersion3, 20);
+
+    primaryC.close();
+
+    replica.close();
+    primary.close();
+  }
+
+  // Start up, index 10 docs, replicate, but crash and restart the replica without committing it:
+  public void testReplicaCrashNoCommit() throws Exception {
+
+    Path primaryPath = createTempDir("primary");
+    NodeProcess primary = startNode(-1, 0, primaryPath, -1, false);
+
+    Path replicaPath = createTempDir("replica");
+    NodeProcess replica = startNode(primary.tcpPort, 1, replicaPath, -1, true);
+
+    sendReplicasToPrimary(primary, replica);
+
+    // Index 10 docs into primary:
+    LineFileDocs docs = new LineFileDocs(random());
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<10;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
       }
-      Thread.sleep(10);
     }
 
-    replicaC.close();
-    primaryC.close();
+    // Refresh primary, which also pushes to replica:
+    long primaryVersion1 = primary.flush(0);
+    assertTrue(primaryVersion1 > 0);
+
+    // Wait for replica to sync up:
+    waitForVersionAndHits(replica, primaryVersion1, 10);
+
+    // Crash replica:
+    replica.crash();
+
+    // Restart replica:
+    replica = startNode(primary.tcpPort, 1, replicaPath, -1, false);
+
+    // On startup the replica searches the last commit (empty here):
+    assertVersionAndHits(replica, 0, 0);
+
+    // Ask replica to sync:
+    replica.newNRTPoint(primaryVersion1, primary.tcpPort);
+    waitForVersionAndHits(replica, primaryVersion1, 10);
+
+    replica.close();
+    primary.close();
+  }
+
+  // Start up, index 10 docs, replicate, commit, crash and restart the replica
+  public void testReplicaCrashWithCommit() throws Exception {
+
+    Path primaryPath = createTempDir("primary");
+    NodeProcess primary = startNode(-1, 0, primaryPath, -1, false);
+
+    Path replicaPath = createTempDir("replica");
+    NodeProcess replica = startNode(primary.tcpPort, 1, replicaPath, -1, true);
+
+    sendReplicasToPrimary(primary, replica);
+
+    // Index 10 docs into primary:
+    LineFileDocs docs = new LineFileDocs(random());
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<10;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    // Refresh primary, which also pushes to replica:
+    long primaryVersion1 = primary.flush(0);
+    assertTrue(primaryVersion1 > 0);
+
+    // Wait for replica to sync up:
+    waitForVersionAndHits(replica, primaryVersion1, 10);
+
+    // Commit and crash replica:
+    replica.commit();
+    replica.crash();
+
+    // Restart replica:
+    replica = startNode(primary.tcpPort, 1, replicaPath, -1, false);
+
+    // On startup the replica searches the last commit:
+    assertVersionAndHits(replica, primaryVersion1, 10);
 
     replica.close();
     primary.close();
   }
 
+  // Start up, index 10 docs, replicate, commit, crash, index more docs, replicate, then restart the replica
+  public void testIndexingWhileReplicaIsDown() throws Exception {
+
+    Path primaryPath = createTempDir("primary");
+    NodeProcess primary = startNode(-1, 0, primaryPath, -1, false);
+
+    Path replicaPath = createTempDir("replica");
+    NodeProcess replica = startNode(primary.tcpPort, 1, replicaPath, -1, true);
+
+    sendReplicasToPrimary(primary, replica);
+
+    // Index 10 docs into primary:
+    LineFileDocs docs = new LineFileDocs(random());
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<10;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    // Refresh primary, which also pushes to replica:
+    long primaryVersion1 = primary.flush(0);
+    assertTrue(primaryVersion1 > 0);
+
+    // Wait for replica to sync up:
+    waitForVersionAndHits(replica, primaryVersion1, 10);
+
+    // Commit and crash replica:
+    replica.commit();
+    replica.crash();
+
+    sendReplicasToPrimary(primary);
+
+    // Index 10 more docs, while replica is down
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<10;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    // And flush:
+    long primaryVersion2 = primary.flush(0);
+    assertTrue(primaryVersion2 > primaryVersion1);
+
+    // Now restart replica:
+    replica = startNode(primary.tcpPort, 1, replicaPath, -1, false);
+
+    sendReplicasToPrimary(primary, replica);
+
+    // On startup the replica still searches its last commit:
+    assertVersionAndHits(replica, primaryVersion1, 10);
+
+    // Now ask replica to sync:
+    replica.newNRTPoint(primaryVersion2, primary.tcpPort);
+
+    waitForVersionAndHits(replica, primaryVersion2, 20);
+
+    replica.close();
+    primary.close();
+  }
+ 
+  // Crash primary and promote a replica
+  public void testCrashPrimary1() throws Exception {
+
+    Path path1 = createTempDir("1");
+    NodeProcess primary = startNode(-1, 0, path1, -1, true);
+
+    Path path2 = createTempDir("2");
+    NodeProcess replica = startNode(primary.tcpPort, 1, path2, -1, true);
+
+    sendReplicasToPrimary(primary, replica);
+
+    // Index 10 docs into primary:
+    LineFileDocs docs = new LineFileDocs(random());
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<10;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    // Refresh primary, which also pushes to replica:
+    long primaryVersion1 = primary.flush(0);
+    assertTrue(primaryVersion1 > 0);
+
+    // Wait for replica to sync up:
+    waitForVersionAndHits(replica, primaryVersion1, 10);
+
+    // Crash primary:
+    primary.crash();
+
+    // Promote replica:
+    replica.commit();
+    replica.close();
+    
+    primary = startNode(-1, 1, path2, -1, false);
+
+    // Should still see 10 docs:
+    assertVersionAndHits(primary, primaryVersion1, 10);
+
+    primary.close();
+  }
+
+  // Crash primary and then restart it
+  public void testCrashPrimary2() throws Exception {
+
+    Path path1 = createTempDir("1");
+    NodeProcess primary = startNode(-1, 0, path1, -1, true);
+
+    Path path2 = createTempDir("2");
+    NodeProcess replica = startNode(primary.tcpPort, 1, path2, -1, true);
+
+    sendReplicasToPrimary(primary, replica);
+
+    // Index 10 docs into primary:
+    LineFileDocs docs = new LineFileDocs(random());
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<10;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    // Refresh primary, which also pushes to replica:
+    long primaryVersion1 = primary.flush(0);
+    assertTrue(primaryVersion1 > 0);
+
+    // Wait for replica to sync up:
+    waitForVersionAndHits(replica, primaryVersion1, 10);
+
+    primary.commit();
+
+    // Index 10 docs, but crash before replicating or committing:
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<10;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    // Crash primary:
+    primary.crash();
+
+    // Restart it:
+    primary = startNode(-1, 0, path1, -1, true);
+
+    sendReplicasToPrimary(primary, replica);
+
+    // Index 10 more docs
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<10;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    long primaryVersion2 = primary.flush(0);
+    assertTrue(primaryVersion2 > primaryVersion1);
+
+    // Wait for replica to sync up:
+    waitForVersionAndHits(replica, primaryVersion2, 20);
+
+    primary.close();
+    replica.close();
+  }
+
+  // Crash primary and then restart it, while a replica node is down, then bring replica node back up and make sure it properly "unforks" itself
+  public void testCrashPrimary3() throws Exception {
+
+    Path path1 = createTempDir("1");
+    NodeProcess primary = startNode(-1, 0, path1, -1, true);
+
+    Path path2 = createTempDir("2");
+    NodeProcess replica = startNode(primary.tcpPort, 1, path2, -1, true);
+
+    sendReplicasToPrimary(primary, replica);
+
+    // Index 10 docs into primary:
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<10;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    // Refresh primary, which also pushes to replica:
+    long primaryVersion1 = primary.flush(0);
+    assertTrue(primaryVersion1 > 0);
+
+    // Wait for replica to sync up:
+    waitForVersionAndHits(replica, primaryVersion1, 10);
+
+    replica.commit();
+
+    replica.close();
+    primary.crash();
+
+    // At this point replica is "in the future": it has 10 docs committed, but the primary crashed before committing so it has 0 docs
+
+    // Restart primary:
+    primary = startNode(-1, 0, path1, -1, true);
+
+    // Index 20 docs into primary:
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<20;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    // Flush primary, but there are no replicas to sync to:
+    long primaryVersion2 = primary.flush(0);
+
+    // Now restart replica, which on init should detect on a "lost branch" because its 10 docs that were committed came from a different
+    // primary node:
+    replica = startNode(primary.tcpPort, 1, path2, -1, true);
+
+    assertVersionAndHits(replica, primaryVersion2, 20);
+
+    primary.close();
+    replica.close();
+  }
+
+  public void testCrashPrimaryWhileCopying() throws Exception {
+
+    Path path1 = createTempDir("1");
+    NodeProcess primary = startNode(-1, 0, path1, -1, true);
+
+    Path path2 = createTempDir("2");
+    NodeProcess replica = startNode(primary.tcpPort, 1, path2, -1, true);
+
+    sendReplicasToPrimary(primary, replica);
+
+    // Index 100 docs into primary:
+    LineFileDocs docs = new LineFileDocs(random());
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<100;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    // Refresh primary, which also pushes (async) to replica:
+    long primaryVersion1 = primary.flush(0);
+    assertTrue(primaryVersion1 > 0);
+
+    Thread.sleep(TestUtil.nextInt(random(), 1, 30));
+
+    // Crash primary, likely/hopefully while replica is still copying
+    primary.crash();
+
+    // Could see either 100 docs (replica finished before crash) or 0 docs:
+    try (Connection c = new Connection(replica.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_SEARCH_ALL);
+      c.flush();
+      long version = c.in.readVLong();
+      int hitCount = c.in.readVInt();
+      if (version == 0) {
+        assertEquals(0, hitCount);
+      } else {
+        assertEquals(primaryVersion1, version);
+        assertEquals(100, hitCount);
+      }
+    }
+
+    primary.close();
+    replica.close();
+  }
+
+  public void testCrashReplica() throws Exception {
+
+    Path path1 = createTempDir("1");
+    NodeProcess primary = startNode(-1, 0, path1, -1, true);
+
+    Path path2 = createTempDir("2");
+    NodeProcess replica = startNode(primary.tcpPort, 1, path2, -1, true);
+
+    sendReplicasToPrimary(primary, replica);
+
+    // Index 10 docs into primary:
+    LineFileDocs docs = new LineFileDocs(random());
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<10;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    // Refresh primary, which also pushes to replica:
+    long primaryVersion1 = primary.flush(0);
+    assertTrue(primaryVersion1 > 0);
+
+    // Wait for replica to sync up:
+    waitForVersionAndHits(replica, primaryVersion1, 10);
+
+    // Crash replica
+    replica.crash();
+
+    sendReplicasToPrimary(primary);
+
+    // Lots of new flushes while replica is down:
+    long primaryVersion2 = 0;
+    for(int iter=0;iter<10;iter++) {
+      // Index 10 docs into primary:
+      try (Connection c = new Connection(primary.tcpPort)) {
+        c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+        for(int i=0;i<10;i++) {
+          Document doc = docs.nextDoc();
+          primary.addOrUpdateDocument(c, doc, false);
+        }
+      }
+      primaryVersion2 = primary.flush(0);
+    }
+
+    // Start up replica again:
+    replica = startNode(primary.tcpPort, 1, path2, -1, true);
+
+    sendReplicasToPrimary(primary, replica);
+
+    // Now ask replica to sync:
+    replica.newNRTPoint(primaryVersion2, primary.tcpPort);
+
+    // Make sure it sees all docs that were indexed while it was down:
+    assertVersionAndHits(primary, primaryVersion2, 110);
+
+    replica.close();
+    primary.close();
+  }
+
+  public void testFullClusterCrash() throws Exception {
+
+    Path path1 = createTempDir("1");
+    NodeProcess primary = startNode(-1, 0, path1, -1, true);
+
+    Path path2 = createTempDir("2");
+    NodeProcess replica1 = startNode(primary.tcpPort, 1, path2, -1, true);
+
+    Path path3 = createTempDir("3");
+    NodeProcess replica2 = startNode(primary.tcpPort, 2, path3, -1, true);
+
+    sendReplicasToPrimary(primary, replica1, replica2);
+
+    // Index 50 docs into primary:
+    LineFileDocs docs = new LineFileDocs(random());
+    long primaryVersion1 = 0;
+    for (int iter=0;iter<5;iter++) {
+      try (Connection c = new Connection(primary.tcpPort)) {
+        c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+        for(int i=0;i<10;i++) {
+          Document doc = docs.nextDoc();
+          primary.addOrUpdateDocument(c, doc, false);
+        }
+      }
+
+      // Refresh primary, which also pushes to replicas:
+      primaryVersion1 = primary.flush(0);
+      assertTrue(primaryVersion1 > 0);
+    }
+
+    // Wait for replicas to sync up:
+    waitForVersionAndHits(replica1, primaryVersion1, 50);
+    waitForVersionAndHits(replica2, primaryVersion1, 50);
+
+    primary.commit();
+    replica1.commit();
+    replica2.commit();
+
+    // Index 10 more docs, but don't sync to replicas:
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_INDEXING);
+      for(int i=0;i<10;i++) {
+        Document doc = docs.nextDoc();
+        primary.addOrUpdateDocument(c, doc, false);
+      }
+    }
+
+    // Full cluster crash
+    primary.crash();
+    replica1.crash();
+    replica2.crash();
+
+    // Full cluster restart
+    primary = startNode(-1, 0, path1, -1, true);
+    replica1 = startNode(primary.tcpPort, 1, path2, -1, true);
+    replica2 = startNode(primary.tcpPort, 2, path3, -1, true);
+
+    // Only 50 because we didn't commit primary before the crash:
+    
+    // It's -1 because it's unpredictable how IW changes segments version on init:
+    assertVersionAndHits(primary, -1, 50);
+    assertVersionAndHits(replica1, primaryVersion1, 50);
+    assertVersionAndHits(replica2, primaryVersion1, 50);
+
+    primary.close();
+    replica1.close();
+    replica2.close();
+  }
+
+  /** Tell primary current replicas. */
+  private void sendReplicasToPrimary(NodeProcess primary, NodeProcess... replicas) throws IOException {
+    try (Connection c = new Connection(primary.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_SET_REPLICAS);
+      c.out.writeVInt(replicas.length);
+      for(int id=0;id<replicas.length;id++) {
+        NodeProcess replica = replicas[id];
+        c.out.writeVInt(replica.id);
+        c.out.writeVInt(replica.tcpPort);
+      }
+      c.flush();
+      c.in.readByte();
+    }
+  }
+
+  /** Verifies this node is currently searching the specified version with the specified total hit count, or that it eventually does when
+   *  keepTrying is true. */
+  private void assertVersionAndHits(NodeProcess node, long expectedVersion, int expectedHitCount) throws Exception {
+    try (Connection c = new Connection(node.tcpPort)) {
+      c.out.writeByte(SimplePrimaryNode.CMD_SEARCH_ALL);
+      c.flush();
+      long version = c.in.readVLong();
+      int hitCount = c.in.readVInt();
+      if (expectedVersion != -1) {
+        assertEquals("hitCount=" + hitCount, expectedVersion, version);
+      }
+      assertEquals(expectedHitCount, hitCount);
+    }
+  }
+
+  private void waitForVersionAndHits(NodeProcess node, long expectedVersion, int expectedHitCount) throws Exception {
+    try (Connection c = new Connection(node.tcpPort)) {
+      while (true) {
+        c.out.writeByte(SimplePrimaryNode.CMD_SEARCH_ALL);
+        c.flush();
+        long version = c.in.readVLong();
+        int hitCount = c.in.readVInt();
+
+        if (version == expectedVersion) {
+          assertEquals(expectedHitCount, hitCount);
+          break;
+        }
+
+        assertTrue(version < expectedVersion);
+        Thread.sleep(10);
+      }
+    }
+  }
+
   static void message(String message) {
     long now = System.nanoTime();
     System.out.println(String.format(Locale.ROOT,

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b1b2c799/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestStressNRTReplication.java
----------------------------------------------------------------------
diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestStressNRTReplication.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestStressNRTReplication.java
index 63ff12a..04bbdc1 100644
--- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestStressNRTReplication.java
+++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/TestStressNRTReplication.java
@@ -518,6 +518,10 @@ public class TestStressNRTReplication extends LuceneTestCase {
       return null;
     }
 
+    // This is very costly (takes more time to check than it did to index); we do this ourselves in the end instead of each time a replica
+    // is restarted:
+    // cmd.add("-Dtests.nrtreplication.checkonclose=true");
+
     cmd.add("-Dtests.nrtreplication.node=true");
     cmd.add("-Dtests.nrtreplication.nodeid=" + id);
     cmd.add("-Dtests.nrtreplication.startNS=" + Node.globalStartNS);
@@ -590,7 +594,6 @@ public class TestStressNRTReplication extends LuceneTestCase {
     long initInfosVersion = -1;
     Pattern logTimeStart = Pattern.compile("^[0-9\\.]+s .*");
     boolean willCrash = false;
-    boolean sawExistingSegmentsFile = false;
 
     while (true) {
       String l = r.readLine();
@@ -609,12 +612,6 @@ public class TestStressNRTReplication extends LuceneTestCase {
 
         // Hackity hack, in case primary crashed/closed and we haven't noticed (reaped the process) yet:
         if (isPrimary == false) {
-          if (sawExistingSegmentsFile) {
-            // This means MDW's virus checker blocked us from deleting segments_N that we must delete in order to start ... just return null
-            // and retry again later:
-            message("failed to remove segments_N; skipping");
-            return null;
-          }
           for(int i=0;i<100;i++) {
             NodeProcess primary2 = primary;
             if (primaryGen != myPrimaryGen || primary2 == null || primary2.nodeIsClosing.get()) {
@@ -658,8 +655,6 @@ public class TestStressNRTReplication extends LuceneTestCase {
         willCrash = true;
       } else if (l.startsWith("NODE STARTED")) {
         break;
-      } else if (l.contains("replica cannot start: existing segments file=")) {
-        sawExistingSegmentsFile = true;
       }
     }