You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2017/07/03 14:23:12 UTC

lucene-solr:branch_6x: SOLR-10914: RecoveryStrategy's sendPrepRecoveryCmd can get stuck for 5 minutes if leader is unloaded

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x 3b26b2a64 -> df727d313


SOLR-10914: RecoveryStrategy's sendPrepRecoveryCmd can get stuck for 5 minutes if leader is unloaded

(cherry picked from commit 157ff9a)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/df727d31
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/df727d31
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/df727d31

Branch: refs/heads/branch_6x
Commit: df727d313f6f63f73b8efe0a0448b263581670bd
Parents: 3b26b2a
Author: Shalin Shekhar Mangar <sh...@apache.org>
Authored: Mon Jul 3 19:50:33 2017 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Mon Jul 3 19:53:06 2017 +0530

----------------------------------------------------------------------
 solr/CHANGES.txt                                |   2 +
 .../org/apache/solr/cloud/RecoveryStrategy.java |  27 +----
 .../org/apache/solr/util/TestInjection.java     |   2 +-
 solr/core/src/test-files/solr/solr.xml          |   1 +
 .../apache/solr/cloud/TestCloudRecovery.java    |   8 --
 .../org/apache/solr/cloud/TestPrepRecovery.java | 109 +++++++++++++++++++
 6 files changed, 117 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/df727d31/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index f02c5e0..4a1bed7 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -146,6 +146,8 @@ when using one of Exact*StatsCache (Mikhail Khludnev)
 * SOLR-10910: Clean up a few details left over from pluggable transient core and untangling
   CoreDescriptor/CoreContainer references (Erick Erickson)
 
+* SOLR-10914: RecoveryStrategy's sendPrepRecoveryCmd can get stuck for 5 minutes if leader is unloaded. (shalin)
+
 Optimizations
 ----------------------
 * SOLR-10634: JSON Facet API: When a field/terms facet will retrieve all buckets (i.e. limit:-1)

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/df727d31/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index 0df55b8..00a0f53 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -19,7 +19,6 @@ package org.apache.solr.cloud;
 import java.io.Closeable;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
-import java.net.SocketTimeoutException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -647,29 +646,12 @@ public class RecoveryStrategy extends Thread implements Closeable {
       prepCmd.setOnlyIfLeaderActive(true);
     }
 
-    final int maxTries = 30;
-    for (int numTries = 0; numTries < maxTries; numTries++) {
-      try {
-        sendPrepRecoveryCmd(leaderBaseUrl, prepCmd);
-        break;
-      } catch (ExecutionException e) {
-        if (e.getCause() instanceof SolrServerException) {
-          SolrServerException solrException = (SolrServerException) e.getCause();
-          if (solrException.getRootCause() instanceof SocketTimeoutException && numTries < maxTries) {
-            LOG.warn("Socket timeout on send prep recovery cmd, retrying.. ");
-            continue;
-          }
-        }
-        throw  e;
-      }
-    }
-  }
-
-  final private void sendPrepRecoveryCmd(String leaderBaseUrl, WaitForState prepCmd)
-      throws SolrServerException, IOException, InterruptedException, ExecutionException {
+    int conflictWaitMs = zkController.getLeaderConflictResolveWait();
+    // timeout after 5 seconds more than the max timeout (conflictWait + 3 seconds) on the server side
+    int readTimeout = conflictWaitMs + 8000;
     try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
       client.setConnectionTimeout(10000);
-      client.setSoTimeout(10000);
+      client.setSoTimeout(readTimeout);
       HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);
       prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest;
 
@@ -678,5 +660,4 @@ public class RecoveryStrategy extends Thread implements Closeable {
       mrr.future.get();
     }
   }
-
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/df727d31/solr/core/src/java/org/apache/solr/util/TestInjection.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/util/TestInjection.java b/solr/core/src/java/org/apache/solr/util/TestInjection.java
index 5e4dc75..e02b0eb 100644
--- a/solr/core/src/java/org/apache/solr/util/TestInjection.java
+++ b/solr/core/src/java/org/apache/solr/util/TestInjection.java
@@ -311,7 +311,7 @@ public class TestInjection {
       boolean enabled = pair.first();
       int chanceIn100 = pair.second();
       // Prevent for continuous pause forever
-      if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 2) {
+      if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 1) {
         countPrepRecoveryOpPauseForever.incrementAndGet();
         log.info("inject pause forever for prep recovery op");
         try {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/df727d31/solr/core/src/test-files/solr/solr.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/solr.xml b/solr/core/src/test-files/solr/solr.xml
index 526dffa..ae27fe7 100644
--- a/solr/core/src/test-files/solr/solr.xml
+++ b/solr/core/src/test-files/solr/solr.xml
@@ -43,6 +43,7 @@
     <int name="zkClientTimeout">${solr.zkclienttimeout:30000}</int>
     <bool name="genericCoreNodeNames">${genericCoreNodeNames:true}</bool>
     <int name="leaderVoteWait">${leaderVoteWait:10000}</int>
+    <int name="leaderConflictResolveWait">${leaderConflictResolveWait:180000}</int>
     <int name="distribUpdateConnTimeout">${distribUpdateConnTimeout:45000}</int>
     <int name="distribUpdateSoTimeout">${distribUpdateSoTimeout:340000}</int>
     <int name="autoReplicaFailoverWaitAfterExpiration">${autoReplicaFailoverWaitAfterExpiration:10000}</int>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/df727d31/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
index 164eeab..4cb62fb 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
@@ -43,8 +43,6 @@ import org.apache.solr.core.SolrCore;
 import org.apache.solr.metrics.SolrMetricManager;
 import org.apache.solr.update.DirectUpdateHandler2;
 import org.apache.solr.update.UpdateLog;
-import org.apache.solr.util.TestInjection;
-import org.junit.AfterClass;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -55,7 +53,6 @@ public class TestCloudRecovery extends SolrCloudTestCase {
 
   @BeforeClass
   public static void setupCluster() throws Exception {
-    TestInjection.prepRecoveryOpPauseForever = "true:30";
     System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
     System.setProperty("solr.ulog.numRecordsToKeep", "1000");
 
@@ -71,11 +68,6 @@ public class TestCloudRecovery extends SolrCloudTestCase {
         false, true, 30);
   }
 
-  @AfterClass
-  public static void afterClass() {
-    TestInjection.reset();
-  }
-
   @Before
   public void resetCollection() throws IOException, SolrServerException {
     cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*");

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/df727d31/solr/core/src/test/org/apache/solr/cloud/TestPrepRecovery.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPrepRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestPrepRecovery.java
new file mode 100644
index 0000000..a80565b
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/cloud/TestPrepRecovery.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cloud;
+
+import org.apache.solr.client.solrj.embedded.JettySolrRunner;
+import org.apache.solr.client.solrj.impl.CloudSolrClient;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.util.TestInjection;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests for PREPRECOVERY CoreAdmin API
+ */
+public class TestPrepRecovery extends SolrCloudTestCase {
+
+  @BeforeClass
+  public static void setupCluster() throws Exception {
+    System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
+    System.setProperty("solr.ulog.numRecordsToKeep", "1000");
+    // the default is 180s and our waitForState times out in 90s
+    // so we lower this to 10s so that we can still test timeouts
+    System.setProperty("leaderConflictResolveWait", "10000");
+
+    configureCluster(2)
+        .addConfig("config", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
+        .withSolrXml(TEST_PATH().resolve("solr.xml"))
+        .configure();
+  }
+
+  @AfterClass
+  public static void tearCluster() throws Exception {
+    System.clearProperty("leaderConflictResolveWait");
+  }
+
+  @Test
+  public void testLeaderUnloaded() throws Exception {
+    CloudSolrClient solrClient = cluster.getSolrClient();
+
+    String collectionName = "testLeaderUnloaded";
+    CollectionAdminRequest.createCollection(collectionName, 1, 2).process(solrClient);
+
+    waitForState("Expected collection: testLeaderUnloaded to be live with 1 shard and 2 replicas",
+        collectionName, clusterShape(1, 2));
+
+    JettySolrRunner newNode = cluster.startJettySolrRunner();
+    String newNodeName = newNode.getNodeName();
+
+    // add a replica to the new node so that it starts watching the collection
+    CollectionAdminRequest.addReplicaToShard(collectionName, "shard1")
+        .setNode(newNodeName)
+        .process(solrClient);
+
+    // now delete the leader
+    Replica leader = solrClient.getZkStateReader().getLeaderRetry(collectionName, "shard1");
+    CollectionAdminRequest.deleteReplica(collectionName, "shard1", leader.getName())
+        .process(solrClient);
+
+    // add another replica to the new node. When it starts recovering, it will likely have stale state
+    // and ask the erstwhile leader to PREPRECOVERY which will hang for about 30 seconds
+    CollectionAdminRequest.addReplicaToShard(collectionName, "shard1")
+        .setNode(newNodeName)
+        .process(solrClient);
+
+    // in the absence of the fixes made in SOLR-10914, this statement will timeout after 90s
+    waitForState("Expected collection: testLeaderUnloaded to be live with 1 shard and 3 replicas",
+        collectionName, clusterShape(1, 3));
+  }
+
+  @Test
+  public void testLeaderNotResponding() throws Exception {
+    CloudSolrClient solrClient = cluster.getSolrClient();
+
+    String collectionName = "testLeaderNotResponding";
+    CollectionAdminRequest.createCollection(collectionName, 1, 1).process(solrClient);
+
+    waitForState("Expected collection: testLeaderNotResponding to be live with 1 shard and 1 replicas",
+        collectionName, clusterShape(1, 1));
+
+    TestInjection.prepRecoveryOpPauseForever = "true:100";
+    try {
+      CollectionAdminRequest.addReplicaToShard(collectionName, "shard1").process(solrClient);
+
+      // in the absence of fixes made in SOLR-9716, prep recovery waits forever and the following statement
+      // times out in 90 seconds
+      waitForState("Expected collection: testLeaderNotResponding to be live with 1 shard and 2 replicas",
+          collectionName, clusterShape(1, 2));
+    } finally {
+      TestInjection.reset();
+    }
+  }
+}