You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2017/07/04 01:24:10 UTC
[50/53] [abbrv] lucene-solr:feature/autoscaling: SOLR-10914:
RecoveryStrategy's sendPrepRecoveryCmd can get stuck for 5 minutes if leader
is unloaded
SOLR-10914: RecoveryStrategy's sendPrepRecoveryCmd can get stuck for 5 minutes if leader is unloaded
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/157ff9a4
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/157ff9a4
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/157ff9a4
Branch: refs/heads/feature/autoscaling
Commit: 157ff9a4e159158f4ecc474d1874da97577e6190
Parents: b978f37
Author: Shalin Shekhar Mangar <sh...@apache.org>
Authored: Mon Jul 3 19:50:33 2017 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Mon Jul 3 19:50:33 2017 +0530
----------------------------------------------------------------------
solr/CHANGES.txt | 2 +
.../org/apache/solr/cloud/RecoveryStrategy.java | 27 +----
.../org/apache/solr/util/TestInjection.java | 2 +-
solr/core/src/test-files/solr/solr.xml | 1 +
.../apache/solr/cloud/TestCloudRecovery.java | 8 --
.../org/apache/solr/cloud/TestPrepRecovery.java | 109 +++++++++++++++++++
6 files changed, 117 insertions(+), 32 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/157ff9a4/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index aadc6db..1bc960d 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -501,6 +501,8 @@ when using one of Exact*StatsCache (Mikhail Khludnev)
* SOLR-10910: Clean up a few details left over from pluggable transient core and untangling
CoreDescriptor/CoreContainer references (Erick Erickson)
+* SOLR-10914: RecoveryStrategy's sendPrepRecoveryCmd can get stuck for 5 minutes if leader is unloaded. (shalin)
+
Optimizations
----------------------
* SOLR-10634: JSON Facet API: When a field/terms facet will retrieve all buckets (i.e. limit:-1)
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/157ff9a4/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index 9b0805f..063f794 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -19,7 +19,6 @@ package org.apache.solr.cloud;
import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
-import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -811,29 +810,12 @@ public class RecoveryStrategy implements Runnable, Closeable {
prepCmd.setOnlyIfLeaderActive(true);
}
- final int maxTries = 30;
- for (int numTries = 0; numTries < maxTries; numTries++) {
- try {
- sendPrepRecoveryCmd(leaderBaseUrl, prepCmd);
- break;
- } catch (ExecutionException e) {
- if (e.getCause() instanceof SolrServerException) {
- SolrServerException solrException = (SolrServerException) e.getCause();
- if (solrException.getRootCause() instanceof SocketTimeoutException && numTries < maxTries) {
- LOG.warn("Socket timeout on send prep recovery cmd, retrying.. ");
- continue;
- }
- }
- throw e;
- }
- }
- }
-
- final private void sendPrepRecoveryCmd(String leaderBaseUrl, WaitForState prepCmd)
- throws SolrServerException, IOException, InterruptedException, ExecutionException {
+ int conflictWaitMs = zkController.getLeaderConflictResolveWait();
+ // timeout after 5 seconds more than the max timeout (conflictWait + 3 seconds) on the server side
+ int readTimeout = conflictWaitMs + 8000;
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
client.setConnectionTimeout(10000);
- client.setSoTimeout(10000);
+ client.setSoTimeout(readTimeout);
HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);
prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest;
@@ -842,5 +824,4 @@ public class RecoveryStrategy implements Runnable, Closeable {
mrr.future.get();
}
}
-
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/157ff9a4/solr/core/src/java/org/apache/solr/util/TestInjection.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/util/TestInjection.java b/solr/core/src/java/org/apache/solr/util/TestInjection.java
index 5b0d047..d7584da 100644
--- a/solr/core/src/java/org/apache/solr/util/TestInjection.java
+++ b/solr/core/src/java/org/apache/solr/util/TestInjection.java
@@ -329,7 +329,7 @@ public class TestInjection {
boolean enabled = pair.first();
int chanceIn100 = pair.second();
// Prevent for continuous pause forever
- if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 2) {
+ if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 1) {
countPrepRecoveryOpPauseForever.incrementAndGet();
log.info("inject pause forever for prep recovery op");
try {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/157ff9a4/solr/core/src/test-files/solr/solr.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/solr.xml b/solr/core/src/test-files/solr/solr.xml
index 526dffa..ae27fe7 100644
--- a/solr/core/src/test-files/solr/solr.xml
+++ b/solr/core/src/test-files/solr/solr.xml
@@ -43,6 +43,7 @@
<int name="zkClientTimeout">${solr.zkclienttimeout:30000}</int>
<bool name="genericCoreNodeNames">${genericCoreNodeNames:true}</bool>
<int name="leaderVoteWait">${leaderVoteWait:10000}</int>
+ <int name="leaderConflictResolveWait">${leaderConflictResolveWait:180000}</int>
<int name="distribUpdateConnTimeout">${distribUpdateConnTimeout:45000}</int>
<int name="distribUpdateSoTimeout">${distribUpdateSoTimeout:340000}</int>
<int name="autoReplicaFailoverWaitAfterExpiration">${autoReplicaFailoverWaitAfterExpiration:10000}</int>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/157ff9a4/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
index c7fc0e8..2cf8774 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
@@ -43,8 +43,6 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.metrics.SolrMetricManager;
import org.apache.solr.update.DirectUpdateHandler2;
import org.apache.solr.update.UpdateLog;
-import org.apache.solr.util.TestInjection;
-import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -56,7 +54,6 @@ public class TestCloudRecovery extends SolrCloudTestCase {
@BeforeClass
public static void setupCluster() throws Exception {
- TestInjection.prepRecoveryOpPauseForever = "true:30";
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
System.setProperty("solr.ulog.numRecordsToKeep", "1000");
@@ -73,11 +70,6 @@ public class TestCloudRecovery extends SolrCloudTestCase {
false, true, 30);
}
- @AfterClass
- public static void afterClass() {
- TestInjection.reset();
- }
-
@Before
public void resetCollection() throws IOException, SolrServerException {
cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*");
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/157ff9a4/solr/core/src/test/org/apache/solr/cloud/TestPrepRecovery.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPrepRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestPrepRecovery.java
new file mode 100644
index 0000000..a80565b
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/cloud/TestPrepRecovery.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cloud;
+
+import org.apache.solr.client.solrj.embedded.JettySolrRunner;
+import org.apache.solr.client.solrj.impl.CloudSolrClient;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.util.TestInjection;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests that replica recovery still completes when the PREPRECOVERY CoreAdmin call to the leader hangs
+ */
+public class TestPrepRecovery extends SolrCloudTestCase {
+
+  @BeforeClass
+  public static void setupCluster() throws Exception {
+    System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
+    System.setProperty("solr.ulog.numRecordsToKeep", "1000");
+    // the default is 180s and our waitForState times out in 90s
+    // so we lower this to 10s so that we can still test timeouts
+    System.setProperty("leaderConflictResolveWait", "10000");
+
+    configureCluster(2)
+        .addConfig("config", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
+        .withSolrXml(TEST_PATH().resolve("solr.xml"))
+        .configure();
+  }
+
+  @org.junit.AfterClass public static void tearCluster() throws Exception {
+    System.clearProperty("leaderConflictResolveWait"); // undo the override made in setupCluster()
+  }
+
+  @Test
+  public void testLeaderUnloaded() throws Exception {
+    CloudSolrClient solrClient = cluster.getSolrClient();
+
+    String collectionName = "testLeaderUnloaded";
+    CollectionAdminRequest.createCollection(collectionName, 1, 2)
+        .process(solrClient);
+
+    waitForState("Expected collection: testLeaderUnloaded to be live with 1 shard and 2 replicas",
+        collectionName, clusterShape(1, 2));
+
+    JettySolrRunner newNode = cluster.startJettySolrRunner();
+    String newNodeName = newNode.getNodeName();
+
+    // add a replica to the new node so that it starts watching the collection
+    CollectionAdminRequest.addReplicaToShard(collectionName, "shard1")
+        .setNode(newNodeName)
+        .process(solrClient);
+
+    // now delete the leader
+    Replica leader = solrClient.getZkStateReader().getLeaderRetry(collectionName, "shard1");
+    CollectionAdminRequest.deleteReplica(collectionName, "shard1", leader.getName())
+        .process(solrClient);
+
+    // add another replica to the new node. When it starts recovering, it will likely have stale state
+    // and ask the erstwhile leader to PREPRECOVERY which will hang for about 30 seconds
+    CollectionAdminRequest.addReplicaToShard(collectionName, "shard1")
+        .setNode(newNodeName)
+        .process(solrClient);
+
+    // in the absence of the fixes made in SOLR-10914, this statement will timeout after 90s
+    waitForState("Expected collection: testLeaderUnloaded to be live with 1 shard and 3 replicas",
+        collectionName, clusterShape(1, 3));
+  }
+
+  @Test public void testLeaderNotResponding() throws Exception {
+    CloudSolrClient solrClient = cluster.getSolrClient();
+
+    String collectionName = "testLeaderNotResponding";
+    CollectionAdminRequest.createCollection(collectionName, 1, 1)
+        .process(solrClient);
+
+    waitForState("Expected collection: testLeaderNotResponding to be live with 1 shard and 1 replicas",
+        collectionName, clusterShape(1, 1));
+
+    TestInjection.prepRecoveryOpPauseForever = "true:100";
+    try {
+      CollectionAdminRequest.addReplicaToShard(collectionName, "shard1")
+          .process(solrClient);
+
+      // in the absence of fixes made in SOLR-9716, prep recovery waits forever and the following statement
+      // times out in 90 seconds
+      waitForState("Expected collection: testLeaderNotResponding to be live with 1 shard and 2 replicas",
+          collectionName, clusterShape(1, 2));
+    } finally {
+      TestInjection.reset();
+    }
+  }
+}