You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2018/03/27 05:22:39 UTC
lucene-solr:branch_7x: SOLR-7736: Fix ZkController.publishAndWaitForDownStates
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7x e80ee7fff -> 9b4d16963
SOLR-7736: Fix ZkController.publishAndWaitForDownStates
(cherry picked from commit ecb94ba)
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/9b4d1696
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/9b4d1696
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/9b4d1696
Branch: refs/heads/branch_7x
Commit: 9b4d16963775f1a465f93542aef3eabc1e105afe
Parents: e80ee7f
Author: Shalin Shekhar Mangar <sh...@apache.org>
Authored: Tue Mar 27 10:28:34 2018 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Tue Mar 27 10:52:30 2018 +0530
----------------------------------------------------------------------
.../org/apache/solr/cloud/ZkController.java | 13 +--
.../org/apache/solr/cloud/ZkControllerTest.java | 85 ++++++++++++++------
2 files changed, 67 insertions(+), 31 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9b4d1696/solr/core/src/java/org/apache/solr/cloud/ZkController.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 57f1dd5..477e567 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -898,18 +898,19 @@ public class ZkController {
publishNodeAsDown(getNodeName());
Set<String> collectionsWithLocalReplica = ConcurrentHashMap.newKeySet();
- for (SolrCore core : cc.getCores()) {
- collectionsWithLocalReplica.add(core.getCoreDescriptor().getCloudDescriptor().getCollectionName());
+ for (CoreDescriptor descriptor : cc.getCoreDescriptors()) {
+ collectionsWithLocalReplica.add(descriptor.getCloudDescriptor().getCollectionName());
}
CountDownLatch latch = new CountDownLatch(collectionsWithLocalReplica.size());
for (String collectionWithLocalReplica : collectionsWithLocalReplica) {
zkStateReader.registerCollectionStateWatcher(collectionWithLocalReplica, (liveNodes, collectionState) -> {
+ if (collectionState == null) return false;
boolean foundStates = true;
- for (SolrCore core : cc.getCores()) {
- if (core.getCoreDescriptor().getCloudDescriptor().getCollectionName().equals(collectionWithLocalReplica)) {
- Replica replica = collectionState.getReplica(core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());
- if (replica.getState() != Replica.State.DOWN) {
+ for (CoreDescriptor coreDescriptor : cc.getCoreDescriptors()) {
+ if (coreDescriptor.getCloudDescriptor().getCollectionName().equals(collectionWithLocalReplica)) {
+ Replica replica = collectionState.getReplica(coreDescriptor.getCloudDescriptor().getCoreNodeName());
+ if (replica == null || replica.getState() != Replica.State.DOWN) {
foundStates = false;
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9b4d1696/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
index d9ccd70..5578452 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
@@ -20,11 +20,13 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Properties;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.cloud.*;
+import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.util.Utils;
import org.apache.solr.core.CloudConfig;
import org.apache.solr.core.CoreContainer;
@@ -34,11 +36,16 @@ import org.apache.solr.handler.admin.CoreAdminHandler;
import org.apache.solr.handler.component.HttpShardHandlerFactory;
import org.apache.solr.update.UpdateShardHandler;
import org.apache.solr.update.UpdateShardHandlerConfig;
+import org.apache.solr.util.LogLevel;
import org.apache.zookeeper.CreateMode;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
+import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
+import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
+import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA;
+
@Slow
@SolrTestCaseJ4.SuppressSSL
public class ZkControllerTest extends SolrTestCaseJ4 {
@@ -251,11 +258,26 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
}
}
- @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12028, https://issues.apache.org/jira/browse/SOLR-7736")
+ @Slow
+ @LogLevel(value = "org.apache.solr.cloud=DEBUG;org.apache.solr.cloud.overseer=DEBUG")
public void testPublishAndWaitForDownStates() throws Exception {
- String zkDir = createTempDir("testPublishAndWaitForDownStates").toFile().getAbsolutePath();
+
+ /*
+ This test asserts that if zkController.publishAndWaitForDownStates uses only core name to check if all local
+ cores are down then the method will return immediately but if it uses coreNodeName (as it does after SOLR-6665) then
+ the method will time out.
+ We set up the cluster state in such a way that two replicas with the same core name exist on non-existent nodes
+ and core container also has a dummy core that has the same core name. The publishAndWaitForDownStates before SOLR-6665
+ would only check the core names and therefore return immediately but after SOLR-6665 it should time out.
+ */
+
+ assumeWorkingMockito();
+ final String collectionName = "testPublishAndWaitForDownStates";
+ String zkDir = createTempDir(collectionName).toFile().getAbsolutePath();
CoreContainer cc = null;
+ String nodeName = "127.0.0.1:8983_solr";
+
ZkTestServer server = new ZkTestServer(zkDir);
try {
server.run();
@@ -263,7 +285,16 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
- cc = getCoreContainer();
+ cc = new MockCoreContainer() {
+ @Override
+ public List<CoreDescriptor> getCoreDescriptors() {
+ CoreDescriptor descriptor = new CoreDescriptor(collectionName, TEST_PATH(), Collections.emptyMap(), new Properties(), true);
+ // non-existent coreNodeName, this will cause zkController.publishAndWaitForDownStates to wait until it times out
+ // when using coreNodeName but usage of core name alone will return immediately
+ descriptor.getCloudDescriptor().setCoreNodeName("core_node0");
+ return Collections.singletonList(descriptor);
+ }
+ };
ZkController zkController = null;
try {
@@ -277,28 +308,32 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
}
});
- HashMap<String, DocCollection> collectionStates = new HashMap<>();
- HashMap<String, Replica> replicas = new HashMap<>();
- // add two replicas with the same core name but one of them should be on a different node
- // than this ZkController instance
- for (int i=1; i<=2; i++) {
- Replica r = new Replica("core_node" + i,
- map(ZkStateReader.STATE_PROP, i == 1 ? "active" : "down",
- ZkStateReader.NODE_NAME_PROP, i == 1 ? "127.0.0.1:8983_solr" : "non_existent_host",
- ZkStateReader.CORE_NAME_PROP, "collection1"));
- replicas.put("core_node" + i, r);
- }
- HashMap<String, Object> sliceProps = new HashMap<>();
- sliceProps.put("state", Slice.State.ACTIVE.toString());
- Slice slice = new Slice("shard1", replicas, sliceProps);
- DocCollection c = new DocCollection("testPublishAndWaitForDownStates", map("shard1", slice), Collections.emptyMap(), DocRouter.DEFAULT);
- ClusterState state = new ClusterState(0, Collections.emptySet(), map("testPublishAndWaitForDownStates", c));
- byte[] bytes = Utils.toJSON(state);
- zkController.getZkClient().makePath(ZkStateReader.getCollectionPath("testPublishAndWaitForDownStates"), bytes, CreateMode.PERSISTENT, true);
-
- zkController.getZkStateReader().forceUpdateCollection("testPublishAndWaitForDownStates");
- assertTrue(zkController.getZkStateReader().getClusterState().hasCollection("testPublishAndWaitForDownStates"));
- assertNotNull(zkController.getZkStateReader().getClusterState().getCollection("testPublishAndWaitForDownStates"));
+ zkController.getZkClient().makePath(ZkStateReader.getCollectionPathRoot(collectionName), new byte[0], CreateMode.PERSISTENT, true);
+
+ ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION,
+ CollectionParams.CollectionAction.CREATE.toLower(), ZkStateReader.NODE_NAME_PROP, nodeName, ZkStateReader.NUM_SHARDS_PROP, "1",
+ "name", collectionName, DocCollection.STATE_FORMAT, "2");
+ zkController.getOverseerJobQueue().offer(Utils.toJSON(m));
+
+ HashMap<String, Object> propMap = new HashMap<>();
+ propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower());
+ propMap.put(COLLECTION_PROP, collectionName);
+ propMap.put(SHARD_ID_PROP, "shard1");
+ propMap.put(ZkStateReader.NODE_NAME_PROP, "non_existent_host1");
+ propMap.put(ZkStateReader.CORE_NAME_PROP, collectionName);
+ propMap.put(ZkStateReader.STATE_PROP, "active");
+ zkController.getOverseerJobQueue().offer(Utils.toJSON(propMap));
+
+ propMap = new HashMap<>();
+ propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower());
+ propMap.put(COLLECTION_PROP, collectionName);
+ propMap.put(SHARD_ID_PROP, "shard1");
+ propMap.put(ZkStateReader.NODE_NAME_PROP, "non_existent_host2");
+ propMap.put(ZkStateReader.CORE_NAME_PROP, collectionName);
+ propMap.put(ZkStateReader.STATE_PROP, "down");
+ zkController.getOverseerJobQueue().offer(Utils.toJSON(propMap));
+
+ zkController.getZkStateReader().forciblyRefreshAllClusterStateSlow();
long now = System.nanoTime();
long timeout = now + TimeUnit.NANOSECONDS.convert(ZkController.WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);