You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ignite.apache.org by ag...@apache.org on 2019/04/09 12:40:26 UTC
[ignite] branch master updated: IGNITE-11621 Fixed infinite 'no
next node in topology' loop in case of connecting nodes - Fixes #6360.
This is an automated email from the ASF dual-hosted git repository.
agoncharuk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ignite.git
The following commit(s) were added to refs/heads/master by this push:
new ce08060 IGNITE-11621 Fixed infinite 'no next node in topology' loop in case of connecting nodes - Fixes #6360.
ce08060 is described below
commit ce08060b199e471c5083824cd2b76a847ca55741
Author: Sergey Chugunov <se...@gmail.com>
AuthorDate: Tue Apr 9 15:36:03 2019 +0300
IGNITE-11621 Fixed infinite 'no next node in topology' loop in case of connecting nodes - Fixes #6360.
Signed-off-by: Alexey Goncharuk <al...@gmail.com>
---
.../ignite/spi/discovery/tcp/ServerImpl.java | 17 +-
.../tcp/TcpDiscoveryNodeJoinAndFailureTest.java | 256 +++++++++++++++++++++
.../IgniteSpiDiscoverySelfTestSuite.java | 2 +
3 files changed, 268 insertions(+), 7 deletions(-)
diff --git a/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java b/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java
index 0304b0f..d3ac604 100644
--- a/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java
+++ b/modules/core/src/main/java/org/apache/ignite/spi/discovery/tcp/ServerImpl.java
@@ -1120,8 +1120,8 @@ class ServerImpl extends TcpDiscoveryImpl {
private boolean sendJoinRequestMessage(DiscoveryDataPacket discoveryData) throws IgniteSpiException {
TcpDiscoveryAbstractMessage joinReq = new TcpDiscoveryJoinRequestMessage(locNode, discoveryData);
- // Time when it has been detected, that addresses from IP finder do not respond.
- long noResStart = 0;
+ // Time when join process started.
+ long joinStart = 0;
while (true) {
Collection<InetSocketAddress> addrs = spi.resolvedAddresses();
@@ -1153,8 +1153,11 @@ class ServerImpl extends TcpDiscoveryImpl {
noResAddrs.remove(addr);
- // Address is responsive, reset period start.
- noResStart = 0;
+ // Join timeout should not be reset if the response was received from another CONNECTING node
+ // (only a CONNECTING node sends back WAIT and CONTINUE_JOIN codes),
+ // otherwise two CONNECTING nodes can get stuck in an infinite loop sending join requests to each other forever.
+ if (res != RES_WAIT && res != RES_CONTINUE_JOIN)
+ joinStart = 0;
switch (res) {
case RES_WAIT:
@@ -1243,9 +1246,9 @@ class ServerImpl extends TcpDiscoveryImpl {
}
if (spi.joinTimeout > 0) {
- if (noResStart == 0)
- noResStart = U.currentTimeMillis();
- else if (U.currentTimeMillis() - noResStart > spi.joinTimeout)
+ if (joinStart == 0)
+ joinStart = U.currentTimeMillis();
+ else if (U.currentTimeMillis() - joinStart > spi.joinTimeout)
throw new IgniteSpiException(
"Failed to connect to any address from IP finder within join timeout " +
"(make sure IP finder addresses are correct, and operating system firewalls are disabled " +
diff --git a/modules/core/src/test/java/org/apache/ignite/spi/discovery/tcp/TcpDiscoveryNodeJoinAndFailureTest.java b/modules/core/src/test/java/org/apache/ignite/spi/discovery/tcp/TcpDiscoveryNodeJoinAndFailureTest.java
new file mode 100644
index 0000000..2cccd0d
--- /dev/null
+++ b/modules/core/src/test/java/org/apache/ignite/spi/discovery/tcp/TcpDiscoveryNodeJoinAndFailureTest.java
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.ignite.spi.discovery.tcp;
+
+import java.io.OutputStream;
+import java.util.Arrays;
+import java.util.UUID;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+import org.apache.ignite.IgniteSystemProperties;
+import org.apache.ignite.configuration.IgniteConfiguration;
+import org.apache.ignite.internal.IgniteInternalFuture;
+import org.apache.ignite.spi.discovery.tcp.ipfinder.TcpDiscoveryIpFinder;
+import org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder;
+import org.apache.ignite.spi.discovery.tcp.messages.TcpDiscoveryAbstractMessage;
+import org.apache.ignite.spi.discovery.tcp.messages.TcpDiscoveryJoinRequestMessage;
+import org.apache.ignite.spi.discovery.tcp.messages.TcpDiscoveryNodeAddFinishedMessage;
+import org.apache.ignite.spi.discovery.tcp.messages.TcpDiscoveryNodeAddedMessage;
+import org.apache.ignite.spi.discovery.tcp.messages.TcpDiscoveryNodeFailedMessage;
+import org.apache.ignite.testframework.GridTestUtils;
+import org.apache.ignite.testframework.junits.WithSystemProperty;
+import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest;
+import org.junit.Test;
+
+/**
+ * Checks that server nodes left in CONNECTING state fail with the join timeout error instead of hanging when no connected node remains.
+ */
+public class TcpDiscoveryNodeJoinAndFailureTest extends GridCommonAbstractTest {
+ /** Instance name of node0; the suffix after '-' is parsed as the discovery port when {@link #usePortFromNodeName} is set. */
+ private static final String NODE_WITH_PORT_ID_0 = "node0-47500";
+
+ /** Instance name of node1; the suffix after '-' is parsed as the discovery port when {@link #usePortFromNodeName} is set. */
+ private static final String NODE_WITH_PORT_ID_1 = "node1-47501";
+
+ /** Instance name of node2; this node gets {@link #specialIpFinder0} and {@link #nodeId} when they are set. */
+ private static final String NODE_WITH_PORT_ID_2 = "node2-47502";
+
+ /** Instance name of node3; this node gets {@link #specialIpFinder1} when it is set. */
+ private static final String NODE_WITH_PORT_ID_3 = "node3-47503";
+
+ /** If {@code true}, the discovery local port is taken from the instance name suffix. */
+ private boolean usePortFromNodeName;
+
+ /** Discovery SPI for the next configured node; if {@code null}, a plain {@link TcpDiscoverySpi} is created. */
+ private TcpDiscoverySpi specialSpi;
+
+ /** IP finder used by node2 when not {@code null}. */
+ private TcpDiscoveryIpFinder specialIpFinder0;
+
+ /** IP finder used by node3 when not {@code null}. */
+ private TcpDiscoveryIpFinder specialIpFinder1;
+
+ private UUID nodeId; // Node ID assigned to node2 when not null (see getConfiguration).
+
+ @Override protected IgniteConfiguration getConfiguration(String igniteInstanceName) throws Exception {
+ IgniteConfiguration cfg = super.getConfiguration(igniteInstanceName);
+
+ TcpDiscoverySpi spi = specialSpi != null ? specialSpi : new TcpDiscoverySpi(); // Test-provided SPI wins over a default one.
+
+ if (usePortFromNodeName)
+ spi.setLocalPort(Integer.parseInt(igniteInstanceName.split("-")[1])); // Instance names look like "node0-47500".
+
+ if (specialIpFinder0 != null && igniteInstanceName.equals(NODE_WITH_PORT_ID_2)) // node2 gets its own IP finder.
+ spi.setIpFinder(specialIpFinder0);
+ else if (specialIpFinder1 != null && igniteInstanceName.equals(NODE_WITH_PORT_ID_3)) // node3 gets its own IP finder.
+ spi.setIpFinder(specialIpFinder1);
+ else
+ spi.setIpFinder(sharedStaticIpFinder); // All other nodes share the common static IP finder.
+
+ spi.setNetworkTimeout(2500);
+
+ spi.setIpFinderCleanFrequency(5000);
+
+ spi.setJoinTimeout(5000); // Non-zero join timeout: the join-timeout check is only active when it is positive.
+
+ spi.setConnectionRecoveryTimeout(0);
+
+ cfg.setDiscoverySpi(spi);
+
+ cfg.setFailureDetectionTimeout(7500);
+
+ if (nodeId != null && igniteInstanceName.equals(NODE_WITH_PORT_ID_2)) // Only node2 receives the predefined node ID.
+ cfg.setNodeId(nodeId);
+
+ return cfg;
+ }
+
+ /**
+ * If the whole ring fails but two server nodes, both in CONNECTING state, remain alive, they should not hang
+ * indefinitely sending join requests to each other.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/IGNITE-11621">IGNITE-11621</a> (the ticket and its comments describe this corner case in detail).
+ *
+ * @throws Exception If failed.
+ */
+ @Test
+ @WithSystemProperty(key = IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE, value = "false")
+ public void testConnectingNodesStopIfNoConnectedNodeIsPresented() throws Exception {
+ /*
+ The test reproduces the needed behavior (two nodes in CONNECTING state) by doing the following:
+ - it starts two regular nodes, node0 (coordinator) and node1 (just another server) with special
+ discovery SPIs;
+ - when node1 receives NodeAddFinished for the subsequently started node2, it doesn't send it to the node
+ but closes the disco socket to node2, leaving it in CONNECTING state and generating NodeFailed for it.
+ Also at this moment node3 is started;
+ - when node0 receives this NodeFailed it fails (because the special SPI throws an exception) and stops;
+ - when node1 receives another join request from node2, or NodeAdded from node3 reaches it back,
+ node1's special SPI also throws an exception so it goes down as well;
+ - as a result, both node2 and node3 get stuck in CONNECTING state, and as they use special IpFinders
+ they see each other and are able to send join requests to each other back and forth.
+
+ The whole purpose of the test is to verify that these two nodes won't get stuck in CONNECTING state forever
+ and will eventually stop.
+ */
+
+ usePortFromNodeName = true;
+
+ final AtomicInteger joinReqsCntr = new AtomicInteger(0); // Counts node2 join requests seen by node0's SPI.
+
+ final AtomicReference<IgniteInternalFuture> futureRef = new AtomicReference(); // Holds the future of the asynchronous node3 start.
+
+ final UUID node2Id = UUID.randomUUID(); // Known in advance so both special SPIs can recognize node2.
+
+ final TcpDiscoverySpi node0SpecialSpi = new TcpDiscoverySpi() {
+ @Override protected void startMessageProcess(TcpDiscoveryAbstractMessage msg) {
+ if (msg instanceof TcpDiscoveryNodeFailedMessage) {
+ TcpDiscoveryNodeFailedMessage failedMsg = (TcpDiscoveryNodeFailedMessage)msg;
+
+ UUID failedNodeId = failedMsg.failedNodeId();
+
+ if (failedNodeId.equals(node2Id)) // node0 is brought down as soon as node2 is reported failed.
+ throw new RuntimeException("Stop node0 exception");
+ }
+
+ if (msg instanceof TcpDiscoveryJoinRequestMessage) {
+ TcpDiscoveryJoinRequestMessage joinReq = (TcpDiscoveryJoinRequestMessage)msg;
+
+ if (joinReq.node().id().equals(node2Id))
+ joinReqsCntr.incrementAndGet();
+ }
+ }
+ };
+
+ final TcpDiscoverySpi node1SpecialSpi = new TcpDiscoverySpi() {
+ @Override protected void startMessageProcess(TcpDiscoveryAbstractMessage msg) {
+ if (msg instanceof TcpDiscoveryNodeAddFinishedMessage) {
+ TcpDiscoveryNodeAddFinishedMessage finishedMsg = (TcpDiscoveryNodeAddFinishedMessage)msg;
+
+ UUID nodeId = finishedMsg.nodeId();
+
+ if (nodeId.equals(node2Id)) {
+ Object workerObj = GridTestUtils.getFieldValue(impl, "msgWorker"); // Reflective access to the SPI implementation's "msgWorker" field.
+
+ OutputStream out = GridTestUtils.getFieldValue(workerObj, "out"); // Per the test description above, this is the disco socket output to node2.
+
+ try {
+ out.close();
+
+ log.warning("Out to 'sick' node closed");
+ }
+ catch (Exception ignored) {
+ // No-op: a failure to close the stream is deliberately ignored.
+ }
+
+ futureRef.set(GridTestUtils.runAsync(() -> {
+ try {
+ startGrid(NODE_WITH_PORT_ID_3);
+ } catch (Exception ignored) {
+ // No-op: a node3 start failure is deliberately ignored.
+ }
+ }));
+ }
+ }
+
+
+ if (msg instanceof TcpDiscoveryJoinRequestMessage) {
+ TcpDiscoveryJoinRequestMessage joinReq = (TcpDiscoveryJoinRequestMessage)msg;
+
+ int joinReqsCount = joinReqsCntr.get(); // Value maintained by node0's special SPI.
+
+ if (joinReq.node().id().equals(node2Id) && joinReqsCount == 1)
+ throw new RuntimeException("Stop node1 exception by subsequent join req");
+ }
+
+ if (msg instanceof TcpDiscoveryNodeAddedMessage) {
+ TcpDiscoveryNodeAddedMessage addedMsg = (TcpDiscoveryNodeAddedMessage)msg;
+
+ if (addedMsg.node().discoveryPort() == 47503) // 47503 is the port encoded in NODE_WITH_PORT_ID_3.
+ throw new RuntimeException("Stop node1 exception by new node added msg");
+ }
+ }
+ };
+
+ specialSpi = node0SpecialSpi;
+
+ startGrid(NODE_WITH_PORT_ID_0);
+
+ specialSpi = node1SpecialSpi;
+
+ startGrid(NODE_WITH_PORT_ID_1);
+
+ specialIpFinder0 = new TcpDiscoveryVmIpFinder(false);
+
+ ((TcpDiscoveryVmIpFinder)specialIpFinder0).setAddresses(Arrays.asList("127.0.0.1:47501","127.0.0.1:47503")); // node2 is given the ports of node1 and node3.
+
+ specialIpFinder1 = new TcpDiscoveryVmIpFinder(false);
+
+ ((TcpDiscoveryVmIpFinder)specialIpFinder1).setAddresses(Arrays.asList("127.0.0.1:47502")); // node3 is given only the port of node2.
+
+ specialSpi = null; // Nodes configured from here on get a plain TcpDiscoverySpi.
+
+ nodeId = node2Id;
+
+ boolean expectedExceptionThrown = false;
+
+ try {
+ startGrid(NODE_WITH_PORT_ID_2);
+ }
+ catch (Exception e) {
+ Throwable cause0 = e.getCause();
+
+ assertNotNull(cause0);
+
+ Throwable cause1 = cause0.getCause(); // The expected message is looked up on the cause of the cause.
+
+ assertNotNull(cause1);
+
+ String errorMsg = cause1.getMessage();
+
+ assertTrue("Expected error message was not found: " + errorMsg, errorMsg.contains("Failed to connect to any address from IP finder"));
+
+ expectedExceptionThrown = true;
+ }
+
+ assertTrue("Expected exception was not thrown.", expectedExceptionThrown);
+
+ IgniteInternalFuture startGridFut = futureRef.get();
+
+ if (startGridFut != null)
+ startGridFut.get(); // Wait for the asynchronous node3 start to finish.
+ }
+}
diff --git a/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteSpiDiscoverySelfTestSuite.java b/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteSpiDiscoverySelfTestSuite.java
index c1d85e5..864290f 100644
--- a/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteSpiDiscoverySelfTestSuite.java
+++ b/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteSpiDiscoverySelfTestSuite.java
@@ -45,6 +45,7 @@ import org.apache.ignite.spi.discovery.tcp.TcpDiscoveryNetworkIssuesTest;
import org.apache.ignite.spi.discovery.tcp.TcpDiscoveryNodeAttributesUpdateOnReconnectTest;
import org.apache.ignite.spi.discovery.tcp.TcpDiscoveryNodeConfigConsistentIdSelfTest;
import org.apache.ignite.spi.discovery.tcp.TcpDiscoveryNodeConsistentIdSelfTest;
+import org.apache.ignite.spi.discovery.tcp.TcpDiscoveryNodeJoinAndFailureTest;
import org.apache.ignite.spi.discovery.tcp.TcpDiscoveryPendingMessageDeliveryTest;
import org.apache.ignite.spi.discovery.tcp.TcpDiscoveryReconnectUnstableTopologyTest;
import org.apache.ignite.spi.discovery.tcp.TcpDiscoveryRestartTest;
@@ -96,6 +97,7 @@ import static org.apache.ignite.IgniteSystemProperties.IGNITE_OVERRIDE_MCAST_GRP
TcpDiscoverySpiConfigSelfTest.class,
TcpDiscoveryMarshallerCheckSelfTest.class,
TcpDiscoverySnapshotHistoryTest.class,
+ TcpDiscoveryNodeJoinAndFailureTest.class,
GridTcpSpiForwardingSelfTest.class,