You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ignite.apache.org by av...@apache.org on 2021/01/29 12:24:47 UTC

[ignite] branch ignite-ducktape updated: IGNITE-14054 : Improve discovery ducktest: add partial network drop. (#8714)

This is an automated email from the ASF dual-hosted git repository.

av pushed a commit to branch ignite-ducktape
in repository https://gitbox.apache.org/repos/asf/ignite.git


The following commit(s) were added to refs/heads/ignite-ducktape by this push:
     new f610f56  IGNITE-14054 : Improve discovery ducktest: add partial network drop. (#8714)
f610f56 is described below

commit f610f56c2db0a41243ed5ac9a8230e21bdb363d4
Author: Vladimir Steshin <vl...@gmail.com>
AuthorDate: Fri Jan 29 15:24:28 2021 +0300

    IGNITE-14054 : Improve discovery ducktest: add partial network drop. (#8714)
---
 .../ignitetest/services/utils/ignite_aware.py      | 35 +++++++++++++++-------
 .../services/utils/templates/discovery_macro.j2    |  3 ++
 .../tests/ignitetest/tests/discovery_test.py       | 30 +++++++++++++------
 3 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/modules/ducktests/tests/ignitetest/services/utils/ignite_aware.py b/modules/ducktests/tests/ignitetest/services/utils/ignite_aware.py
index 89f7a4f..d1a3a38 100644
--- a/modules/ducktests/tests/ignitetest/services/utils/ignite_aware.py
+++ b/modules/ducktests/tests/ignitetest/services/utils/ignite_aware.py
@@ -23,6 +23,7 @@ import sys
 import time
 from abc import abstractmethod, ABCMeta
 from datetime import datetime
+from enum import IntEnum
 from threading import Thread
 
 from ducktape.utils.util import wait_until
@@ -33,6 +34,7 @@ from ignitetest.services.utils.path import IgnitePathAware
 from ignitetest.services.utils.ignite_spec import resolve_spec
 from ignitetest.services.utils.jmx_utils import ignite_jmx_mixin
 from ignitetest.services.utils.log_utils import monitor_log
+from ignitetest.utils.enum import constructible
 
 
 # pylint: disable=too-many-public-methods
@@ -40,6 +42,14 @@ class IgniteAwareService(BackgroundThreadService, IgnitePathAware, metaclass=ABC
     """
     The base class to build services aware of Ignite.
     """
+    @constructible
+    class NetPart(IntEnum):
+        """
+        Direction of network traffic (incoming, outgoing or both) to emulate failure of.
+        """
+        INPUT = 0
+        OUTPUT = 1
+        ALL = 2
 
     # pylint: disable=R0913
     def __init__(self, context, config, num_nodes, startup_timeout_sec, shutdown_timeout_sec, **kwargs):
@@ -296,19 +306,24 @@ class IgniteAwareService(BackgroundThreadService, IgnitePathAware, metaclass=ABC
         """
         return os.path.join(self.temp_dir, "iptables.bak")
 
-    def drop_network(self, nodes=None):
+    def drop_network(self, nodes=None, net_part: NetPart = NetPart.ALL):
         """
         Disconnects node from cluster.
+        :param nodes: Nodes to emulate network failure on.
+        :param net_part: Part of the network (NetPart.INPUT, NetPart.OUTPUT or NetPart.ALL) to emulate failure of.
         """
         if nodes is None:
             assert self.num_nodes == 1
             nodes = self.nodes
 
         for node in nodes:
-            self.logger.info("Dropping ignite connections on '" + node.account.hostname + "' ...")
+            self.logger.info("Dropping " + str(net_part) + " Ignite connections on '" + node.account.hostname + "' ...")
 
         self.__backup_iptables(nodes)
 
+        return self.exec_on_nodes_async(nodes, lambda n: self.__enable_netfilter(n, net_part))
+
+    def __enable_netfilter(self, node, net_part: NetPart):
         cm_spi = self.config.communication_spi
         dsc_spi = self.config.discovery_spi
 
@@ -318,15 +333,15 @@ class IgniteAwareService(BackgroundThreadService, IgnitePathAware, metaclass=ABC
         dsc_ports = str(dsc_spi.port) if not hasattr(dsc_spi, 'port_range') or dsc_spi.port_range < 1 else str(
             dsc_spi.port) + ':' + str(dsc_spi.port + dsc_spi.port_range)
 
-        cmd = f"sudo iptables -I %s 1 -p tcp -m multiport --dport {dsc_ports},{cm_ports} -j DROP"
+        if net_part in (IgniteAwareService.NetPart.ALL, IgniteAwareService.NetPart.INPUT):
+            node.account.ssh_client.exec_command(
+                f"sudo iptables -I INPUT 1 -p tcp -m multiport --dport {dsc_ports},{cm_ports} -j DROP")
+
+        if net_part in (IgniteAwareService.NetPart.ALL, IgniteAwareService.NetPart.OUTPUT):
+            node.account.ssh_client.exec_command(
+                f"sudo iptables -I OUTPUT 1 -p tcp -m multiport --dport {dsc_ports},{cm_ports} -j DROP")
 
-        return self.exec_on_nodes_async(nodes,
-                                        lambda n: (n.account.ssh_client.exec_command(cmd % "INPUT"),
-                                                   n.account.ssh_client.exec_command(cmd % "OUTPUT"),
-                                                   self.logger.debug("Activated netfilter on '%s': %s" %
-                                                                     (n.name, self.__dump_netfilter_settings(n)))
-                                                   )
-                                        )
+        self.logger.debug("Activated netfilter on '%s': %s" % (node.name, self.__dump_netfilter_settings(node)))
 
     def __backup_iptables(self, nodes):
         # Store current network filter settings.
diff --git a/modules/ducktests/tests/ignitetest/services/utils/templates/discovery_macro.j2 b/modules/ducktests/tests/ignitetest/services/utils/templates/discovery_macro.j2
index acbc53f..9ded542 100644
--- a/modules/ducktests/tests/ignitetest/services/utils/templates/discovery_macro.j2
+++ b/modules/ducktests/tests/ignitetest/services/utils/templates/discovery_macro.j2
@@ -53,6 +53,9 @@
         {% endif %}
         <property name="localPort" value="{{ spi.port }}"/>
         <property name="localPortRange" value="{{ spi.port_range }}"/>
+        {% if spi.connRecoveryTimeout is defined %}
+            <property name="connectionRecoveryTimeout" value="{{ spi.connRecoveryTimeout }}"/>
+        {% endif %}
         {{ ip_finder(spi) }}
     </bean>
 {% endmacro %}
diff --git a/modules/ducktests/tests/ignitetest/tests/discovery_test.py b/modules/ducktests/tests/ignitetest/tests/discovery_test.py
index 07d1d7a..6ce08a0 100644
--- a/modules/ducktests/tests/ignitetest/tests/discovery_test.py
+++ b/modules/ducktests/tests/ignitetest/tests/discovery_test.py
@@ -61,6 +61,8 @@ class DiscoveryTestConfig(NamedTuple):
     sequential_failure: bool = False
     with_zk: bool = False
     failure_detection_timeout: int = 1000
+    disable_conn_recovery: bool = False
+    net_part: IgniteAwareService.NetPart = IgniteService.NetPart.ALL
 
 
 # pylint: disable=W0223, no-member
@@ -88,29 +90,35 @@ class DiscoveryTest(IgniteTest):
 
     @cluster(num_nodes=MAX_CONTAINERS)
     @ignite_versions(str(DEV_BRANCH), str(LATEST))
-    @matrix(nodes_to_kill=[1, 2], failure_detection_timeout=[FAILURE_TIMEOUT],
+    @matrix(nodes_to_kill=[1, 2], failure_detection_timeout=[FAILURE_TIMEOUT], disable_conn_recovery=[False, True],
+            net_part=[IgniteService.NetPart.ALL, IgniteService.NetPart.INPUT],
             load_type=[ClusterLoad.NONE, ClusterLoad.ATOMIC, ClusterLoad.TRANSACTIONAL])
-    def test_nodes_fail_not_sequential_tcp(self, ignite_version, nodes_to_kill, load_type, failure_detection_timeout):
+    def test_nodes_fail_not_sequential_tcp(self, ignite_version, nodes_to_kill, load_type, failure_detection_timeout,
+                                           disable_conn_recovery: bool, net_part: IgniteService.NetPart):
         """
         Test nodes failure scenario with TcpDiscoverySpi not allowing nodes to fail in a row.
         """
         test_config = DiscoveryTestConfig(version=IgniteVersion(ignite_version), nodes_to_kill=nodes_to_kill,
                                           load_type=ClusterLoad.construct_from(load_type), sequential_failure=False,
-                                          failure_detection_timeout=failure_detection_timeout)
+                                          failure_detection_timeout=failure_detection_timeout,
+                                          disable_conn_recovery=disable_conn_recovery, net_part=net_part)
 
         return self._perform_node_fail_scenario(test_config)
 
     @cluster(num_nodes=MAX_CONTAINERS)
     @ignite_versions(str(DEV_BRANCH), str(LATEST))
     @matrix(load_type=[ClusterLoad.NONE, ClusterLoad.ATOMIC, ClusterLoad.TRANSACTIONAL],
-            failure_detection_timeout=[FAILURE_TIMEOUT])
-    def test_2_nodes_fail_sequential_tcp(self, ignite_version, load_type, failure_detection_timeout):
+            net_part=[IgniteService.NetPart.ALL, IgniteService.NetPart.INPUT],
+            failure_detection_timeout=[FAILURE_TIMEOUT], disable_conn_recovery=[False, True])
+    def test_2_nodes_fail_sequential_tcp(self, ignite_version, load_type, failure_detection_timeout,
+                                         disable_conn_recovery: bool, net_part: IgniteService.NetPart):
         """
         Test 2 nodes sequential failure scenario with TcpDiscoverySpi.
         """
         test_config = DiscoveryTestConfig(version=IgniteVersion(ignite_version), nodes_to_kill=2,
                                           load_type=ClusterLoad.construct_from(load_type), sequential_failure=True,
-                                          failure_detection_timeout=failure_detection_timeout)
+                                          failure_detection_timeout=failure_detection_timeout,
+                                          disable_conn_recovery=disable_conn_recovery, net_part=net_part)
 
         return self._perform_node_fail_scenario(test_config)
 
@@ -169,6 +177,9 @@ class DiscoveryTest(IgniteTest):
             if LATEST_2_7 < test_config.version <= V_2_9_0:
                 discovery_spi.so_linger = 0
 
+            if test_config.disable_conn_recovery:
+                discovery_spi.connRecoveryTimeout = 0
+
         ignite_config = IgniteConfiguration(
             version=test_config.version,
             discovery_spi=discovery_spi,
@@ -205,11 +216,12 @@ class DiscoveryTest(IgniteTest):
             start_load_app(self.test_context, ignite_config=load_config, params=params, modules=modules)
 
         results.update(self._simulate_and_detect_failure(servers, failed_nodes,
-                                                         test_config.failure_detection_timeout * 4))
+                                                         test_config.failure_detection_timeout * 4,
+                                                         test_config.net_part))
 
         return results
 
-    def _simulate_and_detect_failure(self, servers, failed_nodes, timeout):
+    def _simulate_and_detect_failure(self, servers, failed_nodes, timeout, net_part: IgniteAwareService.NetPart):
         """
         Perform node failure scenario
         """
@@ -219,7 +231,7 @@ class DiscoveryTest(IgniteTest):
 
         ids_to_wait = [node_id(n) for n in failed_nodes]
 
-        _, first_terminated = servers.drop_network(failed_nodes)
+        _, first_terminated = servers.drop_network(failed_nodes, net_part=net_part)
 
         # Keeps dates of logged node failures.
         logged_timestamps = []