You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@ignite.apache.org by GitBox <gi...@apache.org> on 2020/10/16 13:38:29 UTC

[GitHub] [ignite] anton-vinogradov commented on a change in pull request #8211: Ducktests iptables

anton-vinogradov commented on a change in pull request #8211:
URL: https://github.com/apache/ignite/pull/8211#discussion_r506261241



##########
File path: modules/ducktests/tests/ignitetest/tests/discovery_test.py
##########
@@ -149,18 +179,134 @@ def _perform_node_fail_scenario(self, test_config):
 
             start_load_app(self.test_context, ignite_config=load_config, params=params, modules=modules)
 
-        data = simulate_nodes_failure(servers, failed_nodes, survived_node)
+        for node in failed_nodes:
+            self.logger.info(
+                "Simulating failure of node '%s' (order %d) on '%s'" % (node_id(node), order(node), node.name))
+
+        data = self._simulate_nodes_failure(servers, node_fail_task(ignite_config, test_config), failed_nodes,
+                                            survived_node)
 
         data['Ignite cluster start time (s)'] = start_servers_sec
 
         return data
 
+    def _simulate_nodes_failure(self, servers, kill_node_task, failed_nodes, survived_node):
+        """
+        Perform node failure scenario
+        """
+        ids_to_wait = [node_id(n) for n in failed_nodes]
+
+        _, first_terminated = servers.exec_on_nodes_async(failed_nodes, kill_node_task)
+
+        for node in failed_nodes:
+            self.logger.debug(
+                "Netfilter activated on '%s': %s" % (node.name, dump_netfilter_settings(node)))
+
+        # Keeps dates of logged node failures.
+        logged_timestamps = []
+        data = {}
+
+        for failed_id in ids_to_wait:
+            servers.await_event_on_node(failed_pattern(failed_id), survived_node, 15, from_the_beginning=True,
+                                        backoff_sec=0.3)
+
+            _, stdout, _ = survived_node.account.ssh_client.exec_command(

Review comment:
       should we check all alive nodes for detection duration?

##########
File path: modules/ducktests/tests/ignitetest/tests/discovery_test.py
##########
@@ -149,18 +179,134 @@ def _perform_node_fail_scenario(self, test_config):
 
             start_load_app(self.test_context, ignite_config=load_config, params=params, modules=modules)
 
-        data = simulate_nodes_failure(servers, failed_nodes, survived_node)
+        for node in failed_nodes:
+            self.logger.info(
+                "Simulating failure of node '%s' (order %d) on '%s'" % (node_id(node), order(node), node.name))
+
+        data = self._simulate_nodes_failure(servers, node_fail_task(ignite_config, test_config), failed_nodes,
+                                            survived_node)
 
         data['Ignite cluster start time (s)'] = start_servers_sec
 
         return data
 
+    def _simulate_nodes_failure(self, servers, kill_node_task, failed_nodes, survived_node):
+        """
+        Perform node failure scenario
+        """
+        ids_to_wait = [node_id(n) for n in failed_nodes]
+
+        _, first_terminated = servers.exec_on_nodes_async(failed_nodes, kill_node_task)
+
+        for node in failed_nodes:
+            self.logger.debug(
+                "Netfilter activated on '%s': %s" % (node.name, dump_netfilter_settings(node)))

Review comment:
       why not inline this into node_fail_task?

##########
File path: modules/ducktests/tests/ignitetest/tests/discovery_test.py
##########
@@ -149,18 +179,134 @@ def _perform_node_fail_scenario(self, test_config):
 
             start_load_app(self.test_context, ignite_config=load_config, params=params, modules=modules)
 
-        data = simulate_nodes_failure(servers, failed_nodes, survived_node)
+        for node in failed_nodes:
+            self.logger.info(
+                "Simulating failure of node '%s' (order %d) on '%s'" % (node_id(node), order(node), node.name))
+
+        data = self._simulate_nodes_failure(servers, node_fail_task(ignite_config, test_config), failed_nodes,
+                                            survived_node)
 
         data['Ignite cluster start time (s)'] = start_servers_sec

Review comment:
       should this be placed close to where start_servers_sec is set?

##########
File path: modules/ducktests/tests/ignitetest/tests/discovery_test.py
##########
@@ -131,14 +161,14 @@ def _perform_node_fail_scenario(self, test_config):
 
         servers, start_servers_sec = start_servers(self.test_context, self.NUM_NODES - 1, ignite_config, modules)

Review comment:
       self.NUM_NODES - 1 (6 nodes) seems to be too small a cluster

##########
File path: modules/ducktests/tests/ignitetest/tests/discovery_test.py
##########
@@ -149,18 +179,134 @@ def _perform_node_fail_scenario(self, test_config):
 
             start_load_app(self.test_context, ignite_config=load_config, params=params, modules=modules)
 
-        data = simulate_nodes_failure(servers, failed_nodes, survived_node)
+        for node in failed_nodes:
+            self.logger.info(
+                "Simulating failure of node '%s' (order %d) on '%s'" % (node_id(node), order(node), node.name))
+
+        data = self._simulate_nodes_failure(servers, node_fail_task(ignite_config, test_config), failed_nodes,

Review comment:
       any reason not to encapsulate node_fail_task (i.e., to call it outside the method) when we have only one usage of _simulate_nodes_failure?

##########
File path: modules/ducktests/tests/ignitetest/tests/discovery_test.py
##########
@@ -149,18 +179,134 @@ def _perform_node_fail_scenario(self, test_config):
 
             start_load_app(self.test_context, ignite_config=load_config, params=params, modules=modules)
 
-        data = simulate_nodes_failure(servers, failed_nodes, survived_node)
+        for node in failed_nodes:
+            self.logger.info(
+                "Simulating failure of node '%s' (order %d) on '%s'" % (node_id(node), order(node), node.name))

Review comment:
       why not a part of _simulate_nodes_failure or node_fail_task?

##########
File path: modules/ducktests/tests/ignitetest/tests/discovery_test.py
##########
@@ -149,18 +179,134 @@ def _perform_node_fail_scenario(self, test_config):
 
             start_load_app(self.test_context, ignite_config=load_config, params=params, modules=modules)
 
-        data = simulate_nodes_failure(servers, failed_nodes, survived_node)
+        for node in failed_nodes:
+            self.logger.info(
+                "Simulating failure of node '%s' (order %d) on '%s'" % (node_id(node), order(node), node.name))
+
+        data = self._simulate_nodes_failure(servers, node_fail_task(ignite_config, test_config), failed_nodes,
+                                            survived_node)
 
         data['Ignite cluster start time (s)'] = start_servers_sec
 
         return data
 
+    def _simulate_nodes_failure(self, servers, kill_node_task, failed_nodes, survived_node):
+        """
+        Perform node failure scenario
+        """
+        ids_to_wait = [node_id(n) for n in failed_nodes]
+
+        _, first_terminated = servers.exec_on_nodes_async(failed_nodes, kill_node_task)
+
+        for node in failed_nodes:
+            self.logger.debug(
+                "Netfilter activated on '%s': %s" % (node.name, dump_netfilter_settings(node)))
+
+        # Keeps dates of logged node failures.
+        logged_timestamps = []
+        data = {}
+
+        for failed_id in ids_to_wait:
+            servers.await_event_on_node(failed_pattern(failed_id), survived_node, 15, from_the_beginning=True,
+                                        backoff_sec=0.3)
+
+            _, stdout, _ = survived_node.account.ssh_client.exec_command(
+                "grep '%s' %s" % (failed_pattern(failed_id), IgniteAwareService.STDOUT_STDERR_CAPTURE))
+
+            logged_timestamps.append(
+                datetime.strptime(re.match("^\\[[^\\[]+\\]", stdout.read().decode("utf-8")).group(),
+                                  "[%Y-%m-%d %H:%M:%S,%f]"))

Review comment:
       should this be an IgniteApplication feature?

##########
File path: modules/ducktests/tests/ignitetest/tests/discovery_test.py
##########
@@ -149,18 +179,134 @@ def _perform_node_fail_scenario(self, test_config):
 
             start_load_app(self.test_context, ignite_config=load_config, params=params, modules=modules)
 
-        data = simulate_nodes_failure(servers, failed_nodes, survived_node)
+        for node in failed_nodes:
+            self.logger.info(
+                "Simulating failure of node '%s' (order %d) on '%s'" % (node_id(node), order(node), node.name))
+
+        data = self._simulate_nodes_failure(servers, node_fail_task(ignite_config, test_config), failed_nodes,
+                                            survived_node)
 
         data['Ignite cluster start time (s)'] = start_servers_sec
 
         return data
 
+    def _simulate_nodes_failure(self, servers, kill_node_task, failed_nodes, survived_node):
+        """
+        Perform node failure scenario
+        """
+        ids_to_wait = [node_id(n) for n in failed_nodes]
+
+        _, first_terminated = servers.exec_on_nodes_async(failed_nodes, kill_node_task)
+
+        for node in failed_nodes:
+            self.logger.debug(
+                "Netfilter activated on '%s': %s" % (node.name, dump_netfilter_settings(node)))
+
+        # Keeps dates of logged node failures.
+        logged_timestamps = []
+        data = {}
+
+        for failed_id in ids_to_wait:
+            servers.await_event_on_node(failed_pattern(failed_id), survived_node, 15, from_the_beginning=True,
+                                        backoff_sec=0.3)
+
+            _, stdout, _ = survived_node.account.ssh_client.exec_command(
+                "grep '%s' %s" % (failed_pattern(failed_id), IgniteAwareService.STDOUT_STDERR_CAPTURE))
+
+            logged_timestamps.append(
+                datetime.strptime(re.match("^\\[[^\\[]+\\]", stdout.read().decode("utf-8")).group(),
+                                  "[%Y-%m-%d %H:%M:%S,%f]"))
+
+        self._check_results(failed_nodes, survived_node)
+
+        logged_timestamps.sort(reverse=True)
+
+        first_kill_time = epoch_mills(first_terminated)
+        detection_delay = epoch_mills(logged_timestamps[0]) - first_kill_time
+
+        data['Detection of node(s) failure (ms)'] = detection_delay
+        data['All detection delays (ms):'] = str([epoch_mills(ts) - first_kill_time for ts in logged_timestamps])
+        data['Nodes failed'] = len(failed_nodes)
+
+        return data
+
+    def _check_results(self, failed_nodes, survived_node):
+        """Ensures test finishes correctly."""
+        cmd = "grep '%s' %s | wc -l" % (failed_pattern(), IgniteAwareService.STDOUT_STDERR_CAPTURE)
+
+        failed_cnt = int(str(survived_node.account.ssh_client.exec_command(cmd)[1].read(), sys.getdefaultencoding()))
+
+        if failed_cnt != len(failed_nodes):
+            failed = str(survived_node.account.ssh_client.exec_command(
+                "grep '%s' %s" % (failed_pattern(), IgniteAwareService.STDOUT_STDERR_CAPTURE))[1].read(),
+                         sys.getdefaultencoding())
+
+            self.logger.warn("Node '%s' (%s) has detected the following failures:%s%s" % (
+                survived_node.name, node_id(survived_node), os.linesep, failed))
+
+            raise AssertionError(
+                "Wrong number of failed nodes: %d. Expected: %d. Check the logs." % (failed_cnt, len(failed_nodes)))
+
+        for service in [srv for srv in self.test_context.services if isinstance(srv, IgniteAwareService)]:
+            for node in [srv_node for srv_node in service.nodes if srv_node not in failed_nodes]:
+                cmd = "grep -i '%s' %s | wc -l" % ("local node segmented", IgniteAwareService.STDOUT_STDERR_CAPTURE)
+
+                failed = str(node.account.ssh_client.exec_command(cmd)[1].read(), sys.getdefaultencoding())
+
+                if int(failed) > 0:
+                    raise AssertionError(
+                        "Wrong node failed (segmented) on '%s'. Check the logs." % node.name)
+
+    def setup(self):
+        super().setup()
+
+        self.netfilter_store_path = os.path.join(self.tmp_path_root, "iptables.bak")
+
+        # Store current network filter settings.
+        for node in self.test_context.cluster.nodes:
+            cmd = "sudo iptables-save | tee " + self.netfilter_store_path
+
+            exec_error = str(node.account.ssh_client.exec_command(cmd)[2].read(), sys.getdefaultencoding())
+
+            if "Warning: iptables-legacy tables present" in exec_error:
+                cmd = "sudo iptables-legacy-save | tee " + self.netfilter_store_path
+
+                exec_error = str(node.account.ssh_client.exec_command(cmd)[2].read(), sys.getdefaultencoding())
+
+            assert len(exec_error) == 0, "Failed to store iptables rules on '%s': %s" % (node.name, exec_error)
+
+            self.logger.debug("Netfilter before launch on '%s': %s" % (node.name, dump_netfilter_settings(node)))
+
+    def teardown(self):
+        # Restore previous network filter settings.
+        cmd = "sudo iptables-restore < " + self.netfilter_store_path
+
+        errors = []
+
+        for node in self.test_context.cluster.nodes:
+            exec_error = str(node.account.ssh_client.exec_command(cmd)[2].read(), sys.getdefaultencoding())
+
+            if len(exec_error) > 0:
+                errors.append("Failed to restore iptables rules on '%s': %s" % (node.name, exec_error))
+            else:
+                self.logger.debug("Netfilter after launch on '%s': %s" % (node.name, dump_netfilter_settings(node)))
+
+        if len(errors) > 0:
+            self.logger.error("Failed restoring actions:" + os.linesep + os.linesep.join(errors))
+
+            raise RuntimeError("Unable to restore node states. See the log above.")
+
+        super().teardown()

Review comment:
       should this be at some test_superclass?

##########
File path: modules/ducktests/tests/ignitetest/tests/discovery_test.py
##########
@@ -149,18 +179,134 @@ def _perform_node_fail_scenario(self, test_config):
 
             start_load_app(self.test_context, ignite_config=load_config, params=params, modules=modules)
 
-        data = simulate_nodes_failure(servers, failed_nodes, survived_node)
+        for node in failed_nodes:
+            self.logger.info(
+                "Simulating failure of node '%s' (order %d) on '%s'" % (node_id(node), order(node), node.name))
+
+        data = self._simulate_nodes_failure(servers, node_fail_task(ignite_config, test_config), failed_nodes,
+                                            survived_node)
 
         data['Ignite cluster start time (s)'] = start_servers_sec
 
         return data
 
+    def _simulate_nodes_failure(self, servers, kill_node_task, failed_nodes, survived_node):
+        """
+        Perform node failure scenario
+        """
+        ids_to_wait = [node_id(n) for n in failed_nodes]
+
+        _, first_terminated = servers.exec_on_nodes_async(failed_nodes, kill_node_task)
+
+        for node in failed_nodes:
+            self.logger.debug(
+                "Netfilter activated on '%s': %s" % (node.name, dump_netfilter_settings(node)))
+
+        # Keeps dates of logged node failures.
+        logged_timestamps = []
+        data = {}
+
+        for failed_id in ids_to_wait:
+            servers.await_event_on_node(failed_pattern(failed_id), survived_node, 15, from_the_beginning=True,
+                                        backoff_sec=0.3)
+
+            _, stdout, _ = survived_node.account.ssh_client.exec_command(
+                "grep '%s' %s" % (failed_pattern(failed_id), IgniteAwareService.STDOUT_STDERR_CAPTURE))
+
+            logged_timestamps.append(
+                datetime.strptime(re.match("^\\[[^\\[]+\\]", stdout.read().decode("utf-8")).group(),
+                                  "[%Y-%m-%d %H:%M:%S,%f]"))
+
+        self._check_results(failed_nodes, survived_node)
+
+        logged_timestamps.sort(reverse=True)
+
+        first_kill_time = epoch_mills(first_terminated)
+        detection_delay = epoch_mills(logged_timestamps[0]) - first_kill_time
+
+        data['Detection of node(s) failure (ms)'] = detection_delay
+        data['All detection delays (ms):'] = str([epoch_mills(ts) - first_kill_time for ts in logged_timestamps])
+        data['Nodes failed'] = len(failed_nodes)
+
+        return data
+
+    def _check_results(self, failed_nodes, survived_node):

Review comment:
       let's split this into a failed-node-count check and a segmentation check




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org