Posted to common-commits@hadoop.apache.org by sh...@apache.org on 2019/04/03 16:49:50 UTC

[hadoop] branch trunk updated: HDDS-1164. Add New blockade Tests to test Replica Manager. Contributed by Nilotpal Nandi.

This is an automated email from the ASF dual-hosted git repository.

shashikant pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 7cd7045  HDDS-1164. Add New blockade Tests to test Replica Manager. Contributed by Nilotpal Nandi.
7cd7045 is described below

commit 7cd7045eea24ee0001f9069be370a028c64e36a6
Author: Shashikant Banerjee <sh...@apache.org>
AuthorDate: Wed Apr 3 22:19:10 2019 +0530

    HDDS-1164. Add New blockade Tests to test Replica Manager. Contributed by Nilotpal Nandi.
---
 .../src/main/blockade/blockadeUtils/blockade.py    |   5 +-
 .../main/blockade/clusterUtils/cluster_utils.py    |  23 +-
 .../main/blockade/test_blockade_client_failure.py  |   9 +-
 .../blockade/test_blockade_datanode_isolation.py   |   5 +-
 .../main/blockade/test_blockade_mixed_failure.py   | 152 +++++++-----
 ...t_blockade_mixed_failure_three_nodes_isolate.py | 255 ++++++++++++++-------
 .../test_blockade_mixed_failure_two_nodes.py       | 188 +++++++++------
 .../main/blockade/test_blockade_scm_isolation.py   |   5 +-
 8 files changed, 410 insertions(+), 232 deletions(-)
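
Across these files, the per-test filter() calls that located the OM, SCM, client and datanode containers are consolidated into a single ClusterUtils.find_om_scm_client_datanodes helper. For illustration, a minimal standalone sketch of the classification that helper performs, with made-up container names and assuming the Python 2 filter() semantics (a list is returned) that these tests rely on:

    def find_om_scm_client_datanodes(container_list):
        # mirrors the helper added to cluster_utils.py in this patch
        om = filter(lambda x: 'om_1' in x, container_list)
        scm = filter(lambda x: 'scm' in x, container_list)
        datanodes = sorted(
            list(filter(lambda x: 'datanode' in x, container_list)))
        client = filter(lambda x: 'ozone_client' in x, container_list)
        return om, scm, client, datanodes

    containers = ['ozoneblockade_om_1', 'ozoneblockade_scm_1',
                  'ozoneblockade_datanode_1', 'ozoneblockade_datanode_2',
                  'ozoneblockade_datanode_3', 'ozoneblockade_ozone_client_1']
    OM, SCM, CLIENT, DATANODES = find_om_scm_client_datanodes(containers)
    assert OM[0] == 'ozoneblockade_om_1' and len(DATANODES) == 3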

diff --git a/hadoop-ozone/dist/src/main/blockade/blockadeUtils/blockade.py b/hadoop-ozone/dist/src/main/blockade/blockadeUtils/blockade.py
index c3d1bbb..f371865 100644
--- a/hadoop-ozone/dist/src/main/blockade/blockadeUtils/blockade.py
+++ b/hadoop-ozone/dist/src/main/blockade/blockadeUtils/blockade.py
@@ -45,9 +45,8 @@ class Blockade(object):
     @classmethod
     def make_flaky(cls, flaky_node, container_list):
         # make the network flaky
-        om = filter(lambda x: 'ozoneManager' in x, container_list)
-        scm = filter(lambda x: 'scm' in x, container_list)
-        datanodes = filter(lambda x: 'datanode' in x, container_list)
+        om, scm, _, datanodes = \
+            ClusterUtils.find_om_scm_client_datanodes(container_list)
         node_dict = {
                 "all": "--all",
                 "scm" : scm[0],
diff --git a/hadoop-ozone/dist/src/main/blockade/clusterUtils/cluster_utils.py b/hadoop-ozone/dist/src/main/blockade/clusterUtils/cluster_utils.py
index baa3960..3a04103 100644
--- a/hadoop-ozone/dist/src/main/blockade/clusterUtils/cluster_utils.py
+++ b/hadoop-ozone/dist/src/main/blockade/clusterUtils/cluster_utils.py
@@ -71,7 +71,7 @@ class ClusterUtils(object):
   @classmethod
   def run_freon(cls, docker_compose_file, num_volumes, num_buckets,
                 num_keys, key_size, replication_type, replication_factor,
-                freon_client='ozoneManager'):
+                freon_client='om'):
     # run freon
     cmd = "docker-compose -f %s " \
           "exec %s /opt/hadoop/bin/ozone " \
@@ -115,7 +115,7 @@ class ClusterUtils(object):
   @classmethod
   def get_ozone_confkey_value(cls, docker_compose_file, key_name):
     cmd = "docker-compose -f %s " \
-          "exec ozoneManager /opt/hadoop/bin/ozone " \
+          "exec om /opt/hadoop/bin/ozone " \
           "getconf -confKey %s" \
           % (docker_compose_file, key_name)
     exit_code, output = cls.run_cmd(cmd)
@@ -307,3 +307,22 @@ class ClusterUtils(object):
     checksum = finaloutput.split(" ")
     logger.info("Checksum of %s is : %s", filepath, checksum[0])
     return checksum[0]
+
+  @classmethod
+  def get_pipelines(cls, docker_compose_file):
+    command = "docker-compose -f %s " \
+                         + "exec ozone_client /opt/hadoop/bin/ozone scmcli " \
+                         + "listPipelines" % (docker_compose_file)
+    exit_code, output = cls.run_cmd(command)
+    assert exit_code == 0, "list pipeline command failed"
+    return output
+
+  @classmethod
+  def find_om_scm_client_datanodes(cls, container_list):
+
+    om = filter(lambda x: 'om_1' in x, container_list)
+    scm = filter(lambda x: 'scm' in x, container_list)
+    datanodes = sorted(
+        list(filter(lambda x: 'datanode' in x, container_list)))
+    client = filter(lambda x: 'ozone_client' in x, container_list)
+    return om, scm, client, datanodes
\ No newline at end of file
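
The new get_pipelines helper shells out to "ozone scmcli listPipelines" through the ozone_client container and returns the raw output; the reworked tests later grep that output, for example for "Factor:THREE". A hedged sketch of such a check, where the helper name and the sample output line are illustrative only:

    import re

    def has_factor_three_pipeline(listpipelines_output):
        # True when any pipeline with replication factor THREE is reported
        return re.search("Factor:THREE", listpipelines_output) is not None

    sample = "Pipeline[ Type:RATIS, Factor:THREE, State:OPEN ]"
    assert has_factor_three_pipeline(sample)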
diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_client_failure.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_client_failure.py
index c9e4ae5..8c0b518 100644
--- a/hadoop-ozone/dist/src/main/blockade/test_blockade_client_failure.py
+++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_client_failure.py
@@ -47,11 +47,8 @@ def setup():
     exit_code, output = Blockade.blockade_status()
     assert exit_code == 0, "blockade status command failed with output=[%s]" % \
                            output
-    OM = filter(lambda x: 'ozoneManager' in x, CONTAINER_LIST)
-    SCM = filter(lambda x: 'scm' in x, CONTAINER_LIST)
-    DATANODES = sorted(list(filter(lambda x: 'datanode' in x, CONTAINER_LIST)))
-    CLIENT = filter(lambda x: 'ozone_client' in x, CONTAINER_LIST)
-
+    OM, SCM, CLIENT, DATANODES = \
+        ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST)
     exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS",
                                                "THREE", "ozone_client")
     assert exit_code == 0, "freon run failed with output=[%s]" % output
@@ -121,4 +118,4 @@ def test_client_failure_isolate_one_datanode():
                          test_key_name, "/tmp/")
     key_checksum = ClusterUtils.find_checksum(FILE, "/tmp/%s" % test_key_name)
 
-    assert key_checksum == ORIG_CHECKSUM
+    assert key_checksum == ORIG_CHECKSUM
\ No newline at end of file
diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_datanode_isolation.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_datanode_isolation.py
index fecd9d1..1e53a32 100644
--- a/hadoop-ozone/dist/src/main/blockade/test_blockade_datanode_isolation.py
+++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_datanode_isolation.py
@@ -42,9 +42,8 @@ def setup():
   exit_code, output = Blockade.blockade_status()
   assert exit_code == 0, "blockade status command failed with output=[%s]" % \
                          output
-  OM = [x for x in CONTAINER_LIST if 'ozoneManager' in x]
-  SCM = [x for x in CONTAINER_LIST if 'scm' in x]
-  DATANODES = sorted(x for x in CONTAINER_LIST if 'datanode' in x)
+  OM, SCM, _, DATANODES = \
+    ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST)
 
   exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS",
                                              "THREE")
diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure.py
index 59755e0..8493ce0 100644
--- a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure.py
+++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure.py
@@ -18,16 +18,17 @@
 import os
 import time
 import logging
+import re
 from blockadeUtils.blockade import Blockade
 from clusterUtils.cluster_utils import ClusterUtils
 
-
 logger = logging.getLogger(__name__)
 parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 FILE = os.path.join(parent_dir, "compose", "ozoneblockade",
                     "docker-compose.yaml")
 os.environ["DOCKER_COMPOSE_FILE"] = FILE
 SCALE = 3
+INCREASED_SCALE = 5
 CONTAINER_LIST = []
 OM = []
 SCM = []
@@ -35,83 +36,114 @@ DATANODES = []
 
 
 def setup():
-    global CONTAINER_LIST, OM, SCM, DATANODES
-    Blockade.blockade_destroy()
-    CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)
-    exit_code, output = Blockade.blockade_status()
-    assert exit_code == 0, "blockade status command failed with output=[%s]" % \
-                           output
-    OM = filter(lambda x: 'ozoneManager' in x, CONTAINER_LIST)
-    SCM = filter(lambda x: 'scm' in x, CONTAINER_LIST)
-    DATANODES = sorted(list(filter(lambda x: 'datanode' in x, CONTAINER_LIST)))
+  global CONTAINER_LIST, OM, SCM, DATANODES
+  Blockade.blockade_destroy()
+  CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)
+  exit_code, output = Blockade.blockade_status()
+  assert exit_code == 0, "blockade status command failed with output=[%s]" % \
+                         output
+  OM, SCM, _, DATANODES = \
+    ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST)
 
-    exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS",
-                                               "THREE")
-    assert exit_code == 0, "freon run failed with output=[%s]" % output
+  exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS",
+                                             "THREE")
+  assert exit_code == 0, "freon run failed with output=[%s]" % output
 
 
 def teardown():
-    logger.info("Inside teardown")
-    Blockade.blockade_destroy()
+  logger.info("Inside teardown")
+  Blockade.blockade_destroy()
 
 
 def teardown_module():
-    ClusterUtils.cluster_destroy(FILE)
+  ClusterUtils.cluster_destroy(FILE)
 
 
-def test_one_dn_isolate_scm_other_dn():
-    """
-    In this test, one of the datanodes cannot communicate with SCM and other
-    datanodes.
-    Other datanodes can communicate with each other and SCM .
-    Expectation : The container should eventually have two closed replicas.
-    """
-    first_set = [OM[0], SCM[0], DATANODES[1], DATANODES[2]]
-    second_set = [OM[0], DATANODES[0]]
-    Blockade.blockade_create_partition(first_set, second_set)
+def test_one_dn_isolate_scm_other_dn(run_second_phase):
+  """
+  In this test, one of the datanodes cannot communicate with SCM and other
+  datanodes.
+  Other datanodes can communicate with each other and SCM.
+  Expectation : The container should eventually have two closed replicas.
+  """
+  first_set = [OM[0], SCM[0], DATANODES[1], DATANODES[2]]
+  second_set = [OM[0], DATANODES[0]]
+  Blockade.blockade_create_partition(first_set, second_set)
+  Blockade.blockade_status()
+  ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+  logger.info("Waiting for %s seconds before checking container status",
+              os.environ["CONTAINER_STATUS_SLEEP"])
+  time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+  all_datanodes_container_status = \
+    ClusterUtils.findall_container_status(FILE, SCALE)
+  count_closed_container_datanodes = filter(lambda x: x == 'CLOSED',
+                                            all_datanodes_container_status)
+  assert len(count_closed_container_datanodes) == 2, \
+    "The container should have two closed replicas."
+  if str(run_second_phase).lower() == "true":
+    ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
     Blockade.blockade_status()
-    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
     logger.info("Waiting for %s seconds before checking container status",
                 os.environ["CONTAINER_STATUS_SLEEP"])
     time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
     all_datanodes_container_status = \
-        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
-    count_closed_container_datanodes = filter(lambda x: x == 'CLOSED',
-                                              all_datanodes_container_status)
-    assert len(count_closed_container_datanodes) == 2, \
-        "The container should have two closed replicas."
+      ClusterUtils.findall_container_status(
+        FILE, INCREASED_SCALE)
+    count_closed_container_datanodes = filter(
+      lambda x: x == 'CLOSED', all_datanodes_container_status)
+    assert len(count_closed_container_datanodes) >= 3, \
+      "The container should have at least three closed replicas."
+    _, output = \
+      ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+    assert re.search("Status: Success", output) is not None
 
 
-def test_one_dn_isolate_other_dn():
-    """
-    In this test, one of the datanodes (first datanode) cannot communicate
-    other datanodes but can communicate with SCM.
-    One of the other two datanodes (second datanode) cannot communicate with
-    SCM.
-    Expectation :
-    The container replica state in first datanode can be either closed or
-    quasi-closed.
-    The container replica state in second datanode can be either closed or open.
-    The container should eventually have at lease one closed replica.
-    """
-    first_set = [OM[0], SCM[0], DATANODES[0]]
-    second_set = [OM[0], DATANODES[1], DATANODES[2]]
-    third_set = [SCM[0], DATANODES[2]]
-    Blockade.blockade_create_partition(first_set, second_set, third_set)
+def test_one_dn_isolate_other_dn(run_second_phase):
+  """
+  In this test, one of the datanodes (first datanode) cannot communicate
+  with other datanodes but can communicate with SCM.
+  One of the other two datanodes (second datanode) cannot communicate with
+  SCM.
+  Expectation :
+  The container replica state in first datanode can be either closed or
+  quasi-closed.
+  The container replica state in second datanode can be either closed or open.
+  The container should eventually have at least one closed replica.
+  """
+  first_set = [OM[0], SCM[0], DATANODES[0]]
+  second_set = [OM[0], DATANODES[1], DATANODES[2]]
+  third_set = [SCM[0], DATANODES[2]]
+  Blockade.blockade_create_partition(first_set, second_set, third_set)
+  Blockade.blockade_status()
+  ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+  logger.info("Waiting for %s seconds before checking container status",
+              os.environ["CONTAINER_STATUS_SLEEP"])
+  time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+  all_datanodes_container_status = \
+    ClusterUtils.findall_container_status(FILE, SCALE)
+  count_closed_container_datanodes = filter(lambda x: x == 'CLOSED',
+                                            all_datanodes_container_status)
+  first_datanode_status = all_datanodes_container_status[0]
+  second_datanode_status = all_datanodes_container_status[1]
+  assert first_datanode_status == 'CLOSED' or \
+         first_datanode_status == "QUASI_CLOSED"
+  assert second_datanode_status == 'CLOSED' or \
+         second_datanode_status == "OPEN"
+  assert len(count_closed_container_datanodes) >= 1, \
+    "The container should have at least one closed replica"
+  if str(run_second_phase).lower() == "true":
+    ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
     Blockade.blockade_status()
-    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
     logger.info("Waiting for %s seconds before checking container status",
                 os.environ["CONTAINER_STATUS_SLEEP"])
     time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
     all_datanodes_container_status = \
-        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
-    count_closed_container_datanodes = filter(lambda x: x == 'CLOSED',
-                                              all_datanodes_container_status)
-    first_datanode_status = all_datanodes_container_status[0]
-    second_datanode_status = all_datanodes_container_status[1]
-    assert first_datanode_status == 'CLOSED' or \
-           first_datanode_status == "QUASI_CLOSED"
-    assert second_datanode_status == 'CLOSED' or \
-           second_datanode_status == "OPEN"
-    assert len(count_closed_container_datanodes) >= 1, \
-        "The container should have at least one closed replica"
\ No newline at end of file
+      ClusterUtils.findall_container_status(
+        FILE, INCREASED_SCALE)
+    count_closed_container_datanodes = filter(
+      lambda x: x == 'CLOSED', all_datanodes_container_status)
+    assert len(count_closed_container_datanodes) >= 3, \
+      "The container should have at least three closed replicas."
+    _, output = \
+      ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+    assert re.search("Status: Success", output) is not None
\ No newline at end of file
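
The rewritten tests now accept a run_second_phase argument and gate the scale-up and network-rejoin phase on str(run_second_phase).lower() == "true"; this implies a pytest fixture supplied outside this diff, for example in a conftest.py. A hedged sketch of what such a fixture could look like; the option name "--runSecondPhase" and its default are assumptions, not part of this commit:

    import pytest

    def pytest_addoption(parser):
        parser.addoption("--runSecondPhase", action="store", default="false",
                         help="also run the scale-up and network-rejoin phase")

    @pytest.fixture
    def run_second_phase(request):
        return request.config.getoption("--runSecondPhase")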
diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_three_nodes_isolate.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_three_nodes_isolate.py
index ee4d031..0e50025 100644
--- a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_three_nodes_isolate.py
+++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_three_nodes_isolate.py
@@ -18,16 +18,17 @@
 import os
 import time
 import logging
+import re
 from blockadeUtils.blockade import Blockade
 from clusterUtils.cluster_utils import ClusterUtils
 
-
 logger = logging.getLogger(__name__)
 parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 FILE = os.path.join(parent_dir, "compose", "ozoneblockade",
                     "docker-compose.yaml")
 os.environ["DOCKER_COMPOSE_FILE"] = FILE
 SCALE = 3
+INCREASED_SCALE = 5
 CONTAINER_LIST = []
 OM = []
 SCM = []
@@ -35,110 +36,190 @@ DATANODES = []
 
 
 def setup():
-    global CONTAINER_LIST, OM, SCM, DATANODES
-    Blockade.blockade_destroy()
-    CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)
-    exit_code, output = Blockade.blockade_status()
-    assert exit_code == 0, "blockade status command failed with output=[%s]" % \
-                           output
-    OM = filter(lambda x: 'ozoneManager' in x, CONTAINER_LIST)
-    SCM = filter(lambda x: 'scm' in x, CONTAINER_LIST)
-    DATANODES = sorted(list(filter(lambda x: 'datanode' in x, CONTAINER_LIST)))
-
-    exit_code, output = \
-        ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
-    assert exit_code == 0, "freon run failed with output=[%s]" % output
+  global CONTAINER_LIST, OM, SCM, DATANODES
+  Blockade.blockade_destroy()
+  CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)
+  exit_code, output = Blockade.blockade_status()
+  assert exit_code == 0, "blockade status command failed with output=[%s]" % \
+                         output
+  OM, SCM, _, DATANODES = \
+    ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST)
+
+  exit_code, output = \
+    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+  assert exit_code == 0, "freon run failed with output=[%s]" % output
 
 
 def teardown():
-    logger.info("Inside teardown")
-    Blockade.blockade_destroy()
+  logger.info("Inside teardown")
+  Blockade.blockade_destroy()
 
 
 def teardown_module():
-    ClusterUtils.cluster_destroy(FILE)
-
-
-def test_three_dns_isolate_onescmfailure():
-    """
-    In this test, all datanodes are isolated from each other.
-    One of the datanodes (third datanode) cannot communicate with SCM.
-    Expectation :
-    The container replica state in first datanode should be closed.
-    The container replica state in second datanode should be closed.
-    The container replica state in third datanode should be open.
-    """
-    first_set = [OM[0], SCM[0], DATANODES[0]]
-    second_set = [OM[0], SCM[0], DATANODES[1]]
-    third_set = [OM[0], DATANODES[2]]
-    Blockade.blockade_create_partition(first_set, second_set, third_set)
+  ClusterUtils.cluster_destroy(FILE)
+
+
+def test_three_dns_isolate_onescmfailure(run_second_phase):
+  """
+  In this test, all datanodes are isolated from each other.
+  One of the datanodes (third datanode) cannot communicate with SCM.
+  Expectation :
+  The container replica state in first datanode should be closed.
+  The container replica state in second datanode should be closed.
+  The container replica state in third datanode should be open.
+  """
+  first_set = [OM[0], SCM[0], DATANODES[0]]
+  second_set = [OM[0], SCM[0], DATANODES[1]]
+  third_set = [OM[0], DATANODES[2]]
+  Blockade.blockade_create_partition(first_set, second_set, third_set)
+  Blockade.blockade_status()
+  ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+  logger.info("Waiting for %s seconds before checking container status",
+              os.environ["CONTAINER_STATUS_SLEEP"])
+  time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+  all_datanodes_container_status = \
+    ClusterUtils.findall_container_status(FILE, SCALE)
+  first_datanode_status = all_datanodes_container_status[0]
+  second_datanode_status = all_datanodes_container_status[1]
+  third_datanode_status = all_datanodes_container_status[2]
+  assert first_datanode_status == 'CLOSED'
+  assert second_datanode_status == 'CLOSED'
+  assert third_datanode_status == 'OPEN'
+
+  if str(run_second_phase).lower() == "true":
+    ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
     Blockade.blockade_status()
-    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
     logger.info("Waiting for %s seconds before checking container status",
                 os.environ["CONTAINER_STATUS_SLEEP"])
     time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
     all_datanodes_container_status = \
-        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
-    first_datanode_status = all_datanodes_container_status[0]
-    second_datanode_status = all_datanodes_container_status[1]
-    third_datanode_status = all_datanodes_container_status[2]
-    assert first_datanode_status == 'CLOSED'
-    assert second_datanode_status == 'CLOSED'
-    assert third_datanode_status == 'OPEN'
-
-
-def test_three_dns_isolate_twoscmfailure():
-    """
-    In this test, all datanodes are isolated from each other.
-    two datanodes cannot communicate with SCM (second datanode and third
-    datanode)
-    Expectation :
-    The container replica state in first datanode should be quasi-closed.
-    The container replica state in second datanode should be open.
-    The container replica state in third datanode should be open.
-    """
-    first_set = [OM[0], SCM[0], DATANODES[0]]
-    second_set = [OM[0], DATANODES[1]]
-    third_set = [OM[0], DATANODES[2]]
-    Blockade.blockade_create_partition(first_set, second_set, third_set)
+      ClusterUtils.findall_container_status(
+        FILE, INCREASED_SCALE)
+    count_closed_container_datanodes = filter(
+      lambda x: x == 'CLOSED', all_datanodes_container_status)
+    assert len(count_closed_container_datanodes) == 3, \
+      "The container should have three closed replicas."
+    Blockade.blockade_join()
     Blockade.blockade_status()
-    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
     logger.info("Waiting for %s seconds before checking container status",
                 os.environ["CONTAINER_STATUS_SLEEP"])
     time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
     all_datanodes_container_status = \
-        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
-    first_datanode_status = all_datanodes_container_status[0]
-    second_datanode_status = all_datanodes_container_status[1]
-    third_datanode_status = all_datanodes_container_status[2]
-    assert first_datanode_status == 'QUASI_CLOSED'
-    assert second_datanode_status == 'OPEN'
-    assert third_datanode_status == 'OPEN'
-
-
-def test_three_dns_isolate_threescmfailure():
-    """
-    In this test, all datanodes are isolated from each other and also cannot
-    communicate with SCM.
-    Expectation :
-    The container replica state in first datanode should be open.
-    The container replica state in second datanode should be open.
-    The container replica state in third datanode should be open.
-    """
-    first_set = [OM[0], DATANODES[0]]
-    second_set = [OM[0], DATANODES[1]]
-    third_set = [OM[0], DATANODES[2]]
-    Blockade.blockade_create_partition(first_set, second_set, third_set)
+      ClusterUtils.findall_container_status(
+        FILE, INCREASED_SCALE)
+    count_closed_container_datanodes = filter(
+      lambda x: x == 'CLOSED', all_datanodes_container_status)
+    assert len(count_closed_container_datanodes) == 3, \
+      "The container should have three closed replicas."
+
+
+def test_three_dns_isolate_twoscmfailure(run_second_phase):
+  """
+  In this test, all datanodes are isolated from each other.
+  Two datanodes cannot communicate with SCM (second datanode and third
+  datanode).
+  Expectation :
+  The container replica state in first datanode should be quasi-closed.
+  The container replica state in second datanode should be open.
+  The container replica state in third datanode should be open.
+  """
+  first_set = [OM[0], SCM[0], DATANODES[0]]
+  second_set = [OM[0], DATANODES[1]]
+  third_set = [OM[0], DATANODES[2]]
+  Blockade.blockade_create_partition(first_set, second_set, third_set)
+  Blockade.blockade_status()
+  ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+  logger.info("Waiting for %s seconds before checking container status",
+              os.environ["CONTAINER_STATUS_SLEEP"])
+  time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+  all_datanodes_container_status = \
+    ClusterUtils.findall_container_status(FILE, SCALE)
+  first_datanode_status = all_datanodes_container_status[0]
+  second_datanode_status = all_datanodes_container_status[1]
+  third_datanode_status = all_datanodes_container_status[2]
+  assert first_datanode_status == 'QUASI_CLOSED'
+  assert second_datanode_status == 'OPEN'
+  assert third_datanode_status == 'OPEN'
+
+  if str(run_second_phase).lower() == "true":
+    ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
+    Blockade.blockade_status()
+    logger.info("Waiting for %s seconds before checking container status",
+                os.environ["CONTAINER_STATUS_SLEEP"])
+    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+    all_datanodes_container_status = \
+      ClusterUtils.findall_container_status(
+        FILE, INCREASED_SCALE)
+    count_quasi_closed_container_datanodes = filter(
+      lambda x: x == 'QUASI_CLOSED', all_datanodes_container_status)
+    assert len(count_quasi_closed_container_datanodes) >= 3, \
+      "The container should have at least three quasi-closed replicas."
+    Blockade.blockade_join()
+    Blockade.blockade_status()
+    logger.info("Waiting for %s seconds before checking container status",
+                os.environ["CONTAINER_STATUS_SLEEP"])
+    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+    all_datanodes_container_status = \
+      ClusterUtils.findall_container_status(
+        FILE, INCREASED_SCALE)
+    count_closed_container_datanodes = filter(
+      lambda x: x == 'CLOSED', all_datanodes_container_status)
+    assert len(count_closed_container_datanodes) == 3, \
+      "The container should have three closed replicas."
+
+
+def test_three_dns_isolate_threescmfailure(run_second_phase):
+  """
+  In this test, all datanodes are isolated from each other and also cannot
+  communicate with SCM.
+  Expectation :
+  The container replica state in first datanode should be open.
+  The container replica state in second datanode should be open.
+  The container replica state in third datanode should be open.
+  """
+  first_set = [OM[0], DATANODES[0]]
+  second_set = [OM[0], DATANODES[1]]
+  third_set = [OM[0], DATANODES[2]]
+  Blockade.blockade_create_partition(first_set, second_set, third_set)
+  Blockade.blockade_status()
+  ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+  logger.info("Waiting for %s seconds before checking container status",
+              os.environ["CONTAINER_STATUS_SLEEP"])
+  time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+  all_datanodes_container_status = \
+    ClusterUtils.findall_container_status(FILE, SCALE)
+  first_datanode_status = all_datanodes_container_status[0]
+  second_datanode_status = all_datanodes_container_status[1]
+  third_datanode_status = all_datanodes_container_status[2]
+  assert first_datanode_status == 'OPEN'
+  assert second_datanode_status == 'OPEN'
+  assert third_datanode_status == 'OPEN'
+
+  if str(run_second_phase).lower() == "true":
+    ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
+    Blockade.blockade_status()
+    logger.info("Waiting for %s seconds before checking container status",
+                os.environ["CONTAINER_STATUS_SLEEP"])
+    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+    output = ClusterUtils.get_pipelines(FILE)
+    if output:
+      assert re.search("Factor:THREE", output) is None
+    all_datanodes_container_status = \
+      ClusterUtils.findall_container_status(
+        FILE, INCREASED_SCALE)
+    datanodes_having_container_status = filter(
+      lambda x: x != 'None', all_datanodes_container_status)
+    assert len(datanodes_having_container_status) == 3, \
+      "Containers should not be replicated on addition of new nodes."
+    Blockade.blockade_join()
     Blockade.blockade_status()
-    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
     logger.info("Waiting for %s seconds before checking container status",
                 os.environ["CONTAINER_STATUS_SLEEP"])
     time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
     all_datanodes_container_status = \
-        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
-    first_datanode_status = all_datanodes_container_status[0]
-    second_datanode_status = all_datanodes_container_status[1]
-    third_datanode_status = all_datanodes_container_status[2]
-    assert first_datanode_status == 'OPEN'
-    assert second_datanode_status == 'OPEN'
-    assert third_datanode_status == 'OPEN'
\ No newline at end of file
+      ClusterUtils.findall_container_status(
+        FILE, INCREASED_SCALE)
+    count_closed_container_datanodes = filter(
+      lambda x: x == 'CLOSED', all_datanodes_container_status)
+    assert len(count_closed_container_datanodes) == 3, \
+      "The container should have three closed replicas."
\ No newline at end of file
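
Each of these second-phase blocks waits a fixed CONTAINER_STATUS_SLEEP and then samples findall_container_status once. Purely as an illustration (not part of this patch), a small polling helper could make that wait bounded yet less timing-sensitive; the helper name and timeouts below are hypothetical:

    import time
    from clusterUtils.cluster_utils import ClusterUtils

    def wait_for_closed_replicas(compose_file, scale, expected,
                                 timeout=300, interval=10):
        # poll container status until 'expected' CLOSED replicas are seen
        deadline = time.time() + timeout
        while time.time() < deadline:
            statuses = ClusterUtils.findall_container_status(compose_file, scale)
            if len([s for s in statuses if s == 'CLOSED']) >= expected:
                return True
            time.sleep(interval)
        return False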
diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_two_nodes.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_two_nodes.py
index a8a6f9b..b8df2fa 100644
--- a/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_two_nodes.py
+++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_mixed_failure_two_nodes.py
@@ -18,16 +18,17 @@
 import os
 import time
 import logging
+import re
 from blockadeUtils.blockade import Blockade
 from clusterUtils.cluster_utils import ClusterUtils
 
-
 logger = logging.getLogger(__name__)
 parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 FILE = os.path.join(parent_dir, "compose", "ozoneblockade",
                     "docker-compose.yaml")
 os.environ["DOCKER_COMPOSE_FILE"] = FILE
 SCALE = 3
+INCREASED_SCALE = 5
 CONTAINER_LIST = []
 OM = []
 SCM = []
@@ -35,87 +36,138 @@ DATANODES = []
 
 
 def setup():
-    global CONTAINER_LIST, OM, SCM, DATANODES
-    Blockade.blockade_destroy()
-    CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)
-    exit_code, output = Blockade.blockade_status()
-    assert exit_code == 0, "blockade status command failed with output=[%s]" % \
-                           output
-    OM = filter(lambda x: 'ozoneManager' in x, CONTAINER_LIST)
-    SCM = filter(lambda x: 'scm' in x, CONTAINER_LIST)
-    DATANODES = sorted(list(filter(lambda x: 'datanode' in x, CONTAINER_LIST)))
+  global CONTAINER_LIST, OM, SCM, DATANODES
+  Blockade.blockade_destroy()
+  CONTAINER_LIST = ClusterUtils.cluster_setup(FILE, SCALE)
+  exit_code, output = Blockade.blockade_status()
+  assert exit_code == 0, "blockade status command failed with output=[%s]" % \
+                         output
+  OM, SCM, _, DATANODES = \
+    ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST)
 
-    exit_code, output = \
-        ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
-    assert exit_code == 0, "freon run failed with output=[%s]" % output
+  exit_code, output = \
+    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+  assert exit_code == 0, "freon run failed with output=[%s]" % output
 
 
 def teardown():
-    logger.info("Inside teardown")
-    Blockade.blockade_destroy()
+  logger.info("Inside teardown")
+  Blockade.blockade_destroy()
 
 
 def teardown_module():
-    ClusterUtils.cluster_destroy(FILE)
-
-
-def test_two_dns_isolate_scm_same_partition():
-    """
-    In this test, one of the datanodes (first datanode) cannot communicate
-    with other two datanodes.
-    Two datanodes (second datanode and third datanode), on same network
-    parition, cannot communicate with SCM.
-    Expectation :
-    The container replica state in first datanode should be quasi-closed.
-    The container replica state in second datanode should be open.
-    The container replica state in third datanode should be open.
-    """
-    first_set = [OM[0], DATANODES[1], DATANODES[2]]
-    second_set = [OM[0], SCM[0], DATANODES[0]]
-    Blockade.blockade_create_partition(first_set, second_set)
+  ClusterUtils.cluster_destroy(FILE)
+
+
+def test_two_dns_isolate_scm_same_partition(run_second_phase):
+  """
+  In this test, there are three datanodes: DN1, DN2 and DN3.
+  DN1 is on one network partition, and
+  DN2 and DN3 are on a different network partition.
+  DN2 and DN3 cannot communicate with SCM.
+  Expectation :
+  The container replica state in DN1 should be quasi-closed.
+  The container replica state in DN2 should be open.
+  The container replica state in DN3 should be open.
+  """
+  first_set = [OM[0], DATANODES[1], DATANODES[2]]
+  second_set = [OM[0], SCM[0], DATANODES[0]]
+  Blockade.blockade_create_partition(first_set, second_set)
+  Blockade.blockade_status()
+  ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+  logger.info("Waiting for %s seconds before checking container status",
+              os.environ["CONTAINER_STATUS_SLEEP"])
+  time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+  all_datanodes_container_status = \
+    ClusterUtils.findall_container_status(FILE, SCALE)
+  first_datanode_status = all_datanodes_container_status[0]
+  second_datanode_status = all_datanodes_container_status[1]
+  third_datanode_status = all_datanodes_container_status[2]
+  assert first_datanode_status == 'QUASI_CLOSED'
+  assert second_datanode_status == 'OPEN'
+  assert third_datanode_status == 'OPEN'
+
+  if str(run_second_phase).lower() == "true":
+    ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
     Blockade.blockade_status()
-    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
     logger.info("Waiting for %s seconds before checking container status",
                 os.environ["CONTAINER_STATUS_SLEEP"])
     time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
     all_datanodes_container_status = \
-        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
-    first_datanode_status = all_datanodes_container_status[0]
-    second_datanode_status = all_datanodes_container_status[1]
-    third_datanode_status = all_datanodes_container_status[2]
-    assert first_datanode_status == 'QUASI_CLOSED'
-    assert second_datanode_status == 'OPEN'
-    assert third_datanode_status == 'OPEN'
-
-
-def test_two_dns_isolate_scm_different_partition():
-    """
-    In this test, one of the datanodes (first datanode) cannot communicate with
-     other two datanodes.
-    Two datanodes (first datanode and second datanode),
-    on different network paritions, cannot communicate with SCM.
-    Expectation :
-    The container replica state in first datanode should be open.
-    The container replica states can be either 'closed'
-    in both second and third datanode, or,
-    'open' in second datanode and 'quasi-closed' in third datanode.
-    """
-    first_set = [OM[0], DATANODES[0]]
-    second_set = [OM[0], DATANODES[1], DATANODES[2]]
-    third_set = [SCM[0], DATANODES[2]]
-    Blockade.blockade_create_partition(first_set, second_set, third_set)
+      ClusterUtils.findall_container_status(
+        FILE, INCREASED_SCALE)
+    count_quasi_closed_container_datanodes = filter(
+      lambda x: x == 'QUASI_CLOSED', all_datanodes_container_status)
+    assert len(count_quasi_closed_container_datanodes) >= 3, \
+      "The container should have at least three quasi-closed replicas."
+    Blockade.blockade_join()
+    Blockade.blockade_status()
+    time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+    all_datanodes_container_status = \
+      ClusterUtils.findall_container_status(
+        FILE, INCREASED_SCALE)
+    count_closed_container_datanodes = filter(
+      lambda x: x == 'CLOSED', all_datanodes_container_status)
+    assert len(count_closed_container_datanodes) >= 3
+
+
+def test_two_dns_isolate_scm_different_partition(run_second_phase):
+  """
+  In this test, there are three datanodes: DN1, DN2 and DN3.
+  DN1 is on one network partition, and
+  DN2 and DN3 are on a different network partition.
+  DN1 and DN2 cannot communicate with SCM.
+  Expectation :
+  The container replica state in datanode DN1 should be open.
+  The container replica states in DN2 and DN3 can either
+  both be 'closed', or be
+  'open' in DN2 and 'quasi-closed' in DN3.
+  """
+  first_set = [OM[0], DATANODES[0]]
+  second_set = [OM[0], DATANODES[1], DATANODES[2]]
+  third_set = [SCM[0], DATANODES[2]]
+  Blockade.blockade_create_partition(first_set, second_set, third_set)
+  Blockade.blockade_status()
+  ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+  logger.info("Waiting for %s seconds before checking container status",
+              os.environ["CONTAINER_STATUS_SLEEP"])
+  time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+  all_datanodes_container_status = \
+    ClusterUtils.findall_container_status(FILE, SCALE)
+  first_datanode_status = all_datanodes_container_status[0]
+  second_datanode_status = all_datanodes_container_status[1]
+  third_datanode_status = all_datanodes_container_status[2]
+  assert first_datanode_status == 'OPEN'
+  assert (second_datanode_status == 'CLOSED' and
+          third_datanode_status == 'CLOSED') or \
+         (second_datanode_status == 'OPEN' and
+          third_datanode_status == 'QUASI_CLOSED')
+
+  if str(run_second_phase).lower() == "true":
+    ClusterUtils.cluster_setup(FILE, INCREASED_SCALE, False)
     Blockade.blockade_status()
-    ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
     logger.info("Waiting for %s seconds before checking container status",
                 os.environ["CONTAINER_STATUS_SLEEP"])
     time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
     all_datanodes_container_status = \
-        ClusterUtils.find_all_datanodes_container_status(FILE, SCALE)
-    first_datanode_status = all_datanodes_container_status[0]
-    second_datanode_status = all_datanodes_container_status[1]
-    third_datanode_status = all_datanodes_container_status[2]
-    assert first_datanode_status == 'OPEN'
-    assert (second_datanode_status == 'CLOSED' and
-            third_datanode_status == 'CLOSED') or \
-           (second_datanode_status == 'OPEN' and
-            third_datanode_status == 'QUASI_CLOSED')
\ No newline at end of file
+      ClusterUtils.findall_container_status(
+        FILE, INCREASED_SCALE)
+    count_closed_container_datanodes = filter(
+      lambda x: x == 'CLOSED', all_datanodes_container_status)
+    count_quasi_closed_container_datanodes = filter(
+      lambda x: x == 'QUASI_CLOSED', all_datanodes_container_status)
+    assert len(count_closed_container_datanodes) >= 3 or \
+           len(count_quasi_closed_container_datanodes) >= 3
+    Blockade.blockade_join()
+    Blockade.blockade_status()
+    if len(count_closed_container_datanodes) < 3:
+      time.sleep(int(os.environ["CONTAINER_STATUS_SLEEP"]))
+      all_datanodes_container_status = \
+        ClusterUtils.findall_container_status(
+          FILE, INCREASED_SCALE)
+      count_closed_container_datanodes = filter(
+        lambda x: x == 'CLOSED', all_datanodes_container_status)
+      assert len(count_closed_container_datanodes) >= 3
+    _, output = \
+      ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS", "THREE")
+    assert re.search("Status: Success", output) is not None
\ No newline at end of file
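
Several of the second-phase blocks end by re-running freon and checking the output for "Status: Success". A hedged sketch of a small wrapper for that repeated pattern; the wrapper name is hypothetical and not part of this commit:

    import re
    from clusterUtils.cluster_utils import ClusterUtils

    def run_freon_and_verify(compose_file):
        # same arguments the tests pass: 1 volume, 1 bucket, 1 key of
        # 10240 bytes, RATIS replication with factor THREE
        _, output = ClusterUtils.run_freon(compose_file, 1, 1, 1, 10240,
                                           "RATIS", "THREE")
        assert re.search("Status: Success", output) is not None, \
            "freon run did not report success"
        return output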
diff --git a/hadoop-ozone/dist/src/main/blockade/test_blockade_scm_isolation.py b/hadoop-ozone/dist/src/main/blockade/test_blockade_scm_isolation.py
index 54c31e8..06f4263 100644
--- a/hadoop-ozone/dist/src/main/blockade/test_blockade_scm_isolation.py
+++ b/hadoop-ozone/dist/src/main/blockade/test_blockade_scm_isolation.py
@@ -42,9 +42,8 @@ def setup():
   exit_code, output = Blockade.blockade_status()
   assert exit_code == 0, "blockade status command failed with output=[%s]" % \
                          output
-  OM = [x for x in CONTAINER_LIST if 'ozoneManager' in x]
-  SCM = [x for x in CONTAINER_LIST if 'scm' in x]
-  DATANODES = sorted(x for x in CONTAINER_LIST if 'datanode' in x)
+  OM, SCM, _, DATANODES = \
+    ClusterUtils.find_om_scm_client_datanodes(CONTAINER_LIST)
 
   exit_code, output = ClusterUtils.run_freon(FILE, 1, 1, 1, 10240, "RATIS",
                                              "THREE")


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org