You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@accumulo.apache.org by GitBox <gi...@apache.org> on 2021/05/28 16:27:41 UTC

[GitHub] [accumulo] DomGarguilo commented on a change in pull request #2123: Improve SuspendedTabletsIT reliability

DomGarguilo commented on a change in pull request #2123:
URL: https://github.com/apache/accumulo/pull/2123#discussion_r641662165



##########
File path: test/src/main/java/org/apache/accumulo/test/manager/SuspendedTabletsIT.java
##########
@@ -140,6 +193,7 @@ public void shutdownAndResumeTserver() throws Exception {
       }
 
       // remove servers with metadata on them from the list of servers to be shutdown
+      assertEquals("Expecting a single tServer in metadataServerSet", 1, metadataServerSet.size());
       tserverSet.removeAll(metadataServerSet);
 
       assertEquals("Expecting two tServers in shutdown-list", 2, tserverSet.size());

Review comment:
       Could add another assert around this line to ensure `tserverSet.size() >= count`

##########
File path: test/src/main/java/org/apache/accumulo/test/manager/SuspendedTabletsIT.java
##########
@@ -91,21 +100,65 @@ public void configure(MiniAccumuloConfigImpl cfg, Configuration fsConf) {
     cfg.setProperty(Property.TABLE_SUSPEND_DURATION, SUSPEND_DURATION + "s");
     cfg.setClientProperty(ClientProperty.INSTANCE_ZOOKEEPERS_TIMEOUT, "5s");
     cfg.setProperty(Property.INSTANCE_ZK_TIMEOUT, "5s");
-    cfg.setNumTservers(TSERVERS);
+    // Start with 1 tserver, we'll increase that later
+    cfg.setNumTservers(1);
     // config custom balancer to keep all metadata on one server
-    cfg.setProperty(HostRegexTableLoadBalancer.HOST_BALANCER_PREFIX + MetadataTable.NAME, "*");
-    cfg.setProperty(HostRegexTableLoadBalancer.HOST_BALANCER_OOB_CHECK_KEY, "7s");
-    cfg.setProperty(Property.TABLE_LOAD_BALANCER.getKey(),
-        HostRegexTableLoadBalancer.class.getName());
+    cfg.setProperty(HostRegexTableLoadBalancer.HOST_BALANCER_OOB_CHECK_KEY, "1ms");
+    cfg.setProperty(Property.MANAGER_TABLET_BALANCER.getKey(),
+        HostAndPortRegexTableLoadBalancer.class.getName());
+  }
+
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+
+    try (AccumuloClient client = Accumulo.newClient().from(getClientProperties()).build()) {
+      // Wait for all tablet servers to come online and then choose the first server in the list.
+      // Update the balancer configuration to assign all metadata tablets to that server (and
+      // everything else to other servers).
+      InstanceOperations iops = client.instanceOperations();
+      List<String> tservers = iops.getTabletServers();
+      while (tservers == null || tservers.size() < 1) {
+        Thread.sleep(1000L);
+        tservers = client.instanceOperations().getTabletServers();
+      }
+      HostAndPort metadataServer = HostAndPort.fromString(tservers.get(0));
+      log.info("Configuring balancer to assign all metadata tablets to {}", metadataServer);
+      iops.setProperty(HostRegexTableLoadBalancer.HOST_BALANCER_PREFIX + MetadataTable.NAME,
+          metadataServer.toString());
+
+      // Wait for the balancer to assign all metadata tablets to the chosen server.
+      ClientContext ctx = (ClientContext) client;
+      TabletLocations tl = TabletLocations.retrieve(ctx, MetadataTable.NAME, RootTable.NAME);
+      while (tl.hosted.keySet().size() != 1 || !tl.hosted.containsKey(metadataServer)) {
+        log.info("Metadata tablets are not hosted on the correct server. Waiting for balancer...");
+        Thread.sleep(1000L);
+        tl = TabletLocations.retrieve(ctx, MetadataTable.NAME, RootTable.NAME);
+      }
+      log.info("Metadata tablets are now hosted on {}", metadataServer);
+    }
+
+    // Since we started only a single tablet server, we know it's the one hosting the
+    // metadata table. Save its process reference off so we can exclude it later when
+    // killing tablet servers.
+    Collection<ProcessReference> procs = getCluster().getProcesses().get(ServerType.TABLET_SERVER);
+    assertEquals("Expected a single tserver process", 1, procs.size());
+    metadataTserverProcess = procs.iterator().next();
+
+    // Update the number of tservers and start the new tservers.
+    getCluster().getConfig().setNumTservers(TSERVERS);
+    getCluster().start();
   }
 
   @Test
   public void crashAndResumeTserver() throws Exception {
     // Run the test body. When we get to the point where we need a tserver to go away, get rid of it
     // via crashing
     suspensionTestBody((ctx, locs, count) -> {
-      List<ProcessReference> procs =
-          new ArrayList<>(getCluster().getProcesses().get(ServerType.TABLET_SERVER));
+      // Exclude the tablet server hosting the metadata table from the list and only
+      // kill tablet servers that are not hosting the metadata table.
+      List<ProcessReference> procs = getCluster().getProcesses().get(ServerType.TABLET_SERVER)
+          .stream().filter(p -> !metadataTserverProcess.equals(p)).collect(Collectors.toList());

Review comment:
       We could potentially add asserts after this line to sanity check that `procs.size() == TSERVERS - 1` as well as `procs.size() >= count`

##########
File path: test/src/main/java/org/apache/accumulo/test/manager/SuspendedTabletsIT.java
##########
@@ -140,6 +193,7 @@ public void shutdownAndResumeTserver() throws Exception {
       }
 
       // remove servers with metadata on them from the list of servers to be shutdown
+      assertEquals("Expecting a single tServer in metadataServerSet", 1, metadataServerSet.size());
       tserverSet.removeAll(metadataServerSet);
 
       assertEquals("Expecting two tServers in shutdown-list", 2, tserverSet.size());

Review comment:
       ```suggestion
         assertEquals("Expecting " + (TSERVERS - 1) + " tServers in shutdown-list", TSERVERS - 1, tserverSet.size());
   ```

##########
File path: test/src/main/java/org/apache/accumulo/test/manager/SuspendedTabletsIT.java
##########
@@ -91,21 +100,65 @@ public void configure(MiniAccumuloConfigImpl cfg, Configuration fsConf) {
     cfg.setProperty(Property.TABLE_SUSPEND_DURATION, SUSPEND_DURATION + "s");
     cfg.setClientProperty(ClientProperty.INSTANCE_ZOOKEEPERS_TIMEOUT, "5s");
     cfg.setProperty(Property.INSTANCE_ZK_TIMEOUT, "5s");
-    cfg.setNumTservers(TSERVERS);
+    // Start with 1 tserver, we'll increase that later
+    cfg.setNumTservers(1);
     // config custom balancer to keep all metadata on one server
-    cfg.setProperty(HostRegexTableLoadBalancer.HOST_BALANCER_PREFIX + MetadataTable.NAME, "*");
-    cfg.setProperty(HostRegexTableLoadBalancer.HOST_BALANCER_OOB_CHECK_KEY, "7s");
-    cfg.setProperty(Property.TABLE_LOAD_BALANCER.getKey(),
-        HostRegexTableLoadBalancer.class.getName());
+    cfg.setProperty(HostRegexTableLoadBalancer.HOST_BALANCER_OOB_CHECK_KEY, "1ms");
+    cfg.setProperty(Property.MANAGER_TABLET_BALANCER.getKey(),
+        HostAndPortRegexTableLoadBalancer.class.getName());
+  }
+
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+
+    try (AccumuloClient client = Accumulo.newClient().from(getClientProperties()).build()) {
+      // Wait for all tablet servers to come online and then choose the first server in the list.
+      // Update the balancer configuration to assign all metadata tablets to that server (and
+      // everything else to other servers).
+      InstanceOperations iops = client.instanceOperations();
+      List<String> tservers = iops.getTabletServers();
+      while (tservers == null || tservers.size() < 1) {
+        Thread.sleep(1000L);
+        tservers = client.instanceOperations().getTabletServers();
+      }
+      HostAndPort metadataServer = HostAndPort.fromString(tservers.get(0));
+      log.info("Configuring balancer to assign all metadata tablets to {}", metadataServer);
+      iops.setProperty(HostRegexTableLoadBalancer.HOST_BALANCER_PREFIX + MetadataTable.NAME,
+          metadataServer.toString());
+
+      // Wait for the balancer to assign all metadata tablets to the chosen server.
+      ClientContext ctx = (ClientContext) client;
+      TabletLocations tl = TabletLocations.retrieve(ctx, MetadataTable.NAME, RootTable.NAME);
+      while (tl.hosted.keySet().size() != 1 || !tl.hosted.containsKey(metadataServer)) {
+        log.info("Metadata tablets are not hosted on the correct server. Waiting for balancer...");
+        Thread.sleep(1000L);
+        tl = TabletLocations.retrieve(ctx, MetadataTable.NAME, RootTable.NAME);
+      }
+      log.info("Metadata tablets are now hosted on {}", metadataServer);
+    }
+
+    // Since we started only a single tablet server, we know it's the one hosting the
+    // metadata table. Save its process reference off so we can exclude it later when
+    // killing tablet servers.
+    Collection<ProcessReference> procs = getCluster().getProcesses().get(ServerType.TABLET_SERVER);
+    assertEquals("Expected a single tserver process", 1, procs.size());
+    metadataTserverProcess = procs.iterator().next();
+
+    // Update the number of tservers and start the new tservers.
+    getCluster().getConfig().setNumTservers(TSERVERS);
+    getCluster().start();
   }
 
   @Test
   public void crashAndResumeTserver() throws Exception {
     // Run the test body. When we get to the point where we need a tserver to go away, get rid of it
     // via crashing
     suspensionTestBody((ctx, locs, count) -> {
-      List<ProcessReference> procs =
-          new ArrayList<>(getCluster().getProcesses().get(ServerType.TABLET_SERVER));
+      // Exclude the tablet server hosting the metadata table from the list and only
+      // kill tablet servers that are not hosting the metadata table.
+      List<ProcessReference> procs = getCluster().getProcesses().get(ServerType.TABLET_SERVER)
+          .stream().filter(p -> !metadataTserverProcess.equals(p)).collect(Collectors.toList());
       Collections.shuffle(procs);

Review comment:
       ```suggestion
         Collections.shuffle(procs, RANDOM);
   ```

##########
File path: test/src/main/java/org/apache/accumulo/test/manager/SuspendedTabletsIT.java
##########
@@ -285,6 +341,48 @@ public static void cleanup() {
     THREAD_POOL.shutdownNow();
   }
 
+  /**
+   * A version of {@link HostRegexTableLoadBalancer} that includes the tablet server port in
+   * addition to the host name when checking regular expressions. This is useful for testing when
+   * multiple tablet servers are running on the same host and one wishes to make pools from the
+   * tablet servers on that host.
+   */
+  public static class HostAndPortRegexTableLoadBalancer extends HostRegexTableLoadBalancer {
+    private static Logger LOG =

Review comment:
       ```suggestion
       private static final Logger LOG =
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org