You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by ib...@apache.org on 2018/11/30 19:32:10 UTC

[accumulo] branch 1.9 updated: fixes ACCUMULO-4410: Master did not resume balancing after administrative tserver shutdown - Updated the master to reduce the serversToShutdown to only those that are current. - Removed the statefulness of the ShutdownTServer fate operation as that state is never actually maintained. - Ensure the update() call is continuously invoked ensuring we clean up the serversToShutdown in a timely manor.

This is an automated email from the ASF dual-hosted git repository.

ibella pushed a commit to branch 1.9
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/1.9 by this push:
     new 1be8298  fixes ACCUMULO-4410: Master did not resume balancing after administrative tserver shutdown   - Updated the master to reduce the serversToShutdown to only those that are current.   - Removed the statefulness of the ShutdownTServer fate operation as that state is never actually maintained.   - Ensure the update() call is continuously invoked ensuring we clean up the serversToShutdown in a timely manor.
1be8298 is described below

commit 1be829848ffc195dbc8c3fa42a4f354d2678645e
Author: Ivan Bella <iv...@bella.name>
AuthorDate: Tue Nov 27 20:16:09 2018 +0000

    fixes ACCUMULO-4410: Master did not resume balancing after administrative tserver shutdown
      - Updated the master to reduce the serversToShutdown to only those that are current.
      - Removed the statefulness of the ShutdownTServer fate operation as that state is never actually maintained.
      - Ensure the update() call is continuously invoked ensuring we clean up the serversToShutdown in a timely manor.
---
 .../accumulo/server/master/LiveTServerSet.java     |  6 +-
 .../java/org/apache/accumulo/master/Master.java    | 84 ++++++++++++----------
 .../master/tserverOps/ShutdownTServer.java         |  9 +--
 .../master/tableOps/ShutdownTServerTest.java       | 15 +++-
 4 files changed, 63 insertions(+), 51 deletions(-)

diff --git a/server/base/src/main/java/org/apache/accumulo/server/master/LiveTServerSet.java b/server/base/src/main/java/org/apache/accumulo/server/master/LiveTServerSet.java
index a4b39df..163ed5f 100644
--- a/server/base/src/main/java/org/apache/accumulo/server/master/LiveTServerSet.java
+++ b/server/base/src/main/java/org/apache/accumulo/server/master/LiveTServerSet.java
@@ -283,8 +283,7 @@ public class LiveTServerSet implements Watcher {
       }
 
       // log.debug("Current: " + current.keySet());
-      if (!doomed.isEmpty() || !updates.isEmpty())
-        this.cback.update(this, doomed, updates);
+      this.cback.update(this, doomed, updates);
     } catch (Exception ex) {
       log.error("{}", ex.getMessage(), ex);
     }
@@ -372,8 +371,7 @@ public class LiveTServerSet implements Watcher {
 
           try {
             checkServer(updates, doomed, path, server);
-            if (!doomed.isEmpty() || !updates.isEmpty())
-              this.cback.update(this, doomed, updates);
+            this.cback.update(this, doomed, updates);
           } catch (Exception ex) {
             log.error("Exception", ex);
           }
diff --git a/server/master/src/main/java/org/apache/accumulo/master/Master.java b/server/master/src/main/java/org/apache/accumulo/master/Master.java
index 2f124e3..7255410 100644
--- a/server/master/src/main/java/org/apache/accumulo/master/Master.java
+++ b/server/master/src/main/java/org/apache/accumulo/master/Master.java
@@ -1554,49 +1554,57 @@ public class Master extends AccumuloServerContext
   @Override
   public void update(LiveTServerSet current, Set<TServerInstance> deleted,
       Set<TServerInstance> added) {
-    DeadServerList obit = new DeadServerList(
-        ZooUtil.getRoot(getInstance()) + Constants.ZDEADTSERVERS);
-    if (added.size() > 0) {
-      log.info("New servers: " + added);
-      for (TServerInstance up : added)
-        obit.delete(up.hostPort());
-    }
-    for (TServerInstance dead : deleted) {
-      String cause = "unexpected failure";
-      if (serversToShutdown.contains(dead))
-        cause = "clean shutdown"; // maybe an incorrect assumption
-      if (!getMasterGoalState().equals(MasterGoalState.CLEAN_STOP))
-        obit.post(dead.hostPort(), cause);
-    }
-
-    Set<TServerInstance> unexpected = new HashSet<>(deleted);
-    unexpected.removeAll(this.serversToShutdown);
-    if (unexpected.size() > 0) {
-      if (stillMaster() && !getMasterGoalState().equals(MasterGoalState.CLEAN_STOP)) {
-        log.warn("Lost servers " + unexpected);
+    // if we have deleted or added tservers, then adjust our dead server list
+    if (!deleted.isEmpty() || !added.isEmpty()) {
+      DeadServerList obit = new DeadServerList(
+          ZooUtil.getRoot(getInstance()) + Constants.ZDEADTSERVERS);
+      if (added.size() > 0) {
+        log.info("New servers: " + added);
+        for (TServerInstance up : added)
+          obit.delete(up.hostPort());
+      }
+      for (TServerInstance dead : deleted) {
+        String cause = "unexpected failure";
+        if (serversToShutdown.contains(dead))
+          cause = "clean shutdown"; // maybe an incorrect assumption
+        if (!getMasterGoalState().equals(MasterGoalState.CLEAN_STOP))
+          obit.post(dead.hostPort(), cause);
       }
-    }
-    serversToShutdown.removeAll(deleted);
-    badServers.keySet().removeAll(deleted);
-    // clear out any bad server with the same host/port as a new server
-    synchronized (badServers) {
-      cleanListByHostAndPort(badServers.keySet(), deleted, added);
-    }
-    synchronized (serversToShutdown) {
-      cleanListByHostAndPort(serversToShutdown, deleted, added);
-    }
 
-    synchronized (migrations) {
-      Iterator<Entry<KeyExtent,TServerInstance>> iter = migrations.entrySet().iterator();
-      while (iter.hasNext()) {
-        Entry<KeyExtent,TServerInstance> entry = iter.next();
-        if (deleted.contains(entry.getValue())) {
-          log.info("Canceling migration of " + entry.getKey() + " to " + entry.getValue());
-          iter.remove();
+      Set<TServerInstance> unexpected = new HashSet<>(deleted);
+      unexpected.removeAll(this.serversToShutdown);
+      if (unexpected.size() > 0) {
+        if (stillMaster() && !getMasterGoalState().equals(MasterGoalState.CLEAN_STOP)) {
+          log.warn("Lost servers " + unexpected);
+        }
+      }
+      serversToShutdown.removeAll(deleted);
+      badServers.keySet().removeAll(deleted);
+      // clear out any bad server with the same host/port as a new server
+      synchronized (badServers) {
+        cleanListByHostAndPort(badServers.keySet(), deleted, added);
+      }
+      synchronized (serversToShutdown) {
+        cleanListByHostAndPort(serversToShutdown, deleted, added);
+      }
+
+      synchronized (migrations) {
+        Iterator<Entry<KeyExtent,TServerInstance>> iter = migrations.entrySet().iterator();
+        while (iter.hasNext()) {
+          Entry<KeyExtent,TServerInstance> entry = iter.next();
+          if (deleted.contains(entry.getValue())) {
+            log.info("Canceling migration of " + entry.getKey() + " to " + entry.getValue());
+            iter.remove();
+          }
         }
       }
+      nextEvent.event("There are now %d tablet servers", current.size());
     }
-    nextEvent.event("There are now %d tablet servers", current.size());
+
+    // clear out any servers that are no longer current
+    // this is needed when we are using a fate operation to shutdown a tserver as it
+    // will continue to add the server to the serversToShutdown (ACCUMULO-4410)
+    serversToShutdown.retainAll(current.getCurrentServers());
   }
 
   private static void cleanListByHostAndPort(Collection<TServerInstance> badServers,
diff --git a/server/master/src/main/java/org/apache/accumulo/master/tserverOps/ShutdownTServer.java b/server/master/src/main/java/org/apache/accumulo/master/tserverOps/ShutdownTServer.java
index 183adca..a6693dd 100644
--- a/server/master/src/main/java/org/apache/accumulo/master/tserverOps/ShutdownTServer.java
+++ b/server/master/src/main/java/org/apache/accumulo/master/tserverOps/ShutdownTServer.java
@@ -39,7 +39,7 @@ public class ShutdownTServer extends MasterRepo {
   private static final long serialVersionUID = 1L;
   private static final Logger log = LoggerFactory.getLogger(ShutdownTServer.class);
   private TServerInstance server;
-  private boolean force, requestedShutdown;
+  private boolean force;
 
   public ShutdownTServer(TServerInstance server, boolean force) {
     this.server = server;
@@ -54,12 +54,7 @@ public class ShutdownTServer extends MasterRepo {
     }
 
     // Inform the master that we want this server to shutdown
-    // We don't want to spam the master with shutdown requests, so
-    // only send this request once
-    if (!requestedShutdown) {
-      master.shutdownTServer(server);
-      requestedShutdown = true;
-    }
+    master.shutdownTServer(server);
 
     if (master.onlineTabletServers().contains(server)) {
       TServerConnection connection = master.getConnection(server);
diff --git a/server/master/src/test/java/org/apache/accumulo/master/tableOps/ShutdownTServerTest.java b/server/master/src/test/java/org/apache/accumulo/master/tableOps/ShutdownTServerTest.java
index 2fc51b8..bd72d83 100644
--- a/server/master/src/test/java/org/apache/accumulo/master/tableOps/ShutdownTServerTest.java
+++ b/server/master/src/test/java/org/apache/accumulo/master/tableOps/ShutdownTServerTest.java
@@ -16,6 +16,7 @@
  */
 package org.apache.accumulo.master.tableOps;
 
+import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
 import java.util.Collections;
@@ -23,6 +24,7 @@ import java.util.HashMap;
 
 import org.apache.accumulo.core.master.thrift.TableInfo;
 import org.apache.accumulo.core.master.thrift.TabletServerStatus;
+import org.apache.accumulo.fate.Repo;
 import org.apache.accumulo.master.Master;
 import org.apache.accumulo.master.tserverOps.ShutdownTServer;
 import org.apache.accumulo.server.master.LiveTServerSet.TServerConnection;
@@ -65,16 +67,25 @@ public class ShutdownTServerTest {
     // Reset the mocks
     EasyMock.reset(tserver, tserverCnxn, master);
 
-    // The same as above, but should not expect call shutdownTServer on master again
+    // reset the table map to the empty set to simulate all tablets unloaded
+    status.tableMap = new HashMap<>();
+    master.shutdownTServer(tserver);
+    EasyMock.expectLastCall().once();
     EasyMock.expect(master.onlineTabletServers()).andReturn(Collections.singleton(tserver));
     EasyMock.expect(master.getConnection(tserver)).andReturn(tserverCnxn);
     EasyMock.expect(tserverCnxn.getTableMap(false)).andReturn(status);
+    EasyMock.expect(master.getMasterLock()).andReturn(null);
+    tserverCnxn.halt(null);
+    EasyMock.expectLastCall().once();
 
     EasyMock.replay(tserver, tserverCnxn, master);
 
     // FATE op is not ready
     wait = op.isReady(tid, master);
-    assertTrue("Expected wait to be greater than 0", wait > 0);
+    assertTrue("Expected wait to be 0", wait == 0);
+
+    Repo<Master> op2 = op.call(tid, master);
+    assertNull("Expected no follow on step", op2);
 
     EasyMock.verify(tserver, tserverCnxn, master);
   }