You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by ib...@apache.org on 2018/11/30 19:32:10 UTC
[accumulo] branch 1.9 updated: fixes ACCUMULO-4410: Master did not
resume balancing after administrative tserver shutdown - Updated the master
to reduce the serversToShutdown to only those that are current. - Removed
the statefulness of the ShutdownTServer fate operation as that state is
never actually maintained. - Ensure the update() call is continuously
invoked ensuring we clean up the serversToShutdown in a timely manor.
This is an automated email from the ASF dual-hosted git repository.
ibella pushed a commit to branch 1.9
in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/1.9 by this push:
new 1be8298 fixes ACCUMULO-4410: Master did not resume balancing after administrative tserver shutdown - Updated the master to reduce the serversToShutdown to only those that are current. - Removed the statefulness of the ShutdownTServer fate operation as that state is never actually maintained. - Ensure the update() call is continuously invoked ensuring we clean up the serversToShutdown in a timely manor.
1be8298 is described below
commit 1be829848ffc195dbc8c3fa42a4f354d2678645e
Author: Ivan Bella <iv...@bella.name>
AuthorDate: Tue Nov 27 20:16:09 2018 +0000
fixes ACCUMULO-4410: Master did not resume balancing after administrative tserver shutdown
- Updated the master to reduce the serversToShutdown to only those that are current.
- Removed the statefulness of the ShutdownTServer fate operation as that state is never actually maintained.
- Ensure the update() call is continuously invoked ensuring we clean up the serversToShutdown in a timely manor.
---
.../accumulo/server/master/LiveTServerSet.java | 6 +-
.../java/org/apache/accumulo/master/Master.java | 84 ++++++++++++----------
.../master/tserverOps/ShutdownTServer.java | 9 +--
.../master/tableOps/ShutdownTServerTest.java | 15 +++-
4 files changed, 63 insertions(+), 51 deletions(-)
diff --git a/server/base/src/main/java/org/apache/accumulo/server/master/LiveTServerSet.java b/server/base/src/main/java/org/apache/accumulo/server/master/LiveTServerSet.java
index a4b39df..163ed5f 100644
--- a/server/base/src/main/java/org/apache/accumulo/server/master/LiveTServerSet.java
+++ b/server/base/src/main/java/org/apache/accumulo/server/master/LiveTServerSet.java
@@ -283,8 +283,7 @@ public class LiveTServerSet implements Watcher {
}
// log.debug("Current: " + current.keySet());
- if (!doomed.isEmpty() || !updates.isEmpty())
- this.cback.update(this, doomed, updates);
+ this.cback.update(this, doomed, updates);
} catch (Exception ex) {
log.error("{}", ex.getMessage(), ex);
}
@@ -372,8 +371,7 @@ public class LiveTServerSet implements Watcher {
try {
checkServer(updates, doomed, path, server);
- if (!doomed.isEmpty() || !updates.isEmpty())
- this.cback.update(this, doomed, updates);
+ this.cback.update(this, doomed, updates);
} catch (Exception ex) {
log.error("Exception", ex);
}
diff --git a/server/master/src/main/java/org/apache/accumulo/master/Master.java b/server/master/src/main/java/org/apache/accumulo/master/Master.java
index 2f124e3..7255410 100644
--- a/server/master/src/main/java/org/apache/accumulo/master/Master.java
+++ b/server/master/src/main/java/org/apache/accumulo/master/Master.java
@@ -1554,49 +1554,57 @@ public class Master extends AccumuloServerContext
@Override
public void update(LiveTServerSet current, Set<TServerInstance> deleted,
Set<TServerInstance> added) {
- DeadServerList obit = new DeadServerList(
- ZooUtil.getRoot(getInstance()) + Constants.ZDEADTSERVERS);
- if (added.size() > 0) {
- log.info("New servers: " + added);
- for (TServerInstance up : added)
- obit.delete(up.hostPort());
- }
- for (TServerInstance dead : deleted) {
- String cause = "unexpected failure";
- if (serversToShutdown.contains(dead))
- cause = "clean shutdown"; // maybe an incorrect assumption
- if (!getMasterGoalState().equals(MasterGoalState.CLEAN_STOP))
- obit.post(dead.hostPort(), cause);
- }
-
- Set<TServerInstance> unexpected = new HashSet<>(deleted);
- unexpected.removeAll(this.serversToShutdown);
- if (unexpected.size() > 0) {
- if (stillMaster() && !getMasterGoalState().equals(MasterGoalState.CLEAN_STOP)) {
- log.warn("Lost servers " + unexpected);
+ // if we have deleted or added tservers, then adjust our dead server list
+ if (!deleted.isEmpty() || !added.isEmpty()) {
+ DeadServerList obit = new DeadServerList(
+ ZooUtil.getRoot(getInstance()) + Constants.ZDEADTSERVERS);
+ if (added.size() > 0) {
+ log.info("New servers: " + added);
+ for (TServerInstance up : added)
+ obit.delete(up.hostPort());
+ }
+ for (TServerInstance dead : deleted) {
+ String cause = "unexpected failure";
+ if (serversToShutdown.contains(dead))
+ cause = "clean shutdown"; // maybe an incorrect assumption
+ if (!getMasterGoalState().equals(MasterGoalState.CLEAN_STOP))
+ obit.post(dead.hostPort(), cause);
}
- }
- serversToShutdown.removeAll(deleted);
- badServers.keySet().removeAll(deleted);
- // clear out any bad server with the same host/port as a new server
- synchronized (badServers) {
- cleanListByHostAndPort(badServers.keySet(), deleted, added);
- }
- synchronized (serversToShutdown) {
- cleanListByHostAndPort(serversToShutdown, deleted, added);
- }
- synchronized (migrations) {
- Iterator<Entry<KeyExtent,TServerInstance>> iter = migrations.entrySet().iterator();
- while (iter.hasNext()) {
- Entry<KeyExtent,TServerInstance> entry = iter.next();
- if (deleted.contains(entry.getValue())) {
- log.info("Canceling migration of " + entry.getKey() + " to " + entry.getValue());
- iter.remove();
+ Set<TServerInstance> unexpected = new HashSet<>(deleted);
+ unexpected.removeAll(this.serversToShutdown);
+ if (unexpected.size() > 0) {
+ if (stillMaster() && !getMasterGoalState().equals(MasterGoalState.CLEAN_STOP)) {
+ log.warn("Lost servers " + unexpected);
+ }
+ }
+ serversToShutdown.removeAll(deleted);
+ badServers.keySet().removeAll(deleted);
+ // clear out any bad server with the same host/port as a new server
+ synchronized (badServers) {
+ cleanListByHostAndPort(badServers.keySet(), deleted, added);
+ }
+ synchronized (serversToShutdown) {
+ cleanListByHostAndPort(serversToShutdown, deleted, added);
+ }
+
+ synchronized (migrations) {
+ Iterator<Entry<KeyExtent,TServerInstance>> iter = migrations.entrySet().iterator();
+ while (iter.hasNext()) {
+ Entry<KeyExtent,TServerInstance> entry = iter.next();
+ if (deleted.contains(entry.getValue())) {
+ log.info("Canceling migration of " + entry.getKey() + " to " + entry.getValue());
+ iter.remove();
+ }
}
}
+ nextEvent.event("There are now %d tablet servers", current.size());
}
- nextEvent.event("There are now %d tablet servers", current.size());
+
+ // clear out any servers that are no longer current
+ // this is needed when we are using a fate operation to shutdown a tserver as it
+ // will continue to add the server to the serversToShutdown (ACCUMULO-4410)
+ serversToShutdown.retainAll(current.getCurrentServers());
}
private static void cleanListByHostAndPort(Collection<TServerInstance> badServers,
diff --git a/server/master/src/main/java/org/apache/accumulo/master/tserverOps/ShutdownTServer.java b/server/master/src/main/java/org/apache/accumulo/master/tserverOps/ShutdownTServer.java
index 183adca..a6693dd 100644
--- a/server/master/src/main/java/org/apache/accumulo/master/tserverOps/ShutdownTServer.java
+++ b/server/master/src/main/java/org/apache/accumulo/master/tserverOps/ShutdownTServer.java
@@ -39,7 +39,7 @@ public class ShutdownTServer extends MasterRepo {
private static final long serialVersionUID = 1L;
private static final Logger log = LoggerFactory.getLogger(ShutdownTServer.class);
private TServerInstance server;
- private boolean force, requestedShutdown;
+ private boolean force;
public ShutdownTServer(TServerInstance server, boolean force) {
this.server = server;
@@ -54,12 +54,7 @@ public class ShutdownTServer extends MasterRepo {
}
// Inform the master that we want this server to shutdown
- // We don't want to spam the master with shutdown requests, so
- // only send this request once
- if (!requestedShutdown) {
- master.shutdownTServer(server);
- requestedShutdown = true;
- }
+ master.shutdownTServer(server);
if (master.onlineTabletServers().contains(server)) {
TServerConnection connection = master.getConnection(server);
diff --git a/server/master/src/test/java/org/apache/accumulo/master/tableOps/ShutdownTServerTest.java b/server/master/src/test/java/org/apache/accumulo/master/tableOps/ShutdownTServerTest.java
index 2fc51b8..bd72d83 100644
--- a/server/master/src/test/java/org/apache/accumulo/master/tableOps/ShutdownTServerTest.java
+++ b/server/master/src/test/java/org/apache/accumulo/master/tableOps/ShutdownTServerTest.java
@@ -16,6 +16,7 @@
*/
package org.apache.accumulo.master.tableOps;
+import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import java.util.Collections;
@@ -23,6 +24,7 @@ import java.util.HashMap;
import org.apache.accumulo.core.master.thrift.TableInfo;
import org.apache.accumulo.core.master.thrift.TabletServerStatus;
+import org.apache.accumulo.fate.Repo;
import org.apache.accumulo.master.Master;
import org.apache.accumulo.master.tserverOps.ShutdownTServer;
import org.apache.accumulo.server.master.LiveTServerSet.TServerConnection;
@@ -65,16 +67,25 @@ public class ShutdownTServerTest {
// Reset the mocks
EasyMock.reset(tserver, tserverCnxn, master);
- // The same as above, but should not expect call shutdownTServer on master again
+ // reset the table map to the empty set to simulate all tablets unloaded
+ status.tableMap = new HashMap<>();
+ master.shutdownTServer(tserver);
+ EasyMock.expectLastCall().once();
EasyMock.expect(master.onlineTabletServers()).andReturn(Collections.singleton(tserver));
EasyMock.expect(master.getConnection(tserver)).andReturn(tserverCnxn);
EasyMock.expect(tserverCnxn.getTableMap(false)).andReturn(status);
+ EasyMock.expect(master.getMasterLock()).andReturn(null);
+ tserverCnxn.halt(null);
+ EasyMock.expectLastCall().once();
EasyMock.replay(tserver, tserverCnxn, master);
// FATE op is not ready
wait = op.isReady(tid, master);
- assertTrue("Expected wait to be greater than 0", wait > 0);
+ assertTrue("Expected wait to be 0", wait == 0);
+
+ Repo<Master> op2 = op.call(tid, master);
+ assertNull("Expected no follow on step", op2);
EasyMock.verify(tserver, tserverCnxn, master);
}