You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by sw...@apache.org on 2014/02/28 07:02:36 UTC

[1/2] git commit: AMBARI-4878. On a retry all the install commands succeed but the status of one of the components in INSTALL_FAILED which causes the next START command to fail on UI. (swagle)

Repository: ambari
Updated Branches:
  refs/heads/trunk 450022aee -> 595b721b2


AMBARI-4878. On a retry all the install commands succeed, but the status of one of the components is INSTALL_FAILED, which causes the next START command to fail on UI. (swagle)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/044a29f9
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/044a29f9
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/044a29f9

Branch: refs/heads/trunk
Commit: 044a29f9b1dd70969355edfa09c17f814927ec49
Parents: 450022a
Author: Siddharth Wagle <sw...@hortonworks.com>
Authored: Thu Feb 27 21:31:31 2014 -0800
Committer: Siddharth Wagle <sw...@hortonworks.com>
Committed: Thu Feb 27 21:38:23 2014 -0800

----------------------------------------------------------------------
 .../server/actionmanager/ActionManager.java     |  15 +++
 .../ambari/server/agent/HeartBeatHandler.java   |   9 +-
 .../server/agent/TestHeartbeatHandler.java      | 105 ++++++++++++++++++-
 3 files changed, 126 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/044a29f9/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionManager.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionManager.java b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionManager.java
index 789bbd6..7b9a922 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionManager.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/ActionManager.java
@@ -149,6 +149,21 @@ public class ActionManager {
     db.updateHostRoleStates(reportsToProcess);
   }
 
+  /**
+   * Checks whether the task referenced by a command report is still active.
+   * @param report command report whose task id is used to look up the task
+   * @return true if the task's status is IN_PROGRESS or QUEUED; false otherwise, including when no task is found
+   */
+  public boolean isInProgressCommand(CommandReport report) {
+    HostRoleCommand command = db.getTask(report.getTaskId());
+    if (command == null) {
+      LOG.warn("The task " + report.getTaskId() + " is invalid");
+      return false;
+    }
+    return command.getStatus().equals(HostRoleStatus.IN_PROGRESS)
+      || command.getStatus().equals(HostRoleStatus.QUEUED);
+  }
+
   public void handleLostHost(String host) {
     //Do nothing, the task will timeout anyway.
     //The actions can be failed faster as an optimization

http://git-wip-us.apache.org/repos/asf/ambari/blob/044a29f9/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
index 0044c8e..0d4cfe1 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
@@ -345,8 +345,13 @@ public class HeartBeatHandler {
           } else if (report.getStatus().equals("FAILED")) {
             LOG.warn("Operation failed - may be retried. Service component host: "
                 + schName + ", host: " + hostname + " Action id" + report.getActionId());
-            scHost.handleEvent(new ServiceComponentHostOpFailedEvent(schName,
-                hostname, now));
+            if (actionManager.isInProgressCommand(report)) {
+              scHost.handleEvent(new ServiceComponentHostOpFailedEvent
+                (schName, hostname, now));
+            } else {
+              LOG.info("Report arrived after command is no longer running. " +
+                "Ignoring report. " + report);
+            }
           } else if (report.getStatus().equals("IN_PROGRESS")) {
             scHost.handleEvent(new ServiceComponentHostOpInProgressEvent(schName,
                 hostname, now));

http://git-wip-us.apache.org/repos/asf/ambari/blob/044a29f9/ambari-server/src/test/java/org/apache/ambari/server/agent/TestHeartbeatHandler.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/java/org/apache/ambari/server/agent/TestHeartbeatHandler.java b/ambari-server/src/test/java/org/apache/ambari/server/agent/TestHeartbeatHandler.java
index 490df5e..6014bbf 100644
--- a/ambari-server/src/test/java/org/apache/ambari/server/agent/TestHeartbeatHandler.java
+++ b/ambari-server/src/test/java/org/apache/ambari/server/agent/TestHeartbeatHandler.java
@@ -77,7 +77,9 @@ import org.apache.ambari.server.state.ServiceComponentHost;
 import org.apache.ambari.server.state.StackId;
 import org.apache.ambari.server.state.State;
 import org.apache.ambari.server.state.fsm.InvalidStateTransitionException;
+import org.apache.ambari.server.state.svccomphost.ServiceComponentHostInstallEvent;
 import org.apache.ambari.server.state.svccomphost.ServiceComponentHostStartEvent;
+import org.apache.ambari.server.state.svccomphost.ServiceComponentHostUpgradeEvent;
 import org.apache.ambari.server.utils.StageUtils;
 import org.codehaus.jackson.JsonGenerationException;
 import org.junit.After;
@@ -761,7 +763,6 @@ public class TestHeartbeatHandler {
             getServiceComponent(DATANODE).getServiceComponentHost(DummyHostname1);
     serviceComponentHost1.setState(State.INSTALLING);
 
-
     HeartBeat hb = new HeartBeat();
     hb.setTimestamp(System.currentTimeMillis());
     hb.setResponseId(0);
@@ -788,6 +789,71 @@ public class TestHeartbeatHandler {
     assertEquals("Host state should still be installing", State.INSTALLING, componentState1);
   }
 
+  @Test
+  public void testOPFailedEventForAbortedTask() throws AmbariException, InvalidStateTransitionException {
+    ActionManager am = getMockActionManager();
+    Cluster cluster = getDummyCluster();
+
+    @SuppressWarnings("serial")
+    Set<String> hostNames = new HashSet<String>(){{
+      add(DummyHostname1);
+    }};
+    clusters.mapHostsToCluster(hostNames, DummyCluster);
+    Service hdfs = cluster.addService(HDFS);
+    hdfs.persist();
+    hdfs.addServiceComponent(DATANODE).persist();
+    hdfs.getServiceComponent(DATANODE).addServiceComponentHost(DummyHostname1).persist();
+    hdfs.addServiceComponent(NAMENODE).persist();
+    hdfs.getServiceComponent(NAMENODE).addServiceComponentHost(DummyHostname1).persist();
+    hdfs.addServiceComponent(SECONDARY_NAMENODE).persist();
+    hdfs.getServiceComponent(SECONDARY_NAMENODE).addServiceComponentHost(DummyHostname1).persist();
+
+    ActionQueue aq = new ActionQueue();
+    HeartBeatHandler handler = getHeartBeatHandler(am, aq);
+
+    ServiceComponentHost serviceComponentHost1 = clusters.getCluster(DummyCluster).getService(HDFS).
+      getServiceComponent(DATANODE).getServiceComponentHost(DummyHostname1);
+    serviceComponentHost1.setState(State.INSTALLING); // starting state; the assertion at the end expects it unchanged
+
+    Stage s = new Stage(1, "/a/b", "cluster1", "action manager test",
+      "clusterHostInfo");
+    s.setStageId(1);
+    s.addHostRoleExecutionCommand(DummyHostname1, Role.DATANODE, RoleCommand.INSTALL,
+      new ServiceComponentHostInstallEvent(Role.DATANODE.toString(),
+        DummyHostname1, System.currentTimeMillis(), "HDP-1.3.0"),
+          DummyCluster, "HDFS");
+    List<Stage> stages = new ArrayList<Stage>();
+    stages.add(s);
+    Request request = new Request(stages, clusters);
+    actionDBAccessor.persistActions(request);
+    actionDBAccessor.abortHostRole(DummyHostname1, 1, 1, Role.DATANODE.name()); // presumably marks the DATANODE task as aborted before the report arrives - confirm
+
+    HeartBeat hb = new HeartBeat();
+    hb.setTimestamp(System.currentTimeMillis());
+    hb.setResponseId(0);
+    hb.setHostname(DummyHostname1);
+    hb.setNodeStatus(new HostStatus(Status.HEALTHY, DummyHostStatus));
+
+    List<CommandReport> reports = new ArrayList<CommandReport>();
+    CommandReport cr = new CommandReport();
+    cr.setActionId(StageUtils.getActionId(1, 1));
+    cr.setTaskId(1);
+    cr.setClusterName(DummyCluster);
+    cr.setServiceName(HDFS);
+    cr.setRole(DATANODE);
+    cr.setStatus("FAILED"); // FAILED report sent for the DATANODE task after the abortHostRole call above
+    cr.setStdErr("none");
+    cr.setStdOut("dummy output");
+    cr.setExitCode(777);
+    reports.add(cr);
+    hb.setReports(reports);
+    hb.setComponentStatus(new ArrayList<ComponentStatus>());
+    handler.handleHeartBeat(hb);
+    State componentState1 = serviceComponentHost1.getState();
+    assertEquals("Host state should still be installing", State.INSTALLING, // the FAILED report must not move the component out of INSTALLING
+      componentState1);
+  }
+
   /**
    * Tests the fact that when START and STOP commands are in progress, and heartbeat
    * forces the host component state to STARTED or INSTALLED, there are no undesired
@@ -1265,6 +1331,37 @@ public class TestHeartbeatHandler {
     serviceComponentHost1.setDesiredStackVersion(stack130);
     serviceComponentHost2.setStackVersion(stack122);
 
+    Stage s = new Stage(requestId, "/a/b", "cluster1", "action manager test",
+      "clusterHostInfo");
+    s.setStageId(stageId);
+    s.addHostRoleExecutionCommand(DummyHostname1, Role.DATANODE, RoleCommand.UPGRADE,
+      new ServiceComponentHostUpgradeEvent(Role.DATANODE.toString(),
+        DummyHostname1, System.currentTimeMillis(), "HDP-1.3.0"),
+      DummyCluster, "HDFS");
+    s.addHostRoleExecutionCommand(DummyHostname1, Role.NAMENODE, RoleCommand.INSTALL,
+      new ServiceComponentHostInstallEvent(Role.NAMENODE.toString(),
+        DummyHostname1, System.currentTimeMillis(), "HDP-1.3.0"),
+          DummyCluster, "HDFS");
+    List<Stage> stages = new ArrayList<Stage>();
+    stages.add(s);
+    Request request = new Request(stages, clusters);
+    actionDBAccessor.persistActions(request);
+    CommandReport cr = new CommandReport();
+    cr.setActionId(StageUtils.getActionId(requestId, stageId));
+    cr.setTaskId(1);
+    cr.setClusterName(DummyCluster);
+    cr.setServiceName(HDFS);
+    cr.setRole(DATANODE);
+    cr.setStatus(HostRoleStatus.IN_PROGRESS.toString());
+    cr.setStdErr("none");
+    cr.setStdOut("dummy output");
+    actionDBAccessor.updateHostRoleState(DummyHostname1, requestId, stageId,
+      Role.DATANODE.name(), cr);
+    cr.setRole(NAMENODE);
+    cr.setTaskId(2);
+    actionDBAccessor.updateHostRoleState(DummyHostname1, requestId, stageId,
+      Role.NAMENODE.name(), cr);
+
     HeartBeat hb = new HeartBeat();
     hb.setTimestamp(System.currentTimeMillis());
     hb.setResponseId(0);
@@ -1281,6 +1378,9 @@ public class TestHeartbeatHandler {
     cr1.setStdOut("dummy output");
     cr1.setExitCode(0);
 
+//    actionDBAccessor.updateHostRoleState(DummyHostname1, requestId, stageId,
+//      Role.DATANODE.name(), cr1);
+
     CommandReport cr2 = new CommandReport();
     cr2.setActionId(StageUtils.getActionId(requestId, stageId));
     cr2.setTaskId(2);
@@ -1296,6 +1396,9 @@ public class TestHeartbeatHandler {
     reports.add(cr2);
     hb.setReports(reports);
 
+//    actionDBAccessor.updateHostRoleState(DummyHostname1, requestId, stageId,
+//      Role.NAMENODE.name(), cr2);
+
     ActionQueue aq = new ActionQueue();
     HeartBeatHandler handler = getHeartBeatHandler(am, aq);
     handler.handleHeartBeat(hb);


[2/2] git commit: AMBARI-4878. On a retry all the install commands succeed but the status of one of the components in INSTALL_FAILED which causes the next START command to fail on UI. Fixed log. (swagle)

Posted by sw...@apache.org.
AMBARI-4878. On a retry all the install commands succeed, but the status of one of the components is INSTALL_FAILED, which causes the next START command to fail on UI. Fixed log. (swagle)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/595b721b
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/595b721b
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/595b721b

Branch: refs/heads/trunk
Commit: 595b721b2226eac4ed240ff931441786018eb27a
Parents: 044a29f
Author: Siddharth Wagle <sw...@hortonworks.com>
Authored: Thu Feb 27 22:01:59 2014 -0800
Committer: Siddharth Wagle <sw...@hortonworks.com>
Committed: Thu Feb 27 22:01:59 2014 -0800

----------------------------------------------------------------------
 .../java/org/apache/ambari/server/agent/HeartBeatHandler.java     | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/595b721b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
index 0d4cfe1..4666a29 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
@@ -349,8 +349,7 @@ public class HeartBeatHandler {
               scHost.handleEvent(new ServiceComponentHostOpFailedEvent
                 (schName, hostname, now));
             } else {
-              LOG.info("Report arrived after command is no longer running. " +
-                "Ignoring report. " + report);
+              LOG.info("Received report for a command that is no longer active. " + report);
             }
           } else if (report.getStatus().equals("IN_PROGRESS")) {
             scHost.handleEvent(new ServiceComponentHostOpInProgressEvent(schName,