You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by jo...@apache.org on 2017/06/05 20:00:51 UTC

[12/17] ambari git commit: AMBARI-21142. Log more info about heartbeat message/response when server - agent communication gets out of sync. (stoader)

AMBARI-21142. Log more info about heartbeat message/response when server - agent communication gets out of sync. (stoader)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/b7101f78
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/b7101f78
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/b7101f78

Branch: refs/heads/branch-feature-AMBARI-12556
Commit: b7101f782be9a1291de589262f01083c70dfc935
Parents: c3c06ea
Author: Toader, Sebastian <st...@hortonworks.com>
Authored: Fri Jun 2 23:09:56 2017 +0200
Committer: Toader, Sebastian <st...@hortonworks.com>
Committed: Fri Jun 2 23:12:46 2017 +0200

----------------------------------------------------------------------
 .../src/main/python/ambari_agent/Controller.py    |  6 +++++-
 .../ambari/server/agent/HeartBeatHandler.java     | 18 ++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/b7101f78/ambari-agent/src/main/python/ambari_agent/Controller.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/Controller.py b/ambari-agent/src/main/python/ambari_agent/Controller.py
index 0297f74..bc923c3 100644
--- a/ambari-agent/src/main/python/ambari_agent/Controller.py
+++ b/ambari-agent/src/main/python/ambari_agent/Controller.py
@@ -321,6 +321,7 @@ class Controller(threading.Thread):
           logger.log(logging_level, "Sending Heartbeat (id = %s)", self.responseId)
 
         response = self.sendRequest(self.heartbeatUrl, data)
+
         exitStatus = 0
         if 'exitstatus' in response.keys():
           exitStatus = int(response['exitstatus'])
@@ -366,7 +367,9 @@ class Controller(threading.Thread):
           self.restartAgent()
 
         if serverId != self.responseId + 1:
-          logger.error("Error in responseId sequence - restarting")
+          logger.error("Error in responseId sequence - received responseId={0} from server while expecting {1} - restarting..."
+              .format(serverId, self.responseId + 1))
+
           self.restartAgent()
         else:
           self.responseId = serverId
@@ -465,6 +468,7 @@ class Controller(threading.Thread):
 
         #randomize the heartbeat
         delay = randint(0, self.max_reconnect_retry_delay)
+        logger.info("Waiting {0} seconds before reconnecting to {1}".format(delay, self.heartbeatUrl))
         time.sleep(delay)
 
       # Sleep for some time

http://git-wip-us.apache.org/repos/asf/ambari/blob/b7101f78/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
index d800bc5..fc6e7a7 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
@@ -161,10 +161,20 @@ public class HeartBeatHandler {
         + ", receivedResponseId=" + heartbeat.getResponseId());
 
     if (heartbeat.getResponseId() == currentResponseId - 1) {
-      LOG.warn("Old responseId received - response was lost - returning cached response");
-      return hostResponses.get(hostname);
+      HeartBeatResponse heartBeatResponse = hostResponses.get(hostname);
+
+      LOG.warn("Old responseId={} received form host {} - response was lost - returning cached response with responseId={}",
+        heartbeat.getResponseId(),
+        hostname,
+        heartBeatResponse.getResponseId());
+
+      return heartBeatResponse;
     } else if (heartbeat.getResponseId() != currentResponseId) {
-      LOG.error("Error in responseId sequence - sending agent restart command");
+      LOG.error("Error in responseId sequence - received responseId={} from host {} - sending agent restart command with responseId={}",
+        heartbeat.getResponseId(),
+        hostname,
+        currentResponseId);
+
       return createRestartCommand(currentResponseId);
     }
 
@@ -186,7 +196,7 @@ public class HeartBeatHandler {
 
     if (hostObject.getState().equals(HostState.HEARTBEAT_LOST)) {
       // After loosing heartbeat agent should reregister
-      LOG.warn("Host is in HEARTBEAT_LOST state - sending register command");
+      LOG.warn("Host {} is in HEARTBEAT_LOST state - sending register command", hostname);
       return createRegisterCommand();
     }