You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by st...@apache.org on 2017/06/02 21:10:13 UTC

ambari git commit: AMBARI-21142. Log more info about heartbeat message/response when server - agent communication gets out of sync. (stoader)

Repository: ambari
Updated Branches:
  refs/heads/branch-2.5 4418358f8 -> 91a7d0efa


AMBARI-21142. Log more info about heartbeat message/response when server - agent communication gets out of sync. (stoader)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/91a7d0ef
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/91a7d0ef
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/91a7d0ef

Branch: refs/heads/branch-2.5
Commit: 91a7d0efadd1522a9f736a1c8006f47457bef9af
Parents: 4418358
Author: Toader, Sebastian <st...@hortonworks.com>
Authored: Fri Jun 2 23:09:56 2017 +0200
Committer: Toader, Sebastian <st...@hortonworks.com>
Committed: Fri Jun 2 23:09:56 2017 +0200

----------------------------------------------------------------------
 .../src/main/python/ambari_agent/Controller.py    |  6 +++++-
 .../ambari/server/agent/HeartBeatHandler.java     | 18 ++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/91a7d0ef/ambari-agent/src/main/python/ambari_agent/Controller.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/Controller.py b/ambari-agent/src/main/python/ambari_agent/Controller.py
index 83f1da8..5fab595 100644
--- a/ambari-agent/src/main/python/ambari_agent/Controller.py
+++ b/ambari-agent/src/main/python/ambari_agent/Controller.py
@@ -320,6 +320,7 @@ class Controller(threading.Thread):
           logger.log(logging_level, "Sending Heartbeat (id = %s)", self.responseId)
 
         response = self.sendRequest(self.heartbeatUrl, data)
+
         exitStatus = 0
         if 'exitstatus' in response.keys():
           exitStatus = int(response['exitstatus'])
@@ -365,7 +366,9 @@ class Controller(threading.Thread):
           self.restartAgent()
 
         if serverId != self.responseId + 1:
-          logger.error("Error in responseId sequence - restarting")
+          logger.error("Error in responseId sequence - received responseId={0} from server while expecting {1} - restarting..."
+              .format(serverId, self.responseId + 1))
+
           self.restartAgent()
         else:
           self.responseId = serverId
@@ -464,6 +467,7 @@ class Controller(threading.Thread):
 
         #randomize the heartbeat
         delay = randint(0, self.max_reconnect_retry_delay)
+        logger.info("Waiting {0} seconds before reconnecting to {1}".format(delay, self.heartbeatUrl))
         time.sleep(delay)
 
       # Sleep for some time

http://git-wip-us.apache.org/repos/asf/ambari/blob/91a7d0ef/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
index 6b93462..fd43de5 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java
@@ -207,10 +207,20 @@ public class HeartBeatHandler {
         + ", receivedResponseId=" + heartbeat.getResponseId());
 
     if (heartbeat.getResponseId() == currentResponseId - 1) {
-      LOG.warn("Old responseId received - response was lost - returning cached response");
-      return hostResponses.get(hostname);
+      HeartBeatResponse heartBeatResponse = hostResponses.get(hostname);
+
+      LOG.warn("Old responseId={} received form host {} - response was lost - returning cached response with responseId={}",
+        heartbeat.getResponseId(),
+        hostname,
+        heartBeatResponse.getResponseId());
+
+      return heartBeatResponse;
     } else if (heartbeat.getResponseId() != currentResponseId) {
-      LOG.error("Error in responseId sequence - sending agent restart command");
+      LOG.error("Error in responseId sequence - received responseId={} from host {} - sending agent restart command with responseId={}",
+        heartbeat.getResponseId(),
+        hostname,
+        currentResponseId);
+
       return createRestartCommand(currentResponseId);
     }
 
@@ -232,7 +242,7 @@ public class HeartBeatHandler {
 
     if (hostObject.getState().equals(HostState.HEARTBEAT_LOST)) {
       // After loosing heartbeat agent should reregister
-      LOG.warn("Host is in HEARTBEAT_LOST state - sending register command");
+      LOG.warn("Host {} is in HEARTBEAT_LOST state - sending register command", hostname);
       return createRegisterCommand();
     }