You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ao...@apache.org on 2020/12/10 20:35:32 UTC

[ambari] branch branch-2.7 updated: AMBARI-25604. During blueprint deploy tasks sometimes fail due to KeyError on large clusters (aonishuk)

This is an automated email from the ASF dual-hosted git repository.

aonishuk pushed a commit to branch branch-2.7
in repository https://gitbox.apache.org/repos/asf/ambari.git


The following commit(s) were added to refs/heads/branch-2.7 by this push:
     new 1745d5a  AMBARI-25604. During blueprint deploy tasks sometimes fail due to KeyError on large clusters (aonishuk)
1745d5a is described below

commit 1745d5aa265ec811a235026d976012b1eebb6b7a
Author: Andrew Onishchuk <ao...@hortonworks.com>
AuthorDate: Thu Dec 10 20:49:54 2020 +0200

    AMBARI-25604. During blueprint deploy tasks sometimes fail due to KeyError on large clusters (aonishuk)
---
 .../src/main/python/ambari_agent/ClusterTopologyCache.py         | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/ambari-agent/src/main/python/ambari_agent/ClusterTopologyCache.py b/ambari-agent/src/main/python/ambari_agent/ClusterTopologyCache.py
index b7863c6..90987ca 100644
--- a/ambari-agent/src/main/python/ambari_agent/ClusterTopologyCache.py
+++ b/ambari-agent/src/main/python/ambari_agent/ClusterTopologyCache.py
@@ -109,7 +109,14 @@ class ClusterTopologyCache(ClusterCache):
     cluster_host_info = defaultdict(lambda: [])
     for component_dict in self[cluster_id].components:
       component_name = component_dict.componentName
-      hostnames = [self.hosts_to_id[cluster_id][host_id].hostName for host_id in component_dict.hostIds]
+      hostnames = []
+      for host_id in component_dict.hostIds:
+        if host_id in self.hosts_to_id[cluster_id]:
+          hostnames.append(self.hosts_to_id[cluster_id][host_id].hostName)
+        else:
+          # In theory this should never happen. But in practice it happened when ambari-server had corrupt DB cache.
+          logger.warning("Cannot find host_id={} in cluster_id={}".format(host_id, cluster_id))
+
       cluster_host_info[component_name.lower()+"_hosts"] += hostnames
 
     cluster_host_info['all_hosts'] = []