Posted to commits@fluo.apache.org by kt...@apache.org on 2020/03/10 16:54:00 UTC

[fluo-muchos] branch master updated: Configure HA for Resource Manager (#330)

This is an automated email from the ASF dual-hosted git repository.

kturner pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/fluo-muchos.git


The following commit(s) were added to refs/heads/master by this push:
     new 18fb64d  Configure HA for Resource Manager (#330)
18fb64d is described below

commit 18fb64d361cb4064dba33e583b6e573173bfe0c1
Author: Karthick Narendran <ka...@gmail.com>
AuthorDate: Tue Mar 10 16:53:53 2020 +0000

    Configure HA for Resource Manager (#330)
---
 ansible/roles/azure/tasks/create_vmss.yml    |   2 +-
 ansible/roles/hadoop/templates/yarn-site.xml | 118 +++++++++++++++++++++++++--
 lib/muchos/existing.py                       |   5 +-
 3 files changed, 114 insertions(+), 11 deletions(-)

diff --git a/ansible/roles/azure/tasks/create_vmss.yml b/ansible/roles/azure/tasks/create_vmss.yml
index abc5116..50dfeff 100644
--- a/ansible/roles/azure/tasks/create_vmss.yml
+++ b/ansible/roles/azure/tasks/create_vmss.yml
@@ -245,7 +245,7 @@
 - name: Assign Accumulo master, HDFS HA components cluster roles to the second node of the cluster
   lineinfile:
     path: "{{ deploy_path }}/conf/muchos.props"
-    line: "{{ item }} = zookeeper,metrics,journalnode,namenode,zkfc,accumulomaster"
+    line: "{{ item }} = zookeeper,metrics,journalnode,namenode,zkfc,accumulomaster,resourcemanager"
   with_items: "{{ instances_dict | json_query('[1].value') }}"
   when: hdfs_ha
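
With this change, when hdfs_ha is enabled the second cluster node carries the
resourcemanager role alongside its HDFS HA roles, giving YARN a standby
ResourceManager to fail over to. A minimal sketch of the rendered muchos.props
line, assuming the second node is a hypothetical host named leader2:

    leader2 = zookeeper,metrics,journalnode,namenode,zkfc,accumulomaster,resourcemanager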
 
diff --git a/ansible/roles/hadoop/templates/yarn-site.xml b/ansible/roles/hadoop/templates/yarn-site.xml
index b4e03c4..8d807e9 100644
--- a/ansible/roles/hadoop/templates/yarn-site.xml
+++ b/ansible/roles/hadoop/templates/yarn-site.xml
@@ -23,10 +23,6 @@
 
 <configuration>
   <property>
-    <name>yarn.resourcemanager.hostname</name>
-    <value>{{ groups['resourcemanager'][0] }}</value>
-  </property>
-  <property>
     <name>yarn.nodemanager.local-dirs</name>
     <value>{% for dir in worker_data_dirs -%}
               {{ dir }}/hadoop/yarn/local
@@ -87,10 +83,6 @@
   </property>
   {% endif %}
   <property>
-    <name>yarn.resourcemanager.webapp.address</name>
-    <value>{{ groups['resourcemanager'][0] }}:8088</value>
-  </property>
-  <property>
     <name>yarn.log.server.url</name>
     <value>http://{{ groups['resourcemanager'][0] }}:19888/jobhistory/logs</value>
   </property>
@@ -134,4 +126,114 @@
     <name>yarn.nodemanager.remote-app-log-dir-suffix</name>
     <value>logs</value>
   </property>
+
+{% if hdfs_ha %}
+<!-- RM HA Configurations -->
+
+  <property>
+    <name>yarn.resourcemanager.ha.enabled</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.cluster-id</name>
+    <value>yarn-cluster</value>
+  </property>
+
+{% set rm_list = [] %}
+{% for item in groups['resourcemanager'] %}{{ rm_list.append('rm' + loop.index|string() ) }}{% endfor %}
+  <property>
+    <name>yarn.resourcemanager.ha.rm-ids</name>
+    <value>{{ rm_list | join(',') }}</value>
+  </property>
+
+{% for rm_host in groups['resourcemanager'] %}{% set rm_id = 'rm' + loop.index|string() %}
+  <property>
+    <name>yarn.resourcemanager.hostname.{{ rm_id }}</name>
+    <value>{{ rm_host }}</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.webapp.address.{{ rm_id }}</name>
+    <value>{{ rm_host }}:8088</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.webapp.https.address.{{ rm_id }}</name>
+    <value>{{ rm_host }}:8090</value>
+  </property>
+{% endfor %}
+
+{% if hadoop_major_version == '2' %}
+  <property>
+    <name>yarn.resourcemanager.zk-address</name>
+    <value>{{ zookeeper_connect }}</value>
+  </property>
+{% elif hadoop_major_version == '3' %}
+  <property>
+    <name>hadoop.zk.address</name>
+    <value>{{ zookeeper_connect }}</value>
+  </property>
+{% endif %}
+
+<!-- Below properties required for work-preserving RM restarts -->
+
+  <property>
+    <name>yarn.resourcemanager.recovery.enabled</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.zk-state-store.parent-path</name>
+    <value>/rmstore</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.store.class</name>
+    <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.work-preserving-recovery.enabled</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.work-preserving-recovery.scheduling-wait-ms</name>
+    <value>10000</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.zk-num-retries</name>
+    <value>1000</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.zk-retry-interval-ms</name>
+    <value>1000</value>
+  </property>
+
+  <property>
+    <name>yarn.client.failover-proxy-provider</name>
+    <value>org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.ha.automatic-failover.zk-base-path</name>
+    <value>/yarn-leader-election</value>
+  </property>
+
+{% else %}
+
+  <property>
+    <name>yarn.resourcemanager.webapp.address</name>
+    <value>{{ groups['resourcemanager'][0] }}:8088</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.hostname</name>
+    <value>{{ groups['resourcemanager'][0] }}</value>
+  </property>
+{% endif %}
+
 </configuration>
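
The template now emits one block of properties per host in the resourcemanager
group, with ids rm1, rm2, ... assigned in group order; the version switch
reflects Hadoop 3's renaming of yarn.resourcemanager.zk-address to
hadoop.zk.address. A minimal sketch of the rendered HA section, assuming a
hypothetical two-host group of leader1 and leader2:

      <property>
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
      </property>
      <property>
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>leader1</value>
      </property>
      <property>
        <name>yarn.resourcemanager.webapp.address.rm1</name>
        <value>leader1:8088</value>
      </property>
      <property>
        <name>yarn.resourcemanager.webapp.https.address.rm1</name>
        <value>leader1:8090</value>
      </property>
      <!-- the same hostname/webapp properties repeat for rm2/leader2 -->

Once both ResourceManagers are running, the active/standby state of each can be
checked with the standard YARN CLI, for example:

    yarn rmadmin -getServiceState rm1

which reports "active" or "standby" for the given rm-id.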
diff --git a/lib/muchos/existing.py b/lib/muchos/existing.py
index 501497d..312a5a8 100644
--- a/lib/muchos/existing.py
+++ b/lib/muchos/existing.py
@@ -79,8 +79,9 @@ class ExistingCluster:
             print("\n[zkfc]",file=hosts_file)
             for (index, zkfc_host) in enumerate(config.get_service_hostnames("zkfc"), start=1):
                 print("{0}".format(zkfc_host,index), file=hosts_file)
-            print("\n[resourcemanager]\n{0}".format(config.get_service_hostnames("resourcemanager")[0]),
-                  file=hosts_file)
+            print("\n[resourcemanager]",file=hosts_file)
+            for (index, rm_host) in enumerate(config.get_service_hostnames("resourcemanager"), start=1):
+                print("{0}".format(rm_host,index), file=hosts_file)
             if config.has_service("spark"):
                 print("\n[spark]\n{0}".format(config.get_service_hostnames("spark")[0]), file=hosts_file)
             if config.has_service("mesosmaster"):
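
This mirrors the existing [zkfc] handling just above: rather than writing only
the first resourcemanager host into the generated Ansible inventory, the loop
now lists every host in the group (as in the zkfc loop, the index from
enumerate is not referenced by the format string). A minimal sketch of the
resulting inventory section, assuming the same hypothetical two-host group:

    [resourcemanager]
    leader1
    leader2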