You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hawq.apache.org by od...@apache.org on 2016/05/03 02:33:18 UTC
[02/13] incubator-hawq git commit: HAWQ-668. hawq check should be able to check yarn settings
HAWQ-668. hawq check should be able to check yarn settings
Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/e74109bf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/e74109bf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/e74109bf
Branch: refs/heads/HAWQ-703
Commit: e74109bf6645a24bbbd2ce37a32d01e981c670e5
Parents: 1469782
Author: rlei <rl...@pivotal.io>
Authored: Wed Apr 13 17:13:02 2016 +0800
Committer: rlei <rl...@pivotal.io>
Committed: Tue Apr 19 17:23:27 2016 +0800
----------------------------------------------------------------------
src/backend/utils/misc/etc/gpcheck.cnf | 48 +++-
tools/bin/gpcheck | 365 ++++++++++++++++++++++++++--
tools/bin/gppylib/gpcheckutil.py | 17 +-
tools/bin/hawqpylib/hawqlib.py | 18 ++
tools/sbin/gpcheck_hostdump | 50 +++-
5 files changed, 472 insertions(+), 26 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/src/backend/utils/misc/etc/gpcheck.cnf
----------------------------------------------------------------------
diff --git a/src/backend/utils/misc/etc/gpcheck.cnf b/src/backend/utils/misc/etc/gpcheck.cnf
index 9ccac0d..9d36de6 100644
--- a/src/backend/utils/misc/etc/gpcheck.cnf
+++ b/src/backend/utils/misc/etc/gpcheck.cnf
@@ -40,12 +40,11 @@ hard.nproc = 131072
diskusage.monitor.mounts = /
diskusage.monitor.usagemax = 90%
-[hdfs]
+[hdfs.base]
dfs.mem.namenode.heap = 40960
dfs.mem.datanode.heap = 6144
# in hdfs-site.xml
dfs.support.append = true
-dfs.client.enable.read.from.local = true
dfs.block.local-path-access.user = gpadmin
dfs.datanode.max.transfer.threads = 40960
dfs.client.socket-timeout = 300000000
@@ -54,5 +53,48 @@ dfs.namenode.handler.count = 60
ipc.server.handler.queue.size = 3300
dfs.datanode.handler.count = 60
ipc.client.connection.maxidletime = 3600000
-dfs.namenode.accesstime.precision = -1
+dfs.namenode.accesstime.precision = 0
+dfs.client.read.shortcircuit = true
+[hdfs.non]
+dfs.block.access.token.enable = FALSE
+
+[hdfs.ha]
+dfs.block.access.token.enable = FALSE
+
+[hdfs.kerberos]
+dfs.block.access.token.enable = TRUE
+dfs.datanode.data.dir.perm = 750
+
+[hdfs.ha.kerberos]
+dfs.block.access.token.enable = TRUE
+
+[yarn.base]
+yarn.resourcemanager.scheduler.class = org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
+
+[yarn.non]
+
+[yarn.ha]
+
+[yarn.kerberos]
+hadoop.security.authentication = kerberos
+hadoop.proxyuser.yarn.groups = *
+hadoop.proxyuser.yarn.hosts = *
+hadoop.proxyuser.postgres.hosts = *
+hadoop.proxyuser.postgres.groups = *
+
+[yarn.ha.kerberos]
+hadoop.security.authentication = kerberos
+hadoop.proxyuser.yarn.groups = *
+hadoop.proxyuser.yarn.hosts = *
+hadoop.proxyuser.postgres.hosts = *
+hadoop.proxyuser.postgres.groups = *
+
+[hawq.base]
+dfs.client.read.shortcircuit = true
+
+[hawq.kerberos]
+hadoop.security.authentication = kerberos
+
+[hawq.yarn]
+hawq_global_rm_type = yarn
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/bin/gpcheck
----------------------------------------------------------------------
diff --git a/tools/bin/gpcheck b/tools/bin/gpcheck
index aefe499..1d0019c 100755
--- a/tools/bin/gpcheck
+++ b/tools/bin/gpcheck
@@ -26,8 +26,10 @@ try:
from gppylib.commands.unix import getLocalHostname, getUserName, SYSTEM
from gppylib.commands.base import WorkerPool, Command, REMOTE
from gppylib.gpcheckutil import HostType, hosttype_str
+ from hawqpylib.hawqlib import remote_ssh_output
from pgdb import DatabaseError
import pg
+ import stat
except ImportError, e:
sys.exit('Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e))
@@ -89,15 +91,29 @@ class GpCheckConfig:
self.hdfs_expected = { # default value for HDFS configuration
"dfs.mem.namenode.heap": 8192,
"dfs.mem.datanode.heap": 8192 }
+ self.hdfs_non_expected = {}
+ self.hdfs_ha_expected = {}
+ self.hdfs_kerberos_expected = {}
+ self.hdfs_ha_kerberos_expected = {}
+
+ self.yarn_expected = {}
+ self.yarn_non_expected = {}
+ self.yarn_ha_expected = {}
+ self.yarn_kerberos_expected = {}
+ self.yarn_ha_kerberos_expected = {}
+
+ self.hawq_expected = {}
+ self.hawq_kerberos_expected = {}
+ self.hawq_yarn_expected = {}
+
def readConfigFile(self, config_file):
parsed_list = self.parser.read(config_file)
if len(parsed_list) != 1:
raise GpCheckError("cannot open file!")
- for required_section in ("linux.sysctl", "hdfs"):
- if not self.parser.has_section(required_section):
- raise GpCheckError("require section '%s'" % required_section)
+ if not self.parser.has_section("linux.sysctl"):
+ raise GpCheckError("require section 'linux.sysctl'")
section = "global"
if self.parser.has_option(section, "configfile_version"):
@@ -136,15 +152,75 @@ class GpCheckConfig:
raise GpCheckError("Bad config entry value '%s' for 'diskusage.monitor.usagemax': %s" %
(self.diskusage_usagemax, e))
- section = 'hdfs'
- for opt in self.parser.options(section):
- self.hdfs_expected[opt] = self.parser.get(section, opt)
- try:
- self.hdfs_expected["dfs.mem.namenode.heap"] = int(self.hdfs_expected["dfs.mem.namenode.heap"])
- self.hdfs_expected["dfs.mem.datanode.heap"] = int(self.hdfs_expected["dfs.mem.datanode.heap"])
- except ValueError, e:
- raise GpCheckError("'dfs.mem.namenode.heap' or 'dfs.mem.namenode.heap' should be a number: %s" % e)
+ if not self.parser.has_section('hdfs.base'):
+ if not self.parser.has_section("hdfs"):
+ raise GpCheckError("require section 'hdfs'")
+
+ section = 'hdfs'
+ for opt in self.parser.options(section):
+ self.hdfs_expected[opt] = self.parser.get(section, opt)
+ try:
+ self.hdfs_expected["dfs.mem.namenode.heap"] = int(self.hdfs_expected["dfs.mem.namenode.heap"])
+ self.hdfs_expected["dfs.mem.datanode.heap"] = int(self.hdfs_expected["dfs.mem.datanode.heap"])
+ except ValueError, e:
+ raise GpCheckError("'dfs.mem.namenode.heap' or 'dfs.mem.namenode.heap' should be a number: %s" % e)
+ else:
+ section = 'hdfs.base'
+ for opt in self.parser.options(section):
+ self.hdfs_expected[opt] = self.parser.get(section, opt)
+ try:
+ self.hdfs_expected["dfs.mem.namenode.heap"] = int(self.hdfs_expected["dfs.mem.namenode.heap"])
+ self.hdfs_expected["dfs.mem.datanode.heap"] = int(self.hdfs_expected["dfs.mem.datanode.heap"])
+ except ValueError, e:
+ raise GpCheckError("'dfs.mem.namenode.heap' or 'dfs.mem.namenode.heap' should be a number: %s" % e)
+
+ section = 'hdfs.non'
+ for opt in self.parser.options(section):
+ self.hdfs_non_expected[opt] = self.parser.get(section, opt)
+
+ section = 'hdfs.ha'
+ for opt in self.parser.options(section):
+ self.hdfs_ha_expected[opt] = self.parser.get(section, opt)
+
+ section = 'hdfs.kerberos'
+ for opt in self.parser.options(section):
+ self.hdfs_kerberos_expected[opt] = self.parser.get(section, opt)
+
+ section = 'hdfs.ha.kerberos'
+ for opt in self.parser.options(section):
+ self.hdfs_ha_kerberos_expected[opt] = self.parser.get(section, opt)
+ section = 'yarn.base'
+ for opt in self.parser.options(section):
+ self.yarn_expected[opt] = self.parser.get(section, opt)
+
+ section = 'yarn.non'
+ for opt in self.parser.options(section):
+ self.yarn_non_expected[opt] = self.parser.get(section, opt)
+
+ section = 'yarn.ha'
+ for opt in self.parser.options(section):
+ self.yarn_ha_expected[opt] = self.parser.get(section, opt)
+
+ section = 'yarn.kerberos'
+ for opt in self.parser.options(section):
+ self.yarn_kerberos_expected[opt] = self.parser.get(section, opt)
+
+ section = 'yarn.ha.kerberos'
+ for opt in self.parser.options(section):
+ self.yarn_ha_kerberos_expected[opt] = self.parser.get(section, opt)
+
+ section = 'hawq.base'
+ for opt in self.parser.options(section):
+ self.hawq_expected[opt] = self.parser.get(section, opt)
+
+ section = 'hawq.kerberos'
+ for opt in self.parser.options(section):
+ self.hawq_kerberos_expected[opt] = self.parser.get(section, opt)
+
+ section = 'hawq.yarn'
+ for opt in self.parser.options(section):
+ self.hawq_yarn_expected[opt] = self.parser.get(section, opt)
###### Global Variables #############
logger = get_default_logger()
@@ -176,6 +252,16 @@ def checkPlatform():
raise GpCheckError("No tests exists for this platform in gpcheck")
+def parse_host_list_file(host_file):
+ host_list = list()
+ with open(host_file) as f:
+ hosts = f.readlines()
+ for host in hosts:
+ host = host.split("#",1)[0].strip()
+ if host:
+ host_list.append(host)
+ return host_list
+
def parseargs():
global options, GPHOME, HADOOP_HOME, GPCHECK_CONFIG_FILE
@@ -188,7 +274,12 @@ def parseargs():
parser.add_option('--zipin', type='string')
parser.add_option('--gphome', type='string')
# for HDFS xml and memory check
- parser.add_option('--hadoop', type='string')
+ parser.add_option('--hadoop', '--hadoop-home', type='string')
+ parser.add_option('--hdfs', action='store_true')
+ parser.add_option('--hdfs-ha', dest="hdfs_ha", action='store_true')
+ parser.add_option('--yarn', action='store_true')
+ parser.add_option('--yarn-ha', dest="yarn_ha", action='store_true')
+ parser.add_option('--kerberos', action='store_true')
parser.add_option('-c', '--config', type='string') # optional: gpcheck config file path
parser.add_option('-f', '--file', type='string') # host file, for testing a list of hosts
@@ -212,6 +303,10 @@ def parseargs():
if not HADOOP_HOME:
checkFailed(None, "utility will SKIP HDFS configuration check because HADOOP_HOME is not specified in environment variable or --hadoop")
+ if options.yarn and not HADOOP_HOME:
+ options.yarn = False
+ checkFailed(None, "utility will SKIP YARN configuration check because HADOOP_HOME is not specified in environment variable or --hadoop")
+
# params check
if not options.file and not options.host and not options.zipin:
raise GpCheckError(" --file or --host or --zipin must be specified")
@@ -242,6 +337,7 @@ def checkFailed(host, msg):
def getHDFSNamenodeHost():
core_site_file = os.path.join(HADOOP_HOME, "etc/hadoop/core-site.xml")
+ hdfs_site_file = os.path.join(HADOOP_HOME, "etc/hadoop/hdfs-site.xml")
logger.info("try to detect namenode from %s" % core_site_file)
# for processing property xml
@@ -255,12 +351,42 @@ def getHDFSNamenodeHost():
for node in xmldoc.getElementsByTagName('property'):
if getPropName(node) == 'fs.default.name' or getPropName(node) == 'fs.defaultFS':
fsurl = getPropValue(node).strip()
- namenode_addr = re.search(r"//([^:/]*)", fsurl).group(1)
+ namenode_list_alias = re.search(r"//([^:/]*)", fsurl).group(1)
+ if_ha_disabled = re.search(".*:[0-9]+$", fsurl)
+ if if_ha_disabled:
+ namenode_addr = namenode_list_alias
+ else:
+ namenode_addr = ''
break
# run hostname command on remote to get actual hostname
if namenode_addr == '':
- logger.error("cannot detect namenode from %s" % core_site_file)
+ ha_namenode_list = ''
+ default_namenode_alias = ''
+ with open(hdfs_site_file) as f:
+ xmldoc = minidom.parse(f)
+ for node in xmldoc.getElementsByTagName('property'):
+ if re.search('dfs.ha.namenodes.*', getPropName(node).strip()):
+ ha_namenode_list = getPropValue(node).strip()
+ default_namenode_alias = ha_namenode_list.split(',')[0].strip()
+ break
+
+ if ha_namenode_list == '':
+ logger.error("cannot detect namenode from %s" % core_site_file)
+ raise GpCheckError("cannot detect namenode from %s" % core_site_file)
+ #sys.exit(1)
+ else:
+ with open(hdfs_site_file) as f:
+ xmldoc = minidom.parse(f)
+ for node in xmldoc.getElementsByTagName('property'):
+ namenode_rpc_address = "dfs.namenode.rpc-address.%s.%s" % (namenode_list_alias,
+ default_namenode_alias)
+ if getPropName(node) == namenode_rpc_address:
+ default_namenode_rpc_address = getPropValue(node).strip()
+ namenode_addr = default_namenode_rpc_address.split(':')[0].strip()
+
+ if namenode_addr == '':
+ raise GpCheckError("cannot detect namenode from %s" % core_site_file)
else:
cmd = Command(namenode_addr, "hostname", REMOTE, namenode_addr)
pool.addCommand(cmd)
@@ -345,10 +471,12 @@ def runCollectionOnServers():
else:
raise GpCheckError("unsupported host type")
- cmd = "%s/sbin/gpcheck_hostdump %s" % (GPHOME, host_type_cl)
+ cmd = "%s/sbin/gpcheck_hostdump --hawq %s" % (GPHOME, host_type_cl)
cmd += " --sysctl %s" % ",".join(gpcheck_config.sysctl_expected.keys())
if HADOOP_HOME:
cmd += " --hadoop %s" % HADOOP_HOME
+ if options.yarn or options.yarn_ha:
+ cmd += " --yarn"
return cmd
try:
@@ -537,7 +665,7 @@ def testSolarisEtcUserAttr(host):
checkFailed(host.hostname, "/etc/user_attr is missing expected line '%s'" % line)
-def testHAWQ(host):
+def testHAWQGUC(host):
if not gpcheck_info.hawq_collected_ok:
return
@@ -567,7 +695,7 @@ def testHAWQ(host):
return
# check HAWQ master's memory size
- expected_vmemory_size = 1024
+ expected_vmemory_size = 8192
if guc_vmemsize_master != expected_vmemory_size:
checkFailed(host.hostname, "HAWQ master's %s GUC value is %s, expected %s" % (
HAWQ_GUC_MEMORY, guc_vmemsize_master, expected_vmemory_size))
@@ -582,7 +710,7 @@ def testHAWQ(host):
logger.warning("please change the expected data node memory 'dfs.mem.datanode.heap' in gpcheck.cnf file")
logger.warning("SKIP '%s' check" %(HAWQ_GUC_MEMORY))
return
- expect_vmemsize_per_segment = 1024
+ expect_vmemsize_per_segment = 8192
if guc_vmemsize_master != expect_vmemsize_per_segment:
checkFailed(host.hostname, "HAWQ segment's %s GUC value on this host is %s, expected %s" % (
HAWQ_GUC_MEMORY, guc_vmemsize_master, expect_vmemsize_per_segment))
@@ -602,6 +730,120 @@ def testDiskCapacity(host):
return
+def testHAWQconfig(host):
+ hawq = host.data.hawq
+ hdfs = host.data.hdfs
+ if hawq is None:
+ return # skip HAWQ test when hawq is None
+
+ if options.verbose:
+ logger.info("-- test HAWQ config")
+
+ if hawq.errormsg:
+ checkFailed(host.hostname, "collect HAWQ configuration error: %s" % hawq.errormsg)
+ return
+
+ datanode_list = parse_host_list_file("%s/etc/hadoop/slaves" % HADOOP_HOME)
+ is_datanode = False
+ if host.hostname in datanode_list:
+ is_datanode = True
+
+ expect_config = gpcheck_config.hawq_expected
+
+ if options.kerberos:
+ expect_config.update(gpcheck_config.hawq_kerberos_expected)
+
+ if options.yarn or options.yarn_ha:
+ expect_config.update(gpcheck_config.hawq_yarn_expected)
+
+ actual_config = hawq.site_config
+ hdfs_actual_config = hdfs.site_config
+
+ for exp_key, exp_val in expect_config.items():
+ if exp_key not in actual_config:
+ checkFailed(host.hostname, "HAWQ configuration missing: '%s' needs to be set to '%s'" % (exp_key, exp_val))
+
+ else:
+ actual_val = actual_config[exp_key]
+ et = (exp_key, exp_val, actual_val)
+
+ if exp_key == "dfs.block.local-path-access.user":
+ if exp_val not in actual_val.split(','):
+ checkFailed(host.hostname, "HDFS configuration: '%s' should include user '%s', actual value is '%s'" % et)
+ elif exp_key == "dfs.namenode.handler.count":
+ if int(exp_val) > int(actual_val):
+ checkFailed(host.hostname, "HDFS configuration: '%s' should be at least '%s', actual value is '%s'" % et)
+ else:
+ if exp_val != actual_val:
+ checkFailed(host.hostname, "HAWQ configuration: expected '%s' for '%s', actual value is '%s'" % et)
+
+ if not options.kerberos:
+ if 'hadoop.security.authentication' in actual_config:
+ if actual_config['hadoop.security.authentication'] != 'simple':
+ checkFailed(host.hostname, "HAWQ configuration: expected '%s' for '%s', actual value is '%s'" % ('simple', 'hadoop.security.authentication', actual_config[hadoop.security.authentication]))
+
+ if 'hadoop.security.authentication' in hdfs_actual_config:
+ if hdfs_actual_config['hadoop.security.authentication'] != 'simple':
+ checkFailed(host.hostname, "HAWQ configuration: expected '%s' for '%s', actual value is '%s'" % ('simple', 'hadoop.security.authentication', hdfs_actual_config[hadoop.security.authentication]))
+
+ if options.yarn or options.yarn_ha:
+ hawq_yarn_property_exist_list = ['hawq_rm_yarn_address', 'hawq_rm_yarn_scheduler_address', 'hawq_rm_yarn_app_name']
+ for item in hawq_yarn_property_exist_list:
+ if item in actual_config:
+ if not actual_config[item]:
+ checkFailed(host.hostname, "HAWQ configuration: yarn.resourcemanager.address is empty")
+ else:
+ checkFailed(host.hostname, "HAWQ configuration: yarn.resourcemanager.address not defined")
+
+ if 'dfs.client.read.shortcircuit' not in actual_config:
+ checkFailed(host.hostname, "HAWQ configuration dfs.client.read.shortcircuit not defined")
+
+ if 'dfs.client.read.shortcircuit' not in hdfs_actual_config:
+ checkFailed(host.hostname, "HAWQ configuration dfs.client.read.shortcircuit not defined")
+
+ if 'dfs.domain.socket.path' not in actual_config:
+ checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path not defined")
+
+ if 'dfs.domain.socket.path' not in hdfs_actual_config:
+ checkFailed(host.hostname, "HDFS configuration dfs.domain.socket.path not defined")
+
+ if is_datanode and 'dfs.domain.socket.path' in actual_config and 'dfs.domain.socket.path' in hdfs_actual_config:
+ if actual_config['dfs.domain.socket.path'] != hdfs_actual_config['dfs.domain.socket.path']:
+ checkFailed(host.hostname, "HAWQ configuration: dfs.domain.socket.path expect to have the same value with HDFS configuration")
+ else:
+ cmd = "ls -l %s" % actual_config['dfs.domain.socket.path']
+ (result, output, errmsg) = remote_ssh_output(cmd, host.hostname, '')
+ if result == 0:
+ if output.split(' ')[0][7:9] != 'rw':
+ checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path: %s should have R/W access for both hawq and HDFS on %s" % (actual_config['dfs.domain.socket.path'], host.hostname))
+ else:
+ checkFailed(host.hostname, "HAWQ configuration dfs.domain.socket.path: %s, does not exist on %s" % (actual_config['dfs.domain.socket.path'], host.hostname))
+
+ if 'output.replace-datanode-on-failure' in actual_config:
+ if len(datanode_list) < 4:
+ if actual_config['output.replace-datanode-on-failure'] == 'true':
+ checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure expect false, current is true")
+ else:
+ if actual_config['output.replace-datanode-on-failure'] == 'false':
+ checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure expect true, current is false")
+ else:
+ checkFailed(host.hostname, "HAWQ configuration: output.replace-datanode-on-failure not defined")
+
+
+def testDiskCapacity(host):
+ if options.verbose:
+ logger.info("-- test Disk Capacity")
+
+ for line in host.data.diskusage.lines:
+ if len(gpcheck_config.diskusage_mounts) == 0 or line.mount in gpcheck_config.diskusage_mounts:
+ actual_usage = int(line.used_percent[:-1])
+ if actual_usage > gpcheck_config.diskusage_usagemax:
+ checkFailed(host.hostname,
+ "potential disk full risk: %s mounted on %s has used %s space" % (
+ line.fs, line.mount, line.used_percent))
+ return
+
+
def testHDFSConfig(host):
hdfs = host.data.hdfs
if hdfs is None:
@@ -615,6 +857,30 @@ def testHDFSConfig(host):
return
expect_config = gpcheck_config.hdfs_expected
+
+ if not options.hdfs_ha and not options.kerberos:
+ expect_config.update(gpcheck_config.hdfs_non_expected)
+
+ if options.hdfs_ha and not options.kerberos:
+ expect_config.update(gpcheck_config.hdfs_ha_expected)
+
+ if options.kerberos and not options.hdfs_ha:
+ expect_config.update(gpcheck_config.hdfs_kerberos_expected)
+
+ if options.kerberos and options.hdfs_ha:
+ expect_config.update(gpcheck_config.hdfs_ha_kerberos_expected)
+
+
+ if options.yarn or options.yarn_ha:
+ expect_config.update(gpcheck_config.yarn_expected)
+ if not options.yarn_ha and not options.kerberos:
+ expect_config.update(gpcheck_config.yarn_non_expected)
+
+ if options.yarn_ha:
+ expect_config.update(gpcheck_config.yarn_ha_expected)
+ if options.kerberos:
+ expect_config.update(gpcheck_config.yarn_kerberos_expected)
+
actual_config = hdfs.site_config
actual_heap_size = hdfs.namenode_heap_size if host.is_namenode else hdfs.datanode_heap_size
@@ -658,6 +924,64 @@ def testHDFSConfig(host):
(actual_heap_size, expect_datanode_heap))
+ # Check if nodemanager directories exist
+ directory_check_list = []
+ datanode_list = parse_host_list_file("%s/etc/hadoop/slaves" % HADOOP_HOME)
+ is_datanode = False
+ if host.hostname in datanode_list:
+ is_datanode = True
+
+ if options.yarn or options.yarn_ha:
+ yarn_enabled = True
+ else:
+ yarn_enabled = False
+
+ if yarn_enabled and is_datanode:
+ if 'yarn.nodemanager.local-dirs' in actual_config:
+ directory_check_list += actual_config['yarn.nodemanager.local-dirs'].split(',')
+ else:
+ checkFailed(host.hostname, "YARN configuration: yarn.nodemanager.local-dirs not defined")
+
+ if 'yarn.nodemanager.log-dirs' in actual_config:
+ directory_check_list += actual_config['yarn.nodemanager.log-dirs'].split(',')
+ else:
+ checkFailed(host.hostname, "YARN configuration: yarn.nodemanager.log-dirs not defined")
+
+ for directory in directory_check_list:
+ cmd = "test -e %s" % directory
+ (result, output, errmsg) = remote_ssh_output(cmd, host.hostname, '')
+ if result != 0:
+ checkFailed(host.hostname, "YARN nodemanager directory %s does not exist" % directory)
+
+ # Check if resource manager property exists
+ if options.yarn:
+ yarn_property_exist_list = ['yarn.resourcemanager.address', 'yarn.resourcemanager.scheduler.address']
+
+ if options.yarn_ha:
+ yarn_property_exist_list = ['yarn.resourcemanager.address.rm1', 'yarn.resourcemanager.address.rm2', 'yarn.resourcemanager.scheduler.address.rm1', \
+ 'yarn.resourcemanager.scheduler.address.rm2']
+
+ if yarn_enabled:
+ for item in yarn_property_exist_list:
+ if item in actual_config:
+ if not actual_config[item]:
+ checkFailed(host.hostname, "YARN configuration: %s is empty" % item)
+ else:
+ checkFailed(host.hostname, "YARN configuration: %s not defined" % item)
+
+ # Check yarn kerberos properties
+ #yarn_kerberos_check_list = ['hadoop.proxyuser.yarn.groups', 'hadoop.proxyuser.yarn.hosts', 'hadoop.proxyuser.postgres.hosts', 'hadoop.proxyuser.postgres.groups']
+ if yarn_enabled and options.kerberos:
+ yarn_kerberos_check_list = ['yarn.nodemanager.keytab', 'yarn.nodemanager.principal','hadoop.proxyuser.postgres.groups', \
+ 'yarn.resourcemanager.keytab', 'yarn.resourcemanager.principal']
+ for item in yarn_kerberos_check_list:
+ if item in actual_config:
+ if not actual_config[item]:
+ checkFailed(host.hostname, "YARN configuration: %s is empty, expected non-empty" % item)
+ else:
+ checkFailed(host.hostname, "YARN configuration missing: %s" % item)
+
+
def testIOSchedulers(host):
if options.verbose:
logger.info("-- test IO scheduler")
@@ -774,6 +1098,8 @@ def testNtp(host):
def testGenericLinuxHost(host):
logger.info("test on host: %s" % host.hostname)
if host.is_namenode:
+ testHAWQGUC(host)
+ testHAWQconfig(host)
testHDFSConfig(host)
testDiskCapacity(host)
testSysctl(host)
@@ -782,7 +1108,8 @@ def testGenericLinuxHost(host):
testNtp(host)
else:
- testHAWQ(host)
+ testHAWQGUC(host)
+ testHAWQconfig(host)
testDiskCapacity(host)
testHDFSConfig(host)
testIOSchedulers(host)
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/bin/gppylib/gpcheckutil.py
----------------------------------------------------------------------
diff --git a/tools/bin/gppylib/gpcheckutil.py b/tools/bin/gppylib/gpcheckutil.py
index 9956990..3419bb2 100755
--- a/tools/bin/gppylib/gpcheckutil.py
+++ b/tools/bin/gppylib/gpcheckutil.py
@@ -151,6 +151,20 @@ class hdfs:
return "============= HDFS ==========================\n" + output
+class hawq:
+ def __init__(self):
+ self.site_config = dict()
+ self.errormsg = None
+
+ def __str__(self):
+ if self.errormsg:
+ return "============= HAWQ ERROR ====================\n" + self.errormsg
+ else:
+ output = "HAWQ checks \n"
+ output += "\n".join(["%s = %s" % (k, self.site_config[k]) for k in sorted(self.site_config.iterkeys())])
+ return "============= HAWQ ==========================\n" + output
+
+
class diskusage_entry:
def __init__(self, fs, size, used, avail, used_percent, mount):
self.fs = fs
@@ -336,6 +350,7 @@ class GenericLinuxOutputData:
self.uname = None
self.machine = None
self.hdfs = None
+ self.hawq = None
self.diskusage = None
self.sysctl = None
self.limitsconf = None
@@ -346,7 +361,7 @@ class GenericLinuxOutputData:
def __str__(self):
applied_checks = filter(lambda x: x is not None,
- [ self.uname, self.machine, self.hdfs, self.diskusage, self.sysctl,
+ [ self.uname, self.machine, self.hdfs, self.hawq, self.diskusage, self.sysctl,
self.limitsconf, self.mounts, self.ioschedulers, self.blockdev, self.ntp ])
return "\n".join(map(str, applied_checks))
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/bin/hawqpylib/hawqlib.py
----------------------------------------------------------------------
diff --git a/tools/bin/hawqpylib/hawqlib.py b/tools/bin/hawqpylib/hawqlib.py
index ae0d852..c149ffc 100755
--- a/tools/bin/hawqpylib/hawqlib.py
+++ b/tools/bin/hawqpylib/hawqlib.py
@@ -131,6 +131,24 @@ def check_property_exist_xml(xml_file, property_name):
return property_exist, property_name, property_value
+def get_xml_values(xmlfile):
+ xml_dict = {}
+ with open(xmlfile) as f:
+ xmldoc = minidom.parse(f)
+
+ for node in xmldoc.getElementsByTagName('property'):
+ name = node.getElementsByTagName('name')[0].childNodes[0].data.encode('ascii')
+
+ try:
+ value = node.getElementsByTagName('value')[0].childNodes[0].data.encode('ascii')
+ except:
+ value = None
+
+ xml_dict[name] = value
+
+ return xml_dict
+
+
class HawqXMLParser:
def __init__(self, GPHOME):
self.GPHOME = GPHOME
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/e74109bf/tools/sbin/gpcheck_hostdump
----------------------------------------------------------------------
diff --git a/tools/sbin/gpcheck_hostdump b/tools/sbin/gpcheck_hostdump
index 7714cc3..28f074e 100755
--- a/tools/sbin/gpcheck_hostdump
+++ b/tools/sbin/gpcheck_hostdump
@@ -31,7 +31,7 @@ try:
from gppylib.gpparseopts import OptParser, OptChecker
from gppylib.gpcheckutil import ApplianceOutputData, GenericLinuxOutputData, GenericSolarisOutputData
from gppylib.gpcheckutil import chkconfig, omreport, grubconf, mounts, GpMount, GpMount, inittab, ntp
- from gppylib.gpcheckutil import securetty, ioschedulers, blockdev, bcu, rclocal, sysctl, limitsconf, limitsconf_entry, uname, connectemc, diskusage, diskusage_entry, hdfs, machine
+ from gppylib.gpcheckutil import securetty, ioschedulers, blockdev, bcu, rclocal, sysctl, limitsconf, limitsconf_entry, uname, connectemc, diskusage, diskusage_entry, hdfs, hawq, machine
from gppylib.gpcheckutil import solaris_etc_system, solaris_etc_project, solaris_etc_user_attr
except ImportError, e:
sys.exit('Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e))
@@ -413,15 +413,53 @@ def collectCPUandMemoryInfo():
return data
+def collectHAWQ():
+ if not options.hawq:
+ return None
+ data = hawq()
+ hawq_config_dir = os.environ.get('GPHOME')
+ if hawq_config_dir is None:
+ print "Please export GPHOME first, exit"
+ sys.exit(1)
+ hdfs_client_file = os.path.join(hawq_config_dir, "etc/hdfs-client.xml")
+ yarn_client_file = os.path.join(hawq_config_dir, "etc/yarn-client.xml")
+ hawq_site_file = os.path.join(hawq_config_dir, "etc/hawq-site.xml")
+
+ # collect HAWQ site config (hdfs-client.xml, hawq-site.xml and, with --yarn, yarn-client.xml)
+ getPropName = lambda node: node.getElementsByTagName('name')[0].childNodes[0].data
+ getPropValue = lambda node: node.getElementsByTagName('value')[0].childNodes[0].data
+ hawq_config_file_list = [hdfs_client_file, hawq_site_file]
+ if options.yarn:
+ hawq_config_file_list.append(yarn_client_file)
+ for filename in hawq_config_file_list:
+ try:
+ with open(filename) as f:
+ xmldoc = minidom.parse(f)
+ for node in xmldoc.getElementsByTagName('property'):
+ try:
+ data.site_config[getPropName(node)] = getPropValue(node).strip()
+ except IndexError:
+ pass # the <value> tag may be empty, which causes IndexError in getPropValue
+
+ except Exception, e:
+ data.errormsg = "Failed to read HAWQ config file '%s': %s" % (filename, e)
+
+ return data
+
+
def collectHDFS():
if not options.hadoop:
return None
data = hdfs()
+ hawq_config_dir = os.environ.get('GPHOME')
+ if hawq_config_dir is None:
+ print "Please export GPHOME first, exit"
+ sys.exit(1)
hadoop_config_file = os.path.join(options.hadoop, "libexec/hadoop-config.sh")
hadoop_env_file = os.path.join(options.hadoop, "etc/hadoop/hadoop-env.sh")
hdfs_site_file = os.path.join(options.hadoop, "etc/hadoop/hdfs-site.xml")
+ yarn_site_file = os.path.join(options.hadoop, "etc/hadoop/yarn-site.xml")
core_site_file = os.path.join(options.hadoop, "etc/hadoop/core-site.xml")
- libhdfs3_site_file = os.environ.get("LIBHDFS3_CONF")
# collect java heap size config
p = subprocess.Popen(". %s; echo $JAVA_HEAP_MAX" % hadoop_config_file, shell = True,
@@ -457,7 +495,10 @@ def collectHDFS():
# collect HDFS site config
getPropName = lambda node: node.getElementsByTagName('name')[0].childNodes[0].data
getPropValue = lambda node: node.getElementsByTagName('value')[0].childNodes[0].data
- for filename in (hdfs_site_file, core_site_file, libhdfs3_site_file):
+ hdfs_config_file_list = [hdfs_site_file, core_site_file]
+ if options.yarn:
+ hdfs_config_file_list.append(yarn_site_file)
+ for filename in hdfs_config_file_list:
try:
with open(filename) as f:
xmldoc = minidom.parse(f)
@@ -804,6 +845,7 @@ def processGenericLinuxServer():
output = GenericLinuxOutputData()
output.hdfs = collectHDFS()
+ output.hawq = collectHAWQ()
output.uname = collectUname()
output.machine = collectCPUandMemoryInfo()
output.diskusage = collectDiskUsage()
@@ -844,6 +886,8 @@ def parseargs():
parser.remove_option('-h')
parser.add_option('-h', '-?', '--help', action='store_true')
parser.add_option('--hadoop', type='string')
+ parser.add_option('--hawq', action='store_true')
+ parser.add_option('--yarn', action='store_true')
parser.add_option('--sysctl', type='string')
parser.add_option('--appliance', action='store_true')
parser.add_option('--linux', action='store_true')