You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@ambari.apache.org by "Vincent.He (JIRA)" <ji...@apache.org> on 2015/08/05 04:43:05 UTC

[jira] [Commented] (AMBARI-12628) When HDFS HA enabled with Ambari 2.1, several service failed to start

    [ https://issues.apache.org/jira/browse/AMBARI-12628?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14654725#comment-14654725 ] 

Vincent.He commented on AMBARI-12628:
-------------------------------------

After more debugging I have made significant progress on this issue and can now work around it.

1. I had configured ulimit in /etc/profile.

2. When jmx.py fetches the data, it does receive the JSON payload, but it also captures the shell error "/etc/profile: line 79: ulimit: open files: cannot modify limit: Operation not permitted", and the string passed to the JSON parser is
2015-08-04 22:29:38,386 - checked_call returned (0, '/etc/profile: line 79: ulimit: open files: cannot modify limit: Operation not permitted\n{\n  "beans" : [ {\n    "name" : "Hadoop:service=NameNode,name=NameNodeStatus",\n    "modelerType" : "org.apache.hadoop.hdfs.server.namenode.NameNode",\n    "State" : "active",\n    "NNRole" : "NameNode",\n    "HostAndPort" : "h02.bigdata.com:8020",\n    "SecurityEnabled" : false,\n    "LastHATransitionTime" : 1438739950089\n  } ]\n}')

This is not a valid JSON string because the error message is prepended to the output.

So the real question is why shell.checked_call includes the stderr error text in the string it returns alongside the normal output.


> When HDFS HA enabled with Ambari 2.1, several service failed to start 
> ----------------------------------------------------------------------
>
>                 Key: AMBARI-12628
>                 URL: https://issues.apache.org/jira/browse/AMBARI-12628
>             Project: Ambari
>          Issue Type: Bug
>          Components: ambari-server
>    Affects Versions: 2.1.0
>            Reporter: Vincent.He
>            Priority: Critical
>
> Installed Ambari 2.1 with HDP 2.3. After enabling HA for HDFS, several services failed to start, such as the MapReduce History Server. Digging into more detail, this is different from issue AMBARI-12374.
> The reported failure is that the JSON string could not be decoded:
> 2015-08-03 02:09:35,420 - Getting jmx metrics from NN failed. URL: http://h03.bigdata.com:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus
> Traceback (most recent call last):
>   File "/usr/lib/python2.6/site-packages/resource_management/libraries/functions/jmx.py", line 40, in get_value_from_jmx
>     data_dict = json.loads(data)
>   File "/usr/lib/python2.6/site-packages/ambari_simplejson/__init__.py", line 307, in loads
>     return _default_decoder.decode(s)
>   File "/usr/lib/python2.6/site-packages/ambari_simplejson/decoder.py", line 335, in decode
>     obj, end = self.raw_decode(s, idx=_w(s, 0).end())
>   File "/usr/lib/python2.6/site-packages/ambari_simplejson/decoder.py", line 353, in raw_decode
>     raise ValueError("No JSON object could be decoded")
> ValueError: No JSON object could be decoded
> 2015-08-03 02:09:35,494 - Getting jmx metrics from NN failed. URL: http://h02.bigdata.lenovo.com:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus
> Traceback (most recent call last):
>   File "/usr/lib/python2.6/site-packages/resource_management/libraries/functions/jmx.py", line 40, in get_value_from_jmx
>     data_dict = json.loads(data)
>   File "/usr/lib/python2.6/site-packages/ambari_simplejson/__init__.py", line 307, in loads
>     return _default_decoder.decode(s)
>   File "/usr/lib/python2.6/site-packages/ambari_simplejson/decoder.py", line 335, in decode
>     obj, end = self.raw_decode(s, idx=_w(s, 0).end())
>   File "/usr/lib/python2.6/site-packages/ambari_simplejson/decoder.py", line 353, in raw_decode
>     raise ValueError("No JSON object could be decoded")
> ValueError: No JSON object could be decoded
> Traceback (most recent call last):
>   File "/var/lib/ambari-agent/cache/common-services/YARN/2.1.0.2.0/package/scripts/historyserver.py", line 168, in <module>
>     HistoryServer().execute()
>   File "/usr/lib/python2.6/site-packages/resource_management/libraries/script/script.py", line 218, in execute
>     method(env)
>   File "/var/lib/ambari-agent/cache/common-services/YARN/2.1.0.2.0/package/scripts/historyserver.py", line 91, in start
>     self.configure(env) # FOR SECURITY
>   File "/var/lib/ambari-agent/cache/common-services/YARN/2.1.0.2.0/package/scripts/historyserver.py", line 55, in configure
>     yarn(name="historyserver")
>   File "/usr/lib/python2.6/site-packages/ambari_commons/os_family_impl.py", line 89, in thunk
>     return fn(*args, **kwargs)
>   File "/var/lib/ambari-agent/cache/common-services/YARN/2.1.0.2.0/package/scripts/yarn.py", line 72, in yarn
>     recursive_chmod=True
>   File "/usr/lib/python2.6/site-packages/resource_management/core/base.py", line 157, in __init__
>     self.env.run()
>   File "/usr/lib/python2.6/site-packages/resource_management/core/environment.py", line 152, in run
>     self.run_action(resource, action)
>   File "/usr/lib/python2.6/site-packages/resource_management/core/environment.py", line 118, in run_action
>     provider_action()
>   File "/usr/lib/python2.6/site-packages/resource_management/libraries/providers/hdfs_resource.py", line 390, in action_create_on_execute
>     self.action_delayed("create")
>   File "/usr/lib/python2.6/site-packages/resource_management/libraries/providers/hdfs_resource.py", line 387, in action_delayed
>     self.get_hdfs_resource_executor().action_delayed(action_name, self)
>   File "/usr/lib/python2.6/site-packages/resource_management/libraries/providers/hdfs_resource.py", line 239, in action_delayed
>     main_resource.resource.security_enabled, main_resource.resource.logoutput)
>   File "/usr/lib/python2.6/site-packages/resource_management/libraries/providers/hdfs_resource.py", line 126, in __init__
>     security_enabled, run_user)
>   File "/usr/lib/python2.6/site-packages/resource_management/libraries/functions/namenode_ha_utils.py", line 113, in get_property_for_active_namenode
>     raise Fail("There is no active namenodes.")
> resource_management.core.exceptions.Fail: There is no active namenodes.
> The key issue is "File "/usr/lib/python2.6/site-packages/ambari_simplejson/decoder.py", line 353, in raw_decode
>     raise ValueError("No JSON object could be decoded")
> "
> The output I got is 
> [root@h02 patch]# curl -s http://h03.bigdata.com:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus
> {
>   "beans" : [ {
>     "name" : "Hadoop:service=NameNode,name=NameNodeStatus",
>     "modelerType" : "org.apache.hadoop.hdfs.server.namenode.NameNode",
>     "State" : "standby",
>     "NNRole" : "NameNode",
>     "HostAndPort" : "h03.bigdata.com:8020",
>     "SecurityEnabled" : false,
>     "LastHATransitionTime" : 1438594046119
>   } ]
> }
> [root@h02 patch]# curl -s http://h02.bigdata.com:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus
> {
>   "beans" : [ {
>     "name" : "Hadoop:service=NameNode,name=NameNodeStatus",
>     "modelerType" : "org.apache.hadoop.hdfs.server.namenode.NameNode",
>     "State" : "active",
>     "NNRole" : "NameNode",
>     "HostAndPort" : "h02.bigdata.com:8020",
>     "SecurityEnabled" : false,
>     "LastHATransitionTime" : 1438594046591
>   } ]
> }
> I also tried the patch from issue AMBARI-12374 and got the same error; querying the new URI manually, I received a valid response:
> [root@h02 patch]# curl -s http://h03.bigdata.com:50070/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem
> {
>   "beans" : [ {
>     "name" : "Hadoop:service=NameNode,name=FSNamesystem",
>     "modelerType" : "FSNamesystem",
>     "tag.Context" : "dfs",
>     "tag.HAState" : "standby",
>     "tag.Hostname" : "h03.bigdata.com",
>     "MissingBlocks" : 0,
>     "MissingReplOneBlocks" : 0,
>     "ExpiredHeartbeats" : 0,
>     "TransactionsSinceLastCheckpoint" : -756,
>     "TransactionsSinceLastLogRoll" : 0,
>     "LastWrittenTransactionId" : 5760,
>     "LastCheckpointTime" : 1438637246806,
>     "CapacityTotal" : 377945479446528,
>     "CapacityTotalGB" : 351989.0,
>     "CapacityUsed" : 2162847744,
>     "CapacityUsedGB" : 2.0,
>     "CapacityRemaining" : 374078076620800,
>     "CapacityRemainingGB" : 348387.0,
>     "CapacityUsedNonDFS" : 3865239977984,
>     "TotalLoad" : 16,
>     "SnapshottableDirectories" : 0,
>     "Snapshots" : 0,
>     "BlocksTotal" : 588,
>     "FilesTotal" : 825,
>     "PendingReplicationBlocks" : 0,
>     "UnderReplicatedBlocks" : 0,
>     "CorruptBlocks" : 0,
>     "ScheduledReplicationBlocks" : 0,
>     "PendingDeletionBlocks" : 0,
>     "ExcessBlocks" : 0,
>     "PostponedMisreplicatedBlocks" : 0,
>     "PendingDataNodeMessageCount" : 0,
>     "MillisSinceLastLoadedEdits" : 49071,
>     "BlockCapacity" : 2097152,
>     "StaleDataNodes" : 0,
>     "TotalFiles" : 825
>   } ]
> }
> [root@h02 patch]# curl -s http://h02.bigdata.com:50070/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem
> {
>   "beans" : [ {
>     "name" : "Hadoop:service=NameNode,name=FSNamesystem",
>     "modelerType" : "FSNamesystem",
>     "tag.Context" : "dfs",
>     "tag.HAState" : "active",
>     "tag.Hostname" : "h02.bigdata.com",
>     "MissingBlocks" : 0,
>     "MissingReplOneBlocks" : 0,
>     "ExpiredHeartbeats" : 0,
>     "TransactionsSinceLastCheckpoint" : 227,
>     "TransactionsSinceLastLogRoll" : 1,
>     "LastWrittenTransactionId" : 6743,
>     "LastCheckpointTime" : 1438637246983,
>     "CapacityTotal" : 377945479446528,
>     "CapacityTotalGB" : 351989.0,
>     "CapacityUsed" : 2162847744,
>     "CapacityUsedGB" : 2.0,
>     "CapacityRemaining" : 374078076620800,
>     "CapacityRemainingGB" : 348387.0,
>     "CapacityUsedNonDFS" : 3865239977984,
>     "TotalLoad" : 16,
>     "SnapshottableDirectories" : 0,
>     "Snapshots" : 0,
>     "BlocksTotal" : 588,
>     "FilesTotal" : 825,
>     "PendingReplicationBlocks" : 0,
>     "UnderReplicatedBlocks" : 0,
>     "CorruptBlocks" : 0,
>     "ScheduledReplicationBlocks" : 0,
>     "PendingDeletionBlocks" : 0,
>     "ExcessBlocks" : 0,
>     "PostponedMisreplicatedBlocks" : 0,
>     "PendingDataNodeMessageCount" : 0,
>     "MillisSinceLastLoadedEdits" : 0,
>     "BlockCapacity" : 2097152,
>     "StaleDataNodes" : 0,
>     "TotalFiles" : 825
>   } ]
> }



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)